[med-svn] [stacks] 01/03: Imported Upstream version 1.42
Andreas Tille
tille at debian.org
Sat Aug 13 09:04:16 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository stacks.
commit df2c72a14c851c4a3d4d47691218acf25cecde46
Author: Andreas Tille <tille at debian.org>
Date: Sat Aug 13 10:48:52 2016 +0200
Imported Upstream version 1.42
---
ChangeLog | 23 +
Makefile.am | 28 +-
Makefile.in | 548 +-
README | 33 +-
configure | 54 +-
configure.ac | 25 +-
htslib/._INSTALL | Bin 0 -> 181 bytes
htslib/._LICENSE | Bin 0 -> 181 bytes
htslib/._Makefile | Bin 0 -> 181 bytes
htslib/._NEWS | Bin 0 -> 181 bytes
htslib/._README | Bin 0 -> 181 bytes
htslib/._bgzf.c | Bin 0 -> 181 bytes
htslib/._bgzip.c | Bin 0 -> 181 bytes
htslib/._cram | Bin 0 -> 181 bytes
htslib/._faidx.c | Bin 0 -> 181 bytes
htslib/._hfile.c | Bin 0 -> 181 bytes
htslib/._hfile_internal.h | Bin 0 -> 181 bytes
htslib/._hfile_irods.c | Bin 0 -> 181 bytes
htslib/._hfile_libcurl.c | Bin 0 -> 181 bytes
htslib/._hfile_net.c | Bin 0 -> 181 bytes
htslib/._hts.c | Bin 0 -> 181 bytes
htslib/._hts_internal.h | Bin 0 -> 181 bytes
htslib/._htsfile.c | Bin 0 -> 181 bytes
htslib/._htslib | Bin 0 -> 181 bytes
htslib/._htslib.pc.in | Bin 0 -> 181 bytes
htslib/._htslib_vars.mk | Bin 0 -> 181 bytes
htslib/._kfunc.c | Bin 0 -> 181 bytes
htslib/._knetfile.c | Bin 0 -> 181 bytes
htslib/._kstring.c | Bin 0 -> 181 bytes
htslib/._md5.c | Bin 0 -> 181 bytes
htslib/._plugin.c | Bin 0 -> 181 bytes
htslib/._regidx.c | Bin 0 -> 181 bytes
htslib/._sam.c | Bin 0 -> 181 bytes
htslib/._synced_bcf_reader.c | Bin 0 -> 181 bytes
htslib/._tabix.c | Bin 0 -> 181 bytes
htslib/._tbx.c | Bin 0 -> 181 bytes
htslib/._vcf.c | Bin 0 -> 181 bytes
htslib/._vcf_sweep.c | Bin 0 -> 181 bytes
htslib/._vcfutils.c | Bin 0 -> 181 bytes
htslib/INSTALL | 102 +
htslib/LICENSE | 69 +
htslib/Makefile | 426 ++
htslib/NEWS | 109 +
htslib/README | 5 +
htslib/bgzf.c | 1330 +++++
htslib/bgzip.c | 311 ++
htslib/cram/._cram.h | Bin 0 -> 181 bytes
htslib/cram/._cram_codecs.c | Bin 0 -> 181 bytes
htslib/cram/._cram_codecs.h | Bin 0 -> 181 bytes
htslib/cram/._cram_decode.c | Bin 0 -> 181 bytes
htslib/cram/._cram_decode.h | Bin 0 -> 181 bytes
htslib/cram/._cram_encode.c | Bin 0 -> 181 bytes
htslib/cram/._cram_encode.h | Bin 0 -> 181 bytes
htslib/cram/._cram_external.c | Bin 0 -> 181 bytes
htslib/cram/._cram_index.c | Bin 0 -> 181 bytes
htslib/cram/._cram_index.h | Bin 0 -> 181 bytes
htslib/cram/._cram_io.c | Bin 0 -> 181 bytes
htslib/cram/._cram_io.h | Bin 0 -> 181 bytes
htslib/cram/._cram_samtools.c | Bin 0 -> 181 bytes
htslib/cram/._cram_samtools.h | Bin 0 -> 181 bytes
htslib/cram/._cram_stats.c | Bin 0 -> 181 bytes
htslib/cram/._cram_stats.h | Bin 0 -> 181 bytes
htslib/cram/._cram_structs.h | Bin 0 -> 181 bytes
htslib/cram/._files.c | Bin 0 -> 181 bytes
htslib/cram/._mFILE.c | Bin 0 -> 181 bytes
htslib/cram/._mFILE.h | Bin 0 -> 181 bytes
htslib/cram/._misc.h | Bin 0 -> 181 bytes
htslib/cram/._open_trace_file.c | Bin 0 -> 181 bytes
htslib/cram/._open_trace_file.h | Bin 0 -> 181 bytes
htslib/cram/._os.h | Bin 0 -> 181 bytes
htslib/cram/._pooled_alloc.c | Bin 0 -> 181 bytes
htslib/cram/._pooled_alloc.h | Bin 0 -> 181 bytes
htslib/cram/._rANS_byte.h | Bin 0 -> 181 bytes
htslib/cram/._rANS_static.c | Bin 0 -> 181 bytes
htslib/cram/._rANS_static.h | Bin 0 -> 181 bytes
htslib/cram/._sam_header.c | Bin 0 -> 181 bytes
htslib/cram/._sam_header.h | Bin 0 -> 181 bytes
htslib/cram/._string_alloc.c | Bin 0 -> 181 bytes
htslib/cram/._string_alloc.h | Bin 0 -> 181 bytes
htslib/cram/._thread_pool.c | Bin 0 -> 181 bytes
htslib/cram/._thread_pool.h | Bin 0 -> 181 bytes
htslib/cram/._vlen.c | Bin 0 -> 181 bytes
htslib/cram/._vlen.h | Bin 0 -> 181 bytes
htslib/cram/._zfio.c | Bin 0 -> 181 bytes
htslib/cram/._zfio.h | Bin 0 -> 181 bytes
htslib/cram/cram.h | 61 +
htslib/cram/cram_codecs.c | 1950 +++++++
htslib/cram/cram_codecs.h | 194 +
htslib/cram/cram_decode.c | 3159 +++++++++++
htslib/cram/cram_decode.h | 112 +
htslib/cram/cram_encode.c | 3105 +++++++++++
htslib/cram/cram_encode.h | 105 +
htslib/cram/cram_external.c | 377 ++
htslib/cram/cram_index.c | 582 +++
htslib/cram/cram_index.h | 99 +
htslib/cram/cram_io.c | 4604 ++++++++++++++++
htslib/cram/cram_io.h | 669 +++
htslib/cram/cram_samtools.c | 149 +
htslib/cram/cram_samtools.h | 105 +
htslib/cram/cram_stats.c | 448 ++
htslib/cram/cram_stats.h | 59 +
htslib/cram/cram_structs.h | 821 +++
htslib/cram/files.c | 74 +
htslib/cram/mFILE.c | 694 +++
htslib/cram/mFILE.h | 89 +
htslib/cram/misc.h | 110 +
htslib/cram/open_trace_file.c | 414 ++
htslib/cram/open_trace_file.h | 125 +
htslib/cram/os.h | 308 ++
htslib/cram/pooled_alloc.c | 188 +
htslib/cram/pooled_alloc.h | 64 +
htslib/cram/rANS_byte.h | 336 ++
htslib/cram/rANS_static.c | 868 ++++
htslib/cram/rANS_static.h | 51 +
htslib/cram/sam_header.c | 1268 +++++
htslib/cram/sam_header.h | 459 ++
htslib/cram/string_alloc.c | 155 +
htslib/cram/string_alloc.h | 68 +
htslib/cram/thread_pool.c | 757 +++
htslib/cram/thread_pool.h | 218 +
htslib/cram/vlen.c | 430 ++
htslib/cram/vlen.h | 48 +
htslib/cram/zfio.c | 183 +
htslib/cram/zfio.h | 62 +
htslib/faidx.c | 525 ++
htslib/hfile.c | 751 +++
htslib/hfile_internal.h | 139 +
htslib/hfile_irods.c | 259 +
htslib/hfile_libcurl.c | 917 ++++
htslib/hfile_net.c | 112 +
htslib/hts.c | 2066 ++++++++
htslib/hts_internal.h | 90 +
htslib/htsfile.c | 234 +
htslib/htslib.pc.in | 10 +
htslib/htslib/._bgzf.h | Bin 0 -> 181 bytes
htslib/htslib/._cram.h | Bin 0 -> 181 bytes
htslib/htslib/._faidx.h | Bin 0 -> 181 bytes
htslib/htslib/._hfile.h | Bin 0 -> 181 bytes
htslib/htslib/._hts.h | Bin 0 -> 181 bytes
htslib/htslib/._hts_defs.h | Bin 0 -> 181 bytes
htslib/htslib/._kbitset.h | Bin 0 -> 181 bytes
htslib/htslib/._kfunc.h | Bin 0 -> 181 bytes
htslib/htslib/._khash.h | Bin 0 -> 181 bytes
htslib/htslib/._khash_str2int.h | Bin 0 -> 181 bytes
htslib/htslib/._klist.h | Bin 0 -> 181 bytes
htslib/htslib/._knetfile.h | Bin 0 -> 181 bytes
htslib/htslib/._kseq.h | Bin 0 -> 181 bytes
htslib/htslib/._ksort.h | Bin 0 -> 181 bytes
htslib/htslib/._kstring.h | Bin 0 -> 181 bytes
htslib/htslib/._regidx.h | Bin 0 -> 181 bytes
htslib/htslib/._sam.h | Bin 0 -> 181 bytes
htslib/htslib/._synced_bcf_reader.h | Bin 0 -> 181 bytes
htslib/htslib/._tbx.h | Bin 0 -> 181 bytes
htslib/htslib/._vcf.h | Bin 0 -> 181 bytes
htslib/htslib/._vcf_sweep.h | Bin 0 -> 181 bytes
htslib/htslib/._vcfutils.h | Bin 0 -> 181 bytes
htslib/htslib/bgzf.h | 335 ++
htslib/htslib/cram.h | 492 ++
htslib/htslib/faidx.h | 137 +
htslib/htslib/hfile.h | 215 +
htslib/htslib/hts.h | 639 +++
htslib/htslib/hts_defs.h | 72 +
htslib/htslib/kbitset.h | 160 +
htslib/htslib/kfunc.h | 83 +
htslib/htslib/khash.h | 627 +++
htslib/htslib/khash_str2int.h | 133 +
htslib/htslib/klist.h | 135 +
htslib/htslib/knetfile.h | 101 +
htslib/htslib/kseq.h | 253 +
htslib/htslib/ksort.h | 285 +
htslib/htslib/kstring.h | 277 +
htslib/htslib/regidx.h | 154 +
htslib/htslib/sam.h | 454 ++
htslib/htslib/synced_bcf_reader.h | 302 ++
htslib/htslib/tbx.h | 79 +
htslib/htslib/vcf.h | 914 ++++
htslib/htslib/vcf_sweep.h | 47 +
htslib/htslib/vcfutils.h | 134 +
htslib/htslib_vars.mk | 49 +
htslib/kfunc.c | 282 +
htslib/knetfile.c | 634 +++
htslib/kstring.c | 276 +
htslib/md5.c | 386 ++
htslib/plugin.c | 171 +
htslib/regidx.c | 340 ++
htslib/sam.c | 2073 ++++++++
htslib/synced_bcf_reader.c | 1284 +++++
htslib/tabix.c | 544 ++
htslib/tbx.c | 333 ++
htslib/vcf.c | 3483 +++++++++++++
htslib/vcf_sweep.c | 183 +
htslib/vcfutils.c | 691 +++
scripts/ref_map.pl | 2 +-
src/BamI.h | 41 +-
src/BamUnalignedI.h | 35 +-
src/BowtieI.h | 4 +-
src/GappedAln.h | 6 +-
src/MetaPopInfo.cc | 347 ++
src/MetaPopInfo.h | 111 +
src/PopMap.h | 247 +-
src/PopSum.h | 146 +-
src/SamI.h | 4 +-
src/Tsv.h | 2 +-
src/Vcf.cc | 506 ++
src/Vcf.h | 467 ++
src/aln_utils.cc | 7 +-
src/catalog_utils.cc | 58 +-
src/catalog_utils.h | 44 +
src/clone_filter.cc | 1336 ++---
src/cstacks.cc | 38 +-
src/export_formats.cc | 3333 ++++++++++++
src/export_formats.h | 70 +
src/file_io.cc | 3 +-
...ype_dictionaries.h => genotype_dictionaries.cc} | 224 +-
src/genotype_dictionaries.h | 558 +-
src/genotypes.cc | 110 +-
src/input.cc | 4 +-
src/kmers.cc | 2 +-
src/locus.cc | 2 +-
src/mstack.cc | 4 +-
src/ordered.h | 26 +-
src/populations.cc | 5484 ++++----------------
src/populations.h | 131 +-
src/pstacks.cc | 904 ++--
src/{renz.h => renz.cc} | 119 +-
src/renz.h | 219 +-
src/rxstacks.cc | 136 +-
src/smoothing.h | 13 +-
src/smoothing_utils.h | 4 +-
src/sql_utilities.cc | 299 ++
src/sql_utilities.h | 305 +-
src/sstacks.cc | 16 +-
src/stacks.h | 6 +-
src/ustacks.cc | 8 +-
234 files changed, 56367 insertions(+), 7505 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 6642845..0ae3d94 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+Stacks 1.42 - Aug 05, 2016
+--------------------------
+ Feature: populations program is now able to calculate populations statistics using arbitrary VCF files
+ as input.
+ Feature: Upgraded to the latest release of HTSLib (1.3.1) for reading BAM files. Embedded the library
+ in the Stacks distribution to remove previous libbam dependency.
+ Feature: Added an output directory option to 'populations' (--out_path).
+ Feature: Added restriction enzymes BsaHI, HpaII, NcoI; corrected NdeI.
+ Bugfix: Made the VCF output by 'populations' more standard-compliant.
+ Bugfix: Some output files included 0-based genomic coordinates, changed them to 1-based.
+ Bugfix: Replaced populations IDs with populations names in 'populations' output.
+ Bugfix: Corrected a bug affecting clone_filter when input was non-gzipped paired-end data.
+
+Stacks 1.41 - June 22, 2016
+---------------------------
+ Bugfix: the kernel-smoothing procedure in populations (used for Fst, Pi, heterozygosity etc. smoothing)
+ was not functioning at sizes larger than the default size. A bug was creating incorrect weights for the
+ smoothing operation when the sliding window size was set to a large value causing the smoothing
+ window to have a maximum size after which increasing the size did not change the smoothing.
+ Bugfix: cstacks was reporting gapped alignments even when --gapped was not enabled. This affected
+ a small number of (mostly) confounded catalog loci.
+ Feature: Added the Csp6I restriction enzyme.
+
Stacks 1.40 - May 04, 2016
--------------------------
Feature: Changed process_radtags and process_shortreads to print FASTQ/FASTA headers using
diff --git a/Makefile.am b/Makefile.am
index 96e3b88..4ad1fbe 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -27,21 +27,21 @@ cstacks_SOURCES = src/cstacks.h src/cstacks.cc src/constants.h \
src/kmers.h src/kmers.cc src/utils.h src/utils.cc \
src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc
hstacks_SOURCES = src/hstacks.h src/hstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/kmers.h src/kmers.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/utils.h src/utils.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc
sstacks_SOURCES = src/sstacks.h src/sstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/kmers.h src/kmers.cc \
src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc src/utils.h src/utils.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc
rxstacks_SOURCES = src/rxstacks.h src/rxstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
@@ -49,7 +49,8 @@ rxstacks_SOURCES = src/rxstacks.h src/rxstacks.cc src/constants.h \
src/mst.h src/mst.cc \
src/models.h src/models.cc \
src/aln_utils.h src/aln_utils.cc src/utils.h src/utils.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc \
+ src/MetaPopInfo.h src/MetaPopInfo.cc
process_radtags_SOURCES = src/process_radtags.h src/process_radtags.cc src/constants.h \
src/utils.h src/utils.cc src/log_utils.h src/log_utils.cc \
@@ -57,7 +58,7 @@ process_radtags_SOURCES = src/process_radtags.h src/process_radtags.cc src/const
src/clean.h src/clean.cc \
src/file_io.h src/file_io.cc \
src/input.h src/input.cc src/BustardI.h src/BamUnalignedI.h src/FastqI.h src/gzFastq.h \
- src/renz.h
+ src/renz.h src/renz.cc
process_shortreads_SOURCES = src/process_shortreads.h src/process_shortreads.cc src/constants.h \
src/clean.h src/clean.cc \
@@ -88,21 +89,24 @@ genotypes_SOURCES = src/genotypes.h src/genotypes.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/aln_utils.h src/aln_utils.cc \
- src/PopMap.h src/genotype_dictionaries.h \
- src/input.h src/input.cc src/sql_utilities.h src/renz.h
+ src/PopMap.h src/genotype_dictionaries.h src/genotype_dictionaries.cc\
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc src/renz.h src/renz.cc \
+ src/MetaPopInfo.h src/MetaPopInfo.cc
populations_SOURCES = src/populations.h src/populations.cc src/constants.h \
src/utils.h src/utils.cc src/catalog_utils.h src/catalog_utils.cc \
src/log_utils.h src/log_utils.cc \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
- src/PopMap.h src/PopSum.h src/aln_utils.h src/aln_utils.cc src/genotype_dictionaries.h \
- src/input.h src/input.cc src/sql_utilities.h src/renz.h \
- src/bootstrap.h src/ordered.h src/smoothing.h src/smoothing_utils.h
+ src/PopMap.h src/PopSum.h src/aln_utils.h src/aln_utils.cc src/genotype_dictionaries.h src/genotype_dictionaries.cc \
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc src/renz.h src/renz.cc \
+ src/bootstrap.h src/ordered.h src/smoothing.h src/smoothing_utils.h \
+ src/export_formats.h src/export_formats.cc \
+ src/MetaPopInfo.h src/MetaPopInfo.cc src/Vcf.h src/Vcf.cc
phasedstacks_SOURCES = src/phasedstacks.h src/phasedstacks.cc src/constants.h \
src/locus.h src/locus.cc \
- src/input.h src/input.cc src/sql_utilities.h \
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc \
src/log_utils.h src/log_utils.cc \
src/utils.h src/utils.cc src/catalog_utils.h src/catalog_utils.cc
@@ -163,6 +167,8 @@ TESTS = tests/process_radtags.t tests/kmer_filter.t tests/ustacks.t tests/pstack
EXTRA_DIST = $(nobase_pkgdata_DATA) LICENSE INSTALL README ChangeLog $(TESTS)
+SUBDIRS = htslib .
+
pkglocalstatedir = $(localstatedir)/$(PACKAGE)
debug:
diff --git a/Makefile.in b/Makefile.in
index c7e396b..7a479ef 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -129,7 +129,8 @@ am_cstacks_OBJECTS = src/cstacks-cstacks.$(OBJEXT) \
src/cstacks-stacks.$(OBJEXT) src/cstacks-locus.$(OBJEXT) \
src/cstacks-kmers.$(OBJEXT) src/cstacks-utils.$(OBJEXT) \
src/cstacks-aln_utils.$(OBJEXT) src/cstacks-DNASeq.$(OBJEXT) \
- src/cstacks-DNANSeq.$(OBJEXT) src/cstacks-input.$(OBJEXT)
+ src/cstacks-DNANSeq.$(OBJEXT) src/cstacks-input.$(OBJEXT) \
+ src/cstacks-sql_utilities.$(OBJEXT)
cstacks_OBJECTS = $(am_cstacks_OBJECTS)
cstacks_LDADD = $(LDADD)
cstacks_LINK = $(CXXLD) $(cstacks_CXXFLAGS) $(CXXFLAGS) \
@@ -150,7 +151,11 @@ am_genotypes_OBJECTS = src/genotypes-genotypes.$(OBJEXT) \
src/genotypes-stacks.$(OBJEXT) src/genotypes-locus.$(OBJEXT) \
src/genotypes-DNASeq.$(OBJEXT) src/genotypes-DNANSeq.$(OBJEXT) \
src/genotypes-aln_utils.$(OBJEXT) \
- src/genotypes-input.$(OBJEXT)
+ src/genotypes-genotype_dictionaries.$(OBJEXT) \
+ src/genotypes-input.$(OBJEXT) \
+ src/genotypes-sql_utilities.$(OBJEXT) \
+ src/genotypes-renz.$(OBJEXT) \
+ src/genotypes-MetaPopInfo.$(OBJEXT)
genotypes_OBJECTS = $(am_genotypes_OBJECTS)
genotypes_LDADD = $(LDADD)
genotypes_LINK = $(CXXLD) $(genotypes_CXXFLAGS) $(CXXFLAGS) \
@@ -159,7 +164,8 @@ am_hstacks_OBJECTS = src/hstacks-hstacks.$(OBJEXT) \
src/hstacks-stacks.$(OBJEXT) src/hstacks-locus.$(OBJEXT) \
src/hstacks-kmers.$(OBJEXT) src/hstacks-DNASeq.$(OBJEXT) \
src/hstacks-DNANSeq.$(OBJEXT) src/hstacks-utils.$(OBJEXT) \
- src/hstacks-input.$(OBJEXT)
+ src/hstacks-input.$(OBJEXT) \
+ src/hstacks-sql_utilities.$(OBJEXT)
hstacks_OBJECTS = $(am_hstacks_OBJECTS)
hstacks_LDADD = $(LDADD)
hstacks_LINK = $(CXXLD) $(hstacks_CXXFLAGS) $(CXXFLAGS) \
@@ -176,6 +182,7 @@ kmer_filter_LINK = $(CXXLD) $(kmer_filter_CXXFLAGS) $(CXXFLAGS) \
am_phasedstacks_OBJECTS = src/phasedstacks-phasedstacks.$(OBJEXT) \
src/phasedstacks-locus.$(OBJEXT) \
src/phasedstacks-input.$(OBJEXT) \
+ src/phasedstacks-sql_utilities.$(OBJEXT) \
src/phasedstacks-log_utils.$(OBJEXT) \
src/phasedstacks-utils.$(OBJEXT) \
src/phasedstacks-catalog_utils.$(OBJEXT)
@@ -192,7 +199,13 @@ am_populations_OBJECTS = src/populations-populations.$(OBJEXT) \
src/populations-DNASeq.$(OBJEXT) \
src/populations-DNANSeq.$(OBJEXT) \
src/populations-aln_utils.$(OBJEXT) \
- src/populations-input.$(OBJEXT)
+ src/populations-genotype_dictionaries.$(OBJEXT) \
+ src/populations-input.$(OBJEXT) \
+ src/populations-sql_utilities.$(OBJEXT) \
+ src/populations-renz.$(OBJEXT) \
+ src/populations-export_formats.$(OBJEXT) \
+ src/populations-MetaPopInfo.$(OBJEXT) \
+ src/populations-Vcf.$(OBJEXT)
populations_OBJECTS = $(am_populations_OBJECTS)
populations_LDADD = $(LDADD)
populations_LINK = $(CXXLD) $(populations_CXXFLAGS) $(CXXFLAGS) \
@@ -204,7 +217,8 @@ am_process_radtags_OBJECTS = \
src/process_radtags-write.$(OBJEXT) \
src/process_radtags-clean.$(OBJEXT) \
src/process_radtags-file_io.$(OBJEXT) \
- src/process_radtags-input.$(OBJEXT)
+ src/process_radtags-input.$(OBJEXT) \
+ src/process_radtags-renz.$(OBJEXT)
process_radtags_OBJECTS = $(am_process_radtags_OBJECTS)
process_radtags_DEPENDENCIES = $(am__DEPENDENCIES_1)
process_radtags_LINK = $(CXXLD) $(process_radtags_CXXFLAGS) \
@@ -236,7 +250,9 @@ am_rxstacks_OBJECTS = src/rxstacks-rxstacks.$(OBJEXT) \
src/rxstacks-DNANSeq.$(OBJEXT) src/rxstacks-DNASeq.$(OBJEXT) \
src/rxstacks-mst.$(OBJEXT) src/rxstacks-models.$(OBJEXT) \
src/rxstacks-aln_utils.$(OBJEXT) src/rxstacks-utils.$(OBJEXT) \
- src/rxstacks-input.$(OBJEXT)
+ src/rxstacks-input.$(OBJEXT) \
+ src/rxstacks-sql_utilities.$(OBJEXT) \
+ src/rxstacks-MetaPopInfo.$(OBJEXT)
rxstacks_OBJECTS = $(am_rxstacks_OBJECTS)
rxstacks_LDADD = $(LDADD)
rxstacks_LINK = $(CXXLD) $(rxstacks_CXXFLAGS) $(CXXFLAGS) \
@@ -245,7 +261,8 @@ am_sstacks_OBJECTS = src/sstacks-sstacks.$(OBJEXT) \
src/sstacks-stacks.$(OBJEXT) src/sstacks-locus.$(OBJEXT) \
src/sstacks-kmers.$(OBJEXT) src/sstacks-aln_utils.$(OBJEXT) \
src/sstacks-DNASeq.$(OBJEXT) src/sstacks-DNANSeq.$(OBJEXT) \
- src/sstacks-utils.$(OBJEXT) src/sstacks-input.$(OBJEXT)
+ src/sstacks-utils.$(OBJEXT) src/sstacks-input.$(OBJEXT) \
+ src/sstacks-sql_utilities.$(OBJEXT)
sstacks_OBJECTS = $(am_sstacks_OBJECTS)
sstacks_LDADD = $(LDADD)
sstacks_LINK = $(CXXLD) $(sstacks_CXXFLAGS) $(CXXFLAGS) \
@@ -346,12 +363,28 @@ DIST_SOURCES = $(clone_filter_SOURCES) $(cstacks_SOURCES) \
$(populations_SOURCES) $(process_radtags_SOURCES) \
$(process_shortreads_SOURCES) $(pstacks_SOURCES) \
$(rxstacks_SOURCES) $(sstacks_SOURCES) $(ustacks_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
+ ctags-recursive dvi-recursive html-recursive info-recursive \
+ install-data-recursive install-dvi-recursive \
+ install-exec-recursive install-html-recursive \
+ install-info-recursive install-pdf-recursive \
+ install-ps-recursive install-recursive installcheck-recursive \
+ installdirs-recursive pdf-recursive ps-recursive \
+ tags-recursive uninstall-recursive
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
DATA = $(nobase_pkgdata_DATA)
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
+ distclean-recursive maintainer-clean-recursive
+am__recursive_targets = \
+ $(RECURSIVE_TARGETS) \
+ $(RECURSIVE_CLEAN_TARGETS) \
+ $(am__extra_recursive_targets)
+AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
+ cscope check recheck distdir dist dist-all distcheck
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \
$(LISP)config.h.in
# Read a list of newline-separated strings from the standard input,
@@ -373,7 +406,6 @@ am__define_uniq_tagged_files = \
ETAGS = etags
CTAGS = ctags
CSCOPE = cscope
-AM_RECURSIVE_TARGETS = cscope check recheck
am__tty_colors_dummy = \
mgn= red= grn= lgn= blu= brg= std=; \
am__color_tests=no
@@ -549,6 +581,7 @@ TEST_LOGS = $(am__test_logs2:.test.log=.log)
TEST_LOG_DRIVER = $(SHELL) $(top_srcdir)/config/test-driver
TEST_LOG_COMPILE = $(TEST_LOG_COMPILER) $(AM_TEST_LOG_FLAGS) \
$(TEST_LOG_FLAGS)
+DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/config.h.in \
$(top_srcdir)/config/compile $(top_srcdir)/config/depcomp \
$(top_srcdir)/config/install-sh $(top_srcdir)/config/missing \
@@ -564,6 +597,31 @@ am__remove_distdir = \
|| { sleep 5 && rm -rf "$(distdir)"; }; \
else :; fi
am__post_remove_distdir = $(am__remove_distdir)
+am__relativize = \
+ dir0=`pwd`; \
+ sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+ sed_rest='s,^[^/]*/*,,'; \
+ sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+ sed_butlast='s,/*[^/]*$$,,'; \
+ while test -n "$$dir1"; do \
+ first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+ if test "$$first" != "."; then \
+ if test "$$first" = ".."; then \
+ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+ else \
+ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+ if test "$$first2" = "$$first"; then \
+ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+ else \
+ dir2="../$$dir2"; \
+ fi; \
+ dir0="$$dir0"/"$$first"; \
+ fi; \
+ fi; \
+ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+ done; \
+ reldir="$$dir2"
DIST_ARCHIVES = $(distdir).tar.gz
GZIP_ENV = --best
DIST_TARGETS = dist-gzip
@@ -693,21 +751,21 @@ cstacks_SOURCES = src/cstacks.h src/cstacks.cc src/constants.h \
src/kmers.h src/kmers.cc src/utils.h src/utils.cc \
src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc
hstacks_SOURCES = src/hstacks.h src/hstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/kmers.h src/kmers.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/utils.h src/utils.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc
sstacks_SOURCES = src/sstacks.h src/sstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/kmers.h src/kmers.cc \
src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc src/utils.h src/utils.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc
rxstacks_SOURCES = src/rxstacks.h src/rxstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
@@ -715,7 +773,8 @@ rxstacks_SOURCES = src/rxstacks.h src/rxstacks.cc src/constants.h \
src/mst.h src/mst.cc \
src/models.h src/models.cc \
src/aln_utils.h src/aln_utils.cc src/utils.h src/utils.cc \
- src/input.h src/input.cc src/sql_utilities.h
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc \
+ src/MetaPopInfo.h src/MetaPopInfo.cc
process_radtags_SOURCES = src/process_radtags.h src/process_radtags.cc src/constants.h \
src/utils.h src/utils.cc src/log_utils.h src/log_utils.cc \
@@ -723,7 +782,7 @@ process_radtags_SOURCES = src/process_radtags.h src/process_radtags.cc src/const
src/clean.h src/clean.cc \
src/file_io.h src/file_io.cc \
src/input.h src/input.cc src/BustardI.h src/BamUnalignedI.h src/FastqI.h src/gzFastq.h \
- src/renz.h
+ src/renz.h src/renz.cc
process_shortreads_SOURCES = src/process_shortreads.h src/process_shortreads.cc src/constants.h \
src/clean.h src/clean.cc \
@@ -754,21 +813,24 @@ genotypes_SOURCES = src/genotypes.h src/genotypes.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/aln_utils.h src/aln_utils.cc \
- src/PopMap.h src/genotype_dictionaries.h \
- src/input.h src/input.cc src/sql_utilities.h src/renz.h
+ src/PopMap.h src/genotype_dictionaries.h src/genotype_dictionaries.cc\
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc src/renz.h src/renz.cc \
+ src/MetaPopInfo.h src/MetaPopInfo.cc
populations_SOURCES = src/populations.h src/populations.cc src/constants.h \
src/utils.h src/utils.cc src/catalog_utils.h src/catalog_utils.cc \
src/log_utils.h src/log_utils.cc \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
- src/PopMap.h src/PopSum.h src/aln_utils.h src/aln_utils.cc src/genotype_dictionaries.h \
- src/input.h src/input.cc src/sql_utilities.h src/renz.h \
- src/bootstrap.h src/ordered.h src/smoothing.h src/smoothing_utils.h
+ src/PopMap.h src/PopSum.h src/aln_utils.h src/aln_utils.cc src/genotype_dictionaries.h src/genotype_dictionaries.cc \
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc src/renz.h src/renz.cc \
+ src/bootstrap.h src/ordered.h src/smoothing.h src/smoothing_utils.h \
+ src/export_formats.h src/export_formats.cc \
+ src/MetaPopInfo.h src/MetaPopInfo.cc src/Vcf.h src/Vcf.cc
phasedstacks_SOURCES = src/phasedstacks.h src/phasedstacks.cc src/constants.h \
src/locus.h src/locus.cc \
- src/input.h src/input.cc src/sql_utilities.h \
+ src/input.h src/input.cc src/sql_utilities.h src/sql_utilities.cc \
src/log_utils.h src/log_utils.cc \
src/utils.h src/utils.cc src/catalog_utils.h src/catalog_utils.cc
@@ -824,9 +886,10 @@ nobase_pkgdata_DATA = sql/mysql.cnf.dist sql/catalog_index.sql sql/stacks.sql sq
LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) $(abs_top_srcdir)/tests/tap-driver.sh
TESTS = tests/process_radtags.t tests/kmer_filter.t tests/ustacks.t tests/pstacks.t
EXTRA_DIST = $(nobase_pkgdata_DATA) LICENSE INSTALL README ChangeLog $(TESTS)
+SUBDIRS = htslib .
pkglocalstatedir = $(localstatedir)/$(PACKAGE)
all: config.h
- $(MAKE) $(AM_MAKEFLAGS) all-am
+ $(MAKE) $(AM_MAKEFLAGS) all-recursive
.SUFFIXES:
.SUFFIXES: .cc .log .o .obj .test .test$(EXEEXT) .trs
@@ -962,6 +1025,8 @@ src/cstacks-DNANSeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/cstacks-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/cstacks-sql_utilities.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
cstacks$(EXEEXT): $(cstacks_OBJECTS) $(cstacks_DEPENDENCIES) $(EXTRA_cstacks_DEPENDENCIES)
@rm -f cstacks$(EXEEXT)
@@ -1004,8 +1069,16 @@ src/genotypes-DNANSeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/genotypes-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/genotypes-genotype_dictionaries.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/genotypes-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/genotypes-sql_utilities.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/genotypes-renz.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/genotypes-MetaPopInfo.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
genotypes$(EXEEXT): $(genotypes_OBJECTS) $(genotypes_DEPENDENCIES) $(EXTRA_genotypes_DEPENDENCIES)
@rm -f genotypes$(EXEEXT)
@@ -1026,6 +1099,8 @@ src/hstacks-utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/hstacks-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/hstacks-sql_utilities.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
hstacks$(EXEEXT): $(hstacks_OBJECTS) $(hstacks_DEPENDENCIES) $(EXTRA_hstacks_DEPENDENCIES)
@rm -f hstacks$(EXEEXT)
@@ -1050,6 +1125,8 @@ src/phasedstacks-locus.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/phasedstacks-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/phasedstacks-sql_utilities.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/phasedstacks-log_utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/phasedstacks-utils.$(OBJEXT): src/$(am__dirstamp) \
@@ -1078,8 +1155,20 @@ src/populations-DNANSeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/populations-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/populations-genotype_dictionaries.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/populations-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/populations-sql_utilities.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/populations-renz.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/populations-export_formats.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/populations-MetaPopInfo.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/populations-Vcf.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
populations$(EXEEXT): $(populations_OBJECTS) $(populations_DEPENDENCIES) $(EXTRA_populations_DEPENDENCIES)
@rm -f populations$(EXEEXT)
@@ -1098,6 +1187,8 @@ src/process_radtags-file_io.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/process_radtags-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/process_radtags-renz.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
process_radtags$(EXEEXT): $(process_radtags_OBJECTS) $(process_radtags_DEPENDENCIES) $(EXTRA_process_radtags_DEPENDENCIES)
@rm -f process_radtags$(EXEEXT)
@@ -1162,6 +1253,10 @@ src/rxstacks-utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/rxstacks-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/rxstacks-sql_utilities.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/rxstacks-MetaPopInfo.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
rxstacks$(EXEEXT): $(rxstacks_OBJECTS) $(rxstacks_DEPENDENCIES) $(EXTRA_rxstacks_DEPENDENCIES)
@rm -f rxstacks$(EXEEXT)
@@ -1184,6 +1279,8 @@ src/sstacks-utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/sstacks-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/sstacks-sql_utilities.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
sstacks$(EXEEXT): $(sstacks_OBJECTS) $(sstacks_DEPENDENCIES) $(EXTRA_sstacks_DEPENDENCIES)
@rm -f sstacks$(EXEEXT)
@@ -1273,6 +1370,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/cstacks-input.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/cstacks-kmers.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/cstacks-locus.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/cstacks-sql_utilities.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/cstacks-stacks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/cstacks-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/estacks-DNANSeq.Po@am__quote@
@@ -1285,12 +1383,16 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/estacks-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-DNANSeq.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-DNASeq.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-MetaPopInfo.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-aln_utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-catalog_utils.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-genotype_dictionaries.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-genotypes.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-input.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-locus.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-log_utils.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-renz.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-sql_utilities.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-stacks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/genotypes-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/hstacks-DNANSeq.Po@am__quote@
@@ -1299,6 +1401,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/hstacks-input.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/hstacks-kmers.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/hstacks-locus.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/hstacks-sql_utilities.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/hstacks-stacks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/hstacks-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/kmer_filter-input.Po@am__quote@
@@ -1311,15 +1414,22 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/phasedstacks-locus.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/phasedstacks-log_utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/phasedstacks-phasedstacks.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/phasedstacks-sql_utilities.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/phasedstacks-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-DNANSeq.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-DNASeq.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-MetaPopInfo.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-Vcf.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-aln_utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-catalog_utils.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-export_formats.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-genotype_dictionaries.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-input.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-locus.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-log_utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-populations.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-renz.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-sql_utilities.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-stacks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/populations-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_radtags-clean.Po@am__quote@
@@ -1327,6 +1437,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_radtags-input.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_radtags-log_utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_radtags-process_radtags.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_radtags-renz.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_radtags-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_radtags-write.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/process_shortreads-clean.Po@am__quote@
@@ -1347,12 +1458,14 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/pstacks-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-DNANSeq.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-DNASeq.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-MetaPopInfo.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-aln_utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-input.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-locus.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-models.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-mst.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-rxstacks.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-sql_utilities.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-stacks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/rxstacks-utils.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-DNANSeq.Po@am__quote@
@@ -1361,6 +1474,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-input.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-kmers.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-locus.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-sql_utilities.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-sstacks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-stacks.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/sstacks-utils.Po@am__quote@
@@ -1617,6 +1731,20 @@ src/cstacks-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/cstacks-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/cstacks-sql_utilities.o: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -MT src/cstacks-sql_utilities.o -MD -MP -MF src/$(DEPDIR)/cstacks-sql_utilities.Tpo -c -o src/cstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/cstacks-sql_utilities.Tpo src/$(DEPDIR)/cstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/cstacks-sql_utilities.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/cstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+
+src/cstacks-sql_utilities.obj: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -MT src/cstacks-sql_utilities.obj -MD -MP -MF src/$(DEPDIR)/cstacks-sql_utilities.Tpo -c -o src/cstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/cstacks-sql_utilities.Tpo src/$(DEPDIR)/cstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/cstacks-sql_utilities.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/cstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+
src/estacks-estacks.o: src/estacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(estacks_CXXFLAGS) $(CXXFLAGS) -MT src/estacks-estacks.o -MD -MP -MF src/$(DEPDIR)/estacks-estacks.Tpo -c -o src/estacks-estacks.o `test -f 'src/estacks.cc' || echo '$(srcdir)/'`src/estacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/estacks-estacks.Tpo src/$(DEPDIR)/estacks-estacks.Po
@@ -1855,6 +1983,20 @@ src/genotypes-aln_utils.obj: src/aln_utils.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+src/genotypes-genotype_dictionaries.o: src/genotype_dictionaries.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-genotype_dictionaries.o -MD -MP -MF src/$(DEPDIR)/genotypes-genotype_dictionaries.Tpo -c -o src/genotypes-genotype_dictionaries.o `test -f 'src/genotype_dictionaries.cc' || echo '$(srcdir)/'`src/genotype_dictionaries.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-genotype_dictionaries.Tpo src/$(DEPDIR)/genotypes-genotype_dictionaries.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/genotype_dictionaries.cc' object='src/genotypes-genotype_dictionaries.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-genotype_dictionaries.o `test -f 'src/genotype_dictionaries.cc' || echo '$(srcdir)/'`src/genotype_dictionaries.cc
+
+src/genotypes-genotype_dictionaries.obj: src/genotype_dictionaries.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-genotype_dictionaries.obj -MD -MP -MF src/$(DEPDIR)/genotypes-genotype_dictionaries.Tpo -c -o src/genotypes-genotype_dictionaries.obj `if test -f 'src/genotype_dictionaries.cc'; then $(CYGPATH_W) 'src/genotype_dictionaries.cc'; else $(CYGPATH_W) '$(srcdir)/src/genotype_dictionaries.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-genotype_dictionaries.Tpo src/$(DEPDIR)/genotypes-genotype_dictionaries.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/genotype_dictionaries.cc' object='src/genotypes-genotype_dictionaries.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-genotype_dictionaries.obj `if test -f 'src/genotype_dictionaries.cc'; then $(CYGPATH_W) 'src/genotype_dictionaries.cc'; else $(CYGPATH_W) '$(srcdir)/src/genotype_dictionaries.cc'; fi`
+
src/genotypes-input.o: src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-input.o -MD -MP -MF src/$(DEPDIR)/genotypes-input.Tpo -c -o src/genotypes-input.o `test -f 'src/input.cc' || echo '$(srcdir)/'`src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-input.Tpo src/$(DEPDIR)/genotypes-input.Po
@@ -1869,6 +2011,48 @@ src/genotypes-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/genotypes-sql_utilities.o: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-sql_utilities.o -MD -MP -MF src/$(DEPDIR)/genotypes-sql_utilities.Tpo -c -o src/genotypes-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-sql_utilities.Tpo src/$(DEPDIR)/genotypes-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/genotypes-sql_utilities.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+
+src/genotypes-sql_utilities.obj: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-sql_utilities.obj -MD -MP -MF src/$(DEPDIR)/genotypes-sql_utilities.Tpo -c -o src/genotypes-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-sql_utilities.Tpo src/$(DEPDIR)/genotypes-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/genotypes-sql_utilities.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+
+src/genotypes-renz.o: src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-renz.o -MD -MP -MF src/$(DEPDIR)/genotypes-renz.Tpo -c -o src/genotypes-renz.o `test -f 'src/renz.cc' || echo '$(srcdir)/'`src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-renz.Tpo src/$(DEPDIR)/genotypes-renz.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/renz.cc' object='src/genotypes-renz.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-renz.o `test -f 'src/renz.cc' || echo '$(srcdir)/'`src/renz.cc
+
+src/genotypes-renz.obj: src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-renz.obj -MD -MP -MF src/$(DEPDIR)/genotypes-renz.Tpo -c -o src/genotypes-renz.obj `if test -f 'src/renz.cc'; then $(CYGPATH_W) 'src/renz.cc'; else $(CYGPATH_W) '$(srcdir)/src/renz.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-renz.Tpo src/$(DEPDIR)/genotypes-renz.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/renz.cc' object='src/genotypes-renz.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-renz.obj `if test -f 'src/renz.cc'; then $(CYGPATH_W) 'src/renz.cc'; else $(CYGPATH_W) '$(srcdir)/src/renz.cc'; fi`
+
+src/genotypes-MetaPopInfo.o: src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-MetaPopInfo.o -MD -MP -MF src/$(DEPDIR)/genotypes-MetaPopInfo.Tpo -c -o src/genotypes-MetaPopInfo.o `test -f 'src/MetaPopInfo.cc' || echo '$(srcdir)/'`src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-MetaPopInfo.Tpo src/$(DEPDIR)/genotypes-MetaPopInfo.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/MetaPopInfo.cc' object='src/genotypes-MetaPopInfo.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-MetaPopInfo.o `test -f 'src/MetaPopInfo.cc' || echo '$(srcdir)/'`src/MetaPopInfo.cc
+
+src/genotypes-MetaPopInfo.obj: src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-MetaPopInfo.obj -MD -MP -MF src/$(DEPDIR)/genotypes-MetaPopInfo.Tpo -c -o src/genotypes-MetaPopInfo.obj `if test -f 'src/MetaPopInfo.cc'; then $(CYGPATH_W) 'src/MetaPopInfo.cc'; else $(CYGPATH_W) '$(srcdir)/src/MetaPopInfo.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-MetaPopInfo.Tpo src/$(DEPDIR)/genotypes-MetaPopInfo.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/MetaPopInfo.cc' object='src/genotypes-MetaPopInfo.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-MetaPopInfo.obj `if test -f 'src/MetaPopInfo.cc'; then $(CYGPATH_W) 'src/MetaPopInfo.cc'; else $(CYGPATH_W) '$(srcdir)/src/MetaPopInfo.cc'; fi`
+
src/hstacks-hstacks.o: src/hstacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hstacks_CXXFLAGS) $(CXXFLAGS) -MT src/hstacks-hstacks.o -MD -MP -MF src/$(DEPDIR)/hstacks-hstacks.Tpo -c -o src/hstacks-hstacks.o `test -f 'src/hstacks.cc' || echo '$(srcdir)/'`src/hstacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/hstacks-hstacks.Tpo src/$(DEPDIR)/hstacks-hstacks.Po
@@ -1981,6 +2165,20 @@ src/hstacks-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/hstacks-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/hstacks-sql_utilities.o: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hstacks_CXXFLAGS) $(CXXFLAGS) -MT src/hstacks-sql_utilities.o -MD -MP -MF src/$(DEPDIR)/hstacks-sql_utilities.Tpo -c -o src/hstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/hstacks-sql_utilities.Tpo src/$(DEPDIR)/hstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/hstacks-sql_utilities.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/hstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+
+src/hstacks-sql_utilities.obj: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hstacks_CXXFLAGS) $(CXXFLAGS) -MT src/hstacks-sql_utilities.obj -MD -MP -MF src/$(DEPDIR)/hstacks-sql_utilities.Tpo -c -o src/hstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/hstacks-sql_utilities.Tpo src/$(DEPDIR)/hstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/hstacks-sql_utilities.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/hstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+
src/kmer_filter-kmer_filter.o: src/kmer_filter.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(kmer_filter_CXXFLAGS) $(CXXFLAGS) -MT src/kmer_filter-kmer_filter.o -MD -MP -MF src/$(DEPDIR)/kmer_filter-kmer_filter.Tpo -c -o src/kmer_filter-kmer_filter.o `test -f 'src/kmer_filter.cc' || echo '$(srcdir)/'`src/kmer_filter.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/kmer_filter-kmer_filter.Tpo src/$(DEPDIR)/kmer_filter-kmer_filter.Po
@@ -2093,6 +2291,20 @@ src/phasedstacks-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(phasedstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/phasedstacks-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/phasedstacks-sql_utilities.o: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(phasedstacks_CXXFLAGS) $(CXXFLAGS) -MT src/phasedstacks-sql_utilities.o -MD -MP -MF src/$(DEPDIR)/phasedstacks-sql_utilities.Tpo -c -o src/phasedstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/phasedstacks-sql_utilities.Tpo src/$(DEPDIR)/phasedstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/phasedstacks-sql_utilities.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(phasedstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/phasedstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+
+src/phasedstacks-sql_utilities.obj: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(phasedstacks_CXXFLAGS) $(CXXFLAGS) -MT src/phasedstacks-sql_utilities.obj -MD -MP -MF src/$(DEPDIR)/phasedstacks-sql_utilities.Tpo -c -o src/phasedstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/phasedstacks-sql_utilities.Tpo src/$(DEPDIR)/phasedstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/phasedstacks-sql_utilities.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(phasedstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/phasedstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+
src/phasedstacks-log_utils.o: src/log_utils.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(phasedstacks_CXXFLAGS) $(CXXFLAGS) -MT src/phasedstacks-log_utils.o -MD -MP -MF src/$(DEPDIR)/phasedstacks-log_utils.Tpo -c -o src/phasedstacks-log_utils.o `test -f 'src/log_utils.cc' || echo '$(srcdir)/'`src/log_utils.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/phasedstacks-log_utils.Tpo src/$(DEPDIR)/phasedstacks-log_utils.Po
@@ -2261,6 +2473,20 @@ src/populations-aln_utils.obj: src/aln_utils.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+src/populations-genotype_dictionaries.o: src/genotype_dictionaries.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-genotype_dictionaries.o -MD -MP -MF src/$(DEPDIR)/populations-genotype_dictionaries.Tpo -c -o src/populations-genotype_dictionaries.o `test -f 'src/genotype_dictionaries.cc' || echo '$(srcdir)/'`src/genotype_dictionaries.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-genotype_dictionaries.Tpo src/$(DEPDIR)/populations-genotype_dictionaries.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/genotype_dictionaries.cc' object='src/populations-genotype_dictionaries.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-genotype_dictionaries.o `test -f 'src/genotype_dictionaries.cc' || echo '$(srcdir)/'`src/genotype_dictionaries.cc
+
+src/populations-genotype_dictionaries.obj: src/genotype_dictionaries.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-genotype_dictionaries.obj -MD -MP -MF src/$(DEPDIR)/populations-genotype_dictionaries.Tpo -c -o src/populations-genotype_dictionaries.obj `if test -f 'src/genotype_dictionaries.cc'; then $(CYGPATH_W) 'src/genotype_dictionaries.cc'; else $(CYGPATH_W) '$(srcdir)/src/genotype_dictionaries.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-genotype_dictionaries.Tpo src/$(DEPDIR)/populations-genotype_dictionaries.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/genotype_dictionaries.cc' object='src/populations-genotype_dictionaries.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-genotype_dictionaries.obj `if test -f 'src/genotype_dictionaries.cc'; then $(CYGPATH_W) 'src/genotype_dictionaries.cc'; else $(CYGPATH_W) '$(srcdir)/src/genotype_dictionaries.cc'; fi`
+
src/populations-input.o: src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-input.o -MD -MP -MF src/$(DEPDIR)/populations-input.Tpo -c -o src/populations-input.o `test -f 'src/input.cc' || echo '$(srcdir)/'`src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-input.Tpo src/$(DEPDIR)/populations-input.Po
@@ -2275,6 +2501,76 @@ src/populations-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/populations-sql_utilities.o: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-sql_utilities.o -MD -MP -MF src/$(DEPDIR)/populations-sql_utilities.Tpo -c -o src/populations-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-sql_utilities.Tpo src/$(DEPDIR)/populations-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/populations-sql_utilities.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+
+src/populations-sql_utilities.obj: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-sql_utilities.obj -MD -MP -MF src/$(DEPDIR)/populations-sql_utilities.Tpo -c -o src/populations-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-sql_utilities.Tpo src/$(DEPDIR)/populations-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/populations-sql_utilities.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+
+src/populations-renz.o: src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-renz.o -MD -MP -MF src/$(DEPDIR)/populations-renz.Tpo -c -o src/populations-renz.o `test -f 'src/renz.cc' || echo '$(srcdir)/'`src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-renz.Tpo src/$(DEPDIR)/populations-renz.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/renz.cc' object='src/populations-renz.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-renz.o `test -f 'src/renz.cc' || echo '$(srcdir)/'`src/renz.cc
+
+src/populations-renz.obj: src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-renz.obj -MD -MP -MF src/$(DEPDIR)/populations-renz.Tpo -c -o src/populations-renz.obj `if test -f 'src/renz.cc'; then $(CYGPATH_W) 'src/renz.cc'; else $(CYGPATH_W) '$(srcdir)/src/renz.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-renz.Tpo src/$(DEPDIR)/populations-renz.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/renz.cc' object='src/populations-renz.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-renz.obj `if test -f 'src/renz.cc'; then $(CYGPATH_W) 'src/renz.cc'; else $(CYGPATH_W) '$(srcdir)/src/renz.cc'; fi`
+
+src/populations-export_formats.o: src/export_formats.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-export_formats.o -MD -MP -MF src/$(DEPDIR)/populations-export_formats.Tpo -c -o src/populations-export_formats.o `test -f 'src/export_formats.cc' || echo '$(srcdir)/'`src/export_formats.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-export_formats.Tpo src/$(DEPDIR)/populations-export_formats.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/export_formats.cc' object='src/populations-export_formats.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-export_formats.o `test -f 'src/export_formats.cc' || echo '$(srcdir)/'`src/export_formats.cc
+
+src/populations-export_formats.obj: src/export_formats.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-export_formats.obj -MD -MP -MF src/$(DEPDIR)/populations-export_formats.Tpo -c -o src/populations-export_formats.obj `if test -f 'src/export_formats.cc'; then $(CYGPATH_W) 'src/export_formats.cc'; else $(CYGPATH_W) '$(srcdir)/src/export_formats.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-export_formats.Tpo src/$(DEPDIR)/populations-export_formats.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/export_formats.cc' object='src/populations-export_formats.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-export_formats.obj `if test -f 'src/export_formats.cc'; then $(CYGPATH_W) 'src/export_formats.cc'; else $(CYGPATH_W) '$(srcdir)/src/export_formats.cc'; fi`
+
+src/populations-MetaPopInfo.o: src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-MetaPopInfo.o -MD -MP -MF src/$(DEPDIR)/populations-MetaPopInfo.Tpo -c -o src/populations-MetaPopInfo.o `test -f 'src/MetaPopInfo.cc' || echo '$(srcdir)/'`src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-MetaPopInfo.Tpo src/$(DEPDIR)/populations-MetaPopInfo.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/MetaPopInfo.cc' object='src/populations-MetaPopInfo.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-MetaPopInfo.o `test -f 'src/MetaPopInfo.cc' || echo '$(srcdir)/'`src/MetaPopInfo.cc
+
+src/populations-MetaPopInfo.obj: src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-MetaPopInfo.obj -MD -MP -MF src/$(DEPDIR)/populations-MetaPopInfo.Tpo -c -o src/populations-MetaPopInfo.obj `if test -f 'src/MetaPopInfo.cc'; then $(CYGPATH_W) 'src/MetaPopInfo.cc'; else $(CYGPATH_W) '$(srcdir)/src/MetaPopInfo.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-MetaPopInfo.Tpo src/$(DEPDIR)/populations-MetaPopInfo.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/MetaPopInfo.cc' object='src/populations-MetaPopInfo.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-MetaPopInfo.obj `if test -f 'src/MetaPopInfo.cc'; then $(CYGPATH_W) 'src/MetaPopInfo.cc'; else $(CYGPATH_W) '$(srcdir)/src/MetaPopInfo.cc'; fi`
+
+src/populations-Vcf.o: src/Vcf.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-Vcf.o -MD -MP -MF src/$(DEPDIR)/populations-Vcf.Tpo -c -o src/populations-Vcf.o `test -f 'src/Vcf.cc' || echo '$(srcdir)/'`src/Vcf.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-Vcf.Tpo src/$(DEPDIR)/populations-Vcf.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/Vcf.cc' object='src/populations-Vcf.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-Vcf.o `test -f 'src/Vcf.cc' || echo '$(srcdir)/'`src/Vcf.cc
+
+src/populations-Vcf.obj: src/Vcf.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-Vcf.obj -MD -MP -MF src/$(DEPDIR)/populations-Vcf.Tpo -c -o src/populations-Vcf.obj `if test -f 'src/Vcf.cc'; then $(CYGPATH_W) 'src/Vcf.cc'; else $(CYGPATH_W) '$(srcdir)/src/Vcf.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-Vcf.Tpo src/$(DEPDIR)/populations-Vcf.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/Vcf.cc' object='src/populations-Vcf.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-Vcf.obj `if test -f 'src/Vcf.cc'; then $(CYGPATH_W) 'src/Vcf.cc'; else $(CYGPATH_W) '$(srcdir)/src/Vcf.cc'; fi`
+
src/process_radtags-process_radtags.o: src/process_radtags.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(process_radtags_CXXFLAGS) $(CXXFLAGS) -MT src/process_radtags-process_radtags.o -MD -MP -MF src/$(DEPDIR)/process_radtags-process_radtags.Tpo -c -o src/process_radtags-process_radtags.o `test -f 'src/process_radtags.cc' || echo '$(srcdir)/'`src/process_radtags.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/process_radtags-process_radtags.Tpo src/$(DEPDIR)/process_radtags-process_radtags.Po
@@ -2373,6 +2669,20 @@ src/process_radtags-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(process_radtags_CXXFLAGS) $(CXXFLAGS) -c -o src/process_radtags-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/process_radtags-renz.o: src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(process_radtags_CXXFLAGS) $(CXXFLAGS) -MT src/process_radtags-renz.o -MD -MP -MF src/$(DEPDIR)/process_radtags-renz.Tpo -c -o src/process_radtags-renz.o `test -f 'src/renz.cc' || echo '$(srcdir)/'`src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/process_radtags-renz.Tpo src/$(DEPDIR)/process_radtags-renz.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/renz.cc' object='src/process_radtags-renz.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(process_radtags_CXXFLAGS) $(CXXFLAGS) -c -o src/process_radtags-renz.o `test -f 'src/renz.cc' || echo '$(srcdir)/'`src/renz.cc
+
+src/process_radtags-renz.obj: src/renz.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(process_radtags_CXXFLAGS) $(CXXFLAGS) -MT src/process_radtags-renz.obj -MD -MP -MF src/$(DEPDIR)/process_radtags-renz.Tpo -c -o src/process_radtags-renz.obj `if test -f 'src/renz.cc'; then $(CYGPATH_W) 'src/renz.cc'; else $(CYGPATH_W) '$(srcdir)/src/renz.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/process_radtags-renz.Tpo src/$(DEPDIR)/process_radtags-renz.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/renz.cc' object='src/process_radtags-renz.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(process_radtags_CXXFLAGS) $(CXXFLAGS) -c -o src/process_radtags-renz.obj `if test -f 'src/renz.cc'; then $(CYGPATH_W) 'src/renz.cc'; else $(CYGPATH_W) '$(srcdir)/src/renz.cc'; fi`
+
src/process_shortreads-process_shortreads.o: src/process_shortreads.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(process_shortreads_CXXFLAGS) $(CXXFLAGS) -MT src/process_shortreads-process_shortreads.o -MD -MP -MF src/$(DEPDIR)/process_shortreads-process_shortreads.Tpo -c -o src/process_shortreads-process_shortreads.o `test -f 'src/process_shortreads.cc' || echo '$(srcdir)/'`src/process_shortreads.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/process_shortreads-process_shortreads.Tpo src/$(DEPDIR)/process_shortreads-process_shortreads.Po
@@ -2737,6 +3047,34 @@ src/rxstacks-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/rxstacks-sql_utilities.o: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -MT src/rxstacks-sql_utilities.o -MD -MP -MF src/$(DEPDIR)/rxstacks-sql_utilities.Tpo -c -o src/rxstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/rxstacks-sql_utilities.Tpo src/$(DEPDIR)/rxstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/rxstacks-sql_utilities.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+
+src/rxstacks-sql_utilities.obj: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -MT src/rxstacks-sql_utilities.obj -MD -MP -MF src/$(DEPDIR)/rxstacks-sql_utilities.Tpo -c -o src/rxstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/rxstacks-sql_utilities.Tpo src/$(DEPDIR)/rxstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/rxstacks-sql_utilities.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+
+src/rxstacks-MetaPopInfo.o: src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -MT src/rxstacks-MetaPopInfo.o -MD -MP -MF src/$(DEPDIR)/rxstacks-MetaPopInfo.Tpo -c -o src/rxstacks-MetaPopInfo.o `test -f 'src/MetaPopInfo.cc' || echo '$(srcdir)/'`src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/rxstacks-MetaPopInfo.Tpo src/$(DEPDIR)/rxstacks-MetaPopInfo.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/MetaPopInfo.cc' object='src/rxstacks-MetaPopInfo.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-MetaPopInfo.o `test -f 'src/MetaPopInfo.cc' || echo '$(srcdir)/'`src/MetaPopInfo.cc
+
+src/rxstacks-MetaPopInfo.obj: src/MetaPopInfo.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -MT src/rxstacks-MetaPopInfo.obj -MD -MP -MF src/$(DEPDIR)/rxstacks-MetaPopInfo.Tpo -c -o src/rxstacks-MetaPopInfo.obj `if test -f 'src/MetaPopInfo.cc'; then $(CYGPATH_W) 'src/MetaPopInfo.cc'; else $(CYGPATH_W) '$(srcdir)/src/MetaPopInfo.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/rxstacks-MetaPopInfo.Tpo src/$(DEPDIR)/rxstacks-MetaPopInfo.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/MetaPopInfo.cc' object='src/rxstacks-MetaPopInfo.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-MetaPopInfo.obj `if test -f 'src/MetaPopInfo.cc'; then $(CYGPATH_W) 'src/MetaPopInfo.cc'; else $(CYGPATH_W) '$(srcdir)/src/MetaPopInfo.cc'; fi`
+
src/sstacks-sstacks.o: src/sstacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-sstacks.o -MD -MP -MF src/$(DEPDIR)/sstacks-sstacks.Tpo -c -o src/sstacks-sstacks.o `test -f 'src/sstacks.cc' || echo '$(srcdir)/'`src/sstacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-sstacks.Tpo src/$(DEPDIR)/sstacks-sstacks.Po
@@ -2863,6 +3201,20 @@ src/sstacks-input.obj: src/input.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-input.obj `if test -f 'src/input.cc'; then $(CYGPATH_W) 'src/input.cc'; else $(CYGPATH_W) '$(srcdir)/src/input.cc'; fi`
+src/sstacks-sql_utilities.o: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-sql_utilities.o -MD -MP -MF src/$(DEPDIR)/sstacks-sql_utilities.Tpo -c -o src/sstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-sql_utilities.Tpo src/$(DEPDIR)/sstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/sstacks-sql_utilities.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-sql_utilities.o `test -f 'src/sql_utilities.cc' || echo '$(srcdir)/'`src/sql_utilities.cc
+
+src/sstacks-sql_utilities.obj: src/sql_utilities.cc
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-sql_utilities.obj -MD -MP -MF src/$(DEPDIR)/sstacks-sql_utilities.Tpo -c -o src/sstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-sql_utilities.Tpo src/$(DEPDIR)/sstacks-sql_utilities.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/sql_utilities.cc' object='src/sstacks-sql_utilities.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-sql_utilities.obj `if test -f 'src/sql_utilities.cc'; then $(CYGPATH_W) 'src/sql_utilities.cc'; else $(CYGPATH_W) '$(srcdir)/src/sql_utilities.cc'; fi`
+
src/ustacks-ustacks.o: src/ustacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ustacks_CXXFLAGS) $(CXXFLAGS) -MT src/ustacks-ustacks.o -MD -MP -MF src/$(DEPDIR)/ustacks-ustacks.Tpo -c -o src/ustacks-ustacks.o `test -f 'src/ustacks.cc' || echo '$(srcdir)/'`src/ustacks.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/ustacks-ustacks.Tpo src/$(DEPDIR)/ustacks-ustacks.Po
@@ -3055,14 +3407,61 @@ uninstall-nobase_pkgdataDATA:
$(am__nobase_strip_setup); files=`$(am__nobase_strip)`; \
dir='$(DESTDIR)$(pkgdatadir)'; $(am__uninstall_files_from_dir)
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run 'make' without going through this Makefile.
+# To change the values of 'make' variables: instead of editing Makefiles,
+# (1) if the variable is set in 'config.status', edit 'config.status'
+# (which will cause the Makefiles to be regenerated when you run 'make');
+# (2) otherwise, pass the desired values on the 'make' command line.
+$(am__recursive_targets):
+ @fail=; \
+ if $(am__make_keepgoing); then \
+ failcom='fail=yes'; \
+ else \
+ failcom='exit 1'; \
+ fi; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ case "$@" in \
+ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+ *) list='$(SUBDIRS)' ;; \
+ esac; \
+ for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || eval $$failcom; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
-tags: tags-am
+tags: tags-recursive
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
+ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+ include_option=--etags-include; \
+ empty_fix=.; \
+ else \
+ include_option=--include; \
+ empty_fix=; \
+ fi; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test ! -f $$subdir/TAGS || \
+ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
@@ -3075,7 +3474,7 @@ tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$$unique; \
fi; \
fi
-ctags: ctags-am
+ctags: ctags-recursive
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
@@ -3094,7 +3493,7 @@ cscope: cscope.files
clean-cscope:
-rm -f cscope.files
cscope.files: clean-cscope cscopelist
-cscopelist: cscopelist-am
+cscopelist: cscopelist-recursive
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
@@ -3330,6 +3729,31 @@ distdir: $(DISTFILES)
|| exit 1; \
fi; \
done
+ @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ $(am__make_dryrun) \
+ || test -d "$(distdir)/$$subdir" \
+ || $(MKDIR_P) "$(distdir)/$$subdir" \
+ || exit 1; \
+ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+ $(am__relativize); \
+ new_distdir=$$reldir; \
+ dir1=$$subdir; dir2="$(top_distdir)"; \
+ $(am__relativize); \
+ new_top_distdir=$$reldir; \
+ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+ ($(am__cd) $$subdir && \
+ $(MAKE) $(AM_MAKEFLAGS) \
+ top_distdir="$$new_top_distdir" \
+ distdir="$$new_distdir" \
+ am__remove_distdir=: \
+ am__skip_length_check=: \
+ am__skip_mode_fix=: \
+ distdir) \
+ || exit 1; \
+ fi; \
+ done
-test -n "$(am__skip_mode_fix)" \
|| find "$(distdir)" -type d ! -perm -755 \
-exec chmod u+rwx,go+rx {} \; -o \
@@ -3463,21 +3887,22 @@ distcleancheck: distclean
exit 1; } >&2
check-am: all-am
$(MAKE) $(AM_MAKEFLAGS) check-TESTS
-check: check-am
+check: check-recursive
all-am: Makefile $(PROGRAMS) $(SCRIPTS) $(DATA) config.h
-installdirs:
+installdirs: installdirs-recursive
+installdirs-am:
for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(pkgdatadir)"; do \
test -z "$$dir" || $(MKDIR_P) "$$dir"; \
done
-install: install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-installcheck: installcheck-am
+installcheck: installcheck-recursive
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
@@ -3504,74 +3929,74 @@ distclean-generic:
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
-clean: clean-am
+clean: clean-recursive
clean-am: clean-binPROGRAMS clean-generic mostlyclean-am
-distclean: distclean-am
+distclean: distclean-recursive
-rm -f $(am__CONFIG_DISTCLEAN_FILES)
-rm -rf src/$(DEPDIR)
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-hdr distclean-tags
-dvi: dvi-am
+dvi: dvi-recursive
dvi-am:
-html: html-am
+html: html-recursive
html-am:
-info: info-am
+info: info-recursive
info-am:
install-data-am: install-nobase_pkgdataDATA
@$(NORMAL_INSTALL)
$(MAKE) $(AM_MAKEFLAGS) install-data-hook
-install-dvi: install-dvi-am
+install-dvi: install-dvi-recursive
install-dvi-am:
install-exec-am: install-binPROGRAMS install-dist_binSCRIPTS
-install-html: install-html-am
+install-html: install-html-recursive
install-html-am:
-install-info: install-info-am
+install-info: install-info-recursive
install-info-am:
install-man:
-install-pdf: install-pdf-am
+install-pdf: install-pdf-recursive
install-pdf-am:
-install-ps: install-ps-am
+install-ps: install-ps-recursive
install-ps-am:
installcheck-am:
-maintainer-clean: maintainer-clean-am
+maintainer-clean: maintainer-clean-recursive
-rm -f $(am__CONFIG_DISTCLEAN_FILES)
-rm -rf $(top_srcdir)/autom4te.cache
-rm -rf src/$(DEPDIR)
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
-mostlyclean: mostlyclean-am
+mostlyclean: mostlyclean-recursive
mostlyclean-am: mostlyclean-compile mostlyclean-generic
-pdf: pdf-am
+pdf: pdf-recursive
pdf-am:
-ps: ps-am
+ps: ps-recursive
ps-am:
@@ -3579,23 +4004,24 @@ uninstall-am: uninstall-binPROGRAMS uninstall-dist_binSCRIPTS \
uninstall-nobase_pkgdataDATA
@$(NORMAL_INSTALL)
$(MAKE) $(AM_MAKEFLAGS) uninstall-hook
-.MAKE: all check-am install-am install-data-am install-strip \
- uninstall-am
-
-.PHONY: CTAGS GTAGS TAGS all all-am am--refresh check check-TESTS \
- check-am clean clean-binPROGRAMS clean-cscope clean-generic \
- cscope cscopelist-am ctags ctags-am dist dist-all dist-bzip2 \
- dist-gzip dist-lzip dist-shar dist-tarZ dist-xz dist-zip \
- distcheck distclean distclean-compile distclean-generic \
- distclean-hdr distclean-tags distcleancheck distdir \
- distuninstallcheck dvi dvi-am html html-am info info-am \
- install install-am install-binPROGRAMS install-data \
- install-data-am install-data-hook install-dist_binSCRIPTS \
- install-dvi install-dvi-am install-exec install-exec-am \
- install-html install-html-am install-info install-info-am \
- install-man install-nobase_pkgdataDATA install-pdf \
- install-pdf-am install-ps install-ps-am install-strip \
- installcheck installcheck-am installdirs maintainer-clean \
+.MAKE: $(am__recursive_targets) all check-am install-am \
+ install-data-am install-strip uninstall-am
+
+.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
+ am--refresh check check-TESTS check-am clean clean-binPROGRAMS \
+ clean-cscope clean-generic cscope cscopelist-am ctags ctags-am \
+ dist dist-all dist-bzip2 dist-gzip dist-lzip dist-shar \
+ dist-tarZ dist-xz dist-zip distcheck distclean \
+ distclean-compile distclean-generic distclean-hdr \
+ distclean-tags distcleancheck distdir distuninstallcheck dvi \
+ dvi-am html html-am info info-am install install-am \
+ install-binPROGRAMS install-data install-data-am \
+ install-data-hook install-dist_binSCRIPTS install-dvi \
+ install-dvi-am install-exec install-exec-am install-html \
+ install-html-am install-info install-info-am install-man \
+ install-nobase_pkgdataDATA install-pdf install-pdf-am \
+ install-ps install-ps-am install-strip installcheck \
+ installcheck-am installdirs installdirs-am maintainer-clean \
maintainer-clean-generic mostlyclean mostlyclean-compile \
mostlyclean-generic pdf pdf-am ps ps-am recheck tags tags-am \
uninstall uninstall-am uninstall-binPROGRAMS \
diff --git a/README b/README
index 6ac640c..82774f4 100644
--- a/README
+++ b/README
@@ -25,16 +25,7 @@ cpan tool.
1. Install optional components:
-The performance of Stacks can be improved by installing two, optional external libraries.
-First, to enable reading of BAM files, install Samtools:
-
- http://samtools.sourceforge.net/.
-
- Note: Stacks is not yet compatible with newer Samtools versions that use HTSlib. For
- now, you must use Samtools version 0.1.19, which was the last non-HTSlib version:
- http://sourceforge.net/projects/samtools/files/samtools/0.1.19/
-
-Second, to lower memory usage install Google's SparseHash class.
+If you would like to lower memory usage install Google's SparseHash class.
http://code.google.com/p/sparsehash/
@@ -43,16 +34,14 @@ package manager. If you are using Ubuntu, you can install the following packages
% sudo apt-get install libdbd-mysql-perl
% sudo apt-get install libsparsehash-dev
-% sudo apt-get install samtools
-% sudo apt-get install libbam-dev
A similar set of commands can be executed on Debian using apt-get, or on a RedHat derived Linux
system using yum, or another package manager on other Linux distributions.
2. Build the software. Stacks uses the standard autotools install:
-% tar xfvz stacks_x.xx.tar.gz
-% cd stacks_x.xx
+% tar xfvz stacks-x.xx.tar.gz
+% cd stacks-x.xx
% ./configure
You can change the root of the install location (/usr/local/ on most
@@ -61,21 +50,9 @@ the configure script.
% ./configure --prefix=/home/smith/local
-You can enable Sparsehash and BAM by adding the following options:
-
-% ./configure --enable-sparsehash --enable-bam
-
-You probably need to specify the BAM library location:
-
-% ./configure --enable-bam \
- --with-bam-include-path=/usr/local/include/bam \
- --with-bam-lib-path=/usr/local/lib
-
-Or, if you installed with Ubuntu packages:
+You can enable Sparsehash by adding the following option:
-% ./configure --enable-bam \
- --with-bam-include-path=/usr/include/samtools \
- --with-bam-lib-path=/usr/lib
+% ./configure --enable-sparsehash
% make
diff --git a/configure b/configure
index 02e451f..25d6f19 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Stacks 1.40.
+# Generated by GNU Autoconf 2.69 for Stacks 1.42.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Stacks'
PACKAGE_TARNAME='stacks'
-PACKAGE_VERSION='1.40'
-PACKAGE_STRING='Stacks 1.40'
+PACKAGE_VERSION='1.42'
+PACKAGE_STRING='Stacks 1.42'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -724,8 +724,6 @@ ac_user_opts='
enable_option_checking
enable_silent_rules
enable_bam
-with_bam_include_path
-with_bam_lib_path
enable_sparsehash
with_sparsehash_include_path
enable_dependency_tracking
@@ -1283,7 +1281,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Stacks 1.40 to adapt to many kinds of systems.
+\`configure' configures Stacks 1.42 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1349,7 +1347,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Stacks 1.40:";;
+ short | recursive ) echo "Configuration of Stacks 1.42:";;
esac
cat <<\_ACEOF
@@ -1359,8 +1357,7 @@ Optional Features:
--enable-FEATURE[=ARG] include FEATURE [ARG=yes]
--enable-silent-rules less verbose build output (undo: "make V=1")
--disable-silent-rules verbose build output (undo: "make V=0")
- --enable-bam Enable Samtools' use of BAM files (requires BAM
- library to be installed).
+ --disable-bam Disable use of BAM files through HTSLib.
--enable-sparsehash Enable the use of Google Sparsehash (must be
installed).
--enable-dependency-tracking
@@ -1372,9 +1369,6 @@ Optional Features:
Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
- --with-bam-include-path location of Samtools BAM headers, defaults to
- /usr/include/bam
- --with-bam-lib-path location of Samtools BAM library
--with-sparsehash-include-path
location of Google Sparsehash headers
@@ -1456,7 +1450,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Stacks configure 1.40
+Stacks configure 1.42
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1913,7 +1907,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Stacks $as_me 1.40, which was
+It was created by Stacks $as_me 1.42, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2776,7 +2770,7 @@ fi
# Define the identity of the package.
PACKAGE='stacks'
- VERSION='1.40'
+ VERSION='1.42'
cat >>confdefs.h <<_ACEOF
@@ -2874,42 +2868,32 @@ ac_config_headers="$ac_config_headers config.h"
-# Get BAM library and include locations if requested
+#
+# Get BAM, aka HTSLib library and include locations if requested
+#
# Check whether --enable-bam was given.
if test "${enable_bam+set}" = set; then :
enableval=$enable_bam;
fi
-if test "x$enable_bam" = "xyes"; then :
+if test "x$enable_bam" != "xno"; then :
$as_echo "#define HAVE_BAM 1" >>confdefs.h
-# Check whether --with-bam-include-path was given.
-if test "${with_bam_include_path+set}" = set; then :
- withval=$with_bam_include_path; BAM_CFLAGS="-I$withval"
-else
- BAM_CFLAGS='-I/usr/include/bam'
-fi
-
-
-
-
-# Check whether --with-bam-lib-path was given.
-if test "${with_bam_lib_path+set}" = set; then :
- withval=$with_bam_lib_path; BAM_LIBS="$withval/libbam.a"
-else
- BAM_LIBS='/usr/lib/libbam.a'
-fi
+BAM_CFLAGS='-I./htslib/htslib'
+BAM_LIBS='./htslib/libhts.a'
fi
+#
# Enable use of Google Sparsehash and get include location if requested.
+#
# Check whether --enable-sparsehash was given.
if test "${enable_sparsehash+set}" = set; then :
enableval=$enable_sparsehash;
@@ -6225,7 +6209,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Stacks $as_me 1.40, which was
+This file was extended by Stacks $as_me 1.42, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6291,7 +6275,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-Stacks config.status 1.40
+Stacks config.status 1.42
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index dac7cdd..fb4e057 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,35 +2,32 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([Stacks], [1.40])
+AC_INIT([Stacks], [1.42])
AC_CONFIG_AUX_DIR([config])
AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
AC_CONFIG_SRCDIR([src/ustacks.cc])
AC_CONFIG_HEADERS([config.h])
m4_pattern_allow([AC_OPENMP])
-# Get BAM library and include locations if requested
+#
+# Get BAM, aka HTSLib library and include locations if requested
+#
AC_ARG_ENABLE([bam],
- AS_HELP_STRING([--enable-bam], [Enable Samtools' use of BAM files (requires BAM library to be installed).]))
+ AS_HELP_STRING([--disable-bam], [Disable use of BAM files through HTSLib.]))
-AS_IF([test "x$enable_bam" = "xyes"], [
+AS_IF([test "x$enable_bam" != "xno"], [
AC_DEFINE([HAVE_BAM], [1], [Enable compilation with Samtools BAM library])
-AC_ARG_WITH([bam-include-path],
- [AS_HELP_STRING([--with-bam-include-path],
- [location of Samtools BAM headers, defaults to /usr/include/bam])],
- [BAM_CFLAGS="-I$withval"],
- [BAM_CFLAGS='-I/usr/include/bam'])
-AC_SUBST([BAM_CFLAGS])
-AC_ARG_WITH([bam-lib-path],
- [AS_HELP_STRING([--with-bam-lib-path], [location of Samtools BAM library])],
- [BAM_LIBS="$withval/libbam.a"],
- [BAM_LIBS='/usr/lib/libbam.a'])
+BAM_CFLAGS='-I./htslib/htslib'
+AC_SUBST([BAM_CFLAGS])
+BAM_LIBS='./htslib/libhts.a'
AC_SUBST([BAM_LIBS])
])
+#
# Enable use of Google Sparsehash and get include location if requested.
+#
AC_ARG_ENABLE([sparsehash],
AS_HELP_STRING([--enable-sparsehash], [Enable the use of Google Sparsehash (must be installed).]))
diff --git a/htslib/._INSTALL b/htslib/._INSTALL
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._INSTALL differ
diff --git a/htslib/._LICENSE b/htslib/._LICENSE
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._LICENSE differ
diff --git a/htslib/._Makefile b/htslib/._Makefile
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._Makefile differ
diff --git a/htslib/._NEWS b/htslib/._NEWS
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._NEWS differ
diff --git a/htslib/._README b/htslib/._README
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._README differ
diff --git a/htslib/._bgzf.c b/htslib/._bgzf.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._bgzf.c differ
diff --git a/htslib/._bgzip.c b/htslib/._bgzip.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._bgzip.c differ
diff --git a/htslib/._cram b/htslib/._cram
new file mode 100755
index 0000000..8252204
Binary files /dev/null and b/htslib/._cram differ
diff --git a/htslib/._faidx.c b/htslib/._faidx.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._faidx.c differ
diff --git a/htslib/._hfile.c b/htslib/._hfile.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._hfile.c differ
diff --git a/htslib/._hfile_internal.h b/htslib/._hfile_internal.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._hfile_internal.h differ
diff --git a/htslib/._hfile_irods.c b/htslib/._hfile_irods.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._hfile_irods.c differ
diff --git a/htslib/._hfile_libcurl.c b/htslib/._hfile_libcurl.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._hfile_libcurl.c differ
diff --git a/htslib/._hfile_net.c b/htslib/._hfile_net.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._hfile_net.c differ
diff --git a/htslib/._hts.c b/htslib/._hts.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._hts.c differ
diff --git a/htslib/._hts_internal.h b/htslib/._hts_internal.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._hts_internal.h differ
diff --git a/htslib/._htsfile.c b/htslib/._htsfile.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._htsfile.c differ
diff --git a/htslib/._htslib b/htslib/._htslib
new file mode 100755
index 0000000..8252204
Binary files /dev/null and b/htslib/._htslib differ
diff --git a/htslib/._htslib.pc.in b/htslib/._htslib.pc.in
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._htslib.pc.in differ
diff --git a/htslib/._htslib_vars.mk b/htslib/._htslib_vars.mk
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._htslib_vars.mk differ
diff --git a/htslib/._kfunc.c b/htslib/._kfunc.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._kfunc.c differ
diff --git a/htslib/._knetfile.c b/htslib/._knetfile.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._knetfile.c differ
diff --git a/htslib/._kstring.c b/htslib/._kstring.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._kstring.c differ
diff --git a/htslib/._md5.c b/htslib/._md5.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._md5.c differ
diff --git a/htslib/._plugin.c b/htslib/._plugin.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._plugin.c differ
diff --git a/htslib/._regidx.c b/htslib/._regidx.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._regidx.c differ
diff --git a/htslib/._sam.c b/htslib/._sam.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._sam.c differ
diff --git a/htslib/._synced_bcf_reader.c b/htslib/._synced_bcf_reader.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._synced_bcf_reader.c differ
diff --git a/htslib/._tabix.c b/htslib/._tabix.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._tabix.c differ
diff --git a/htslib/._tbx.c b/htslib/._tbx.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._tbx.c differ
diff --git a/htslib/._vcf.c b/htslib/._vcf.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._vcf.c differ
diff --git a/htslib/._vcf_sweep.c b/htslib/._vcf_sweep.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._vcf_sweep.c differ
diff --git a/htslib/._vcfutils.c b/htslib/._vcfutils.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/._vcfutils.c differ
diff --git a/htslib/INSTALL b/htslib/INSTALL
new file mode 100644
index 0000000..48602e4
--- /dev/null
+++ b/htslib/INSTALL
@@ -0,0 +1,102 @@
+Basic Installation
+==================
+
+To build and install HTSlib, 'cd' to the htslib-1.x directory containing
+the package's source and type the following commands:
+
+ ./configure
+ make
+ make install
+
+The './configure' command checks your build environment and allows various
+optional functionality to be enabled (see Configuration below). If you
+don't want to select any optional functionality, you may wish to omit
+configure and just type 'make; make install' as for previous versions
+of HTSlib. However if the build fails you should run './configure' as
+it can diagnose the common reasons for build failures.
+
+The 'make' command builds the HTSlib library and various useful
+utilities: bgzip, htsfile, and tabix. If compilation fails you should
+run './configure' as it can diagnose problems with your build environment
+that cause build failures.
+
+The 'make install' command installs the libraries, library header files,
+utilities, several manual pages, and a pkgconfig file to /usr/local.
+The installation location can be changed by configuring with --prefix=DIR
+or via 'make prefix=DIR install' (see Installation Locations below).
+
+
+Configuration
+=============
+
+By default, './configure' examines your build environment, checking for
+requirements such as the zlib development files, and arranges for a plain
+HTSlib build. The following configure options can be used to enable
+various features and specify further optional external requirements:
+
+--enable-plugins
+ Use plugins to implement exotic file access protocols and other
+ specialised facilities. This enables such facilities to be developed
+ and packaged outwith HTSlib, and somewhat isolates HTSlib-using programs
+ from their library dependencies. By default, any enabled pluggable
+ facilities (such as iRODS and libcurl file access) are built directly
+ within HTSlib.
+
+--with-plugin-dir=DIR
+ Specifies the directory into which plugins built while building HTSlib
+ should be installed; by default, LIBEXECDIR/htslib.
+
+--with-plugin-path=DIR:DIR:DIR...
+ Specifies the list of directories that HTSlib will search for plugins.
+ By default, only the directory specified via --with-plugin-dir will be
+ searched; you can use --with-plugin-path='DIR:$(plugindir):DIR' and so
+ on to cause additional directories to be searched.
+
+--with-irods[=DIR]
+ Specifies the location of the iRODS client library to use to enable
+ access to data objects stored in iRODS (<http://irods.org/>) via file
+ paths like 'irods:DATAOBJ'. DIR is the base of an iRODS source tree
+ such that the library is present as DIR/lib/core/obj/libRodsAPI.* and
+ headers are present under DIR/lib/api/include and so on. If '=DIR' is
+ omitted, $IRODS_HOME will be used as a base directory.
+
+--enable-libcurl
+ Use libcurl (<http://curl.haxx.se/>) to implement network access to
+ remote files via FTP, HTTP, HTTPS, S3, etc. By default, HTSlib uses
+ its own simple networking code to provide access via FTP and HTTP only.
+
+The configure script also accepts the usual options and environment variables
+for tuning installation locations and compilers: type './configure --help'
+for details. For example,
+
+ ./configure CC=icc --prefix=/opt/icc-compiled
+
+would specify that HTSlib is to be built with icc and installed into bin,
+lib, etc subdirectories under /opt/icc-compiled.
+
+
+Installation Locations
+======================
+
+By default, 'make install' installs HTSlib libraries under /usr/local/lib,
+HTSlib header files under /usr/local/include, utility programs under
+/usr/local/bin, etc. (To be precise, the header files are installed within
+a fixed 'htslib' subdirectory under the specified .../include location.)
+
+You can specify a different location to install HTSlib by configuring
+with --prefix=DIR or specify locations for particular parts of HTSlib by
+configuring with --libdir=DIR and so on. Type './configure --help' for
+the full list of such install directory options.
+
+Alternatively you can specify different locations at install time by
+typing 'make prefix=DIR install' or 'make libdir=DIR install' and so on.
+Consult the list of prefix/exec_prefix/etc variables near the top of the
+Makefile for the full list of such variables that can be overridden.
+
+You can also specify a staging area by typing 'make DESTDIR=DIR install',
+possibly in conjunction with other --prefix or prefix=DIR settings.
+For example,
+
+ make DESTDIR=/tmp/staging prefix=/opt
+
+would install into bin, lib, etc subdirectories under /tmp/staging/opt.
diff --git a/htslib/LICENSE b/htslib/LICENSE
new file mode 100644
index 0000000..03db010
--- /dev/null
+++ b/htslib/LICENSE
@@ -0,0 +1,69 @@
+[Files in this distribution outwith the cram/ subdirectory are distributed
+according to the terms of the following MIT/Expat license.]
+
+The MIT/Expat License
+
+Copyright (C) 2012-2014 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+
+[Files within the cram/ subdirectory in this distribution are distributed
+according to the terms of the following Modified 3-Clause BSD license.]
+
+The Modified-BSD License
+
+Copyright (C) 2012-2014 Genome Research Ltd.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute
+ nor the names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+[The use of a range of years within a copyright notice in this distribution
+should be interpreted as being equivalent to a list of years including the
+first and last year specified and all consecutive years between them.
+
+For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009,
+2011-2012" should be interpreted as being identical to a notice that reads
+"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice
+that reads "Copyright (C) 2005-2012" should be interpreted as being identical
+to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012".]
diff --git a/htslib/Makefile b/htslib/Makefile
new file mode 100644
index 0000000..06fd7bd
--- /dev/null
+++ b/htslib/Makefile
@@ -0,0 +1,426 @@
+# Makefile for htslib, a C library for high-throughput sequencing data formats.
+#
+# Copyright (C) 2013-2015 Genome Research Ltd.
+#
+# Author: John Marshall <jm18 at sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+CC = gcc
+AR = ar
+RANLIB = ranlib
+
+CPPFLAGS =
+# TODO: probably update cram code to make it compile cleanly with -Wc++-compat
+CFLAGS = -g -Wall -O2
+EXTRA_CFLAGS_PIC = -fpic
+LDFLAGS =
+LIBS =
+
+# For now these don't work too well as samtools also needs to know to
+# add -lbz2 and -llzma if linking against the static libhts.a library.
+# TODO This needs configury and adding to htslib.pc.in.
+#
+# # Bzip2 support; optionally used by CRAM.
+# HAVE_LIBBZ2 := $(shell echo -e "\#include <bzlib.h>\012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -lbz2 2>/dev/null && echo yes)
+# ifeq "$(HAVE_LIBBZ2)" "yes"
+# CPPFLAGS += -DHAVE_LIBBZ2
+# LIBS += -lbz2
+# endif
+#
+# # Lzma support; optionally used by CRAM.
+# HAVE_LIBLZMA := $(shell echo -e "\#include <lzma.h>\012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -llzma 2>/dev/null && echo yes)
+# ifeq "$(HAVE_LIBLZMA)" "yes"
+# CPPFLAGS += -DHAVE_LIBLZMA
+# LIBS += -llzma
+# endif
+
+prefix = /usr/local
+exec_prefix = $(prefix)
+bindir = $(exec_prefix)/bin
+includedir = $(prefix)/include
+libdir = $(exec_prefix)/lib
+libexecdir = $(exec_prefix)/libexec
+datarootdir = $(prefix)/share
+mandir = $(datarootdir)/man
+man1dir = $(mandir)/man1
+man5dir = $(mandir)/man5
+pkgconfigdir= $(libdir)/pkgconfig
+
+MKDIR_P = mkdir -p
+INSTALL = install -p
+INSTALL_PROGRAM = $(INSTALL)
+INSTALL_DATA = $(INSTALL) -m 644
+INSTALL_DIR = $(MKDIR_P) -m 755
+
+# Set by config.mk if plugins are enabled
+plugindir =
+
+BUILT_PROGRAMS = \
+ bgzip \
+ htsfile \
+ tabix
+
+BUILT_TEST_PROGRAMS = \
+ test/fieldarith \
+ test/hfile \
+ test/sam \
+ test/test-regidx \
+ test/test_view \
+ test/test-vcf-api \
+ test/test-vcf-sweep
+
+# all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS)
+all: lib-static
+
+HTSPREFIX =
+include htslib_vars.mk
+
+
+PACKAGE_VERSION = 1.3.1
+LIBHTS_SOVERSION = 1
+
+
+# $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string
+# even if this is a dirty or untagged Git working tree.
+NUMERIC_VERSION = $(PACKAGE_VERSION)
+
+# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
+# description of the working tree: either a release tag with the same value
+# as $(PACKAGE_VERSION) above, or an exact description likely based on a tag.
+# Much of this is also GNU Make-specific. If you don't have GNU Make and/or
+# are not building from a Git repository, comment out this conditional.
+ifneq "$(wildcard .git)" ""
+original_version := $(PACKAGE_VERSION)
+PACKAGE_VERSION := $(shell git describe --always --dirty)
+
+# Unless the Git description matches /\d*\.\d*(\.\d*)?/, i.e., is exactly a tag
+# with a numeric name, revert $(NUMERIC_VERSION) to the original version number
+# written above, but with the patchlevel field bumped to 255.
+ifneq "$(subst ..,.,$(subst 0,,$(subst 1,,$(subst 2,,$(subst 3,,$(subst 4,,$(subst 5,,$(subst 6,,$(subst 7,,$(subst 8,,$(subst 9,,$(PACKAGE_VERSION))))))))))))" "."
+empty :=
+NUMERIC_VERSION := $(subst $(empty) ,.,$(wordlist 1,2,$(subst ., ,$(original_version))) 255)
+endif
+
+# Force version.h to be remade if $(PACKAGE_VERSION) has changed.
+version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force))
+endif
+
+version.h:
+ echo '#define HTS_VERSION "$(PACKAGE_VERSION)"' > $@
+
+print-version:
+ @echo $(PACKAGE_VERSION)
+
+
+.SUFFIXES: .bundle .c .o .pico .so
+
+.c.o:
+ $(CC) $(CFLAGS) -I. $(CPPFLAGS) -c -o $@ $<
+
+.c.pico:
+ $(CC) $(CFLAGS) -I. $(CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $<
+
+
+LIBHTS_OBJS = \
+ kfunc.o \
+ knetfile.o \
+ kstring.o \
+ bgzf.o \
+ faidx.o \
+ hfile.o \
+ hfile_net.o \
+ hts.o \
+ md5.o \
+ regidx.o \
+ sam.o \
+ synced_bcf_reader.o \
+ vcf_sweep.o \
+ tbx.o \
+ vcf.o \
+ vcfutils.o \
+ cram/cram_codecs.o \
+ cram/cram_decode.o \
+ cram/cram_encode.o \
+ cram/cram_external.o \
+ cram/cram_index.o \
+ cram/cram_io.o \
+ cram/cram_samtools.o \
+ cram/cram_stats.o \
+ cram/files.o \
+ cram/mFILE.o \
+ cram/open_trace_file.o \
+ cram/pooled_alloc.o \
+ cram/rANS_static.o \
+ cram/sam_header.o \
+ cram/string_alloc.o \
+ cram/thread_pool.o \
+ cram/vlen.o \
+ cram/zfio.o
+
+PLUGIN_OBJS =
+
+cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h)
+cram_io_h = cram/cram_io.h $(cram_misc_h)
+cram_misc_h = cram/misc.h cram/os.h
+cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h)
+cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h)
+cram_structs_h = cram/cram_structs.h cram/thread_pool.h cram/string_alloc.h $(htslib_khash_h)
+cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h
+hfile_internal_h = hfile_internal.h $(htslib_hfile_h)
+hts_internal_h = hts_internal.h $(htslib_hts_h)
+
+
+# To be effective, config.mk needs to appear after most Makefile variables are
+# set but before most rules appear, so that it can both use previously-set
+# variables in its own rules' prerequisites and also update variables for use
+# in later rules' prerequisites.
+
+# If your make doesn't accept -include, change this to 'include' if you are
+# using the configure script or just comment the line out if you are not.
+-include config.mk
+
+# Usually config.h is generated by running configure or config.status,
+# but if those aren't used create a default config.h here.
+config.h:
+ echo '/* Empty config.h generated by Makefile */' > $@
+
+
+lib-static: libhts.a
+
+# $(shell), :=, and ifeq/.../endif are GNU Make-specific. If you don't have
+# GNU Make, comment out the parts of this conditional that don't apply.
+PLATFORM := $(shell uname -s)
+ifeq "$(PLATFORM)" "Darwin"
+SHLIB_FLAVOUR = dylib
+lib-shared: libhts.dylib
+BUILT_PLUGINS = $(PLUGIN_OBJS:.o=.bundle)
+else
+SHLIB_FLAVOUR = so
+lib-shared: libhts.so
+BUILT_PLUGINS = $(PLUGIN_OBJS:.o=.so)
+endif
+
+plugins: $(BUILT_PLUGINS)
+
+
+libhts.a: $(LIBHTS_OBJS)
+ @-rm -f $@
+ $(AR) -rc $@ $(LIBHTS_OBJS)
+ -$(RANLIB) $@
+
+
+# The target here is libhts.so, as that is the built file that other rules
+# depend upon and that is used when -lhts appears in other program's recipes.
+# As a byproduct invisible to make, libhts.so.NN is also created, as it is the
+# file used at runtime (when $LD_LIBRARY_PATH includes the build directory).
+
+libhts.so: $(LIBHTS_OBJS:.o=.pico)
+ $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) -pthread $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) -lz -lm $(LIBS)
+ ln -sf $@ libhts.so.$(LIBHTS_SOVERSION)
+
+# Similarly this also creates libhts.NN.dylib as a byproduct, so that programs
+# when run can find this uninstalled shared library (when $DYLD_LIBRARY_PATH
+# includes this project's build directory).
+
+libhts.dylib: $(LIBHTS_OBJS)
+ $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) -lz $(LIBS)
+ ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib
+
+
+.pico.so:
+ $(CC) -shared -Wl,-E -pthread $(LDFLAGS) -o $@ $< $(LIBS)
+
+.o.bundle:
+ $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< $(LIBS)
+
+
+bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_khash_h)
+kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h)
+knetfile.o knetfile.pico: knetfile.c config.h $(htslib_knetfile_h)
+hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts_internal_h) $(htslib_khash_h)
+hfile_irods.o hfile_irods.pico: hfile_irods.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
+hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
+hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h)
+hts.o hts.pico: hts.c config.h version.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(hts_internal_h)
+vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) $(htslib_khash_str2int_h)
+sam.o sam.pico: sam.c config.h $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
+tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_khash_h)
+faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h)
+synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c config.h $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(htslib_bgzf_h)
+vcf_sweep.o vcf_sweep.pico: vcf_sweep.c config.h $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
+vcfutils.o vcfutils.pico: vcfutils.c config.h $(htslib_vcfutils_h) $(htslib_kbitset_h)
+kfunc.o kfunc.pico: kfunc.c config.h $(htslib_kfunc_h)
+regidx.o regidx.pico: regidx.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(htslib_regidx_h)
+md5.o md5.pico: md5.c config.h $(htslib_hts_h)
+plugin.o plugin.pico: plugin.c config.h $(hts_internal_h) $(htslib_kstring_h)
+
+cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(cram_h)
+cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) cram/os.h $(htslib_hts_h)
+cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) cram/os.h $(htslib_hts_h)
+cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h)
+cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_hfile_h) $(cram_h) cram/os.h cram/zfio.h $(hts_internal_h)
+cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h $(cram_h) cram/os.h $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h)
+cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h)
+cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) cram/os.h
+cram/files.o cram/files.pico: cram/files.c config.h $(cram_misc_h)
+cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h cram/os.h cram/mFILE.h cram/vlen.h
+cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h cram/os.h $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h)
+cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/pooled_alloc.h
+cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c config.h cram/rANS_static.h cram/rANS_byte.h
+cram/sam_header.o cram/sam_header.pico: cram/sam_header.c config.h $(cram_sam_header_h) cram/string_alloc.h
+cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h
+cram/thread_pool.o cram/thread_pool.pico: cram/thread_pool.c config.h cram/thread_pool.h
+cram/vlen.o cram/vlen.pico: cram/vlen.c config.h cram/vlen.h cram/os.h
+cram/zfio.o cram/zfio.pico: cram/zfio.c config.h cram/os.h cram/zfio.h
+
+
+bgzip: bgzip.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ bgzip.o libhts.a -lz $(LIBS)
+
+htsfile: htsfile.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ htsfile.o libhts.a -lz $(LIBS)
+
+tabix: tabix.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ tabix.o libhts.a -lz $(LIBS)
+
+bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h)
+htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
+tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h)
+
+
+# For tests that might use it, set $REF_PATH explicitly to use only reference
+# areas within the test suite (or set it to ':' to use no reference areas).
+check test: htsfile $(BUILT_TEST_PROGRAMS)
+ test/fieldarith test/fieldarith.sam
+ test/hfile
+ test/sam test/ce.fa test/faidx.fa
+ test/test-regidx
+ cd test && REF_PATH=: ./test.pl
+
+test/fieldarith: test/fieldarith.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/fieldarith.o libhts.a -lz $(LIBS)
+
+test/hfile: test/hfile.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/hfile.o libhts.a -lz $(LIBS)
+
+test/sam: test/sam.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/sam.o libhts.a -lz $(LIBS)
+
+test/test-regidx: test/test-regidx.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/test-regidx.o libhts.a -lz $(LIBS)
+
+test/test_view: test/test_view.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/test_view.o libhts.a -lz $(LIBS)
+
+test/test-vcf-api: test/test-vcf-api.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-api.o libhts.a -lz $(LIBS)
+
+test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
+ $(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a -lz $(LIBS)
+
+test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h)
+test/hfile.o: test/hfile.c config.h $(htslib_hfile_h) $(htslib_hts_defs_h)
+test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h)
+test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h)
+test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h)
+test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h)
+test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h)
+
+
+# install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig
+# $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir)
+# if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi
+# $(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib
+# $(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a
+# $(INSTALL_DATA) htsfile.1 tabix.1 $(DESTDIR)$(man1dir)
+# $(INSTALL_DATA) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir)
+
+installdirs:
+ $(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(includedir) $(DESTDIR)$(includedir)/htslib $(DESTDIR)$(libdir) $(DESTDIR)$(man1dir) $(DESTDIR)$(man5dir) $(DESTDIR)$(pkgconfigdir)
+ if test -n "$(plugindir)"; then $(INSTALL_DIR) $(DESTDIR)$(plugindir); fi
+
+# After installation, the real file in $(libdir) will be libhts.so.X.Y.Z,
+# with symlinks libhts.so (used via -lhts during linking of client programs)
+# and libhts.so.NN (used by client executables at runtime).
+
+install-so: libhts.so installdirs
+ $(INSTALL_DATA) libhts.so $(DESTDIR)$(libdir)/libhts.so.$(PACKAGE_VERSION)
+ ln -sf libhts.so.$(PACKAGE_VERSION) $(DESTDIR)$(libdir)/libhts.so
+ ln -sf libhts.so.$(PACKAGE_VERSION) $(DESTDIR)$(libdir)/libhts.so.$(LIBHTS_SOVERSION)
+
+install-dylib: libhts.dylib installdirs
+ $(INSTALL_PROGRAM) libhts.dylib $(DESTDIR)$(libdir)/libhts.$(PACKAGE_VERSION).dylib
+ ln -sf libhts.$(PACKAGE_VERSION).dylib $(DESTDIR)$(libdir)/libhts.dylib
+ ln -sf libhts.$(PACKAGE_VERSION).dylib $(DESTDIR)$(libdir)/libhts.$(LIBHTS_SOVERSION).dylib
+
+# Substitute these pseudo-autoconf variables only at install time
+# so that "make install prefix=/prefix/path" etc continue to work.
+install-pkgconfig: installdirs
+ sed -e 's#@includedir@#$(includedir)#g;s#@libdir@#$(libdir)#g;s#@PACKAGE_VERSION@#$(PACKAGE_VERSION)#g' htslib.pc.in > $(DESTDIR)$(pkgconfigdir)/htslib.pc
+ chmod 644 $(DESTDIR)$(pkgconfigdir)/htslib.pc
+
+# A pkg-config file (suitable for copying to $PKG_CONFIG_PATH) that provides
+# flags for building against the uninstalled library in this build directory.
+htslib-uninstalled.pc: htslib.pc.in
+ sed -e 's#@includedir@#'`pwd`'#g;s#@libdir@#'`pwd`'#g' htslib.pc.in > $@
+
+
+testclean:
+ -rm -f test/*.tmp test/*.tmp.*
+
+mostlyclean: testclean
+ -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h
+
+clean: mostlyclean clean-$(SHLIB_FLAVOUR)
+ -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(BUILT_TEST_PROGRAMS)
+
+distclean maintainer-clean: clean
+ -rm -f config.cache config.h config.log config.mk config.status
+ -rm -f TAGS *-uninstalled.pc
+
+clean-so:
+ -rm -f libhts.so libhts.so.*
+
+clean-dylib:
+ -rm -f libhts.dylib libhts.*.dylib
+
+
+tags TAGS:
+ ctags -f TAGS *.[ch] cram/*.[ch] htslib/*.h
+
+# We recommend libhts-using programs be built against a separate htslib
+# installation. However if you feel that you must bundle htslib source
+# code with your program, this hook enables Automake-style "make dist"
+# for this subdirectory. If you do bundle an htslib snapshot, please
+# add identifying information to $(PACKAGE_VERSION) as appropriate.
+# (The wildcards attempt to omit non-exported files (.git*, README.md,
+# etc) and other detritus that might be in the top-level directory.)
+distdir:
+ tar -c *.[ch15] [ILMNRcht]*[ELSbcekmnt] | (cd $(distdir) && tar -x)
+ +cd $(distdir) && $(MAKE) distclean
+
+force:
+
+
+.PHONY: all check clean distclean distdir force
+.PHONY: install install-pkgconfig installdirs lib-shared lib-static
+.PHONY: maintainer-clean mostlyclean plugins print-version tags test testclean
+.PHONY: clean-so install-so
+.PHONY: clean-dylib install-dylib
diff --git a/htslib/NEWS b/htslib/NEWS
new file mode 100644
index 0000000..b7bc4d4
--- /dev/null
+++ b/htslib/NEWS
@@ -0,0 +1,109 @@
+Noteworthy changes in release 1.3.1 (22 April 2016)
+
+* Improved error checking and reporting, especially of I/O errors when
+ writing output files (#17, #315, PR #271, PR #317).
+
+* Build fixes for 32-bit systems; be sure to run configure to enable
+ large file support and access to 2GiB+ files.
+
+* Numerous VCF parsing fixes (#321, #322, #323, #324, #325; PR #370).
+ Particular thanks to Kostya Kortchinsky of the Google Security Team
+ for testing and numerous input parsing bug reports.
+
+* HTSlib now prints an informational message when initially creating a
+ CRAM reference cache in the default location under your $HOME directory.
+ (No message is printed if you are using $REF_CACHE to specify a location.)
+
+* Avoided rare race condition when caching downloaded CRAM reference sequence
+ files, by using distinctive names for temporary files (in addition to O_EXCL,
+ which has always been used). Occasional corruption would previously occur
+ when multiple tools were simultaneously caching the same reference sequences
+ on an NFS filesystem that did not support O_EXCL (PR #320).
+
+* Prevented race condition in file access plugin loading (PR #341).
+
+* Fixed mpileup memory leak, so no more "[bam_plp_destroy] memory leak [...]
+ Continue anyway" warning messages (#299).
+
+* Various minor CRAM fixes.
+
+* Fixed documentation problems #348 and #358.
+
+
+Noteworthy changes in release 1.3 (15 December 2015)
+
+* Files can now be accessed via HTTPS and Amazon S3 in addition to HTTP
+ and FTP, when HTSlib is configured to use libcurl for network file access
+ rather than the included basic knetfile networking.
+
+* HTSlib can be built to use remote access hFILE backends (such as iRODS
+ and libcurl) via a plugin mechanism. This allows other backends to be
+ easily added and facilitates building tools that use HTSlib, as they
+ don't need to be linked with the backends' various required libraries.
+
+* When writing CRAM output, sam_open() etc now default to writing CRAM v3.0
+ rather than v2.1.
+
+* fai_build() and samtools faidx now accept initial whitespace in ">"
+ headers (e.g., "> chr1 description" is taken to refer to "chr1").
+
+* tabix --only-header works again (was broken in 1.2.x; #249).
+
+* HTSlib's configure script and Makefile now fully support the standard
+ convention of allowing CC/CPPFLAGS/CFLAGS/LDFLAGS/LIBS to be overridden
+ as needed. Previously the Makefile listened to $(LDLIBS) instead; if you
+ were overriding that, you should now override LIBS rather than LDLIBS.
+
+* Fixed bugs #168, #172, #176, #197, #206, #225, #245, #265, #295, and #296.
+
+
+Noteworthy changes in release 1.2.1 (3 February 2015)
+
+* Reinstated hts_file_type() and FT_* macros, which were available until 1.1
+ but briefly removed in 1.2. This function is deprecated and will be removed
+ in a future release -- you should use hts_detect_format() etc instead
+
+
+Noteworthy changes in release 1.2 (2 February 2015)
+
+* HTSlib now has a configure script which checks your build environment
+ and allows for selection of optional extras. See INSTALL for details
+
+* By default, reference sequences are fetched from the EBI CRAM Reference
+ Registry and cached in your $HOME cache directory. This behaviour can
+ be controlled by setting REF_PATH and REF_CACHE environment variables
+ (see the samtools(1) man page for details)
+
+* Numerous CRAM improvements:
+ - Support for CRAM v3.0, an upcoming revision to CRAM supporting
+ better compression and per-container checksums
+ - EOF checking for v2.1 and v3.0 (similar to checking BAM EOF blocks)
+ - Non-standard values for PNEXT and TLEN fields are now preserved
+ - hts_set_fai_filename() now provides a reference file when encoding
+ - Generated read names are now numbered from 1, rather than being
+ labelled 'slice:record-in-slice'
+ - Multi-threading and speed improvements
+
+* New htsfile command for identifying file formats, and corresponding
+ file format detection APIs
+
+* New tabix --regions FILE, --targets FILE options for filtering via BED files
+
+* Optional iRODS file access, disabled by default. Configure with --with-irods
+ to enable accessing iRODS data objects directly via 'irods:DATAOBJ'
+
+* All occurrences of 2^29 in the source have been eliminated, so indexing
+ and querying against reference sequences larger than 512Mbp works (when
+ using CSI indices)
+
+* Support for plain GZIP compression in various places
+
+* VCF header editing speed improvements
+
+* Added seq_nt16_int[] (equivalent to the samtools API's bam_nt16_nt4_table)
+
+* Reinstated faidx_fetch_nseq(), which was accidentally removed from 1.1.
+ Now faidx_fetch_nseq() and faidx_nseq() are equivalent; eventually
+ faidx_fetch_nseq() will be deprecated and removed [#156]
+
+* Fixed bugs #141, #152, #155, #158, #159, and various memory leaks
diff --git a/htslib/README b/htslib/README
new file mode 100644
index 0000000..4225bec
--- /dev/null
+++ b/htslib/README
@@ -0,0 +1,5 @@
+HTSlib is an implementation of a unified C library for accessing common file
+formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing
+data. It is the core library used by samtools and bcftools.
+
+See INSTALL for building and installation instructions.
diff --git a/htslib/bgzf.c b/htslib/bgzf.c
new file mode 100644
index 0000000..a6c8897
--- /dev/null
+++ b/htslib/bgzf.c
@@ -0,0 +1,1330 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+ Copyright (C) 2009, 2013-2016 Genome Research Ltd
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <inttypes.h>
+
+#include "htslib/hts.h"
+#include "htslib/bgzf.h"
+#include "htslib/hfile.h"
+
+#define BGZF_CACHE
+#define BGZF_MT
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+
+/* BGZF/GZIP header (specialized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ BGZF extension:
+ ^ ^ ^ ^
+ | | | |
+ FLG.EXTRA XLEN B C
+
+ BGZF format is compatible with GZIP. It limits the size of each compressed
+ block to 2^16 bytes and adds an extra "BC" field in the gzip header which
+ records the size.
+
+*/
+// Template for the fixed 18-byte BGZF block header.  bgzf_compress() copies
+// BLOCK_HEADER_LENGTH bytes of it to the start of each block and then
+// overwrites the final two bytes with the block length.  The 19th array
+// element only holds the string literal's terminating NUL.
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
+
+#ifdef BGZF_CACHE
+// One cached, already-inflated block.  Entries live in the khash "cache" map
+// and are keyed by the block's address in the compressed file.
+typedef struct {
+ int size; // length of the uncompressed data (copied from fp->block_length)
+ uint8_t *block; // heap copy of the uncompressed block buffer (BGZF_MAX_BLOCK_SIZE bytes)
+ int64_t end_offset; // file offset just past the compressed block (block address + compressed size)
+} cache_t;
+#include "htslib/khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t)
+#endif
+
+// One index entry: a pair of offsets, one into the uncompressed data and
+// one into the compressed data.
+typedef struct
+{
+ uint64_t uaddr; // offset w.r.t. uncompressed data
+ uint64_t caddr; // offset w.r.t. compressed data
+}
+bgzidx1_t;
+
+// Index state attached to a BGZF handle: an array of bgzidx1_t entries plus
+// the running uncompressed offset of the block currently being processed
+// (bgzf_read_block() advances ublock_addr by each block's inflated length).
+struct __bgzidx_t
+{
+ int noffs, moffs; // the size of the index, n:used, m:allocated
+ bgzidx1_t *offs; // offsets
+ uint64_t ublock_addr; // offset of the current block (uncompressed data)
+};
+
+void bgzf_index_destroy(BGZF *fp);
+int bgzf_index_add_block(BGZF *fp);
+
+// Store a 16-bit value into buffer[0..1], least significant byte first.
+static inline void packInt16(uint8_t *buffer, uint16_t value)
+{
+    buffer[0] = (uint8_t)(value & 0xff);
+    buffer[1] = (uint8_t)((value >> 8) & 0xff);
+}
+
+// Read a 16-bit little-endian value from buffer[0..1]; result is 0..65535.
+static inline int unpackInt16(const uint8_t *buffer)
+{
+    const int low = buffer[0];
+    const int high = buffer[1];
+    return (high << 8) | low;
+}
+
+// Store a 32-bit value into buffer[0..3], least significant byte first.
+static inline void packInt32(uint8_t *buffer, uint32_t value)
+{
+    int i;
+    for (i = 0; i < 4; ++i) {
+        buffer[i] = (uint8_t)(value & 0xff);
+        value >>= 8;
+    }
+}
+
+// Translate a zlib return code into a human-readable message.
+//
+// If zs is non-NULL and carries a message (zs->msg), that message wins.
+// zlib does not set msg very reliably: it may be useful after deflateInit2,
+// inflateInit2, and inflate returning Z_DATA_ERROR; for other inflate
+// results, deflate, deflateEnd and inflateEnd callers should pass NULL.
+//
+// Codes without a fixed text (including Z_OK) are formatted into a static
+// buffer, so the returned pointer is not thread-safe in that case.
+static const char *bgzf_zerr(int errnum, z_stream *zs)
+{
+    static char buffer[32];
+
+    if (zs != NULL && zs->msg != NULL) return zs->msg;
+
+    if (errnum == Z_ERRNO)
+        return strerror(errno);
+    if (errnum == Z_STREAM_ERROR)
+        return "invalid parameter/compression level, or inconsistent stream state";
+    if (errnum == Z_DATA_ERROR)
+        return "invalid or incomplete IO";
+    if (errnum == Z_MEM_ERROR)
+        return "out of memory";
+    if (errnum == Z_BUF_ERROR)
+        return "progress temporarily not possible, or in() / out() returned an error";
+    if (errnum == Z_VERSION_ERROR)
+        return "zlib version mismatch";
+
+    // Z_OK (0: maybe gzgets error Z_NULL) and every unrecognised code
+    snprintf(buffer, sizeof(buffer), "[%d] unknown", errnum);
+    return buffer;  // FIXME: Not thread-safe.
+}
+
+// Allocate and initialise a BGZF handle for reading from hfpr.
+//
+// Peeks at the first 18 bytes of the stream (without consuming them) to
+// classify it: is_compressed is set when a full 18-byte header starting with
+// the gzip magic 0x1f 0x8b is present, and is_gzip is set when such a stream
+// does not carry the BGZF "BC" extra field.  The hFILE itself is not stored
+// here; callers assign fp->fp afterwards.
+//
+// Returns the new handle, or NULL if peeking fails or any allocation fails
+// (in which case nothing is leaked and hfpr is left open for the caller).
+static BGZF *bgzf_read_init(hFILE *hfpr)
+{
+    BGZF *fp;
+    uint8_t magic[18];
+    ssize_t n = hpeek(hfpr, magic, 18);
+    if (n < 0) return NULL;
+
+    fp = (BGZF*)calloc(1, sizeof(BGZF));
+    if (fp == NULL) return NULL;
+
+    fp->is_write = 0;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (fp->uncompressed_block == NULL || fp->compressed_block == NULL) goto fail;
+
+    // magic[] is only fully populated when n == 18; the is_gzip expression
+    // short-circuits on !is_compressed so shorter peeks never read it.
+    fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b) ? 1 : 0;
+    fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;
+#ifdef BGZF_CACHE
+    fp->cache = kh_init(cache);
+    if (fp->cache == NULL) goto fail;
+#endif
+    return fp;
+
+ fail:
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free(fp);
+    return NULL;
+}
+
+// Extract the compression level from a mode string.
+// Returns -2 if the mode contains 'u' (plain uncompressed output), otherwise
+// the value of the first decimal digit found (0..9), or -1 (default level)
+// when the mode contains no digit.
+static int mode2level(const char *__restrict mode)
+{
+    const char *digit;
+
+    if (strchr(mode, 'u') != NULL) return -2;
+
+    digit = mode + strcspn(mode, "0123456789");
+    return *digit ? (int)*digit - '0' : -1;
+}
+// Allocate and initialise a BGZF handle for writing, configured from mode:
+//   - 'u'        plain uncompressed output; no block buffers are allocated
+//   - a digit    compression level (anything else selects zlib's default)
+//   - 'g'        plain gzip output through a persistent deflate stream
+// Returns the new handle, or NULL after releasing everything on allocation
+// or deflateInit2 failure.  The caller assigns fp->fp afterwards.
+static BGZF *bgzf_write_init(const char *mode)
+{
+    BGZF *fp = (BGZF*)calloc(1, sizeof(BGZF));
+    if (fp == NULL) goto mem_fail;
+    fp->is_write = 1;
+
+    const int level = mode2level(mode);
+    if (level == -2) {
+        fp->is_compressed = 0;
+        return fp;
+    }
+    fp->is_compressed = 1;
+
+    if ((fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE)) == NULL) goto mem_fail;
+    if ((fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE)) == NULL) goto mem_fail;
+
+    // Out-of-range levels fall back to Z_DEFAULT_COMPRESSION (== -1)
+    fp->compress_level = (level < 0 || level > 9) ? Z_DEFAULT_COMPRESSION : level;
+
+    if (strchr(mode, 'g') == NULL) return fp;
+
+    // gzip output
+    fp->is_gzip = 1;
+    fp->gz_stream = (z_stream*)calloc(1, sizeof(z_stream));
+    if (fp->gz_stream == NULL) goto mem_fail;
+    fp->gz_stream->zalloc = NULL;
+    fp->gz_stream->zfree = NULL;
+    fp->gz_stream->msg = NULL;
+
+    int zret = deflateInit2(fp->gz_stream, fp->compress_level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY);
+    if (zret == Z_OK) return fp;
+
+    if (hts_verbose >= 1) {
+        fprintf(stderr, "[E::%s] deflateInit2 failed: %s\n",
+                __func__, bgzf_zerr(zret, fp->gz_stream));
+    }
+    goto fail;
+
+ mem_fail:
+    if (hts_verbose >= 1) {
+        fprintf(stderr, "[E::%s] %s\n", __func__, strerror(errno));
+    }
+ fail:
+    if (fp != NULL) {
+        free(fp->uncompressed_block);
+        free(fp->compressed_block);
+        free(fp->gz_stream);
+        free(fp);
+    }
+    return NULL;
+}
+
+// Open the file at path and wrap it in a BGZF handle.
+//
+// mode containing 'r' opens for reading; otherwise mode containing 'w' or
+// 'a' opens for writing (see bgzf_write_init() for the other mode letters).
+// Any other mode sets errno to EINVAL.
+//
+// Returns the handle, or NULL on failure.  On failure the underlying hFILE
+// is closed again, so nothing is leaked.
+BGZF *bgzf_open(const char *path, const char *mode)
+{
+    BGZF *fp = 0;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        hFILE *fpr;
+        if ((fpr = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_read_init(fpr);
+        if (fp == 0) { hclose_abruptly(fpr); return NULL; }
+        fp->fp = fpr;
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        hFILE *fpw;
+        if ((fpw = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_write_init(mode);
+        if (fp == NULL) {
+            // Release the stream we just opened, keeping the errno that
+            // describes the initialisation failure.
+            int save_errno = errno;
+            hclose_abruptly(fpw);
+            errno = save_errno;
+            return NULL;
+        }
+        fp->fp = fpw;
+    }
+    else { errno = EINVAL; return 0; }
+
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+// Wrap an already-open file descriptor in a BGZF handle.
+//
+// mode containing 'r' selects reading; otherwise 'w' or 'a' selects writing;
+// anything else sets errno to EINVAL.  Returns the handle or NULL on failure.
+//
+// NOTE(review): when bgzf_write_init() fails, the hFILE obtained from
+// hdopen() is neither closed nor returned, so it appears to be leaked.  The
+// read path closes it (and with it fd) via hclose_abruptly().
+BGZF *bgzf_dopen(int fd, const char *mode)
+{
+    BGZF *bz = NULL;
+    hFILE *stream;
+
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+
+    if (strchr(mode, 'r')) {
+        stream = hdopen(fd, mode);
+        if (stream == NULL) return NULL;
+        bz = bgzf_read_init(stream);
+        if (bz == NULL) {
+            hclose_abruptly(stream); // FIXME this closes fd
+            return NULL;
+        }
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        stream = hdopen(fd, mode);
+        if (stream == NULL) return NULL;
+        bz = bgzf_write_init(mode);
+        if (bz == NULL) return NULL;
+    } else {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    bz->fp = stream;
+    bz->is_be = ed_is_big();
+    return bz;
+}
+
+// Wrap an existing hFILE in a BGZF handle.
+//
+// mode containing 'r' selects reading; otherwise 'w' or 'a' selects writing;
+// anything else sets errno to EINVAL.  Returns the handle, or NULL on
+// failure; hfp is not closed by this function in either case.
+BGZF *bgzf_hopen(hFILE *hfp, const char *mode)
+{
+    BGZF *bz;
+    int reading;
+
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+
+    reading = strchr(mode, 'r') != NULL;
+    if (!reading && !strchr(mode, 'w') && !strchr(mode, 'a')) {
+        errno = EINVAL;
+        return 0;
+    }
+
+    bz = reading ? bgzf_read_init(hfp) : bgzf_write_init(mode);
+    if (bz == NULL) return NULL;
+
+    bz->fp = hfp;
+    bz->is_be = ed_is_big();
+    return bz;
+}
+
+// Compress slen bytes from src into a single, complete BGZF block at _dst.
+//
+// On entry *dlen is the capacity of _dst; on success it is updated to the
+// total size of the block written (header + deflate data + footer).
+// level is passed straight to deflateInit2().
+//
+// Returns 0 on success, -1 on failure (buffer too small to hold even the
+// header and footer, or any zlib call failing).  The zlib state is always
+// released before returning.
+int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int level)
+{
+    uint32_t crc;
+    z_stream zs;
+    uint8_t *dst = (uint8_t*)_dst;
+
+    // Without this guard the avail_out computation below would wrap around.
+    if (*dlen < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) {
+        if (hts_verbose >= 1) {
+            fprintf(stderr, "[E::%s] output buffer too small\n", __func__);
+        }
+        return -1;
+    }
+
+    // compress the body
+    zs.zalloc = NULL; zs.zfree = NULL;
+    zs.msg = NULL;
+    zs.next_in  = (Bytef*)src;
+    zs.avail_in = slen;
+    zs.next_out = dst + BLOCK_HEADER_LENGTH;
+    zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+    int ret = deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
+    if (ret!=Z_OK) {
+        if (hts_verbose >= 1) {
+            fprintf(stderr, "[E::%s] deflateInit2 failed: %s\n",
+                    __func__, bgzf_zerr(ret, &zs));
+        }
+        return -1;
+    }
+    if ((ret = deflate(&zs, Z_FINISH)) != Z_STREAM_END) {
+        if (hts_verbose >= 1) {
+            fprintf(stderr, "[E::%s] deflate failed: %s\n",
+                    __func__, bgzf_zerr(ret, ret == Z_DATA_ERROR ? &zs : NULL));
+        }
+        // Free the deflate state; its result is irrelevant as we already fail.
+        (void) deflateEnd(&zs);
+        return -1;
+    }
+    if ((ret = deflateEnd(&zs)) != Z_OK) {
+        if (hts_verbose >= 1) {
+            fprintf(stderr, "[E::%s] deflateEnd failed: %s\n",
+                    __func__, bgzf_zerr(ret, NULL));
+        }
+        return -1;
+    }
+    *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+    // write the header
+    memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+    packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
+    // write the footer: CRC32 of the uncompressed data, then its length
+    crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen);
+    packInt32((uint8_t*)&dst[*dlen - 8], crc);
+    packInt32((uint8_t*)&dst[*dlen - 4], slen);
+    return 0;
+}
+
+// Feed slen bytes from src through fp's persistent gzip deflate stream,
+// writing whatever output is produced to _dst.  A zero slen finishes the
+// stream (Z_FINISH); otherwise Z_NO_FLUSH is used.
+// On entry *dlen is the capacity of _dst; on success it becomes the number
+// of bytes produced.  Returns 0 on success, -1 if deflate reports
+// Z_STREAM_ERROR.  The level argument is not used here.
+static int bgzf_gzip_compress(BGZF *fp, void *_dst, size_t *dlen, const void *src, size_t slen, int level)
+{
+    z_stream *strm = fp->gz_stream;
+    int status;
+
+    strm->next_in   = (Bytef*)src;
+    strm->avail_in  = slen;
+    strm->next_out  = (uint8_t*)_dst;
+    strm->avail_out = *dlen;
+
+    status = deflate(strm, slen ? Z_NO_FLUSH : Z_FINISH);
+    if (status != Z_STREAM_ERROR) {
+        *dlen -= strm->avail_out;
+        return 0;
+    }
+
+    if (hts_verbose >= 1) {
+        fprintf(stderr, "[E::%s] deflate failed: %s\n",
+                __func__, bgzf_zerr(status, NULL));
+    }
+    return -1;
+}
+
+// Compress the first block_length bytes of fp->uncompressed_block into
+// fp->compressed_block, as a BGZF block or (for gzip handles) as a chunk of
+// the gzip stream.  Returns the number of compressed bytes, or -1 after
+// setting BGZF_ERR_ZLIB in fp->errcode.  Resets fp->block_offset on success.
+static int deflate_block(BGZF *fp, int block_length)
+{
+    size_t out_len = BGZF_MAX_BLOCK_SIZE;
+    int status = fp->is_gzip
+        ? bgzf_gzip_compress(fp, fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level)
+        : bgzf_compress(fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level);
+
+    if (status == 0) {
+        fp->block_offset = 0;
+        return out_len;
+    }
+
+    if (hts_verbose >= 3) {
+        fprintf(stderr, "[E::%s] compression error %d\n", __func__, status);
+    }
+    fp->errcode |= BGZF_ERR_ZLIB;
+    return -1;
+}
+
+// Inflate the BGZF block held in fp->compressed_block into
+// fp->uncompressed_block.  block_length is the full size of the compressed
+// block including its header.  Returns the number of uncompressed bytes, or
+// -1 after setting BGZF_ERR_ZLIB in fp->errcode.
+static int inflate_block(BGZF* fp, int block_length)
+{
+    z_stream zs;
+    int status;
+
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.msg = NULL;
+    zs.next_in = (Bytef*)fp->compressed_block + 18;   // skip the block header
+    zs.avail_in = block_length - 16;
+    zs.next_out = (Bytef*)fp->uncompressed_block;
+    zs.avail_out = BGZF_MAX_BLOCK_SIZE;
+
+    status = inflateInit2(&zs, -15);
+    if (status != Z_OK) {
+        if (hts_verbose >= 1) {
+            fprintf(stderr, "[E::%s] inflateInit2 failed: %s\n",
+                    __func__, bgzf_zerr(status, &zs));
+        }
+        goto zlib_error;
+    }
+
+    status = inflate(&zs, Z_FINISH);
+    if (status != Z_STREAM_END) {
+        if (hts_verbose >= 1) {
+            fprintf(stderr, "[E::%s] inflate failed: %s\n",
+                    __func__, bgzf_zerr(status, status == Z_DATA_ERROR ? &zs : NULL));
+        }
+        // Still release the zlib state; a failure here is only worth a
+        // lower-priority message since we are already reporting an error.
+        status = inflateEnd(&zs);
+        if (status != Z_OK && hts_verbose >= 2) {
+            fprintf(stderr, "[E::%s] inflateEnd failed: %s\n",
+                    __func__, bgzf_zerr(status, NULL));
+        }
+        goto zlib_error;
+    }
+
+    status = inflateEnd(&zs);
+    if (status != Z_OK) {
+        if (hts_verbose >= 1) {
+            fprintf(stderr, "[E::%s] inflateEnd failed: %s\n",
+                    __func__, bgzf_zerr(status, NULL));
+        }
+        goto zlib_error;
+    }
+    return zs.total_out;
+
+ zlib_error:
+    fp->errcode |= BGZF_ERR_ZLIB;
+    return -1;
+}
+
+// Inflate the next chunk of a plain gzip stream into fp->uncompressed_block
+// (starting at fp->block_offset) using fp's persistent inflate stream.
+//
+// cached != 0 means fp->gz_stream already has input queued (avail_in/next_in
+// set by the caller), so the first pass must not read from the file.
+//
+// Returns the value computed as BGZF_MAX_BLOCK_SIZE minus the remaining
+// output space, 0 when the underlying file is at EOF, or -1 on error
+// (BGZF_ERR_IO for a failed read, BGZF_ERR_ZLIB for an inflate failure).
+static int inflate_gzip_block(BGZF *fp, int cached)
+{
+    int ret = Z_OK;
+    do
+    {
+        if ( !cached && fp->gz_stream->avail_out!=0 )
+        {
+            // Keep hread()'s result in a signed variable: avail_in is
+            // unsigned, so a negative error return would otherwise be
+            // mistaken for a huge amount of available input.
+            ssize_t nread = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE);
+            if ( nread < 0 )
+            {
+                fp->errcode |= BGZF_ERR_IO;
+                return -1;
+            }
+            fp->gz_stream->avail_in = nread;
+            if ( nread == 0 ) return 0;     // EOF on the underlying file
+            fp->gz_stream->next_in = fp->compressed_block;
+        }
+        else cached = 0;
+        do
+        {
+            fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset;
+            fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset;
+            fp->gz_stream->msg = NULL;
+            ret = inflate(fp->gz_stream, Z_NO_FLUSH);
+            if ( ret==Z_BUF_ERROR ) continue;   // non-critical error
+            if ( ret<0 ) {
+                if (hts_verbose >= 1) {
+                    fprintf(stderr, "[E::%s] inflate failed: %s\n",
+                            __func__,
+                            bgzf_zerr(ret, ret == Z_DATA_ERROR ? fp->gz_stream : NULL));
+                }
+                fp->errcode |= BGZF_ERR_ZLIB;
+                return -1;
+            }
+            unsigned int have = BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
+            if ( have ) return have;
+        }
+        while ( fp->gz_stream->avail_out == 0 );
+    }
+    while (ret != Z_STREAM_END);
+    return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
+}
+
+// Classify an 18-byte block header.
+// Returns: 0 for a BGZF header (gzip magic, FLG.FEXTRA set, XLEN == 6 and a
+// "BC" subfield of length 2); -1 for a gzip header that is not BGZF;
+// -2 if the gzip magic / deflate method bytes do not match.
+static int check_header(const uint8_t *header)
+{
+    const int gzip_magic_ok = header[0] == 31 && header[1] == 139 && header[2] == 8;
+    if (!gzip_magic_ok) return -2;
+
+    if ((header[3] & 4) == 0) return -1;
+    if (unpackInt16(&header[10]) != 6) return -1;
+    if (header[12] != 'B' || header[13] != 'C') return -1;
+    if (unpackInt16(&header[14]) != 2) return -1;
+    return 0;
+}
+
+#ifdef BGZF_CACHE
+// Release every cached block and the hash table itself.
+// Does nothing for handles opened for writing.
+static void free_cache(BGZF *fp)
+{
+    khash_t(cache) *table;
+    khint_t it;
+
+    if (fp->is_write) return;
+
+    table = (khash_t(cache)*)fp->cache;
+    it = kh_begin(table);
+    while (it < kh_end(table)) {
+        if (kh_exist(table, it)) free(kh_val(table, it).block);
+        ++it;
+    }
+    kh_destroy(cache, table);
+}
+
+// Try to satisfy a block read from the cache.
+// On a hit, the cached data is copied into fp->uncompressed_block, the
+// block address/length fields are updated and the file position is moved
+// to the end of the cached block; the block's uncompressed size is returned.
+// Returns 0 on a miss.  A failed seek terminates the process.
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{
+    khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+    khint_t it = kh_get(cache, table, block_address);
+    cache_t *entry;
+
+    if (it == kh_end(table)) return 0;
+    entry = &kh_val(table, it);
+
+    if (fp->block_length != 0) fp->block_offset = 0;
+    fp->block_address = block_address;
+    fp->block_length = entry->size;
+    memcpy(fp->uncompressed_block, entry->block, BGZF_MAX_BLOCK_SIZE);
+
+    if ( hseek(fp->fp, entry->end_offset, SEEK_SET) < 0 )
+    {
+        // todo: move the error up
+        fprintf(stderr,"Could not hseek to %"PRId64"\n", entry->end_offset);
+        exit(1);
+    }
+    return entry->size;
+}
+
+// Store a copy of the current uncompressed block in the cache, keyed by
+// fp->block_address.  size is the number of compressed bytes the block
+// occupied in the file and is used to record where the next block starts.
+//
+// Caching is best-effort: nothing is stored if the cache is disabled or too
+// small for one block, if the key is already present, or if the hash table
+// or the block copy cannot be allocated.
+static void cache_block(BGZF *fp, int size)
+{
+    int ret;
+    khint_t k;
+    cache_t *p;
+    uint8_t *copy;
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
+    if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
+        /* A better way would be to remove the oldest block in the
+         * cache, but here we remove a random one for simplicity. This
+         * should not have a big impact on performance. */
+        for (k = kh_begin(h); k < kh_end(h); ++k)
+            if (kh_exist(h, k)) break;
+        if (k < kh_end(h)) {
+            free(kh_val(h, k).block);
+            kh_del(cache, h, k);
+        }
+    }
+    k = kh_put(cache, h, fp->block_address, &ret);
+    // ret == 0: key already present (if this happens, a bug!);
+    // ret <  0: the table could not be grown, so k is not a valid slot.
+    if (ret <= 0) return;
+    copy = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
+    if (copy == NULL) {
+        // Do not leave a half-initialised entry behind.
+        kh_del(cache, h, k);
+        return;
+    }
+    memcpy(copy, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+    p = &kh_val(h, k);
+    p->size = fp->block_length;
+    p->end_offset = fp->block_address + size;
+    p->block = copy;
+}
+#else
+// No-op stand-ins used when BGZF_CACHE is not defined: nothing is ever
+// cached, and load_block_from_cache() always reports a miss (0).
+static void free_cache(BGZF *fp) {}
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
+static void cache_block(BGZF *fp, int size) {}
+#endif
+
+// Read the next block from fp->fp into fp->uncompressed_block.
+//
+// Three kinds of input are handled:
+//   - uncompressed files: up to BGZF_MAX_BLOCK_SIZE raw bytes are read;
+//   - plain gzip streams: data is inflated through fp->gz_stream, which is
+//     created here when the first (non-BGZF) gzip header is seen;
+//   - BGZF files: one block is read (or fetched from the cache), inflated,
+//     optionally added to the on-the-fly index, and cached.
+//
+// Returns 0 on success, including end of file (signalled by
+// fp->block_length == 0), or -1 on error with the cause OR-ed into
+// fp->errcode (BGZF_ERR_IO, BGZF_ERR_HEADER or BGZF_ERR_ZLIB).
+int bgzf_read_block(BGZF *fp)
+{
+    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+    int count, size = 0, block_length, remaining;
+
+    // Reading an uncompressed file
+    if ( !fp->is_compressed )
+    {
+        count = hread(fp->fp, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+        if (count < 0)  // Error
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        else if (count == 0)  // EOF
+        {
+            fp->block_length = 0;
+            return 0;
+        }
+        if (fp->block_length != 0) fp->block_offset = 0;
+        fp->block_address += count;
+        fp->block_length = count;
+        return 0;
+    }
+
+    // Reading compressed file
+    int64_t block_address;
+    block_address = htell(fp->fp);
+    if ( fp->is_gzip && fp->gz_stream ) // is this is a initialized gzip stream?
+    {
+        count = inflate_gzip_block(fp, 0);
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        return 0;
+    }
+    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
+    count = hread(fp->fp, header, sizeof(header));
+    if (count == 0) { // no data read
+        fp->block_length = 0;
+        return 0;
+    }
+    int ret;
+    if ( count != sizeof(header) || (ret=check_header(header))==-2 )
+    {
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    if ( ret==-1 )
+    {
+        // GZIP, not BGZF
+        uint8_t *cblock = (uint8_t*)fp->compressed_block;
+        memcpy(cblock, header, sizeof(header));
+        // Check the read on its own: adding sizeof(header) to a negative
+        // result would disguise the error as a short but valid count.
+        ssize_t nread = hread(fp->fp, cblock+sizeof(header), BGZF_BLOCK_SIZE - sizeof(header));
+        if ( nread < 0 )
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        count = nread + sizeof(header);
+        int nskip = 10;
+
+        // Check optional fields to skip: FLG.FNAME,FLG.FCOMMENT,FLG.FHCRC,FLG.FEXTRA
+        // Note: Some of these fields are untested, I did not have appropriate data available
+        if ( header[3] & 0x4 ) // FLG.FEXTRA
+        {
+            nskip += unpackInt16(&cblock[nskip]) + 2;
+        }
+        if ( header[3] & 0x8 ) // FLG.FNAME
+        {
+            while ( nskip<count && cblock[nskip] ) nskip++;
+            nskip++;
+        }
+        if ( header[3] & 0x10 ) // FLG.FCOMMENT
+        {
+            while ( nskip<count && cblock[nskip] ) nskip++;
+            nskip++;
+        }
+        if ( header[3] & 0x2 ) nskip += 2;  //  FLG.FHCRC
+
+        /* FIXME: Should handle this better.  There's no reason why
+           someone shouldn't include a massively long comment in their
+           gzip stream. */
+        if ( nskip >= count )
+        {
+            fp->errcode |= BGZF_ERR_HEADER;
+            return -1;
+        }
+
+        fp->is_gzip = 1;
+        fp->gz_stream = (z_stream*) calloc(1,sizeof(z_stream));
+        if ( fp->gz_stream == NULL )
+        {
+            if (hts_verbose >= 1) {
+                fprintf(stderr, "[E::%s] %s\n", __func__, strerror(errno));
+            }
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        ret = inflateInit2(fp->gz_stream, -15);
+        if (ret != Z_OK)
+        {
+            if (hts_verbose >= 1) {
+                fprintf(stderr, "[E::%s] inflateInit2 failed: %s\n",
+                        __func__, bgzf_zerr(ret, fp->gz_stream));
+            }
+            // Do not leave an uninitialised stream installed: a later call
+            // would otherwise hand it to inflate().
+            free(fp->gz_stream);
+            fp->gz_stream = NULL;
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->gz_stream->avail_in = count - nskip;
+        fp->gz_stream->next_in  = cblock + nskip;
+        count = inflate_gzip_block(fp, 1);
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        if ( fp->idx_build_otf ) return -1; // cannot build index for gzip
+        return 0;
+    }
+    size = count;
+    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+    // A block can never be shorter than its own header; without this check
+    // "remaining" goes negative and becomes a huge read size below.
+    if (block_length < BLOCK_HEADER_LENGTH)
+    {
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    compressed_block = (uint8_t*)fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = hread(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+    if (count != remaining) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    size += count;
+    if ((count = inflate_block(fp, block_length)) < 0) {
+        if (hts_verbose >= 2) fprintf(stderr, "[E::%s] inflate_block error %d\n", __func__, count);
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+    fp->block_address = block_address;
+    fp->block_length = count;
+    if ( fp->idx_build_otf )
+    {
+        // NOTE(review): the int result of bgzf_index_add_block() is not
+        // checked here; confirm whether it can report failure.
+        bgzf_index_add_block(fp);
+        fp->idx->ublock_addr += count;
+    }
+    cache_block(fp, size);
+    return 0;
+}
+
+/*
+ * Read up to `length` bytes of uncompressed data from `fp` into `data`.
+ *
+ * Whenever the current block is exhausted the next one is fetched with
+ * bgzf_read_block().  Returns the number of bytes copied (shorter than
+ * `length` only when no further data is available), 0 when `length` is 0,
+ * or -1 if a block could not be read.
+ */
+ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
+{
+    uint8_t *dst = (uint8_t*)data;
+    ssize_t total = 0;
+
+    if (length == 0) return 0;
+    assert(fp->is_write == 0);
+
+    while ((size_t)total < length) {
+        int avail = fp->block_length - fp->block_offset;
+        if (avail <= 0) {
+            // Current block is used up: pull in the next one.
+            int rc = bgzf_read_block(fp);
+            if (rc != 0) {
+                if (hts_verbose >= 2) {
+                    fprintf(stderr, "[E::%s] bgzf_read_block error %d after %zd of %zu bytes\n", __func__, rc, total, length);
+                }
+                fp->errcode |= BGZF_ERR_ZLIB;
+                return -1;
+            }
+            avail = fp->block_length - fp->block_offset;
+            if (avail <= 0) break; // nothing more to read
+        }
+
+        // Copy whichever is smaller: what the caller still wants or what
+        // the current block still holds.
+        size_t wanted = length - (size_t)total;
+        int n = (wanted < (size_t)avail) ? (int)wanted : avail;
+        const uint8_t *src = (uint8_t*)fp->uncompressed_block + fp->block_offset;
+        memcpy(dst + total, src, n);
+        fp->block_offset += n;
+        total += n;
+    }
+
+    // Block fully consumed: note where the next block starts and reset.
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_length = 0;
+        fp->block_offset = 0;
+    }
+    fp->uncompressed_address += total;
+    return total;
+}
+
+/* Read `length` bytes straight from the underlying stream, bypassing any
+ * BGZF block handling.  Returns whatever hread() returns. */
+ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
+{
+    ssize_t got = hread(fp->fp, data, length);
+    return got;
+}
+
+#ifdef BGZF_MT
+
+// Per-thread state of one compression worker (see worker_aux()).
+typedef struct {
+ struct bgzf_mtaux_t *mt; // shared multi-threading state this worker belongs to
+ void *buf; // scratch buffer that receives bgzf_compress() output
+ // i: worker index, also the first queue slot this worker compresses;
+ // errcode: BGZF_ERR_* bits raised while compressing the last batch;
+ // toproc: set by mt_flush_queue() when a batch is waiting, cleared by the worker;
+ // compress_level: level handed to bgzf_compress().
+ int i, errcode, toproc, compress_level;
+} worker_t;
+
+// State shared between the writer (master) thread and the compression workers.
+typedef struct bgzf_mtaux_t {
+ // n_threads: number of workers, including worker 0 which runs on the master thread;
+ // n_blks: capacity of the block queue (n_threads * n_sub_blks in bgzf_mt());
+ // curr: number of blocks currently queued; done: set by mt_destroy() to stop workers.
+ int n_threads, n_blks, curr, done;
+ volatile int proc_cnt; // workers that finished the current batch (atomically incremented)
+ void **blk; // n_blks buffers: uncompressed on queueing, compressed after a batch
+ int *len; // length of the data held in each blk[] entry
+ worker_t *w; // per-worker state, n_threads entries
+ pthread_t *tid; // thread ids; tid[0] is unused
+ pthread_mutex_t lock; // protects toproc/done signalling
+ pthread_cond_t cv; // signalled when a batch is ready or on shutdown
+} mtaux_t;
+
+// Run one batch for worker `w`: wait until signalled, then compress every
+// queued block whose slot is w->i, w->i + n_threads, w->i + 2*n_threads, ...
+// Returns 1 when the worker was told to quit (mt->done), 0 after a batch.
+static int worker_aux(worker_t *w)
+{
+ int i, stop = 0;
+ // wait for condition: to process or all done
+ pthread_mutex_lock(&w->mt->lock);
+ while (!w->toproc && !w->mt->done)
+ pthread_cond_wait(&w->mt->cv, &w->mt->lock);
+ if (w->mt->done) stop = 1;
+ w->toproc = 0; // consume the request so the next wait blocks again
+ pthread_mutex_unlock(&w->mt->lock);
+ if (stop) return 1; // to quit the thread
+ w->errcode = 0;
+ for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {
+ size_t clen = BGZF_MAX_BLOCK_SIZE;
+ int ret = bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->compress_level);
+ if (ret != 0) {
+ if (hts_verbose >= 2) fprintf(stderr, "[E::%s] bgzf_compress error %d\n", __func__, ret);
+ w->errcode |= BGZF_ERR_ZLIB; // Report error
+ // We're not going to do any more, so set remaining lengths to 0
+ for (; i < w->mt->curr; i += w->mt->n_threads) w->mt->len[i] = 0;
+ break; // Give up
+ } else {
+ // Replace the queued uncompressed data with its compressed form.
+ memcpy(w->mt->blk[i], w->buf, clen);
+ w->mt->len[i] = clen;
+ }
+ }
+ // Tell mt_flush_queue(), which polls proc_cnt, that this worker is finished.
+ __sync_fetch_and_add(&w->mt->proc_cnt, 1);
+ return 0;
+}
+
+/* Thread entry point: keep running batches until worker_aux() reports that
+ * the worker should quit. */
+static void *mt_worker(void *data)
+{
+    worker_t *self = (worker_t*)data;
+    for (;;) {
+        if (worker_aux(self) != 0) break;
+    }
+    return 0;
+}
+
+/*
+ * Enable multi-threaded compression on a BGZF handle opened for writing.
+ *
+ * fp          handle; must be in write mode and not already threaded
+ * n_threads   total number of workers (worker 0 runs on the calling thread);
+ *             must be greater than 1
+ * n_sub_blks  queue slots per worker; must be positive
+ *
+ * Returns 0 on success.  Returns -1, leaving `fp` untouched and releasing
+ * everything acquired so far, if the arguments are unusable, an allocation
+ * fails, or a worker thread cannot be started.
+ */
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+{
+    int i, n_started = 0;
+    mtaux_t *mt;
+    pthread_attr_t attr;
+    if (!fp->is_write || fp->mt || n_threads <= 1 || n_sub_blks <= 0) return -1;
+    mt = (mtaux_t*)calloc(1, sizeof(mtaux_t));
+    if (mt == NULL) return -1;
+    mt->n_threads = n_threads;
+    mt->n_blks = n_threads * n_sub_blks;
+    mt->len = (int*)calloc(mt->n_blks, sizeof(int));
+    mt->blk = (void**)calloc(mt->n_blks, sizeof(void*));
+    mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
+    mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t));
+    if (!mt->len || !mt->blk || !mt->tid || !mt->w) goto fail_mem;
+    for (i = 0; i < mt->n_blks; ++i) {
+        mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
+        if (mt->blk[i] == NULL) goto fail_mem;
+    }
+    for (i = 0; i < mt->n_threads; ++i) {
+        mt->w[i].i = i;
+        mt->w[i].mt = mt;
+        mt->w[i].compress_level = fp->compress_level;
+        mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
+        if (mt->w[i].buf == NULL) goto fail_mem;
+    }
+    if (pthread_mutex_init(&mt->lock, 0) != 0) goto fail_mem;
+    if (pthread_cond_init(&mt->cv, 0) != 0) goto fail_lock;
+    if (pthread_attr_init(&attr) != 0) goto fail_cv;
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    for (i = 1; i < mt->n_threads; ++i) { // worker 0 is effectively launched by the master thread
+        if (pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]) != 0) break;
+        ++n_started;
+    }
+    pthread_attr_destroy(&attr); // the attribute object is only needed for creation
+    if (n_started != mt->n_threads - 1) {
+        // Could not start every worker: ask the ones that did start to
+        // quit and wait for them before tearing the state down.
+        pthread_mutex_lock(&mt->lock);
+        mt->done = 1;
+        pthread_cond_broadcast(&mt->cv);
+        pthread_mutex_unlock(&mt->lock);
+        for (i = 1; i <= n_started; ++i) pthread_join(mt->tid[i], 0);
+        goto fail_cv;
+    }
+    fp->mt = mt;
+    return 0;
+
+ fail_cv:
+    pthread_cond_destroy(&mt->cv);
+ fail_lock:
+    pthread_mutex_destroy(&mt->lock);
+ fail_mem:
+    // Arrays come from calloc(), so entries never filled in are NULL.
+    if (mt->blk) for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+    if (mt->w) for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+    free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+    free(mt);
+    return -1;
+}
+
+/* Stop all worker threads and release everything owned by `mt`,
+ * including `mt` itself. */
+static void mt_destroy(mtaux_t *mt)
+{
+    int k;
+
+    // Raise the shutdown flag under the lock and wake every waiting worker.
+    pthread_mutex_lock(&mt->lock);
+    mt->done = 1;
+    mt->proc_cnt = 0;
+    pthread_cond_broadcast(&mt->cv);
+    pthread_mutex_unlock(&mt->lock);
+
+    // Worker 0 runs on the master thread, so only tid[1..] were created.
+    k = 1;
+    while (k < mt->n_threads) {
+        pthread_join(mt->tid[k], 0);
+        ++k;
+    }
+
+    // Release heap data: queue buffers first, then per-worker scratch space.
+    for (k = 0; k < mt->n_blks; ++k)
+        free(mt->blk[k]);
+    for (k = 0; k < mt->n_threads; ++k)
+        free(mt->w[k].buf);
+    free(mt->blk);
+    free(mt->len);
+    free(mt->w);
+    free(mt->tid);
+
+    pthread_cond_destroy(&mt->cv);
+    pthread_mutex_destroy(&mt->lock);
+    free(mt);
+}
+
+/* Move the pending uncompressed data of `fp` into the next free queue
+ * slot and empty the write buffer.  The caller must ensure a slot is free. */
+static void mt_queue(BGZF *fp)
+{
+    mtaux_t *q = fp->mt;
+    int slot;
+
+    assert(q->curr < q->n_blks); // guaranteed by the caller
+    slot = q->curr;
+    q->len[slot] = fp->block_offset;
+    memcpy(q->blk[slot], fp->uncompressed_block, fp->block_offset);
+    fp->block_offset = 0;
+    q->curr = slot + 1;
+}
+
+// Compress every queued block using all workers, then write the results to
+// the underlying file in queue order.  Resets the queue to empty.
+// Returns 0 if fp->errcode is clear afterwards, -1 otherwise.
+static int mt_flush_queue(BGZF *fp)
+{
+ int i;
+ mtaux_t *mt = fp->mt;
+ // signal all the workers to compress
+ pthread_mutex_lock(&mt->lock);
+ for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
+ mt->proc_cnt = 0;
+ pthread_cond_broadcast(&mt->cv);
+ pthread_mutex_unlock(&mt->lock);
+ // worker 0 is doing things here
+ worker_aux(&mt->w[0]);
+ // wait for all the threads to complete
+ // (busy-wait: each worker atomically increments proc_cnt when it is done)
+ while (mt->proc_cnt < mt->n_threads);
+ // dump data to disk
+ for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
+ if (fp->errcode == 0) {
+ /* Only try to write if all the threads worked, as otherwise we
+ could get a file with holes in it */
+ for (i = 0; i < mt->curr; ++i) {
+ if (hwrite(fp->fp, mt->blk[i], mt->len[i]) != mt->len[i]) {
+ fp->errcode |= BGZF_ERR_IO;
+ break;
+ }
+ }
+ }
+ mt->curr = 0; // queue is empty again, whether or not the write succeeded
+ return (fp->errcode == 0)? 0 : -1;
+}
+
+/* Flush only when necessary: in threaded mode pending data is queued and
+ * the queue is compressed/written only once it is full; otherwise this is
+ * a plain bgzf_flush(). */
+static int lazy_flush(BGZF *fp)
+{
+    if (!fp->mt) return bgzf_flush(fp);
+
+    if (fp->block_offset) mt_queue(fp);
+    if (fp->mt->curr < fp->mt->n_blks) return 0; // room left in the queue
+    return mt_flush_queue(fp);
+}
+
+#else // ~ #ifdef BGZF_MT
+
+/* Built without BGZF_MT: threading is unavailable, so the request is
+ * ignored and 0 is returned. */
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+{
+    (void)fp;
+    (void)n_threads;
+    (void)n_sub_blks;
+    return 0;
+}
+
+/* Built without BGZF_MT: there is no queue, so always flush immediately. */
+static inline int lazy_flush(BGZF *fp)
+{
+    int rc = bgzf_flush(fp);
+    return rc;
+}
+
+#endif // ~ #ifdef BGZF_MT
+
+/*
+ * Compress and write out any data buffered in `fp`.
+ *
+ * Does nothing for handles that are not open for writing.  In threaded mode
+ * the pending data is queued and the whole queue is flushed.  Returns 0 on
+ * success, -1 on a compression or write failure.
+ */
+int bgzf_flush(BGZF *fp)
+{
+    if (!fp->is_write) return 0;
+#ifdef BGZF_MT
+    if (fp->mt) {
+        if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
+        return mt_flush_queue(fp);
+    }
+#endif
+    while (fp->block_offset > 0) {
+        int clen;
+
+        // Record the block in the on-the-fly index before it is compressed.
+        if ( fp->idx_build_otf ) {
+            bgzf_index_add_block(fp);
+            fp->idx->ublock_addr += fp->block_offset;
+        }
+
+        clen = deflate_block(fp, fp->block_offset);
+        if (clen < 0) {
+            if (hts_verbose >= 3) fprintf(stderr, "[E::%s] deflate_block error %d\n", __func__, clen);
+            return -1;
+        }
+
+        if (hwrite(fp->fp, fp->compressed_block, clen) != clen) {
+            if (hts_verbose >= 1) fprintf(stderr, "[E::%s] hwrite error (wrong size)\n", __func__);
+            fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+            return -1;
+        }
+
+        fp->block_address += clen;
+    }
+    return 0;
+}
+
+/* Flush (lazily) if appending `size` more bytes would overflow the current
+ * block; otherwise do nothing and return 0. */
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+    if (fp->block_offset + size <= BGZF_BLOCK_SIZE) return 0;
+    return lazy_flush(fp);
+}
+
+/*
+ * Write `length` bytes from `data` to `fp`.
+ *
+ * Uncompressed handles pass the data straight to hwrite().  Otherwise the
+ * data is appended to the block buffer, and each time the buffer reaches
+ * BGZF_BLOCK_SIZE it is handed to lazy_flush().  Returns the number of
+ * bytes accepted, or -1 if a flush fails.
+ */
+ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
+{
+    if ( !fp->is_compressed )
+        return hwrite(fp->fp, data, length);
+
+    const uint8_t *src = (const uint8_t*)data;
+    ssize_t left = length;
+    assert(fp->is_write);
+    while (left > 0) {
+        uint8_t *block = (uint8_t*)fp->uncompressed_block;
+        int room = BGZF_BLOCK_SIZE - fp->block_offset;
+        int n = (room > left) ? (int)left : room;
+
+        memcpy(block + fp->block_offset, src, n);
+        fp->block_offset += n;
+        src += n;
+        left -= n;
+
+        // Block is full: queue or write it before accepting more data.
+        if (fp->block_offset == BGZF_BLOCK_SIZE && lazy_flush(fp) != 0)
+            return -1;
+    }
+    return length - left;
+}
+
+/* Write `length` bytes straight to the underlying stream, bypassing any
+ * BGZF block handling.  Returns whatever hwrite() returns. */
+ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
+{
+    ssize_t put = hwrite(fp->fp, data, length);
+    return put;
+}
+
+/*
+ * Flush, finalise and close a BGZF handle.
+ *
+ * For compressed output the buffered data is flushed and an empty block is
+ * appended before the stream is closed.  Returns 0 on success.  Returns -1
+ * if `fp` is NULL or a step fails; in the failure case `fp` is NOT freed,
+ * so callers can still inspect fp->errcode.
+ */
+int bgzf_close(BGZF* fp)
+{
+    int status, eof_len;
+
+    if (fp == 0) return -1;
+
+    if (fp->is_write && fp->is_compressed) {
+        if (bgzf_flush(fp) != 0) return -1;
+
+        fp->compress_level = -1;
+        eof_len = deflate_block(fp, 0); // write an empty block
+        if (eof_len < 0) {
+            if (hts_verbose >= 3) fprintf(stderr, "[E::%s] deflate_block error %d\n", __func__, eof_len);
+            return -1;
+        }
+        if (hwrite(fp->fp, fp->compressed_block, eof_len) < 0
+            || hflush(fp->fp) != 0) {
+            if (hts_verbose >= 1) fprintf(stderr, "[E::%s] file write error\n", __func__);
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+#ifdef BGZF_MT
+        if (fp->mt) mt_destroy(fp->mt);
+#endif
+    }
+
+    if ( fp->is_gzip ) {
+        // Plain gzip streams carry a zlib state that must be torn down
+        // with the call matching the direction it was used in.
+        status = fp->is_write ? deflateEnd(fp->gz_stream)
+                              : inflateEnd(fp->gz_stream);
+        if (status != Z_OK && hts_verbose >= 1)
+            fprintf(stderr, "[E::%s] inflateEnd/deflateEnd failed: %s\n",
+                    __func__, bgzf_zerr(status, NULL));
+        free(fp->gz_stream);
+    }
+
+    status = hclose(fp->fp);
+    if (status != 0) return -1;
+
+    bgzf_index_destroy(fp);
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free_cache(fp);
+    free(fp);
+    return 0;
+}
+
+/* Record the requested block-cache size on `fp`; a NULL handle is ignored. */
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+    if (!fp) return;
+    fp->cache_size = cache_size;
+}
+
+/*
+ * Check whether the file ends with the 28-byte empty BGZF block.
+ *
+ * Returns 1 if the marker is present, 0 if it is absent, 2 if the stream
+ * cannot be seeked (ESPIPE), and -1 on any other I/O error.  On success
+ * the original file position is restored.
+ */
+int bgzf_check_EOF(BGZF *fp)
+{
+    uint8_t tail[28];
+    off_t saved = htell(fp->fp);
+
+    if (hseek(fp->fp, -28, SEEK_END) < 0) {
+        if (errno != ESPIPE) return -1;
+        hclearerr(fp->fp);
+        return 2;
+    }
+    if ( hread(fp->fp, tail, 28) != 28 ) return -1;
+    if ( hseek(fp->fp, saved, SEEK_SET) < 0 ) return -1;
+
+    if (memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", tail, 28) != 0)
+        return 0;
+    return 1;
+}
+
+/*
+ * Seek to a BGZF virtual offset: the upper bits of `pos` (pos >> 16) give
+ * the file position of the block, the low 16 bits the offset within it.
+ *
+ * Only SEEK_SET on a handle opened for reading is supported; anything else
+ * sets BGZF_ERR_MISUSE.  Returns 0 on success, -1 on failure.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    if (fp->is_write || where != SEEK_SET) {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+
+    const int64_t caddr = pos >> 16;
+    const int within = pos & 0xFFFF;
+
+    if (hseek(fp->fp, caddr, SEEK_SET) < 0) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+
+    fp->block_length = 0; // indicates current block has not been loaded
+    fp->block_address = caddr;
+    fp->block_offset = within;
+    return 0;
+}
+
+/*
+ * Test whether the file `fn` starts with the 16-byte BGZF magic.
+ *
+ * Returns 1 if it does, 0 if it does not (including when the file cannot
+ * be opened or is shorter than 16 bytes), and -1 if closing the file fails.
+ */
+int bgzf_is_bgzf(const char *fn)
+{
+    uint8_t head[16];
+    int got;
+    hFILE *in = hopen(fn, "r");
+
+    if (in == 0) return 0;
+    got = hread(in, head, 16);
+    if ( hclose(in) < 0 ) return -1;
+
+    return (got == 16 && memcmp(g_magic, head, 16) == 0) ? 1 : 0;
+}
+
+/*
+ * Read one byte from `fp`.
+ *
+ * Returns the byte as a non-negative value, -1 at end of file, or -2 if
+ * the next block could not be read.
+ */
+int bgzf_getc(BGZF *fp)
+{
+    if (fp->block_offset >= fp->block_length) {
+        if (bgzf_read_block(fp) != 0) return -2; /* error */
+        if (fp->block_length == 0) return -1; /* end-of-file */
+    }
+
+    const unsigned char *block = (unsigned char*)fp->uncompressed_block;
+    int ch = block[fp->block_offset];
+    fp->block_offset++;
+
+    // Block fully consumed: note where the next block starts and reset.
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_length = 0;
+        fp->block_offset = 0;
+    }
+    fp->uncompressed_address++;
+    return ch;
+}
+
+// Round x up, in place, to the next power of two (a value that already is a
+// power of two is left unchanged).  The bit-smearing shifts stop at 16, so
+// the result is exact only for values that fit in 32 bits.  x is evaluated
+// several times and must be a modifiable lvalue without side effects.
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Read one `delim`-terminated line from `fp` into `str`.
+ *
+ * The delimiter is consumed but not stored; when `delim` is '\n' a trailing
+ * '\r' is stripped as well.  `str` is grown as needed and NUL-terminated.
+ *
+ * Returns the length of the line on success, -1 at end of file with no
+ * data read, and -2 on error (a block could not be read or memory could
+ * not be allocated).  On an allocation failure `str` keeps its previous
+ * buffer and capacity.
+ */
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+    int l, state = 0;
+    unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+    str->l = 0;
+    do {
+        if (fp->block_offset >= fp->block_length) {
+            if (bgzf_read_block(fp) != 0) { state = -2; break; }
+            if (fp->block_length == 0) { state = -1; break; }
+        }
+        // Scan the rest of the current block for the delimiter.
+        for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+        if (l < fp->block_length) state = 1;
+        l -= fp->block_offset;
+        if (str->l + l + 1 >= str->m) {
+            // Grow through a temporary so a failed realloc() neither loses
+            // the old buffer nor leaves str->m describing memory we lack.
+            size_t new_m = str->l + l + 2;
+            char *new_s;
+            kroundup32(new_m);
+            new_s = (char*)realloc(str->s, new_m);
+            if (new_s == NULL) return -2;
+            str->s = new_s;
+            str->m = new_m;
+        }
+        memcpy(str->s + str->l, buf + fp->block_offset, l);
+        str->l += l;
+        fp->block_offset += l + 1; // also step over the delimiter, if any
+        if (fp->block_offset >= fp->block_length) {
+            fp->block_address = htell(fp->fp);
+            fp->block_offset = 0;
+            fp->block_length = 0;
+        }
+    } while (state == 0);
+    // A read error must not be reported as a successfully read (partial) line.
+    if (state < -1) return state;
+    if (str->l == 0 && state < 0) return state;
+    // Count the delimiter too when one was consumed, so the uncompressed
+    // position stays in step with the data actually read.
+    fp->uncompressed_address += str->l + (state == 1 ? 1 : 0);
+    if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--;
+    str->s[str->l] = 0;
+    return str->l;
+}
+
+/* Free the index attached to `fp`, if any, and switch off on-the-fly
+ * index building. */
+void bgzf_index_destroy(BGZF *fp)
+{
+    if (fp->idx) {
+        free(fp->idx->offs);
+        free(fp->idx);
+        fp->idx = NULL;
+        fp->idx_build_otf = 0;
+    }
+}
+
+/* Discard any existing index on `fp` and start building a fresh one on the
+ * fly.  Returns 0 on success, -1 if the index could not be allocated. */
+int bgzf_index_build_init(BGZF *fp)
+{
+    bgzidx_t *fresh;
+
+    bgzf_index_destroy(fp);
+    fresh = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    fp->idx = fresh;
+    if ( fresh == NULL ) return -1;
+    fp->idx_build_otf = 1; // build index on the fly
+    return 0;
+}
+
+/*
+ * Append an index record for the block about to be processed: its
+ * uncompressed start (idx->ublock_addr) and compressed start
+ * (fp->block_address).
+ *
+ * Returns 0 on success.  Returns -1 if the table could not be grown; in
+ * that case the index is left exactly as it was (same entries, same
+ * count), so callers that ignore the result keep a valid, if incomplete,
+ * index.
+ */
+int bgzf_index_add_block(BGZF *fp)
+{
+    bgzidx_t *idx = fp->idx;
+    if ( idx->noffs + 1 > idx->moffs )
+    {
+        bgzidx1_t *grown;
+        idx->moffs = idx->noffs + 1;
+        kroundup32(idx->moffs);
+        // Grow through a temporary: assigning realloc()'s result straight
+        // to idx->offs would leak the table and leave a NULL behind.
+        grown = (bgzidx1_t*) realloc(idx->offs, idx->moffs*sizeof(bgzidx1_t));
+        if ( !grown )
+        {
+            idx->moffs = idx->noffs; // capacity did not change
+            return -1;
+        }
+        idx->offs = grown;
+    }
+    idx->offs[ idx->noffs ].uaddr = idx->ublock_addr;
+    idx->offs[ idx->noffs ].caddr = fp->block_address;
+    idx->noffs++; // count the record only once it is fully stored
+    return 0;
+}
+
+/* Write `x` to `f` as 8 bytes, byte-swapping first on big-endian hosts.
+ * Returns 0 on success, -1 if the write fails. */
+static inline int fwrite_uint64(uint64_t x, FILE *f)
+{
+    uint64_t out = x;
+    if (ed_is_big()) out = ed_swap_8(out);
+    return (fwrite(&out, sizeof out, 1, f) == 1) ? 0 : -1;
+}
+
+/*
+ * Flush `fp` and write its index to the file named `bname` followed by
+ * `suffix` (or just `bname` when `suffix` is NULL).
+ *
+ * The file holds the record count followed by, for each record after the
+ * first, the compressed and uncompressed block addresses.  Returns 0 on
+ * success, -1 on any failure.
+ */
+int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
+{
+    if (bgzf_flush(fp) != 0) return -1;
+
+    assert(fp->idx);
+
+    // Build "<bname><suffix>" when a suffix was supplied.
+    char *path = NULL;
+    if ( suffix )
+    {
+        int blen = strlen(bname), slen = strlen(suffix);
+        path = (char*) malloc(blen + slen + 1);
+        if ( !path ) return -1;
+        memcpy(path, bname, blen);
+        memcpy(path + blen, suffix, slen + 1);
+    }
+
+    FILE *out = fopen(path ? path : bname, "wb");
+    free(path);
+    if ( out == NULL ) {
+        if (hts_verbose > 1)
+        {
+            fprintf(stderr, "[E::%s] Error opening %s%s : %s\n",
+                    __func__, bname, suffix ? suffix : "", strerror(errno));
+        }
+        return -1;
+    }
+
+    // Note that the index contains one extra record when indexing files opened
+    // for reading. The terminating record is not present when opened for writing.
+    // This is not a bug.
+
+    int k;
+    int failed = fwrite_uint64(fp->idx->noffs - 1, out) < 0;
+    for (k = 1; !failed && k < fp->idx->noffs; k++)
+    {
+        failed = fwrite_uint64(fp->idx->offs[k].caddr, out) < 0
+              || fwrite_uint64(fp->idx->offs[k].uaddr, out) < 0;
+    }
+
+    if (failed)
+    {
+        if (hts_verbose > 1)
+        {
+            fprintf(stderr, "[E::%s] Error writing to %s%s : %s\n",
+                    __func__, bname, suffix ? suffix : "", strerror(errno));
+        }
+        fclose(out);
+        return -1;
+    }
+
+    if (fclose(out) < 0)
+    {
+        if (hts_verbose > 1)
+        {
+            fprintf(stderr, "[E::%s] Error on closing %s%s : %s\n",
+                    __func__, bname, suffix ? suffix : "", strerror(errno));
+        }
+        return -1;
+    }
+    return 0;
+}
+
+/* Read 8 bytes from `f` into *xptr, byte-swapping afterwards on big-endian
+ * hosts.  Returns 0 on success, -1 if the read fails. */
+static inline int fread_uint64(uint64_t *xptr, FILE *f)
+{
+    size_t items = fread(xptr, sizeof *xptr, 1, f);
+    if (items != 1) return -1;
+    if (ed_is_big()) ed_swap_8p(xptr);
+    return 0;
+}
+
+/*
+ * Load a BGZF index from the file named `bname` followed by `suffix` (or
+ * just `bname` when `suffix` is NULL) and attach it to `fp`.
+ *
+ * The file holds a record count followed by that many (compressed address,
+ * uncompressed address) pairs; an implicit first record (0, 0) is added.
+ * Returns 0 on success.  On failure returns -1 with fp->idx set to NULL.
+ */
+int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
+{
+    char *tmp = NULL;
+    if ( suffix )
+    {
+        int blen = strlen(bname);
+        int slen = strlen(suffix);
+        tmp = (char*) malloc(blen + slen + 1);
+        if ( !tmp ) return -1;
+        memcpy(tmp,bname,blen);
+        memcpy(tmp+blen,suffix,slen+1);
+    }
+
+    FILE *idx = fopen(tmp?tmp:bname,"rb");
+    free(tmp);
+    if ( !idx ) {
+        if (hts_verbose > 1) {
+            fprintf(stderr, "[E::%s] Error opening %s%s : %s\n",
+                    __func__, bname, suffix ? suffix : "", strerror(errno));
+        }
+        return -1;
+    }
+
+    fp->idx = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    if (fp->idx == NULL) goto fail;
+    uint64_t x;
+    if (fread_uint64(&x, idx) < 0) goto fail;
+
+    fp->idx->noffs = fp->idx->moffs = x + 1;
+    // The count comes from the file, so validate it before trusting it:
+    // it must not wrap to 0, must survive storage in noffs unchanged, and
+    // the table size must be representable.
+    if (x + 1 == 0
+        || (uint64_t)fp->idx->noffs != x + 1
+        || x + 1 > SIZE_MAX / sizeof(bgzidx1_t)) goto fail;
+    fp->idx->offs = (bgzidx1_t*) malloc(fp->idx->moffs*sizeof(bgzidx1_t));
+    if (fp->idx->offs == NULL) goto fail;
+    fp->idx->offs[0].caddr = fp->idx->offs[0].uaddr = 0;
+
+    int i;
+    for (i=1; i<fp->idx->noffs; i++)
+    {
+        if (fread_uint64(&fp->idx->offs[i].caddr, idx) < 0) goto fail;
+        if (fread_uint64(&fp->idx->offs[i].uaddr, idx) < 0) goto fail;
+    }
+
+    if (fclose(idx) != 0)
+    {
+        // The stream is gone even when fclose() reports an error; make
+        // sure the failure path does not close it a second time.
+        idx = NULL;
+        goto fail;
+    }
+    return 0;
+
+ fail:
+    if (hts_verbose > 1)
+    {
+        fprintf(stderr, "[E::%s] Error reading %s%s : %s\n",
+                __func__, bname, suffix ? suffix : "", strerror(errno));
+    }
+    if (idx) fclose(idx);
+    if (fp->idx) {
+        free(fp->idx->offs);
+        free(fp->idx);
+        fp->idx = NULL;
+    }
+    return -1;
+}
+
+/*
+ * Seek to byte `uoffset` of the uncompressed data.
+ *
+ * Uncompressed files are seeked directly.  Compressed files need an index
+ * on `fp`: the block holding `uoffset` is located by binary search, loaded,
+ * and the in-block offset is set.  `where` is not consulted; the offset is
+ * always taken as absolute.
+ *
+ * Returns 0 on success.  Returns -1 and sets fp->errcode on failure:
+ * BGZF_ERR_IO for a missing index or a seek/read failure, BGZF_ERR_MISUSE
+ * for a negative offset or an index without entries.
+ */
+int bgzf_useek(BGZF *fp, long uoffset, int where)
+{
+    if ( !fp->is_compressed )
+    {
+        if (hseek(fp->fp, uoffset, SEEK_SET) < 0)
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        fp->block_length = 0; // indicates current block has not been loaded
+        fp->block_address = uoffset;
+        fp->block_offset = 0;
+        if (bgzf_read_block(fp) < 0) {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        fp->uncompressed_address = uoffset;
+        return 0;
+    }
+
+    if ( !fp->idx )
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+
+    // A negative offset or an empty index would make the search below end
+    // before the first entry and index the table out of bounds.
+    if ( uoffset < 0 || fp->idx->noffs <= 0 )
+    {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+
+    // binary search for the last block starting at or before uoffset
+    int ilo = 0, ihi = fp->idx->noffs - 1;
+    while ( ilo<=ihi )
+    {
+        int mid = ilo + (ihi - ilo) / 2;
+        if ( uoffset < fp->idx->offs[mid].uaddr ) ihi = mid - 1;
+        else ilo = mid + 1;
+    }
+    int i = ilo-1;
+    if ( i < 0 )
+    {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+    if (hseek(fp->fp, fp->idx->offs[i].caddr, SEEK_SET) < 0)
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_length = 0; // indicates current block has not been loaded
+    fp->block_address = fp->idx->offs[i].caddr;
+    fp->block_offset = 0;
+    if ( bgzf_read_block(fp) < 0 ) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    if ( uoffset - fp->idx->offs[i].uaddr > 0 )
+    {
+        fp->block_offset = uoffset - fp->idx->offs[i].uaddr;
+        assert( fp->block_offset <= fp->block_length ); // todo: skipped, unindexed, blocks
+    }
+    fp->uncompressed_address = uoffset;
+    return 0;
+}
+
+/* Report the current position in the uncompressed data, as tracked in
+ * fp->uncompressed_address. */
+long bgzf_utell(BGZF *fp)
+{
+    long upos = fp->uncompressed_address; // currently maintained only when reading
+    return upos;
+}
diff --git a/htslib/bgzip.c b/htslib/bgzip.c
new file mode 100644
index 0000000..fa005b9
--- /dev/null
+++ b/htslib/bgzip.c
@@ -0,0 +1,311 @@
+/* bgzip.c -- Block compression/decompression utility.
+
+ Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
+ Copyright (C) 2010, 2013-2016 Genome Research Ltd.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notices and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <getopt.h>
+#include <sys/stat.h>
+#include "htslib/bgzf.h"
+#include "htslib/hts.h"
+
+// Size in bytes of the buffer main() allocates for copying data in and out.
+static const int WINDOW_SIZE = 64 * 1024;
+
+/* Print a printf-style message to stderr and terminate the program with
+ * EXIT_FAILURE.  Does not return. */
+static void error(const char *format, ...)
+{
+    va_list args;
+
+    va_start(args, format);
+    (void) vfprintf(stderr, format, args);
+    va_end(args);
+
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Ask the user whether the existing file `fn` may be overwritten.
+ *
+ * The question is only asked when standard input is a terminal.  Returns 1
+ * if the reply starts with 'y' or 'Y', 0 otherwise.  errno is preserved
+ * across the call.
+ */
+static int confirm_overwrite(const char *fn)
+{
+    const int saved_errno = errno;
+    int answer = 0;
+
+    if (isatty(STDIN_FILENO)) {
+        char reply;
+        fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
+        if (scanf("%c", &reply) == 1)
+            answer = (reply == 'Y' || reply == 'y');
+    }
+
+    errno = saved_errno;
+    return answer;
+}
+
+/* Print the version and command-line help to stderr.  Always returns 1 so
+ * callers can use it directly as the exit status. */
+static int bgzip_main_usage(void)
+{
+    static const char *const option_help[] = {
+        " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n",
+        " -c, --stdout write on standard output, keep original files unchanged\n",
+        " -d, --decompress decompress\n",
+        " -f, --force overwrite files without asking\n",
+        " -h, --help give this help\n",
+        " -i, --index compress and create BGZF index\n",
+        " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n",
+        " -r, --reindex (re)index compressed file\n",
+        " -s, --size INT decompress INT bytes (uncompressed size)\n",
+        " -@, --threads INT number of compression threads to use [1]\n",
+    };
+    size_t k;
+
+    fputs("\n", stderr);
+    fprintf(stderr, "Version: %s\n", hts_version());
+    fputs("Usage: bgzip [OPTIONS] [FILE] ...\n", stderr);
+    fputs("Options:\n", stderr);
+    for (k = 0; k < sizeof option_help / sizeof option_help[0]; k++)
+        fputs(option_help[k], stderr);
+    fputs("\n", stderr);
+    return 1;
+}
+
+/*
+ * bgzip entry point.  Depending on the options it
+ *   - compresses FILE (or stdin) to FILE.gz (or stdout), optionally writing
+ *     a .gzi index (default mode);
+ *   - rebuilds the index of an existing BGZF file (-r);
+ *   - decompresses FILE.gz (or stdin), optionally only the uncompressed
+ *     byte range selected with -b/-s (-d, or implied by -b).
+ * Returns 0 on success and 1 on usage or I/O errors; some failures exit
+ * through error().
+ */
+int main(int argc, char **argv)
+{
+    int c, compress, pstdout, is_forced, index = 0, reindex = 0;
+    BGZF *fp;
+    void *buffer;
+    long start, end, size;
+    char *index_fname = NULL;
+    int threads = 1;
+
+    static const struct option loptions[] =
+    {
+        {"help", no_argument, NULL, 'h'},
+        {"offset", required_argument, NULL, 'b'},
+        {"stdout", no_argument, NULL, 'c'},
+        {"decompress", no_argument, NULL, 'd'},
+        {"force", no_argument, NULL, 'f'},
+        {"index", no_argument, NULL, 'i'},
+        {"index-name", required_argument, NULL, 'I'},
+        {"reindex", no_argument, NULL, 'r'},
+        {"size", required_argument, NULL, 's'},
+        {"threads", required_argument, NULL, '@'},
+        {"version", no_argument, NULL, 1},
+        {NULL, 0, NULL, 0}
+    };
+
+    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+    while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:r",loptions,NULL)) >= 0){
+        switch(c){
+        case 'd': compress = 0; break;
+        case 'c': pstdout = 1; break;
+        case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
+        case 's': size = atol(optarg); pstdout = 1; break;
+        case 'f': is_forced = 1; break;
+        case 'i': index = 1; break;
+        case 'I': index_fname = optarg; break;
+        case 'r': reindex = 1; compress = 0; break;
+        case '@': threads = atoi(optarg); break;
+        case 1:
+            printf(
+"bgzip (htslib) %s\n"
+"Copyright (C) 2016 Genome Research Ltd.\n", hts_version());
+            return EXIT_SUCCESS;
+        case 'h':
+        case '?': return bgzip_main_usage();
+        }
+    }
+    if (size >= 0) end = start + size;
+    if (end >= 0 && end < start) {
+        fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
+        return 1;
+    }
+    if (compress == 1) {
+        struct stat sbuf;
+        int f_src = fileno(stdin);
+
+        if ( argc>optind )
+        {
+            if ( stat(argv[optind],&sbuf)<0 )
+            {
+                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                return 1;
+            }
+
+            if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
+                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                return 1;
+            }
+
+            if (pstdout)
+            {
+                fp = bgzf_open("-", "w");
+                if (fp == NULL) {
+                    fprintf(stderr, "[bgzip] Could not write to stdout: %s\n", strerror(errno));
+                    return 1;
+                }
+            }
+            else
+            {
+                char *name = malloc(strlen(argv[optind]) + 5);
+                if (name == NULL) error("[bgzip] Out of memory\n");
+                strcpy(name, argv[optind]);
+                strcat(name, ".gz");
+                fp = bgzf_open(name, is_forced? "w" : "wx");
+                if (fp == NULL && errno == EEXIST && confirm_overwrite(name))
+                    fp = bgzf_open(name, "w");
+                if (fp == NULL) {
+                    fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
+                    free(name);
+                    return 1;
+                }
+                free(name);
+            }
+        }
+        else if (!pstdout && isatty(fileno((FILE *)stdout)) )
+            return bgzip_main_usage();
+        else if ( index && !index_fname )
+        {
+            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
+            return 1;
+        }
+        else
+        {
+            fp = bgzf_open("-", "w");
+            if (fp == NULL) {
+                fprintf(stderr, "[bgzip] Could not write to stdout: %s\n", strerror(errno));
+                return 1;
+            }
+        }
+
+        if (threads > 1)
+            bgzf_mt(fp, threads, 256);
+
+        if ( index ) bgzf_index_build_init(fp);
+        buffer = malloc(WINDOW_SIZE);
+        if (buffer == NULL) error("[bgzip] Out of memory\n");
+        while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
+            if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
+        // A read error must not be mistaken for end of input: the output
+        // would be truncated and the original file removed below.
+        if (c < 0) error("Could not read input: %s\n", strerror(errno));
+        if ( index )
+        {
+            if (index_fname) {
+                if (bgzf_index_dump(fp, index_fname, NULL) < 0)
+                    error("Could not write index to '%s'\n", index_fname);
+            } else {
+                if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0)
+                    error("Could not write index to '%s.gz.gzi'\n", argv[optind]);
+            }
+        }
+        // bgzf_close() leaves fp allocated when it fails, so errcode is readable.
+        if (bgzf_close(fp) < 0) error("Close failed: Error %d\n", fp->errcode);
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        free(buffer);
+        close(f_src);
+        return 0;
+    }
+    else if ( reindex )
+    {
+        if ( argc>optind )
+        {
+            fp = bgzf_open(argv[optind], "r");
+            if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
+        }
+        else
+        {
+            if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
+            fp = bgzf_open("-", "r");
+            if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
+        }
+
+        buffer = malloc(BGZF_BLOCK_SIZE);
+        if (buffer == NULL) error("[bgzip] Out of memory\n");
+        bgzf_index_build_init(fp);
+        int ret;
+        // Reading through the whole file is what populates the index.
+        while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
+        free(buffer);
+        if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
+
+        if ( index_fname ) {
+            if (bgzf_index_dump(fp, index_fname, NULL) < 0)
+                error("Could not write index to '%s'\n", index_fname);
+        } else {
+            if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0)
+                error("Could not write index to '%s.gzi'\n", argv[optind]);
+        }
+
+        if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
+        return 0;
+    }
+    else
+    {
+        struct stat sbuf;
+        int f_dst;
+
+        if ( argc>optind )
+        {
+            if ( stat(argv[optind],&sbuf)<0 )
+            {
+                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                return 1;
+            }
+            char *name;
+            int len = strlen(argv[optind]);
+            // Names shorter than the suffix cannot end in ".gz"; checking the
+            // length first avoids pointing before the start of the string.
+            if ( len < 3 || strcmp(argv[optind]+len-3,".gz") )
+            {
+                fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
+                return 1;
+            }
+            fp = bgzf_open(argv[optind], "r");
+            if (fp == NULL) {
+                fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
+                return 1;
+            }
+
+            if (pstdout) {
+                f_dst = fileno(stdout);
+            }
+            else {
+                const int wrflags = O_WRONLY | O_CREAT | O_TRUNC;
+                name = strdup(argv[optind]);
+                if (name == NULL) error("[bgzip] Out of memory\n");
+                name[strlen(name) - 3] = '\0';
+                f_dst = open(name, is_forced? wrflags : wrflags|O_EXCL, 0666);
+                if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name))
+                    f_dst = open(name, wrflags, 0666);
+                if (f_dst < 0) {
+                    fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
+                    free(name);
+                    return 1;
+                }
+                free(name);
+            }
+        }
+        else if (!pstdout && isatty(fileno((FILE *)stdin)) )
+            return bgzip_main_usage();
+        else
+        {
+            f_dst = fileno(stdout);
+            fp = bgzf_open("-", "r");
+            if (fp == NULL) {
+                fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
+                return 1;
+            }
+        }
+        buffer = malloc(WINDOW_SIZE);
+        if (buffer == NULL) error("[bgzip] Out of memory\n");
+        if ( start>0 )
+        {
+            // The index name is derived from the input file name, which
+            // does not exist when reading from stdin.
+            if ( argc<=optind ) error("[bgzip] Cannot seek when reading from stdin: no index file name\n");
+            if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
+            if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %ld-th (uncompressed) byte\n", start);
+        }
+        while (1) {
+            // Read a full window, or only what is left of the requested range.
+            long want = (end < 0 || end - start > WINDOW_SIZE)? WINDOW_SIZE : (end - start);
+            c = bgzf_read(fp, buffer, want);
+            if (c == 0) break;
+            if (c < 0) error("Could not read %ld bytes: Error %d\n", want, fp->errcode);
+            start += c;
+            if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c);
+            if (end >= 0 && start >= end) break;
+        }
+        free(buffer);
+        if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
+        // Only remove the input when it was a named file.
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        return 0;
+    }
+    return 0;
+}
diff --git a/htslib/cram/._cram.h b/htslib/cram/._cram.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram.h differ
diff --git a/htslib/cram/._cram_codecs.c b/htslib/cram/._cram_codecs.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_codecs.c differ
diff --git a/htslib/cram/._cram_codecs.h b/htslib/cram/._cram_codecs.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_codecs.h differ
diff --git a/htslib/cram/._cram_decode.c b/htslib/cram/._cram_decode.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_decode.c differ
diff --git a/htslib/cram/._cram_decode.h b/htslib/cram/._cram_decode.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_decode.h differ
diff --git a/htslib/cram/._cram_encode.c b/htslib/cram/._cram_encode.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_encode.c differ
diff --git a/htslib/cram/._cram_encode.h b/htslib/cram/._cram_encode.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_encode.h differ
diff --git a/htslib/cram/._cram_external.c b/htslib/cram/._cram_external.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_external.c differ
diff --git a/htslib/cram/._cram_index.c b/htslib/cram/._cram_index.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_index.c differ
diff --git a/htslib/cram/._cram_index.h b/htslib/cram/._cram_index.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_index.h differ
diff --git a/htslib/cram/._cram_io.c b/htslib/cram/._cram_io.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_io.c differ
diff --git a/htslib/cram/._cram_io.h b/htslib/cram/._cram_io.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_io.h differ
diff --git a/htslib/cram/._cram_samtools.c b/htslib/cram/._cram_samtools.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_samtools.c differ
diff --git a/htslib/cram/._cram_samtools.h b/htslib/cram/._cram_samtools.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_samtools.h differ
diff --git a/htslib/cram/._cram_stats.c b/htslib/cram/._cram_stats.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_stats.c differ
diff --git a/htslib/cram/._cram_stats.h b/htslib/cram/._cram_stats.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_stats.h differ
diff --git a/htslib/cram/._cram_structs.h b/htslib/cram/._cram_structs.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._cram_structs.h differ
diff --git a/htslib/cram/._files.c b/htslib/cram/._files.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._files.c differ
diff --git a/htslib/cram/._mFILE.c b/htslib/cram/._mFILE.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._mFILE.c differ
diff --git a/htslib/cram/._mFILE.h b/htslib/cram/._mFILE.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._mFILE.h differ
diff --git a/htslib/cram/._misc.h b/htslib/cram/._misc.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._misc.h differ
diff --git a/htslib/cram/._open_trace_file.c b/htslib/cram/._open_trace_file.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._open_trace_file.c differ
diff --git a/htslib/cram/._open_trace_file.h b/htslib/cram/._open_trace_file.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._open_trace_file.h differ
diff --git a/htslib/cram/._os.h b/htslib/cram/._os.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._os.h differ
diff --git a/htslib/cram/._pooled_alloc.c b/htslib/cram/._pooled_alloc.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._pooled_alloc.c differ
diff --git a/htslib/cram/._pooled_alloc.h b/htslib/cram/._pooled_alloc.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._pooled_alloc.h differ
diff --git a/htslib/cram/._rANS_byte.h b/htslib/cram/._rANS_byte.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._rANS_byte.h differ
diff --git a/htslib/cram/._rANS_static.c b/htslib/cram/._rANS_static.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._rANS_static.c differ
diff --git a/htslib/cram/._rANS_static.h b/htslib/cram/._rANS_static.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._rANS_static.h differ
diff --git a/htslib/cram/._sam_header.c b/htslib/cram/._sam_header.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._sam_header.c differ
diff --git a/htslib/cram/._sam_header.h b/htslib/cram/._sam_header.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._sam_header.h differ
diff --git a/htslib/cram/._string_alloc.c b/htslib/cram/._string_alloc.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._string_alloc.c differ
diff --git a/htslib/cram/._string_alloc.h b/htslib/cram/._string_alloc.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._string_alloc.h differ
diff --git a/htslib/cram/._thread_pool.c b/htslib/cram/._thread_pool.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._thread_pool.c differ
diff --git a/htslib/cram/._thread_pool.h b/htslib/cram/._thread_pool.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._thread_pool.h differ
diff --git a/htslib/cram/._vlen.c b/htslib/cram/._vlen.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._vlen.c differ
diff --git a/htslib/cram/._vlen.h b/htslib/cram/._vlen.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._vlen.h differ
diff --git a/htslib/cram/._zfio.c b/htslib/cram/._zfio.c
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._zfio.c differ
diff --git a/htslib/cram/._zfio.h b/htslib/cram/._zfio.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/cram/._zfio.h differ
diff --git a/htslib/cram/cram.h b/htslib/cram/cram.h
new file mode 100644
index 0000000..c4e8809
--- /dev/null
+++ b/htslib/cram/cram.h
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * CRAM interface.
+ *
+ * Consider using the higher level hts_*() API for programs that wish to
+ * be file format agnostic (see htslib/hts.h).
+ *
+ * This API should be used for CRAM specific code. The specifics of the
+ * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
+ * although these should not be included directly (use this file instead).
+ */
+
+#ifndef _CRAM_H_
+#define _CRAM_H_
+
+#include "cram/cram_samtools.h"
+#include "cram/sam_header.h"
+#include "cram_structs.h"
+#include "cram_io.h"
+#include "cram_encode.h"
+#include "cram_decode.h"
+#include "cram_stats.h"
+#include "cram_codecs.h"
+#include "cram_index.h"
+
+// Validate against the external cram.h.
+//
+// That header duplicates portions of cram_io.h and cram_structs.h,
+// so including it here ensures that the prototypes match.
+#include "htslib/cram.h"
+
+#endif
diff --git a/htslib/cram/cram_codecs.c b/htslib/cram/cram_codecs.c
new file mode 100644
index 0000000..0e073c6
--- /dev/null
+++ b/htslib/cram/cram_codecs.c
@@ -0,0 +1,1950 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * FIXME: add checking of cram_external_type to return NULL on unsupported
+ * {codec,type} tuples.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <limits.h>
+
+#include "cram/cram.h"
+
+/*
+ * ---------------------------------------------------------------------------
+ * Block bit-level I/O functions.
+ * All defined static here to promote easy inlining by the compiler.
+ */
+
+#if 0
+/*
+ * Get a single bit, MSB first.
+ *
+ * Compiled out (#if 0); kept for reference only.
+ *
+ * Reads the bit at position block->bit of the byte at block->byte and then
+ * steps one bit towards the least significant end, moving on to bit 7 of
+ * the next byte once bit 0 has been consumed.
+ *
+ * Returns the bit (0 or 1), or -1 when block->byte is beyond block->alloc.
+ */
+static signed int get_bit_MSB(cram_block *block) {
+ unsigned int val;
+
+ if (block->byte > block->alloc)
+ return -1;
+
+ val = block->data[block->byte] >> block->bit;
+ /* Bit 0 consumed: wrap to the top bit of the following byte */
+ if (--block->bit == -1) {
+ block->bit = 7;
+ block->byte++;
+ //printf("(%02X)", block->data[block->byte]);
+ }
+
+ //printf("-B%d-", val&1);
+
+ return val & 1;
+}
+#endif
+
+/*
+ * Counts a run of successive 1 bits, reading MSB first.
+ *
+ * The run and the 0 bit that terminates it are both consumed.
+ *
+ * Returns the number of 1 bits seen before the terminating 0,
+ *         or -1 if the block is (or becomes) exhausted before a 0 is found.
+ */
+static int get_one_bits_MSB(cram_block *block) {
+    int ones = 0;
+
+    if (block->byte >= block->uncomp_size)
+        return -1;
+
+    for (;;) {
+        int bit = (block->data[block->byte] >> block->bit) & 1;
+
+        block->bit--;
+        if (block->bit == -1) {
+            /* Finished this byte; move to the top bit of the next one */
+            block->bit = 7;
+            block->byte++;
+            if (bit && block->byte == block->uncomp_size)
+                return -1;
+        }
+
+        if (!bit)
+            return ones;
+        ones++;
+    }
+}
+
+/*
+ * Counts a run of successive 0 bits, reading MSB first.
+ *
+ * The run and the 1 bit that terminates it are both consumed.
+ *
+ * Returns the number of 0 bits seen before the terminating 1,
+ *         or -1 if the block is (or becomes) exhausted before a 1 is found.
+ */
+static int get_zero_bits_MSB(cram_block *block) {
+    int zeros = 0;
+
+    if (block->byte >= block->uncomp_size)
+        return -1;
+
+    for (;;) {
+        int bit = (block->data[block->byte] >> block->bit) & 1;
+
+        block->bit--;
+        if (block->bit == -1) {
+            /* Finished this byte; move to the top bit of the next one */
+            block->bit = 7;
+            block->byte++;
+            if (!bit && block->byte == block->uncomp_size)
+                return -1;
+        }
+
+        if (bit)
+            return zeros;
+        zeros++;
+    }
+}
+
+#if 0
+/*
+ * Stores a single bit.
+ *
+ * Compiled out (#if 0); kept for reference only.
+ *
+ * Grows the buffer (doubling, or 1024 bytes initially) when block->byte has
+ * reached block->alloc, then ORs the bit into the current byte at position
+ * block->bit and steps to the next bit position.
+ *
+ * NOTE(review): the realloc() result is not checked here.
+ */
+static void store_bit_MSB(cram_block *block, unsigned int bit) {
+ if (block->byte >= block->alloc) {
+ block->alloc = block->alloc ? block->alloc*2 : 1024;
+ block->data = realloc(block->data, block->alloc);
+ }
+
+ if (bit)
+ block->data[block->byte] |= (1 << block->bit);
+
+ /* Byte complete: advance and clear the next byte ready for OR-ing */
+ if (--block->bit == -1) {
+ block->bit = 7;
+ block->byte++;
+ block->data[block->byte] = 0;
+ }
+}
+#endif
+
+#if 0
+/*
+ * Appends 'len' whole bytes to the block.
+ * Rounds to the next whole byte boundary first.
+ *
+ * Compiled out (#if 0); kept for reference only.
+ *
+ * NOTE(review): the realloc() result is not checked here.
+ */
+static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
+ /* A partially filled byte is abandoned so the copy starts byte aligned */
+ if (block->bit != 7) {
+ block->bit = 7;
+ block->byte++;
+ }
+
+ /* Keep doubling (1024 initially) until the bytes fit */
+ while (block->byte + len >= block->alloc) {
+ block->alloc = block->alloc ? block->alloc*2 : 1024;
+ block->data = realloc(block->data, block->alloc);
+ }
+
+ memcpy(&block->data[block->byte], bytes, len);
+ block->byte += len;
+}
+#endif
+
+/*
+ * Local optimised copy for inlining.
+ *
+ * Reads 'nbits' bits from the block, most significant bit first, starting
+ * at the current block->byte / block->bit position, and returns them as an
+ * unsigned value. The block position is advanced past the bits read.
+ *
+ * No bounds checking is performed in this function; the decoders in this
+ * file call cram_not_enough_bits() before using it.
+ *
+ * Three alternative implementations are present; only the "#if 1" one is
+ * compiled.
+ */
+static inline unsigned int get_bits_MSB(cram_block *block, int nbits) {
+ unsigned int val = 0;
+ int i;
+
+#if 0
+ // Fits within the current byte
+ if (nbits <= block->bit+1) {
+ val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
+ if ((block->bit -= nbits) == -1) {
+ block->bit = 7;
+ block->byte++;
+ }
+ return val;
+ }
+
+ // partial first byte
+ val = block->data[block->byte] & ((1<<(block->bit+1))-1);
+ nbits -= block->bit+1;
+ block->bit = 7;
+ block->byte++;
+
+ // whole middle bytes
+ while (nbits >= 8) {
+ val = (val << 8) | block->data[block->byte++];
+ nbits -= 8;
+ }
+
+ val <<= nbits;
+ val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
+ block->bit -= nbits;
+ return val;
+#endif
+
+#if 0
+ /* Inefficient implementation! */
+ //printf("{");
+ for (i = 0; i < nbits; i++)
+ //val = (val << 1) | get_bit_MSB(block);
+ GET_BIT_MSB(block, val);
+#endif
+
+#if 1
+ /* Combination of 1st two methods */
+ /* Fast path: all requested bits lie within the current byte, so they
+  * can be extracted with a single shift and mask. */
+ if (nbits <= block->bit+1) {
+ val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
+ if ((block->bit -= nbits) == -1) {
+ block->bit = 7;
+ block->byte++;
+ }
+ return val;
+ }
+
+ /* Slow path: the bits span a byte boundary, so fetch them one at a time
+  * with GET_BIT_MSB (defined elsewhere; presumably it shifts val left and
+  * ORs in the next bit - confirm against its definition).
+  * The case labels fall through on purpose so that entering at "case N"
+  * performs exactly N single-bit reads. */
+ switch(nbits) {
+// case 15: GET_BIT_MSB(block, val);
+// case 14: GET_BIT_MSB(block, val);
+// case 13: GET_BIT_MSB(block, val);
+// case 12: GET_BIT_MSB(block, val);
+// case 11: GET_BIT_MSB(block, val);
+// case 10: GET_BIT_MSB(block, val);
+// case 9: GET_BIT_MSB(block, val);
+ case 8: GET_BIT_MSB(block, val);
+ case 7: GET_BIT_MSB(block, val);
+ case 6: GET_BIT_MSB(block, val);
+ case 5: GET_BIT_MSB(block, val);
+ case 4: GET_BIT_MSB(block, val);
+ case 3: GET_BIT_MSB(block, val);
+ case 2: GET_BIT_MSB(block, val);
+ case 1: GET_BIT_MSB(block, val);
+ break;
+
+ default:
+ /* Any other width: generic bit-at-a-time loop */
+ for (i = 0; i < nbits; i++)
+ //val = (val << 1) | get_bit_MSB(block);
+ GET_BIT_MSB(block, val);
+ }
+#endif
+
+ //printf("=0x%x}", val);
+
+ return val;
+}
+
+/*
+ * Appends the low 'nbits' bits of 'val' to the block, most significant bit
+ * first, growing the block's buffer when fewer than 4 spare bytes remain.
+ *
+ * Can store up to 24-bits worth of data encoded in an integer value.
+ * Possibly we'd want to have a less optimal store_bits function when dealing
+ * with nbits > 24, but for now we assume the codes generated are never
+ * that big. (Given this is only possible with 121392 or more
+ * characters with exactly the correct frequency distribution we check
+ * for it elsewhere.)
+ *
+ * Returns 0 on success,
+ *        -1 on memory allocation failure; in that case the block's existing
+ *           buffer and allocation size are left untouched.
+ */
+static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) {
+    /*
+     * Use slow mode until we tweak the huffman generator to never generate
+     * codes longer than 24-bits.
+     */
+    unsigned int mask;
+
+    if (block->byte+4 >= block->alloc) {
+        int first_use = (block->byte == 0);
+        size_t new_alloc = first_use ? 1024 : (size_t)block->alloc * 2;
+        /* Go through a temporary so a failed realloc() neither leaks the
+         * old buffer nor leaves block->alloc describing memory we lack. */
+        void *new_data = realloc(block->data, new_alloc + 4);
+
+        if (!new_data)
+            return -1;
+
+        block->data = new_data;
+        block->alloc = new_alloc;
+        if (first_use)
+            block->data[0] = 0; // initialise first byte of buffer
+    }
+
+    /* fits in current bit-field */
+    if (nbits <= block->bit+1) {
+        block->data[block->byte] |= (val << (block->bit+1-nbits));
+        if ((block->bit-=nbits) == -1) {
+            block->bit = 7;
+            block->byte++;
+            block->data[block->byte] = 0;
+        }
+        return 0;
+    }
+
+    /* Fill the remainder of the current byte with the top bits of val;
+     * nbits is reduced to the count still to be written. */
+    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
+    block->bit = 7;
+    block->byte++;
+    block->data[block->byte] = 0;
+
+    /* Emit the remaining bits one at a time, clearing each new byte as it
+     * is entered so later stores can OR into it. */
+    mask = 1<<(nbits-1);
+    do {
+        if (val & mask)
+            block->data[block->byte] |= (1 << block->bit);
+        if (--block->bit == -1) {
+            block->bit = 7;
+            block->byte++;
+            block->data[block->byte] = 0;
+        }
+        mask >>= 1;
+    } while(--nbits);
+
+    return 0;
+}
+
+/*
+ * Returns the next 'size' bytes from a block, or NULL if insufficient
+ * data is left. This is just a pointer into the block data and not an
+ * allocated object, so do not free the result.
+ *
+ * The block's read index is advanced by 'size' in either case.
+ */
+static char *cram_extract_block(cram_block *b, int size) {
+    char *start = (char *)b->data + b->idx;
+
+    b->idx += size;
+    return (b->idx > b->uncomp_size) ? NULL : start;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * EXTERNAL
+ */
+/*
+ * Decodes one ITF8 integer from the external block named by the codec's
+ * content_id into 'out' (treated as an int32_t).
+ *
+ * Returns 0 on success, -1 on failure. When the external block is missing
+ * the result is 0 if *out_size is zero and -1 otherwise.
+ */
+int cram_external_decode_int(cram_slice *slice, cram_codec *c,
+                             cram_block *in, char *out, int *out_size) {
+    /* Find the external block */
+    cram_block *ext = cram_get_block_by_id(slice, c->external.content_id);
+    char *start, *end;
+    int used;
+
+    if (ext == NULL)
+        return *out_size ? -1 : 0;
+
+    start = (char *)ext->data + ext->idx;
+    end   = (char *)ext->data + ext->uncomp_size;
+
+    // E_INT and E_LONG are guaranteed single item queries
+    used = safe_itf8_get(start, end, (int32_t *)out);
+    ext->idx += used;
+    *out_size = 1;
+
+    if (used > 0)
+        return 0;
+    return -1;
+}
+
+/*
+ * Copies the next *out_size bytes of the codec's external block into 'out'.
+ * When 'out' is NULL the bytes are consumed but not copied.
+ *
+ * Returns 0 on success, -1 on failure. When the external block is missing
+ * the result is 0 if *out_size is zero and -1 otherwise.
+ */
+int cram_external_decode_char(cram_slice *slice, cram_codec *c,
+                              cram_block *in, char *out,
+                              int *out_size) {
+    /* Find the external block */
+    cram_block *ext = cram_get_block_by_id(slice, c->external.content_id);
+    char *src;
+
+    if (ext == NULL)
+        return *out_size ? -1 : 0;
+
+    src = cram_extract_block(ext, *out_size);
+    if (src == NULL)
+        return -1;
+
+    if (out != NULL)
+        memcpy(out, src, *out_size);
+
+    return 0;
+}
+
+/*
+ * Appends the next *out_size bytes of the codec's external block to the
+ * cram_block passed (as a char pointer) in 'out_'.
+ *
+ * Returns 0 on success, -1 on failure. When the external block is missing
+ * the result is 0 if *out_size is zero and -1 otherwise.
+ */
+static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
+                                      cram_block *in, char *out_,
+                                      int *out_size) {
+    cram_block *out = (cram_block *)out_;
+    /* Find the external block */
+    cram_block *ext = cram_get_block_by_id(slice, c->external.content_id);
+    char *src;
+
+    if (ext == NULL)
+        return *out_size ? -1 : 0;
+
+    src = cram_extract_block(ext, *out_size);
+    if (src == NULL)
+        return -1;
+
+    BLOCK_APPEND(out, src, *out_size);
+    return 0;
+}
+
+/* Releases an EXTERNAL decoder; passing NULL is harmless. */
+void cram_external_decode_free(cram_codec *c) {
+    free(c);
+}
+
+/*
+ * Creates an EXTERNAL decoder from its serialised parameters.
+ *
+ * data/size  The codec parameter bytes: a single ITF8 content id.
+ * option     The data type being decoded; selects the decode function.
+ *
+ * Returns the new codec on success,
+ *         NULL on memory failure or when the parameters are not exactly
+ *         one ITF8 value occupying 'size' bytes.
+ */
+cram_codec *cram_external_decode_init(char *data, int size,
+                                      enum cram_external_type option,
+                                      int version) {
+    cram_codec *c;
+    int used;
+
+    if (!(c = malloc(sizeof(*c))))
+        return NULL;
+
+    c->codec = E_EXTERNAL;
+    if (option == E_INT || option == E_LONG)
+        c->decode = cram_external_decode_int;
+    else if (option == E_BYTE_ARRAY || option == E_BYTE)
+        c->decode = cram_external_decode_char;
+    else
+        c->decode = cram_external_decode_block;
+    c->free = cram_external_decode_free;
+
+    /* Bounded read: never look beyond data + size, even for a truncated
+     * multi-byte ITF8 value. */
+    used = size > 0
+        ? safe_itf8_get(data, data + size, &c->external.content_id)
+        : 0;
+
+    if (used < 1 || used != size) {
+        fprintf(stderr, "Malformed external header stream\n");
+        free(c);
+        return NULL;
+    }
+
+    c->external.type = option;
+
+    return c;
+}
+
+/*
+ * Appends the integer pointed to by 'in' to the codec's output block in
+ * ITF8 form. Always returns 0.
+ */
+int cram_external_encode_int(cram_slice *slice, cram_codec *c,
+                             char *in, int in_size) {
+    uint32_t *value = (uint32_t *)in;
+
+    itf8_put_blk(c->out, *value);
+
+    return 0;
+}
+
+/*
+ * Appends 'in_size' raw bytes from 'in' to the codec's output block.
+ * Always returns 0.
+ */
+int cram_external_encode_char(cram_slice *slice, cram_codec *c,
+                              char *in, int in_size)
+{
+    BLOCK_APPEND(c->out, in, in_size);
+
+    return 0;
+}
+
+/* Releases an EXTERNAL encoder; passing NULL is harmless. */
+void cram_external_encode_free(cram_codec *c) {
+    free(c);
+}
+
+/*
+ * Serialises an EXTERNAL codec description to block 'b': the optional
+ * 'prefix' string, then the codec id, the parameter length and the
+ * parameter itself (the content id), all as ITF8.
+ *
+ * Returns the number of bytes accounted for.
+ */
+int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
+                               int version) {
+    char params[99];
+    int params_len;
+    int total = 0;
+
+    if (prefix != NULL) {
+        size_t prefix_len = strlen(prefix);
+        BLOCK_APPEND(b, prefix, prefix_len);
+        total += prefix_len;
+    }
+
+    /* Build the parameter bytes first so their length is known */
+    params_len = itf8_put(params, c->e_external.content_id);
+
+    total += itf8_put_blk(b, c->codec);
+    total += itf8_put_blk(b, params_len);
+    BLOCK_APPEND(b, params, params_len);
+    total += params_len;
+
+    return total;
+}
+
+/*
+ * Creates an EXTERNAL encoder.
+ *
+ * option  The data type being encoded; selects the encode function.
+ *         Types other than E_INT, E_LONG, E_BYTE and E_BYTE_ARRAY abort().
+ * dat     The content id, passed as an integer stored in the pointer value.
+ *
+ * Returns the new codec, or NULL on memory failure.
+ */
+cram_codec *cram_external_encode_init(cram_stats *st,
+                                      enum cram_external_type option,
+                                      void *dat,
+                                      int version) {
+    cram_codec *c = malloc(sizeof(*c));
+
+    if (c == NULL)
+        return NULL;
+
+    c->codec = E_EXTERNAL;
+    c->free = cram_external_encode_free;
+
+    switch (option) {
+    case E_INT:
+    case E_LONG:
+        c->encode = cram_external_encode_int;
+        break;
+
+    case E_BYTE_ARRAY:
+    case E_BYTE:
+        c->encode = cram_external_encode_char;
+        break;
+
+    default:
+        abort();
+    }
+
+    c->store = cram_external_encode_store;
+    c->e_external.content_id = (size_t)dat;
+
+    return c;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * BETA
+ */
+/*
+ * Decodes *out_size BETA (fixed width binary) values from 'in' into 'out',
+ * which is treated as an array of int32_t. Each value is read as
+ * c->beta.nbits bits and has c->beta.offset subtracted from it.
+ *
+ * Returns 0 on success, -1 if the input runs out of bits.
+ */
+int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
+    int32_t *out_i = (int32_t *)out;
+    int i, n = *out_size;
+
+    if (c->beta.nbits) {
+        if (cram_not_enough_bits(in, c->beta.nbits))
+            return -1;
+
+        for (i = 0; i < n; i++) {
+            /* The check above only covers the first value; re-check before
+             * each later one so a short block cannot be read beyond. */
+            if (i && cram_not_enough_bits(in, c->beta.nbits))
+                return -1;
+            out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
+        }
+    } else {
+        /* Zero width: every value is the constant -offset */
+        for (i = 0; i < n; i++)
+            out_i[i] = -c->beta.offset;
+    }
+
+    return 0;
+}
+
+/*
+ * Decodes *out_size BETA (fixed width binary) values from 'in' into the
+ * byte array 'out'. Each value is read as c->beta.nbits bits and has
+ * c->beta.offset subtracted from it. When 'out' is NULL the bits are
+ * consumed but the values are discarded.
+ *
+ * Returns 0 on success, -1 if the input runs out of bits.
+ */
+int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
+    int i, n = *out_size;
+
+    if (c->beta.nbits) {
+        if (cram_not_enough_bits(in, c->beta.nbits))
+            return -1;
+
+        for (i = 0; i < n; i++) {
+            unsigned int v;
+
+            /* The check above only covers the first value; re-check before
+             * each later one so a short block cannot be read beyond. */
+            if (i && cram_not_enough_bits(in, c->beta.nbits))
+                return -1;
+
+            v = get_bits_MSB(in, c->beta.nbits);
+            if (out)
+                out[i] = v - c->beta.offset;
+        }
+    } else {
+        /* Zero width: every value is the constant -offset */
+        if (out)
+            for (i = 0; i < n; i++)
+                out[i] = -c->beta.offset;
+    }
+
+    return 0;
+}
+
+/* Releases a BETA decoder; passing NULL is harmless. */
+void cram_beta_decode_free(cram_codec *c) {
+    free(c);
+}
+
+/*
+ * Creates a BETA decoder from its serialised parameters.
+ *
+ * data/size  The codec parameter bytes: ITF8 offset followed by ITF8 nbits.
+ * option     The data type being decoded; selects the decode function.
+ *            Types other than E_INT, E_LONG, E_BYTE and E_BYTE_ARRAY abort().
+ *
+ * Returns the new codec on success,
+ *         NULL on memory failure or when the parameters are truncated, do
+ *         not occupy exactly 'size' bytes, or nbits is outside 0..bits-in-int.
+ */
+cram_codec *cram_beta_decode_init(char *data, int size,
+                                  enum cram_external_type option,
+                                  int version) {
+    cram_codec *c;
+    char *cp = data;
+    char *endp = data + size;
+    int used;
+
+    if (!(c = malloc(sizeof(*c))))
+        return NULL;
+
+    c->codec = E_BETA;
+    if (option == E_INT || option == E_LONG)
+        c->decode = cram_beta_decode_int;
+    else if (option == E_BYTE_ARRAY || option == E_BYTE)
+        c->decode = cram_beta_decode_char;
+    else
+        abort();
+    c->free = cram_beta_decode_free;
+
+    /* Start with an invalid width so a header too short to hold nbits is
+     * rejected by the range check below. */
+    c->beta.nbits = -1;
+
+    /* Bounded reads: never look beyond data + size */
+    used = size > 0 ? safe_itf8_get(cp, endp, &c->beta.offset) : 0;
+    if (used > 0) {
+        cp += used;
+        if (cp < endp) {
+            used = safe_itf8_get(cp, endp, &c->beta.nbits);
+            if (used > 0)
+                cp += used;
+        }
+    }
+
+    if (cp - data != size
+        || c->beta.nbits < 0 || (size_t)c->beta.nbits > 8 * sizeof(int)) {
+        fprintf(stderr, "Malformed beta header stream\n");
+        free(c);
+        return NULL;
+    }
+
+    return c;
+}
+
+/*
+ * Serialises a BETA codec description to block 'b': the optional 'prefix'
+ * string, then the codec id, the parameter length, the offset and the bit
+ * width, all as ITF8.
+ *
+ * Returns the number of bytes accounted for.
+ */
+int cram_beta_encode_store(cram_codec *c, cram_block *b,
+                           char *prefix, int version) {
+    int total = 0;
+    int params_len;
+
+    if (prefix != NULL) {
+        size_t prefix_len = strlen(prefix);
+        BLOCK_APPEND(b, prefix, prefix_len);
+        total += prefix_len;
+    }
+
+    total += itf8_put_blk(b, c->codec);
+
+    // codec length
+    params_len = itf8_size(c->e_beta.offset) + itf8_size(c->e_beta.nbits);
+    total += itf8_put_blk(b, params_len);
+
+    total += itf8_put_blk(b, c->e_beta.offset);
+    total += itf8_put_blk(b, c->e_beta.nbits);
+
+    return total;
+}
+
+/*
+ * BETA-encodes 'in_size' ints from 'in': each has the codec offset added
+ * and is written as e_beta.nbits bits to the codec's output block.
+ *
+ * Returns 0 on success, non-zero if any store failed.
+ */
+int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
+                         char *in, int in_size) {
+    int *values = (int *)in;
+    int status = 0;
+    int idx = 0;
+
+    while (idx < in_size) {
+        status |= store_bits_MSB(c->out,
+                                 values[idx] + c->e_beta.offset,
+                                 c->e_beta.nbits);
+        idx++;
+    }
+
+    return status;
+}
+
+/*
+ * BETA-encodes 'in_size' bytes from 'in' (read as unsigned): each has the
+ * codec offset added and is written as e_beta.nbits bits to the codec's
+ * output block.
+ *
+ * Returns 0 on success, non-zero if any store failed.
+ */
+int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
+                          char *in, int in_size) {
+    unsigned char *values = (unsigned char *)in;
+    int status = 0;
+    int idx = 0;
+
+    while (idx < in_size) {
+        status |= store_bits_MSB(c->out,
+                                 values[idx] + c->e_beta.offset,
+                                 c->e_beta.nbits);
+        idx++;
+    }
+
+    return status;
+}
+
+/* Releases a BETA encoder; passing NULL is harmless. */
+void cram_beta_encode_free(cram_codec *c) {
+    free(c);
+}
+
+/*
+ * Creates a BETA (fixed width binary) encoder.
+ *
+ * The value range comes from 'dat' when it is non-NULL (two ints: minimum
+ * then maximum); otherwise it is found by scanning the statistics in 'st'
+ * (the freqs[] array and, if present, the overflow hash st->h).
+ * The stored offset is -minimum and the width is the number of bits needed
+ * to represent maximum - minimum.
+ *
+ * Returns the new codec, or NULL on memory failure.
+ */
+cram_codec *cram_beta_encode_init(cram_stats *st,
+                                  enum cram_external_type option,
+                                  void *dat,
+                                  int version) {
+    int min_val, max_val, len;
+    cram_codec *c = malloc(sizeof(*c));
+
+    if (c == NULL)
+        return NULL;
+
+    c->codec = E_BETA;
+    c->free = cram_beta_encode_free;
+    c->encode = (option == E_INT)
+        ? cram_beta_encode_int
+        : cram_beta_encode_char;
+    c->store = cram_beta_encode_store;
+
+    if (dat != NULL) {
+        const int *range = (const int *)dat;
+
+        min_val = range[0];
+        max_val = range[1];
+    } else {
+        int i;
+
+        min_val = INT_MAX;
+        max_val = INT_MIN;
+
+        /* Small values are counted directly in the freqs[] array; as i is
+         * ascending, the last non-zero entry seen is the maximum. */
+        for (i = 0; i < MAX_STAT_VAL; i++) {
+            if (st->freqs[i]) {
+                if (min_val > i)
+                    min_val = i;
+                max_val = i;
+            }
+        }
+
+        /* Remaining values, if any, are keys of the hash table */
+        if (st->h) {
+            khint_t k;
+
+            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
+                if (kh_exist(st->h, k)) {
+                    i = kh_key(st->h, k);
+                    if (min_val > i)
+                        min_val = i;
+                    if (max_val < i)
+                        max_val = i;
+                }
+            }
+        }
+    }
+
+    assert(max_val >= min_val);
+    c->e_beta.offset = -min_val;
+    max_val -= min_val;
+
+    /* Number of bits required to hold the largest offset value */
+    for (len = 0; max_val; max_val >>= 1)
+        len++;
+    c->e_beta.nbits = len;
+
+    return c;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * SUBEXP
+ */
+/*
+ * Decodes *out_size SUBEXP coded values from 'in' into 'out', which is
+ * treated as an array of int32_t. c->subexp.offset is subtracted from
+ * every decoded value.
+ *
+ * Each value is a unary count i of 1 bits followed by a binary tail:
+ *   i > 0: value is 2^(k+i-1) plus the next k+i-1 bits
+ *   i = 0: value is the next k bits
+ *
+ * Returns 0 on success, -1 if the input runs out of bits.
+ */
+int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
+    int32_t *out_i = (int32_t *)out;
+    int k = c->subexp.k;
+    int n = *out_size;
+    int count;
+
+    for (count = 0; count < n; count++) {
+        int val = 0;
+        int tail_bits, tail;
+        /* Get number of 1s */
+        int i = get_one_bits_MSB(in);
+
+        if (i < 0)
+            return -1;
+
+        tail_bits = i > 0 ? i + k - 1 : k;
+        if (cram_not_enough_bits(in, tail_bits))
+            return -1;
+
+        for (tail = tail_bits; tail; tail--)
+            GET_BIT_MSB(in, val);
+
+        if (i)
+            val += 1 << tail_bits;
+
+        out_i[count] = val - c->subexp.offset;
+    }
+
+    return 0;
+}
+
+/* Releases a SUBEXP decoder; passing NULL is harmless. */
+void cram_subexp_decode_free(cram_codec *c) {
+    free(c);
+}
+
+/*
+ * Creates a SUBEXP decoder from its serialised parameters.
+ *
+ * data/size  The codec parameter bytes: ITF8 offset followed by ITF8 k.
+ *
+ * Returns the new codec on success,
+ *         NULL on memory failure or when the parameters are truncated, do
+ *         not occupy exactly 'size' bytes, or k is negative.
+ */
+cram_codec *cram_subexp_decode_init(char *data, int size,
+                                    enum cram_external_type option,
+                                    int version) {
+    cram_codec *c;
+    char *cp = data;
+    char *endp = data + size;
+    int used;
+
+    if (!(c = malloc(sizeof(*c))))
+        return NULL;
+
+    c->codec = E_SUBEXP;
+    c->decode = cram_subexp_decode;
+    c->free = cram_subexp_decode_free;
+    /* Start invalid so a header too short to hold k is rejected below */
+    c->subexp.k = -1;
+
+    used = size > 0 ? safe_itf8_get(cp, endp, &c->subexp.offset) : 0;
+    if (used > 0) {
+        cp += used;
+        /* Only read k when bytes remain; otherwise it stays at -1 rather
+         * than depending on what a failed read stores. */
+        if (cp < endp) {
+            used = safe_itf8_get(cp, endp, &c->subexp.k);
+            if (used > 0)
+                cp += used;
+        }
+    }
+
+    if (cp - data != size || c->subexp.k < 0) {
+        fprintf(stderr, "Malformed subexp header stream\n");
+        free(c);
+        return NULL;
+    }
+
+    return c;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * GAMMA
+ */
+/*
+ * Decodes *out_size GAMMA coded values from 'in' into 'out', which is
+ * treated as an array of int32_t. c->gamma.offset is subtracted from
+ * every decoded value.
+ *
+ * Each value is a run of nz zero bits, a 1 bit, then nz further bits; the
+ * 1 bit is the implied top bit of the value.
+ *
+ * Returns 0 on success, -1 if the input runs out of bits.
+ */
+int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
+    int32_t *out_i = (int32_t *)out;
+    int i, n;
+
+    for (i = 0, n = *out_size; i < n; i++) {
+        int val = 1;
+        int nz = get_zero_bits_MSB(in);
+
+        /* get_zero_bits_MSB() returns -1 when the block is exhausted */
+        if (nz < 0 || cram_not_enough_bits(in, nz))
+            return -1;
+
+        while (nz > 0) {
+            GET_BIT_MSB(in, val);
+            nz--;
+        }
+
+        out_i[i] = val - c->gamma.offset;
+    }
+
+    return 0;
+}
+
+/* Releases a GAMMA decoder; passing NULL is harmless. */
+void cram_gamma_decode_free(cram_codec *c) {
+    free(c);
+}
+
+/*
+ * Creates a GAMMA decoder from its serialised parameters.
+ *
+ * data/size  The codec parameter bytes: a single ITF8 offset.
+ *
+ * Returns the new codec on success,
+ *         NULL on memory failure or when the parameters are not exactly
+ *         one ITF8 value occupying 'size' bytes.
+ */
+cram_codec *cram_gamma_decode_init(char *data, int size,
+                                   enum cram_external_type option,
+                                   int version) {
+    cram_codec *c;
+    int used;
+
+    if (!(c = malloc(sizeof(*c))))
+        return NULL;
+
+    c->codec = E_GAMMA;
+    c->decode = cram_gamma_decode;
+    c->free = cram_gamma_decode_free;
+
+    /* Bounded read: never look beyond data + size, even for a truncated
+     * multi-byte ITF8 value. */
+    used = size > 0
+        ? safe_itf8_get(data, data + size, &c->gamma.offset)
+        : 0;
+
+    if (used < 1 || used != size) {
+        fprintf(stderr, "Malformed gamma header stream\n");
+        free(c);
+        return NULL;
+    }
+
+    return c;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * HUFFMAN
+ */
+
+/*
+ * qsort() comparator for cram_huffman_code: orders by code length and then
+ * by symbol value.
+ *
+ * Uses explicit comparisons rather than subtraction so that widely
+ * separated values (e.g. a large negative against a large positive symbol)
+ * cannot overflow and yield the wrong sign.
+ */
+static int code_sort(const void *vp1, const void *vp2) {
+    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
+    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
+
+    if (c1->len != c2->len)
+        return (c1->len > c2->len) - (c1->len < c2->len);
+
+    return (c1->symbol > c2->symbol) - (c1->symbol < c2->symbol);
+}
+
+/* Releases a HUFFMAN decoder and its code table; NULL is harmless. */
+void cram_huffman_decode_free(cram_codec *c) {
+    if (c != NULL) {
+        free(c->huffman.codes);
+        free(c);
+    }
+}
+
+/*
+ * Decoder used for a huffman codec that has no codes.
+ * Any attempt to decode with it fails: always returns -1.
+ */
+int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
+                             cram_block *in, char *out, int *out_size)
+{
+    return -1;
+}
+
+/*
+ * Byte decoder for the special case of 0 length codes: no bits are read
+ * and every one of the *out_size output bytes is the first code's symbol.
+ * Does nothing when 'out' is NULL. Always returns 0.
+ */
+int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
+                              cram_block *in, char *out, int *out_size) {
+    int n;
+
+    if (out == NULL)
+        return 0;
+
+    n = *out_size;
+    if (n > 0)
+        memset(out, c->huffman.codes[0].symbol, n);
+
+    return 0;
+}
+
+/*
+ * Decodes *out_size huffman coded symbols from 'in' into the byte array
+ * 'out'. When 'out' is NULL the bits are consumed but the symbols are
+ * discarded, matching the other byte decoders in this file.
+ *
+ * Returns 0 on success,
+ *        -1 if the input runs out of bits or holds a bit pattern that does
+ *           not correspond to any code.
+ */
+int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
+                             cram_block *in, char *out, int *out_size) {
+    int i, n, ncodes = c->huffman.ncodes;
+    const cram_huffman_code * const codes = c->huffman.codes;
+
+    for (i = 0, n = *out_size; i < n; i++) {
+        int idx = 0;
+        int val = 0, len = 0, last_len = 0;
+
+        for (;;) {
+            /* Extend the value read so far to the candidate code's length */
+            int dlen = codes[idx].len - last_len;
+            if (cram_not_enough_bits(in, dlen))
+                return -1;
+
+            last_len = (len += dlen);
+            for (; dlen; dlen--) GET_BIT_MSB(in, val);
+
+            /* p maps a code value of this length to its table index */
+            idx = val - codes[idx].p;
+            if (idx >= ncodes || idx < 0)
+                return -1;
+
+            if (codes[idx].code == val && codes[idx].len == len) {
+                if (out)
+                    out[i] = codes[idx].symbol;
+                break;
+            }
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Integer decoder for the special case of 0 length codes: no bits are read
+ * and every one of the *out_size outputs (int32_t) is the first code's
+ * symbol. Always returns 0.
+ */
+int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
+                             cram_block *in, char *out, int *out_size) {
+    const cram_huffman_code * const codes = c->huffman.codes;
+    int32_t *dst = (int32_t *)out;
+    int remaining = *out_size;
+
+    while (remaining-- > 0)
+        *dst++ = codes[0].symbol;
+
+    return 0;
+}
+
+/*
+ * Decodes *out_size huffman coded symbols from 'in' into 'out', which is
+ * treated as an array of int32_t.
+ *
+ * Returns 0 on success,
+ *        -1 if the input runs out of bits or holds a bit pattern that does
+ *           not correspond to any code.
+ */
+int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
+                            cram_block *in, char *out, int *out_size) {
+    const cram_huffman_code * const codes = c->huffman.codes;
+    const int ncodes = c->huffman.ncodes;
+    int32_t *out_i = (int32_t *)out;
+    int n = *out_size;
+    int i;
+
+    for (i = 0; i < n; i++) {
+        int idx = 0, val = 0, len = 0;
+
+        do {
+            /* Extend the value read so far to the candidate code's length */
+            int dlen = codes[idx].len - len;
+
+            if (cram_not_enough_bits(in, dlen))
+                return -1;
+
+            len += dlen;
+            for (; dlen; dlen--) GET_BIT_MSB(in, val);
+
+            /* p maps a code value of this length to its table index */
+            idx = val - codes[idx].p;
+            if (idx < 0 || idx >= ncodes)
+                return -1;
+        } while (codes[idx].code != val || codes[idx].len != len);
+
+        out_i[i] = codes[idx].symbol;
+    }
+
+    return 0;
+}
+
+/*
+ * Initialises a huffman decoder from an encoding data stream.
+ *
+ * data/size  The codec parameter bytes, all ITF8: the number of codes, that
+ *            many symbols, the number of codes again, then that many code
+ *            bit-lengths.
+ * option     The data type being decoded; selects the decode function.
+ *            E_BYTE_ARRAY_BLOCK is unsupported and calls abort().
+ *
+ * Returns the new codec on success,
+ *         NULL on memory failure or a malformed stream. Every failure path
+ *         releases the code table as well as the codec.
+ */
+cram_codec *cram_huffman_decode_init(char *data, int size,
+                                     enum cram_external_type option,
+                                     int version) {
+    int32_t ncodes = 0, i, j;
+    char *cp = data, *data_end = &data[size];
+    cram_codec *h;
+    cram_huffman_code *codes = NULL;
+    int32_t val, last_len, max_len = 0;
+    int l;
+
+    l = safe_itf8_get(cp, data_end, &ncodes);
+    /* Each symbol and each length needs at least one byte, so a count that
+     * is negative or exceeds the header size cannot be valid. Rejecting it
+     * here also keeps the allocation below bounded by the input size. */
+    if (l < 1 || ncodes < 0 || ncodes > size) {
+        fprintf(stderr, "Malformed huffman header stream\n");
+        return NULL;
+    }
+    cp += l;
+
+    h = calloc(1, sizeof(*h));
+    if (!h)
+        return NULL;
+
+    h->codec = E_HUFFMAN;
+    h->free = cram_huffman_decode_free;
+    h->huffman.ncodes = ncodes;
+
+    /* Skip the allocation for an empty table: malloc(0) may legitimately
+     * return NULL, which must not be mistaken for an error. */
+    if (ncodes > 0) {
+        codes = malloc(ncodes * sizeof(*codes));
+        if (!codes) {
+            free(h);
+            return NULL;
+        }
+    }
+    h->huffman.codes = codes;
+
+    /* Read symbols and bit-lengths */
+    for (i = 0; i < ncodes; i++) {
+        l = safe_itf8_get(cp, data_end, &codes[i].symbol);
+        if (l < 1)
+            goto malformed;
+        cp += l;
+    }
+
+    /* The count is repeated ahead of the lengths and must agree */
+    l = safe_itf8_get(cp, data_end, &i);
+    if (l < 1 || i != ncodes)
+        goto malformed;
+    cp += l;
+
+    if (ncodes == 0) {
+        /* NULL huffman stream.  Ensure it returns an error if
+           anything tries to use it. */
+        h->decode = cram_huffman_decode_null;
+        return h;
+    }
+
+    for (i = 0; i < ncodes; i++) {
+        l = safe_itf8_get(cp, data_end, &codes[i].len);
+        /* A negative length would give the decoders a negative bit count */
+        if (l < 1 || codes[i].len < 0)
+            goto malformed;
+        cp += l;
+        if (max_len < codes[i].len)
+            max_len = codes[i].len;
+    }
+    if (cp - data != size || max_len >= ncodes)
+        goto malformed;
+
+    /* Sort by bit length and then by symbol value */
+    qsort(codes, ncodes, sizeof(*codes), code_sort);
+
+    /* Assign canonical codes */
+    val = -1, last_len = 0;
+    for (i = 0; i < ncodes; i++) {
+        val++;
+        while (codes[i].len > last_len) {
+            val <<= 1;
+            last_len++;
+        }
+        codes[i].code = val;
+    }
+
+    /*
+     * Compute the next starting point, offset by the i'th value.
+     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
+     * codes[10..13].p = 30 - 10.
+     */
+    last_len = 0;
+    for (i = j = 0; i < ncodes; i++) {
+        if (codes[i].len > last_len) {
+            j = codes[i].code - i;
+            last_len = codes[i].len;
+        }
+        codes[i].p = j;
+    }
+
+    /* After sorting codes[0] has the shortest length; a zero length there
+     * selects the decoders that read no bits. */
+    if (option == E_BYTE || option == E_BYTE_ARRAY) {
+        if (h->huffman.codes[0].len == 0)
+            h->decode = cram_huffman_decode_char0;
+        else
+            h->decode = cram_huffman_decode_char;
+    } else if (option == E_BYTE_ARRAY_BLOCK) {
+        abort();
+    } else {
+        if (h->huffman.codes[0].len == 0)
+            h->decode = cram_huffman_decode_int0;
+        else
+            h->decode = cram_huffman_decode_int;
+    }
+
+    return h;
+
+ malformed:
+    fprintf(stderr, "Malformed huffman header stream\n");
+    /* Frees the code table as well as the codec itself */
+    cram_huffman_decode_free(h);
+    return NULL;
+}
+
+/*
+ * Byte encoder for the special case of 0 length codes: nothing is written.
+ * Always returns 0.
+ */
+int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
+                              char *in, int in_size)
+{
+    return 0;
+}
+
+/*
+ * Huffman-encodes 'in_size' bytes from 'in' (read as unsigned) to the
+ * codec's output block.
+ *
+ * Symbols in the range -1 .. MAX_HUFF-1 are found through the val2code
+ * lookup table; anything else falls back to a linear search of the codes.
+ *
+ * Returns 0 on success,
+ *        -1 if a symbol has no code,
+ *        non-zero if any store failed.
+ */
+int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
+                             char *in, int in_size) {
+    unsigned char *syms = (unsigned char *)in;
+    int i, r = 0;
+
+    for (; in_size != 0; in_size--) {
+        int sym = *syms++;
+
+        if (sym >= -1 && sym < MAX_HUFF) {
+            i = c->e_huffman.val2code[sym+1];
+            assert(c->e_huffman.codes[i].symbol == sym);
+        } else {
+            /* Slow - use a lookup table for when sym < MAX_HUFF? */
+            i = 0;
+            while (i < c->e_huffman.nvals &&
+                   c->e_huffman.codes[i].symbol != sym)
+                i++;
+            if (i == c->e_huffman.nvals)
+                return -1;
+        }
+
+        r |= store_bits_MSB(c->out,
+                            c->e_huffman.codes[i].code,
+                            c->e_huffman.codes[i].len);
+    }
+
+    return r;
+}
+
+/*
+ * Integer encoder for the special case of 0 length codes: nothing is
+ * written. Always returns 0.
+ */
+int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
+                             char *in, int in_size)
+{
+    return 0;
+}
+
+/*
+ * Huffman-encodes 'in_size' ints from 'in' to the codec's output block.
+ *
+ * Symbols in the range -1 .. MAX_HUFF-1 are found through the val2code
+ * lookup table; anything else falls back to a linear search of the codes.
+ *
+ * Returns 0 on success,
+ *        -1 if a symbol has no code,
+ *        non-zero if any store failed.
+ */
+int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
+                            char *in, int in_size) {
+    int *syms = (int *)in;
+    int i, r = 0;
+
+    for (; in_size != 0; in_size--) {
+        int sym = *syms++;
+
+        if (sym >= -1 && sym < MAX_HUFF) {
+            i = c->e_huffman.val2code[sym+1];
+            assert(c->e_huffman.codes[i].symbol == sym);
+        } else {
+            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
+            i = 0;
+            while (i < c->e_huffman.nvals &&
+                   c->e_huffman.codes[i].symbol != sym)
+                i++;
+            if (i == c->e_huffman.nvals)
+                return -1;
+        }
+
+        r |= store_bits_MSB(c->out,
+                            c->e_huffman.codes[i].code,
+                            c->e_huffman.codes[i].len);
+    }
+
+    return r;
+}
+
+/* Releases a HUFFMAN encoder and its code table; NULL is harmless. */
+void cram_huffman_encode_free(cram_codec *c) {
+    if (c != NULL) {
+        free(c->e_huffman.codes);
+        free(c);
+    }
+}
+
+/*
+ * Encodes a huffman tree.
+ *
+ * Writes to block 'b' the optional 'prefix' string, the codec id, the
+ * parameter length and then the parameters: the number of codes, each
+ * symbol, the number of codes again and each code length, all as ITF8.
+ *
+ * Returns number of bytes written, or -1 on memory failure.
+ */
+int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
+                              int version) {
+    cram_huffman_code *codes = c->e_huffman.codes;
+    int nvals = c->e_huffman.nvals;
+    int total = 0;
+    int params_len;
+    int i;
+    /*
+     * Up to code length 127 means 2.5e+26 bytes of data required (worst
+     * case huffman tree needs symbols with freqs matching the Fibonacci
+     * series). So guaranteed 1 byte per code.
+     *
+     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
+     *
+     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
+     */
+    char *params = malloc(6*c->e_huffman.nvals+16);
+    char *wp = params;
+
+    if (params == NULL)
+        return -1;
+
+    if (prefix != NULL) {
+        size_t prefix_len = strlen(prefix);
+        BLOCK_APPEND(b, prefix, prefix_len);
+        total += prefix_len;
+    }
+
+    /* Symbols, preceded by their count */
+    wp += itf8_put(wp, nvals);
+    for (i = 0; i < nvals; i++)
+        wp += itf8_put(wp, codes[i].symbol);
+
+    /* Code lengths, preceded by their count */
+    wp += itf8_put(wp, nvals);
+    for (i = 0; i < nvals; i++)
+        wp += itf8_put(wp, codes[i].len);
+
+    params_len = wp - params;
+
+    total += itf8_put_blk(b, c->codec);
+    total += itf8_put_blk(b, params_len);
+    BLOCK_APPEND(b, params, params_len);
+    total += params_len;
+
+    free(params);
+
+    return total;
+}
+
+/*
+ * Grows the parallel vals[]/freqs[] tables used while collecting symbol
+ * statistics (first allocation is 1024 entries, doubling thereafter).
+ *
+ * Returns 0 on success;
+ *        -1 on allocation failure, in which case *vals and *freqs are
+ *           still valid (or NULL) and remain owned by the caller.
+ */
+static int huffman_grow_symbols(int **vals, int **freqs, int *alloc) {
+    int new_alloc = *alloc ? *alloc * 2 : 1024;
+    int *tmp;
+
+    /* Go via a temporary so the old buffer is not lost if realloc fails. */
+    if (!(tmp = realloc(*vals, new_alloc * sizeof(int))))
+        return -1;
+    *vals = tmp;
+
+    if (!(tmp = realloc(*freqs, new_alloc * sizeof(int))))
+        return -1;
+    *freqs = tmp;
+
+    *alloc = new_alloc;
+    return 0;
+}
+
+/*
+ * Builds a Huffman encoder codec from the symbol frequencies held in 'st'
+ * (both the st->freqs[] array and, if present, the st->h hash).
+ *
+ * 'option' selects the byte (E_BYTE / E_BYTE_ARRAY) or integer flavour of
+ * the encode function.  'dat' and 'version' are not used here.
+ *
+ * Returns a newly allocated codec on success;
+ *         NULL on memory allocation failure (nothing is leaked).
+ */
+cram_codec *cram_huffman_encode_init(cram_stats *st,
+                                     enum cram_external_type option,
+                                     void *dat,
+                                     int version) {
+    int *vals = NULL, *freqs = NULL, *lens = NULL, *tmp;
+    int vals_alloc = 0, code, len;
+    int nvals, i, k;
+    cram_codec *c;
+    cram_huffman_code *codes;
+
+    c = malloc(sizeof(*c));
+    if (!c)
+        return NULL;
+    c->codec = E_HUFFMAN;
+
+    /* Count number of unique symbols */
+    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
+        if (!st->freqs[i])
+            continue;
+        if (nvals >= vals_alloc &&
+            huffman_grow_symbols(&vals, &freqs, &vals_alloc) < 0)
+            goto nomem;
+        vals[nvals] = i;
+        freqs[nvals] = st->freqs[i];
+        assert(st->freqs[i] > 0);
+        nvals++;
+    }
+    if (st->h) {
+        khint_t hk;
+
+        for (hk = kh_begin(st->h); hk != kh_end(st->h); hk++) {
+            if (!kh_exist(st->h, hk))
+                continue;
+            if (nvals >= vals_alloc &&
+                huffman_grow_symbols(&vals, &freqs, &vals_alloc) < 0)
+                goto nomem;
+            vals[nvals] = kh_key(st->h, hk);
+            freqs[nvals] = kh_val(st->h, hk);
+            assert(freqs[nvals] > 0);
+            nvals++;
+        }
+    }
+
+    assert(nvals > 0);
+
+    /* The tree adds one internal node per merge, so leave room for 2*nvals */
+    if (!(tmp = realloc(freqs, 2*nvals*sizeof(*freqs))))
+        goto nomem;
+    freqs = tmp;
+    if (!(lens = calloc(2*nvals, sizeof(*lens))))
+        goto nomem;
+
+    /* Inefficient, use pointers to form chain so we can insert and maintain
+     * a sorted list? This is currently O(nvals^2) complexity.
+     *
+     * Each pass merges the two lowest-frequency live nodes into a new node
+     * appended at index nvals.  lens[] temporarily holds the parent index
+     * and a merged node is marked as consumed by negating its frequency.
+     */
+    for (;;) {
+        int low1 = INT_MAX, low2 = INT_MAX;
+        int ind1 = 0, ind2 = 0;
+        for (i = 0; i < nvals; i++) {
+            if (freqs[i] < 0)
+                continue;
+            if (low1 > freqs[i])
+                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
+            else if (low2 > freqs[i])
+                low2 = freqs[i], ind2 = i;
+        }
+        if (low2 == INT_MAX)
+            break;
+
+        freqs[nvals] = low1 + low2;
+        lens[ind1] = nvals;
+        lens[ind2] = nvals;
+        freqs[ind1] *= -1;
+        freqs[ind2] *= -1;
+        nvals++;
+    }
+    /* nvals now counts leaves plus internal nodes; recover the leaf count */
+    nvals = nvals/2+1;
+
+    /* Assign lengths: depth of each leaf = number of parent links to root */
+    for (i = 0; i < nvals; i++) {
+        int code_len = 0;
+        for (k = lens[i]; k; k = lens[k])
+            code_len++;
+        lens[i] = code_len;
+        freqs[i] *= -1;
+        //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
+    }
+
+
+    /* Sort, need in a struct */
+    if (!(codes = malloc(nvals * sizeof(*codes))))
+        goto nomem;
+    for (i = 0; i < nvals; i++) {
+        codes[i].symbol = vals[i];
+        codes[i].len = lens[i];
+    }
+    qsort(codes, nvals, sizeof(*codes), code_sort);
+
+    /*
+     * Generate canonical codes from lengths.
+     * Sort by length.
+     * Start with 0.
+     * Every new code of same length is +1.
+     * Every new code of new length is +1 then <<1 per extra length.
+     *
+     * /\
+     * a/\
+     * /\/\
+     * bcd/\
+     * ef
+     *
+     * a 1 0
+     * b 3 4 (0+1)<<2
+     * c 3 5
+     * d 3 6
+     * e 4 14 (6+1)<<1
+     * f 5 15
+     */
+    code = 0; len = codes[0].len;
+    for (i = 0; i < nvals; i++) {
+        while (len != codes[i].len) {
+            code<<=1;
+            len++;
+        }
+        codes[i].code = code++;
+
+        /* Small symbols get a direct symbol -> codes[] index lookup */
+        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
+            c->e_huffman.val2code[codes[i].symbol+1] = i;
+
+        //fprintf(stderr, "sym %d, code %d, len %d\n",
+        //      codes[i].symbol, codes[i].code, codes[i].len);
+    }
+
+    free(lens);
+    free(vals);
+    free(freqs);
+
+    c->e_huffman.codes = codes;
+    c->e_huffman.nvals = nvals;
+
+    c->free = cram_huffman_encode_free;
+    /* A zero-length first code selects the "0" encoder variants */
+    if (option == E_BYTE || option == E_BYTE_ARRAY) {
+        if (c->e_huffman.codes[0].len == 0)
+            c->encode = cram_huffman_encode_char0;
+        else
+            c->encode = cram_huffman_encode_char;
+    } else {
+        if (c->e_huffman.codes[0].len == 0)
+            c->encode = cram_huffman_encode_int0;
+        else
+            c->encode = cram_huffman_encode_int;
+    }
+    c->store = cram_huffman_encode_store;
+
+    return c;
+
+ nomem:
+    free(lens);
+    free(vals);
+    free(freqs);
+    free(c);
+    return NULL;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * BYTE_ARRAY_LEN
+ */
+/*
+ * Decodes one length-prefixed byte array: the element count is read with
+ * the length sub-codec, then that many items are read with the value
+ * sub-codec into 'out'.
+ *
+ * Returns the value sub-codec's result, with *out_size set to the count;
+ *         -1 if the length decode failed, the count is negative, or there
+ *            is no value sub-codec.
+ */
+int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
+                               cram_block *in, char *out,
+                               int *out_size) {
+    cram_codec *len_c = c->byte_array_len.len_codec;
+    cram_codec *val_c;
+    int32_t count = 0, n_items = 1;
+    int ret;
+
+    /* Fetch length */
+    ret = len_c->decode(slice, len_c, in, (char *)&count, &n_items);
+    //printf("ByteArray Len=%d\n", count);
+
+    val_c = c->byte_array_len.val_codec;
+    if (ret || !val_c || count < 0)
+        return -1;
+
+    ret = val_c->decode(slice, val_c, in, out, &count);
+    *out_size = count;
+
+    return ret;
+}
+
+/*
+ * Releases a BYTE_ARRAY_LEN decoder together with whichever of its two
+ * sub-codecs are set.  A NULL codec is ignored.
+ */
+void cram_byte_array_len_decode_free(cram_codec *c) {
+    cram_codec *sub;
+
+    if (c == NULL)
+        return;
+
+    sub = c->byte_array_len.len_codec;
+    if (sub != NULL)
+        sub->free(sub);
+
+    sub = c->byte_array_len.val_codec;
+    if (sub != NULL)
+        sub->free(sub);
+
+    free(c);
+}
+
+/*
+ * Parses a BYTE_ARRAY_LEN codec header of 'size' bytes at 'data'.  The
+ * header holds two nested codec descriptions (encoding, size, payload):
+ * first the length codec (decoded as E_INT), then the value codec
+ * (decoded with 'option').
+ *
+ * Returns a new decoder on success;
+ *         NULL on a malformed header, an unsupported sub-codec or
+ *         allocation failure.  Any sub-codec already created is freed.
+ */
+cram_codec *cram_byte_array_len_decode_init(char *data, int size,
+                                            enum cram_external_type option,
+                                            int version) {
+    cram_codec *c;
+    char *cp = data;
+    char *endp = data + size;
+    int32_t encoding = 0;
+    int32_t sub_size = -1;
+
+    if (!(c = malloc(sizeof(*c))))
+        return NULL;
+
+    c->codec  = E_BYTE_ARRAY_LEN;
+    c->decode = cram_byte_array_len_decode;
+    c->free   = cram_byte_array_len_decode_free;
+    /* Start from NULL so the error path can free whatever exists so far */
+    c->byte_array_len.len_codec = NULL;
+    c->byte_array_len.val_codec = NULL;
+
+    cp += safe_itf8_get(cp, endp, &encoding);
+    cp += safe_itf8_get(cp, endp, &sub_size);
+    if (sub_size < 0 || endp - cp < sub_size)
+        goto malformed;
+    c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size,
+                                                    E_INT, version);
+    if (c->byte_array_len.len_codec == NULL)
+        goto no_codec;
+    cp += sub_size;
+
+    sub_size = -1;
+    cp += safe_itf8_get(cp, endp, &encoding);
+    cp += safe_itf8_get(cp, endp, &sub_size);
+    if (sub_size < 0 || endp - cp < sub_size)
+        goto malformed;
+    c->byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size,
+                                                    option, version);
+    if (c->byte_array_len.val_codec == NULL)
+        goto no_codec;
+    cp += sub_size;
+
+    /* Both sub-headers must account for exactly the whole header */
+    if (cp - data != size)
+        goto malformed;
+
+    return c;
+
+ malformed:
+    fprintf(stderr, "Malformed byte_array_len header stream\n");
+ no_codec:
+    /* Frees c and any sub-codec that was successfully created */
+    cram_byte_array_len_decode_free(c);
+    return NULL;
+}
+
+/*
+ * Encodes a byte array as its length (via the length sub-codec) followed
+ * by its 'in_size' bytes (via the value sub-codec).
+ *
+ * Returns the bitwise OR of the two sub-codec results.
+ */
+int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
+                               char *in, int in_size) {
+    cram_codec *sub;
+    int32_t n_bytes = in_size;
+    int len_rc, val_rc;
+
+    sub = c->e_byte_array_len.len_codec;
+    len_rc = sub->encode(slice, sub, (char *)&n_bytes, 1);
+
+    sub = c->e_byte_array_len.val_codec;
+    val_rc = sub->encode(slice, sub, in, in_size);
+
+    return len_rc | val_rc;
+}
+
+/*
+ * Releases a BYTE_ARRAY_LEN encoder together with whichever of its two
+ * sub-codecs are set.  A NULL codec is ignored.
+ */
+void cram_byte_array_len_encode_free(cram_codec *c) {
+    cram_codec *sub;
+
+    if (c == NULL)
+        return;
+
+    sub = c->e_byte_array_len.len_codec;
+    if (sub != NULL)
+        sub->free(sub);
+
+    sub = c->e_byte_array_len.val_codec;
+    if (sub != NULL)
+        sub->free(sub);
+
+    free(c);
+}
+
+/*
+ * Serialises a BYTE_ARRAY_LEN encoder description into block 'b':
+ * optional 'prefix' string, codec id, combined size of the two
+ * sub-codec descriptions, then the length and value descriptions.
+ * Each sub-codec is first stored into a scratch block so that the
+ * combined size is known before it is written.
+ *
+ * Returns the number of bytes accounted for (prefix + ids + sub-codecs).
+ */
+int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
+                                     char *prefix, int version) {
+    cram_codec *sub;
+    cram_block *len_blk, *val_blk;
+    int total = 0, len_bytes, val_bytes;
+
+    if (prefix != NULL) {
+        size_t plen = strlen(prefix);
+        BLOCK_APPEND(b, prefix, plen);
+        total += plen;
+    }
+
+    sub = c->e_byte_array_len.len_codec;
+    len_blk = cram_new_block(0, 0);
+    len_bytes = sub->store(sub, len_blk, NULL, version);
+
+    sub = c->e_byte_array_len.val_codec;
+    val_blk = cram_new_block(0, 0);
+    val_bytes = sub->store(sub, val_blk, NULL, version);
+
+    total += itf8_put_blk(b, c->codec);
+    total += itf8_put_blk(b, len_bytes + val_bytes);
+    BLOCK_APPEND(b, BLOCK_DATA(len_blk), BLOCK_SIZE(len_blk));
+    BLOCK_APPEND(b, BLOCK_DATA(val_blk), BLOCK_SIZE(val_blk));
+
+    cram_free_block(len_blk);
+    cram_free_block(val_blk);
+
+    return total + len_bytes + val_bytes;
+}
+
+/*
+ * Creates a BYTE_ARRAY_LEN encoder.  'dat' points to a
+ * cram_byte_array_len_encoder whose {len,val}_encoding and {len,val}_dat
+ * members describe the two sub-codecs to create (length as E_INT, values
+ * as E_BYTE_ARRAY).  'st' and 'option' are not used here.
+ *
+ * Returns a new encoder on success;
+ *         NULL on allocation failure or if either sub-codec could not be
+ *         created (anything already created is freed).
+ */
+cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
+                                            enum cram_external_type option,
+                                            void *dat,
+                                            int version) {
+    cram_codec *c;
+    cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
+
+    c = malloc(sizeof(*c));
+    if (!c)
+        return NULL;
+    c->codec = E_BYTE_ARRAY_LEN;
+    c->free = cram_byte_array_len_encode_free;
+    c->encode = cram_byte_array_len_encode;
+    c->store = cram_byte_array_len_encode_store;
+
+    c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
+                                                      NULL, E_INT,
+                                                      e->len_dat,
+                                                      version);
+    c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
+                                                      NULL, E_BYTE_ARRAY,
+                                                      e->val_dat,
+                                                      version);
+
+    /* encode/store dereference both sub-codecs unconditionally */
+    if (!c->e_byte_array_len.len_codec || !c->e_byte_array_len.val_codec) {
+        cram_byte_array_len_encode_free(c);
+        return NULL;
+    }
+
+    return c;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * BYTE_ARRAY_STOP
+ */
+/*
+ * Decodes one stop-byte terminated array from the external block named by
+ * the codec's content_id.  Bytes up to (not including) the stop byte are
+ * copied to 'out' when it is non-NULL; with a NULL 'out' the input is
+ * consumed but nothing is written.  'in' is not used.
+ *
+ * On success *out_size is the number of bytes before the stop byte and
+ * the block's read index is advanced past the stop byte.
+ *
+ * Returns 0 on success (also when the block is absent and *out_size is 0);
+ *        -1 if the block is absent with a non-zero *out_size, the block is
+ *           exhausted, or no stop byte is found before the end of the data.
+ */
+static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
+                                            cram_block *in, char *out,
+                                            int *out_size) {
+    cram_block *b = NULL;
+    char *cp, *cp_start, *cp_end;
+    char stop;
+
+    b = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
+    if (!b)
+        return *out_size?-1:0;
+
+    if (b->idx >= b->uncomp_size)
+        return -1;
+
+    cp_start = cp = (char *)b->data + b->idx;
+    cp_end = (char *)b->data + b->uncomp_size;
+    stop = (char)c->byte_array_stop.stop;
+
+    /* Test the bound before each read so we never look past the data */
+    if (out) {
+        while (cp != cp_end && *cp != stop)
+            *out++ = *cp++;
+    } else {
+        // Consume input, but produce no output
+        while (cp != cp_end && *cp != stop)
+            cp++;
+    }
+
+    /* Ran off the end without meeting the stop byte */
+    if (cp == cp_end)
+        return -1;
+
+    *out_size = cp - cp_start;
+    b->idx = cp - (char *)b->data + 1;
+
+    return 0;
+}
+
+/*
+ * Block-output flavour of the stop-byte decoder: 'out_' is really a
+ * cram_block to which the bytes preceding the stop byte are appended.
+ * When the destination has more spare room than the input remaining, the
+ * bytes are copied directly; otherwise the run is measured first and
+ * appended with BLOCK_APPEND.  'in' is not used.
+ *
+ * Returns 0 on success (also when the block is absent and *out_size is 0);
+ *        -1 if the block is absent with a non-zero *out_size or the
+ *           block is exhausted.
+ */
+int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
+                                      cram_block *in, char *out_,
+                                      int *out_size) {
+    cram_block *dst = (cram_block *)out_;
+    cram_block *src;
+    char *rd, *rd_first, *rd_limit;
+    char terminator;
+
+    src = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
+    if (src == NULL)
+        return *out_size ? -1 : 0;
+
+    if (src->idx >= src->uncomp_size)
+        return -1;
+
+    rd_first = rd = (char *)src->data + src->idx;
+    rd_limit = (char *)src->data + src->uncomp_size;
+    terminator = c->byte_array_stop.stop;
+
+    if (rd_limit - rd < dst->alloc - dst->byte) {
+        /* Guaranteed to fit: copy straight into the destination */
+        char *wr = (char *)BLOCK_END(dst);
+
+        while (rd != rd_limit && *rd != terminator)
+            *wr++ = *rd++;
+        BLOCK_SIZE(dst) = wr - (char *)BLOCK_DATA(dst);
+    } else {
+        /* May not fit: find the run length, then append in one go */
+        while (rd != rd_limit && *rd != terminator)
+            rd++;
+        BLOCK_APPEND(dst, rd_first, rd - rd_first);
+        BLOCK_GROW(dst, rd - rd_first);
+    }
+
+    *out_size = rd - rd_first;
+    src->idx = rd - (char *)src->data + 1;
+
+    return 0;
+}
+
+/* Releases a BYTE_ARRAY_STOP decoder; a NULL codec is ignored. */
+void cram_byte_array_stop_decode_free(cram_codec *c) {
+    if (c != NULL)
+        free(c);
+}
+
+/*
+ * Parses a BYTE_ARRAY_STOP codec header of 'size' bytes at 'data': one
+ * stop byte followed by the content id of the external block, stored as
+ * a 4-byte little-endian value for CRAM major version 1 and as ITF8
+ * otherwise.
+ *
+ * 'option' must be E_BYTE_ARRAY or E_BYTE_ARRAY_BLOCK; it selects the
+ * decode function.
+ *
+ * Returns a new decoder on success;
+ *         NULL on allocation failure, unsupported 'option', or a header
+ *         that is truncated or not exactly 'size' bytes long.
+ */
+cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
+                                             enum cram_external_type option,
+                                             int version) {
+    cram_codec *c;
+    unsigned char *cp = (unsigned char *)data;
+
+    if (!(c = malloc(sizeof(*c))))
+        return NULL;
+
+    c->codec = E_BYTE_ARRAY_STOP;
+    switch (option) {
+    case E_BYTE_ARRAY_BLOCK:
+        c->decode = cram_byte_array_stop_decode_block;
+        break;
+    case E_BYTE_ARRAY:
+        c->decode = cram_byte_array_stop_decode_char;
+        break;
+    default:
+        fprintf(stderr, "byte_array_stop codec only supports BYTE_ARRAYs.\n");
+        free(c);
+        return NULL;
+    }
+    c->free = cram_byte_array_stop_decode_free;
+
+    /* Need the stop byte plus at least one byte of content id */
+    if (size < 2)
+        goto malformed;
+
+    c->byte_array_stop.stop = *cp++;
+    if (CRAM_MAJOR_VERS(version) == 1) {
+        if (size < 5)
+            goto malformed;
+        /* Assemble as unsigned so the top byte cannot shift into the sign */
+        c->byte_array_stop.content_id =
+            (int32_t)((uint32_t)cp[0]        | ((uint32_t)cp[1] << 8) |
+                      ((uint32_t)cp[2] << 16) | ((uint32_t)cp[3] << 24));
+        cp += 4;
+    } else {
+        /* NOTE(review): assumes safe_itf8_get() returns the number of bytes
+         * consumed and a non-positive value when the input is too short -
+         * confirm against its definition. */
+        int n = safe_itf8_get((char *)cp, data + size,
+                              &c->byte_array_stop.content_id);
+        if (n <= 0)
+            goto malformed;
+        cp += n;
+    }
+
+    if ((char *)cp - data != size)
+        goto malformed;
+
+    return c;
+
+ malformed:
+    fprintf(stderr, "Malformed byte_array_stop header stream\n");
+    free(c);
+    return NULL;
+}
+
+/*
+ * Appends the 'in_size' bytes at 'in' to the codec's output block,
+ * followed by the codec's stop byte.  'slice' is not used.
+ *
+ * Always returns 0.
+ */
+int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
+                                char *in, int in_size) {
+    cram_block *dst = c->out;
+
+    BLOCK_APPEND(dst, in, in_size);
+    BLOCK_APPEND_CHAR(dst, c->e_byte_array_stop.stop);
+
+    return 0;
+}
+
+/* Releases a BYTE_ARRAY_STOP encoder; a NULL codec is ignored. */
+void cram_byte_array_stop_encode_free(cram_codec *c) {
+    if (c != NULL)
+        free(c);
+}
+
+/*
+ * Serialises a BYTE_ARRAY_STOP encoder description into block 'b':
+ * optional 'prefix' string, codec id, payload size, stop byte and the
+ * content id (4 little-endian bytes for CRAM major version 1, ITF8
+ * otherwise).
+ *
+ * Returns the number of bytes appended to 'b'.
+ */
+int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
+                                      char *prefix, int version) {
+    char hdr[20];
+    char *wp = hdr;
+    int written = 0;
+
+    if (prefix != NULL) {
+        size_t plen = strlen(prefix);
+        BLOCK_APPEND(b, prefix, plen);
+        written += plen;
+    }
+
+    wp += itf8_put(wp, c->codec);
+
+    if (CRAM_MAJOR_VERS(version) != 1) {
+        wp += itf8_put(wp, 1 + itf8_size(c->e_byte_array_stop.content_id));
+        *wp++ = c->e_byte_array_stop.stop;
+        wp += itf8_put(wp, c->e_byte_array_stop.content_id);
+    } else {
+        int shift;
+
+        /* Fixed layout: stop byte + 32-bit little-endian content id */
+        wp += itf8_put(wp, 5);
+        *wp++ = c->e_byte_array_stop.stop;
+        for (shift = 0; shift < 32; shift += 8)
+            *wp++ = (c->e_byte_array_stop.content_id >> shift) & 0xff;
+    }
+
+    BLOCK_APPEND(b, hdr, wp-hdr);
+    written += wp-hdr;
+
+    return written;
+}
+
+/*
+ * Creates a BYTE_ARRAY_STOP encoder.  'dat' is read as an array of two
+ * ints: [0] the stop byte and [1] the content id.  'st', 'option' and
+ * 'version' are not used here.
+ *
+ * Returns a new encoder, or NULL on allocation failure.
+ */
+cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
+                                             enum cram_external_type option,
+                                             void *dat,
+                                             int version) {
+    const int *params = (int *)dat;
+    cram_codec *codec = malloc(sizeof(*codec));
+
+    if (codec == NULL)
+        return NULL;
+
+    codec->codec  = E_BYTE_ARRAY_STOP;
+    codec->free   = cram_byte_array_stop_encode_free;
+    codec->encode = cram_byte_array_stop_encode;
+    codec->store  = cram_byte_array_stop_encode_store;
+
+    codec->e_byte_array_stop.stop       = params[0];
+    codec->e_byte_array_stop.content_id = params[1];
+
+    return codec;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ */
+
+/*
+ * Maps a cram_encoding value to a printable name.
+ * Returns "?" for E_NUM_CODECS and any unrecognised value.
+ */
+const char *cram_encoding2str(enum cram_encoding t) {
+    const char *name = "?";
+
+    switch (t) {
+    case E_NULL:            name = "NULL";            break;
+    case E_EXTERNAL:        name = "EXTERNAL";        break;
+    case E_GOLOMB:          name = "GOLOMB";          break;
+    case E_HUFFMAN:         name = "HUFFMAN";         break;
+    case E_BYTE_ARRAY_LEN:  name = "BYTE_ARRAY_LEN";  break;
+    case E_BYTE_ARRAY_STOP: name = "BYTE_ARRAY_STOP"; break;
+    case E_BETA:            name = "BETA";            break;
+    case E_SUBEXP:          name = "SUBEXP";          break;
+    case E_GOLOMB_RICE:     name = "GOLOMB_RICE";     break;
+    case E_GAMMA:           name = "GAMMA";           break;
+    case E_NUM_CODECS:
+    default:
+        break;
+    }
+
+    return name;
+}
+
+/*
+ * Table of decoder constructors, indexed by enum cram_encoding in
+ * cram_decoder_init().  A NULL entry means no decoder is implemented for
+ * that encoding.
+ * NOTE(review): the entries are positional; they appear to follow the same
+ * order as the cases in cram_encoding2str() (NULL, EXTERNAL, GOLOMB,
+ * HUFFMAN, ...) - confirm against the enum definition.
+ */
+static cram_codec *(*decode_init[])(char *data,
+ int size,
+ enum cram_external_type option,
+ int version) = {
+ NULL,
+ cram_external_decode_init,
+ NULL,
+ cram_huffman_decode_init,
+ cram_byte_array_len_decode_init,
+ cram_byte_array_stop_decode_init,
+ cram_beta_decode_init,
+ cram_subexp_decode_init,
+ NULL,
+ cram_gamma_decode_init,
+};
+
+/*
+ * Creates a decoder for 'codec' from its 'size'-byte serialised header at
+ * 'data' by dispatching through the decode_init[] table.
+ *
+ * Returns the new decoder, or NULL if the codec is out of range, has no
+ * decoder implementation, or its constructor failed.
+ */
+cram_codec *cram_decoder_init(enum cram_encoding codec,
+                              char *data, int size,
+                              enum cram_external_type option,
+                              int version) {
+    if (codec < E_NULL || codec >= E_NUM_CODECS || !decode_init[codec]) {
+        fprintf(stderr, "Unimplemented codec of type %s\n", cram_encoding2str(codec));
+        return NULL;
+    }
+
+    return decode_init[codec](data, size, option, version);
+}
+
+/*
+ * Table of encoder constructors, indexed by enum cram_encoding in
+ * cram_encoder_init().  A NULL entry means no encoder is implemented for
+ * that encoding.
+ * NOTE(review): the entries are positional; they appear to follow the same
+ * order as the cases in cram_encoding2str() - confirm against the enum
+ * definition.
+ */
+static cram_codec *(*encode_init[])(cram_stats *stx,
+ enum cram_external_type option,
+ void *opt,
+ int version) = {
+ NULL,
+ cram_external_encode_init,
+ NULL,
+ cram_huffman_encode_init,
+ cram_byte_array_len_encode_init,
+ cram_byte_array_stop_encode_init,
+ cram_beta_encode_init,
+ NULL, //cram_subexp_encode_init,
+ NULL,
+ NULL, //cram_gamma_encode_init,
+};
+
+/*
+ * Creates an encoder for 'codec' by dispatching through the encode_init[]
+ * table.  The new codec's output block pointer is initialised to NULL.
+ *
+ * Returns the new encoder;
+ *         NULL if 'st' is given but holds no values, or if the
+ *         constructor failed.
+ * Aborts the process if the codec is out of range or has no encoder
+ * implementation.
+ */
+cram_codec *cram_encoder_init(enum cram_encoding codec,
+                              cram_stats *st,
+                              enum cram_external_type option,
+                              void *dat,
+                              int version) {
+    if (st && !st->nvals)
+        return NULL;
+
+    /* Range-check before indexing the table, as cram_decoder_init() does */
+    if (codec >= E_NULL && codec < E_NUM_CODECS && encode_init[codec]) {
+        cram_codec *r;
+        if ((r = encode_init[codec](st, option, dat, version)))
+            r->out = NULL;
+        return r;
+    } else {
+        fprintf(stderr, "Unimplemented codec of type %s\n", cram_encoding2str(codec));
+        abort();
+    }
+}
+
+/*
+ * Reports which block(s) a codec reads from or writes to.
+ *
+ * Returns the content_id used by this codec: -1 means the CORE block and
+ * -2 means no block is needed.  When 'id2' is non-NULL it receives the
+ * second content_id, which is only meaningful for BYTE_ARRAY_LEN (it uses
+ * two codecs); for every other codec it is set to -2.
+ */
+int cram_codec_to_id(cram_codec *c, int *id2) {
+    int primary;
+    int secondary = -2;
+
+    switch (c->codec) {
+    case E_NULL:
+        primary = -2;
+        break;
+
+    case E_EXTERNAL:
+        primary = c->external.content_id;
+        break;
+
+    case E_BYTE_ARRAY_STOP:
+        primary = c->byte_array_stop.content_id;
+        break;
+
+    case E_BYTE_ARRAY_LEN:
+        primary   = cram_codec_to_id(c->byte_array_len.len_codec, NULL);
+        secondary = cram_codec_to_id(c->byte_array_len.val_codec, NULL);
+        break;
+
+    case E_HUFFMAN:
+        /* A single-code table needs no block at all */
+        primary = -1;
+        if (c->huffman.ncodes == 1)
+            primary = -2;
+        break;
+
+    case E_GOLOMB:
+    case E_BETA:
+    case E_SUBEXP:
+    case E_GOLOMB_RICE:
+    case E_GAMMA:
+        primary = -1;
+        break;
+
+    default:
+        fprintf(stderr, "Unknown codec type %d\n", c->codec);
+        primary = -1;
+    }
+
+    if (id2 != NULL)
+        *id2 = secondary;
+
+    return primary;
+}
+
+
+/*
+ * cram_codec structures are specialised for decoding or encoding.
+ * Unfortunately this makes turning a decoder into an encoder (such as
+ * when transcoding files) problematic.
+ *
+ * This function converts a cram decoder codec into an encoder version
+ * in-place (ie it modifies the codec itself).
+ *
+ * EXTERNAL, BETA and BYTE_ARRAY_STOP share their struct between decoder
+ * and encoder, so only the function pointers are swapped.  HUFFMAN and
+ * BYTE_ARRAY_LEN use a different encoder struct, which is built in a
+ * zero-initialised temporary and then copied over *c.  'fd' is only
+ * passed on to the recursive calls.
+ *
+ * Returns 0 on success;
+ *        -1 on failure (unsupported codec or decode function, or out of
+ *           memory).
+ */
+int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
+    int j;
+
+    switch (c->codec) {
+    case E_EXTERNAL:
+        // shares struct with decode
+        c->free = cram_external_encode_free;
+        c->store = cram_external_encode_store;
+        if (c->decode == cram_external_decode_int)
+            c->encode = cram_external_encode_int;
+        else if (c->decode == cram_external_decode_char)
+            c->encode = cram_external_encode_char;
+        else
+            return -1;
+        break;
+
+    case E_HUFFMAN: {
+        // New structure, so switch.
+        // FIXME: we huffman and e_huffman structs amended, we could
+        // unify this.
+        // calloc so that fields not set below (out, decode, unused
+        // val2code slots) are copied into *c as zero rather than garbage.
+        cram_codec *t = calloc(1, sizeof(*t));
+        if (!t)
+            return -1;
+        t->codec = E_HUFFMAN;
+        t->free = cram_huffman_encode_free;
+        t->store = cram_huffman_encode_store;
+        t->e_huffman.codes = c->huffman.codes;
+        t->e_huffman.nvals = c->huffman.ncodes;
+        for (j = 0; j < t->e_huffman.nvals; j++) {
+            int32_t sym = t->e_huffman.codes[j].symbol;
+            if (sym >= -1 && sym < MAX_HUFF)
+                t->e_huffman.val2code[sym+1] = j;
+        }
+
+        if (c->decode == cram_huffman_decode_char0)
+            t->encode = cram_huffman_encode_char0;
+        else if (c->decode == cram_huffman_decode_char)
+            t->encode = cram_huffman_encode_char;
+        else if (c->decode == cram_huffman_decode_int0)
+            t->encode = cram_huffman_encode_int0;
+        else if (c->decode == cram_huffman_decode_int)
+            t->encode = cram_huffman_encode_int;
+        else {
+            free(t);
+            return -1;
+        }
+        *c = *t;
+        free(t);
+        break;
+    }
+
+    case E_BETA:
+        // shares struct with decode
+        c->free = cram_beta_encode_free;
+        c->store = cram_beta_encode_store;
+        if (c->decode == cram_beta_decode_int)
+            c->encode = cram_beta_encode_int;
+        else if (c->decode == cram_beta_decode_char)
+            c->encode = cram_beta_encode_char;
+        else
+            return -1;
+        break;
+
+    case E_BYTE_ARRAY_LEN: {
+        cram_codec *t = calloc(1, sizeof(*t));
+        if (!t)
+            return -1;
+        t->codec = E_BYTE_ARRAY_LEN;
+        t->free = cram_byte_array_len_encode_free;
+        t->store = cram_byte_array_len_encode_store;
+        t->encode = cram_byte_array_len_encode;
+        t->e_byte_array_len.len_codec = c->byte_array_len.len_codec;
+        t->e_byte_array_len.val_codec = c->byte_array_len.val_codec;
+        if (cram_codec_decoder2encoder(fd, t->e_byte_array_len.len_codec) == -1 ||
+            cram_codec_decoder2encoder(fd, t->e_byte_array_len.val_codec) == -1) {
+            // The sub-codecs are still referenced (and owned) by c, so
+            // release only the temporary; t->free(t) would free them too.
+            free(t);
+            return -1;
+        }
+
+        // {len,val}_{encoding,dat} are not meaningful here, but unused.
+        *c = *t;
+        free(t);
+        break;
+    }
+
+    case E_BYTE_ARRAY_STOP:
+        // shares struct with decode
+        c->free = cram_byte_array_stop_encode_free;
+        c->store = cram_byte_array_stop_encode_store;
+        c->encode = cram_byte_array_stop_encode;
+        break;
+
+    default:
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/htslib/cram/cram_codecs.h b/htslib/cram/cram_codecs.h
new file mode 100644
index 0000000..2c13c7e
--- /dev/null
+++ b/htslib/cram/cram_codecs.h
@@ -0,0 +1,194 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _CRAM_ENCODINGS_H_
+#define _CRAM_ENCODINGS_H_
+
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct cram_codec;
+
+/*
+ * Slow but simple huffman decoder to start with.
+ * Read a bit at a time, keeping track of {length, value}
+ * eg. 1 1 0 1 => {1,1}, {2,3}, {3,6}, {4,13}
+ *
+ * Keep track of this through the huffman code table.
+ * For fast scanning we have an index of where the first code of length X
+ * appears.
+ */
+// One entry of a Huffman code table: a symbol and its assigned code.
+typedef struct {
+ int32_t symbol; // the value this code stands for
+ int32_t p; // next code start value, minus index to codes[]
+ int32_t code; // canonical code value assigned to the symbol
+ int32_t len; // code length in bits
+} cram_huffman_code;
+
+// Huffman decoder state: the code table and its size.
+typedef struct {
+ int ncodes; // number of entries in codes[]
+ cram_huffman_code *codes; // code table
+} cram_huffman_decoder;
+
+// Symbols in the range -1 .. MAX_HUFF-1 get a direct slot in val2code[].
+#define MAX_HUFF 128
+// Huffman encoder state.
+typedef struct {
+ cram_huffman_code *codes; // code table
+ int nvals; // number of entries in codes[]
+ int val2code[MAX_HUFF+1]; // value to code lookup for small values; indexed by symbol+1, holds an index into codes[]
+} cram_huffman_encoder;
+
+// Parameters of the BETA codec (also used for its encoder, see e_beta).
+// NOTE(review): field meanings below are inferred from the names only.
+typedef struct {
+ int32_t offset; // presumably an offset applied to each value - confirm
+ int32_t nbits; // presumably the number of bits per value - confirm
+} cram_beta_decoder;
+
+// Parameters of the GAMMA codec.
+typedef struct {
+ int32_t offset; // presumably an offset applied to each value - confirm
+} cram_gamma_decoder;
+
+// Parameters of the SUBEXP codec.
+// NOTE(review): field meanings below are inferred from the names only.
+typedef struct {
+ int32_t offset; // presumably an offset applied to each value - confirm
+ int32_t k; // presumably the sub-exponential order parameter - confirm
+} cram_subexp_decoder;
+
+// Parameters of the EXTERNAL codec (also used for its encoder, see e_external).
+typedef struct {
+ int32_t content_id; // id of the block used; reported by cram_codec_to_id()
+ enum cram_external_type type;
+} cram_external_decoder;
+
+// BYTE_ARRAY_LEN decoder: a pair of nested codecs.
+typedef struct {
+ struct cram_codec *len_codec; // decodes the array length
+ struct cram_codec *val_codec; // decodes the array contents
+} cram_byte_array_len_decoder;
+
+// Parameters of the BYTE_ARRAY_STOP codec (also used for its encoder).
+typedef struct {
+ unsigned char stop; // byte value that terminates each array
+ int32_t content_id; // id of the block holding the data
+} cram_byte_array_stop_decoder;
+
+// BYTE_ARRAY_LEN encoder.  The *_encoding / *_dat members are the inputs
+// read by cram_byte_array_len_encode_init(); the *_codec members are the
+// sub-codecs it creates from them.
+typedef struct {
+ enum cram_encoding len_encoding; // encoding to use for the length
+ enum cram_encoding val_encoding; // encoding to use for the contents
+ void *len_dat; // passed as 'dat' when creating len_codec
+ void *val_dat; // passed as 'dat' when creating val_codec
+ struct cram_codec *len_codec; // encodes the array length
+ struct cram_codec *val_codec; // encodes the array contents
+} cram_byte_array_len_encoder;
+
+/*
+ * A generic codec structure.
+ *
+ * One instance acts either as a decoder or as an encoder.  'codec' says
+ * which encoding it implements and therefore which member of the
+ * anonymous union holds its parameters; the function pointers provide
+ * the operations.  Members with an e_ prefix are the encoder views.
+ */
+typedef struct cram_codec {
+ enum cram_encoding codec; // which encoding this codec implements
+ cram_block *out; // output block for encoders; set to NULL by cram_encoder_init()
+ void (*free)(struct cram_codec *codec); // releases the codec and anything it owns
+ int (*decode)(cram_slice *slice, struct cram_codec *codec,
+ cram_block *in, char *out, int *out_size);
+ int (*encode)(cram_slice *slice, struct cram_codec *codec,
+ char *in, int in_size);
+ int (*store)(struct cram_codec *codec, cram_block *b, char *prefix,
+ int version); // serialises the codec description into block b
+ union {
+ cram_huffman_decoder huffman;
+ cram_external_decoder external;
+ cram_beta_decoder beta;
+ cram_gamma_decoder gamma;
+ cram_subexp_decoder subexp;
+ cram_byte_array_len_decoder byte_array_len;
+ cram_byte_array_stop_decoder byte_array_stop;
+
+ cram_huffman_encoder e_huffman;
+ cram_external_decoder e_external;
+ cram_byte_array_stop_decoder e_byte_array_stop;
+ cram_byte_array_len_encoder e_byte_array_len;
+ cram_beta_decoder e_beta;
+ };
+} cram_codec;
+
+const char *cram_encoding2str(enum cram_encoding t);
+
+cram_codec *cram_decoder_init(enum cram_encoding codec, char *data, int size,
+ enum cram_external_type option,
+ int version);
+cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st,
+ enum cram_external_type option, void *dat,
+ int version);
+
+//int cram_decode(void *codes, char *in, int in_size, char *out, int *out_size);
+//void cram_decoder_free(void *codes);
+
+//#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, (--b->bit == -1) && (b->bit = 7, b->byte++))
+
+/*
+ * Reads one bit from block b into v: shifts v left by one and ORs in bit
+ * number b->bit of b->data[b->byte], then decrements b->bit.  When b->bit
+ * drops below zero, b->byte is advanced by one and the "&= 7" wraps b->bit
+ * back to 7.  No bounds checking is done here.  Both arguments are
+ * expanded several times, so they must be free of side effects.
+ */
+#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, b->byte += (--b->bit<0), b->bit&=7)
+
+/*
+ * Check that enough bits are left in a block to satisfy a bit-based decoder.
+ * Return 0 if there are enough
+ *        1 if not (a negative nbits also counts as "not enough").
+ */
+
+static inline int cram_not_enough_bits(cram_block *blk, int nbits) {
+    /* A negative request can never be satisfied */
+    if (nbits < 0)
+        return 1;
+
+    /* Already at or past the end of the data, yet bits are wanted */
+    if (blk->byte >= blk->uncomp_size && nbits > 0)
+        return 1;
+
+    /* The bit count is only computed when few enough bytes remain;
+     * presumably this keeps the multiplication by 8 from overflowing. */
+    if (blk->uncomp_size - blk->byte <= INT32_MAX / 8 + 1 &&
+        (blk->uncomp_size - blk->byte) * 8 + blk->bit - 7 < nbits)
+        return 1;
+
+    return 0;
+}
+
+/*
+ * Returns the content_id used by this codec, also in id2 if byte_array_len.
+ * Returns -1 for the CORE block and -2 for unneeded.
+ * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
+ */
+int cram_codec_to_id(cram_codec *c, int *id2);
+
+/*
+ * cram_codec structures are specialised for decoding or encoding.
+ * Unfortunately this makes turning a decoder into an encoder (such as
+ * when transcoding files) problematic.
+ *
+ * This function converts a cram decoder codec into an encoder version
+ * in-place (ie it modifies the codec itself).
+ *
+ * Returns 0 on success;
+ * -1 on failure.
+ */
+int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CRAM_ENCODINGS_H_ */
diff --git a/htslib/cram/cram_decode.c b/htslib/cram/cram_decode.c
new file mode 100644
index 0000000..7a3b5fc
--- /dev/null
+++ b/htslib/cram/cram_decode.c
@@ -0,0 +1,3159 @@
+/*
+Copyright (c) 2012-2014 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * - In-memory decoding of CRAM data structures.
+ * - Iterator for reading CRAM record by record.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "cram/cram.h"
+#include "cram/os.h"
+#include "htslib/hts.h"
+
+//Whether CIGAR has just M or uses = and X to indicate match and mismatch
+//#define USE_X
+
+/* ----------------------------------------------------------------------
+ * CRAM compression headers
+ */
+
+/*
+ * Decodes the Tag Dictionary record in the preservation map and stores
+ * the result in the compression header: h->TL becomes an array of h->nTL
+ * pointers, one per nul-terminated entry, all pointing into the block
+ * kept in h->TD_blk.  An empty dictionary yields nTL == 0 and TL == NULL.
+ *
+ * Returns number of bytes decoded on success
+ *        -1 on failure
+ */
+int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) {
+    char *start = cp;
+    cram_block *blk;
+    unsigned char *bytes;
+    int32_t dict_len = 0;
+    int n_entries, pos, consumed;
+
+    blk = cram_new_block(0, 0);
+    if (blk == NULL)
+        return -1;
+
+    /* Decode */
+    cp += safe_itf8_get(cp, endp, &dict_len);
+    if (dict_len == 0) {
+        h->nTL = 0;
+        h->TL = NULL;
+        cram_free_block(blk);
+        return cp - start;
+    }
+
+    if (dict_len < 0 || endp - cp < dict_len) {
+        cram_free_block(blk);
+        return -1;
+    }
+
+    BLOCK_APPEND(blk, cp, dict_len);
+    cp += dict_len;
+    consumed = cp - start;
+
+    // Force nul termination if missing
+    if (BLOCK_DATA(blk)[BLOCK_SIZE(blk)-1] != '\0')
+        BLOCK_APPEND_CHAR(blk, '\0');
+
+    /* Set up TL lookup table */
+    bytes = BLOCK_DATA(blk);
+
+    // First pass: count the nul-terminated entries
+    n_entries = 0;
+    for (pos = 0; pos < BLOCK_SIZE(blk); pos++) {
+        n_entries++;
+        while (bytes[pos])
+            pos++;
+    }
+
+    // Second pass: record where each entry starts
+    h->nTL = n_entries;
+    h->TL = calloc(h->nTL, sizeof(unsigned char *));
+    if (h->TL == NULL) {
+        cram_free_block(blk);
+        return -1;
+    }
+    n_entries = 0;
+    for (pos = 0; pos < BLOCK_SIZE(blk); pos++) {
+        h->TL[n_entries++] = &bytes[pos];
+        while (bytes[pos])
+            pos++;
+    }
+    h->TD_blk = blk;
+
+    return consumed;
+}
+
+/*
+ * Decodes a CRAM block compression header.
+ * Returns header ptr on success
+ * NULL on failure
+ */
+cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
+ cram_block *b) {
+ char *cp, *endp, *cp_copy;
+ cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
+ int i;
+ int32_t map_size = 0, map_count = 0;
+
+ if (!hdr)
+ return NULL;
+
+ if (b->method != RAW) {
+ if (cram_uncompress_block(b)) {
+ free(hdr);
+ return NULL;
+ }
+ }
+
+ cp = (char *)b->data;
+ endp = cp + b->uncomp_size;
+
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id);
+ cp += safe_itf8_get(cp, endp, &hdr->ref_seq_start);
+ cp += safe_itf8_get(cp, endp, &hdr->ref_seq_span);
+ cp += safe_itf8_get(cp, endp, &hdr->num_records);
+ cp += safe_itf8_get(cp, endp, &hdr->num_landmarks);
+ if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
+ free(hdr);
+ return NULL;
+ }
+ for (i = 0; i < hdr->num_landmarks; i++) {
+ cp += safe_itf8_get(cp, endp, &hdr->landmark[i]);
+ }
+ }
+
+ hdr->preservation_map = kh_init(map);
+
+ memset(hdr->rec_encoding_map, 0,
+ CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
+ memset(hdr->tag_encoding_map, 0,
+ CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
+
+ if (!hdr->preservation_map) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ /* Initialise defaults for preservation map */
+ hdr->mapped_qs_included = 0;
+ hdr->unmapped_qs_included = 0;
+ hdr->unmapped_placed = 0;
+ hdr->qs_included = 0;
+ hdr->read_names_included = 0;
+ hdr->AP_delta = 1;
+ memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
+
+ /* Preservation map */
+ cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
+ cp += safe_itf8_get(cp, endp, &map_count);
+ for (i = 0; i < map_count; i++) {
+ pmap_t hd;
+ khint_t k;
+ int r;
+
+ if (endp - cp < 2) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ cp += 2;
+ switch(CRAM_KEY(cp[-2],cp[-1])) {
+ case CRAM_KEY('M','I'):
+ hd.i = *cp++;
+ k = kh_put(map, hdr->preservation_map, "MI", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ kh_val(hdr->preservation_map, k) = hd;
+ hdr->mapped_qs_included = hd.i;
+ break;
+
+ case CRAM_KEY('U','I'):
+ hd.i = *cp++;
+ k = kh_put(map, hdr->preservation_map, "UI", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ kh_val(hdr->preservation_map, k) = hd;
+ hdr->unmapped_qs_included = hd.i;
+ break;
+
+ case CRAM_KEY('P','I'):
+ hd.i = *cp++;
+ k = kh_put(map, hdr->preservation_map, "PI", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ kh_val(hdr->preservation_map, k) = hd;
+ hdr->unmapped_placed = hd.i;
+ break;
+
+ case CRAM_KEY('R','N'):
+ hd.i = *cp++;
+ k = kh_put(map, hdr->preservation_map, "RN", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ kh_val(hdr->preservation_map, k) = hd;
+ hdr->read_names_included = hd.i;
+ break;
+
+ case CRAM_KEY('A','P'):
+ hd.i = *cp++;
+ k = kh_put(map, hdr->preservation_map, "AP", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ kh_val(hdr->preservation_map, k) = hd;
+ hdr->AP_delta = hd.i;
+ break;
+
+ case CRAM_KEY('R','R'):
+ hd.i = *cp++;
+ k = kh_put(map, hdr->preservation_map, "RR", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ kh_val(hdr->preservation_map, k) = hd;
+ fd->no_ref = !hd.i;
+ break;
+
+ case CRAM_KEY('S','M'):
+ if (endp - cp < 5) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
+ hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
+ hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
+ hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
+
+ hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
+ hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
+ hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
+ hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
+
+ hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
+ hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
+ hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
+ hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
+
+ hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
+ hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
+ hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
+ hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
+
+ hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
+ hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
+ hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
+ hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
+
+ hd.p = cp;
+ cp += 5;
+
+ k = kh_put(map, hdr->preservation_map, "SM", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ kh_val(hdr->preservation_map, k) = hd;
+ break;
+
+ case CRAM_KEY('T','D'): {
+ int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary
+ if (sz < 0) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ hd.p = cp;
+ cp += sz;
+
+ k = kh_put(map, hdr->preservation_map, "TD", &r);
+ if (-1 == r) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ kh_val(hdr->preservation_map, k) = hd;
+ break;
+ }
+
+ default:
+ fprintf(stderr, "Unrecognised preservation map key %c%c\n",
+ cp[-2], cp[-1]);
+ // guess byte;
+ cp++;
+ break;
+ }
+ }
+ if (cp - cp_copy != map_size) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ /* Record encoding map */
+ cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
+ cp += safe_itf8_get(cp, endp, &map_count);
+ for (i = 0; i < map_count; i++) {
+ char *key = cp;
+ int32_t encoding = E_NULL;
+ int32_t size = 0;
+ cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
+
+ if (!m || endp - cp < 4) {
+ free(m);
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ cp += 2;
+ cp += safe_itf8_get(cp, endp, &encoding);
+ cp += safe_itf8_get(cp, endp, &size);
+
+ // Fill out cram_map purely for cram_dump to dump out.
+ m->key = (key[0]<<8)|key[1];
+ m->encoding = encoding;
+ m->size = size;
+ m->offset = cp - (char *)b->data;
+ m->codec = NULL;
+
+ if (m->encoding == E_NULL)
+ continue;
+
+ if (size < 0 || endp - cp < size) {
+ free(m);
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
+
+ /*
+ * For CRAM1.0 CF and BF are Byte and not Int.
+ * Practically speaking it makes no difference unless we have a
+ * 1.0 format file that stores these in EXTERNAL as only then
+ * does Byte vs Int matter.
+ *
+ * Neither this C code nor Java reference implementations did this,
+ * so we gloss over it and treat them as int.
+ */
+
+ if (key[0] == 'B' && key[1] == 'F') {
+ if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'C' && key[1] == 'F') {
+ if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'R' && key[1] == 'I') {
+ if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'R' && key[1] == 'L') {
+ if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'A' && key[1] == 'P') {
+ if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'R' && key[1] == 'G') {
+ if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'M' && key[1] == 'F') {
+ if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'N' && key[1] == 'S') {
+ if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'N' && key[1] == 'P') {
+ if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'T' && key[1] == 'S') {
+ if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'N' && key[1] == 'F') {
+ if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'T' && key[1] == 'C') {
+ if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'T' && key[1] == 'N') {
+ if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'F' && key[1] == 'N') {
+ if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'F' && key[1] == 'C') {
+ if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'F' && key[1] == 'P') {
+ if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'B' && key[1] == 'S') {
+ if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'I' && key[1] == 'N') {
+ if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'S' && key[1] == 'C') {
+ if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'D' && key[1] == 'L') {
+ if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'B' && key[1] == 'A') {
+ if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'B' && key[1] == 'B') {
+ if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'R' && key[1] == 'S') {
+ if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'P' && key[1] == 'D') {
+ if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'H' && key[1] == 'C') {
+ if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'M' && key[1] == 'Q') {
+ if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'R' && key[1] == 'N') {
+ if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY_BLOCK,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'Q' && key[1] == 'S') {
+ if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size,
+ E_BYTE,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'Q' && key[1] == 'Q') {
+ if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'T' && key[1] == 'L') {
+ if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size,
+ E_INT,
+ fd->version))) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+ } else if (key[0] == 'T' && key[1] == 'M') {
+ } else if (key[0] == 'T' && key[1] == 'V') {
+ } else
+ fprintf(stderr, "Unrecognised key: %.2s\n", key);
+
+ cp += size;
+
+ m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
+ hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
+ }
+ if (cp - cp_copy != map_size) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ /* Tag encoding map */
+ cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
+ cp += safe_itf8_get(cp, endp, &map_count);
+ for (i = 0; i < map_count; i++) {
+ int32_t encoding = E_NULL;
+ int32_t size = 0;
+ cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
+ char *key;
+
+ if (!m || endp - cp < 6) {
+ free(m);
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ key = cp + 1;
+ m->key = (key[0]<<16)|(key[1]<<8)|key[2];
+
+ cp += 4; // Strictly ITF8, but this suffices
+ cp += safe_itf8_get(cp, endp, &encoding);
+ cp += safe_itf8_get(cp, endp, &size);
+
+ m->encoding = encoding;
+ m->size = size;
+ m->offset = cp - (char *)b->data;
+ if (size < 0 || endp - cp < size ||
+ !(m->codec = cram_decoder_init(encoding, cp, size,
+ E_BYTE_ARRAY_BLOCK, fd->version))) {
+ cram_free_compression_header(hdr);
+ free(m);
+ return NULL;
+ }
+
+ cp += size;
+
+ m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
+ hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
+ }
+ if (cp - cp_copy != map_size) {
+ cram_free_compression_header(hdr);
+ return NULL;
+ }
+
+ return hdr;
+}
+
+/*
+ * Helper: flags every EXTERNAL block of slice s that codec c reads from
+ * in block_used[] and uncompresses it; sets *core_used when the codec
+ * reports id -1 (treated by this file as the CORE block).  An id of -2
+ * is skipped (NOTE(review): presumably "no block" - confirm against
+ * cram_codec_to_id()).
+ *
+ * Returns 0 on success
+ *        -1 if a block failed to uncompress
+ */
+static int cram_ds_mark_blocks(cram_slice *s, cram_codec *c,
+                               int *block_used, int *core_used) {
+    int bnum1, bnum2, j;
+
+    bnum1 = cram_codec_to_id(c, &bnum2);
+
+    for (;;) {
+        switch (bnum1) {
+        case -2:
+            break;
+
+        case -1:
+            *core_used = 1;
+            break;
+
+        default:
+            for (j = 0; j < s->hdr->num_blocks; j++) {
+                if (s->block[j]->content_type == EXTERNAL &&
+                    s->block[j]->content_id == bnum1) {
+                    block_used[j] = 1;
+                    if (cram_uncompress_block(s->block[j]))
+                        return -1;
+                }
+            }
+            break;
+        }
+
+        if (bnum2 == -2 || bnum1 == bnum2)
+            break;
+
+        bnum1 = bnum2; // 2nd pass
+    }
+
+    return 0;
+}
+
+/*
+ * Helper: reports whether codec c reads from a block already flagged in
+ * block_used[], or from CORE (id -1) while core_used is non-zero.
+ *
+ * Returns 1 if so
+ *         0 if not
+ */
+static int cram_ds_in_used_block(cram_slice *s, cram_codec *c,
+                                 const int *block_used, int core_used) {
+    int bnum1, bnum2, j;
+
+    bnum1 = cram_codec_to_id(c, &bnum2);
+
+    for (;;) {
+        switch (bnum1) {
+        case -2:
+            break;
+
+        case -1:
+            if (core_used)
+                return 1;
+            break;
+
+        default:
+            for (j = 0; j < s->hdr->num_blocks; j++) {
+                // block_used[] is only ever set for EXTERNAL blocks.
+                if (block_used[j] &&
+                    s->block[j]->content_type == EXTERNAL &&
+                    s->block[j]->content_id == bnum1)
+                    return 1;
+            }
+            break;
+        }
+
+        if (bnum2 == -2 || bnum1 == bnum2)
+            break;
+
+        bnum1 = bnum2; // 2nd pass
+    }
+
+    return 0;
+}
+
+/*
+ * Works out which data series have to be decoded for slice s and
+ * uncompresses the blocks holding them.
+ *
+ * hdr->data_series is seeded from fd->required_fields and then widened
+ * with the decoder's own prerequisites.  If fd->required_fields is 0 or
+ * INT_MAX everything is selected (CRAM_ALL) and every block in the slice
+ * is uncompressed.
+ *
+ * Note we also need to scan through the record encoding map to
+ * see which data series share the same block, either external or
+ * CORE.  For example if we need the BF data series but MQ and CF
+ * are also encoded in the same block then we need to add those in
+ * as a dependency in order to correctly decode BF.  This is repeated
+ * until the set of data series stops growing.
+ *
+ * Side effect: fd->decode_md is cleared when SAM_AUX is not among the
+ * required fields.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_dependent_data_series(cram_fd *fd,
+                               cram_block_compression_hdr *hdr,
+                               cram_slice *s) {
+    int *block_used;
+    int core_used = 0;
+    int i;
+    // Maps bit number i of hdr->data_series to its hdr->codecs[] index.
+    static int i_to_id[] = {
+        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
+        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
+        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
+        DS_HC, DS_SC, DS_BB, DS_QQ,
+    };
+    const int n_ds = (int)(sizeof(i_to_id)/sizeof(*i_to_id));
+    uint32_t orig_ds;
+
+    /*
+     * Set the data_series bit field based on fd->required_fields
+     * contents.
+     */
+    if (fd->required_fields && fd->required_fields != INT_MAX) {
+        hdr->data_series = 0;
+
+        if (fd->required_fields & SAM_QNAME)
+            hdr->data_series |= CRAM_RN;
+
+        if (fd->required_fields & SAM_FLAG)
+            hdr->data_series |= CRAM_BF;
+
+        if (fd->required_fields & SAM_RNAME)
+            hdr->data_series |= CRAM_RI | CRAM_BF;
+
+        if (fd->required_fields & SAM_POS)
+            hdr->data_series |= CRAM_AP | CRAM_BF;
+
+        if (fd->required_fields & SAM_MAPQ)
+            hdr->data_series |= CRAM_MQ;
+
+        if (fd->required_fields & SAM_CIGAR)
+            hdr->data_series |= CRAM_CIGAR;
+
+        if (fd->required_fields & SAM_RNEXT)
+            hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
+
+        if (fd->required_fields & SAM_PNEXT)
+            hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
+
+        if (fd->required_fields & SAM_TLEN)
+            hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
+                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
+
+        if (fd->required_fields & SAM_SEQ)
+            hdr->data_series |= CRAM_SEQ;
+
+        if (!(fd->required_fields & SAM_AUX))
+            // No easy way to get MD/NM without other tags at present
+            fd->decode_md = 0;
+
+        if (fd->required_fields & SAM_QUAL)
+            hdr->data_series |= CRAM_QUAL;
+
+        if (fd->required_fields & SAM_AUX)
+            hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
+
+        if (fd->required_fields & SAM_RGAUX)
+            hdr->data_series |= CRAM_RG | CRAM_BF;
+
+        // Always uncompress CORE block
+        if (cram_uncompress_block(s->block[0]))
+            return -1;
+    } else {
+        hdr->data_series = CRAM_ALL;
+
+        for (i = 0; i < s->hdr->num_blocks; i++) {
+            if (cram_uncompress_block(s->block[i]))
+                return -1;
+        }
+
+        return 0;
+    }
+
+    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
+    if (!block_used)
+        return -1;
+
+    do {
+        /*
+         * Also set data_series based on code prerequisites. Eg if we need
+         * CRAM_QS then we also need to know CRAM_RL so we know how long it
+         * is, or if we need FC/FP then we also need FN (number of features).
+         *
+         * It's not reciprocal though.  We may be needing to decode FN
+         * but have no need to decode FC, FP and cigar ops.
+         */
+        if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP;
+        if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP;
+
+        // cram_decode_seq() needs seq[] array
+        if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL;
+
+        if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC;
+        if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN;
+        if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL;
+        if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF;
+        if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF;
+        if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI;
+        if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
+            hdr->data_series |= CRAM_CF;
+        if (!hdr->read_names_included && hdr->data_series & CRAM_RN)
+            hdr->data_series |= CRAM_CF | CRAM_NF;
+        if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
+            hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
+
+        // Snapshot; the loop ends once the scans below add nothing new.
+        orig_ds = hdr->data_series;
+
+        // Find which blocks are in use.
+        for (i = 0; i < n_ds; i++) {
+            cram_codec *c = hdr->codecs[i_to_id[i]];
+
+            if (!(hdr->data_series & (1<<i)))
+                continue;
+
+            if (!c)
+                continue;
+
+            if (cram_ds_mark_blocks(s, c, block_used, &core_used) < 0) {
+                free(block_used);
+                return -1;
+            }
+        }
+
+        // Tags too
+        if ((fd->required_fields & SAM_AUX) ||
+            (hdr->data_series & CRAM_aux)) {
+            for (i = 0; i < CRAM_MAP_HASH; i++) {
+                cram_map *m;
+
+                // The for-increment advances m even when an entry is
+                // skipped, so a codec-less entry cannot stall the walk.
+                for (m = hdr->tag_encoding_map[i]; m; m = m->next) {
+                    if (!m->codec)
+                        continue;
+
+                    if (cram_ds_mark_blocks(s, m->codec, block_used,
+                                            &core_used) < 0) {
+                        free(block_used);
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        // We now know which blocks are in use, so repeat and find
+        // which other data series need to be added.
+        for (i = 0; i < n_ds; i++) {
+            cram_codec *c = hdr->codecs[i_to_id[i]];
+
+            if (!c)
+                continue;
+
+            if (cram_ds_in_used_block(s, c, block_used, core_used)) {
+                //printf(" + data series %08x:\n", 1<<i);
+                hdr->data_series |= 1<<i;
+            }
+        }
+
+        // Tags too
+        for (i = 0; i < CRAM_MAP_HASH; i++) {
+            cram_map *m;
+
+            for (m = hdr->tag_encoding_map[i]; m; m = m->next) {
+                if (!m->codec)
+                    continue;
+
+                // A tag codec reading from CORE pulls in CRAM_aux
+                // regardless of core_used, hence the constant 1.
+                if (cram_ds_in_used_block(s, m->codec, block_used, 1)) {
+                    //printf(" + data series %08x:\n", CRAM_aux);
+                    hdr->data_series |= CRAM_aux;
+                }
+            }
+        }
+    } while (orig_ds != hdr->data_series);
+
+    free(block_used);
+    return 0;
+}
+
+/*
+ * Tests whether block 'id' is referenced by exactly one data series
+ * codec in hdr->codecs[].
+ *
+ * A codec whose two block ids both equal 'id' (len/val stored in the
+ * same place) is counted once only.  The 'c' argument is not consulted.
+ *
+ * Returns the codec type of that sole user if so (EXTERNAL,
+ *         BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
+ *         0 if not (E_NULL).
+ */
+static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
+                          int id) {
+    int ds, users = 0;
+    enum cram_encoding found = 0;
+
+    for (ds = 0; ds < DS_END; ds++) {
+        cram_codec *codec = hdr->codecs[ds];
+        int first_id, second_id;
+
+        if (codec == NULL)
+            continue;
+
+        first_id = cram_codec_to_id(codec, &second_id);
+
+        // Matching on either id, or on both, is a single use.
+        if (first_id == id || second_id == id) {
+            users++;
+            found = codec->codec;
+        }
+    }
+
+    if (users != 1)
+        return 0;
+
+    return found;
+}
+
+/*
+ * Attempts to estimate the size of some blocks so we can preallocate them
+ * before decoding.  Although decoding will automatically grow the blocks,
+ * it is typically more efficient to preallocate.
+ *
+ * *qual_size and *name_size are always written: they receive the
+ * uncompressed size of the block holding the QS / RN data series when
+ * that block is used by no other data series, and 0 otherwise
+ * (including when the compression header has no codec for the series).
+ *
+ * q_id may be NULL.  When it is not, *q_id is set to the quality block id
+ * only if that block is unique to QS and QS uses the EXTERNAL codec;
+ * otherwise *q_id is left untouched.
+ */
+void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
+                                int *qual_size, int *name_size,
+                                int *q_id) {
+    int bnum1, bnum2;
+    cram_codec *cd;
+
+    *qual_size = 0;
+    *name_size = 0;
+
+    /* Qual */
+    cd = hdr->codecs[DS_QS];
+    if (cd) { // codecs[] entries may be absent; never dereference NULL
+        bnum1 = cram_codec_to_id(cd, &bnum2);
+        // Fall back to the second block id when the first is not a
+        // real (non-negative) block id.
+        if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
+        if (cram_ds_unique(hdr, cd, bnum1)) {
+            cram_block *b = cram_get_block_by_id(s, bnum1);
+            if (b) *qual_size = b->uncomp_size;
+            if (q_id && cd->codec == E_EXTERNAL)
+                *q_id = bnum1;
+        }
+    }
+
+    /* Name */
+    cd = hdr->codecs[DS_RN];
+    if (cd) {
+        bnum1 = cram_codec_to_id(cd, &bnum2);
+        if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
+        if (cram_ds_unique(hdr, cd, bnum1)) {
+            cram_block *b = cram_get_block_by_id(s, bnum1);
+            if (b) *name_size = b->uncomp_size;
+        }
+    }
+}
+
+
+/* ----------------------------------------------------------------------
+ * CRAM slices
+ */
+
+/*
+ * Decodes a CRAM (un)mapped slice header block.
+ *
+ * The block is uncompressed first if its method is not RAW.  Blocks whose
+ * content type is neither MAPPED_SLICE nor UNMAPPED_SLICE are rejected.
+ * The returned header and its block_content_ids array are heap allocated
+ * here.  For CRAM major version 1 the md5 field is zero filled; for all
+ * other versions 16 bytes of md5 must be present in the block.
+ *
+ * Returns slice header ptr on success
+ *         NULL on failure
+ */
+cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
+    cram_block_slice_hdr *hdr;
+    char *cp;
+    char *cp_end;
+    int i;
+
+    if (b->method != RAW) {
+        /* Spec. says slice header should be RAW, but we can future-proof
+           by trying to decode it if it isn't. */
+        if (cram_uncompress_block(b) < 0)
+            return NULL;
+    }
+    cp = (char *)b->data;
+    cp_end = cp + b->uncomp_size;
+
+    if (b->content_type != MAPPED_SLICE &&
+        b->content_type != UNMAPPED_SLICE)
+        return NULL;
+
+    /* Zero filled, so block_content_ids starts out NULL and the shared
+       error path below may free it unconditionally. */
+    if (!(hdr = calloc(1, sizeof(*hdr))))
+        return NULL;
+
+    hdr->content_type = b->content_type;
+
+    if (b->content_type == MAPPED_SLICE) {
+        cp += safe_itf8_get(cp, cp_end, &hdr->ref_seq_id);
+        cp += safe_itf8_get(cp, cp_end, &hdr->ref_seq_start);
+        cp += safe_itf8_get(cp, cp_end, &hdr->ref_seq_span);
+    }
+    cp += safe_itf8_get(cp, cp_end, &hdr->num_records);
+    hdr->record_counter = 0;
+    if (CRAM_MAJOR_VERS(fd->version) == 2) {
+        int32_t i32 = 0;
+        cp += safe_itf8_get(cp, cp_end, &i32);
+        hdr->record_counter = i32;
+    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+        /* ltf8_get() takes no end pointer.  An LTF8 value occupies at
+           most 9 bytes in the CRAM format, so insist on that much input
+           before calling it.  This cannot reject a header that would
+           otherwise decode: at least one content id and a 16 byte md5
+           still have to follow for this version. */
+        if (cp_end - cp < 9)
+            goto err;
+        cp += ltf8_get(cp, &hdr->record_counter);
+    }
+
+    cp += safe_itf8_get(cp, cp_end, &hdr->num_blocks);
+
+    cp += safe_itf8_get(cp, cp_end, &hdr->num_content_ids);
+    if (hdr->num_content_ids < 1 ||
+        hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) {
+        /* Slice must have at least one data block,
+           and malloc'd size shouldn't wrap. */
+        goto err;
+    }
+    hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
+    if (!hdr->block_content_ids)
+        goto err;
+
+    for (i = 0; i < hdr->num_content_ids; i++) {
+        int l = safe_itf8_get(cp, cp_end,
+                              &hdr->block_content_ids[i]);
+        if (l <= 0)
+            goto err;
+        cp += l;
+    }
+
+    if (b->content_type == MAPPED_SLICE) {
+        cp += safe_itf8_get(cp, cp_end, &hdr->ref_base_id);
+    }
+
+    if (CRAM_MAJOR_VERS(fd->version) != 1) {
+        if (cp_end - cp < 16)
+            goto err;
+        memcpy(hdr->md5, cp, 16);
+    } else {
+        memset(hdr->md5, 0, 16);
+    }
+
+    return hdr;
+
+ err:
+    free(hdr->block_content_ids);
+    free(hdr);
+    return NULL;
+}
+
+
+#if 0
+/*
+ * Returns the 1-based position of the highest bit set in v, i.e. the
+ * number of bits needed to hold v (1 => 1, 255 => 8, 256 => 9).
+ * An input of 0 yields 1.
+ */
+static int nbits(int v) {
+    static const int debruijn_bit_count[32] = {
+        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
+        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
+    };
+    int shift;
+
+    // Smear the top set bit into every position below it.
+    for (shift = 1; shift <= 16; shift *= 2)
+        v |= v >> shift;
+
+    // DeBruijn magic to find top bit
+    return debruijn_bit_count[(uint32_t)(v * 0x07C4ACDDU) >> 27];
+}
+#endif
+
+#if 0
+/*
+ * qsort() style comparator for two ints, ascending order.
+ *
+ * Returns a negative value, zero or a positive value when *vp1 is less
+ * than, equal to or greater than *vp2.  The values are compared rather
+ * than subtracted, as a subtraction can overflow for operands that are
+ * far apart and then report the wrong order.
+ */
+static int sort_freqs(const void *vp1, const void *vp2) {
+    const int i1 = *(const int *)vp1;
+    const int i2 = *(const int *)vp2;
+    return (i1 > i2) - (i1 < i2);
+}
+#endif
+
+/* ----------------------------------------------------------------------
+ * Primary CRAM sequence decoder
+ */
+
+/*
+ * Internal part of cram_decode_slice().
+ * Generates the sequence, quality and cigar components.
+ */
+static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
+ cram_block *blk, cram_record *cr, SAM_hdr *bfd,
+ int cf, char *seq, char *qual,
+ int has_MD, int has_NM) {
+ int prev_pos = 0, f, r = 0, out_sz = 1;
+ int seq_pos = 1;
+ int cig_len = 0, ref_pos = cr->apos;
+ int32_t fn, i32;
+ enum cigar_op cig_op = BAM_CMATCH;
+ uint32_t *cigar = s->cigar;
+ uint32_t ncigar = s->ncigar;
+ uint32_t cigar_alloc = s->cigar_alloc;
+ uint32_t nm = 0;
+ int32_t md_dist = 0;
+ int orig_aux = 0;
+ int decode_md = fd->decode_md && s->ref && !has_MD;
+ int decode_nm = fd->decode_md && s->ref && !has_NM;
+ uint32_t ds = c->comp_hdr->data_series;
+
+ if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
+ memset(qual, 255, cr->len);
+ }
+
+ if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
+ decode_md = decode_nm = 0;
+
+ if (decode_md) {
+ orig_aux = BLOCK_SIZE(s->aux_blk);
+ BLOCK_APPEND(s->aux_blk, "MDZ", 3);
+ }
+
+ if (ds & CRAM_FN) {
+ if (!c->comp_hdr->codecs[DS_FN]) return -1;
+ r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
+ blk, (char *)&fn, &out_sz);
+ if (r) return r;
+ } else {
+ fn = 0;
+ }
+
+ ref_pos--; // count from 0
+ cr->cigar = ncigar;
+
+ if (!(ds & (CRAM_FC | CRAM_FP)))
+ goto skip_cigar;
+
+ for (f = 0; f < fn; f++) {
+ int32_t pos = 0;
+ char op;
+
+ if (ncigar+2 >= cigar_alloc) {
+ cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
+ s->cigar = cigar;
+ if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
+ return -1;
+ }
+
+ if (ds & CRAM_FC) {
+ if (!c->comp_hdr->codecs[DS_FC]) return -1;
+ r |= c->comp_hdr->codecs[DS_FC]->decode(s,
+ c->comp_hdr->codecs[DS_FC],
+ blk,
+ &op, &out_sz);
+ if (r) return r;
+ }
+
+ if (!(ds & CRAM_FP))
+ continue;
+
+ if (!c->comp_hdr->codecs[DS_FP]) return -1;
+ r |= c->comp_hdr->codecs[DS_FP]->decode(s,
+ c->comp_hdr->codecs[DS_FP],
+ blk,
+ (char *)&pos, &out_sz);
+ if (r) return r;
+ pos += prev_pos;
+
+ if (pos <= 0) {
+ fprintf(stderr, "Error: feature position %d before start of read.\n",
+ pos);
+ return -1;
+ }
+
+ if (pos > seq_pos) {
+ if (pos > cr->len+1)
+ return -1;
+
+ if (s->ref && cr->ref_id >= 0) {
+ if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
+ static int whinged = 0;
+ int rlen;
+ if (!whinged)
+ fprintf(stderr, "Ref pos outside of ref "
+ "sequence boundary\n");
+ whinged = 1;
+ rlen = bfd->ref[cr->ref_id].len - ref_pos;
+ if (rlen > 0) {
+ memcpy(&seq[seq_pos-1],
+ &s->ref[ref_pos - s->ref_start +1], rlen);
+ if ((pos - seq_pos) - rlen > 0)
+ memset(&seq[seq_pos-1+rlen], 'N',
+ (pos - seq_pos) - rlen);
+ } else {
+ memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
+ }
+ } else {
+ memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
+ pos - seq_pos);
+ }
+ }
+#ifdef USE_X
+ if (cig_len && cig_op != BAM_CBASE_MATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ cig_op = BAM_CBASE_MATCH;
+#else
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ cig_op = BAM_CMATCH;
+#endif
+ cig_len += pos - seq_pos;
+ ref_pos += pos - seq_pos;
+ if (md_dist >= 0)
+ md_dist += pos - seq_pos;
+ seq_pos = pos;
+ }
+
+ prev_pos = pos;
+
+ if (!(ds & CRAM_FC))
+ goto skip_cigar;
+
+ if (!(ds & CRAM_FC))
+ continue;
+
+ switch(op) {
+ case 'S': { // soft clip: IN
+ int32_t out_sz2 = 1;
+ int have_sc = 0;
+
+ if (cig_len) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ switch (CRAM_MAJOR_VERS(fd->version)) {
+ case 1:
+ if (ds & CRAM_IN) {
+ r |= c->comp_hdr->codecs[DS_IN]
+ ? c->comp_hdr->codecs[DS_IN]
+ ->decode(s, c->comp_hdr->codecs[DS_IN],
+ blk,
+ cr->len ? &seq[pos-1] : NULL,
+ &out_sz2)
+ : (seq[pos-1] = 'N', out_sz2 = 1, 0);
+ have_sc = 1;
+ }
+ break;
+ case 2:
+ default:
+ if (ds & CRAM_SC) {
+ r |= c->comp_hdr->codecs[DS_SC]
+ ? c->comp_hdr->codecs[DS_SC]
+ ->decode(s, c->comp_hdr->codecs[DS_SC],
+ blk,
+ cr->len ? &seq[pos-1] : NULL,
+ &out_sz2)
+ : (seq[pos-1] = 'N', out_sz2 = 1, 0);
+ have_sc = 1;
+ }
+ break;
+
+// default:
+// r |= c->comp_hdr->codecs[DS_BB]
+// ? c->comp_hdr->codecs[DS_BB]
+// ->decode(s, c->comp_hdr->codecs[DS_BB],
+// blk, &seq[pos-1], &out_sz2)
+// : (seq[pos-1] = 'N', out_sz2 = 1, 0);
+ }
+ if (have_sc) {
+ if (r) return r;
+ cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
+ cig_op = BAM_CSOFT_CLIP;
+ seq_pos += out_sz2;
+ }
+ break;
+ }
+
+ case 'X': { // Substitution; BS
+ unsigned char base;
+#ifdef USE_X
+ if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ if (ds & CRAM_BS) {
+ if (!c->comp_hdr->codecs[DS_BS]) return -1;
+ r |= c->comp_hdr->codecs[DS_BS]
+ ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
+ (char *)&base, &out_sz);
+ if (pos-1 < cr->len)
+ seq[pos-1] = 'N'; // FIXME look up BS=base value
+ }
+ cig_op = BAM_CBASE_MISMATCH;
+#else
+ int ref_base;
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ if (ds & CRAM_BS) {
+ if (!c->comp_hdr->codecs[DS_BS]) return -1;
+ r |= c->comp_hdr->codecs[DS_BS]
+ ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
+ (char *)&base, &out_sz);
+ if (r) return -1;
+ if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
+ if (pos-1 < cr->len)
+ seq[pos-1] = c->comp_hdr->
+ substitution_matrix[fd->L1['N']][base];
+ if (decode_md || decode_nm) {
+ if (md_dist >= 0 && decode_md)
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+ md_dist = -1;
+ nm--;
+ }
+ } else {
+ unsigned char ref_call = ref_pos <= s->ref_end
+ ? (uc)s->ref[ref_pos - s->ref_start +1]
+ : 'N';
+ ref_base = fd->L1[ref_call];
+ if (pos-1 < cr->len)
+ seq[pos-1] = c->comp_hdr->
+ substitution_matrix[ref_base][base];
+ if (decode_md) {
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+ BLOCK_APPEND_CHAR(s->aux_blk, ref_call);
+ md_dist = 0;
+ }
+ }
+ }
+ cig_op = BAM_CMATCH;
+#endif
+ nm++;
+ cig_len++;
+ seq_pos++;
+ ref_pos++;
+ break;
+ }
+
+ case 'D': { // Deletion; DL
+ if (cig_len && cig_op != BAM_CDEL) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ if (ds & CRAM_DL) {
+ if (!c->comp_hdr->codecs[DS_DL]) return -1;
+ r |= c->comp_hdr->codecs[DS_DL]
+ ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
+ (char *)&i32, &out_sz);
+ if (r) return r;
+ if (decode_md || decode_nm) {
+ if (md_dist >= 0 && decode_md)
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+ if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
+ if (decode_md) {
+ BLOCK_APPEND_CHAR(s->aux_blk, '^');
+ BLOCK_APPEND(s->aux_blk,
+ &s->ref[ref_pos - s->ref_start +1],
+ i32);
+ md_dist = 0;
+ }
+ nm += i32;
+ } else {
+ uint32_t dlen;
+ if (bfd->ref[cr->ref_id].len >= ref_pos) {
+ if (decode_md) {
+ BLOCK_APPEND_CHAR(s->aux_blk, '^');
+ BLOCK_APPEND(s->aux_blk,
+ &s->ref[ref_pos - s->ref_start+1],
+ bfd->ref[cr->ref_id].len-ref_pos);
+ BLOCK_APPEND_UINT(s->aux_blk, 0);
+ }
+ dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
+ nm += i32 - dlen;
+ } else {
+ dlen = i32;
+ }
+
+ md_dist = -1;
+ }
+ }
+ cig_op = BAM_CDEL;
+ cig_len += i32;
+ ref_pos += i32;
+ //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
+ }
+ break;
+ }
+
+ case 'I': { // Insertion (several bases); IN
+ int32_t out_sz2 = 1;
+
+ if (cig_len && cig_op != BAM_CINS) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+
+ if (ds & CRAM_IN) {
+ if (!c->comp_hdr->codecs[DS_IN]) return -1;
+ r |= c->comp_hdr->codecs[DS_IN]
+ ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
+ cr->len ? &seq[pos-1] : NULL,
+ &out_sz2);
+ if (r) return r;
+ cig_op = BAM_CINS;
+ cig_len += out_sz2;
+ seq_pos += out_sz2;
+ nm += out_sz2;
+ //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
+ }
+ break;
+ }
+
+ case 'i': { // Insertion (single base); BA
+ if (cig_len && cig_op != BAM_CINS) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ if (ds & CRAM_BA) {
+ if (!c->comp_hdr->codecs[DS_BA]) return -1;
+ r |= c->comp_hdr->codecs[DS_BA]
+ ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
+ cr->len ? &seq[pos-1] : NULL,
+ &out_sz);
+ if (r) return r;
+ }
+ cig_op = BAM_CINS;
+ cig_len++;
+ seq_pos++;
+ nm++;
+ break;
+ }
+
+ case 'b': { // Several bases
+ int32_t len = 1;
+
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+
+ if (ds & CRAM_BB) {
+ if (!c->comp_hdr->codecs[DS_BB]) return -1;
+ r |= c->comp_hdr->codecs[DS_BB]
+ ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
+ cr->len ? &seq[pos-1] : NULL,
+ &len);
+ if (r) return r;
+
+ if (decode_md || decode_nm) {
+ int x;
+ if (md_dist >= 0 && decode_md)
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+
+ for (x = 0; x < len; x++) {
+ if (x && decode_md)
+ BLOCK_APPEND_UINT(s->aux_blk, 0);
+ if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
+ md_dist = -1;
+ break;
+ } else {
+ if (decode_md) {
+ char r = s->ref[ref_pos+x-s->ref_start +1];
+ BLOCK_APPEND_CHAR(s->aux_blk, r);
+ }
+ }
+ }
+
+ nm += x;
+ }
+ }
+
+ cig_op = BAM_CMATCH;
+
+ cig_len+=len;
+ seq_pos+=len;
+ ref_pos+=len;
+ //prev_pos+=len;
+ break;
+ }
+
+ case 'q': { // Several quality values
+ int32_t len = 1;
+
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+
+ if (ds & CRAM_QQ) {
+ if (!c->comp_hdr->codecs[DS_QQ]) return -1;
+ r |= c->comp_hdr->codecs[DS_QQ]
+ ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
+ (char *)&qual[pos-1], &len);
+ if (r) return r;
+ }
+
+ cig_op = BAM_CMATCH;
+
+ cig_len+=len;
+ seq_pos+=len;
+ ref_pos+=len;
+ //prev_pos+=len;
+ break;
+ }
+
+ case 'B': { // Read base; BA, QS
+#ifdef USE_X
+ if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+#else
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+#endif
+ if (ds & CRAM_BA) {
+ if (!c->comp_hdr->codecs[DS_BA]) return -1;
+ r |= c->comp_hdr->codecs[DS_BA]
+ ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
+ cr->len ? &seq[pos-1] : NULL,
+ &out_sz);
+
+ if (decode_md || decode_nm) {
+ if (md_dist >= 0 && decode_md)
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+ if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
+ md_dist = -1;
+ } else {
+ if (decode_md)
+ BLOCK_APPEND_CHAR(s->aux_blk,
+ s->ref[ref_pos-s->ref_start +1]);
+ nm++;
+ md_dist = 0;
+ }
+ }
+ }
+ if (ds & CRAM_QS) {
+ if (!c->comp_hdr->codecs[DS_QS]) return -1;
+ r |= c->comp_hdr->codecs[DS_QS]
+ ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
+ (char *)&qual[pos-1], &out_sz);
+ }
+#ifdef USE_X
+ cig_op = BAM_CBASE_MISMATCH;
+#else
+ cig_op = BAM_CMATCH;
+#endif
+ cig_len++;
+ seq_pos++;
+ ref_pos++;
+ //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
+ break;
+ }
+
+ case 'Q': { // Quality score; QS
+ if (ds & CRAM_QS) {
+ if (!c->comp_hdr->codecs[DS_QS]) return -1;
+ r |= c->comp_hdr->codecs[DS_QS]
+ ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
+ (char *)&qual[pos-1], &out_sz);
+ //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
+ }
+ break;
+ }
+
+ case 'H': { // hard clip; HC
+ if (cig_len && cig_op != BAM_CHARD_CLIP) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ if (ds & CRAM_HC) {
+ if (!c->comp_hdr->codecs[DS_HC]) return -1;
+ r |= c->comp_hdr->codecs[DS_HC]
+ ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
+ (char *)&i32, &out_sz);
+ if (r) return r;
+ cig_op = BAM_CHARD_CLIP;
+ cig_len += i32;
+ }
+ break;
+ }
+
+ case 'P': { // padding; PD
+ if (cig_len && cig_op != BAM_CPAD) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ if (ds & CRAM_PD) {
+ if (!c->comp_hdr->codecs[DS_PD]) return -1;
+ r |= c->comp_hdr->codecs[DS_PD]
+ ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
+ (char *)&i32, &out_sz);
+ if (r) return r;
+ cig_op = BAM_CPAD;
+ cig_len += i32;
+ nm += i32;
+ }
+ break;
+ }
+
+ case 'N': { // Ref skip; RS
+ if (cig_len && cig_op != BAM_CREF_SKIP) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ if (ds & CRAM_RS) {
+ if (!c->comp_hdr->codecs[DS_RS]) return -1;
+ r |= c->comp_hdr->codecs[DS_RS]
+ ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
+ (char *)&i32, &out_sz);
+ if (r) return r;
+ cig_op = BAM_CREF_SKIP;
+ cig_len += i32;
+ ref_pos += i32;
+ nm += i32;
+ }
+ break;
+ }
+
+ default:
+ fprintf(stderr, "Error: Unknown feature code '%c'\n", op);
+ return -1;
+ }
+ }
+
+ if (!(ds & CRAM_FC))
+ goto skip_cigar;
+
+ /* An implicit match op for any unaccounted for bases */
+ if ((ds & CRAM_FN) && cr->len >= seq_pos) {
+ if (s->ref) {
+ if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
+ static int whinged = 0;
+ int rlen;
+ if (!whinged)
+ fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
+ whinged = 1;
+ rlen = bfd->ref[cr->ref_id].len - ref_pos;
+ if (rlen > 0) {
+ if (seq_pos-1 + rlen < cr->len)
+ memcpy(&seq[seq_pos-1],
+ &s->ref[ref_pos - s->ref_start +1], rlen);
+ if ((cr->len - seq_pos + 1) - rlen > 0)
+ memset(&seq[seq_pos-1+rlen], 'N',
+ (cr->len - seq_pos + 1) - rlen);
+ } else {
+ if (cr->len - seq_pos + 1 > 0)
+ memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
+ }
+ } else {
+ if (cr->len - seq_pos + 1 > 0)
+ memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
+ cr->len - seq_pos + 1);
+ ref_pos += cr->len - seq_pos + 1;
+ if (md_dist >= 0)
+ md_dist += cr->len - seq_pos + 1;
+ }
+ }
+
+ if (ncigar+1 >= cigar_alloc) {
+ cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
+ s->cigar = cigar;
+ if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
+ return -1;
+ }
+#ifdef USE_X
+ if (cig_len && cig_op != BAM_CBASE_MATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ cig_op = BAM_CBASE_MATCH;
+#else
+ if (cig_len && cig_op != BAM_CMATCH) {
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ cig_len = 0;
+ }
+ cig_op = BAM_CMATCH;
+#endif
+ cig_len += cr->len - seq_pos+1;
+ }
+
+ skip_cigar:
+
+ if ((ds & CRAM_FN) && decode_md) {
+ if (md_dist >= 0)
+ BLOCK_APPEND_UINT(s->aux_blk, md_dist);
+ }
+
+ if (cig_len) {
+ if (ncigar >= cigar_alloc) {
+ cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
+ s->cigar = cigar;
+ if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
+ return -1;
+ }
+
+ cigar[ncigar++] = (cig_len<<4) + cig_op;
+ }
+
+ cr->ncigar = ncigar - cr->cigar;
+ cr->aend = ref_pos;
+
+ //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
+
+ if (ds & CRAM_MQ) {
+ if (!c->comp_hdr->codecs[DS_MQ]) return -1;
+ r |= c->comp_hdr->codecs[DS_MQ]
+ ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
+ (char *)&cr->mqual, &out_sz);
+ } else {
+ cr->mqual = 40;
+ }
+
+ if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
+ int32_t out_sz2 = cr->len;
+
+ if (ds & CRAM_QS) {
+ if (!c->comp_hdr->codecs[DS_QS]) return -1;
+ r |= c->comp_hdr->codecs[DS_QS]
+ ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
+ qual, &out_sz2);
+ }
+ }
+
+ s->cigar = cigar;
+ s->cigar_alloc = cigar_alloc;
+ s->ncigar = ncigar;
+
+ if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
+ cr->len = 0;
+
+ if (decode_md) {
+ BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
+ cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
+ }
+
+ if (decode_nm) {
+ char buf[7];
+ buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
+ buf[3] = (nm>> 0) & 0xff;
+ buf[4] = (nm>> 8) & 0xff;
+ buf[5] = (nm>>16) & 0xff;
+ buf[6] = (nm>>24) & 0xff;
+ BLOCK_APPEND(s->aux_blk, buf, 7);
+ cr->aux_size += 7;
+ }
+
+ return r;
+}
+
+/*
+ * Looks up a tag entry in a cram_map hash table.
+ *
+ * The bucket is chosen by CRAM_MAP() applied to the first two bytes of
+ * 'key'; that bucket's chain is then walked until an entry whose key field
+ * equals 'id' is reached.
+ *
+ * Returns the matching entry, or NULL if the chain holds no such entry.
+ */
+static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
+    cram_map *entry;
+
+    for (entry = map[CRAM_MAP(key[0],key[1])]; entry; entry = entry->next) {
+        if (entry->key == id)
+            break;
+    }
+
+    return entry;
+}
+
+//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
+
+
+/*
+ * Decodes the auxiliary tags of a single record using the layout handled
+ * for CRAM major version 1 (selected by the caller).
+ *
+ * The tag count is read from the DS_TC data series, then for every tag a
+ * 3-byte identifier is read from DS_TN, appended to s->aux_blk and followed
+ * by the tag value as produced by the codec registered for that identifier
+ * in the compression header's tag_encoding_map.
+ *
+ * On return cr->ntags, cr->aux (offset into s->aux_blk) and cr->aux_size
+ * describe the decoded tags.
+ *
+ * Returns 0 on success
+ *         non-zero on failure (missing codec, unknown tag or decode error)
+ */
+static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
+                               cram_block *blk, cram_record *cr) {
+    int i, r = 0, out_sz = 1;
+    unsigned char ntags = 0;   /* stays 0 if the codec fails to fill it */
+
+    if (!c->comp_hdr->codecs[DS_TC]) return -1;
+    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
+                                            (char *)&ntags, &out_sz);
+    cr->ntags = ntags;
+
+    cr->aux_size = 0;
+    cr->aux = BLOCK_SIZE(s->aux_blk);
+
+    /* Do not iterate over a tag count that was never decoded. */
+    if (r) return r;
+
+    for (i = 0; i < cr->ntags; i++) {
+        int32_t id = 0, out_sz = 1;
+        unsigned char tag_data[3];
+        cram_map *m;
+
+        if (!c->comp_hdr->codecs[DS_TN]) return -1;
+        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
+                                                blk, (char *)&id, &out_sz);
+        /* 'id' is only meaningful when the decode succeeded. */
+        if (r) return r;
+
+        if (out_sz == 3) {
+            /* Codec delivered the three raw tag bytes. */
+            tag_data[0] = ((char *)&id)[0];
+            tag_data[1] = ((char *)&id)[1];
+            tag_data[2] = ((char *)&id)[2];
+        } else {
+            /* Codec delivered a packed integer: unpack high byte first. */
+            tag_data[0] = (id>>16) & 0xff;
+            tag_data[1] = (id>>8)  & 0xff;
+            tag_data[2] = id       & 0xff;
+        }
+
+        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
+        if (!m)
+            return -1;
+        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
+
+        if (!m->codec) return -1;
+        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
+        /* Same policy as cram_decode_aux(): stop at the first failure. */
+        if (r) break;
+
+        cr->aux_size += out_sz + 3;
+    }
+
+    return r;
+}
+
+/*
+ * Decodes the auxiliary tags of a single record (non-1.0 layout).
+ *
+ * An index is read from the DS_TL data series and used to select one of
+ * the compression header's TL[] entries; that entry is a string made of
+ * consecutive 3-byte tag identifiers.  Each identifier is appended to
+ * s->aux_blk followed by its value, decoded by the codec registered for it
+ * in tag_encoding_map.
+ *
+ * has_MD / has_NM, when non-NULL, are set to 1 if a tag whose first two
+ * bytes are "MD" / "NM" is present in the list.  They are never cleared
+ * here.
+ *
+ * Returns 0 on success
+ *         non-zero on failure
+ */
+static int cram_decode_aux(cram_container *c, cram_slice *s,
+                           cram_block *blk, cram_record *cr,
+                           int *has_MD, int *has_NM) {
+    int tag_no, status = 0, n_items = 1;
+    int32_t list_idx = 0;
+    unsigned char *tag_list;
+    uint32_t series = c->comp_hdr->data_series;
+
+    /* Neither the tag list nor the tag values are wanted: nothing to do. */
+    if ((series & (CRAM_TL|CRAM_aux)) == 0) {
+        cr->aux_size = 0;
+        cr->aux = 0;
+        return 0;
+    }
+
+    if (c->comp_hdr->codecs[DS_TL] == NULL)
+        return -1;
+    status |= c->comp_hdr->codecs[DS_TL]->decode(s,
+                                                 c->comp_hdr->codecs[DS_TL],
+                                                 blk, (char *)&list_idx,
+                                                 &n_items);
+    if (status != 0 || list_idx < 0 || list_idx >= c->comp_hdr->nTL)
+        return -1;
+
+    tag_list = c->comp_hdr->TL[list_idx];
+    cr->ntags = strlen((char *)tag_list)/3; // optimise to remove strlen
+    cr->aux_size = 0;
+    cr->aux = BLOCK_SIZE(s->aux_blk);
+
+    /* Tag names are known, but the values themselves are not required. */
+    if ((series & CRAM_aux) == 0)
+        return 0;
+
+    for (tag_no = 0; tag_no < cr->ntags; tag_no++, tag_list += 3) {
+        int32_t tag_id, value_sz = 1;
+        unsigned char tag_data[3];
+        cram_map *enc;
+
+        tag_data[0] = tag_list[0];
+        tag_data[1] = tag_list[1];
+        tag_data[2] = tag_list[2];
+
+        if (has_MD && tag_data[0] == 'M' && tag_data[1] == 'D')
+            *has_MD = 1;
+        if (has_NM && tag_data[0] == 'N' && tag_data[1] == 'M')
+            *has_NM = 1;
+
+        tag_id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
+
+        enc = map_find(c->comp_hdr->tag_encoding_map, tag_data, tag_id);
+        if (enc == NULL)
+            return -1;
+        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
+
+        if (enc->codec == NULL)
+            return -1;
+        status |= enc->codec->decode(s, enc->codec, blk,
+                                     (char *)s->aux_blk, &value_sz);
+        if (status)
+            break;
+
+        /* Three identifier bytes plus the decoded value. */
+        cr->aux_size += value_sz + 3;
+    }
+
+    return status;
+}
+
+/*
+ * Computes TLEN for the chain of records that starts at 'rec' and is
+ * linked through mate_line.
+ *
+ * The chain is walked once to find the leftmost apos, the rightmost aend,
+ * how many records start at the leftmost position, and whether all records
+ * share one ref_id.  A record whose mate_line is -1 ends the chain; its
+ * mate_line is rewritten to 'rec' so that the chain becomes a cycle.
+ * A second walk then stores tlen into every record of the cycle.
+ *
+ * It's not perfect as we have one slice per reference, so we cannot detect
+ * when TLEN should be zero due to seqs that map to multiple references.
+ * TLEN also cannot be set correctly when a template spans slices; it may
+ * come out too small.
+ *
+ * Returns 0 on success
+ *        -1 if a mate_line does not point strictly forwards within the slice
+ */
+static int xref_resolve_chain_tlen(cram_slice *s, int rec) {
+    cram_record *recs = s->crecs;
+    int cur = rec;
+    int leftmost = recs[rec].apos;
+    int rightmost = recs[rec].aend;
+    int ref = recs[rec].ref_id;
+    int n_leftmost = 0;  // number of segments starting at the same point.
+    int tlen;
+
+    do {
+        if (leftmost > recs[cur].apos) {
+            leftmost = recs[cur].apos;
+            n_leftmost = 1;
+        } else if (leftmost == recs[cur].apos) {
+            n_leftmost++;
+        }
+
+        if (rightmost < recs[cur].aend)
+            rightmost = recs[cur].aend;
+
+        if (recs[cur].mate_line == -1) {
+            /* Last segment: close the loop back to the first one. */
+            recs[cur].mate_line = rec;
+            break;
+        }
+
+        if (recs[cur].mate_line <= cur ||
+            recs[cur].mate_line >= s->hdr->num_records)
+            return -1;
+
+        cur = recs[cur].mate_line;
+
+        if (recs[cur].ref_id != ref)
+            ref = -1;
+    } while (cur != rec);
+
+    cur = rec;
+    if (ref == -1) {
+        /* Segments lie on different references: no template length. */
+        do {
+            recs[cur].tlen = 0;
+            cur = recs[cur].mate_line;
+        } while (cur != rec);
+        return 0;
+    }
+
+    tlen = rightmost - leftmost + 1;
+
+    /*
+     * The leftmost segment gets +tlen and all others -tlen.  When several
+     * segments share the leftmost start, the READ1 flag is the tie
+     * breaker.
+     */
+    do {
+        if (recs[cur].apos == leftmost &&
+            (n_leftmost == 1 || (recs[cur].flags & BAM_FREAD1)))
+            recs[cur].tlen = tlen;
+        else
+            recs[cur].tlen = -tlen;
+        cur = recs[cur].mate_line;
+    } while (cur != rec);
+
+    return 0;
+}
+
+/*
+ * Resolve mate pair cross-references between recs within this slice.
+ *
+ * If none of SAM_RNEXT, SAM_PNEXT or SAM_TLEN is requested in
+ * required_fields, the mate fields of every record are simply blanked.
+ * Otherwise each record with an in-slice mate_line has its mate position,
+ * mate reference, TLEN and mate-related flag bits filled in from that mate,
+ * and each record without one has its flags derived from mate_flags.
+ *
+ * Returns 0 on success
+ *        -1 on a corrupt mate chain
+ */
+static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
+    int rec;
+
+    if ((required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN)) == 0) {
+        for (rec = 0; rec < s->hdr->num_records; rec++) {
+            cram_record *cr = &s->crecs[rec];
+
+            cr->mate_ref_id = -1;
+            cr->mate_pos = 0;
+            cr->tlen = 0;
+        }
+
+        return 0;
+    }
+
+    for (rec = 0; rec < s->hdr->num_records; rec++) {
+        cram_record *cr = &s->crecs[rec];
+
+        if (cr->mate_line < 0) {
+            /* No in-slice mate: rely on the stored mate flags. */
+            if (cr->mate_flags & CRAM_M_REVERSE)
+                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
+            if (cr->mate_flags & CRAM_M_UNMAP)
+                cr->flags |= BAM_FMUNMAP;
+            if (!(cr->flags & BAM_FPAIRED))
+                cr->mate_ref_id = -1;
+        } else if (cr->mate_line >= s->hdr->num_records) {
+            fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
+                    cr->mate_line, s->hdr->num_records-1);
+        } else {
+            cram_record *mate;
+
+            /* First visit of this template: compute lengths for the chain. */
+            if (cr->tlen == INT_MIN && xref_resolve_chain_tlen(s, rec) != 0)
+                return -1;
+
+            mate = &s->crecs[cr->mate_line];
+
+            cr->mate_pos = mate->apos;
+            cr->mate_ref_id = mate->ref_id;
+
+            // paired
+            cr->flags |= BAM_FPAIRED;
+
+            // set mate unmapped if needed
+            if (mate->flags & BAM_FUNMAP) {
+                cr->flags |= BAM_FMUNMAP;
+                cr->tlen = 0;
+            }
+            if (cr->flags & BAM_FUNMAP)
+                cr->tlen = 0;
+
+            // set mate reversed if needed
+            if (mate->flags & BAM_FREVERSE)
+                cr->flags |= BAM_FMREVERSE;
+
+            /* FIXME: construct read names here too if needed */
+        }
+
+        /* Any record still carrying the "unset" marker gets a tlen of 0. */
+        if (cr->tlen == INT_MIN)
+            cr->tlen = 0;
+    }
+
+    return 0;
+}
+
+/*
+ * Formats a 16-byte MD5 digest as 32 lowercase hexadecimal characters.
+ *
+ * 'out' must have room for 33 bytes (32 digits plus the terminating NUL).
+ * Returns 'out'.
+ */
+static char *md5_print(unsigned char *md5, char *out) {
+    static const char hex_digits[] = "0123456789abcdef";
+    char *dst = out;
+    int byte;
+
+    for (byte = 0; byte < 16; byte++) {
+        *dst++ = hex_digits[md5[byte] >> 4];   /* high nibble */
+        *dst++ = hex_digits[md5[byte] & 15];   /* low nibble */
+    }
+    *dst = 0;
+
+    return out;
+}
+
+/*
+ * Decode an entire slice from container blocks. Fills out s->crecs[] array.
+ *
+ * fd   CRAM file handle; supplies the version, required_fields, the
+ *      reference cache and its locks.
+ * c    Container owning the slice; its compression header provides the
+ *      codec for each data series.
+ * s    Slice to decode.  s->block[0] must be the CORE block.  On success
+ *      the slice's source blocks have been freed and set to NULL.
+ * bfd  SAM header, used for read-group lookup and reference-ID validation.
+ *
+ * Any failure while decoding records goes through a single cleanup path
+ * that releases the per-record reference table used for multi-reference
+ * slices (ref_seq_id == -2).
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
+                      SAM_hdr *bfd) {
+    cram_block *blk = s->block[0];
+    int32_t bf, ref_id;
+    unsigned char cf;
+    int out_sz, r = 0;
+    int rec;
+    char *seq = NULL, *qual = NULL;
+    int unknown_rg = -1;
+    int embed_ref;
+    char **refs = NULL;
+    int nrefs_alloc = 0;    /* number of entries allocated in refs[] */
+    uint32_t ds;
+
+    if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
+        return -1;
+
+    ds = c->comp_hdr->data_series;
+
+    blk->bit = 7; // MSB first
+
+    // Study the blocks and estimate approx sizes to preallocate.
+    // This looks to speed up decoding by around 8-9%.
+    // We can always shrink back down at the end if we overestimated.
+    // However it's likely that this also saves memory as own growth
+    // factor (*=1.5) is never applied.
+    {
+        int qsize, nsize, q_id;
+        cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
+
+        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
+        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
+        if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
+
+        // To do - consider using q_id here to usurp the quality block and
+        // avoid a memcpy during decode.
+        // Specifically when quality is an external block uniquely used by
+        // DS_QS only, then we can set s->qual_blk directly to this
+        // block and save the codec->decode() calls. (Approx 3% cpu saving)
+    }
+
+    /* Look for unknown RG, added as last by Java CRAM? */
+    if (bfd->nrg > 0 &&
+        bfd->rg[bfd->nrg-1].name != NULL &&
+        !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
+        unknown_rg = bfd->nrg-1;
+
+    if (blk->content_type != CORE)
+        return -1;
+
+    free(s->crecs);
+    if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
+        return -1;
+
+    ref_id = s->hdr->ref_seq_id;
+    embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
+
+    if (ref_id >= 0) {
+        if (embed_ref) {
+            cram_block *b;
+            if (s->hdr->ref_base_id < 0) {
+                fprintf(stderr, "No reference specified and "
+                        "no embedded reference is available.\n");
+                return -1;
+            }
+            b = cram_get_block_by_id(s, s->hdr->ref_base_id);
+            if (!b)
+                return -1;
+            if (cram_uncompress_block(b) != 0)
+                return -1;
+            s->ref = (char *)BLOCK_DATA(b);
+            s->ref_start = s->hdr->ref_seq_start;
+            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
+        } else if (!fd->no_ref) {
+            //// Avoid Java cramtools bug by loading entire reference seq
+            //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
+            //s->ref_start = 1;
+
+            if (fd->required_fields & SAM_SEQ)
+                s->ref =
+                    cram_get_ref(fd, s->hdr->ref_seq_id,
+                                 s->hdr->ref_seq_start,
+                                 s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
+            s->ref_start = s->hdr->ref_seq_start;
+            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
+
+            /* Sanity check */
+            if (s->ref_start < 0) {
+                fprintf(stderr, "Slice starts before base 1.\n");
+                s->ref_start = 0;
+            }
+            pthread_mutex_lock(&fd->ref_lock);
+            pthread_mutex_lock(&fd->refs->lock);
+            if ((fd->required_fields & SAM_SEQ) &&
+                s->ref_end > fd->refs->ref_id[ref_id]->length) {
+                s->ref_end = fd->refs->ref_id[ref_id]->length;
+            }
+            pthread_mutex_unlock(&fd->refs->lock);
+            pthread_mutex_unlock(&fd->ref_lock);
+        }
+    }
+
+    if ((fd->required_fields & SAM_SEQ) &&
+        s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
+        fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
+                s->hdr->ref_seq_id, s->hdr->ref_seq_start,
+                s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
+        return -1;
+    }
+
+    /* Verify the reference against the MD5 stored in the slice header. */
+    if (CRAM_MAJOR_VERS(fd->version) != 1
+        && (fd->required_fields & SAM_SEQ)
+        && s->hdr->ref_seq_id >= 0
+        && !fd->ignore_md5
+        && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
+        hts_md5_context *md5;
+        /*
+         * Zero-filled so that the comparison below is well defined (and
+         * fails, since the stored md5 is known to be non-zero here) when
+         * no digest could be computed.
+         */
+        unsigned char digest[16] = {0};
+
+        if (s->ref && s->hdr->ref_seq_id >= 0) {
+            int start, len;
+
+            if (s->hdr->ref_seq_start >= s->ref_start) {
+                start = s->hdr->ref_seq_start - s->ref_start;
+            } else {
+                fprintf(stderr, "Slice starts before base 1.\n");
+                start = 0;
+            }
+
+            if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
+                len = s->hdr->ref_seq_span;
+            } else {
+                fprintf(stderr, "Slice ends beyond reference end.\n");
+                len = s->ref_end - s->ref_start + 1;
+            }
+
+            if (!(md5 = hts_md5_init()))
+                return -1;
+            if (start + len > s->ref_end - s->ref_start + 1)
+                len = s->ref_end - s->ref_start + 1 - start;
+            if (len >= 0)
+                hts_md5_update(md5, s->ref + start, len);
+            hts_md5_final(digest, md5);
+            hts_md5_destroy(md5);
+        } else if (!s->ref && s->hdr->ref_base_id >= 0) {
+            cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
+            if (b) {
+                if (!(md5 = hts_md5_init()))
+                    return -1;
+                hts_md5_update(md5, b->data, b->uncomp_size);
+                hts_md5_final(digest, md5);
+                hts_md5_destroy(md5);
+            }
+        }
+
+        if ((!s->ref && s->hdr->ref_base_id < 0)
+            || memcmp(digest, s->hdr->md5, 16) != 0) {
+            char M[33];
+            fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
+                    "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
+            fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
+            fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
+            return -1;
+        }
+    }
+
+    /*
+     * Multi-reference slice: keep one cached reference pointer per
+     * reference ID, filled lazily as records are decoded.
+     */
+    if (ref_id == -2) {
+        pthread_mutex_lock(&fd->ref_lock);
+        pthread_mutex_lock(&fd->refs->lock);
+        nrefs_alloc = fd->refs->nref;
+        refs = calloc(nrefs_alloc, sizeof(char *));
+        pthread_mutex_unlock(&fd->refs->lock);
+        pthread_mutex_unlock(&fd->ref_lock);
+        if (!refs)
+            return -1;
+    }
+
+    /*
+     * From here on every failure must leave through block_err so that
+     * refs[] and the references it holds are released.
+     */
+    int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
+    for (rec = 0; rec < s->hdr->num_records; rec++) {
+        cram_record *cr = &s->crecs[rec];
+        int has_MD, has_NM;
+
+        cr->s = s;
+
+        out_sz = 1; /* decode 1 item */
+        if (ds & CRAM_BF) {
+            if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
+            r |= c->comp_hdr->codecs[DS_BF]
+                ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
+                         (char *)&bf, &out_sz);
+            if (r || bf < 0 ||
+                bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
+                goto block_err;
+            bf = fd->bam_flag_swap[bf];
+            cr->flags = bf;
+        } else {
+            cr->flags = bf = 0x4; // unmapped
+        }
+
+        if (ds & CRAM_CF) {
+            if (CRAM_MAJOR_VERS(fd->version) == 1) {
+                /* CF is byte in 1.0, int32 in 2.0 */
+                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_CF]
+                    ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
+                             (char *)&cf, &out_sz);
+                if (r) goto block_err;
+                cr->cram_flags = cf;
+            } else {
+                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_CF]
+                    ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
+                             (char *)&cr->cram_flags, &out_sz);
+                if (r) goto block_err;
+                cf = cr->cram_flags;
+            }
+        } else {
+            cf = cr->cram_flags = 0;
+        }
+
+        if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
+            if (ds & CRAM_RI) {
+                if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_RI]
+                    ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
+                             (char *)&cr->ref_id, &out_sz);
+                if (r) goto block_err;
+                if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
+                    && cr->ref_id >= 0
+                    && cr->ref_id != last_ref_id) {
+                    /*
+                     * cr->ref_id comes straight from the file and is about
+                     * to index refs[] and fd->refs->ref_id[]; reject
+                     * out-of-range values before touching either.
+                     */
+                    if (cr->ref_id >= bfd->nref ||
+                        cr->ref_id >= nrefs_alloc) {
+                        fprintf(stderr, "Requested unknown reference ID %d\n",
+                                cr->ref_id);
+                        goto block_err;
+                    }
+
+                    if (!fd->no_ref) {
+                        if (!refs[cr->ref_id])
+                            refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
+                                                            1, 0);
+                        s->ref = refs[cr->ref_id];
+
+                        if (!fd->unsorted && last_ref_id >= 0 && refs[last_ref_id]) {
+                            cram_ref_decr(fd->refs, last_ref_id);
+                            refs[last_ref_id] = NULL;
+                        }
+                    }
+                    s->ref_start = 1;
+                    pthread_mutex_lock(&fd->ref_lock);
+                    pthread_mutex_lock(&fd->refs->lock);
+                    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
+                    pthread_mutex_unlock(&fd->refs->lock);
+                    pthread_mutex_unlock(&fd->ref_lock);
+
+                    last_ref_id = cr->ref_id;
+                }
+            } else {
+                cr->ref_id = -1;
+            }
+        } else {
+            cr->ref_id = ref_id; // Forced constant in CRAM 1.0
+        }
+        if (cr->ref_id >= bfd->nref) {
+            fprintf(stderr, "Requested unknown reference ID %d\n", cr->ref_id);
+            goto block_err;
+        }
+
+        if (ds & CRAM_RL) {
+            if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
+            r |= c->comp_hdr->codecs[DS_RL]
+                ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
+                         (char *)&cr->len, &out_sz);
+            if (r) goto block_err;
+            if (cr->len < 0) {
+                fprintf(stderr, "Read has negative length\n");
+                goto block_err;
+            }
+        }
+
+        if (ds & CRAM_AP) {
+            if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
+            r |= c->comp_hdr->codecs[DS_AP]
+                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
+                         (char *)&cr->apos, &out_sz);
+            if (r) goto block_err;
+            if (c->comp_hdr->AP_delta)
+                cr->apos += s->last_apos;
+            s->last_apos = cr->apos;
+        } else {
+            cr->apos = c->ref_seq_start;
+        }
+
+        if (ds & CRAM_RG) {
+            if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
+            r |= c->comp_hdr->codecs[DS_RG]
+                ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
+                         (char *)&cr->rg, &out_sz);
+            if (r) goto block_err;
+            if (cr->rg == unknown_rg)
+                cr->rg = -1;
+        } else {
+            cr->rg = -1;
+        }
+
+        cr->name_len = 0;
+
+        if (c->comp_hdr->read_names_included) {
+            int32_t out_sz2 = 1;
+
+            // Read directly into name cram_block
+            cr->name = BLOCK_SIZE(s->name_blk);
+            if (ds & CRAM_RN) {
+                if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_RN]
+                    ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
+                             (char *)s->name_blk, &out_sz2);
+                if (r) goto block_err;
+                cr->name_len = out_sz2;
+            }
+        }
+
+        cr->mate_pos = 0;
+        cr->mate_line = -1;
+        cr->mate_ref_id = -1;
+        if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
+            if (ds & CRAM_MF) {
+                if (CRAM_MAJOR_VERS(fd->version) == 1) {
+                    /* MF is byte in 1.0, int32 in 2.0 */
+                    unsigned char mf;
+                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
+                    r |= c->comp_hdr->codecs[DS_MF]
+                        ->decode(s, c->comp_hdr->codecs[DS_MF],
+                                 blk, (char *)&mf, &out_sz);
+                    if (r) goto block_err;
+                    cr->mate_flags = mf;
+                } else {
+                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
+                    r |= c->comp_hdr->codecs[DS_MF]
+                        ->decode(s, c->comp_hdr->codecs[DS_MF],
+                                 blk,
+                                 (char *)&cr->mate_flags,
+                                 &out_sz);
+                    if (r) goto block_err;
+                }
+            } else {
+                cr->mate_flags = 0;
+            }
+
+            if (!c->comp_hdr->read_names_included) {
+                int32_t out_sz2 = 1;
+
+                // Read directly into name cram_block
+                cr->name = BLOCK_SIZE(s->name_blk);
+                if (ds & CRAM_RN) {
+                    if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
+                    r |= c->comp_hdr->codecs[DS_RN]
+                        ->decode(s, c->comp_hdr->codecs[DS_RN],
+                                 blk, (char *)s->name_blk,
+                                 &out_sz2);
+                    if (r) goto block_err;
+                    cr->name_len = out_sz2;
+                }
+            }
+
+            if (ds & CRAM_NS) {
+                if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_NS]
+                    ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
+                             (char *)&cr->mate_ref_id, &out_sz);
+                if (r) goto block_err;
+            }
+
+            // A mate_ref of "*" is legit. It doesn't mean unmapped, just
+            // unknown, so BAM_FMUNMAP is deliberately not inferred from
+            // mate_ref_id == -1 here.
+
+            if (ds & CRAM_NP) {
+                if (!c->comp_hdr->codecs[DS_NP]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_NP]
+                    ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
+                             (char *)&cr->mate_pos, &out_sz);
+                if (r) goto block_err;
+            }
+
+            if (ds & CRAM_TS) {
+                if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_TS]
+                    ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
+                             (char *)&cr->tlen, &out_sz);
+                if (r) goto block_err;
+            } else {
+                cr->tlen = INT_MIN;
+            }
+        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
+            /*
+             * mate_flags is read by cram_decode_slice_xref() whenever
+             * mate_line ends up negative, so give it a defined value on
+             * this path as well.
+             */
+            cr->mate_flags = 0;
+            if (ds & CRAM_NF) {
+                if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_NF]
+                    ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
+                             (char *)&cr->mate_line, &out_sz);
+                if (r) goto block_err;
+                /* Stored as a distance to the next fragment. */
+                cr->mate_line += rec + 1;
+
+                cr->mate_ref_id = -1;
+                cr->tlen = INT_MIN;
+                cr->mate_pos = 0;
+            } else {
+                cr->tlen = INT_MIN;
+            }
+        } else {
+            cr->mate_flags = 0;
+            cr->tlen = INT_MIN;
+        }
+
+        /* Auxiliary tags */
+        has_MD = has_NM = 0;
+        if (CRAM_MAJOR_VERS(fd->version) == 1)
+            r |= cram_decode_aux_1_0(c, s, blk, cr);
+        else
+            r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM);
+        if (r) goto block_err;
+
+        /* Fake up dynamic string growth and appending */
+        if (ds & CRAM_RL) {
+            cr->seq = BLOCK_SIZE(s->seqs_blk);
+            BLOCK_GROW(s->seqs_blk, cr->len);
+            seq = (char *)BLOCK_END(s->seqs_blk);
+            BLOCK_SIZE(s->seqs_blk) += cr->len;
+
+            if (!seq)
+                goto block_err;
+
+            cr->qual = BLOCK_SIZE(s->qual_blk);
+            BLOCK_GROW(s->qual_blk, cr->len);
+            qual = (char *)BLOCK_END(s->qual_blk);
+            BLOCK_SIZE(s->qual_blk) += cr->len;
+
+            if (!s->ref)
+                memset(seq, '=', cr->len);
+        }
+
+        if (!(bf & BAM_FUNMAP)) {
+            if ((ds & CRAM_AP) && cr->apos <= 0) {
+                fprintf(stderr,
+                        "Read has alignment position %d but no unmapped flag\n",
+                        cr->apos);
+                goto block_err;
+            }
+            /* Decode sequence and generate CIGAR */
+            if (ds & (CRAM_SEQ | CRAM_MQ)) {
+                r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual,
+                                     has_MD, has_NM);
+                if (r) goto block_err;
+            } else {
+                cr->cigar = 0;
+                cr->ncigar = 0;
+                cr->aend = cr->apos;
+                cr->mqual = 0;
+            }
+        } else {
+            int out_sz2 = cr->len;
+
+            /* Unmapped: no CIGAR, bases and qualities stored verbatim. */
+            cr->cigar = 0;
+            cr->ncigar = 0;
+            cr->aend = cr->apos;
+            cr->mqual = 0;
+
+            if (ds & CRAM_BA && cr->len) {
+                if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
+                r |= c->comp_hdr->codecs[DS_BA]
+                    ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
+                             (char *)seq, &out_sz2);
+                if (r) goto block_err;
+            }
+
+            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
+                out_sz2 = cr->len;
+                if (ds & CRAM_QS && cr->len >= 0) {
+                    if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
+                    r |= c->comp_hdr->codecs[DS_QS]
+                        ->decode(s, c->comp_hdr->codecs[DS_QS],
+                                 blk, qual, &out_sz2);
+                    if (r) goto block_err;
+                }
+            } else {
+                if (ds & CRAM_RL)
+                    memset(qual, 255, cr->len);
+            }
+        }
+    }
+
+    /* Drop the references taken for this slice. */
+    pthread_mutex_lock(&fd->ref_lock);
+    if (refs) {
+        int i;
+        for (i = 0; i < nrefs_alloc; i++) {
+            if (refs[i])
+                cram_ref_decr(fd->refs, i);
+        }
+        free(refs);
+    } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
+        cram_ref_decr(fd->refs, ref_id);
+    }
+    pthread_mutex_unlock(&fd->ref_lock);
+
+    /* Resolve mate pair cross-references between recs within this slice */
+    r |= cram_decode_slice_xref(s, fd->required_fields);
+
+    // Free the original blocks as we no longer need these.
+    {
+        int i;
+        for (i = 0; i < s->hdr->num_blocks; i++) {
+            cram_block *b = s->block[i];
+            cram_free_block(b);
+            s->block[i] = NULL;
+        }
+    }
+
+    // Also see initial BLOCK_RESIZE_EXACT at top of function.
+    // As we grow blocks we overallocate by up to 50%. So shrink
+    // back to their final sizes here.
+    BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
+    BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
+    BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
+    BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
+
+    return r;
+
+ block_err:
+    /* Record decoding failed: release the multi-reference table, if any. */
+    if (refs) {
+        int i;
+        pthread_mutex_lock(&fd->ref_lock);
+        for (i = 0; i < nrefs_alloc; i++) {
+            if (refs[i])
+                cram_ref_decr(fd->refs, i);
+        }
+        pthread_mutex_unlock(&fd->ref_lock);
+        free(refs);
+    }
+
+    return -1;
+}
+
+/*
+ * Work item for decoding one slice on a pool thread.  Filled in by
+ * cram_decode_slice_mt() and consumed by cram_decode_slice_thread(), which
+ * forwards fd, c, s and h to cram_decode_slice().
+ */
+typedef struct {
+ cram_fd *fd; // file handle passed to cram_decode_slice()
+ cram_container *c; // container passed to cram_decode_slice()
+ cram_slice *s; // slice to be decoded
+ SAM_hdr *h; // SAM header passed to cram_decode_slice()
+ int exit_code; // return value of cram_decode_slice(), set by the thread
+} cram_decode_job;
+
+/*
+ * Thread-pool entry point: decodes the slice described by the
+ * cram_decode_job passed as 'arg' and records the outcome in its
+ * exit_code field.
+ *
+ * Returns the same job pointer it was given.
+ */
+void *cram_decode_slice_thread(void *arg) {
+    cram_decode_job *job = arg;
+    int status = cram_decode_slice(job->fd, job->c, job->s, job->h);
+
+    job->exit_code = status;
+    return job;
+}
+
+/*
+ * Spawn a multi-threaded version of cram_decode_slice().
+ *
+ * Without a thread pool (fd->pool is NULL) the slice is decoded
+ * synchronously and the result of cram_decode_slice() is returned.
+ * Otherwise a job is allocated and dispatched to the pool; dispatch is
+ * requested as non-blocking when the results queue already holds entries.
+ * If dispatch reports -1 the job is parked in fd->job_pending, otherwise
+ * fd->job_pending is cleared.
+ *
+ * Returns 0 once the job has been dispatched or parked
+ *        -1 if the job could not be allocated
+ */
+int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
+                         SAM_hdr *bfd) {
+    cram_decode_job *job;
+    int nonblock = 0;
+
+    if (fd->pool == NULL)
+        return cram_decode_slice(fd, c, s, bfd);
+
+    job = malloc(sizeof(*job));
+    if (job == NULL)
+        return -1;
+
+    job->fd = fd;
+    job->c  = c;
+    job->s  = s;
+    job->h  = bfd;
+
+    if (t_pool_results_queue_sz(fd->rqueue))
+        nonblock = 1;
+
+    if (t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
+                         job, nonblock) == -1) {
+        /* Would block */
+        fd->job_pending = job;
+    } else {
+        fd->job_pending = NULL;
+    }
+
+    // flush too
+    return 0;
+}
+
+
+/* ----------------------------------------------------------------------
+ * CRAM sequence iterators.
+ */
+
+/*
+ * Converts a cram in-memory record into a bam in-memory record. We
+ * pass a pointer to a bam_seq_t pointer, which is handed on to
+ * bam_construct_seq() so the bam buffer can be reused across calls
+ * within a loop rather than allocating new bam objects over and over
+ * again.
+ *
+ * bfd  SAM header; supplies the read-group names.
+ * fd   CRAM file handle; required_fields selects which of name, sequence
+ *      and quality are emitted, and prefix is used for generated names.
+ * s    Slice holding the name/seqs/qual/aux blocks and the CIGAR array.
+ * cr   Record to convert.  cr->len is set to 0 when neither SAM_SEQ nor
+ *      SAM_QUAL is required.
+ * rec  Index of 'cr' within the slice, used when generating a name.
+ * bam  In/out BAM record.
+ *
+ * When the record has no stored name, one is generated as
+ * "<prefix>:<counter>".  The prefix is truncated if necessary so that the
+ * generated name always fits in the local name buffer.
+ *
+ * Returns the used size of the bam record on success
+ *         -1 on failure.
+ */
+static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
+                       cram_record *cr, int rec, bam_seq_t **bam) {
+    int bam_idx, rg_len;
+    char name_a[1024], *name;
+    int name_len;
+    char *aux, *aux_orig;
+    char *seq, *qual;
+
+    /* Assign names if not explicitly set */
+    if (fd->required_fields & SAM_QNAME) {
+        if (cr->name_len) {
+            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
+            name_len = cr->name_len;
+        } else {
+            /*
+             * Space kept free after the prefix: ':' plus up to 20 decimal
+             * digits (the widest 64-bit value) plus one spare byte.
+             * NOTE(review): assumes append_uint64() writes only decimal
+             * digits - confirm against its definition.
+             */
+            const size_t suffix_room = 1 + 20 + 1;
+            size_t prefix_len = strlen(fd->prefix);
+
+            /* Never copy more of the prefix than name_a can hold. */
+            if (prefix_len > sizeof(name_a) - suffix_room)
+                prefix_len = sizeof(name_a) - suffix_room;
+
+            name = name_a;
+            memcpy(name, fd->prefix, prefix_len);
+            name += prefix_len;
+            *name++ = ':';
+            /*
+             * Use the record number of the earlier mate, when there is
+             * one, so that both ends receive the same generated name.
+             */
+            if (cr->mate_line >= 0 && cr->mate_line < rec)
+                name = (char *)append_uint64((unsigned char *)name,
+                                             s->hdr->record_counter +
+                                             cr->mate_line + 1);
+            else
+                name = (char *)append_uint64((unsigned char *)name,
+                                             s->hdr->record_counter +
+                                             rec + 1);
+            name_len = name - name_a;
+            name = name_a;
+        }
+    } else {
+        name = "?";
+        name_len = 1;
+    }
+
+    /* Generate BAM record */
+    if (cr->rg < -1 || cr->rg >= bfd->nrg)
+        return -1;
+    /* "RG" + 'Z' + name + NUL */
+    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
+
+    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
+        if (!BLOCK_DATA(s->seqs_blk))
+            return -1;
+        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
+    } else {
+        seq = "*";
+        cr->len = 0;
+    }
+
+    if (fd->required_fields & SAM_QUAL) {
+        if (!BLOCK_DATA(s->qual_blk))
+            return -1;
+        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
+    } else {
+        qual = NULL;
+    }
+
+    bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
+                                name, name_len,
+                                cr->flags,
+                                cr->ref_id,
+                                cr->apos,
+                                cr->aend,
+                                cr->mqual,
+                                cr->ncigar, &s->cigar[cr->cigar],
+                                cr->mate_ref_id,
+                                cr->mate_pos,
+                                cr->tlen,
+                                cr->len,
+                                seq,
+                                qual);
+    if (bam_idx == -1)
+        return -1;
+
+    aux = aux_orig = (char *)bam_aux(*bam);
+
+    /* Auxiliary strings */
+    if (cr->aux_size != 0) {
+        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
+        aux += cr->aux_size;
+    }
+
+    /* RG:Z: */
+    if (cr->rg != -1) {
+        int len = bfd->rg[cr->rg].name_len;
+        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
+        memcpy(aux, bfd->rg[cr->rg].name, len);
+        aux += len;
+        *aux++ = 0;
+    }
+
+    return bam_idx + (aux - aux_orig);
+}
+
+/*
+ * Moves on to the next slice to hand back to the caller, reading and
+ * decoding containers and slices from fd as needed.
+ *
+ * When fd->ctr is NULL on entry, the first container with a non-zero
+ * length is loaded; if a range is set (fd->range.refid != -2) containers
+ * lying before the range are seeked over, and then the compression header
+ * block of the chosen container is read and decoded.
+ *
+ * After that the previously returned slice (c->slice) is freed, and the
+ * container too once c->curr_slice has reached c->max_slice. The main loop
+ * then reads a new container when required, applies the range filter at
+ * container and at slice level, and passes each slice that survives to
+ * cram_decode_slice_mt().
+ *
+ * Without a thread pool the loop stops after one slice. With fd->pool set
+ * it keeps going until a job is left in fd->job_pending, the results queue
+ * holds more than fd->pool->qsize entries, or no more input is available
+ * (fd->ooc); the slice returned is then the next result taken from
+ * fd->rqueue, not necessarily the slice that was read last.
+ *
+ * Here be dragons! The multi-threading code in this is crufty beyond belief.
+ *
+ * Returns the next slice, with its container stored in *cp, on success
+ * NULL on EOF or failure
+ */
+static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
+ cram_container *c;
+ cram_slice *s = NULL;
+
+ if (!(c = fd->ctr)) {
+ // Load first container.
+ do {
+ if (!(c = fd->ctr = cram_read_container(fd)))
+ return NULL;
+ } while (c->length == 0); // ignore containers whose length is zero
+
+ /*
+ * The first container may be a result of a sub-range query.
+ * In which case it may still not be the optimal starting point
+ * due to skipped containers/slices in the index.
+ */
+ if (fd->range.refid != -2) {
+ while (c->ref_seq_id != -2 &&
+ (c->ref_seq_id < fd->range.refid ||
+ c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) {
+ if (0 != cram_seek(fd, c->length, SEEK_CUR)) // step forward by c->length (presumably the container body)
+ return NULL;
+ cram_free_container(fd->ctr);
+ do {
+ if (!(c = fd->ctr = cram_read_container(fd)))
+ return NULL;
+ } while (c->length == 0);
+ }
+
+ if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid)
+ return NULL;
+ }
+
+ if (!(c->comp_hdr_block = cram_read_block(fd)))
+ return NULL;
+ if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
+ return NULL;
+
+ c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
+ if (!c->comp_hdr)
+ return NULL;
+ if (!c->comp_hdr->AP_delta &&
+ sam_hdr_sort_order(fd->header) != ORDER_COORD) {
+ pthread_mutex_lock(&fd->ref_lock); // fd->unsorted is updated under fd->ref_lock
+ fd->unsorted = 1;
+ pthread_mutex_unlock(&fd->ref_lock);
+ }
+ }
+
+ if ((s = c->slice)) { // release the slice attached to the container by an earlier call
+ c->slice = NULL;
+ cram_free_slice(s);
+ s = NULL;
+ }
+
+ if (c->curr_slice == c->max_slice) { // no slices left in this container
+ cram_free_container(c); // NOTE(review): fd->ctr may still point at the freed container here - confirm it is always reassigned before reuse
+ c = NULL;
+ }
+
+ /* Sorry this is so contorted! */
+ for (;;) {
+ if (fd->job_pending) { // pick up the container/slice stored in fd->job_pending
+ cram_decode_job *j = (cram_decode_job *)fd->job_pending;
+ c = j->c;
+ s = j->s;
+ free(fd->job_pending);
+ fd->job_pending = NULL;
+ } else if (!fd->ooc) {
+ empty_container:
+ if (!c || c->curr_slice == c->max_slice) {
+ // new container
+ do {
+ if (!(c = fd->ctr = cram_read_container(fd))) {
+ if (fd->pool) {
+ fd->ooc = 1; // with a pool: flag the input as exhausted and go on to collect queued results
+ break;
+ }
+
+ return NULL;
+ }
+ } while (c->length == 0);
+ if (fd->ooc)
+ break;
+
+ /* Skip containers not yet spanning our range */
+ if (fd->range.refid != -2 && c->ref_seq_id != -2) {
+ fd->required_fields |= SAM_POS;
+
+ if (c->ref_seq_id != fd->range.refid) {
+ cram_free_container(c); // NOTE(review): c is not reset to NULL; on the non-pool path it is later stored in *cp - confirm
+ fd->ctr = NULL;
+ fd->ooc = 1;
+ fd->eof = 1;
+ break;
+ }
+
+ if (c->ref_seq_start > fd->range.end) {
+ cram_free_container(c); // NOTE(review): same dangling c as in the branch above - confirm
+ fd->ctr = NULL;
+ fd->ooc = 1;
+ fd->eof = 1;
+ break;
+ }
+
+ if (c->ref_seq_start + c->ref_seq_span-1 <
+ fd->range.start) {
+ c->curr_rec = c->max_rec;
+ c->curr_slice = c->max_slice;
+ cram_seek(fd, c->length, SEEK_CUR); // NOTE(review): return value ignored, unlike the seek in the initial-load path
+ cram_free_container(c);
+ c = NULL;
+ continue;
+ }
+ }
+
+ if (!(c->comp_hdr_block = cram_read_block(fd)))
+ return NULL;
+ if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
+ return NULL;
+
+ c->comp_hdr =
+ cram_decode_compression_header(fd, c->comp_hdr_block);
+ if (!c->comp_hdr)
+ return NULL;
+
+ if (!c->comp_hdr->AP_delta &&
+ sam_hdr_sort_order(fd->header) != ORDER_COORD) {
+ pthread_mutex_lock(&fd->ref_lock);
+ fd->unsorted = 1;
+ pthread_mutex_unlock(&fd->ref_lock);
+ }
+ }
+
+ if (c->num_records == 0) { // container without records: drop it and read another
+ cram_free_container(c); c = NULL;
+ goto empty_container;
+ }
+
+
+ if (!(s = c->slice = cram_read_slice(fd)))
+ return NULL;
+ c->curr_slice++;
+ c->curr_rec = 0;
+ c->max_rec = s->hdr->num_records;
+
+ s->last_apos = s->hdr->ref_seq_start;
+
+ /* Skip slices not yet spanning our range */
+ if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) {
+ if (s->hdr->ref_seq_id != fd->range.refid) {
+ fd->eof = 1;
+ cram_free_slice(s);
+ c->slice = NULL;
+ return NULL;
+ }
+
+ if (s->hdr->ref_seq_start > fd->range.end) {
+ fd->eof = 1;
+ cram_free_slice(s);
+ c->slice = NULL;
+ return NULL;
+ }
+
+ if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 <
+ fd->range.start) {
+ cram_free_slice(s); // NOTE(review): s keeps its old value after this free until it is reassigned - confirm no path uses it
+ c->slice = NULL;
+ cram_free_container(c);
+ c = NULL;
+ continue;
+ }
+ }
+ }
+
+ /* Test decoding of 1st seq */
+ if (!c || !s)
+ break;
+
+ if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) {
+ // if (cram_decode_slice(fd, c, s, fd->header) != 0) {
+ fprintf(stderr, "Failure to decode slice\n");
+ cram_free_slice(s);
+ c->slice = NULL;
+ return NULL;
+ }
+
+ if (!fd->pool || fd->job_pending) // no read-ahead without a pool or while a job is pending
+ break;
+
+ // Push it a bit far, to qsize in queue rather than pending arrival,
+ // as cram tends to be a bit bursty in decode timings.
+ if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize)
+ break;
+ }
+
+ if (fd->pool) {
+ t_pool_result *res;
+ cram_decode_job *j;
+
+// fprintf(stderr, "Thread pool len = %d, %d\n",
+// t_pool_results_queue_len(fd->rqueue),
+// t_pool_results_queue_sz(fd->rqueue));
+
+ if (fd->ooc && t_pool_results_queue_empty(fd->rqueue)) // input exhausted and nothing left to collect
+ return NULL;
+
+ res = t_pool_next_result_wait(fd->rqueue);
+
+ if (!res || !res->data) {
+ fprintf(stderr, "t_pool_next_result failure\n");
+ return NULL;
+ }
+
+ j = (cram_decode_job *)res->data;
+ c = j->c; // the returned container/slice come from the job result, replacing the locals above
+ s = j->s;
+
+ fd->ctr = c;
+
+ t_pool_delete_result(res, 1);
+ }
+
+ *cp = c;
+ return s;
+}
+
+/*
+ * Read the next cram record and return it.
+ * Note that to decode cram_record the caller will need to look up some data
+ * in the current slice, pointed to by fd->ctr->slice. This is valid until
+ * the next call to cram_get_seq (which may invalidate it).
+ *
+ * Records are taken from the current slice of fd->ctr while it still has
+ * some left (c->curr_rec < c->max_rec); otherwise cram_next_slice() is
+ * called to obtain another slice and the check is repeated.
+ *
+ * When fd->range.refid is not -2 each candidate record is tested against
+ * fd->range: records lying before the requested reference or start
+ * position are skipped, while the first record found past the range sets
+ * fd->eof, frees the current slice and ends the iteration with NULL.
+ *
+ * On success fd->ctr and fd->ctr->slice are updated to the container and
+ * slice the record belongs to, and c->curr_rec is advanced past it.
+ *
+ * Returns record pointer on success (do not free)
+ * NULL on failure, or when there are no more records to return
+ */
+cram_record *cram_get_seq(cram_fd *fd) {
+ cram_container *c;
+ cram_slice *s;
+
+ for (;;) {
+ c = fd->ctr;
+ if (c && c->slice && c->curr_rec < c->max_rec) {
+ s = c->slice;
+ } else {
+ if (!(s = cram_next_slice(fd, &c)))
+ return NULL;
+ continue; /* In case slice contains no records */
+ }
+
+ if (fd->range.refid != -2) { // range filtering is only applied when refid is not -2
+ if (fd->range.refid == -1 && s->crecs[c->curr_rec].ref_id != -1) {
+ // Special case when looking for unmapped blocks at end.
+ // If these are mixed in with mapped data (c->ref_id == -2)
+ // then we need skip until we find the unmapped data, if at all
+ c->curr_rec++;
+ continue;
+ }
+ if (s->crecs[c->curr_rec].ref_id < fd->range.refid &&
+ s->crecs[c->curr_rec].ref_id != -1) {
+ // Looking for a mapped read, but not there yet. Special case
+ // as -1 (unmapped) shouldn't be considered < refid.
+ c->curr_rec++;
+ continue;
+ }
+
+ if (s->crecs[c->curr_rec].ref_id != fd->range.refid) { // record belongs to a different reference: stop
+ fd->eof = 1;
+ cram_free_slice(s);
+ c->slice = NULL;
+ return NULL;
+ }
+
+ if (fd->range.refid != -1 && s->crecs[c->curr_rec].apos > fd->range.end) { // record starts after the range end: stop
+ fd->eof = 1;
+ cram_free_slice(s);
+ c->slice = NULL;
+ return NULL;
+ }
+
+ if (fd->range.refid != -1 && s->crecs[c->curr_rec].aend < fd->range.start) { // record ends before the range start: skip it
+ c->curr_rec++;
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ fd->ctr = c;
+ c->slice = s;
+ return &s->crecs[c->curr_rec++]; // post-increment: curr_rec now indexes the following record
+}
+
+/*
+ * Read the next cram record and convert it to a bam_seq_t struct.
+ *
+ * The record is fetched with cram_get_seq() and then converted by
+ * cram_to_bam() using the container and slice that cram_get_seq() left in
+ * fd->ctr and fd->ctr->slice. The record index passed on is c->curr_rec-1
+ * because cram_get_seq() has already advanced curr_rec past the record it
+ * returned.
+ *
+ * Returns 0 on success
+ * -1 on EOF or failure (check fd->err)
+ */
+int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
+ cram_record *cr;
+ cram_container *c;
+ cram_slice *s;
+
+ if (!(cr = cram_get_seq(fd)))
+ return -1;
+
+ c = fd->ctr;
+ s = c->slice;
+
+ return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam); // the result of cram_to_bam() is returned unchanged
+}
diff --git a/htslib/cram/cram_decode.h b/htslib/cram/cram_decode.h
new file mode 100644
index 0000000..64b188e
--- /dev/null
+++ b/htslib/cram/cram_decode.h
@@ -0,0 +1,112 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * Include cram.h instead.
+ *
+ * This is an internal part of the CRAM system and is automatically included
+ * when you #include cram.h.
+ *
+ * Implements the decoding portion of CRAM I/O. Also see
+ * cram_codecs.[ch] for the actual encoding functions themselves.
+ */
+
+#ifndef _CRAM_READ_H_
+#define _CRAM_READ_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ----------------------------------------------------------------------
+ * CRAM sequence iterators.
+ */
+
+/*! Read the next cram record and return it as a cram_record.
+ *
+ * Note that to decode cram_record the caller will need to look up some data
+ * in the current slice, pointed to by fd->ctr->slice. This is valid until
+ * the next call to cram_get_seq (which may invalidate it).
+ *
+ * @return
+ * Returns record pointer on success (do not free);
+ * NULL on failure
+ */
+cram_record *cram_get_seq(cram_fd *fd);
+
+/*! Read the next cram record and convert it to a bam_seq_t struct.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on EOF or failure (check fd->err)
+ */
+int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam);
+
+
+/* ----------------------------------------------------------------------
+ * Internal functions
+ */
+
+/*! INTERNAL:
+ * Decodes a CRAM block compression header.
+ *
+ * @return
+ * Returns header ptr on success;
+ * NULL on failure
+ */
+cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
+ cram_block *b);
+
+/*! INTERNAL:
+ * Decodes a CRAM (un)mapped slice header block.
+ *
+ * @return
+ * Returns slice header ptr on success;
+ * NULL on failure
+ */
+cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b);
+
+
+/*! INTERNAL:
+ * Decode an entire slice from container blocks. Fills out s->crecs[] array.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
+ SAM_hdr *hdr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/cram/cram_encode.c b/htslib/cram/cram_encode.c
new file mode 100644
index 0000000..5a9d64b
--- /dev/null
+++ b/htslib/cram/cram_encode.c
@@ -0,0 +1,3105 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "cram/cram.h"
+#include "cram/os.h"
+#include "htslib/hts.h"
+
+#define Z_CRAM_STRAT Z_FILTERED
+//#define Z_CRAM_STRAT Z_RLE
+//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
+//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
+
+static int process_one_read(cram_fd *fd, cram_container *c,
+ cram_slice *s, cram_record *cr,
+ bam_seq_t *b, int rnum);
+
+/*
+ * Returns index of val into key.
+ * Basically strchr(key, val)-key;
+ *
+ * key is scanned from the start up to its NUL terminator and the
+ * zero-based position of the first character equal to val is returned.
+ * Unlike the strchr() expression above, a val that does not occur in key
+ * (including val == 0) yields the length of key rather than a null
+ * pointer difference.
+ */
+static int sub_idx(char *key, char val) {
+ int i;
+
+ for (i = 0; *key && *key++ != val; i++); // empty body: i counts the characters passed over
+ return i;
+}
+
+/*
+ * Encodes a compression header block into a generic cram_block structure.
+ *
+ * The result is assembled in cb. map is a scratch block that is reset,
+ * filled and appended to cb three times: for the preservation map, for
+ * the record encoding map (one entry per non-NULL h->codecs[] slot that is
+ * tested below, written through that codec's store() callback) and for the
+ * tag encoding map (one entry per key present in c->tags_used). Each map
+ * is written to cb as an itf8 byte size (entries plus the entry count), an
+ * itf8 entry count and then the entries themselves.
+ *
+ * h->preservation_map is destroyed if set and rebuilt on every call. For
+ * CRAM major version 1 the container fields (ref_seq_id, ref_seq_start,
+ * ref_seq_span, num_records and the landmarks) are written first, and the
+ * preservation keys and data series emitted differ from later versions as
+ * shown in the code.
+ *
+ * NOTE(review): the failure returns that follow the allocation of cb and
+ * map do not free either block; this looks like a leak on the error paths
+ * - confirm.
+ *
+ * Returns cram_block ptr on success
+ * NULL on failure
+ */
+cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
+ cram_block_compression_hdr *h) {
+ cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0);
+ cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); // scratch block reused for each of the three maps
+ int i, mc;
+
+ if (!cb || !map)
+ return NULL; // NOTE(review): if only one of cb/map was allocated it does not appear to be freed here
+
+ /*
+ * This is a concatenation of several blocks of data:
+ * header + landmarks, preservation map, read encoding map, and the tag
+ * encoding map.
+ * All 4 are variable sized and we need to know how large these are
+ * before creating the compression header itself as this starts with
+ * the total size (stored as a variable length string).
+ */
+
+ // Duplicated from container itself, and removed in 1.1
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ itf8_put_blk(cb, h->ref_seq_id);
+ itf8_put_blk(cb, h->ref_seq_start);
+ itf8_put_blk(cb, h->ref_seq_span);
+ itf8_put_blk(cb, h->num_records);
+ itf8_put_blk(cb, h->num_landmarks);
+ for (i = 0; i < h->num_landmarks; i++) {
+ itf8_put_blk(cb, h->landmark[i]);
+ }
+ }
+
+ if (h->preservation_map)
+ kh_destroy(map, h->preservation_map); // discard any previous map; it is rebuilt just below
+
+ /* Create in-memory preservation map */
+ /* FIXME: should create this when we create the container */
+ {
+ khint_t k;
+ int r;
+
+ if (!(h->preservation_map = kh_init(map)))
+ return NULL;
+
+ k = kh_put(map, h->preservation_map, "RN", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = 1;
+
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ k = kh_put(map, h->preservation_map, "PI", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = 0;
+
+ k = kh_put(map, h->preservation_map, "UI", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = 1;
+
+ k = kh_put(map, h->preservation_map, "MI", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = 1;
+
+ } else {
+ // Technically SM was in 1.0, but wasn't in Java impl.
+ k = kh_put(map, h->preservation_map, "SM", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = 0;
+
+ k = kh_put(map, h->preservation_map, "TD", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = 0;
+
+ k = kh_put(map, h->preservation_map, "AP", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = h->AP_delta;
+
+ if (fd->no_ref || fd->embed_ref) {
+ // Reference Required == No
+ k = kh_put(map, h->preservation_map, "RR", &r);
+ if (-1 == r) return NULL;
+ kh_val(h->preservation_map, k).i = 0;
+ }
+ }
+ }
+
+ /* Encode preservation map; could collapse this and above into one */
+ mc = 0;
+ BLOCK_SIZE(map) = 0;
+ if (h->preservation_map) {
+ khint_t k;
+
+ for (k = kh_begin(h->preservation_map);
+ k != kh_end(h->preservation_map);
+ k++) {
+ const char *key;
+ khash_t(map) *pmap = h->preservation_map;
+
+
+ if (!kh_exist(pmap, k))
+ continue;
+
+ key = kh_key(pmap, k);
+ BLOCK_APPEND(map, key, 2); // every entry starts with its two-character key
+
+ switch(CRAM_KEY(key[0], key[1])) {
+ case CRAM_KEY('M','I'):
+ BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
+ break;
+
+ case CRAM_KEY('U','I'):
+ BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
+ break;
+
+ case CRAM_KEY('P','I'):
+ BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
+ break;
+
+ case CRAM_KEY('A','P'):
+ BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
+ break;
+
+ case CRAM_KEY('R','N'):
+ BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
+ break;
+
+ case CRAM_KEY('R','R'):
+ BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
+ break;
+
+ case CRAM_KEY('S','M'): {
+ char smat[5], *mp = smat; // one byte per substitution_matrix row, four 2-bit fields each
+ *mp++ =
+ (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
+ (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
+ (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
+ (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
+ *mp++ =
+ (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
+ (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
+ (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
+ (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
+ *mp++ =
+ (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
+ (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
+ (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
+ (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
+ *mp++ =
+ (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
+ (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
+ (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
+ (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
+ *mp++ =
+ (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
+ (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
+ (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
+ (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
+ BLOCK_APPEND(map, smat, 5);
+ break;
+ }
+
+ case CRAM_KEY('T','D'): {
+ itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); // length-prefixed copy of h->TD_blk
+ BLOCK_APPEND(map,
+ BLOCK_DATA(h->TD_blk),
+ BLOCK_SIZE(h->TD_blk));
+ break;
+ }
+
+ default:
+ fprintf(stderr, "Unknown preservation key '%.2s'\n", key);
+ break;
+ }
+
+ mc++;
+ }
+ }
+ itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); // byte size of the map, including its entry count
+ itf8_put_blk(cb, mc);
+ BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
+
+ /* rec encoding map */
+ mc = 0;
+ BLOCK_SIZE(map) = 0;
+ if (h->codecs[DS_BF]) {
+ if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_CF]) {
+ if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_RL]) {
+ if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_AP]) {
+ if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_RG]) {
+ if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_MF]) {
+ if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_NS]) {
+ if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_NP]) {
+ if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_TS]) {
+ if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_NF]) {
+ if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_TC]) {
+ if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_TN]) {
+ if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_TL]) {
+ if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_FN]) {
+ if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_FC]) {
+ if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_FP]) {
+ if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_BS]) {
+ if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_IN]) {
+ if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_DL]) {
+ if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_BA]) {
+ if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_BB]) {
+ if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_MQ]) {
+ if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_RN]) {
+ if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_QS]) {
+ if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_QQ]) {
+ if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_RI]) {
+ if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (CRAM_MAJOR_VERS(fd->version) != 1) { // SC, RS, PD and HC are only stored for major versions other than 1
+ if (h->codecs[DS_SC]) {
+ if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_RS]) {
+ if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_PD]) {
+ if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_HC]) {
+ if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ }
+ if (h->codecs[DS_TM]) {
+ if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ if (h->codecs[DS_TV]) {
+ if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
+ fd->version))
+ return NULL;
+ mc++;
+ }
+ itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
+ itf8_put_blk(cb, mc);
+ BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
+
+ /* tag encoding map */
+#if 0
+ mp = map; mc = 0;
+ if (h->tag_encoding_map) {
+ HashItem *hi;
+ HashIter *iter = HashTableIterCreate();
+ if (!iter)
+ return NULL;
+
+ while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) {
+ cram_map *m = hi->data.p;
+ int sz;
+
+ mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]);
+ if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version)))
+ return NULL;
+ mp += sz;
+ mc++;
+ }
+
+ HashTableIterDestroy(iter);
+ }
+#else
+ mc = 0;
+ BLOCK_SIZE(map) = 0;
+ if (c->tags_used) {
+ khint_t k;
+
+#define TAG_ID(a) ((#a[0]<<8)+#a[1]) // two-character tag name packed as (c0<<8)+c1
+
+ for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
+ int key;
+ if (!kh_exist(c->tags_used, k))
+ continue;
+
+ mc++;
+ itf8_put_blk(map, kh_key(c->tags_used, k));
+
+ // use block content id 4
+ switch((key = kh_key(c->tags_used, k)) & 0xff) { // low byte of the key is the aux value type; key>>8 is the tag name
+ case 'Z': case 'H':
+ // string as byte_array_stop
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\005" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_S "\000\000\000",
+ 7);
+ } else {
+ if (key>>8 == TAG_ID(OQ))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_OQ_S,
+ 4);
+ else if (key>>8 == TAG_ID(BQ))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_BQ_S,
+ 4);
+ else if (key>>8 == TAG_ID(BD))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_BD_S,
+ 4);
+ else if (key>>8 == TAG_ID(BI))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_BI_S,
+ 4);
+ else if ((key>>8 == TAG_ID(Q2)) ||
+ (key>>8 == TAG_ID(U2)) ||
+ (key>>8 == TAG_ID(QT)) ||
+ (key>>8 == TAG_ID(CQ)))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_oq_S,
+ 4);
+ else if ((key>>8 == TAG_ID(R2)) ||
+ (key>>8 == TAG_ID(E2)) ||
+ (key>>8 == TAG_ID(CS)) ||
+ (key>>8 == TAG_ID(BC)) ||
+ (key>>8 == TAG_ID(RT)))
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_os_S,
+ 4);
+ else
+ BLOCK_APPEND(map,
+ "\005" // BYTE_ARRAY_STOP
+ "\002" // len
+ "\t" // stop-byte is also SAM separator
+ DS_aux_oz_S,
+ 4);
+ }
+ break;
+
+ case 'A': case 'c': case 'C':
+ // byte array len, 1 byte
+ BLOCK_APPEND(map,
+ "\004" // BYTE_ARRAY_LEN
+ "\011" // length
+ "\003" // HUFFMAN (len)
+ "\004" // huffman-len
+ "\001" // 1 symbol
+ "\001" // symbol=1 byte value
+ "\001" // 1 length
+ "\000" // length=0
+ "\001" // EXTERNAL (val)
+ "\001" // external-len
+ DS_aux_S,// content-id
+ 11);
+ break;
+
+ case 's': case 'S':
+ // byte array len, 2 byte
+ BLOCK_APPEND(map,
+ "\004" // BYTE_ARRAY_LEN
+ "\011" // length
+ "\003" // HUFFMAN (len)
+ "\004" // huffman-len
+ "\001" // 1 symbol
+ "\002" // symbol=2 byte value
+ "\001" // 1 length
+ "\000" // length=0
+ "\001" // EXTERNAL (val)
+ "\001" // external-len
+ DS_aux_S,// content-id
+ 11);
+ break;
+
+ case 'i': case 'I': case 'f':
+ // byte array len, 4 byte
+ BLOCK_APPEND(map,
+ "\004" // BYTE_ARRAY_LEN
+ "\011" // length
+ "\003" // HUFFMAN (len)
+ "\004" // huffman-len
+ "\001" // 1 symbol
+ "\004" // symbol=4 byte value
+ "\001" // 1 length
+ "\000" // length=0
+ "\001" // EXTERNAL (val)
+ "\001" // external-len
+ DS_aux_S,// content-id
+ 11);
+ break;
+
+ case 'B':
+ // Byte array of variable size, but we generate our tag
+ // byte stream at the wrong stage (during reading and not
+ // after slice header construction). So we use
+ // BYTE_ARRAY_LEN with the length codec being external
+ // too.
+ if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM)))
+ BLOCK_APPEND(map,
+ "\004" // BYTE_ARRAY_LEN
+ "\006" // length
+ "\001" // EXTERNAL (len)
+ "\001" // external-len
+ DS_aux_FZ_S // content-id
+ "\001" // EXTERNAL (val)
+ "\001" // external-len
+ DS_aux_FZ_S,// content-id
+ 8);
+ else
+ BLOCK_APPEND(map,
+ "\004" // BYTE_ARRAY_LEN
+ "\006" // length
+ "\001" // EXTERNAL (len)
+ "\001" // external-len
+ DS_aux_S // content-id
+ "\001" // EXTERNAL (val)
+ "\001" // external-len
+ DS_aux_S,// content-id
+ 8);
+ break;
+
+ default:
+ fprintf(stderr, "Unsupported SAM aux type '%c'\n",
+ kh_key(c->tags_used, k) & 0xff); // NOTE(review): the key was already written and mc counted, but no codec bytes follow - confirm this is intended
+ }
+ //mp += m->codec->store(m->codec, mp, NULL, fd->version);
+ }
+ }
+#endif
+ itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
+ itf8_put_blk(cb, mc);
+ BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
+
+ if (fd->verbose)
+ fprintf(stderr, "Wrote compression block header in %d bytes\n",
+ (int)BLOCK_SIZE(cb));
+
+ BLOCK_UPLEN(cb);
+
+ cram_free_block(map); // the scratch block is only released on the success path
+
+ return cb;
+}
+
+
+/*
+ * Encodes a slice compression header.
+ *
+ * Serialises s->hdr into a freshly malloc()ed buffer in this order:
+ * ref_seq_id, ref_seq_start, ref_seq_span, num_records, record_counter
+ * (itf8 for CRAM major version 2, ltf8 for version 3 and above, omitted
+ * otherwise), num_blocks, num_content_ids, each block_content_ids[] entry,
+ * ref_base_id (only when s->hdr->content_type is MAPPED_SLICE) and, for
+ * major versions other than 1, the 16 byte md5.
+ *
+ * The buffer becomes b->data and the number of bytes written is stored in
+ * both b->comp_size and b->uncomp_size. The block itself is always created
+ * with content type MAPPED_SLICE, whatever s->hdr->content_type holds.
+ *
+ * The buffer is sized 16+5*(8+num_blocks) bytes; the assert() only checks
+ * after all the writes that this size was not exceeded.
+ *
+ * Returns cram_block on success
+ * NULL on failure
+ */
+cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
+ char *buf;
+ char *cp;
+ cram_block *b = cram_new_block(MAPPED_SLICE, 0);
+ int j;
+
+ if (!b)
+ return NULL;
+
+ if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) {
+ cram_free_block(b);
+ return NULL;
+ }
+
+ cp += itf8_put(cp, s->hdr->ref_seq_id); // cp is the write cursor; each put returns the number of bytes it wrote
+ cp += itf8_put(cp, s->hdr->ref_seq_start);
+ cp += itf8_put(cp, s->hdr->ref_seq_span);
+ cp += itf8_put(cp, s->hdr->num_records);
+ if (CRAM_MAJOR_VERS(fd->version) == 2)
+ cp += itf8_put(cp, s->hdr->record_counter);
+ else if (CRAM_MAJOR_VERS(fd->version) >= 3)
+ cp += ltf8_put(cp, s->hdr->record_counter);
+ cp += itf8_put(cp, s->hdr->num_blocks);
+ cp += itf8_put(cp, s->hdr->num_content_ids);
+ for (j = 0; j < s->hdr->num_content_ids; j++) {
+ cp += itf8_put(cp, s->hdr->block_content_ids[j]);
+ }
+ if (s->hdr->content_type == MAPPED_SLICE)
+ cp += itf8_put(cp, s->hdr->ref_base_id);
+
+ if (CRAM_MAJOR_VERS(fd->version) != 1) {
+ memcpy(cp, s->hdr->md5, 16); cp += 16;
+ }
+
+ assert(cp-buf <= 16+5*(8+s->hdr->num_blocks));
+
+ b->data = (unsigned char *)buf; // the block takes over the malloc()ed buffer
+ b->comp_size = b->uncomp_size = cp-buf;
+
+ return b;
+}
+
+
+/*
+ * Encodes a single read.
+ *
+ * Pushes the fields of cr through the per-data-series codecs in
+ * h->codecs[], in this order: BF, CF, RI (only when the CRAM major version
+ * is not 1 and the slice ref_seq_id is -2), RL, AP, RG, the mate fields
+ * (MF, NS, NP and TS for reads flagged CRAM_FLAG_DETACHED; NF for reads
+ * flagged CRAM_FLAG_MATE_DOWNSTREAM), the tag data (TC and TN for major
+ * version 1, TL otherwise) and finally either the features followed by MQ
+ * (read not flagged BAM_FUNMAP) or the bases themselves via BA (read
+ * flagged BAM_FUNMAP, when cr->len is non-zero).
+ *
+ * When c->pos_sorted is set, AP is encoded as a delta against *last_pos,
+ * which is then updated to cr->apos; otherwise cr->apos is encoded as is
+ * and *last_pos is left untouched.
+ *
+ * The return values of the encode calls are ORed together and only
+ * examined at the end; an unknown feature code returns -1 immediately.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+static int cram_encode_slice_read(cram_fd *fd,
+ cram_container *c,
+ cram_block_compression_hdr *h,
+ cram_slice *s,
+ cram_record *cr,
+ int *last_pos) {
+ int r = 0;
+ int32_t i32;
+ unsigned char uc;
+
+ //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
+
+ //printf("BF=0x%x\n", cr->flags);
+ // bf = cram_flag_swap[cr->flags];
+ i32 = fd->cram_flag_swap[cr->flags & 0xfff]; // map the low 12 flag bits through the fd->cram_flag_swap table
+ r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
+
+ i32 = cr->cram_flags & CRAM_FLAG_MASK;
+ r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
+
+ if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
+ r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
+
+ r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
+
+ if (c->pos_sorted) {
+ i32 = cr->apos - *last_pos; // delta against the position kept in *last_pos
+ r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
+ *last_pos = cr->apos;
+ } else {
+ i32 = cr->apos;
+ r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
+ }
+
+ r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
+
+ if (c->comp_hdr->read_names_included) {
+ // RN codec: Already stored in block[3].
+ }
+
+ if (cr->cram_flags & CRAM_FLAG_DETACHED) {
+ i32 = cr->mate_flags;
+ r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
+
+ if (!c->comp_hdr->read_names_included) {
+ // RN codec: Already stored in block[3].
+ }
+
+ r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
+ (char *)&cr->mate_ref_id, 1);
+
+ r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
+ (char *)&cr->mate_pos, 1);
+
+ r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
+ (char *)&cr->tlen, 1);
+ } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
+ r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
+ (char *)&cr->mate_line, 1);
+ }
+
+ /* Aux tags */
+ if (CRAM_MAJOR_VERS(fd->version) == 1) {
+ int j;
+ uc = cr->ntags;
+ r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
+
+ for (j = 0; j < cr->ntags; j++) {
+ uint32_t i32 = s->TN[cr->TN_idx + j]; // id
+ r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
+ }
+ } else {
+ r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
+ }
+
+ // qual
+ // QS codec : Already stored in block[2].
+
+ // features (diffs)
+ if (!(cr->flags & BAM_FUNMAP)) {
+ int prev_pos = 0, j;
+
+ r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
+ (char *)&cr->nfeature, 1);
+ for (j = 0; j < cr->nfeature; j++) {
+ cram_feature *f = &s->features[cr->feature + j];
+
+ uc = f->X.code;
+ r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
+ i32 = f->X.pos - prev_pos; // feature positions are encoded as deltas from the previous feature
+ r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
+ prev_pos = f->X.pos;
+
+ switch(f->X.code) {
+ //char *seq;
+
+ case 'X':
+ //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
+
+ uc = f->X.base;
+ r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
+ (char *)&uc, 1);
+ break;
+ case 'S':
+ // Already done
+// r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
+// BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
+// f->S.len);
+
+// if (IS_CRAM_3_VERS(fd)) {
+// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
+// BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
+// f->S.len);
+// }
+ break;
+ case 'I':
+ //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
+ //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
+ // seq, f->S.len);
+// if (IS_CRAM_3_VERS(fd)) {
+// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
+// BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
+// f->I.len);
+// }
+ break;
+ case 'i':
+ uc = f->i.base;
+ r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
+ (char *)&uc, 1);
+ //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
+ //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
+ // seq, 1);
+ break;
+ case 'D':
+ i32 = f->D.len;
+ r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
+ (char *)&i32, 1);
+ break;
+
+ case 'B':
+ // // Used when we try to store a non ACGTN base or an N
+ // // that aligns against a non ACGTN reference
+
+ uc = f->B.base;
+ r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
+ (char *)&uc, 1);
+
+ // Already added
+ // uc = f->B.qual;
+ // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
+ // (char *)&uc, 1);
+ break;
+
+ case 'b':
+ // string of bases
+ r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
+ (char *)BLOCK_DATA(s->seqs_blk)
+ + f->b.seq_idx,
+ f->b.len);
+ break;
+
+ case 'Q':
+ // Already added
+ // uc = f->B.qual;
+ // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
+ // (char *)&uc, 1);
+ break;
+
+ case 'N':
+ i32 = f->N.len;
+ r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
+ (char *)&i32, 1);
+ break;
+
+ case 'P':
+ i32 = f->P.len;
+ r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
+ (char *)&i32, 1);
+ break;
+
+ case 'H':
+ i32 = f->H.len;
+ r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
+ (char *)&i32, 1);
+ break;
+
+
+ default:
+ fprintf(stderr, "unhandled feature code %c\n",
+ f->X.code);
+ return -1;
+ }
+ }
+
+ r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
+ (char *)&cr->mqual, 1);
+ } else {
+ char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
+ if (cr->len)
+ r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
+ }
+
+ return r ? -1 : 0; // any non-zero encode result collapses to -1
+}
+
+
+/*
+ * Compresses the blocks of an already-encoded slice. The set of
+ * candidate methods comes from the fd->use_bz2, fd->use_rans and
+ * fd->use_lzma switches; how hard each data series is squeezed depends
+ * on fd->level.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int cram_compress_slice(cram_fd *fd, cram_slice *s) {
+    const int level = fd->level;
+    int all_methods = 1<<GZIP | 1<<GZIP_RLE;
+    int fast_methods;
+    int ds;
+
+    /*
+     * CORE block: a quick level-1 pass, only for higher levels and
+     * non-trivial sizes. The result of this call is not checked.
+     * NOTE(review): GZIP is passed bare here rather than as 1<<GZIP like
+     * the method masks built below - confirm this is what
+     * cram_compress_block() expects when no metrics are supplied.
+     */
+    if (level > 5 && s->block[0]->uncomp_size > 500)
+        cram_compress_block(fd, s->block[0], NULL, GZIP, 1);
+
+    /* Widen the candidate set with whatever the caller enabled */
+    if (fd->use_bz2)
+        all_methods |= 1<<BZIP2;
+    if (fd->use_rans)
+        all_methods |= (1<<RANS0) | (1<<RANS1);
+    if (fd->use_lzma)
+        all_methods |= (1<<LZMA);
+
+    /*
+     * Reduced set for series that only need entropy encoding; from
+     * level 6 upwards the full set is tried everywhere.
+     */
+    fast_methods = (level >= 6)
+        ? all_methods
+        : all_methods & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
+
+    /* IN (seq) is always compressed with the full set */
+    if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN],
+                            all_methods, level) != 0)
+        return -1;
+
+    /*
+     * QS / BA / BB / aux blocks. Level 0 leaves them for the final
+     * catch-all loop; otherwise the level selects the knobs below.
+     */
+    if (fd->level != 0) {
+        int qs_methods, seq_level, aux_level, with_bases;
+
+        if (fd->level == 1) {
+            qs_methods = fast_methods;
+            seq_level  = 1;
+            aux_level  = 1;
+            with_bases = 0;
+        } else if (fd->level < 3) {
+            qs_methods = all_methods;
+            seq_level  = 1;
+            aux_level  = level;
+            with_bases = 1;
+        } else {
+            qs_methods = all_methods;
+            seq_level  = level;
+            aux_level  = level;
+            with_bases = 1;
+        }
+
+        if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
+                                qs_methods, seq_level) != 0)
+            return -1;
+
+        if (with_bases) {
+            if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
+                                    all_methods, seq_level) != 0)
+                return -1;
+            if (s->block[DS_BB] &&
+                cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
+                                    all_methods, seq_level) != 0)
+                return -1;
+        }
+
+        for (ds = DS_aux; ds <= DS_aux_oz; ds++) {
+            if (s->block[ds] &&
+                cram_compress_block(fd, s->block[ds], fd->m[ds],
+                                    all_methods, aux_level) != 0)
+                return -1;
+        }
+    }
+
+    // NAME: best is generally xz, bzip2, zlib then rans1
+    // It benefits well from a little bit extra compression level.
+    if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN],
+                            all_methods & ~(1<<RANS0 | 1<<GZIP_RLE),
+                            MIN(9,level)) != 0)
+        return -1;
+
+    // NS shows strong local correlation as rearrangements are localised
+    if (s->block[DS_NS] != s->block[0] &&
+        cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS],
+                            all_methods, level) != 0)
+        return -1;
+
+    /*
+     * Minimal compression of any block still uncompressed, bar CORE.
+     * Only the fast methods are tried and failures here are ignored.
+     */
+    for (ds = 1; ds < DS_END; ds++) {
+        if (!s->block[ds] || s->block[ds] == s->block[0])
+            continue;
+
+        if (s->block[ds]->method == RAW) {
+            cram_compress_block(fd, s->block[ds], fd->m[ds],
+                                fast_methods, level);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Encodes a single slice from a container.
+ *
+ * Allocates the slice's block array, creates the CORE block plus one
+ * external block per data series whose codec writes externally, encodes
+ * every record via cram_encode_slice_read(), hands the pre-filled
+ * base/qual/name/soft-clip/aux blocks over to s->block[], compresses the
+ * lot and finally packs the non-empty blocks to the front of s->block[]
+ * before building the slice header block.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int cram_encode_slice(cram_fd *fd, cram_container *c,
+                             cram_block_compression_hdr *h, cram_slice *s) {
+    int rec, last_pos;
+    int embed_ref;
+    enum cram_DS_ID id;
+
+    embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
+
+    /*
+     * Slice external blocks:
+     * ID 0 => base calls (insertions, soft-clip)
+     * ID 1 => qualities
+     * ID 2 => names
+     * ID 3 => TS (insert size), NP (next frag)
+     * ID 4 => tag values
+     * ID 6 => tag IDs (TN), if CRAM_V1.0
+     * ID 7 => TD tag dictionary, if !CRAM_V1.0
+     */
+
+    /* Create cram slice header */
+    s->hdr->ref_base_id = embed_ref ? DS_ref : -1;
+    s->hdr->record_counter = c->num_records + c->record_counter;
+    c->num_records += s->hdr->num_records;
+
+    s->block = calloc(DS_END, sizeof(s->block[0]));
+    s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
+    if (!s->block || !s->hdr->block_content_ids)
+        return -1;
+
+    // Create first fixed blocks, always external.
+    // CORE
+    if (!(s->block[0] = cram_new_block(CORE, 0)))
+        return -1;
+
+    // TN block for CRAM v1
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
+            if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN)))
+                return -1;
+            h->codecs[DS_TN]->external.content_id = DS_TN;
+        } else {
+            // Not external: TN shares the CORE block
+            s->block[DS_TN] = s->block[0];
+        }
+    }
+
+    // Embedded reference
+    if (embed_ref) {
+        if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
+            return -1;
+        s->ref_id = DS_ref; // needed?
+        BLOCK_APPEND(s->block[DS_ref],
+                     c->ref + c->first_base - c->ref_start,
+                     c->last_base - c->first_base + 1);
+    }
+
+    /*
+     * All the data-series blocks if appropriate.
+     * Codecs that write externally get their own block; everything else
+     * is pointed at the CORE block.
+     */
+    for (id = DS_BF; id < DS_TN; id++) {
+        cram_codec *codec = h->codecs[id];
+
+        if (codec && codec->codec == E_EXTERNAL) {
+            if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
+                return -1;
+            codec->external.content_id = id;
+
+        } else if (codec && codec->codec == E_BYTE_ARRAY_STOP) {
+            if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
+                return -1;
+            codec->byte_array_stop.content_id = id;
+
+        } else if (codec && codec->codec == E_BYTE_ARRAY_LEN) {
+            cram_codec *cc;
+
+            // Length sub-codec: always gets a fresh block
+            cc = codec->e_byte_array_len.len_codec;
+            if (cc->codec == E_EXTERNAL) {
+                int eid = cc->external.content_id;
+                if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
+                    return -1;
+                cc->external.content_id = eid;
+                cc->out = s->block[eid];
+            }
+
+            // Value sub-codec: reuses an existing block with this id
+            cc = codec->e_byte_array_len.val_codec;
+            if (cc->codec == E_EXTERNAL) {
+                int eid = cc->external.content_id;
+                if (!s->block[eid])
+                    if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
+                        return -1;
+                cc->external.content_id = eid;
+                cc->out = s->block[eid];
+            }
+
+        } else {
+            // BB without a codec keeps a NULL block; all others use CORE
+            if (!(id == DS_BB && !h->codecs[DS_BB]))
+                s->block[id] = s->block[0];
+        }
+
+        if (codec)
+            codec->out = s->block[id];
+    }
+
+    /* Encode reads */
+    last_pos = s->hdr->ref_seq_start;
+    for (rec = 0; rec < s->hdr->num_records; rec++) {
+        cram_record *cr = &s->crecs[rec];
+        if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
+            return -1;
+    }
+
+    // CORE size in bytes: one extra byte when the bit cursor is below 7
+    s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
+    s->block[0]->comp_size = s->block[0]->uncomp_size;
+
+    // Make sure the fixed blocks point to the correct sources.
+    // Each source pointer is cleared once it has been handed over.
+    s->block[DS_IN]     = s->base_blk;   s->base_blk   = NULL;
+    s->block[DS_QS]     = s->qual_blk;   s->qual_blk   = NULL;
+    s->block[DS_RN]     = s->name_blk;   s->name_blk   = NULL;
+    s->block[DS_SC]     = s->soft_blk;   s->soft_blk   = NULL;
+    s->block[DS_aux]    = s->aux_blk;    s->aux_blk    = NULL;
+    s->block[DS_aux_OQ] = s->aux_OQ_blk; s->aux_OQ_blk = NULL;
+    s->block[DS_aux_BQ] = s->aux_BQ_blk; s->aux_BQ_blk = NULL;
+    s->block[DS_aux_BD] = s->aux_BD_blk; s->aux_BD_blk = NULL;
+    s->block[DS_aux_BI] = s->aux_BI_blk; s->aux_BI_blk = NULL;
+    s->block[DS_aux_FZ] = s->aux_FZ_blk; s->aux_FZ_blk = NULL;
+    s->block[DS_aux_oq] = s->aux_oq_blk; s->aux_oq_blk = NULL;
+    s->block[DS_aux_os] = s->aux_os_blk; s->aux_os_blk = NULL;
+    s->block[DS_aux_oz] = s->aux_oz_blk; s->aux_oz_blk = NULL;
+
+    // Ensure block sizes are up to date.
+    for (id = 1; id < DS_END; id++) {
+        if (!s->block[id] || s->block[id] == s->block[0])
+            continue;
+
+        if (s->block[id]->uncomp_size == 0)
+            BLOCK_UPLEN(s->block[id]);
+    }
+
+    // Compress it all
+    if (cram_compress_slice(fd, s) == -1)
+        return -1;
+
+    // Collapse empty blocks and create hdr_block
+    {
+        int i, j;
+        for (i = j = 1; i < DS_END; i++) {
+            if (!s->block[i] || s->block[i] == s->block[0])
+                continue;
+            if (s->block[i]->uncomp_size == 0) {
+                cram_free_block(s->block[i]);
+                s->block[i] = NULL;
+                continue;
+            }
+            // Pack surviving blocks down to slots 1..j-1 (slot 0 is CORE)
+            s->block[j] = s->block[i];
+            s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
+            j++;
+        }
+        s->hdr->num_content_ids = j-1;
+        s->hdr->num_blocks = j;
+
+        if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Creates the codec for data series `id` from the statistics gathered
+ * in c->stats[id]. The encoding is the one chosen by
+ * cram_stats_encoding(); `type` is forwarded as cram_encoder_init()'s
+ * third argument (the call sites pass E_INT or E_BYTE).
+ *
+ * Returns whatever cram_encoder_init() returns.
+ */
+static cram_codec *encoder_from_stats(cram_fd *fd, cram_container *c,
+                                      int id, int type) {
+    return cram_encoder_init(cram_stats_encoding(fd, c->stats[id]),
+                             c->stats[id], type, NULL, fd->version);
+}
+
+/*
+ * Encodes all slices in a container into blocks.
+ *
+ * Converts the container's queued BAM records into cram_records, picks
+ * a codec for every data series, encodes each slice, builds the
+ * compression header block and computes the slice landmarks and total
+ * container length.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_encode_container(cram_fd *fd, cram_container *c) {
+    int i, j, slice_offset;
+    cram_block_compression_hdr *h = c->comp_hdr;
+    cram_block *c_hdr;
+    int multi_ref = 0;
+    int r1, r2, sn, nref;
+    spare_bams *spares;
+
+    /* Cache references up-front if we have unsorted access patterns */
+    pthread_mutex_lock(&fd->ref_lock);
+    nref = fd->refs->nref;
+    pthread_mutex_unlock(&fd->ref_lock);
+
+    if (!fd->no_ref && c->refs_used) {
+        for (i = 0; i < nref; i++) {
+            if (c->refs_used[i])
+                cram_get_ref(fd, i, 1, 0);
+        }
+    }
+
+    /* To create M5 strings */
+    /* Fetch reference sequence */
+    if (!fd->no_ref) {
+        bam_seq_t *b = c->bams[0];
+        char *ref;
+
+        ref = cram_get_ref(fd, bam_ref(b), 1, 0);
+        if (!ref && bam_ref(b) >= 0) {
+            fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b));
+            return -1;
+        }
+        if ((c->ref_id = bam_ref(b)) >= 0) {
+            c->ref_seq_id = c->ref_id;
+            c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
+            c->ref_start = 1;
+            c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
+        } else {
+            c->ref_seq_id = c->ref_id; // FIXME remove one var!
+        }
+    } else {
+        c->ref_id = bam_ref(c->bams[0]);
+        cram_ref_incr(fd->refs, c->ref_id);
+        c->ref_seq_id = c->ref_id;
+    }
+
+    /* Turn bams into cram_records and gather basic stats */
+    for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
+        cram_slice *s = c->slices[sn];
+        int first_base = INT_MAX, last_base = INT_MIN;
+
+        assert(sn < c->curr_slice);
+
+        /* FIXME: we could create our slice objects here too instead of
+         * in cram_put_bam_seq. It's more natural here and also this
+         * bit is threaded so it's less work in the main thread.
+         */
+
+        /* r1 walks the whole container, r2 the records of this slice */
+        for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) {
+            cram_record *cr = &s->crecs[r2];
+            bam_seq_t *b = c->bams[r1];
+
+            /* If multi-ref we need to cope with changing reference per seq */
+            if (c->multi_seq && !fd->no_ref) {
+                if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
+                    if (c->ref_seq_id >= 0)
+                        cram_ref_decr(fd->refs, c->ref_seq_id);
+
+                    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
+                        fprintf(stderr, "Failed to load reference #%d\n",
+                                bam_ref(b));
+                        return -1;
+                    }
+
+                    c->ref_seq_id = bam_ref(b); // overwritten later by -2
+                    assert(fd->refs->ref_id[c->ref_seq_id]->seq);
+                    c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
+                    c->ref_start = 1;
+                    c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
+                }
+            }
+
+            if (process_one_read(fd, c, s, cr, b, r2) != 0)
+                return -1;
+
+            if (first_base > cr->apos)
+                first_base = cr->apos;
+
+            if (last_base < cr->aend)
+                last_base = cr->aend;
+        }
+
+        if (c->multi_seq) {
+            s->hdr->ref_seq_id = -2;
+            s->hdr->ref_seq_start = 0;
+            s->hdr->ref_seq_span = 0;
+        } else {
+            s->hdr->ref_seq_id = c->ref_id;
+            s->hdr->ref_seq_start = first_base;
+            s->hdr->ref_seq_span = last_base - first_base + 1;
+        }
+        s->hdr->num_records = r2;
+    }
+
+    if (c->multi_seq && !fd->no_ref) {
+        if (c->ref_seq_id >= 0)
+            cram_ref_decr(fd->refs, c->ref_seq_id);
+    }
+
+    /* Link our bams[] array onto the spare bam list for reuse */
+    spares = malloc(sizeof(*spares));
+    if (!spares)
+        return -1;
+    pthread_mutex_lock(&fd->bam_list_lock);
+    spares->bams = c->bams;
+    spares->next = fd->bl;
+    fd->bl = spares;
+    pthread_mutex_unlock(&fd->bam_list_lock);
+    c->bams = NULL;
+
+    /* Detect if a multi-seq container */
+    cram_stats_encoding(fd, c->stats[DS_RI]);
+    multi_ref = c->stats[DS_RI]->nvals > 1;
+
+    if (multi_ref) {
+        if (fd->verbose)
+            fprintf(stderr, "Multi-ref container\n");
+        c->ref_seq_id = -2;
+        c->ref_seq_start = 0;
+        c->ref_seq_span = 0;
+    }
+
+
+    /* Compute MD5s (not done for CRAM v1) */
+    for (i = 0; i < c->curr_slice; i++) {
+        cram_slice *s = c->slices[i];
+
+        if (CRAM_MAJOR_VERS(fd->version) != 1) {
+            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
+                hts_md5_context *md5 = hts_md5_init();
+                if (!md5)
+                    return -1;
+                hts_md5_update(md5,
+                               c->ref + s->hdr->ref_seq_start - c->ref_start,
+                               s->hdr->ref_seq_span);
+                hts_md5_final(s->hdr->md5, md5);
+                hts_md5_destroy(md5);
+            } else {
+                memset(s->hdr->md5, 0, 16);
+            }
+        }
+    }
+
+    c->num_records = 0;
+    c->num_blocks = 1; // cram_block_compression_hdr
+    c->length = 0;
+
+    /*
+     * Choose a codec per data series. Unless noted the encoding is
+     * derived from the statistics collected for that series.
+     */
+    h->codecs[DS_BF] = encoder_from_stats(fd, c, DS_BF, E_INT);
+    h->codecs[DS_CF] = encoder_from_stats(fd, c, DS_CF, E_INT);
+
+    /* AP: stats-driven when position sorted, else BETA over [0, max_apos] */
+    if (c->pos_sorted) {
+        h->codecs[DS_AP] = encoder_from_stats(fd, c, DS_AP, E_INT);
+    } else {
+        int p[2] = {0, c->max_apos};
+        h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p,
+                                             fd->version);
+    }
+
+    h->codecs[DS_RG] = encoder_from_stats(fd, c, DS_RG, E_INT);
+    h->codecs[DS_MQ] = encoder_from_stats(fd, c, DS_MQ, E_INT);
+    h->codecs[DS_NS] = encoder_from_stats(fd, c, DS_NS, E_INT);
+    h->codecs[DS_MF] = encoder_from_stats(fd, c, DS_MF, E_INT);
+    h->codecs[DS_TS] = encoder_from_stats(fd, c, DS_TS, E_INT);
+    h->codecs[DS_NP] = encoder_from_stats(fd, c, DS_NP, E_INT);
+    h->codecs[DS_NF] = encoder_from_stats(fd, c, DS_NF, E_INT);
+    h->codecs[DS_RL] = encoder_from_stats(fd, c, DS_RL, E_INT);
+    h->codecs[DS_FN] = encoder_from_stats(fd, c, DS_FN, E_INT);
+    h->codecs[DS_FC] = encoder_from_stats(fd, c, DS_FC, E_BYTE);
+    h->codecs[DS_FP] = encoder_from_stats(fd, c, DS_FP, E_INT);
+    h->codecs[DS_DL] = encoder_from_stats(fd, c, DS_DL, E_INT);
+    h->codecs[DS_BA] = encoder_from_stats(fd, c, DS_BA, E_BYTE);
+
+    /* BB: only from CRAM v3; external length and external values */
+    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+        cram_byte_array_len_encoder e;
+
+        e.len_encoding = E_EXTERNAL;
+        e.len_dat = (void *)DS_BB_len;
+        //e.len_dat = (void *)DS_BB;
+
+        e.val_encoding = E_EXTERNAL;
+        e.val_dat = (void *)DS_BB;
+
+        h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
+                                             E_BYTE_ARRAY, (void *)&e,
+                                             fd->version);
+    } else {
+        h->codecs[DS_BB] = NULL;
+    }
+
+    h->codecs[DS_BS] = encoder_from_stats(fd, c, DS_BS, E_BYTE);
+
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        /* v1: tags go via TC/TN; the series below have no codec */
+        h->codecs[DS_TL] = NULL;
+        h->codecs[DS_RI] = NULL;
+        h->codecs[DS_RS] = NULL;
+        h->codecs[DS_PD] = NULL;
+        h->codecs[DS_HC] = NULL;
+        h->codecs[DS_SC] = NULL;
+
+        h->codecs[DS_TC] = encoder_from_stats(fd, c, DS_TC, E_BYTE);
+        h->codecs[DS_TN] = encoder_from_stats(fd, c, DS_TN, E_INT);
+    } else {
+        h->codecs[DS_TC] = NULL;
+        h->codecs[DS_TN] = NULL;
+
+        h->codecs[DS_TL] = encoder_from_stats(fd, c, DS_TL, E_INT);
+        h->codecs[DS_RI] = encoder_from_stats(fd, c, DS_RI, E_INT);
+        h->codecs[DS_RS] = encoder_from_stats(fd, c, DS_RS, E_INT);
+        h->codecs[DS_PD] = encoder_from_stats(fd, c, DS_PD, E_INT);
+        h->codecs[DS_HC] = encoder_from_stats(fd, c, DS_HC, E_INT);
+
+        /* SC */
+        if (1) {
+            int i2[2] = {0, DS_SC};
+
+            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
+                                                 E_BYTE_ARRAY, (void *)i2,
+                                                 fd->version);
+        } else {
+            // Appears to be no practical benefit to using this method,
+            // but it may work better if we start mixing SC, IN and BB
+            // elements into the same external block.
+            cram_byte_array_len_encoder e;
+
+            e.len_encoding = E_EXTERNAL;
+            e.len_dat = (void *)DS_SC_len;
+
+            e.val_encoding = E_EXTERNAL;
+            e.val_dat = (void *)DS_SC;
+
+            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
+                                                 E_BYTE_ARRAY, (void *)&e,
+                                                 fd->version);
+        }
+    }
+
+    /* IN: byte array with stop byte 0 in external block DS_IN */
+    {
+        int i2[2] = {0, DS_IN};
+        h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
+                                             E_BYTE_ARRAY, (void *)i2,
+                                             fd->version);
+    }
+
+    /* QS: plain external bytes */
+    h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
+                                         (void *)DS_QS,
+                                         fd->version);
+
+    /* RN: byte array with stop byte 0 in external block DS_RN */
+    {
+        int i2[2] = {0, DS_RN};
+        h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
+                                             E_BYTE_ARRAY, (void *)i2,
+                                             fd->version);
+    }
+
+
+    /* Encode slices */
+    for (i = 0; i < c->curr_slice; i++) {
+        if (fd->verbose)
+            fprintf(stderr, "Encode slice %d\n", i);
+        if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
+            return -1;
+    }
+
+    /* Create compression header */
+    {
+        h->ref_seq_id = c->ref_seq_id;
+        h->ref_seq_start = c->ref_seq_start;
+        h->ref_seq_span = c->ref_seq_span;
+        h->num_records = c->num_records;
+
+        h->mapped_qs_included = 0; // fixme
+        h->unmapped_qs_included = 0; // fixme
+        h->AP_delta = c->pos_sorted;
+        // h->... fixme
+        memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
+
+        if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
+            return -1;
+    }
+
+    /* Compute landmarks */
+    /* Fill out slice landmarks */
+    c->num_landmarks = c->curr_slice;
+    c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
+    if (!c->landmark)
+        return -1;
+
+    /*
+     * Slice offset starts after the first block, so we need to simulate
+     * writing it to work out the correct offset.
+     * Per block: payload, 2 fixed bytes, 4 more from v3 onwards, and the
+     * three ITF8-encoded fields.
+     */
+    {
+        slice_offset = c_hdr->method == RAW
+            ? c_hdr->uncomp_size
+            : c_hdr->comp_size;
+        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
+            itf8_size(c_hdr->content_id) +
+            itf8_size(c_hdr->comp_size) +
+            itf8_size(c_hdr->uncomp_size);
+    }
+
+    c->ref_seq_id = c->slices[0]->hdr->ref_seq_id;
+    c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
+    c->ref_seq_span = c->slices[0]->hdr->ref_seq_span;
+    for (i = 0; i < c->curr_slice; i++) {
+        cram_slice *s = c->slices[i];
+
+        c->num_blocks += s->hdr->num_blocks + 1; // slice header
+        c->landmark[i] = slice_offset;
+
+        /* Stretch the container span to cover this slice's end */
+        if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
+            c->ref_seq_start + c->ref_seq_span) {
+            c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
+                - c->ref_seq_start;
+        }
+
+        slice_offset += s->hdr_block->method == RAW
+            ? s->hdr_block->uncomp_size
+            : s->hdr_block->comp_size;
+
+        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
+            itf8_size(s->hdr_block->content_id) +
+            itf8_size(s->hdr_block->comp_size) +
+            itf8_size(s->hdr_block->uncomp_size);
+
+        for (j = 0; j < s->hdr->num_blocks; j++) {
+            slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
+                itf8_size(s->block[j]->content_id) +
+                itf8_size(s->block[j]->comp_size) +
+                itf8_size(s->block[j]->uncomp_size);
+
+            slice_offset += s->block[j]->method == RAW
+                ? s->block[j]->uncomp_size
+                : s->block[j]->comp_size;
+        }
+    }
+    c->length += slice_offset; // just past the final slice
+
+    c->comp_hdr_block = c_hdr;
+
+    if (c->ref_seq_id >= 0) {
+        cram_ref_decr(fd->refs, c->ref_seq_id);
+    }
+
+    /*
+     * Release the references cached up-front. Uses the same nref snapshot
+     * as the matching cram_get_ref() loop at the top so the two loops
+     * cover exactly the same indices.
+     */
+    if (!fd->no_ref && c->refs_used) {
+        for (i = 0; i < nref; i++) {
+            if (c->refs_used[i])
+                cram_ref_decr(fd->refs, i);
+        }
+    }
+
+    return 0;
+}
+
+
+/*
+ * Adds a feature code to a read within a slice. For purposes of minimising
+ * memory allocations and fragmentation we have one array of features for all
+ * reads within the slice; the first feature of a read records its index in
+ * that array in r->feature and every call bumps r->nfeature.
+ *
+ * Also feeds the FP (position, delta against the read's previous feature)
+ * and FC (feature code) statistics.
+ *
+ * Returns 0 on success
+ *        -1 on failure (out of memory; the existing array is left intact).
+ */
+static int cram_add_feature(cram_container *c, cram_slice *s,
+                            cram_record *r, cram_feature *f) {
+    if (s->nfeatures >= s->afeatures) {
+        /* Grow via a temporary so a failed realloc neither leaks the
+         * old array nor leaves afeatures out of step with it. */
+        cram_feature *grown =
+            realloc(s->features,
+                    (s->afeatures ? s->afeatures*2 : 1024)
+                    * sizeof(*s->features));
+        if (!grown)
+            return -1;
+        s->features = grown;
+        s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
+    }
+
+    if (!r->nfeature++) {
+        /* First feature of this read: absolute position */
+        r->feature = s->nfeatures;
+        cram_stats_add(c->stats[DS_FP], f->X.pos);
+    } else {
+        /* nfeature was already incremented, hence -2 for the previous one */
+        cram_stats_add(c->stats[DS_FP],
+                       f->X.pos - s->features[r->feature + r->nfeature-2].X.pos);
+    }
+    cram_stats_add(c->stats[DS_FC], f->X.code);
+
+    s->features[s->nfeatures++] = *f;
+
+    return 0;
+}
+
+/*
+ * Records a mismatch between a read base and the reference at `pos`.
+ * When the fd->L2 lookup classes allow it the mismatch is stored as an
+ * 'X' substitution code taken from fd->cram_sub_matrix; otherwise the
+ * literal base and its quality are stored as a 'B' feature.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_substitution(cram_fd *fd, cram_container *c,
+                                 cram_slice *s, cram_record *r,
+                                 int pos, char base, char qual, char ref) {
+    cram_feature feat;
+    int base_class = fd->L2[(uc)base];
+
+    // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
+    int use_matrix = base_class < 4 ||
+                     (base_class < 5 && fd->L2[(uc)ref] < 4);
+
+    if (!use_matrix) {
+        feat.B.pos  = pos+1;
+        feat.B.code = 'B';
+        feat.B.base = base;
+        feat.B.qual = qual;
+        cram_stats_add(c->stats[DS_BA], feat.B.base);
+        cram_stats_add(c->stats[DS_QS], feat.B.qual);
+        BLOCK_APPEND_CHAR(s->qual_blk, qual);
+    } else {
+        feat.X.pos  = pos+1;
+        feat.X.code = 'X';
+        feat.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
+        cram_stats_add(c->stats[DS_BS], feat.X.base);
+    }
+
+    return cram_add_feature(c, s, r, &feat);
+}
+
+/*
+ * Adds a 'b' feature: a run of `len` bases starting at `pos`. `base`
+ * is stored as an offset from the start of s->seqs_blk's data rather
+ * than as a pointer.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_bases(cram_fd *fd, cram_container *c,
+                          cram_slice *s, cram_record *r,
+                          int pos, int len, char *base) {
+    cram_feature run;
+
+    run.b.code    = 'b';
+    run.b.pos     = pos+1;
+    run.b.len     = len;
+    run.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
+
+    return cram_add_feature(c, s, r, &run);
+}
+
+/*
+ * Adds a 'B' feature (literal base plus quality) at `pos`, updating the
+ * BA and QS statistics and appending the quality to s->qual_blk.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_base(cram_fd *fd, cram_container *c,
+                         cram_slice *s, cram_record *r,
+                         int pos, char base, char qual) {
+    cram_feature feat;
+
+    cram_stats_add(c->stats[DS_BA], base);
+    cram_stats_add(c->stats[DS_QS], qual);
+    BLOCK_APPEND_CHAR(s->qual_blk, qual);
+
+    feat.B.code = 'B';
+    feat.B.pos  = pos+1;
+    feat.B.base = base;
+    feat.B.qual = qual;
+
+    return cram_add_feature(c, s, r, &feat);
+}
+
+/*
+ * Adds a 'Q' feature (a single quality value) at `pos`, updating the QS
+ * statistics and appending the quality to s->qual_blk.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_quality(cram_fd *fd, cram_container *c,
+                            cram_slice *s, cram_record *r,
+                            int pos, char qual) {
+    cram_feature feat;
+
+    cram_stats_add(c->stats[DS_QS], qual);
+    BLOCK_APPEND_CHAR(s->qual_blk, qual);
+
+    feat.Q.code = 'Q';
+    feat.Q.pos  = pos+1;
+    feat.Q.qual = qual;
+
+    return cram_add_feature(c, s, r, &feat);
+}
+
+/*
+ * Adds a 'D' feature of length `len` at `pos` and records the length in
+ * the DL statistics. `base` is not used.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
+                             int pos, int len, char *base) {
+    cram_feature del;
+
+    cram_stats_add(c->stats[DS_DL], len);
+
+    del.D.code = 'D';
+    del.D.pos  = pos+1;
+    del.D.len  = len;
+
+    return cram_add_feature(c, s, r, &del);
+}
+
+/*
+ * Adds an 'S' (soft-clip) feature of `len` bases at `pos`.
+ *
+ * For CRAM major version 1 the bases are appended, NUL terminated, to
+ * s->base_blk. For every other version they go to s->soft_blk, with a
+ * NULL `base` written out as `len` 'N' characters.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
+                             int pos, int len, char *base, int version) {
+    cram_feature clip;
+
+    clip.S.pos  = pos+1;
+    clip.S.code = 'S';
+    clip.S.len  = len;
+
+    if (CRAM_MAJOR_VERS(version) == 1) {
+        clip.S.seq_idx = BLOCK_SIZE(s->base_blk);
+        BLOCK_APPEND(s->base_blk, base, len);
+        BLOCK_APPEND_CHAR(s->base_blk, '\0');
+    } else {
+        // Version 2 and anything later
+        clip.S.seq_idx = BLOCK_SIZE(s->soft_blk);
+        if (base) {
+            BLOCK_APPEND(s->soft_blk, base, len);
+        } else {
+            int n;
+            for (n = 0; n < len; n++)
+                BLOCK_APPEND_CHAR(s->soft_blk, 'N');
+        }
+        BLOCK_APPEND_CHAR(s->soft_blk, '\0');
+    }
+
+    return cram_add_feature(c, s, r, &clip);
+}
+
+/*
+ * Adds an 'H' (hard-clip) feature of length `len` at `pos` and records
+ * the length in the HC statistics. `base` is not used.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
+                             int pos, int len, char *base) {
+    cram_feature f;
+
+    /* pos/code go through X, the member every feature is read back
+     * through; the length goes through H, which is what the slice
+     * encoder reads for code 'H'. */
+    f.X.pos  = pos+1;
+    f.X.code = 'H';
+    f.H.len  = len;
+
+    cram_stats_add(c->stats[DS_HC], len);
+    return cram_add_feature(c, s, r, &f);
+}
+
+/*
+ * Adds an 'N' (reference skip) feature of length `len` at `pos` and
+ * records the length in the RS statistics. `base` is not used.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
+                         int pos, int len, char *base) {
+    cram_feature f;
+
+    /* pos/code go through X, the member every feature is read back
+     * through; the length goes through N, which is what the slice
+     * encoder reads for code 'N'. */
+    f.X.pos  = pos+1;
+    f.X.code = 'N';
+    f.N.len  = len;
+
+    cram_stats_add(c->stats[DS_RS], len);
+    return cram_add_feature(c, s, r, &f);
+}
+
+/*
+ * Adds a 'P' (padding) feature of length `len` at `pos` and records the
+ * length in the PD statistics. `base` is not used.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
+                        int pos, int len, char *base) {
+    cram_feature f;
+
+    /* pos/code go through X, the member every feature is read back
+     * through; the length goes through P, which is what the slice
+     * encoder reads for code 'P'. */
+    f.X.pos  = pos+1;
+    f.X.code = 'P';
+    f.P.len  = len;
+
+    cram_stats_add(c->stats[DS_PD], len);
+    return cram_add_feature(c, s, r, &f);
+}
+
+/*
+ * Adds an insertion at `pos`.
+ *
+ * A single base becomes an 'i' feature carrying the base itself (and is
+ * counted in the BA statistics). Any other length becomes an 'I' feature
+ * whose bases are appended, NUL terminated, to s->base_blk. A NULL
+ * `base` is treated as 'N' for every inserted position.
+ *
+ * Returns the result of cram_add_feature().
+ */
+static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
+                              int pos, int len, char *base) {
+    cram_feature ins;
+
+    ins.I.pos = pos+1;
+
+    if (len != 1) {
+        ins.I.code = 'I';
+        ins.I.len = len;
+        ins.S.seq_idx = BLOCK_SIZE(s->base_blk);
+        if (base) {
+            BLOCK_APPEND(s->base_blk, base, len);
+        } else {
+            int n;
+            for (n = 0; n < len; n++)
+                BLOCK_APPEND_CHAR(s->base_blk, 'N');
+        }
+        BLOCK_APPEND_CHAR(s->base_blk, '\0');
+    } else {
+        char single = base ? *base : 'N';
+        ins.i.code = 'i';
+        ins.i.base = single;
+        cram_stats_add(c->stats[DS_BA], single);
+    }
+
+    return cram_add_feature(c, s, r, &ins);
+}
+
+/*
+ * Encodes auxiliary data (CRAM v1.0 layout).
+ *
+ * Walks the BAM aux fields of `b`. RG:Z, MD:Z and NM are dropped from
+ * the output (RG is remembered for the return value). For every other
+ * tag the 3-byte key (two name characters plus type) is packed into an
+ * integer, added to c->tags_used, s->TN and the TN statistics, and the
+ * tag's value bytes are copied to s->aux_blk. Z/H strings keep their NUL
+ * and gain a '\t' stop byte; B arrays are prefixed with an ITF8 length.
+ * On return cr->aux / cr->aux_size describe this record's bytes within
+ * s->aux_blk.
+ *
+ * Returns the read-group parsed out of the BAM aux fields on success
+ *         NULL on failure or no rg present (FIXME)
+ */
+static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
+                                 cram_slice *s, cram_record *cr) {
+    char *aux, *tmp, *rg = NULL;
+    int aux_size = bam_blk_size(b) -
+        ((char *)bam_aux(b) - (char *)&bam_ref(b));
+
+    /* Worst case is 1 nul char on every ??:Z: string, so +33% */
+    BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
+    tmp = (char *)BLOCK_END(s->aux_blk);
+
+    aux = (char *)bam_aux(b);
+    cr->TN_idx = s->nTN;
+
+    while (aux[0] != 0) {
+        int32_t i32;
+        int r;
+
+        /* RG:Z - remember the value, do not store the tag */
+        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
+            rg = &aux[3];
+            while (*aux++);
+            continue;
+        }
+
+        /* MD:Z - dropped */
+        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
+            while (*aux++);
+            continue;
+        }
+
+        /* NM - dropped; skip 3 key bytes plus the value width */
+        if (aux[0] == 'N' && aux[1] == 'M') {
+            switch(aux[2]) {
+            case 'A': case 'C': case 'c': aux+=4; break;
+            case 'S': case 's':           aux+=5; break;
+            case 'I': case 'i': case 'f': aux+=7; break;
+            default:
+                fprintf(stderr, "Unhandled type code for NM tag\n");
+                return NULL;
+            }
+            continue;
+        }
+
+        cr->ntags++;
+
+        /* Pack the key as name[0], name[1], type */
+        i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
+        kh_put(s_i2i, c->tags_used, i32, &r);
+        if (-1 == r)
+            return NULL;
+
+        if (s->nTN >= s->aTN) {
+            /* Grow via a temporary so a failed realloc does not lose
+             * (and leak) the existing array. */
+            void *grown = realloc(s->TN,
+                                  (s->aTN ? s->aTN*2 : 1024)
+                                  * sizeof(*s->TN));
+            if (!grown)
+                return NULL;
+            s->TN = grown;
+            s->aTN = s->aTN ? s->aTN*2 : 1024;
+        }
+        s->TN[s->nTN++] = i32;
+        cram_stats_add(c->stats[DS_TN], i32);
+
+        /* Copy the value only; the key lives in TN */
+        switch(aux[2]) {
+        case 'A': case 'C': case 'c':
+            aux+=3;
+            *tmp++=*aux++;
+            break;
+
+        case 'S': case 's':
+            aux+=3;
+            memcpy(tmp, aux, 2); tmp += 2; aux += 2;
+            break;
+
+        case 'I': case 'i': case 'f':
+            aux+=3;
+            memcpy(tmp, aux, 4); tmp += 4; aux += 4;
+            break;
+
+        case 'd':
+            aux+=3;
+            memcpy(tmp, aux, 8); tmp += 8; aux += 8;
+            break;
+
+        case 'Z': case 'H':
+            aux+=3;
+            while ((*tmp++=*aux++));
+            *tmp++ = '\t'; // stop byte
+            break;
+
+        case 'B': {
+            int type = aux[3], blen;
+            /* Little-endian element count following the sub-type byte */
+            uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
+                                        (((unsigned char *)aux)[5]<< 8) +
+                                        (((unsigned char *)aux)[6]<<16) +
+                                        (((unsigned char *)aux)[7]<<24));
+            // skip TN field
+            aux+=3;
+
+            // We use BYTE_ARRAY_LEN with external length, so store that first
+            switch (type) {
+            case 'c': case 'C':
+                blen = count;
+                break;
+            case 's': case 'S':
+                blen = 2*count;
+                break;
+            case 'i': case 'I': case 'f':
+                blen = 4*count;
+                break;
+            default:
+                fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
+                        type);
+                return NULL;
+
+            }
+
+            /* +5 covers the sub-type byte and 4-byte count copied next */
+            tmp += itf8_put(tmp, blen+5);
+
+            // sub-type & length
+            memcpy(tmp, aux, 5); tmp += 5; aux += 5;
+
+            // The tag data itself
+            memcpy(tmp, aux, blen); tmp += blen; aux += blen;
+
+            //cram_stats_add(c->aux_B_stats, blen);
+            break;
+        }
+        default:
+            fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
+            return NULL;
+        }
+    }
+    cram_stats_add(c->stats[DS_TC], cr->ntags);
+
+    /* Publish what was written: record offset/size, then the block size */
+    cr->aux = BLOCK_SIZE(s->aux_blk);
+    cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
+    BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
+    assert(s->aux_blk->byte <= s->aux_blk->alloc);
+
+    return rg;
+}
+
+/*
+ * Reads the 32-bit little-endian element count of a 'B' (array) aux
+ * field.  'p' points at the first of the four count bytes.
+ *
+ * Each byte is widened to uint32_t before shifting so that a top byte
+ * of 0x80 or above is never shifted into the sign bit of an int.
+ */
+static uint32_t aux_le_count(const char *p) {
+    const unsigned char *u = (const unsigned char *)p;
+
+    return  (uint32_t)u[0]        |
+           ((uint32_t)u[1] <<  8) |
+           ((uint32_t)u[2] << 16) |
+           ((uint32_t)u[3] << 24);
+}
+
+/*
+ * Copies the NUL-terminated value of the string-typed aux field at *auxp
+ * into the external block *blkp and follows it with a tab separator.
+ * The block is created with content id 'content_id' the first time it
+ * is needed.
+ *
+ * On return *auxp points just past the field's terminating NUL.
+ *
+ * Returns 0 on success
+ *        -1 if the block could not be created
+ */
+static int aux_copy_string(cram_block **blkp, int content_id,
+                           char **auxp, int aux_size) {
+    char *from = *auxp + 3; // skip the 2 byte tag name and the type byte
+    char *to;
+
+    if (!*blkp && !(*blkp = cram_new_block(EXTERNAL, content_id)))
+        return -1;
+
+    BLOCK_GROW(*blkp, aux_size*1.34+1);
+    to = (char *)BLOCK_END(*blkp);
+    while ((*to++ = *from++))
+        ;
+    *to++ = '\t';
+    BLOCK_SIZE(*blkp) = (uc *)to - BLOCK_DATA(*blkp);
+
+    *auxp = from;
+    return 0;
+}
+
+/*
+ * Encodes auxiliary data. Largely duplicated from above, but done so to
+ * keep it simple and avoid a myriad of version ifs.
+ *
+ * Walks the BAM aux fields of 'b'.  The 3 byte key (tag name + type) of
+ * every stored field is appended to the compression header's TD block and
+ * recorded in c->tags_used; the values go either to s->aux_blk or, for a
+ * number of well known tags, to dedicated external blocks in 's'.  The
+ * resulting key list is looked up in (or added to) the TD hash and its
+ * index stored in cr->TL.
+ *
+ * RG:Z is never stored as a tag; MD:Z and NM are dropped for mapped reads
+ * that have sequence when reference based encoding is in use.
+ *
+ * Returns the read-group parsed out of the BAM aux fields on success
+ *         NULL on failure or no rg present (FIXME)
+ */
+static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
+                             cram_slice *s, cram_record *cr) {
+    char *aux, *orig, *tmp, *rg = NULL;
+    int aux_size = bam_get_l_aux(b);
+    cram_block *td_b = c->comp_hdr->TD_blk;
+    int TD_blk_size = BLOCK_SIZE(td_b), new;
+    char *key;
+    khint_t k;
+
+    /* Worst case is 1 nul char on every ??:Z: string, so +33% */
+    BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
+    tmp = (char *)BLOCK_END(s->aux_blk);
+
+    orig = aux = (char *)bam_aux(b);
+
+    // Copy aux keys to td_b and aux values to s->aux_blk
+    while (aux - orig < aux_size && aux[0] != 0) {
+        uint32_t i32;
+        int r;
+
+        // RG:Z - handed back to the caller rather than stored
+        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
+            rg = &aux[3];
+            while (*aux++);
+            continue;
+        }
+
+        // MD:Z
+        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
+            if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
+                while (*aux++);
+                continue;
+            }
+        }
+
+        // NM:i
+        if (aux[0] == 'N' && aux[1] == 'M') {
+            if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
+                switch(aux[2]) {
+                case 'A': case 'C': case 'c': aux+=4; break;
+                case 'S': case 's':           aux+=5; break;
+                case 'I': case 'i': case 'f': aux+=7; break;
+                default:
+                    fprintf(stderr, "Unhandled type code for NM tag\n");
+                    return NULL;
+                }
+                continue;
+            }
+        }
+
+        BLOCK_APPEND(td_b, aux, 3);
+
+        i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
+        kh_put(s_i2i, c->tags_used, i32, &r);
+        if (-1 == r)
+            return NULL;
+
+        // BQ:Z
+        if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') {
+            if (aux_copy_string(&s->aux_BQ_blk, DS_aux_BQ, &aux, aux_size))
+                return NULL;
+            continue;
+        }
+
+        // BD:Z
+        if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') {
+            if (aux_copy_string(&s->aux_BD_blk, DS_aux_BD, &aux, aux_size))
+                return NULL;
+            continue;
+        }
+
+        // BI:Z
+        if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') {
+            if (aux_copy_string(&s->aux_BI_blk, DS_aux_BI, &aux, aux_size))
+                return NULL;
+            continue;
+        }
+
+        // OQ:Z:
+        if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') {
+            if (aux_copy_string(&s->aux_OQ_blk, DS_aux_OQ, &aux, aux_size))
+                return NULL;
+            continue;
+        }
+
+        // FZ:B or ZM:B
+        if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') ||
+            (aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) {
+            int type = aux[3], blen;
+            uint32_t count = aux_le_count(&aux[4]);
+            char *fz;
+
+            if (!s->aux_FZ_blk)
+                if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ)))
+                    return NULL;
+            BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1);
+            fz = (char *)BLOCK_END(s->aux_FZ_blk);
+
+            // skip TN field
+            aux+=3;
+
+            // We use BYTE_ARRAY_LEN with external length, so store that first
+            switch (type) {
+            case 'c': case 'C':
+                blen = count;
+                break;
+            case 's': case 'S':
+                blen = 2*count;
+                break;
+            case 'i': case 'I': case 'f':
+                blen = 4*count;
+                break;
+            default:
+                fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
+                        type);
+                return NULL;
+
+            }
+
+            blen += 5; // sub-type & length
+            fz += itf8_put(fz, blen);
+
+            // The tag data itself
+            memcpy(fz, aux, blen); fz += blen; aux += blen;
+
+            BLOCK_SIZE(s->aux_FZ_blk) = (uc *)fz - BLOCK_DATA(s->aux_FZ_blk);
+            continue;
+        }
+
+        // Other quality data - {Q2,U2,QT,CQ}:Z
+        if (((aux[0] == 'Q' && aux[1] == '2') ||
+             (aux[0] == 'U' && aux[1] == '2') ||
+             (aux[0] == 'Q' && aux[1] == 'T') ||
+             (aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') {
+            if (aux_copy_string(&s->aux_oq_blk, DS_aux_oq, &aux, aux_size))
+                return NULL;
+            continue;
+        }
+
+        // Other sequence data - {R2,E2,CS,BC,RT}:Z
+        if (((aux[0] == 'R' && aux[1] == '2') ||
+             (aux[0] == 'E' && aux[1] == '2') ||
+             (aux[0] == 'C' && aux[1] == 'S') ||
+             (aux[0] == 'B' && aux[1] == 'C') ||
+             (aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') {
+            if (aux_copy_string(&s->aux_os_blk, DS_aux_os, &aux, aux_size))
+                return NULL;
+            continue;
+        }
+
+        // Everything else: fixed width values and arrays go to s->aux_blk,
+        // remaining strings to their own block.
+        switch(aux[2]) {
+        case 'A': case 'C': case 'c':
+            aux+=3;
+            *tmp++=*aux++;
+            break;
+
+        case 'S': case 's':
+            aux+=3;
+            *tmp++=*aux++; *tmp++=*aux++;
+            break;
+
+        case 'I': case 'i': case 'f':
+            aux+=3;
+            *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
+            break;
+
+        case 'd':
+            aux+=3;
+            *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
+            *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
+            break;
+
+        case 'Z': case 'H':
+            if (aux_copy_string(&s->aux_oz_blk, DS_aux_oz, &aux, aux_size))
+                return NULL;
+            break;
+
+        case 'B': {
+            int type = aux[3], blen;
+            uint32_t count = aux_le_count(&aux[4]);
+
+            // skip TN field
+            aux+=3;
+
+            // We use BYTE_ARRAY_LEN with external length, so store that first
+            switch (type) {
+            case 'c': case 'C':
+                blen = count;
+                break;
+            case 's': case 'S':
+                blen = 2*count;
+                break;
+            case 'i': case 'I': case 'f':
+                blen = 4*count;
+                break;
+            default:
+                fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
+                        type);
+                return NULL;
+
+            }
+
+            blen += 5; // sub-type & length
+            tmp += itf8_put(tmp, blen);
+
+            // The tag data itself
+            memcpy(tmp, aux, blen); tmp += blen; aux += blen;
+
+            //cram_stats_add(c->aux_B_stats, blen);
+            break;
+        }
+        default:
+            fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
+            return NULL;
+        }
+    }
+
+    // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
+
+    // Terminate this record's key list, then find or add its TD hash entry
+    BLOCK_APPEND_CHAR(td_b, 0);
+
+    // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
+    key = string_ndup(c->comp_hdr->TD_keys,
+                      (char *)BLOCK_DATA(td_b) + TD_blk_size,
+                      BLOCK_SIZE(td_b) - TD_blk_size);
+    if (!key)
+        return NULL;
+
+    k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
+    if (new < 0) {
+        return NULL;
+    } else if (new == 0) {
+        // Key list already known: discard the copy just appended to td_b
+        BLOCK_SIZE(td_b) = TD_blk_size;
+    } else {
+        kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
+        c->comp_hdr->nTL++;
+    }
+
+    cr->TL = kh_val(c->comp_hdr->TD_hash, k);
+    cram_stats_add(c->stats[DS_TL], cr->TL);
+
+    // Record where this read's values live within s->aux_blk
+    cr->aux = BLOCK_SIZE(s->aux_blk);
+    cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
+    BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
+    assert(s->aux_blk->byte <= s->aux_blk->alloc);
+
+    return rg;
+}
+
+
+/*
+ * Handles creation of a new container or new slice, flushing any
+ * existing containers when appropriate.
+ *
+ * Really this is next slice, which may or may not lead to a new container.
+ *
+ * The slice currently being filled (if any) has its header completed from
+ * the container's running state.  The container is then flushed and
+ * replaced if it already holds max_slice slices or if, outside multi-ref
+ * mode, the reference of 'b' differs from c->curr_ref.  Finally a new
+ * slice is started at the position of 'b'.
+ *
+ * Returns cram_container pointer on success
+ * NULL on failure.
+ */
+static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
+ cram_container *c = fd->ctr;
+ cram_slice *s;
+ int i;
+
+ /* First occurrence: curr_ref of -2 is treated here as "not yet set" */
+ if (c->curr_ref == -2)
+ c->curr_ref = bam_ref(b);
+
+ // Complete the header of the slice that has just been filled
+ if (c->slice) {
+ s = c->slice;
+ if (c->multi_seq) {
+ s->hdr->ref_seq_id = -2;
+ s->hdr->ref_seq_start = 0;
+ s->hdr->ref_seq_span = 0;
+ } else {
+ s->hdr->ref_seq_id = c->curr_ref;
+ s->hdr->ref_seq_start = c->first_base;
+ s->hdr->ref_seq_span = c->last_base - c->first_base + 1;
+ }
+ s->hdr->num_records = c->curr_rec;
+
+ // The first slice also fixes the container's reference id and start
+ if (c->curr_slice == 0) {
+ if (c->ref_seq_id != s->hdr->ref_seq_id)
+ c->ref_seq_id = s->hdr->ref_seq_id;
+ c->ref_seq_start = c->first_base;
+ }
+
+ c->curr_slice++;
+ }
+
+ /* Flush container */
+ if (c->curr_slice == c->max_slice ||
+ (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
+ c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
+ if (fd->verbose)
+ fprintf(stderr, "Flush container %d/%d..%d\n",
+ c->ref_seq_id, c->ref_seq_start,
+ c->ref_seq_start + c->ref_seq_span -1);
+
+ /* Encode slices */
+ if (fd->pool) {
+ // NOTE(review): the container is not freed on this path; presumably
+ // cram_flush_container_mt() takes ownership of it - confirm.
+ if (-1 == cram_flush_container_mt(fd, c))
+ return NULL;
+ } else {
+ if (-1 == cram_flush_container(fd, c))
+ return NULL;
+
+ // Move to sep func, as we need cram_flush_container for
+ // the closing phase to flush the partial container.
+ for (i = 0; i < c->max_slice; i++) {
+ cram_free_slice(c->slices[i]);
+ c->slices[i] = NULL;
+ }
+
+ c->slice = NULL;
+ c->curr_slice = 0;
+
+ /* Easy approach for purposes of freeing stats */
+ cram_free_container(c);
+ }
+
+ // Start a replacement container; fd->ctr becomes NULL if this fails
+ c = fd->ctr = cram_new_container(fd->seqs_per_slice,
+ fd->slices_per_container);
+ if (!c)
+ return NULL;
+ c->record_counter = fd->record_counter;
+ c->curr_ref = bam_ref(b);
+ }
+
+ // Positions are stored 1-based, hence bam_pos(b)+1
+ c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
+
+ /* New slice */
+ c->slice = c->slices[c->curr_slice] =
+ cram_new_slice(MAPPED_SLICE, c->max_rec);
+ if (!c->slice)
+ return NULL;
+
+ if (c->multi_seq) {
+ c->slice->hdr->ref_seq_id = -2;
+ c->slice->hdr->ref_seq_start = 0;
+ c->slice->last_apos = 1;
+ } else {
+ c->slice->hdr->ref_seq_id = bam_ref(b);
+ // wrong for unsorted data, will fix during encoding.
+ c->slice->hdr->ref_seq_start = bam_pos(b)+1;
+ c->slice->last_apos = bam_pos(b)+1;
+ }
+
+ c->curr_rec = 0;
+
+ return c;
+}
+
+/*
+ * Converts a single bam record into a cram record.
+ * Possibly used within a thread.
+ *
+ * fd    CRAM file being written (version, no_ref, header, flag tables)
+ * c     container receiving the statistics for this record
+ * s     slice whose blocks (names, seqs, quals, aux, cigar) are appended to
+ * cr    cram record to fill in
+ * b     bam record to convert
+ * rnum  index of 'cr' within s->crecs, used to link mate pairs
+ *
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+static int process_one_read(cram_fd *fd, cram_container *c,
+                            cram_slice *s, cram_record *cr,
+                            bam_seq_t *b, int rnum) {
+    int i, fake_qual = -1;
+    char *cp, *rg;
+    char *ref, *seq, *qual;
+
+    // FIXME: multi-ref containers
+
+    ref = c->ref;
+    cr->flags = bam_flag(b);
+    cr->len = bam_seq_len(b);
+
+    //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
+
+    // Fields to resolve later
+    //cr->mate_line;    // index to another cram_record
+    //cr->mate_flags;   // MF
+    //cr->ntags;        // TC
+    cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
+    if (CRAM_MAJOR_VERS(fd->version) == 1)
+        rg = cram_encode_aux_1_0(fd, b, c, s, cr);
+    else
+        rg = cram_encode_aux(fd, b, c, s, cr);
+
+    //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b));
+    //cr->aux = DSTRING_LEN(s->aux_ds);
+    //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size);
+
+    /* Read group, identified earlier */
+    if (rg) {
+        SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
+        cr->rg = brg ? brg->id : -1;
+    } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        // Version 1 records without an RG tag use the "UNKNOWN" read
+        // group.  cr->rg must be assigned here as it is added to the
+        // DS_RG statistics straight afterwards.
+        SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
+        if (!brg)
+            return -1;
+        cr->rg = brg->id;
+    } else {
+        cr->rg = -1;
+    }
+    cram_stats_add(c->stats[DS_RG], cr->rg);
+
+
+    cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id);
+    cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]);
+
+    // Non reference based encoding means storing the bases verbatim as features, which in
+    // turn means every base also has a quality already stored.
+    if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
+        cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES;
+    else
+        cr->cram_flags = 0;
+
+    if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
+        cr->cram_flags |= CRAM_FLAG_NO_SEQ;
+    //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
+
+    c->num_bases += cr->len;
+    cr->apos = bam_pos(b)+1;
+    if (c->pos_sorted) {
+        if (cr->apos < s->last_apos) {
+            c->pos_sorted = 0;
+        } else {
+            cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos);
+            s->last_apos = cr->apos;
+        }
+    } else {
+        //cram_stats_add(c->stats[DS_AP], cr->apos);
+    }
+    c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
+
+    cr->name = BLOCK_SIZE(s->name_blk);
+    cr->name_len = bam_name_len(b);
+    cram_stats_add(c->stats[DS_RN], cr->name_len);
+
+    BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
+
+
+    /*
+     * This seqs_ds is largely pointless and it could reuse the same memory
+     * over and over.
+     * s->base_blk is what we need for encoding.
+     */
+    cr->seq = BLOCK_SIZE(s->seqs_blk);
+    cr->qual = BLOCK_SIZE(s->qual_blk);
+    BLOCK_GROW(s->seqs_blk, cr->len+1);
+    BLOCK_GROW(s->qual_blk, cr->len);
+    seq = cp = (char *)BLOCK_END(s->seqs_blk);
+
+    *seq = 0;
+#ifdef ALLOW_UAC
+    {
+        // Convert seq 2 bases at a time for speed.
+        static const uint16_t code2base[256] = {
+            15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077,
+            21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029,
+            15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081,
+            21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033,
+            15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083,
+            21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035,
+            15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093,
+            21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045,
+            15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087,
+            21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039,
+            15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098,
+            21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050,
+            15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099,
+            21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051,
+            15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102,
+            21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054,
+            15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100,
+            21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052,
+            15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103,
+            21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055,
+            15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105,
+            21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057,
+            15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088,
+            21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040,
+            15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091,
+            21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043,
+            15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084,
+            21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036,
+            15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082,
+            21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034,
+            15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094,
+            21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046
+        };
+
+        int l2 = cr->len / 2;
+        unsigned char *from = (unsigned char *)bam_seq(b);
+        uint16_t *cpi = (uint16_t *)cp;
+        cp[0] = 0;
+        for (i = 0; i < l2; i++)
+            cpi[i] = le_int2(code2base[from[i]]);
+        if ((i *= 2) < cr->len)
+            cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
+    }
+#else
+    for (i = 0; i < cr->len; i++)
+        cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
+#endif
+    BLOCK_SIZE(s->seqs_blk) += cr->len;
+
+    qual = cp = (char *)bam_qual(b);
+
+    /* Copy and parse */
+    if (!(cr->flags & BAM_FUNMAP)) {
+        uint32_t *cig_to, *cig_from;
+        int apos = cr->apos-1, spos = 0;
+
+        cr->cigar = s->ncigar;
+        cr->ncigar = bam_cigar_len(b);
+        while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
+            // Grow through a temporary and only commit on success, so a
+            // failed realloc neither leaks nor loses the existing array.
+            void *grown = realloc(s->cigar,
+                                  (s->cigar_alloc ? s->cigar_alloc*2 : 1024)
+                                  * sizeof(*s->cigar));
+            if (!grown)
+                return -1;
+            s->cigar = grown;
+            s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
+        }
+
+        cig_to = (uint32_t *)s->cigar;
+        cig_from = (uint32_t *)bam_cigar(b);
+
+        cr->feature = 0;
+        cr->nfeature = 0;
+        for (i = 0; i < cr->ncigar; i++) {
+            enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
+            uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
+            cig_to[i] = cig_from[i];
+
+            /* Can also generate events from here for CRAM diffs */
+
+            switch (cig_op) {
+                int l;
+
+                // Don't trust = and X ops to be correct.
+            case BAM_CMATCH:
+            case BAM_CBASE_MATCH:
+            case BAM_CBASE_MISMATCH:
+                //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
+                //      cig_len, &ref[apos], cig_len, &seq[spos]);
+                l = 0;
+                if (!fd->no_ref && cr->len) {
+                    int end = cig_len+apos < c->ref_end
+                        ? cig_len : c->ref_end - apos;
+                    char *sp = &seq[spos];
+                    char *rp = &ref[apos];
+                    char *qp = &qual[spos];
+                    if (end > cr->len) {
+                        fprintf(stderr, "CIGAR and query sequence are of "
+                                "different length\n");
+                        return -1;
+                    }
+                    for (l = 0; l < end; l++) {
+                        if (rp[l] != sp[l]) {
+                            if (!sp[l])
+                                break;
+                            if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
+                                // Disabled for the time being as it doesn't
+                                // seem to gain us much.
+                                int ol=l;
+                                while (l<end && rp[l] != sp[l])
+                                    l++;
+                                if (l-ol > 1) {
+                                    if (cram_add_bases(fd, c, s, cr, spos+ol,
+                                                       l-ol, &seq[spos+ol]))
+                                        return -1;
+                                    l--;
+                                } else {
+                                    l = ol;
+                                    if (cram_add_substitution(fd, c, s, cr,
+                                                              spos+l, sp[l],
+                                                              qp[l], rp[l]))
+                                        return -1;
+                                }
+                            } else {
+                                if (cram_add_substitution(fd, c, s, cr, spos+l,
+                                                          sp[l], qp[l], rp[l]))
+                                    return -1;
+                            }
+                        }
+                    }
+                    spos += l;
+                    apos += l;
+                }
+
+                if (l < cig_len && cr->len) {
+                    if (fd->no_ref) {
+                        if (CRAM_MAJOR_VERS(fd->version) == 3) {
+                            if (cram_add_bases(fd, c, s, cr, spos,
+                                               cig_len-l, &seq[spos]))
+                                return -1;
+                            spos += cig_len-l;
+                        } else {
+                            for (; l < cig_len && seq[spos]; l++, spos++) {
+                                if (cram_add_base(fd, c, s, cr, spos,
+                                                  seq[spos], qual[spos]))
+                                    return -1;
+                            }
+                        }
+                    } else {
+                        /* off end of sequence or non-ref based output */
+                        for (; l < cig_len && seq[spos]; l++, spos++) {
+                            if (cram_add_base(fd, c, s, cr, spos,
+                                              seq[spos], qual[spos]))
+                                return -1;
+                        }
+                    }
+                    apos += cig_len;
+                } else if (!cr->len) {
+                    /* Seq "*" */
+                    apos += cig_len;
+                    spos += cig_len;
+                }
+                break;
+
+            case BAM_CDEL:
+                if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
+                    return -1;
+                apos += cig_len;
+                break;
+
+            case BAM_CREF_SKIP:
+                if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
+                    return -1;
+                apos += cig_len;
+                break;
+
+            case BAM_CINS:
+                if (cram_add_insertion(c, s, cr, spos, cig_len,
+                                       cr->len ? &seq[spos] : NULL))
+                    return -1;
+                if (fd->no_ref && cr->len) {
+                    for (l = 0; l < cig_len; l++, spos++) {
+                        cram_add_quality(fd, c, s, cr, spos, qual[spos]);
+                    }
+                } else {
+                    spos += cig_len;
+                }
+                break;
+
+            case BAM_CSOFT_CLIP:
+                if (cram_add_softclip(c, s, cr, spos, cig_len,
+                                      cr->len ? &seq[spos] : NULL,
+                                      fd->version))
+                    return -1;
+                if (fd->no_ref &&
+                    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
+                    if (cr->len) {
+                        for (l = 0; l < cig_len; l++, spos++) {
+                            cram_add_quality(fd, c, s, cr, spos, qual[spos]);
+                        }
+                    } else {
+                        for (l = 0; l < cig_len; l++, spos++) {
+                            cram_add_quality(fd, c, s, cr, spos, -1);
+                        }
+                    }
+                } else {
+                    spos += cig_len;
+                }
+                break;
+
+            case BAM_CHARD_CLIP:
+                if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
+                    return -1;
+                break;
+
+            case BAM_CPAD:
+                if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
+                    return -1;
+                break;
+            }
+        }
+        if (cr->len && spos != cr->len) {
+            fprintf(stderr, "CIGAR and query sequence are of different "
+                    "length\n");
+            return -1;
+        }
+        fake_qual = spos;
+        cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end);
+        cram_stats_add(c->stats[DS_FN], cr->nfeature);
+    } else {
+        // Unmapped
+        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
+        cr->cigar  = 0;
+        cr->ncigar = 0;
+        cr->nfeature = 0;
+        cr->aend = cr->apos;
+        for (i = 0; i < cr->len; i++)
+            cram_stats_add(c->stats[DS_BA], seq[i]);
+        fake_qual = 0;
+    }
+
+    /*
+     * Append to the qual block now. We do this here as
+     * cram_add_substitution() can generate BA/QS events which need to
+     * be in the qual block before we append the rest of the data.
+     */
+    if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
+        /* Special case of seq "*" */
+        if (cr->len == 0) {
+            cr->len = fake_qual;
+            BLOCK_GROW(s->qual_blk, cr->len);
+            cp = (char *)BLOCK_END(s->qual_blk);
+            memset(cp, 255, cr->len);
+        } else {
+            BLOCK_GROW(s->qual_blk, cr->len);
+            cp = (char *)BLOCK_END(s->qual_blk);
+            char *from = (char *)&bam_qual(b)[0];
+            char *to = &cp[0];
+            memcpy(to, from, cr->len);
+            //for (i = 0; i < cr->len; i++) cp[i] = from[i];
+        }
+        BLOCK_SIZE(s->qual_blk) += cr->len;
+    } else {
+        if (cr->len == 0)
+            cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
+    }
+
+    cram_stats_add(c->stats[DS_RL], cr->len);
+
+    /* Now we know apos and aend both, update mate-pair information */
+    {
+        int new;
+        khint_t k;
+        int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
+
+        //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
+        //      cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
+        if (cr->flags & BAM_FPAIRED) {
+            char *key = string_ndup(s->pair_keys,
+                                    (char *)BLOCK_DATA(s->name_blk)+cr->name,
+                                    cr->name_len);
+            if (!key)
+                return -1;
+
+            k = kh_put(m_s2i, s->pair[sec], key, &new);
+            if (-1 == new)
+                return -1;
+            else if (new > 0)
+                kh_val(s->pair[sec], k) = rnum;
+        } else {
+            new = 1;
+        }
+
+        // new == 0: an earlier record with this name is in the slice
+        if (new == 0) {
+            cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
+            int aleft, aright, sign;
+
+            aleft = MIN(cr->apos, p->apos);
+            aright = MAX(cr->aend, p->aend);
+            if (cr->apos < p->apos) {
+                sign = 1;
+            } else if (cr->apos > p->apos) {
+                sign = -1;
+            } else if (cr->flags & BAM_FREAD1) {
+                sign = 1;
+            } else {
+                sign = -1;
+            }
+
+            //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair[sec], k));
+
+            // This vs p: tlen, matepos, flags
+            if (bam_ins_size(b) != sign*(aright-aleft+1))
+                goto detached;
+
+            if (MAX(bam_mate_pos(b)+1, 0) != p->apos)
+                goto detached;
+
+            if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
+                ((p->flags & BAM_FUNMAP) != 0))
+                goto detached;
+
+            if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
+                ((p->flags & BAM_FREVERSE) != 0))
+                goto detached;
+
+
+            // p vs this: tlen, matepos, flags
+            if (p->ref_id != cr->ref_id)
+                goto detached;
+
+            if (p->tlen != -sign*(aright-aleft+1))
+                goto detached;
+
+            if (p->mate_pos != cr->apos)
+                goto detached;
+
+            if (((p->flags & BAM_FMUNMAP) != 0) !=
+                ((p->mate_flags & CRAM_M_UNMAP) != 0))
+                goto detached;
+
+            if (((p->flags & BAM_FMREVERSE) != 0) !=
+                ((p->mate_flags & CRAM_M_REVERSE) != 0))
+                goto detached;
+
+            // Supplementary reads are just too ill defined
+            if ((cr->flags & BAM_FSUPPLEMENTARY) ||
+                (p->flags & BAM_FSUPPLEMENTARY))
+                goto detached;
+
+            /*
+             * The fields below are unused when encoding this read as it is
+             * no longer detached.  In theory they may get referred to when
+             * processing a 3rd or 4th read in this template?, so we set them
+             * here just to be sure.
+             *
+             * They do not need cram_stats_add() calls those as they are
+             * not emitted.
+             */
+            cr->mate_pos = p->apos;
+            cr->tlen = sign*(aright-aleft+1);
+            cr->mate_flags =
+                ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
+                ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
+
+            // Decrement statistics aggregated earlier
+            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
+                cram_stats_del(c->stats[DS_NP], p->mate_pos);
+                cram_stats_del(c->stats[DS_MF], p->mate_flags);
+                cram_stats_del(c->stats[DS_TS], p->tlen);
+                cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
+            }
+
+            /* Similarly we could correct the p-> values too, but these will no
+             * longer have any code that refers back to them as the new 'p'
+             * for this template is our current 'cr'.
+             */
+            //p->mate_pos = cr->apos;
+            //p->mate_flags =
+            //  ((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
+            //  ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
+            //p->tlen = p->apos - cr->aend;
+
+            // Clear detached from cr flags
+            cr->cram_flags &= ~CRAM_FLAG_DETACHED;
+            cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
+
+            // Clear detached from p flags and set downstream
+            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
+                cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
+                p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
+            }
+
+            p->cram_flags  &= ~CRAM_FLAG_DETACHED;
+            p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM;
+            cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
+
+            p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
+            cram_stats_add(c->stats[DS_NF], p->mate_line);
+
+            kh_val(s->pair[sec], k) = rnum;
+        } else {
+        detached:
+            //fprintf(stderr, "unpaired\n");
+
+            /* Derive mate flags from this flag */
+            cr->mate_flags = 0;
+            if (bam_flag(b) & BAM_FMUNMAP)
+                cr->mate_flags |= CRAM_M_UNMAP;
+            if (bam_flag(b) & BAM_FMREVERSE)
+                cr->mate_flags |= CRAM_M_REVERSE;
+
+            cram_stats_add(c->stats[DS_MF], cr->mate_flags);
+
+            cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
+            cram_stats_add(c->stats[DS_NP], cr->mate_pos);
+
+            cr->tlen = bam_ins_size(b);
+            cram_stats_add(c->stats[DS_TS], cr->tlen);
+
+            cr->cram_flags |= CRAM_FLAG_DETACHED;
+            cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
+            cram_stats_add(c->stats[DS_NS], bam_mate_ref(b));
+
+            // Marks that the stats above can be withdrawn if a mate turns up
+            cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
+        }
+    }
+
+    cr->mqual = bam_map_qual(b);
+    cram_stats_add(c->stats[DS_MQ], cr->mqual);
+
+    cr->mate_ref_id = bam_mate_ref(b);
+
+    // Widen the container's covered range for mapped reads
+    if (!(bam_flag(b) & BAM_FUNMAP)) {
+        if (c->first_base > cr->apos)
+            c->first_base = cr->apos;
+
+        if (c->last_base < cr->aend)
+            c->last_base = cr->aend;
+    }
+
+    return 0;
+}
+
+/*
+ * Write iterator: put BAM format sequences into a CRAM file.
+ * We buffer up a container's worth of data at a time.
+ *
+ * A container is created on first use.  A new slice (and possibly a new
+ * container) is started when there is no current slice, when the current
+ * slice is full, or when the reference of 'b' differs from the one being
+ * collected.  The record itself is only copied into c->bams here.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
+    cram_container *c;
+
+    if (!fd->ctr) {
+        fd->ctr = cram_new_container(fd->seqs_per_slice,
+                                     fd->slices_per_container);
+        if (!fd->ctr)
+            return -1;
+        fd->ctr->record_counter = fd->record_counter;
+    }
+    c = fd->ctr;
+
+    if (!c->slice || c->curr_rec == c->max_rec ||
+        (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) {
+        int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
+        int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
+
+
+        /*
+         * Start packing slices when we routinely have under 1/4 full.
+         *
+         * This option isn't available if we choose to embed references
+         * since we can only have one per slice.
+         */
+        if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
+            fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
+            !fd->embed_ref) {
+            if (fd->verbose && !c->multi_seq)
+                fprintf(stderr, "Multi-ref enabled for this container\n");
+            multi_seq = 1;
+        }
+
+        slice_rec = c->slice_rec;
+        curr_rec  = c->curr_rec;
+
+        if (CRAM_MAJOR_VERS(fd->version) == 1 ||
+            c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) {
+            if (NULL == (c = cram_next_container(fd, b))) {
+                if (fd->ctr) {
+                    // prevent cram_close attempting to flush
+                    cram_free_container(fd->ctr);
+                    fd->ctr = NULL;
+                }
+                return -1;
+            }
+        }
+
+        /*
+         * Due to our processing order, some things we've already done we
+         * cannot easily undo. So when we first notice we should be packing
+         * multiple sequences per container we emit the small partial
+         * container as-is and then start a fresh one in a different mode.
+         */
+        if (multi_seq) {
+            fd->multi_seq = 1;
+            c->multi_seq = 1;
+            c->pos_sorted = 0; // required atm for multi_seq slices
+
+            if (!c->refs_used) {
+                pthread_mutex_lock(&fd->ref_lock);
+                c->refs_used = calloc(fd->refs->nref, sizeof(int));
+                pthread_mutex_unlock(&fd->ref_lock);
+                if (!c->refs_used)
+                    return -1;
+            }
+        }
+
+        fd->last_slice = curr_rec - slice_rec;
+        c->slice_rec = c->curr_rec;
+
+        // Have we seen this reference before?
+        if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
+            !fd->unsorted && multi_seq) {
+
+            if (!c->refs_used) {
+                pthread_mutex_lock(&fd->ref_lock);
+                c->refs_used = calloc(fd->refs->nref, sizeof(int));
+                pthread_mutex_unlock(&fd->ref_lock);
+                if (!c->refs_used)
+                    return -1;
+            } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
+                pthread_mutex_lock(&fd->ref_lock);
+                fd->unsorted = 1;
+                pthread_mutex_unlock(&fd->ref_lock);
+                fd->multi_seq = 1;
+            }
+        }
+
+        c->curr_ref = bam_ref(b);
+        if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
+    }
+
+    if (!c->bams) {
+        /* First time through, allocate a set of bam pointers */
+        pthread_mutex_lock(&fd->bam_list_lock);
+        if (fd->bl) {
+            // Reuse an array from the spare list
+            spare_bams *spare = fd->bl;
+            c->bams = spare->bams;
+            fd->bl = spare->next;
+            free(spare);
+        } else {
+            c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
+            if (!c->bams) {
+                // Must not return with bam_list_lock still held
+                pthread_mutex_unlock(&fd->bam_list_lock);
+                return -1;
+            }
+        }
+        pthread_mutex_unlock(&fd->bam_list_lock);
+    }
+
+    /* Copy or alloc+copy the bam record, for later encoding */
+    if (c->bams[c->curr_c_rec]) {
+        bam_copy1(c->bams[c->curr_c_rec], b);
+    } else {
+        c->bams[c->curr_c_rec] = bam_dup(b);
+        // A NULL slot would be counted as a stored record below
+        if (!c->bams[c->curr_c_rec])
+            return -1;
+    }
+
+    c->curr_rec++;
+    c->curr_c_rec++;
+    fd->record_counter++;
+
+    return 0;
+}
diff --git a/htslib/cram/cram_encode.h b/htslib/cram/cram_encode.h
new file mode 100644
index 0000000..9131d6e
--- /dev/null
+++ b/htslib/cram/cram_encode.h
@@ -0,0 +1,105 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * Include cram.h instead.
+ *
+ * This is an internal part of the CRAM system and is automatically included
+ * when you #include cram.h.
+ *
+ * Implements the encoding portion of CRAM I/O. Also see
+ * cram_codecs.[ch] for the actual encoding functions themselves.
+ */
+
+#ifndef _CRAM_WRITE_H_
+#define _CRAM_WRITE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ----------------------------------------------------------------------
+ * CRAM sequence iterators.
+ */
+
+/*! Write iterator: put BAM format sequences into a CRAM file.
+ *
+ * We buffer up a container's worth of data at a time.
+ *
+ * FIXME: break this into smaller pieces.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b);
+
+
+/* ----------------------------------------------------------------------
+ * Internal functions
+ */
+
+/*! INTERNAL:
+ * Encodes a compression header block into a generic cram_block structure.
+ *
+ * @return
+ * Returns cram_block ptr on success;
+ * NULL on failure
+ */
+cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
+ cram_block_compression_hdr *h);
+
+/*! INTERNAL:
+ * Encodes a slice compression header.
+ *
+ * @return
+ * Returns cram_block on success;
+ * NULL on failure
+ */
+cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s);
+
+/*! INTERNAL:
+ * Encodes all slices in a container into blocks.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ *
+ * FIXME: separate into encode_container and write_container. Ideally
+ * we should be able to do read_container / write_container or
+ * decode_container / encode_container.
+ */
+int cram_encode_container(cram_fd *fd, cram_container *c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/cram/cram_external.c b/htslib/cram/cram_external.c
new file mode 100644
index 0000000..f5accf2
--- /dev/null
+++ b/htslib/cram/cram_external.c
@@ -0,0 +1,377 @@
+/*
+Copyright (c) 2015 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * External CRAM interface.
+ *
+ * Internally we're happy to use macros and to grub around in the cram
+ * structures. This isn't very sustainable for an externally usable
+ * ABI though, so we have anonymous structs and accessor functions too
+ * to permit software such as samtools reheader to manipulate cram
+ * containers and blocks in a robust manner.
+ */
+
+#include <config.h>
+
+#include "htslib/hfile.h"
+#include "cram/cram.h"
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_fd
+ */
+/* Returns the SAM header pointer currently stored in fd. */
+SAM_hdr *cram_fd_get_header(cram_fd *fd)
+{
+    return fd->header;
+}
+
+/* Stores hdr in fd as its SAM header (pointer assignment only). */
+void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr)
+{
+    fd->header = hdr;
+}
+
+/* Returns the raw version value stored in fd. */
+int cram_fd_get_version(cram_fd *fd)
+{
+    return fd->version;
+}
+
+/* Overwrites the raw version value stored in fd. */
+void cram_fd_set_version(cram_fd *fd, int vers)
+{
+    fd->version = vers;
+}
+
+/* Returns CRAM_MAJOR_VERS() applied to fd's version value. */
+int cram_major_vers(cram_fd *fd)
+{
+    return CRAM_MAJOR_VERS(fd->version);
+}
+
+/* Returns CRAM_MINOR_VERS() applied to fd's version value. */
+int cram_minor_vers(cram_fd *fd)
+{
+    return CRAM_MINOR_VERS(fd->version);
+}
+
+/* Returns the hFILE pointer stored in fd. */
+hFILE *cram_fd_get_fp(cram_fd *fd)
+{
+    return fd->fp;
+}
+
+/* Replaces the hFILE pointer stored in fd (pointer assignment only). */
+void cram_fd_set_fp(cram_fd *fd, hFILE *fp)
+{
+    fd->fp = fp;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_container
+ */
+/* Returns the container's length field. */
+int32_t cram_container_get_length(cram_container *c)
+{
+    return c->length;
+}
+
+/* Overwrites the container's length field. */
+void cram_container_set_length(cram_container *c, int32_t length)
+{
+    c->length = length;
+}
+
+
+/* Returns the container's num_blocks field. */
+int32_t cram_container_get_num_blocks(cram_container *c)
+{
+    return c->num_blocks;
+}
+
+/* Overwrites the container's num_blocks field. */
+void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks)
+{
+    c->num_blocks = num_blocks;
+}
+
+
+/*
+ * Returns the container's landmark array.  The element count is
+ * written to *num_landmarks.  The returned pointer is the container's
+ * own array, not a copy.
+ */
+int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks)
+{
+    int32_t *result = c->landmark;
+
+    *num_landmarks = c->num_landmarks;
+    return result;
+}
+
+/*
+ * Installs a landmark array and its element count in the container.
+ * Only the pointer is stored; the array is not duplicated.
+ */
+void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
+                                  int32_t *landmarks)
+{
+    c->landmark = landmarks;
+    c->num_landmarks = num_landmarks;
+}
+
+
+/*
+ * Returns fd's empty_container flag: true if the container is empty
+ * (EOF marker).
+ */
+int cram_container_is_empty(cram_fd *fd)
+{
+    return fd->empty_container;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_block_compression_hdr
+ */
+
+/*
+ * Utility function to overwrite the value of a data series (used for
+ * editing an RG id).
+ *
+ * This is only possible if there is one single value used and it is
+ * held in the container compression header using the HUFFMAN codec
+ * (exactly one code) or the BETA codec (zero bits).  In those cases
+ * the value is essentially hard coded and needs no editing of
+ * external (or worse, CORE) blocks.
+ *
+ * Open question from the original author: should this become a
+ * general "set compression header constant" operation?
+ *
+ * Returns 0 on success
+ *        -1 on failure (NULL header, no codec for ds, unsupported
+ *           codec, or more than one possible value)
+ */
+static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch,
+                                             int ds, int new_rg) {
+    cram_codec *codec;
+
+    if (!ch)
+        return -1;
+
+    codec = ch->codecs[ds];
+    if (!codec)
+        return -1;
+
+    if (codec->codec == E_HUFFMAN) {
+        /* Only a single-symbol table can be rewritten in place. */
+        if (codec->huffman.ncodes != 1)
+            return -1;
+        codec->huffman.codes[0].symbol = new_rg;
+        return 0;
+    }
+
+    if (codec->codec == E_BETA) {
+        /* With zero bits stored, the value is carried by the offset. */
+        if (codec->beta.nbits != 0)
+            return -1;
+        codec->beta.offset = -new_rg;
+        return 0;
+    }
+
+    /* Any other codec cannot be edited here. */
+    return -1;
+}
+
+/*
+ * Sets the single RG value held in a compression header by applying
+ * cram_block_compression_hdr_set_DS() to the DS_RG data series.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_block_compression_hdr_set_rg(cram_block_compression_hdr *ch, int new_rg) {
+    int status = cram_block_compression_hdr_set_DS(ch, DS_RG, new_rg);
+
+    return status;
+}
+
+/*
+ * Converts a cram_block_compression_hdr struct used for decoding to
+ * one used for encoding, by running cram_codec_decoder2encoder() on
+ * every codec slot that is populated.  Maybe this should be a
+ * transparent operation applied on-demand.
+ *
+ * Returns 0 on success
+ *        -1 on failure (NULL header, or any codec conversion failing)
+ */
+int cram_block_compression_hdr_decoder2encoder(cram_fd *fd,
+                                               cram_block_compression_hdr *ch) {
+    int ds;
+
+    if (ch == NULL)
+        return -1;
+
+    for (ds = 0; ds < DS_END; ds++) {
+        /* Empty slots are skipped; the first failing codec aborts. */
+        if (ch->codecs[ds] != NULL &&
+            cram_codec_decoder2encoder(fd, ch->codecs[ds]) == -1)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_slice
+ */
+/* Returns the num_blocks field of a slice header. */
+int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr)
+{
+    int32_t count = hdr->num_blocks;
+
+    return count;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_block
+ */
+/* Field getters: each returns the named cram_block member unchanged. */
+int32_t cram_block_get_content_id(cram_block *b)
+{
+    return b->content_id;
+}
+
+int32_t cram_block_get_comp_size(cram_block *b)
+{
+    return b->comp_size;
+}
+
+int32_t cram_block_get_uncomp_size(cram_block *b)
+{
+    return b->uncomp_size;
+}
+
+int32_t cram_block_get_crc32(cram_block *b)
+{
+    return b->crc32;
+}
+
+/* Returns BLOCK_DATA(b), the block's data pointer. */
+void *cram_block_get_data(cram_block *b)
+{
+    return BLOCK_DATA(b);
+}
+
+/* Returns BLOCK_SIZE(b); see also cram_block_get_offset(). */
+int32_t cram_block_get_size(cram_block *b)
+{
+    return BLOCK_SIZE(b);
+}
+
+enum cram_content_type cram_block_get_content_type(cram_block *b)
+{
+    return b->content_type;
+}
+
+/* Field setters: each stores its argument in the named member. */
+void cram_block_set_content_id(cram_block *b, int32_t id)
+{
+    b->content_id = id;
+}
+
+void cram_block_set_comp_size(cram_block *b, int32_t size)
+{
+    b->comp_size = size;
+}
+
+void cram_block_set_uncomp_size(cram_block *b, int32_t size)
+{
+    b->uncomp_size = size;
+}
+
+void cram_block_set_crc32(cram_block *b, int32_t crc)
+{
+    b->crc32 = crc;
+}
+
+/* Assigns through BLOCK_DATA(b); pointer assignment only. */
+void cram_block_set_data(cram_block *b, void *data)
+{
+    BLOCK_DATA(b) = data;
+}
+
+/* Assigns through BLOCK_SIZE(b). */
+void cram_block_set_size(cram_block *b, int32_t size)
+{
+    BLOCK_SIZE(b) = size;
+}
+
+/*
+ * Appends size bytes from data to the block via BLOCK_APPEND.
+ *
+ * Returns 0 if the block has a data pointer afterwards
+ *        -1 otherwise (a stop-gap failure check; it'll do for now)
+ */
+int cram_block_append(cram_block *b, void *data, int size)
+{
+    BLOCK_APPEND(b, data, size);
+
+    if (BLOCK_DATA(b))
+        return 0;
+    return -1;
+}
+
+/* Applies BLOCK_UPLEN to the block. */
+void cram_block_update_size(cram_block *b)
+{
+    BLOCK_UPLEN(b);
+}
+
+/*
+ * The offset is known as "size" internally, which can be confusing;
+ * these two are the same BLOCK_SIZE accessors under a clearer name.
+ */
+size_t cram_block_get_offset(cram_block *b)
+{
+    return BLOCK_SIZE(b);
+}
+
+void cram_block_set_offset(cram_block *b, size_t offset)
+{
+    BLOCK_SIZE(b) = offset;
+}
+
+
+/*
+ * Copies the blocks representing the next num_slice slices from a
+ * container from 'in' to 'out'.  It is expected that the file pointer
+ * is just after the read of the cram_container and cram compression
+ * header.
+ *
+ * For each slice the slice header block is read, decoded (to learn how
+ * many blocks follow) and written out, and then that many further
+ * blocks are copied verbatim.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
+    int32_t i, j;
+
+    for (i = 0; i < num_slice; i++) {
+        cram_block *blk;
+        cram_block_slice_hdr *hdr;
+        int32_t num_blocks;
+
+        if (!(blk = cram_read_block(in)))
+            return -1;
+        if (!(hdr = cram_decode_slice_header(in, blk))) {
+            cram_free_block(blk);
+            return -1;
+        }
+        if (cram_write_block(out, blk) != 0) {
+            cram_free_block(blk);
+            /* The decoded header is ours to release on every exit path. */
+            cram_free_slice_header(hdr);
+            return -1;
+        }
+        cram_free_block(blk);
+
+        num_blocks = cram_slice_hdr_get_num_blocks(hdr);
+        for (j = 0; j < num_blocks; j++) {
+            blk = cram_read_block(in);
+            if (!blk || cram_write_block(out, blk) != 0) {
+                if (blk) cram_free_block(blk);
+                cram_free_slice_header(hdr);
+                return -1;
+            }
+            cram_free_block(blk);
+        }
+        cram_free_slice_header(hdr);
+    }
+
+    return 0;
+}
+
+/*
+ * Renumbers RG numbers in a cram compression header.
+ *
+ * CRAM stores RG as the Nth number in the header, rather than a
+ * string holding the ID: tag.  This is smaller in space, but means
+ * "samtools cat" to join files together that contain single but
+ * different RG lines needs a way of renumbering them.
+ *
+ * The file descriptor is expected to be immediately after the
+ * cram_container structure (ie before the cram compression header).
+ * Due to the nature of the CRAM format, this needs to read and write
+ * the blocks itself.  Note that there may be multiple slices within
+ * the container, meaning multiple compression headers to manipulate.
+ * Changing RG may change the size of the compression header and
+ * therefore the length field in the container.  Hence we rewrite all
+ * blocks just in case and also emit the adjusted container.
+ *
+ * The current implementation can only cope with renumbering a single
+ * RG (and only then if it is using HUFFMAN or BETA codecs).  In
+ * theory it *may* be possible to renumber multiple RGs if they use
+ * HUFFMAN to the CORE block or use an external block unshared by any
+ * other data series.  So we have an API that can be upgraded to
+ * support this, but do not implement it for now.  An example
+ * implementation of RG as an EXTERNAL block would be to find that
+ * block and rewrite it, returning the number of blocks consumed.
+ *
+ * in_rg is currently unused; only *out_rg is read.
+ *
+ * Returns 0 on success;
+ *        -1 if unable to edit (the RG codec cannot be rewritten);
+ *        -2 on other errors (eg I/O, malformed or undecodable header,
+ *           out of memory).
+ */
+int cram_transcode_rg(cram_fd *in, cram_fd *out,
+                      cram_container *c,
+                      int nrg, int *in_rg, int *out_rg) {
+    int new_rg, old_size, new_size, n, k, ret = -2;
+    cram_block *o_blk = NULL, *n_blk = NULL;
+    cram_block_compression_hdr *ch = NULL;
+    char *cp, *op, *endp;
+    int32_t i32, n_size, *landmarks, num_landmarks;
+
+    if (nrg != 1) {
+        fprintf(stderr, "[%s] ERROR: not implemented for nrg != 1\n",
+                __func__);
+        return -2;
+    }
+    new_rg = *out_rg;
+
+    // Produce a new block holding the updated compression header,
+    // with RG transcoded to a new value. (Single only supported.)
+    if (!(o_blk = cram_read_block(in)))
+        return -2;
+    old_size = cram_block_size(o_blk);
+
+    if (!(ch = cram_decode_compression_header(in, o_blk)))
+        goto done;
+    if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0) {
+        ret = -1;
+        goto done;
+    }
+    if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0)
+        goto done;
+    n_blk = cram_encode_compression_header(in, c, ch);
+    cram_free_compression_header(ch);
+    ch = NULL;
+    if (!n_blk)
+        goto done;
+
+    /*
+     * Warning: this has internal knowledge of the cram compression
+     * header format.
+     *
+     * The decoder doesn't set c->tags_used, so the encoder puts a two
+     * byte blank segment.  This means n_blk is too short.  We skip
+     * through the decoded old block (o_blk) and copy from there.
+     *
+     * Every length read below comes from the file, so each one is
+     * checked against the end of the block before it is used.  An ITF8
+     * value occupies at least one byte, so a non-positive byte count
+     * from safe_itf8_get() is treated as a decode failure.
+     */
+    cp = cram_block_get_data(o_blk);
+    if (!cp)
+        goto done;
+    endp = cp + cram_block_get_uncomp_size(o_blk);
+
+    // Skip the first two size-prefixed sections of the old header.
+    for (k = 0; k < 2; k++) {
+        n = safe_itf8_get(cp, endp, &i32);
+        if (n <= 0)
+            goto done;
+        cp += n;
+        if (i32 < 0 || i32 > endp - cp)
+            goto done;
+        cp += i32;
+    }
+
+    // The third section (size prefix included) is what gets copied.
+    op = cp;
+    n = safe_itf8_get(cp, endp, &i32);
+    if (n <= 0)
+        goto done;
+    cp += n;
+    if (i32 < 0 || i32 > endp - cp)
+        goto done;
+    i32 += (cp-op);
+
+    // Drop the two byte blank segment and append the original one.
+    n_size = cram_block_get_size(n_blk);
+    if (n_size < 2)
+        goto done;
+    cram_block_set_size(n_blk, n_size-2);
+    if (cram_block_append(n_blk, op, i32) != 0)
+        goto done;
+    cram_block_update_size(n_blk);
+
+    new_size = cram_block_size(n_blk);
+
+    // Now we've constructed the updated compression header,
+    // amend the container too (it may have changed size).
+    landmarks = cram_container_get_landmarks(c, &num_landmarks);
+
+    if (old_size != new_size) {
+        int diff = new_size - old_size, j;
+
+        for (j = 0; j < num_landmarks; j++)
+            landmarks[j] += diff;
+        cram_container_set_length(c, cram_container_get_length(c) + diff);
+    }
+
+    // Finally write it all out; container, compression header,
+    // and then all the remaining slice blocks.
+    if (cram_write_container(out, c) != 0)
+        goto done;
+    if (cram_write_block(out, n_blk) != 0)
+        goto done;
+
+    // Container num_blocks can be invalid, due to a bug.
+    // Instead we iterate in slice context.
+    ret = (cram_copy_slice(in, out, num_landmarks) == 0) ? 0 : -2;
+
+ done:
+    if (ch)
+        cram_free_compression_header(ch);
+    if (n_blk)
+        cram_free_block(n_blk);
+    if (o_blk)
+        cram_free_block(o_blk);
+
+    return ret;
+}
diff --git a/htslib/cram/cram_index.c b/htslib/cram/cram_index.c
new file mode 100644
index 0000000..9818991
--- /dev/null
+++ b/htslib/cram/cram_index.c
@@ -0,0 +1,582 @@
+/*
+Copyright (c) 2013-2014 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * The index is a gzipped tab-delimited text file with one line per slice.
+ * The columns are:
+ * 1: reference number (0 to N-1, as per BAM ref_id)
+ * 2: reference position of 1st read in slice (1..?)
+ * 3: number of reads in slice
+ * 4: offset of container start (relative to end of SAM header, so 1st
+ * container is offset 0).
+ * 5: slice number within container (ie which landmark).
+ *
+ * In memory, we hold this in a nested containment list. Each list element is
+ * a cram_index struct. Each element in turn can contain its own list of
+ * cram_index structs.
+ *
+ * Any start..end range which is entirely contained within another (and
+ * earlier as it is sorted) range will be held within it. This ensures that
+ * the outer list will never have containments and we can safely do a
+ * binary search to find the first range which overlaps any given coordinate.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "htslib/hfile.h"
+#include "hts_internal.h"
+#include "cram/cram.h"
+#include "cram/os.h"
+#include "cram/zfio.h"
+
+/* Debugging aids, compiled out by default. */
+#if 0
+/*
+ * Prints one index entry (refid, start..end and offset), indented by
+ * four columns per nesting level, then recurses into its children.
+ */
+static void dump_index_(cram_index *e, int level) {
+ int i, n;
+ n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
+ printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
+ for (i = 0; i < e->nslice; i++) {
+ dump_index_(&e->e[i], level+1);
+ }
+}
+
+/* Prints every top-level entry of fd->index via dump_index_(). */
+static void dump_index(cram_fd *fd) {
+ int i;
+ for (i = 0; i < fd->index_sz; i++) {
+ dump_index_(&fd->index[i], 0);
+ }
+}
+#endif
+
+/*
+ * Parses an optionally negative decimal integer from k, starting at
+ * *pos and skipping any leading spaces or tabs.
+ *
+ * On success *pos is advanced past the last digit and the value is
+ * stored in *val_p.  On failure neither is modified.
+ *
+ * Returns 0 on success
+ *        -1 if no digit is found or the value does not fit in int32_t
+ */
+static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
+    int sign = 1;
+    int64_t val = 0;    /* wider than the result so overflow is detectable */
+    size_t p = *pos;
+
+    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
+        p++;
+
+    if (p < k->l && k->s[p] == '-')
+        sign = -1, p++;
+
+    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
+        return -1;
+
+    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
+        val = val*10 + (k->s[p++]-'0');
+        /* The magnitude may reach INT32_MAX+1 only for negative values. */
+        if (val > (int64_t)INT32_MAX + (sign < 0))
+            return -1;
+    }
+
+    *pos = p;
+    *val_p = (int32_t)(sign*val);
+
+    return 0;
+}
+
+/*
+ * Parses an optionally negative decimal integer from k, starting at
+ * *pos and skipping any leading spaces or tabs.
+ *
+ * On success *pos is advanced past the last digit and the value is
+ * stored in *val_p.  On failure neither is modified.
+ *
+ * Returns 0 on success
+ *        -1 if no digit is found or the magnitude exceeds INT64_MAX
+ *           (so INT64_MIN itself is rejected)
+ */
+static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
+    int sign = 1;
+    int64_t val = 0;
+    size_t p = *pos;
+
+    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
+        p++;
+
+    if (p < k->l && k->s[p] == '-')
+        sign = -1, p++;
+
+    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
+        return -1;
+
+    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
+        int digit = k->s[p]-'0';
+
+        /* Check before multiplying: signed overflow is undefined. */
+        if (val > (INT64_MAX - digit) / 10)
+            return -1;
+        val = val*10 + digit;
+        p++;
+    }
+
+    *pos = p;
+    *val_p = sign*val;
+
+    return 0;
+}
+
+/*
+ * Loads a CRAM .crai index into memory.
+ *
+ * fn is the CRAM filename; it is only used to derive the index name
+ * (via hts_idx_getfn with a ".crai" suffix) when fn_idx is NULL.
+ * The index file may be gzip compressed (detected by its two magic
+ * bytes) or plain text.
+ *
+ * Each line holds six integers: refid, start, span, container offset,
+ * slice offset and length.  Entries are stored in fd->index as a
+ * nested containment list, one top-level element per reference
+ * (element 0 being refid -1).
+ *
+ * If fd already has an index this is a no-op returning success.  On
+ * failure any partially built index is freed again, so that a later
+ * call does not mistake it for a loaded one.
+ *
+ * Returns 0 for success
+ *        -1 for failure
+ */
+int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
+    char *fn2 = NULL;
+    char buf[65536];
+    size_t len;
+    kstring_t kstr = {0};
+    FILE *fp = NULL;
+    cram_index *idx;
+    cram_index **idx_stack = NULL, *ep, e;
+    int idx_stack_alloc = 0, idx_stack_ptr = 0;
+    size_t pos = 0;
+
+    /* Check if already loaded */
+    if (fd->index)
+        return 0;
+
+    fd->index = calloc(1, sizeof(*fd->index));
+    if (!fd->index) {
+        fd->index_sz = 0;
+        return -1;
+    }
+    fd->index_sz = 1;
+
+    idx = &fd->index[0];
+    idx->refid = -1;
+    idx->start = INT_MIN;
+    idx->end   = INT_MAX;
+
+    /* Stack of the containing entries, outermost first. */
+    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
+    if (!idx_stack)
+        goto fail;
+    idx_stack[idx_stack_ptr] = idx;
+
+    if (!fn_idx) {
+        fn2 = hts_idx_getfn(fn, ".crai");
+        if (!fn2)
+            goto fail;
+        fn_idx = fn2;
+    }
+
+    if (!(fp = fopen(fn_idx, "r"))) {
+        perror(fn_idx);
+        goto fail;
+    }
+
+    // Load the file into memory
+    while ((len = fread(buf, 1, sizeof(buf), fp)) > 0) {
+        if (kputsn(buf, (int)len, &kstr) < 0)
+            goto fail;
+    }
+    /* fread() cannot return a negative count; errors show via ferror(). */
+    if (ferror(fp) || kstr.l < 2)
+        goto fail;
+
+    if (fclose(fp)) {
+        fp = NULL;
+        goto fail;
+    }
+    fp = NULL;
+
+
+    // Uncompress if required
+    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
+        size_t l;
+        char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
+        free(kstr.s);
+        kstr.s = NULL;
+        if (!s)
+            goto fail;
+        kstr.s = s;
+        kstr.l = l;
+        kstr.m = l; // conservative estimate of the size allocated
+        if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
+            goto fail;
+    }
+
+
+    // Parse it line at a time
+    do {
+        /* 1.1 layout */
+        if (kget_int32(&kstr, &pos, &e.refid)  == -1 ||
+            kget_int32(&kstr, &pos, &e.start)  == -1 ||
+            kget_int32(&kstr, &pos, &e.end)    == -1 ||
+            kget_int64(&kstr, &pos, &e.offset) == -1 ||
+            kget_int32(&kstr, &pos, &e.slice)  == -1 ||
+            kget_int32(&kstr, &pos, &e.len)    == -1)
+            goto fail;
+
+        /* The third column is a span; convert it to an end coordinate. */
+        e.end += e.start-1;
+        //printf("%d/%d..%d\n", e.refid, e.start, e.end);
+
+        if (e.refid < -1) {
+            fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
+            goto fail;
+        }
+
+        if (e.refid != idx->refid) {
+            if (fd->index_sz < e.refid+2) {
+                size_t index_end = fd->index_sz * sizeof(*fd->index);
+                cram_index *new_index;
+
+                /* Guard the refid+2 arithmetic below. */
+                if (e.refid > INT_MAX - 2)
+                    goto fail;
+
+                new_index = realloc(fd->index,
+                                    (size_t)(e.refid+2) * sizeof(*fd->index));
+                if (!new_index)
+                    goto fail;
+                fd->index = new_index;
+                fd->index_sz = e.refid+2;
+                memset(((char *)fd->index) + index_end, 0,
+                       fd->index_sz * sizeof(*fd->index) - index_end);
+            }
+            idx = &fd->index[e.refid+1];
+            idx->refid = e.refid;
+            idx->start = INT_MIN;
+            idx->end   = INT_MAX;
+            idx->nslice = idx->nalloc = 0;
+            idx->e = NULL;
+            idx_stack[(idx_stack_ptr = 0)] = idx;
+        }
+
+        /* Pop until we reach an entry that contains the new range. */
+        while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
+            idx = idx_stack[--idx_stack_ptr];
+        }
+
+        // Now contains, so append
+        if (idx->nslice+1 >= idx->nalloc) {
+            cram_index *new_e;
+
+            idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
+            new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
+            if (!new_e)
+                goto fail;
+            idx->e = new_e;
+        }
+
+        e.nalloc = e.nslice = 0; e.e = NULL;
+        *(ep = &idx->e[idx->nslice++]) = e;
+        idx = ep;
+
+        if (++idx_stack_ptr >= idx_stack_alloc) {
+            cram_index **new_stack;
+
+            idx_stack_alloc *= 2;
+            new_stack = realloc(idx_stack,
+                                idx_stack_alloc*sizeof(*idx_stack));
+            if (!new_stack)
+                goto fail;
+            idx_stack = new_stack;
+        }
+        idx_stack[idx_stack_ptr] = idx;
+
+        /* Skip anything else on this line, and the newline itself. */
+        while (pos < kstr.l && kstr.s[pos] != '\n')
+            pos++;
+        pos++;
+    } while (pos < kstr.l);
+
+    free(idx_stack);
+    free(kstr.s);
+    free(fn2);
+
+    // dump_index(fd);
+
+    return 0;
+
+ fail:
+    if (fp)
+        fclose(fp);
+    free(kstr.s);
+    free(idx_stack);
+    free(fn2);
+    /* Discard the partial index; index_sz must not outlive the array. */
+    cram_index_free(fd);
+    fd->index_sz = 0;
+
+    return -1;
+}
+
+/*
+ * Releases the child array of an index entry, depth first.  The entry
+ * itself is not freed, as it lives inside its parent's array.
+ */
+static void cram_index_free_recurse(cram_index *e) {
+    int child;
+
+    if (!e->e)
+        return;
+
+    for (child = 0; child < e->nslice; child++)
+        cram_index_free_recurse(&e->e[child]);
+
+    free(e->e);
+}
+
+/*
+ * Frees the in-memory index attached to fd, if there is one, and
+ * leaves fd with no index (NULL pointer and zero size).
+ */
+void cram_index_free(cram_fd *fd) {
+    int i;
+
+    if (!fd->index)
+        return;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        cram_index_free_recurse(&fd->index[i]);
+    }
+    free(fd->index);
+
+    fd->index = NULL;
+    /* Keep the size in step with the pointer: lookups range-check
+     * against index_sz before indexing the array. */
+    fd->index_sz = 0;
+}
+
+/*
+ * Searches the index for the first slice overlapping a reference ID
+ * and position, or one immediately preceding it if none is found in
+ * the index to overlap this position.  (Our index may have missing
+ * entries, but we require at least one per reference.)
+ *
+ * If the index finds multiple slices overlapping this position we
+ * return the first one only.  Subsequent calls should specify
+ * "from" as the last slice we checked to find the next one.  Otherwise
+ * set "from" to be NULL to find the first one.
+ *
+ * Returns the cram_index pointer on success
+ *         NULL on failure (no index loaded, refid out of range, or
+ *         nothing indexed beneath the starting entry)
+ */
+cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
+                             cram_index *from) {
+    int i, j, k;
+    cram_index *e;
+
+    // No index loaded at all.
+    if (!fd->index)
+        return NULL;
+
+    if (refid+1 < 0 || refid+1 >= fd->index_sz)
+        return NULL;
+
+    if (!from)
+        from = &fd->index[refid+1];
+
+    // Ref with nothing aligned against it.
+    if (!from->e)
+        return NULL;
+
+    // This sequence is covered by the index, so binary search to find
+    // the optimal starting block.  The bounds come from the entry being
+    // searched, which need not be the top-level one when "from" is given.
+    i = 0, j = from->nslice-1;
+    for (k = j/2; k != i; k = (j-i)/2 + i) {
+        if (from->e[k].refid > refid) {
+            j = k;
+            continue;
+        }
+
+        if (from->e[k].refid < refid) {
+            i = k;
+            continue;
+        }
+
+        if (from->e[k].start >= pos) {
+            j = k;
+            continue;
+        }
+
+        if (from->e[k].start < pos) {
+            i = k;
+            continue;
+        }
+    }
+    // i==j or i==j-1. Check if j is better.
+    if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
+        i = j;
+
+    /* The above found *a* bin overlapping, but not necessarily the first */
+    while (i > 0 && from->e[i-1].end >= pos)
+        i--;
+
+    /* We may be one bin before the optimum, so check */
+    while (i+1 < from->nslice &&
+           (from->e[i].refid < refid ||
+            from->e[i].end < pos))
+        i++;
+
+    e = &from->e[i];
+
+    return e;
+}
+
+
+/*
+ * Skips to a container overlapping the start coordinate listed in
+ * cram_range.
+ *
+ * In theory we call cram_index_query multiple times, once per slice
+ * overlapping the range.  However slices may be absent from the index
+ * which makes this problematic.  Instead we find the left-most slice
+ * and then read from then on, skipping decoding of slices and/or
+ * whole containers when they don't overlap the specified cram_range.
+ *
+ * Any container currently held in fd->ctr is freed after a successful
+ * seek.
+ *
+ * Returns 0 on success
+ *        -1 on general failure
+ *        -2 on no-data (empty chromosome)
+ */
+int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
+    // Ideally use an index, so see if we have one.
+    cram_index *hit = cram_index_query(fd, r->refid, r->start, NULL);
+
+    // Absent from index, but this most likely means it simply has no data.
+    if (!hit)
+        return -2;
+
+    // Try an absolute seek first; only if that fails, try a relative one.
+    if (cram_seek(fd, hit->offset, SEEK_SET) != 0 &&
+        cram_seek(fd, hit->offset - fd->first_container, SEEK_CUR) != 0)
+        return -1;
+
+    if (!fd->ctr)
+        return 0;
+
+    cram_free_container(fd->ctr);
+    fd->ctr = NULL;
+    fd->ooc = 0;
+
+    return 0;
+}
+
+
+/*
+ * A specialised form of cram_index_build (below) that deals with slices
+ * having multiple references in this (ref_id -2).  In this scenario we
+ * decode the slice to look at the RI data series instead.
+ *
+ * The decoded records are walked in order and one index line is
+ * written per run of consecutive records sharing a reference id.
+ * Each line holds the reference id, the start of the run's first
+ * record, the span up to the furthest end seen in the run, and then
+ * cpos, landmark and sz as supplied by the caller.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int cram_index_build_multiref(cram_fd *fd,
+                                     cram_container *c,
+                                     cram_slice *s,
+                                     zfp *fp,
+                                     off_t cpos,
+                                     int32_t landmark,
+                                     int sz) {
+    int i, ref = -2, ref_start = 0, ref_end;
+    char buf[1024];
+
+    if (0 != cram_decode_slice(fd, c, s, fd->header))
+        return -1;
+
+    ref_end = INT_MIN;
+    for (i = 0; i < s->hdr->num_records; i++) {
+        if (s->crecs[i].ref_id == ref) {
+            if (ref_end < s->crecs[i].aend)
+                ref_end = s->crecs[i].aend;
+            continue;
+        }
+
+        /* Reference changed: flush the run just finished, if any. */
+        if (ref != -2) {
+            snprintf(buf, sizeof(buf), "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
+                     ref, ref_start, ref_end - ref_start + 1,
+                     (int64_t)cpos, landmark, sz);
+            zfputs(buf, fp);
+        }
+
+        /* Start a new run.  Its end must be seeded from this record,
+         * otherwise a run of one record would keep the INT_MIN
+         * placeholder and produce a nonsense span. */
+        ref = s->crecs[i].ref_id;
+        ref_start = s->crecs[i].apos;
+        ref_end = s->crecs[i].aend;
+    }
+
+    if (ref != -2) {
+        snprintf(buf, sizeof(buf), "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
+                 ref, ref_start, ref_end - ref_start + 1,
+                 (int64_t)cpos, landmark, sz);
+        zfputs(buf, fp);
+    }
+
+    return 0;
+}
+
+/*
+ * Builds an index file.
+ *
+ * fd is a newly opened cram file that we wish to index.
+ * fn_base is the filename of the associated CRAM file.
+ * fn_idx is the filename of the index file to be written;
+ * if NULL, we add ".crai" to fn_base to get the index filename.
+ *
+ * Containers are read from fd's current position until no more can be
+ * read.  One line is written per slice, or per reference within a
+ * multi-reference slice.  On failure the output file is closed but not
+ * removed.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
+    cram_container *c = NULL;
+    off_t cpos, spos, hpos;
+    zfp *fp;
+    kstring_t fn_idx_str = {0};
+
+    if (! fn_idx) {
+        if (kputs(fn_base, &fn_idx_str) < 0 ||
+            kputs(".crai", &fn_idx_str) < 0) {
+            free(fn_idx_str.s);
+            return -1;
+        }
+        fn_idx = fn_idx_str.s;
+    }
+
+    if (!(fp = zfopen(fn_idx, "wz"))) {
+        perror(fn_idx);
+        free(fn_idx_str.s);
+        return -1;
+    }
+
+    free(fn_idx_str.s);
+
+    cpos = htell(fd->fp);
+    while ((c = cram_read_container(fd))) {
+        int j;
+
+        if (fd->err) {
+            perror("Cram container read");
+            goto fail;
+        }
+
+        hpos = htell(fd->fp);
+
+        /* Both header objects hang off the container, so freeing the
+         * container is expected to release them as well (the success
+         * path below relies on the same thing). */
+        if (!(c->comp_hdr_block = cram_read_block(fd)))
+            goto fail;
+        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
+
+        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
+        if (!c->comp_hdr)
+            goto fail;
+
+        // 2.0 format
+        for (j = 0; j < c->num_landmarks; j++) {
+            char buf[1024];
+            cram_slice *s;
+            int sz, ret = 0;
+
+            spos = htell(fd->fp);
+            assert(spos - cpos - c->offset == c->landmark[j]);
+
+            if (!(s = cram_read_slice(fd)))
+                goto fail;
+
+            sz = (int)(htell(fd->fp) - spos);
+
+            if (s->hdr->ref_seq_id == -2) {
+                ret = cram_index_build_multiref(fd, c, s, fp,
+                                                cpos, c->landmark[j], sz);
+            } else {
+                snprintf(buf, sizeof(buf),
+                         "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
+                         s->hdr->ref_seq_id, s->hdr->ref_seq_start,
+                         s->hdr->ref_seq_span, (int64_t)cpos,
+                         c->landmark[j], sz);
+                zfputs(buf, fp);
+            }
+
+            cram_free_slice(s);
+
+            /* A slice that could not be decoded must fail the build
+             * rather than silently leave a hole in the index. */
+            if (ret != 0)
+                goto fail;
+        }
+
+        cpos = htell(fd->fp);
+        assert(cpos == hpos + c->length);
+
+        cram_free_container(c);
+    }
+    /* c is NULL here: the loop only ends when no container was read. */
+    if (fd->err)
+        goto fail;
+
+    return (zfclose(fp) >= 0)? 0 : -1;
+
+ fail:
+    if (c)
+        cram_free_container(c);
+    zfclose(fp);
+    return -1;
+}
diff --git a/htslib/cram/cram_index.h b/htslib/cram/cram_index.h
new file mode 100644
index 0000000..c0429e0
--- /dev/null
+++ b/htslib/cram/cram_index.h
@@ -0,0 +1,99 @@
+/*
+Copyright (c) 2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _CRAM_INDEX_H_
+#define _CRAM_INDEX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Loads a CRAM .crai index into memory.
+ * Returns 0 for success
+ * -1 for failure
+ */
+int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx);
+
+void cram_index_free(cram_fd *fd);
+
+/*
+ * Searches the index for the first slice overlapping a reference ID
+ * and position.
+ *
+ * Returns the cram_index pointer on success
+ * NULL on failure
+ */
+cram_index *cram_index_query(cram_fd *fd, int refid, int pos, cram_index *frm);
+
+/*
+ * Skips to a container overlapping the start coordinate listed in
+ * cram_range.
+ *
+ * Returns 0 on success
+ * -1 on general failure
+ * -2 on no-data (empty chromosome)
+ */
+int cram_seek_to_refpos(cram_fd *fd, cram_range *r);
+
+void cram_index_free(cram_fd *fd);
+
+/*
+ * Skips to a container overlapping the start coordinate listed in
+ * cram_range.
+ *
+ * In theory we call cram_index_query multiple times, once per slice
+ * overlapping the range. However slices may be absent from the index
+ * which makes this problematic. Instead we find the left-most slice
+ * and then read from then on, skipping decoding of slices and/or
+ * whole containers when they don't overlap the specified cram_range.
+ *
+ * Returns 0 on success
+ * -1 on general failure
+ * -2 on no-data (empty chromosome)
+ */
+int cram_seek_to_refpos(cram_fd *fd, cram_range *r);
+
+/*
+ * Builds an index file.
+ *
+ * fd is a newly opened cram file that we wish to index.
+ * fn_base is the filename of the associated CRAM file.
+ * fn_idx is the filename of the index file to be written;
+ * if NULL, we add ".crai" to fn_base to get the index filename.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/cram/cram_io.c b/htslib/cram/cram_io.c
new file mode 100644
index 0000000..a1f51a0
--- /dev/null
+++ b/htslib/cram/cram_io.c
@@ -0,0 +1,4604 @@
+/*
+Copyright (c) 2012-2014 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * CRAM I/O primitives.
+ *
+ * - ITF8 encoding and decoding.
+ * - Block based I/O
+ * - Zlib inflating and deflating (memory)
+ * - CRAM basic data structure reading and writing
+ * - File opening / closing
+ * - Reference sequence handling
+ */
+
+/*
+ * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
+ * a way to return errors for when malloc fails.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+#ifdef HAVE_LIBBZ2
+#include <bzlib.h>
+#endif
+#ifdef HAVE_LIBLZMA
+#include <lzma.h>
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "cram/cram.h"
+#include "cram/os.h"
+#include "htslib/hts.h"
+#include "cram/open_trace_file.h"
+#include "cram/rANS_static.h"
+
+//#define REF_DEBUG
+
+#ifdef REF_DEBUG
+#include <sys/syscall.h>
+#define gettid() (int)syscall(SYS_gettid)
+
+#define RP(...) fprintf (stderr, __VA_ARGS__)
+#else
+#define RP(...)
+#endif
+
+#include "htslib/hfile.h"
+#include "htslib/bgzf.h"
+#include "htslib/faidx.h"
+
+#define TRIAL_SPAN 50
+#define NTRIALS 3
+
+
+/* ----------------------------------------------------------------------
+ * ITF8 encoding and decoding.
+ *
+* Also see the itf8_get and itf8_put macros in cram_io.h
+ */
+
+/*
+ * LEGACY: consider using itf8_decode_crc.
+ *
+ * Reads an integer in ITF-8 encoding from fd->fp and stores it in
+ * *val_p.
+ *
+ * The top four bits of the first byte select, via nbytes[], how many
+ * continuation bytes follow (0 to 4); nbits[] masks off the payload
+ * bits kept from that first byte.
+ *
+ * Returns the number of bytes read on success
+ *        -1 on failure (EOF on the first byte, or a value that is cut
+ *           short); *val_p is left untouched in that case.
+ */
+int itf8_decode(cram_fd *fd, int32_t *val_p) {
+    static const int nbytes[16] = {
+        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
+        1,1,1,1,                                        // 1000xxxx - 1011xxxx
+        2,2,                                            // 1100xxxx - 1101xxxx
+        3,                                              // 1110xxxx
+        4,                                              // 1111xxxx
+    };
+
+    static const int nbits[16] = {
+        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
+        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
+        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
+        0x0f,                                           // 1110xxxx
+        0x0f,                                           // 1111xxxx
+    };
+
+    unsigned char c[4];
+    uint32_t val;   /* unsigned so the shifts below cannot overflow */
+    int first, extra, i;
+
+    first = hgetc(fd->fp);
+    if (first == -1)
+        return -1;
+
+    extra = nbytes[first>>4];
+    val   = first & nbits[first>>4];
+
+    /* Fetch all continuation bytes at once; a short read means the
+     * value was truncated and must not be reported as a success. */
+    if (extra > 0 && hread(fd->fp, c, extra) != extra)
+        return -1;
+
+    if (extra == 4) {
+        /* Really 3.5 more bytes: only the low nibble of the last one
+         * is used. */
+        val = (val<<8) | c[0];
+        val = (val<<8) | c[1];
+        val = (val<<8) | c[2];
+        val = (val<<4) | (c[3] & 0x0f);
+    } else {
+        for (i = 0; i < extra; i++)
+            val = (val<<8) | c[i];
+    }
+
+    *val_p = val;
+    return extra + 1;
+}
+
+/*
+ * Reads an integer in ITF-8 encoding from fd->fp, stores it in *val_p
+ * and folds every byte consumed into the running CRC32 held in *crc.
+ *
+ * The top four bits of the first byte select, via nbytes[], how many
+ * continuation bytes follow (0 to 4); nbits[] masks off the payload
+ * bits kept from that first byte.
+ *
+ * Returns the number of bytes read on success
+ *        -1 on failure (EOF on the first byte, or a value that is cut
+ *           short); *val_p and *crc are left untouched in that case.
+ */
+int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
+    static const int nbytes[16] = {
+        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
+        1,1,1,1,                                        // 1000xxxx - 1011xxxx
+        2,2,                                            // 1100xxxx - 1101xxxx
+        3,                                              // 1110xxxx
+        4,                                              // 1111xxxx
+    };
+
+    static const int nbits[16] = {
+        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
+        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
+        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
+        0x0f,                                           // 1110xxxx
+        0x0f,                                           // 1111xxxx
+    };
+
+    unsigned char c[5];
+    uint32_t val;   /* unsigned so the shifts below cannot overflow */
+    int first, extra, i;
+
+    first = hgetc(fd->fp);
+    if (first == -1)
+        return -1;
+
+    c[0]  = first;
+    extra = nbytes[first>>4];
+    val   = first & nbits[first>>4];
+
+    /* Fetch all continuation bytes at once; a short read means the
+     * value was truncated and must not be reported as a success. */
+    if (extra > 0 && hread(fd->fp, &c[1], extra) != extra)
+        return -1;
+
+    if (extra == 4) {
+        /* Really 3.5 more bytes: only the low nibble of the last one
+         * is used. */
+        val = (val<<8) | c[1];
+        val = (val<<8) | c[2];
+        val = (val<<8) | c[3];
+        val = (val<<4) | (c[4] & 0x0f);
+    } else {
+        for (i = 1; i <= extra; i++)
+            val = (val<<8) | c[i];
+    }
+
+    *val_p = val;
+    *crc = crc32(*crc, c, extra + 1);
+    return extra + 1;
+}
+
+/*
+ * Writes a single integer to fd->fp in ITF-8 format.
+ *
+ * The value is first encoded into a 5 byte scratch buffer with
+ * itf8_put and the resulting bytes are then written out.
+ *
+ * Returns 0 on success
+ *        -1 on failure (not all encoded bytes were written)
+ */
+int itf8_encode(cram_fd *fd, int32_t val) {
+    char encoded[5];
+    int nbytes = itf8_put(encoded, val);
+
+    if (hwrite(fd->fp, encoded, nbytes) != nbytes)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Lookup table of 16 byte counts, each between 1 and 5.
+ *
+ * NOTE(review): presumably indexed by the top four bits of the first
+ * byte of an ITF-8 value to give its total encoded length (each entry
+ * equals the matching nbytes[] entry in itf8_decode plus one) --
+ * confirm against the users of this table.
+ */
+const int itf8_bytes[16] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 4, 5
+};
+
+#ifndef ITF8_MACROS
+/*
+ * Decodes an ITF-8 value from memory at 'cp' and stores it in *val_p.
+ *
+ * The leading 1 bits of the first byte give the number of continuation
+ * bytes.  In the 5 byte form only the low nibble of the first and of
+ * the last byte carry data.  The caller must guarantee that all the
+ * bytes of the encoded value are readable; no bounds are checked here.
+ *
+ * All arithmetic is done in uint32_t: shifting the promoted (signed
+ * int) byte by 24 or 28 bits would overflow into the sign bit.
+ *
+ * Returns the number of bytes consumed (1 to 5).
+ */
+int itf8_get(char *cp, int32_t *val_p) {
+    const unsigned char *up = (const unsigned char *)cp;
+    uint32_t b0 = up[0];
+
+    if (b0 < 0x80) {
+        *val_p = b0;
+        return 1;
+    }
+
+    if (b0 < 0xc0) {
+        *val_p = ((b0 << 8) | up[1]) & 0x3fff;
+        return 2;
+    }
+
+    if (b0 < 0xe0) {
+        *val_p = ((b0 << 16) |
+                  ((uint32_t)up[1] << 8) |
+                  up[2]) & 0x1fffff;
+        return 3;
+    }
+
+    if (b0 < 0xf0) {
+        *val_p = ((b0 << 24) |
+                  ((uint32_t)up[1] << 16) |
+                  ((uint32_t)up[2] << 8) |
+                  up[3]) & 0x0fffffff;
+        return 4;
+    }
+
+    *val_p = ((b0 & 0x0f) << 28) |
+             ((uint32_t)up[1] << 20) |
+             ((uint32_t)up[2] << 12) |
+             ((uint32_t)up[3] << 4) |
+             (up[4] & 0x0f);
+    return 5;
+}
+
+/*
+ * Encodes 'val' in ITF-8 format into the buffer at 'cp'.
+ *
+ * The value is treated as a 32-bit pattern, so negative numbers always
+ * take the 5 byte form.  The buffer must have room for 5 bytes.
+ *
+ * Returns the number of bytes written (1 to 5).
+ */
+int itf8_put(char *cp, int32_t val) {
+    uint32_t u = (uint32_t)val;
+
+    if (u < 0x80) {                     // fits in 7 bits
+        cp[0] = u;
+        return 1;
+    }
+
+    if (u < 0x4000) {                   // fits in 14 bits
+        cp[0] = (u >> 8) | 0x80;
+        cp[1] = u & 0xff;
+        return 2;
+    }
+
+    if (u < 0x200000) {                 // fits in 21 bits
+        cp[0] = (u >> 16) | 0xc0;
+        cp[1] = (u >> 8) & 0xff;
+        cp[2] = u & 0xff;
+        return 3;
+    }
+
+    if (u < 0x10000000) {               // fits in 28 bits
+        cp[0] = (u >> 24) | 0xe0;
+        cp[1] = (u >> 16) & 0xff;
+        cp[2] = (u >> 8) & 0xff;
+        cp[3] = u & 0xff;
+        return 4;
+    }
+
+    // Full 32 bits: the final byte only carries the low nibble.
+    cp[0] = 0xf0 | ((u >> 28) & 0xff);
+    cp[1] = (u >> 20) & 0xff;
+    cp[2] = (u >> 12) & 0xff;
+    cp[3] = (u >> 4) & 0xff;
+    cp[4] = u & 0x0f;
+    return 5;
+}
+#endif
+
+/*
+ * 64-bit itf8 variant: encodes 'val' into the buffer at 'cp'.
+ *
+ * A value needing k continuation bytes (k = 0..7) must fit in 7*(k+1)
+ * bits; its first byte is a prefix of k leading 1 bits combined with
+ * the top payload bits, followed by the remaining bytes most
+ * significant first.  Anything larger (including every negative
+ * value) is written as 0xff followed by all 8 bytes of the value.
+ * The buffer must have room for 9 bytes.
+ *
+ * Returns the number of bytes written (1 to 9).
+ */
+int ltf8_put(char *cp, int64_t val) {
+    static const unsigned char lead[8] = {
+        0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
+    };
+    uint64_t u = (uint64_t)val;
+    int extra = 0;      /* number of continuation bytes */
+    int shift;
+
+    while (extra < 8 && (u >> (7 * (extra + 1))) != 0)
+        extra++;
+
+    if (extra == 8)
+        *cp++ = 0xff;
+    else
+        *cp++ = (u >> (8 * extra)) | lead[extra];
+
+    for (shift = 8 * (extra - 1); shift >= 0; shift -= 8)
+        *cp++ = (u >> shift) & 0xff;
+
+    return extra + 1;
+}
+
+/*
+ * Decodes a 64-bit LTF-8 value from memory at 'cp' into *val_p.
+ *
+ * The number of leading 1 bits in the first byte (0 to 8) is the
+ * number of continuation bytes; whatever bits of the first byte lie
+ * below that prefix and its terminating 0 bit are the most significant
+ * payload bits.  Continuation bytes follow most significant first.
+ * The caller must guarantee that all the bytes of the encoded value
+ * are readable; no bounds are checked here.
+ *
+ * Returns the number of bytes consumed (1 to 9).
+ */
+int ltf8_get(char *cp, int64_t *val_p) {
+    const unsigned char *up = (const unsigned char *)cp;
+    unsigned int first = up[0];
+    uint64_t acc;
+    int extra = 0;
+    int i;
+
+    while (extra < 8 && (first & (0x80 >> extra)))
+        extra++;
+
+    /* 0x7f >> extra keeps exactly the payload bits of the first byte;
+     * it is zero once the prefix fills the whole byte. */
+    acc = first & (0x7f >> extra);
+
+    for (i = 1; i <= extra; i++)
+        acc = (acc << 8) | up[i];
+
+    *val_p = acc;
+    return extra + 1;
+}
+
+/*
+ * LEGACY: consider using ltf8_decode_crc.
+ *
+ * Reads a 64-bit integer in LTF-8 encoding from fd->fp and stores it
+ * in *val_p.
+ *
+ * The encoded bytes are gathered into a local buffer and handed to
+ * ltf8_get, so the stream and in-memory decoders cannot drift apart.
+ *
+ * Returns the number of bytes read on success
+ *        -1 on failure (EOF on the first byte, or a value that is cut
+ *           short); *val_p is left untouched in that case.
+ */
+int ltf8_decode(cram_fd *fd, int64_t *val_p) {
+    unsigned char buf[9];
+    int first, extra = 0;
+
+    first = hgetc(fd->fp);
+    if (first == -1)
+        return -1;
+    buf[0] = first;
+
+    /* The number of leading 1 bits in the first byte is the number of
+     * continuation bytes (0 to 8). */
+    while (extra < 8 && (first & (0x80 >> extra)))
+        extra++;
+
+    /* A short read means the value was truncated and must not be
+     * reported as a success. */
+    if (extra > 0 && hread(fd->fp, &buf[1], extra) != extra)
+        return -1;
+
+    return ltf8_get((char *)buf, val_p);
+}
+
+/*
+ * Reads a 64-bit integer in LTF-8 encoding from fd->fp, stores it in
+ * *val_p and folds every byte consumed into the running CRC32 held in
+ * *crc.
+ *
+ * Returns the number of bytes read on success
+ *        -1 on failure (EOF on the first byte, or a value that is cut
+ *           short); *val_p and *crc are left untouched in that case.
+ */
+int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
+    unsigned char c[9];
+    int first, extra = 0;
+
+    /* Test for EOF on the int returned by hgetc: once narrowed to an
+     * unsigned char, -1 becomes 255 and can no longer be detected. */
+    first = hgetc(fd->fp);
+    if (first == -1)
+        return -1;
+    c[0] = first;
+
+    /* The number of leading 1 bits in the first byte is the number of
+     * continuation bytes (0 to 8). */
+    while (extra < 8 && (first & (0x80 >> extra)))
+        extra++;
+
+    /* A short read means the value was truncated and must not be
+     * reported as a success. */
+    if (extra > 0 && hread(fd->fp, &c[1], extra) != extra)
+        return -1;
+
+    *crc = crc32(*crc, c, extra + 1);
+    return ltf8_get((char *)c, val_p);
+}
+
+/*
+ * Appends 'val', encoded in ITF8 format, to the end of block 'blk'.
+ * Not the fastest method, so avoid it for high-volume data.
+ *
+ * Returns the number of bytes the encoding occupies.
+ */
+int itf8_put_blk(cram_block *blk, int val) {
+    char encoded[5];
+    int nbytes = itf8_put(encoded, val);
+
+    BLOCK_APPEND(blk, encoded, nbytes);
+
+    return nbytes;
+}
+
+/*
+ * Reads 4 bytes from fd->fp and stores them in *val after conversion
+ * from little endian with le_int4.
+ *
+ * Returns the number of bytes read (4) on success
+ *        -1 on failure (fewer than 4 bytes were read)
+ */
+int int32_decode(cram_fd *fd, int32_t *val) {
+    int32_t raw;
+
+    if (hread(fd->fp, &raw, 4) != 4)
+        return -1;
+
+    *val = le_int4(raw);
+
+    return 4;
+}
+
+/*
+ * Converts 'val' to little endian with le_int4 and writes the 4 bytes
+ * to fd->fp.
+ *
+ * Returns the number of bytes written (4) on success
+ *        -1 on failure (fewer than 4 bytes were written)
+ */
+int int32_encode(cram_fd *fd, int32_t val) {
+    int32_t wire = le_int4(val);
+
+    return hwrite(fd->fp, &wire, 4) == 4 ? 4 : -1;
+}
+
+/*
+ * As int32_decode, but reading from a block instead of a cram_fd.
+ *
+ * Reads the 4 bytes at b->data[b->byte], least significant first,
+ * into *val and advances BLOCK_SIZE(b) by 4.
+ *
+ * NOTE(review): the bounds check and the advance go through
+ * BLOCK_SIZE(b) while the reads index with b->byte; presumably the
+ * macro expands to b->byte -- confirm in cram_io.h.
+ *
+ * Returns 4 on success
+ *        -1 if fewer than 4 bytes remain before b->uncomp_size
+ */
+int int32_get_blk(cram_block *b, int32_t *val) {
+    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
+        return -1;
+
+    /* Widen to uint32_t before shifting: a byte >= 0x80 shifted left
+     * by 24 as a (promoted) signed int would overflow. */
+    *val =
+        (uint32_t)b->data[b->byte  ]        |
+        ((uint32_t)b->data[b->byte+1] <<  8) |
+        ((uint32_t)b->data[b->byte+2] << 16) |
+        ((uint32_t)b->data[b->byte+3] << 24);
+    BLOCK_SIZE(b) += 4;
+    return 4;
+}
+
+/*
+ * As int32_encode, but appending to a block instead of writing to a
+ * cram_fd.  The 4 bytes are stored least significant first.
+ *
+ * Returns 0 if b->data is non-NULL after the append
+ *        -1 otherwise
+ */
+int int32_put_blk(cram_block *b, int32_t val) {
+    unsigned char le[4];
+    int i;
+
+    for (i = 0; i < 4; i++)
+        le[i] = (val >> (8 * i)) & 0xff;
+
+    BLOCK_APPEND(b, le, 4);
+
+    if (!b->data)
+        return -1;
+
+    return 0;
+}
+
+/* ----------------------------------------------------------------------
+ * zlib compression code - from Gap5's tg_iface_g.c
+ * They're static here as they're only used within the cram_compress_block
+ * and cram_uncompress_block functions, which are the external interface.
+ */
+/*
+ * Inflates the 'csize' bytes of compressed data at 'cdata' into a
+ * newly allocated buffer.
+ *
+ * The stream is opened with windowBits 15 + 32, which asks zlib to
+ * auto-detect a zlib or a gzip header.  The output buffer starts at
+ * roughly 1.2x the input size and is grown as decoding proceeds.
+ *
+ * Returns the malloc()ed output (caller frees) with its length stored
+ *         in *size on success
+ *         NULL on failure; *size is not written in that case
+ */
+char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
+    z_stream s;
+    unsigned char *data = NULL; /* Uncompressed output */
+    int data_alloc = 0;
+    int err;
+
+    /* Starting point at uncompressed size, and scale after that */
+    data = malloc(data_alloc = csize*1.2+100);
+    if (!data)
+        return NULL;
+
+    /* Initialise zlib stream */
+    s.zalloc = Z_NULL; /* use default allocation functions */
+    s.zfree  = Z_NULL;
+    s.opaque = Z_NULL;
+    s.next_in  = (unsigned char *)cdata;
+    s.avail_in = csize;
+    s.total_in = 0;
+    s.next_out  = data;
+    s.avail_out = data_alloc;
+    s.total_out = 0;
+
+    err = inflateInit2(&s, 15 + 32);
+    if (err != Z_OK) {
+        fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
+        free(data);
+        return NULL;
+    }
+
+    /* Decode to 'data' array */
+    for (;s.avail_in;) {
+        unsigned char *data_tmp;
+        int alloc_inc;
+
+        s.next_out = &data[s.total_out];
+        err = inflate(&s, Z_NO_FLUSH);
+        if (err == Z_STREAM_END)
+            break;
+
+        if (err != Z_OK) {
+            fprintf(stderr, "zlib inflate error: %s\n", s.msg);
+            goto fail;
+        }
+
+        /* More to come, so realloc based on growth so far */
+        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
+        data_tmp = realloc(data, data_alloc += alloc_inc);
+        if (!data_tmp)
+            goto fail;
+        data = data_tmp;
+        s.avail_out += alloc_inc;
+    }
+    inflateEnd(&s);
+
+    *size = s.total_out;
+    return (char *)data;
+
+ fail:
+    /* Release zlib's internal state as well as our buffer; previously
+     * the error paths returned without calling inflateEnd. */
+    inflateEnd(&s);
+    free(data);
+    return NULL;
+}
+
+/*
+ * Deflates the 'size' bytes at 'data' into a newly allocated buffer
+ * using compression level 'level' and zlib strategy 'strat'.
+ *
+ * The stream is opened with windowBits 15|16, i.e. zlib writes a gzip
+ * wrapper.  The output buffer is sized once at roughly 1.05x the input
+ * plus 100 bytes and is never grown; running out of room is an error.
+ *
+ * Returns the malloc()ed output (caller frees) with its length stored
+ *         in *cdata_size on success
+ *         NULL on failure; nothing is leaked and *cdata_size is not
+ *         written in that case
+ */
+static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
+                              int level, int strat) {
+    z_stream s;
+    unsigned char *cdata = NULL; /* Compressed output */
+    int cdata_alloc = 0;
+    int cdata_pos = 0;
+    int err;
+
+    cdata = malloc(cdata_alloc = size*1.05+100);
+    if (!cdata)
+        return NULL;
+
+    /* Initialise zlib stream */
+    s.zalloc = Z_NULL; /* use default allocation functions */
+    s.zfree  = Z_NULL;
+    s.opaque = Z_NULL;
+    s.next_in  = (unsigned char *)data;
+    s.avail_in = size;
+    s.total_in = 0;
+    s.next_out  = cdata;
+    s.avail_out = cdata_alloc;
+    s.total_out = 0;
+    s.data_type = Z_BINARY;
+
+    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
+    if (err != Z_OK) {
+        fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
+        free(cdata);
+        return NULL;
+    }
+
+    /* Encode to 'cdata' array */
+    for (;s.avail_in;) {
+        s.next_out = &cdata[cdata_pos];
+        s.avail_out = cdata_alloc - cdata_pos;
+        if (cdata_alloc - cdata_pos <= 0) {
+            fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
+            goto fail;
+        }
+        err = deflate(&s, Z_NO_FLUSH);
+        cdata_pos = cdata_alloc - s.avail_out;
+        if (err != Z_OK) {
+            fprintf(stderr, "zlib deflate error: %s\n", s.msg);
+            goto fail;
+        }
+    }
+
+    /* Anything other than Z_STREAM_END means the stream is incomplete,
+     * so the buffer must not be handed back as valid output. */
+    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
+        fprintf(stderr, "zlib deflate error: %s\n", s.msg);
+        goto fail;
+    }
+    *cdata_size = s.total_out;
+
+    if (deflateEnd(&s) != Z_OK) {
+        fprintf(stderr, "zlib deflate error: %s\n", s.msg);
+    }
+    return (char *)cdata;
+
+ fail:
+    deflateEnd(&s);
+    free(cdata);
+    return NULL;
+}
+
+#ifdef HAVE_LIBLZMA
+/* ------------------------------------------------------------------------ */
+/*
+ * Data compression routines using liblzma (xz)
+ *
+ * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
+ * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
+ * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
+ * as compression times.
+ *
+ * For now we disable this functionality. If it's to be reenabled make sure you
+ * improve the mem_inflate implementation as it's just a test hack at the
+ * moment.
+ */
+
+/*
+ * Compresses the 'size' bytes at 'data' with liblzma in a single call,
+ * using preset 'level' and a CRC32 integrity check.
+ *
+ * The output buffer is sized with lzma_stream_buffer_bound(size).
+ *
+ * Returns the malloc()ed output (caller frees) with its length stored
+ *         in *cdata_size on success
+ *         NULL on allocation or encoding failure; *cdata_size is 0 on
+ *         allocation failure
+ */
+static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
+                              int level) {
+    char *out;
+    size_t out_size = lzma_stream_buffer_bound(size);
+    *cdata_size = 0;
+
+    out = malloc(out_size);
+    if (!out)
+        return NULL;
+
+    /* Single call compression */
+    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
+                                           (uint8_t *)data, size,
+                                           (uint8_t *)out, cdata_size,
+                                           out_size)) {
+        free(out);
+        return NULL;
+    }
+
+    return out;
+}
+
+/*
+ * Decompresses the 'csize' bytes of xz data at 'cdata' into a newly
+ * allocated buffer, growing it as needed.
+ *
+ * The decoder is created with a memory limit of 50000000 bytes.
+ *
+ * Returns the malloc()ed output (caller frees) with its length stored
+ *         in *size on success
+ *         NULL on failure; the decoder and any partial output are
+ *         released and *size is not written
+ */
+static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
+    lzma_stream strm = LZMA_STREAM_INIT;
+    size_t out_size = 0, out_pos = 0;
+    char *out = NULL, *tmp;
+    int r;
+
+    /* Initiate the decoder */
+    if (LZMA_OK != lzma_stream_decoder(&strm, 50000000, 0))
+        return NULL;
+
+    /* Decode loop */
+    strm.avail_in = csize;
+    strm.next_in = (uint8_t *)cdata;
+
+    for (;strm.avail_in;) {
+        if (strm.avail_in > out_size - out_pos) {
+            out_size += strm.avail_in * 4 + 32768;
+            /* Keep the old pointer until realloc is known to have
+             * succeeded so it can still be freed on failure. */
+            tmp = realloc(out, out_size);
+            if (!tmp)
+                goto fail;
+            out = tmp;
+        }
+        strm.avail_out = out_size - out_pos;
+        strm.next_out = (uint8_t *)&out[out_pos];
+
+        r = lzma_code(&strm, LZMA_RUN);
+        if (LZMA_OK != r && LZMA_STREAM_END != r) {
+            fprintf(stderr, "r=%d\n", r);
+            fprintf(stderr, "mem=%"PRId64"\n", (int64_t)lzma_memusage(&strm));
+            goto fail;
+        }
+
+        out_pos = strm.total_out;
+
+        if (r == LZMA_STREAM_END)
+            break;
+    }
+
+    /* finish up any unflushed data; necessary? */
+    r = lzma_code(&strm, LZMA_FINISH);
+    if (r != LZMA_OK && r != LZMA_STREAM_END) {
+        fprintf(stderr, "r=%d\n", r);
+        goto fail;
+    }
+
+    /* Trim to the exact size.  If shrinking fails the larger buffer is
+     * still valid, so carry on with it. */
+    if (strm.total_out > 0) {
+        tmp = realloc(out, strm.total_out);
+        if (tmp)
+            out = tmp;
+    }
+    *size = strm.total_out;
+
+    lzma_end(&strm);
+
+    return out;
+
+ fail:
+    lzma_end(&strm);
+    free(out);
+    return NULL;
+}
+#endif
+
+/* ----------------------------------------------------------------------
+ * CRAM blocks - the dynamically growable data block. We have code to
+ * create, update, (un)compress and read/write.
+ *
+ * These are derived from the deflate_interlaced.c blocks, but with the
+ * CRAM extension of content types and IDs.
+ */
+
+/*
+ * Creates an empty cram_block carrying the given content_type and
+ * content id.
+ *
+ * The new block is RAW, owns no data, has both sizes set to zero and
+ * its read/write cursor at byte 0, bit 7.
+ *
+ * Returns block pointer on success
+ *         NULL on failure (out of memory)
+ */
+cram_block *cram_new_block(enum cram_content_type content_type,
+                           int content_id) {
+    cram_block *blk = malloc(sizeof(*blk));
+
+    if (blk == NULL)
+        return NULL;
+
+    /* Identity of the block. */
+    blk->content_type = content_type;
+    blk->content_id   = content_id;
+
+    /* No payload yet, and nothing compressed. */
+    blk->orig_method = RAW;
+    blk->method      = RAW;
+    blk->data        = NULL;
+    blk->alloc       = 0;
+    blk->comp_size   = 0;
+    blk->uncomp_size = 0;
+
+    /* Cursor at the first byte, most significant bit. */
+    blk->byte = 0;
+    blk->bit  = 7;
+
+    return blk;
+}
+
+/*
+ * Reads a block from a cram file.
+ *
+ * The header (method, content type, content id, compressed size and
+ * uncompressed size) is read first, then the payload.  For CRAM major
+ * version 3 and above a trailing CRC32 is read and checked against one
+ * computed over the header and payload bytes.
+ *
+ * The payload is stored as read; it is not uncompressed here.
+ *
+ * Returns cram_block pointer on success.
+ *         NULL on failure (I/O error, out of memory, a negative size
+ *         field or a CRC mismatch); nothing is leaked.
+ */
+cram_block *cram_read_block(cram_fd *fd) {
+    cram_block *b = malloc(sizeof(*b));
+    unsigned char c;
+    uint32_t crc = 0;
+    int32_t payload;
+
+    if (!b)
+        return NULL;
+
+    /* Lets the shared error path free b->data unconditionally. */
+    b->data = NULL;
+
+    if (-1 == (b->method      = hgetc(fd->fp))) goto fail;
+    c = b->method;       crc = crc32(crc, &c, 1);
+    if (-1 == (b->content_type= hgetc(fd->fp))) goto fail;
+    c = b->content_type; crc = crc32(crc, &c, 1);
+    if (-1 == itf8_decode_crc(fd, &b->content_id, &crc))  goto fail;
+    if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc))   goto fail;
+    if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) goto fail;
+
+    /* RAW blocks carry uncomp_size bytes of payload; every other
+     * method carries comp_size bytes. */
+    payload = (b->method == RAW) ? b->uncomp_size : b->comp_size;
+    if (payload < 0)
+        goto fail;
+
+    b->alloc = payload;
+    if (!(b->data = malloc(payload)))
+        goto fail;
+    if (payload != hread(fd->fp, b->data, payload))
+        goto fail;
+
+    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+        /* Previously this path freed b but not b->data. */
+        if (-1 == int32_decode(fd, (int32_t *)&b->crc32))
+            goto fail;
+
+        crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
+        if (crc != b->crc32) {
+            fprintf(stderr, "Block CRC32 failure\n");
+            goto fail;
+        }
+    }
+
+    b->orig_method = b->method;
+    b->idx = 0;
+    b->byte = 0;
+    b->bit = 7; // MSB
+
+    return b;
+
+ fail:
+    free(b->data);
+    free(b);
+    return NULL;
+}
+
+
+/*
+ * Computes the size of a cram block, including the block
+ * header itself.
+ *
+ * The header is encoded into a scratch buffer purely to measure its
+ * length.  The result is that length, plus 4, plus the payload length
+ * (uncomp_size for RAW blocks, comp_size otherwise).
+ *
+ * NOTE(review): the extra 4 is presumably the trailing CRC32 that
+ * cram_write_block emits for CRAM v3+; it is added unconditionally
+ * here.
+ */
+uint32_t cram_block_size(cram_block *b) {
+    unsigned char hdr[100];
+    unsigned char *end = hdr;
+    uint32_t total;
+
+    *end++ = b->method;
+    *end++ = b->content_type;
+    end += itf8_put(end, b->content_id);
+    end += itf8_put(end, b->comp_size);
+    end += itf8_put(end, b->uncomp_size);
+
+    total = end-hdr + 4;
+
+    if (b->method == RAW)
+        total += b->uncomp_size;
+    else
+        total += b->comp_size;
+
+    return total;
+}
+
+/*
+ * Writes a CRAM block to fd->fp.
+ *
+ * Emits the header (method, content type, content id, compressed size
+ * and uncompressed size) followed by the payload: uncomp_size bytes
+ * for a RAW block, comp_size bytes otherwise.  For CRAM major version
+ * 3 and above a CRC32 over the header and payload is stored in
+ * b->crc32 and written after the payload.
+ *
+ * A RAW block must have comp_size equal to uncomp_size (asserted).
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_write_block(cram_fd *fd, cram_block *b) {
+    int32_t payload_len;
+
+    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
+
+    /* Block header */
+    if (hputc(b->method, fd->fp) == EOF)
+        return -1;
+    if (hputc(b->content_type, fd->fp) == EOF)
+        return -1;
+    if (itf8_encode(fd, b->content_id) == -1)
+        return -1;
+    if (itf8_encode(fd, b->comp_size) == -1)
+        return -1;
+    if (itf8_encode(fd, b->uncomp_size) == -1)
+        return -1;
+
+    /* Payload */
+    payload_len = (b->method == RAW) ? b->uncomp_size : b->comp_size;
+    if (payload_len != hwrite(fd->fp, b->data, payload_len))
+        return -1;
+
+    /* Only version 3 onwards carries a per-block checksum. */
+    if (CRAM_MAJOR_VERS(fd->version) < 3)
+        return 0;
+
+    {
+        /* Re-encode the header in memory so it can be checksummed. */
+        unsigned char hdr[100], *end = hdr;
+        uint32_t crc;
+
+        *end++ = b->method;
+        *end++ = b->content_type;
+        end += itf8_put(end, b->content_id);
+        end += itf8_put(end, b->comp_size);
+        end += itf8_put(end, b->uncomp_size);
+        crc = crc32(0L, hdr, end-hdr);
+
+        b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", payload_len);
+
+        if (-1 == int32_encode(fd, b->crc32))
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Releases a CRAM block together with the data buffer it owns.
+ * Passing NULL is allowed and does nothing.
+ */
+void cram_free_block(cram_block *b) {
+    if (b) {
+        /* free(NULL) is a no-op, so b->data needs no separate test. */
+        free(b->data);
+        free(b);
+    }
+}
+
+/*
+ * Uncompresses a CRAM block, if compressed.
+ *
+ * A block whose uncomp_size is 0 is simply relabelled RAW, and a RAW
+ * block is left alone.  Otherwise the payload is expanded according to
+ * b->method; on success b->data is replaced by the expanded buffer,
+ * b->alloc is updated and b->method becomes RAW.
+ *
+ * For GZIP, LZMA and RANS the expanded size must equal b->uncomp_size.
+ *
+ * Returns 0 on success
+ *        -1 on failure (decompression error, size mismatch, or a
+ *           method that is unknown or not compiled in); the block is
+ *           left unmodified and no temporary buffer is leaked.
+ */
+int cram_uncompress_block(cram_block *b) {
+    char *uncomp;
+    size_t uncomp_size = 0;
+
+    if (b->uncomp_size == 0) {
+        // blank block
+        b->method = RAW;
+        return 0;
+    }
+
+    switch (b->method) {
+    case RAW:
+        return 0;
+
+    case GZIP:
+        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
+        if (!uncomp)
+            return -1;
+        if ((int)uncomp_size != b->uncomp_size) {
+            free(uncomp);
+            return -1;
+        }
+        free(b->data);
+        b->data = (unsigned char *)uncomp;
+        b->alloc = uncomp_size;
+        b->method = RAW;
+        break;
+
+#ifdef HAVE_LIBBZ2
+    case BZIP2: {
+        unsigned int usize = b->uncomp_size;
+        if (!(uncomp = malloc(usize)))
+            return -1;
+        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
+                                                (char *)b->data, b->comp_size,
+                                                0, 0)) {
+            free(uncomp);
+            return -1;
+        }
+        free(b->data);
+        b->data = (unsigned char *)uncomp;
+        b->alloc = usize;
+        b->method = RAW;
+        b->uncomp_size = usize; // Just incase it differs
+        break;
+    }
+#else
+    case BZIP2:
+        fprintf(stderr, "Bzip2 compression is not compiled into this "
+                "version.\nPlease rebuild and try again.\n");
+        return -1;
+#endif
+
+#ifdef HAVE_LIBLZMA
+    case LZMA:
+        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
+        if (!uncomp)
+            return -1;
+        if ((int)uncomp_size != b->uncomp_size) {
+            /* Previously leaked on this path. */
+            free(uncomp);
+            return -1;
+        }
+        free(b->data);
+        b->data = (unsigned char *)uncomp;
+        b->alloc = uncomp_size;
+        b->method = RAW;
+        break;
+#else
+    case LZMA:
+        fprintf(stderr, "Lzma compression is not compiled into this "
+                "version.\nPlease rebuild and try again.\n");
+        return -1;
+#endif
+
+    case RANS: {
+        unsigned int usize = b->uncomp_size, usize2;
+        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
+        if (!uncomp)
+            return -1;
+        if (usize != usize2) {
+            /* Previously leaked on this path. */
+            free(uncomp);
+            return -1;
+        }
+        free(b->data);
+        b->data = (unsigned char *)uncomp;
+        b->alloc = usize2;
+        b->method = RAW;
+        b->uncomp_size = usize2; // Just incase it differs
+        break;
+    }
+
+    default:
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Compresses 'in_size' bytes at 'in' with a single, explicitly chosen
+ * method and returns the newly allocated result, storing its length in
+ * *out_size.
+ *
+ * 'level' is passed to the gzip, bzip2 and lzma encoders; 'strat' is
+ * only used by gzip.  RANS0 and RANS1 call rans_compress with order 0
+ * and 1 respectively.
+ *
+ * Returns the compressed buffer on success
+ *         NULL on failure, for RAW, for an unknown method, or for
+ *         bzip2/lzma when support is not compiled in
+ */
+static char *cram_compress_by_method(char *in, size_t in_size,
+                                     size_t *out_size,
+                                     enum cram_block_method method,
+                                     int level, int strat) {
+    if (method == GZIP)
+        return zlib_mem_deflate(in, in_size, out_size, level, strat);
+
+    if (method == BZIP2) {
+#ifdef HAVE_LIBBZ2
+        unsigned int bz_size = in_size*1.01 + 600;
+        char *bz = malloc(bz_size);
+
+        if (!bz)
+            return NULL;
+
+        if (BZ_OK != BZ2_bzBuffToBuffCompress(bz, &bz_size,
+                                              in, in_size,
+                                              level, 0, 30)) {
+            free(bz);
+            return NULL;
+        }
+
+        *out_size = bz_size;
+        return bz;
+#else
+        return NULL;
+#endif
+    }
+
+    if (method == LZMA) {
+#ifdef HAVE_LIBLZMA
+        return lzma_mem_deflate(in, in_size, out_size, level);
+#else
+        return NULL;
+#endif
+    }
+
+    if (method == RANS0 || method == RANS1) {
+        unsigned int rans_size;
+        unsigned char *packed;
+
+        packed = rans_compress((unsigned char *)in, in_size, &rans_size,
+                               method == RANS1 ? 1 : 0);
+        *out_size = rans_size;
+        return (char *)packed;
+    }
+
+    /* RAW, or a method we do not know: there is nothing to produce. */
+    return NULL;
+}
+
+
+/*
+ * Compresses block b in place: on success b->data is replaced by the
+ * compressed buffer and b->method / b->comp_size are updated. Blocks whose
+ * method is already something other than RAW are left untouched.
+ *
+ * 'method' is a bit-mask of (1<<cram_block_method) values naming the codecs
+ * that may be tried. Method and level -1 implies defaults, as specified in
+ * cram_fd (GZIP, plus BZIP2 / LZMA when fd->use_bz2 / fd->use_lzma are set,
+ * and fd->level).
+ *
+ * With a metrics object there are two modes. During a trial every codec
+ * permitted by the mask is run on the block, the smallest output is kept
+ * and the per-codec sizes are accumulated in metrics. When metrics->trial
+ * reaches zero the codec with the smallest cost-scaled total becomes
+ * metrics->method / metrics->strat, and codecs that keep losing may be
+ * removed from metrics->revised_method. Outside a trial the block is
+ * compressed directly with metrics->method / metrics->strat. Without a
+ * metrics object the block is compressed with GZIP and Z_FILTERED.
+ *
+ * The logic behind trying two zlib strategies is that sometimes Z_RLE does
+ * a better job than Z_FILTERED or Z_DEFAULT_STRATEGY on quality data. If
+ * so, we'd rather use it as it is significantly faster.
+ *
+ * fd->metrics_lock is held while metrics is read or updated and released
+ * around the compression calls themselves.
+ *
+ * NOTE(review): "method == RAW" compares the bit-mask with an enum value;
+ * presumably RAW is 0, i.e. an empty mask - confirm.
+ * NOTE(review): if every codec fails during a trial, c_best stays NULL, so
+ * b->data becomes NULL and b->comp_size INT_MAX while 0 is still returned -
+ * confirm whether this can be reached.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
+ int method, int level) {
+
+ char *comp = NULL;
+ size_t comp_size = 0;
+ int strat;
+
+ if (b->method != RAW) {
+ // Maybe already compressed if s->block[0] was compressed and
+ // we have e.g. s->block[DS_BA] set to s->block[0] due to only
+ // one base type present and hence using E_HUFFMAN on block 0.
+ // A second explicit attempt to compress the same block then
+ // occurs.
+ return 0;
+ }
+
+ if (method == -1) {
+ method = 1<<GZIP;
+ if (fd->use_bz2)
+ method |= 1<<BZIP2;
+ if (fd->use_lzma)
+ method |= 1<<LZMA;
+ }
+
+ if (level == -1)
+ level = fd->level;
+
+ //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
+
+ if (method == RAW || level == 0 || b->uncomp_size == 0) {
+ b->method = RAW;
+ b->comp_size = b->uncomp_size;
+ //fprintf(stderr, "Skip block id %d\n", b->content_id);
+ return 0;
+ }
+
+ if (metrics) {
+ pthread_mutex_lock(&fd->metrics_lock);
+ if (metrics->trial > 0 || --metrics->next_trial <= 0) {
+ size_t sz_best = INT_MAX;
+ size_t sz_gz_rle = 0;
+ size_t sz_gz_def = 0;
+ size_t sz_rans0 = 0;
+ size_t sz_rans1 = 0;
+ size_t sz_bzip2 = 0;
+ size_t sz_lzma = 0;
+ int method_best = 0;
+ char *c_best = NULL, *c = NULL;
+
+ if (metrics->revised_method)
+ method = metrics->revised_method;
+ else
+ metrics->revised_method = method;
+
+ if (metrics->next_trial == 0) {
+ metrics->next_trial = TRIAL_SPAN;
+ metrics->trial = NTRIALS;
+ metrics->sz_gz_rle /= 2;
+ metrics->sz_gz_def /= 2;
+ metrics->sz_rans0 /= 2;
+ metrics->sz_rans1 /= 2;
+ metrics->sz_bzip2 /= 2;
+ metrics->sz_lzma /= 2;
+ }
+
+ pthread_mutex_unlock(&fd->metrics_lock);
+
+ if (method & (1<<GZIP_RLE)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_gz_rle, GZIP, 1, Z_RLE);
+ if (c && sz_best > sz_gz_rle) {
+ sz_best = sz_gz_rle;
+ method_best = GZIP_RLE;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_gz_rle = b->uncomp_size*2+1000;
+ }
+
+ //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
+ }
+
+ if (method & (1<<GZIP)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_gz_def, GZIP, level,
+ Z_FILTERED);
+ if (c && sz_best > sz_gz_def) {
+ sz_best = sz_gz_def;
+ method_best = GZIP;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_gz_def = b->uncomp_size*2+1000;
+ }
+
+ //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
+ }
+
+ if (method & (1<<RANS0)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_rans0, RANS0, 0, 0);
+ if (c && sz_best > sz_rans0) {
+ sz_best = sz_rans0;
+ method_best = RANS0;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_rans0 = b->uncomp_size*2+1000;
+ }
+ }
+
+ if (method & (1<<RANS1)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_rans1, RANS1, 0, 0);
+ if (c && sz_best > sz_rans1) {
+ sz_best = sz_rans1;
+ method_best = RANS1;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_rans1 = b->uncomp_size*2+1000;
+ }
+ }
+
+ if (method & (1<<BZIP2)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_bzip2, BZIP2, level, 0);
+ if (c && sz_best > sz_bzip2) {
+ sz_best = sz_bzip2;
+ method_best = BZIP2;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_bzip2 = b->uncomp_size*2+1000;
+ }
+ }
+
+ if (method & (1<<LZMA)) {
+ c = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &sz_lzma, LZMA, level, 0);
+ if (c && sz_best > sz_lzma) {
+ sz_best = sz_lzma;
+ method_best = LZMA;
+ if (c_best)
+ free(c_best);
+ c_best = c;
+ } else if (c) {
+ free(c);
+ } else {
+ sz_lzma = b->uncomp_size*2+1000;
+ }
+ }
+
+ //fprintf(stderr, "sz_best = %d\n", sz_best);
+
+ free(b->data);
+ b->data = (unsigned char *)c_best;
+ //printf("method_best = %s\n", cram_block_method2str(method_best));
+ b->method = method_best == GZIP_RLE ? GZIP : method_best;
+ b->comp_size = sz_best;
+
+ pthread_mutex_lock(&fd->metrics_lock);
+ metrics->sz_gz_rle += sz_gz_rle;
+ metrics->sz_gz_def += sz_gz_def;
+ metrics->sz_rans0 += sz_rans0;
+ metrics->sz_rans1 += sz_rans1;
+ metrics->sz_bzip2 += sz_bzip2;
+ metrics->sz_lzma += sz_lzma;
+ if (--metrics->trial == 0) {
+ int best_method = RAW;
+ int best_sz = INT_MAX;
+
+ // Scale methods by cost
+ if (fd->level <= 3) {
+ metrics->sz_rans1 *= 1.02;
+ metrics->sz_gz_def *= 1.04;
+ metrics->sz_bzip2 *= 1.08;
+ metrics->sz_lzma *= 1.10;
+ } else if (fd->level <= 6) {
+ metrics->sz_rans1 *= 1.01;
+ metrics->sz_gz_def *= 1.02;
+ metrics->sz_bzip2 *= 1.03;
+ metrics->sz_lzma *= 1.05;
+ }
+
+ if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
+ best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
+
+ if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
+ best_sz = metrics->sz_gz_def, best_method = GZIP;
+
+ if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
+ best_sz = metrics->sz_rans0, best_method = RANS0;
+
+ if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
+ best_sz = metrics->sz_rans1, best_method = RANS1;
+
+ if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
+ best_sz = metrics->sz_bzip2, best_method = BZIP2;
+
+ if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
+ best_sz = metrics->sz_lzma, best_method = LZMA;
+
+ if (best_method == GZIP_RLE) {
+ metrics->method = GZIP;
+ metrics->strat = Z_RLE;
+ } else {
+ metrics->method = best_method;
+ metrics->strat = Z_FILTERED;
+ }
+
+ // If we see at least MAXFAIL trials in a row for a specific
+ // compression method with more than MAXDELTA aggregate
+ // size then we drop this from the list of methods used
+ // for this block type.
+#define MAXDELTA 0.20
+#define MAXFAILS 4
+ if (best_method == GZIP_RLE) {
+ metrics->gz_rle_cnt = 0;
+ metrics->gz_rle_extra = 0;
+ } else if (best_sz < metrics->sz_gz_rle) {
+ double r = (double)metrics->sz_gz_rle / best_sz - 1;
+ if (++metrics->gz_rle_cnt >= MAXFAILS &&
+ (metrics->gz_rle_extra += r) >= MAXDELTA)
+ method &= ~(1<<GZIP_RLE);
+ }
+
+ if (best_method == GZIP) {
+ metrics->gz_def_cnt = 0;
+ metrics->gz_def_extra = 0;
+ } else if (best_sz < metrics->sz_gz_def) {
+ double r = (double)metrics->sz_gz_def / best_sz - 1;
+ if (++metrics->gz_def_cnt >= MAXFAILS &&
+ (metrics->gz_def_extra += r) >= MAXDELTA)
+ method &= ~(1<<GZIP);
+ }
+
+ if (best_method == RANS0) {
+ metrics->rans0_cnt = 0;
+ metrics->rans0_extra = 0;
+ } else if (best_sz < metrics->sz_rans0) {
+ double r = (double)metrics->sz_rans0 / best_sz - 1;
+ if (++metrics->rans0_cnt >= MAXFAILS &&
+ (metrics->rans0_extra += r) >= MAXDELTA)
+ method &= ~(1<<RANS0);
+ }
+
+ if (best_method == RANS1) {
+ metrics->rans1_cnt = 0;
+ metrics->rans1_extra = 0;
+ } else if (best_sz < metrics->sz_rans1) {
+ double r = (double)metrics->sz_rans1 / best_sz - 1;
+ if (++metrics->rans1_cnt >= MAXFAILS &&
+ (metrics->rans1_extra += r) >= MAXDELTA)
+ method &= ~(1<<RANS1);
+ }
+
+ if (best_method == BZIP2) {
+ metrics->bzip2_cnt = 0;
+ metrics->bzip2_extra = 0;
+ } else if (best_sz < metrics->sz_bzip2) {
+ double r = (double)metrics->sz_bzip2 / best_sz - 1;
+ if (++metrics->bzip2_cnt >= MAXFAILS &&
+ (metrics->bzip2_extra += r) >= MAXDELTA)
+ method &= ~(1<<BZIP2);
+ }
+
+ if (best_method == LZMA) {
+ metrics->lzma_cnt = 0;
+ metrics->lzma_extra = 0;
+ } else if (best_sz < metrics->sz_lzma) {
+ double r = (double)metrics->sz_lzma / best_sz - 1;
+ if (++metrics->lzma_cnt >= MAXFAILS &&
+ (metrics->lzma_extra += r) >= MAXDELTA)
+ method &= ~(1<<LZMA);
+ }
+
+ //if (method != metrics->revised_method)
+ // fprintf(stderr, "%d: method from %x to %x\n",
+ // b->content_id, metrics->revised_method, method);
+ metrics->revised_method = method;
+ }
+ pthread_mutex_unlock(&fd->metrics_lock);
+ } else {
+ strat = metrics->strat;
+ method = metrics->method;
+
+ pthread_mutex_unlock(&fd->metrics_lock);
+ comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &comp_size, method,
+ level, strat);
+ if (!comp)
+ return -1;
+ free(b->data);
+ b->data = (unsigned char *)comp;
+ b->comp_size = comp_size;
+ b->method = method;
+ }
+
+ } else {
+ // no cached metrics, so just do zlib?
+ comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
+ &comp_size, GZIP, level, Z_FILTERED);
+ if (!comp) {
+ fprintf(stderr, "Compression failed!\n");
+ return -1;
+ }
+ free(b->data);
+ b->data = (unsigned char *)comp;
+ b->comp_size = comp_size;
+ b->method = GZIP;
+ }
+
+ if (fd->verbose)
+ fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n",
+ b->content_id, b->uncomp_size, b->comp_size,
+ cram_block_method2str(b->method));
+
+ if (b->method == RANS1)
+ b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
+
+ return 0;
+}
+
+/*
+ * Allocates a zero-filled cram_metrics and sets its starting state:
+ * trial = NTRIALS-1, next_trial = TRIAL_SPAN and method = RAW.
+ *
+ * Returns the new cram_metrics on success;
+ *         NULL if the allocation fails.
+ */
+cram_metrics *cram_new_metrics(void) {
+    cram_metrics *metrics = calloc(1, sizeof(*metrics));
+
+    if (metrics != NULL) {
+        // strat and revised_method are spelt out for clarity even though
+        // calloc() has already zeroed them.
+        metrics->revised_method = 0;
+        metrics->strat          = 0;
+        metrics->method         = RAW;
+        metrics->next_trial     = TRIAL_SPAN;
+        metrics->trial          = NTRIALS-1;
+    }
+
+    return metrics;
+}
+
+/*
+ * Maps a block compression method to a printable name.
+ *
+ * Returns a string literal; "?" for ERROR and for any value not listed.
+ */
+char *cram_block_method2str(enum cram_block_method m) {
+    char *name = "?";
+
+    switch(m) {
+    case RAW:      name = "RAW";      break;
+    case GZIP:     name = "GZIP";     break;
+    case BZIP2:    name = "BZIP2";    break;
+    case LZMA:     name = "LZMA";     break;
+    case RANS0:    name = "RANS0";    break;
+    case RANS1:    name = "RANS1";    break;
+    case GZIP_RLE: name = "GZIP_RLE"; break;
+    case ERROR:    break;
+    }
+
+    return name;
+}
+
+/*
+ * Maps a block content type to a printable name.
+ *
+ * Returns a string literal; "?" for CT_ERROR and for any value not listed.
+ */
+char *cram_content_type2str(enum cram_content_type t) {
+    char *name = "?";
+
+    switch (t) {
+    case FILE_HEADER:        name = "FILE_HEADER";        break;
+    case COMPRESSION_HEADER: name = "COMPRESSION_HEADER"; break;
+    case MAPPED_SLICE:       name = "MAPPED_SLICE";       break;
+    case UNMAPPED_SLICE:     name = "UNMAPPED_SLICE";     break;
+    case EXTERNAL:           name = "EXTERNAL";           break;
+    case CORE:               name = "CORE";               break;
+    case CT_ERROR:           break;
+    }
+
+    return name;
+}
+
+/*
+ * Closes fp, but first flushes it and asks the OS to sync it to storage so
+ * that a write failure is reported rather than lost.
+ *
+ * Two failures are tolerated: fflush() failing with EBADF, and fsync()
+ * failing with EINVAL (e.g. when fp is a pipe). Any other failure closes
+ * the stream and reports an error.
+ *
+ * Returns 0 on success
+ *        -1 on failure (fp has been closed in either case).
+ */
+int paranoid_fclose(FILE *fp) {
+    int flush_failed = (fflush(fp) == -1);
+
+    if (flush_failed && errno != EBADF)
+        goto fail;
+
+    errno = 0;
+    if (fsync(fileno(fp)) == -1 && errno != EINVAL) // EINVAL: eg pipe
+        goto fail;
+
+    return fclose(fp);
+
+ fail:
+    fclose(fp);
+    return -1;
+}
+
+/* ----------------------------------------------------------------------
+ * Reference sequence handling
+ *
+ * These revolve around the refs_t structure, which may potentially be
+ * shared between multiple cram_fd.
+ *
+ * We start with refs_create() to allocate an empty refs_t and then
+ * populate it with @SQ line data using refs_from_header(). This is done on
+ * cram_open(). Also at start up we can call cram_load_reference() which
+ * is used with "scramble -r foo.fa". This replaces the fd->refs with the
+ * new one specified. In either case refs2id() is then called which
+ * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
+ *
+ * Later, possibly within a thread, we will want to know the actual ref
+ * seq itself, obtained by calling cram_get_ref(). This may use the
+ * UR: or M5: fields or the filename specified in the original
+ * cram_load_reference() call.
+ *
+ * Given the potential for multi-threaded reference usage, we have
+ * reference counting (sorry for the confusing double use of "ref") to
+ * track the number of callers interested in any specific reference.
+ */
+
+/*
+ * Releases the sequence held by a reference entry.
+ *
+ * When the entry has an mFILE attached only mfclose() is called on it;
+ * otherwise e->seq is released with free(). Both e->seq and e->mf are
+ * NULL afterwards.
+ */
+static void ref_entry_free_seq(ref_entry *e) {
+    if (e->mf) {
+        mfclose(e->mf);
+    } else if (e->seq) {
+        free(e->seq);
+    }
+
+    e->mf = NULL;
+    e->seq = NULL;
+}
+
+/*
+ * Drops one reference to r and, once the count is no longer positive,
+ * destroys it: the string pool, every ref_entry stored in h_meta (with its
+ * sequence), the hash itself, the ref_id array, the open BGZF handle and
+ * the lock.
+ *
+ * A NULL r is accepted and ignored.
+ */
+void refs_free(refs_t *r) {
+    RP("refs_free()\n");
+
+    // Must come before the reference count is touched.
+    if (!r)
+        return;
+
+    if (--r->count > 0)
+        return;
+
+    if (r->pool)
+        string_pool_destroy(r->pool);
+
+    if (r->h_meta) {
+        khint_t k;
+
+        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
+            ref_entry *e;
+
+            if (!kh_exist(r->h_meta, k))
+                continue;
+            if (!(e = kh_val(r->h_meta, k)))
+                continue;
+            ref_entry_free_seq(e);
+            free(e);
+        }
+
+        kh_destroy(refs, r->h_meta);
+    }
+
+    // The entries ref_id[] points at were freed via h_meta above.
+    free(r->ref_id);
+
+    if (r->fp)
+        bgzf_close(r->fp);
+
+    pthread_mutex_destroy(&r->lock);
+
+    free(r);
+}
+
+/*
+ * Allocates an empty refs_t with a reference count of 1, a string pool,
+ * an empty name hash and an initialised lock. ref_id is left NULL; see
+ * refs2id() to populate.
+ *
+ * Returns the new refs_t on success;
+ *         NULL on failure.
+ */
+static refs_t *refs_create(void) {
+    refs_t *r = calloc(1, sizeof(*r));
+
+    RP("refs_create()\n");
+
+    if (!r)
+        return NULL;
+
+    // Initialise the lock before anything else can fail: the err path
+    // hands r to refs_free(), which destroys the mutex.
+    if (pthread_mutex_init(&r->lock, NULL) != 0) {
+        free(r);
+        return NULL;
+    }
+
+    if (!(r->pool = string_pool_create(8192)))
+        goto err;
+
+    r->ref_id = NULL; // see refs2id() to populate.
+    r->count = 1;
+    r->last = NULL;
+    r->last_id = -1;
+
+    if (!(r->h_meta = kh_init(refs)))
+        goto err;
+
+    return r;
+
+ err:
+    refs_free(r);
+    return NULL;
+}
+
+/*
+ * Opens a reference fasta file as a BGZF stream, allowing for
+ * compressed files.
+ *
+ * Unless is_md5 is set, a missing or unreadable "<fn>.fai" is built with
+ * fai_build() first. If the opened stream reports is_compressed == 1 its
+ * "<fn>.gzi" index is loaded as well.
+ *
+ * Returns a BGZF handle on success;
+ *         NULL on failure.
+ */
+static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
+    BGZF *bgzf;
+
+    if (!is_md5) {
+        char fai_path[PATH_MAX];
+
+        snprintf(fai_path, PATH_MAX, "%s.fai", fn);
+        if (access(fai_path, R_OK) != 0 && fai_build(fn) != 0)
+            return NULL;
+    }
+
+    bgzf = bgzf_open(fn, mode);
+    if (!bgzf) {
+        perror(fn);
+        return NULL;
+    }
+
+    if (bgzf->is_compressed != 1)
+        return bgzf;
+
+    if (bgzf_index_load(bgzf, fn, ".gzi") >= 0)
+        return bgzf;
+
+    fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn);
+    bgzf_close(bgzf);
+    return NULL;
+}
+
+/*
+ * Loads a FAI file for a reference.fasta.
+ *
+ * fn may name either the fasta file or its .fai index; a trailing ".fai"
+ * is stripped to obtain the fasta name. The fasta is opened and kept in
+ * r->fp, then every line of the .fai is turned into a ref_entry that is
+ * stored in r->h_meta under its name and in r->ref_id[] in file order.
+ * An entry already present in r->h_meta that is in use or has a known
+ * length is kept in preference to the newly parsed one.
+ *
+ * "is_err" indicates whether failure to load is worthy of emitting an
+ * error message. In some cases (eg with embedded references) we
+ * speculatively load, just incase, and silently ignore errors.
+ *
+ * On failure a refs_t that was created here is freed again; a caller
+ * supplied r_orig is left allocated but may have been partly updated.
+ *
+ * Returns the refs_t struct on success (maybe newly allocated);
+ *         NULL on failure
+ */
+static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
+    struct stat sb;
+    FILE *fp = NULL;
+    char fai_fn[PATH_MAX];
+    char line[8192];
+    refs_t *r = r_orig;
+    size_t fn_l = strlen(fn);
+    int id = 0, id_alloc = 0;
+
+    RP("refs_load_fai %s\n", fn);
+
+    if (!r)
+        if (!(r = refs_create()))
+            goto err;
+
+    /* Open reference, for later use */
+    if (stat(fn, &sb) != 0) {
+        if (is_err)
+            perror(fn);
+        goto err;
+    }
+
+    if (r->fp) {
+        // Forget the handle first so that it cannot be closed a second
+        // time should bgzf_close() report an error.
+        BGZF *old_fp = r->fp;
+        r->fp = NULL;
+        if (bgzf_close(old_fp) != 0)
+            goto err;
+    }
+
+    if (!(r->fn = string_dup(r->pool, fn)))
+        goto err;
+
+    if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
+        r->fn[fn_l-4] = 0;
+
+    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0)))
+        goto err;
+
+    /* Parse .fai file and load meta-data */
+    snprintf(fai_fn, sizeof(fai_fn), "%.*s.fai", PATH_MAX-5, r->fn);
+
+    if (stat(fai_fn, &sb) != 0) {
+        if (is_err)
+            perror(fai_fn);
+        goto err;
+    }
+    if (!(fp = fopen(fai_fn, "r"))) {
+        if (is_err)
+            perror(fai_fn);
+        goto err;
+    }
+    while (fgets(line, sizeof(line), fp) != NULL) {
+        ref_entry *e = malloc(sizeof(*e));
+        char *cp;
+        int n;
+        khint_t k;
+
+        if (!e)
+            goto err;
+
+        // id
+        for (cp = line; *cp && !isspace((unsigned char)*cp); cp++)
+            ;
+        if (*cp) // do not step past the terminator of a one-field line
+            *cp++ = 0;
+        if (!(e->name = string_dup(r->pool, line))) {
+            free(e);
+            goto err;
+        }
+
+        // length
+        while (*cp && isspace((unsigned char)*cp))
+            cp++;
+        e->length = strtoll(cp, &cp, 10);
+
+        // offset
+        while (*cp && isspace((unsigned char)*cp))
+            cp++;
+        e->offset = strtoll(cp, &cp, 10);
+
+        // bases per line
+        while (*cp && isspace((unsigned char)*cp))
+            cp++;
+        e->bases_per_line = strtol(cp, &cp, 10);
+
+        // line length
+        while (*cp && isspace((unsigned char)*cp))
+            cp++;
+        e->line_length = strtol(cp, &cp, 10);
+
+        // filename
+        e->fn = r->fn;
+
+        e->count = 0;
+        e->seq = NULL;
+        e->mf = NULL;
+        e->is_md5 = 0;
+
+        k = kh_put(refs, r->h_meta, e->name, &n);
+        if (-1 == n) {
+            free(e);
+            goto err;
+        }
+
+        if (n) {
+            kh_val(r->h_meta, k) = e;
+        } else {
+            ref_entry *re = kh_val(r->h_meta, k);
+            if (re && (re->count != 0 || re->length != 0)) {
+                /* Keep old, and index it rather than the freed duplicate */
+                free(e);
+                e = re;
+            } else {
+                /* Replace old */
+                if (re)
+                    free(re);
+                kh_val(r->h_meta, k) = e;
+            }
+        }
+
+        if (id >= id_alloc) {
+            int x;
+            int new_alloc = id_alloc ? id_alloc*2 : 16;
+            void *new_ids = realloc(r->ref_id,
+                                    new_alloc * sizeof(*r->ref_id));
+
+            // e is owned by h_meta by now, so it must not be freed here.
+            if (!new_ids)
+                goto err;
+
+            r->ref_id = new_ids;
+            id_alloc = new_alloc;
+
+            for (x = id; x < id_alloc; x++)
+                r->ref_id[x] = NULL;
+        }
+        r->ref_id[id] = e;
+        r->nref = ++id;
+    }
+
+    fclose(fp);
+
+    return r;
+
+ err:
+    if (fp)
+        fclose(fp);
+
+    if (!r_orig && r)
+        refs_free(r);
+
+    return NULL;
+}
+
+/*
+ * Cross-checks the lengths in the header's @SQ lines against the loaded
+ * reference entries of the same name.
+ *
+ * Where a reference entry has a known (non-zero) length that differs from
+ * the header, a warning is printed and the header length is overwritten
+ * with the reference's. Does nothing if fd has no header or no reference
+ * hash.
+ */
+static void sanitise_SQ_lines(cram_fd *fd) {
+    int idx;
+
+    if (!fd->header || !fd->refs || !fd->refs->h_meta)
+        return;
+
+    for (idx = 0; idx < fd->header->nref; idx++) {
+        char *sq_name = fd->header->ref[idx].name;
+        khint_t slot = kh_get(refs, fd->refs->h_meta, sq_name);
+        ref_entry *entry;
+
+        // We may have @SQ lines which have no known .fai, but do not
+        // in themselves pose a problem because they are unused in the file.
+        if (slot == kh_end(fd->refs->h_meta))
+            continue;
+
+        entry = (ref_entry *)kh_val(fd->refs->h_meta, slot);
+        if (!entry)
+            continue;
+
+        // Nothing to do when the length is unknown or already agrees.
+        if (!entry->length || entry->length == fd->header->ref[idx].len)
+            continue;
+
+        assert(strcmp(entry->name, fd->header->ref[idx].name) == 0);
+
+        // Should we also check MD5sums here to ensure the correct
+        // reference was given?
+        fprintf(stderr, "WARNING: Header @SQ length mismatch for "
+                "ref %s, %d vs %d\n",
+                entry->name, fd->header->ref[idx].len, (int)entry->length);
+
+        // Fixing the parsed @SQ header will make MD:Z: strings work
+        // and also stop it producing N for the sequence.
+        fd->header->ref[idx].len = entry->length;
+    }
+}
+
+/*
+ * Rebuilds r->ref_id[] so that entry i is the reference named by
+ * h->ref[i], i.e. references are indexed in header order. This may not
+ * be the order in which they appear in the fasta reference file.
+ *
+ * Header names with no matching reference are reported on stderr and
+ * their slot is left NULL. r->last is reset.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int refs2id(refs_t *r, SAM_hdr *h) {
+    int idx = 0;
+
+    if (r->ref_id)
+        free(r->ref_id);
+    if (r->last)
+        r->last = NULL;
+
+    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
+    if (!r->ref_id)
+        return -1;
+
+    r->nref = h->nref;
+    while (idx < h->nref) {
+        khint_t slot = kh_get(refs, r->h_meta, h->ref[idx].name);
+
+        if (slot == kh_end(r->h_meta)) {
+            fprintf(stderr, "Unable to find ref name '%s'\n",
+                    h->ref[idx].name);
+        } else {
+            r->ref_id[idx] = kh_val(r->h_meta, slot);
+        }
+        idx++;
+    }
+
+    return 0;
+}
+
+/*
+ * Generates refs_t entries based on @SQ lines in the header.
+ *
+ * Every header reference whose name is not yet in r->h_meta gets a new
+ * ref_entry with length 0 (the marker for "not yet loaded"), appended to
+ * r->ref_id[] and added to r->h_meta. If the @SQ line has an M5 tag its
+ * value is stored as the entry's likely filename.
+ *
+ * fd is not used.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
+    int i, j;
+    void *new_ids;
+
+    if (!r)
+        return -1;
+
+    if (!h || h->nref == 0)
+        return 0;
+
+    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
+
+    /* Existing refs are fine, as long as they're compatible with the hdr. */
+    // Keep the old array reachable if realloc() fails.
+    new_ids = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id));
+    if (!new_ids)
+        return -1;
+    r->ref_id = new_ids;
+
+    /* Copy info from h->ref[i] over to r */
+    for (i = 0, j = r->nref; i < h->nref; i++) {
+        SAM_hdr_type *ty;
+        SAM_hdr_tag *tag;
+        ref_entry *e;
+        khint_t k;
+        int n;
+
+        // Checked first: the name is used as the hash key just below.
+        if (!h->ref[i].name)
+            return -1;
+
+        k = kh_get(refs, r->h_meta, h->ref[i].name);
+        if (k != kh_end(r->h_meta))
+            // Ref already known about
+            continue;
+
+        if (!(e = calloc(1, sizeof(*e))))
+            return -1;
+
+        if (!(e->name = string_dup(r->pool, h->ref[i].name))) {
+            free(e);
+            return -1;
+        }
+        e->length = 0; // marker for not yet loaded
+
+        /* Initialise likely filename if known */
+        if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
+            if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
+                e->fn = string_dup(r->pool, tag->str+3);
+            }
+        }
+
+        k = kh_put(refs, r->h_meta, e->name, &n);
+        if (n <= 0) { // already exists or error
+            free(e);
+            return -1;
+        }
+        kh_val(r->h_meta, k) = e;
+
+        r->ref_id[j++] = e;
+    }
+    r->nref = j;
+
+    return 0;
+}
+
+/*
+ * Attaches a header to a cram_fd.
+ *
+ * Any header already attached is freed first, then hdr is stored in fd and
+ * its @SQ lines are turned into reference entries via refs_from_header().
+ *
+ * This should be used when creating a new cram_fd for writing where
+ * we have an SAM_hdr already constructed (eg from a file we've read
+ * in).
+ *
+ * Returns the result of refs_from_header(): 0 on success, -1 on failure.
+ */
+int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
+    SAM_hdr *previous = fd->header;
+
+    if (previous)
+        sam_hdr_free(previous);
+
+    fd->header = hdr;
+
+    return refs_from_header(fd->refs, fd, hdr);
+}
+
+/*
+ * Converts a directory and a filename into an expanded path, replacing %s
+ * in directory with the filename and %[0-9]+s with portions of the filename
+ * Any remaining parts of filename are added to the end with /%s.
+ *
+ * For example dir "/cache/%2s/%2s/%s" with fn "abcdef" gives
+ * "/cache/ab/cd/ef". A '%' that does not introduce one of those forms is
+ * copied through together with the character that follows it; a lone '%'
+ * at the very end of dir is copied through on its own.
+ *
+ * No '/' separator is inserted when nothing has been written before the
+ * remaining filename (i.e. dir is empty).
+ *
+ * path is written without a bounds check, so the caller must supply a
+ * buffer that is large enough for the expansion.
+ */
+void expand_cache_path(char *path, char *dir, char *fn) {
+    char *start = path;
+    char *cp;
+
+    while ((cp = strchr(dir, '%'))) {
+        size_t lit = cp - dir;
+
+        // Literal text in front of the '%'
+        memcpy(path, dir, lit);
+        path += lit;
+
+        if (*++cp == 's') {
+            size_t fn_len = strlen(fn);
+
+            memcpy(path, fn, fn_len);
+            path += fn_len;
+            fn += fn_len;
+            cp++;
+        } else if (*cp >= '0' && *cp <= '9') {
+            char *endp;
+            long l = strtol(cp, &endp, 10);
+            size_t fn_len = strlen(fn);
+            // l cannot be negative as cp starts with a digit
+            size_t take = (unsigned long)l < fn_len ? (size_t)l : fn_len;
+
+            if (*endp == 's') {
+                memcpy(path, fn, take);
+                path += take;
+                fn += take;
+                cp = endp+1;
+            } else {
+                *path++ = '%';
+                *path++ = *cp++;
+            }
+        } else if (*cp) {
+            *path++ = '%';
+            *path++ = *cp++;
+        } else {
+            // Lone '%' ending dir: keep it, but stay on the terminating
+            // NUL rather than stepping past it.
+            *path++ = '%';
+        }
+        dir = cp;
+    }
+    strcpy(path, dir);
+    path += strlen(dir);
+    // path[-1] only exists once something has been written; before that
+    // it lies in front of the caller's buffer.
+    if (*fn && path > start && path[-1] != '/')
+        *path++ = '/';
+    strcpy(path, fn);
+}
+
+/*
+ * Make the directory containing path and any prefix directories.
+ *
+ * path is cut at its last '/' for the duration of the call and restored
+ * before returning. If the resulting directory does not exist it is
+ * created, recursing to create its own parents when the first mkdir()
+ * fails, and then chmod()ed to mode. Errors are not reported.
+ */
+void mkdir_prefix(char *path, int mode) {
+    char *slash = strrchr(path, '/');
+
+    if (slash == NULL)
+        return;
+
+    *slash = '\0';
+
+    if (!is_directory(path)) {
+        if (mkdir(path, mode) != 0) {
+            // Probably a missing parent: build those, then try again.
+            mkdir_prefix(path, mode);
+            mkdir(path, mode);
+        }
+        chmod(path, mode);
+    }
+
+    *slash = '/';
+}
+
+/*
+ * Return the cache directory to use, based on the first of these
+ * environment variables to be set to a non-empty value:
+ * XDG_CACHE_HOME, HOME, TMPDIR, TEMP. "/tmp" is used if none are.
+ *
+ * *extra receives a suffix to append to the returned directory: "/.cache"
+ * when HOME was used and "" otherwise.
+ */
+static const char *get_cache_basedir(const char **extra) {
+    static const struct {
+        const char *var;
+        const char *suffix;
+    } candidates[] = {
+        { "XDG_CACHE_HOME", ""        },
+        { "HOME",           "/.cache" },
+        { "TMPDIR",         ""        },
+        { "TEMP",           ""        },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
+        const char *value = getenv(candidates[i].var);
+
+        if (value && *value) {
+            *extra = candidates[i].suffix;
+            return value;
+        }
+    }
+
+    *extra = "";
+    return "/tmp";
+}
+
+/*
+ * Return an integer representation of pthread_self().
+ *
+ * pthread_t is treated as raw bytes, which are folded into an unsigned
+ * value with a multiply-by-31 hash.
+ */
+static unsigned get_int_threadid() {
+    pthread_t self = pthread_self();
+    const unsigned char *byte = (const unsigned char *) &self;
+    const unsigned char *end = byte + sizeof(pthread_t);
+    unsigned hash = 0;
+
+    while (byte < end)
+        hash = hash * 31 + *byte++;
+
+    return hash;
+}
+
+/*
+ * Queries the M5 string from the header and attempts to populate the
+ * reference from this using the REF_PATH environment.
+ *
+ * The sources are tried in this order:
+ *  1. A local cache file, named by expanding REF_CACHE with the M5 value.
+ *     When neither REF_PATH nor REF_CACHE is set, REF_PATH defaults to the
+ *     EBI server URL and the cache to "hts-ref" below get_cache_basedir().
+ *     Builds without HAVE_MMAP also look for a local file via find_path().
+ *     If the file can be stat()ed and opened it becomes fd->refs->fp, r is
+ *     marked is_md5 and the sequence itself is read later.
+ *  2. REF_PATH through open_path_mfile(), which loads the whole sequence
+ *     into r->seq (keeping the mFILE in r->mf if it cannot be detached).
+ *  3. The @SQ UR: tag, loaded as a fasta file through refs_load_fai() and
+ *     followed by refs2id(). This is also used when there is no M5 tag.
+ *
+ * After step 2, and only when a local cache is configured, the data is
+ * md5-checked against the M5 tag and written to a uniquely named temporary
+ * file which is then made read-only and renamed to the cache filename.
+ * Failure to create the temporary file is not treated as an error.
+ *
+ * NOTE(review): a short fwrite() is reported with perror() only; the
+ * temporary file is still renamed into the cache if closing it succeeds.
+ * NOTE(review): path_tmp is filled with sprintf() and path with strncpy(),
+ * neither of which guards against an over-long or unterminated result.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
+ char *ref_path = getenv("REF_PATH");
+ SAM_hdr_type *ty;
+ SAM_hdr_tag *tag;
+ char path[PATH_MAX], path_tmp[PATH_MAX];
+ char cache[PATH_MAX], cache_root[PATH_MAX];
+ char *local_cache = getenv("REF_CACHE");
+ mFILE *mf;
+ int local_path = 0;
+
+ if (fd->verbose)
+ fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id);
+
+ cache_root[0] = '\0';
+
+ if (!ref_path || *ref_path == '\0') {
+ /*
+ * If we have no ref path, we use the EBI server.
+ * However to avoid spamming it we require a local ref cache too.
+ */
+ ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
+ if (!local_cache || *local_cache == '\0') {
+ const char *extra;
+ const char *base = get_cache_basedir(&extra);
+ snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
+ snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
+ local_cache = cache;
+ if (fd->verbose)
+ fprintf(stderr, "Populating local cache: %s\n", local_cache);
+ }
+ }
+
+ if (!r->name)
+ return -1;
+
+ if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
+ return -1;
+
+ if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
+ goto no_M5;
+
+ if (fd->verbose)
+ fprintf(stderr, "Querying ref %s\n", tag->str+3);
+
+ /* Use cache if available */
+ if (local_cache && *local_cache) {
+ expand_cache_path(path, local_cache, tag->str+3);
+ local_path = 1;
+ }
+
+#ifndef HAVE_MMAP
+ char *path2;
+ /* Search local files in REF_PATH; we can open them and return as above */
+ if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
+ strncpy(path, path2, PATH_MAX);
+ free(path2);
+ if (is_file(path)) // incase it's too long
+ local_path = 1;
+ }
+#endif
+
+ /* Found via REF_CACHE or local REF_PATH file */
+ if (local_path) {
+ struct stat sb;
+ BGZF *fp;
+
+ if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
+ r->length = sb.st_size;
+ r->offset = r->line_length = r->bases_per_line = 0;
+
+ r->fn = string_dup(fd->refs->pool, path);
+
+ if (fd->refs->fp)
+ if (bgzf_close(fd->refs->fp) != 0)
+ return -1;
+ fd->refs->fp = fp;
+ fd->refs->fn = r->fn;
+ r->is_md5 = 1;
+
+ // Fall back to cram_get_ref() where it'll do the actual
+ // reading of the file.
+ return 0;
+ }
+ }
+
+
+ /* Otherwise search full REF_PATH; slower as loads entire file */
+ if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
+ size_t sz;
+ r->seq = mfsteal(mf, &sz);
+ if (r->seq) {
+ r->mf = NULL;
+ } else {
+ // keep mf around as we couldn't detach
+ r->seq = mf->data;
+ r->mf = mf;
+ }
+ r->length = sz;
+ r->is_md5 = 1;
+ } else {
+ refs_t *refs;
+ char *fn;
+
+ no_M5:
+ /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
+ if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
+ return -1;
+
+ fn = (strncmp(tag->str+3, "file:", 5) == 0)
+ ? tag->str+8
+ : tag->str+3;
+
+ if (fd->refs->fp) {
+ if (bgzf_close(fd->refs->fp) != 0)
+ return -1;
+ fd->refs->fp = NULL;
+ }
+ if (!(refs = refs_load_fai(fd->refs, fn, 0)))
+ return -1;
+ sanitise_SQ_lines(fd);
+
+ fd->refs = refs;
+ if (fd->refs->fp) {
+ if (bgzf_close(fd->refs->fp) != 0)
+ return -1;
+ fd->refs->fp = NULL;
+ }
+
+ if (!fd->refs->fn)
+ return -1;
+
+ if (-1 == refs2id(fd->refs, fd->header))
+ return -1;
+ if (!fd->refs->ref_id || !fd->refs->ref_id[id])
+ return -1;
+
+ // Local copy already, so fall back to cram_get_ref().
+ return 0;
+ }
+
+ /* Populate the local disk cache if required */
+ if (local_cache && *local_cache) {
+ int pid = (int) getpid();
+ unsigned thrid = get_int_threadid();
+ FILE *fp;
+
+ if (*cache_root && !is_directory(cache_root) && hts_verbose >= 1)
+ fprintf(stderr,
+"Creating reference cache directory %s\n"
+"This may become large; see the samtools(1) manual page REF_CACHE discussion\n",
+ cache_root);
+
+ expand_cache_path(path, local_cache, tag->str+3);
+ if (fd->verbose)
+ fprintf(stderr, "Writing cache file '%s'\n", path);
+ mkdir_prefix(path, 01777);
+
+ do {
+ // Attempt to further uniquify the temporary filename
+ unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
+ thrid++; // Ensure filename changes even if time/clock haven't
+
+ sprintf(path_tmp, "%s.tmp_%d_%u_%u", path, pid, thrid, t);
+ fp = fopen(path_tmp, "wx");
+ } while (fp == NULL && errno == EEXIST);
+ if (!fp) {
+ perror(path_tmp);
+
+ // Not fatal - we have the data already so keep going.
+ return 0;
+ }
+
+ // Check md5sum
+ hts_md5_context *md5;
+ char unsigned md5_buf1[16];
+ char md5_buf2[33];
+
+ if (!(md5 = hts_md5_init())) {
+ unlink(path_tmp);
+ fclose(fp);
+ return -1;
+ }
+ hts_md5_update(md5, r->seq, r->length);
+ hts_md5_final(md5_buf1, md5);
+ hts_md5_destroy(md5);
+ hts_md5_hex(md5_buf2, md5_buf1);
+
+ if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
+ fprintf(stderr, "[E::%s] mismatching md5sum for downloaded reference.\n", __func__);
+ unlink(path_tmp);
+ fclose(fp);
+ return -1;
+ }
+
+ if (r->length != fwrite(r->seq, 1, r->length, fp)) {
+ perror(path);
+ }
+ if (-1 == paranoid_fclose(fp)) {
+ unlink(path_tmp);
+ } else {
+ if (0 == chmod(path_tmp, 0444))
+ rename(path_tmp, path);
+ else
+ unlink(path_tmp);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Adds one user to reference 'id'. Nothing is counted for a negative id
+ * or a reference whose sequence is not loaded.
+ *
+ * If the reference is the one recorded in r->last_id, that record is
+ * cleared. As the name suggests this is the variant used with r->lock
+ * already held; see cram_ref_incr().
+ */
+static void cram_ref_incr_locked(refs_t *r, int id) {
+    RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
+
+    if (id >= 0 && r->ref_id[id]->seq) {
+        if (id == r->last_id)
+            r->last_id = -1;
+
+        r->ref_id[id]->count++;
+    }
+}
+
+/*
+ * Adds one user to reference 'id', taking r->lock around the update.
+ */
+void cram_ref_incr(refs_t *r, int id) {
+    pthread_mutex_t *lock = &r->lock;
+
+    pthread_mutex_lock(lock);
+    cram_ref_incr_locked(r, id);
+    pthread_mutex_unlock(lock);
+}
+
+/*
+ * Removes one user from reference 'id'. Nothing happens for a negative id
+ * or a reference whose sequence is not loaded.
+ *
+ * A reference that drops to zero users is not freed straight away: it is
+ * remembered in r->last_id, and the sequence of the reference previously
+ * remembered there is freed instead if that one is still unused.
+ *
+ * As the name suggests this is the variant used with r->lock already
+ * held; see cram_ref_decr().
+ */
+static void cram_ref_decr_locked(refs_t *r, int id) {
+    RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
+
+    // Checked on its own so r->ref_id[] is never indexed with a
+    // negative id, not even inside the assert below.
+    if (id < 0)
+        return;
+
+    if (!r->ref_id[id]->seq) {
+        assert(r->ref_id[id]->count >= 0);
+        return;
+    }
+
+    if (--r->ref_id[id]->count <= 0) {
+        assert(r->ref_id[id]->count == 0);
+        if (r->last_id >= 0) {
+            if (r->ref_id[r->last_id]->count <= 0 &&
+                r->ref_id[r->last_id]->seq) {
+                RP("%d FREE REF %d (%p)\n", gettid(),
+                   r->last_id, r->ref_id[r->last_id]->seq);
+                ref_entry_free_seq(r->ref_id[r->last_id]);
+                r->ref_id[r->last_id]->length = 0;
+            }
+        }
+        r->last_id = id;
+    }
+}
+
+/*
+ * Removes one user from reference 'id', taking r->lock around the update.
+ */
+void cram_ref_decr(refs_t *r, int id) {
+    pthread_mutex_t *lock = &r->lock;
+
+    pthread_mutex_lock(lock);
+    cram_ref_decr_locked(r, id);
+    pthread_mutex_unlock(lock);
+}
+
+/*
+ * Used by cram_ref_load and cram_ref_get. The file handle will have
+ * already been opened, so we can catch it. The ref_entry *e informs us
+ * of whether this is a multi-line fasta file or a raw MD5 style file.
+ * Either way we create a single contiguous sequence.
+ *
+ * start and end are 1-based and inclusive; an end before start is treated
+ * as equal to start. For a fasta file (e->line_length != 0) the byte range
+ * is derived from e->offset, e->bases_per_line and e->line_length, and
+ * everything outside '!'..'~' is stripped after reading. All bases are
+ * converted to upper case.
+ *
+ * Returns all or part of a reference sequence on success (malloced);
+ *         NULL on failure.
+ */
+static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
+    off_t offset, len;
+    char *seq;
+
+    if (end < start)
+        end = start;
+
+    // An index entry with a line length but no bases per line cannot be
+    // valid and would cause a division by zero below.
+    if (e->line_length && e->bases_per_line <= 0) {
+        fprintf(stderr, "Malformed reference file?\n");
+        return NULL;
+    }
+
+    /*
+     * Compute locations in file. This is trivial for the MD5 files, but
+     * is still necessary for the fasta variants.
+     */
+    offset = e->line_length
+        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
+          (start-1) % e->bases_per_line
+        : start-1;
+
+    len = (e->line_length
+           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
+             (end-1) % e->bases_per_line
+           : end-1) - offset + 1;
+
+    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
+        perror("bgzf_useek() on reference file");
+        return NULL;
+    }
+
+    if (len == 0 || !(seq = malloc(len))) {
+        return NULL;
+    }
+
+    if (len != bgzf_read(fp, seq, len)) {
+        perror("bgzf_read() on reference file");
+        free(seq);
+        return NULL;
+    }
+
+    /* Strip white-space if required. */
+    if (len != end-start+1) {
+        off_t i, j;
+
+        for (i = j = 0; i < len; i++) {
+            if (seq[i] >= '!' && seq[i] <= '~')
+                seq[j++] = toupper((unsigned char)seq[i]);
+        }
+
+        if (j != end-start+1) {
+            fprintf(stderr, "Malformed reference file?\n");
+            free(seq);
+            return NULL;
+        }
+    } else {
+        off_t i;
+
+        // toupper() is only defined for unsigned char values and EOF,
+        // and plain char may be signed.
+        for (i = 0; i < len; i++) {
+            seq[i] = toupper((unsigned char)seq[i]);
+        }
+    }
+
+    return seq;
+}
+
+/*
+ * Load the entire reference 'id'.
+ *
+ * An entry whose sequence is already present is returned as it is.
+ * Otherwise the reference remembered in r->last gives up one count (and
+ * its sequence, should that leave it unused), the file named by the entry
+ * is opened unless it is already the open one, and the whole sequence is
+ * read. The entry's count is then raised twice: once for the caller and
+ * once for being remembered as r->last.
+ *
+ * Returns ref_entry on success;
+ *         NULL on failure
+ */
+ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
+    ref_entry *entry = r->ref_id[id];
+    ref_entry *prev;
+    int from = 1, to = entry->length;
+    char *bases;
+    int need_open;
+
+    if (entry->seq)
+        return entry;
+
+    assert(entry->count == 0);
+
+    prev = r->last;
+    if (prev) {
+#ifdef REF_DEBUG
+        int idx = 0;
+        for (idx = 0; idx < r->nref; idx++)
+            if (r->last == r->ref_id[idx])
+                break;
+        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
+#endif
+        assert(prev->count > 0);
+        prev->count--;
+        if (prev->count <= 0) {
+            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
+            if (prev->seq)
+                ref_entry_free_seq(prev);
+        }
+    }
+
+    /* Open file if it's not already the current open reference */
+    need_open = strcmp(r->fn, entry->fn) || r->fp == NULL;
+    if (need_open) {
+        if (r->fp && bgzf_close(r->fp) != 0)
+            return NULL;
+
+        r->fn = entry->fn;
+        r->fp = bgzf_open_ref(r->fn, "r", is_md5);
+        if (!r->fp)
+            return NULL;
+    }
+
+    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, from, to);
+
+    bases = load_ref_portion(r->fp, entry, from, to);
+    if (!bases)
+        return NULL;
+
+    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, from, to, bases);
+
+    RP("%d INC REF %d, %d\n", gettid(), id, (int)(entry->count+1));
+    entry->seq = bases;
+    entry->mf = NULL;
+    entry->count++;
+
+    /*
+     * Also keep track of last used ref so incr/decr loops on the same
+     * sequence don't cause load/free loops.
+     */
+    RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, entry->count+1);
+    r->last = entry;
+    entry->count++;
+
+    return entry;
+}
+
+/*
+ * Returns a portion of a reference sequence from start to end inclusive.
+ * The returned pointer is owned by either the cram_file fd or by the
+ * internal refs_t structure and should not be freed by the caller.
+ *
+ * The difference is whether or not this refs_t is in use by just the one
+ * cram_fd or by multiples, or whether we have multiple threads accessing
+ * references. In either case fd->shared will be true and we start using
+ * reference counting to track the number of users of a specific reference
+ * sequence.
+ *
+ * Otherwise the ref seq returned is allocated as part of cram_fd itself
+ * and will be freed up on the next call to cram_get_ref or cram_close.
+ *
+ * To return the entire reference sequence, specify start as 1 and end
+ * as 0.
+ *
+ * To cease using a reference, call cram_ref_decr().
+ *
+ * Locking: fd->ref_lock is taken on entry and fd->refs->lock before the
+ * populate step; every exit path below releases whichever of the two it
+ * holds.
+ *
+ * Returns reference on success,
+ *         NULL on failure
+ */
+char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
+    ref_entry *r;
+    char *seq;
+    int ostart = start;
+
+    if (id == -1)
+        return NULL;
+
+    /* FIXME: axiomatic query of r->seq being true?
+     * Or shortcut for unsorted data where we load once and never free?
+     */
+
+    //fd->shared_ref = 1; // hard code for now to simplify things
+
+    pthread_mutex_lock(&fd->ref_lock);
+
+    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
+
+    /*
+     * Unsorted data implies we want to fetch an entire reference at a time.
+     * We just deal with this at the moment by claiming we're sharing
+     * references instead, which has the same requirement.
+     */
+    if (fd->unsorted)
+        fd->shared_ref = 1;
+
+
+    /*
+     * Sanity checking: does this ID exist?
+     * fd->refs is tested first so that nref / ref_id are never read
+     * through a NULL pointer.
+     */
+    if (!fd->refs || id >= fd->refs->nref || !fd->refs->ref_id[id]) {
+        fprintf(stderr, "No reference found for id %d\n", id);
+        pthread_mutex_unlock(&fd->ref_lock);
+        return NULL;
+    }
+    r = fd->refs->ref_id[id];
+
+
+    /*
+     * It has an entry, but may not have been populated yet.
+     * Any manually loaded .fai files have their lengths known.
+     * A ref entry computed from @SQ lines (M5 or UR field) will have
+     * r->length == 0 unless it's been loaded once and verified that we have
+     * an on-disk filename for it.
+     *
+     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
+     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
+     * rewrite my code to have one curl handle per thread.
+     */
+    pthread_mutex_lock(&fd->refs->lock);
+    if (r->length == 0) {
+        if (cram_populate_ref(fd, id, r) == -1) {
+            fprintf(stderr, "Failed to populate reference for id %d\n", id);
+            pthread_mutex_unlock(&fd->refs->lock);
+            pthread_mutex_unlock(&fd->ref_lock);
+            return NULL;
+        }
+        /* Re-read the entry after populating it. */
+        r = fd->refs->ref_id[id];
+        if (fd->unsorted)
+            cram_ref_incr_locked(fd->refs, id);
+    }
+
+
+    /*
+     * We now know the filename containing the reference, so check
+     * for limits.  If it's over half the reference we'll load all of it in
+     * memory as this will speed up subsequent calls.
+     */
+    if (end < 1)
+        end = r->length;
+    if (end >= r->length)
+        end = r->length;
+    assert(start >= 1);
+
+    if (end - start >= 0.5*r->length || fd->shared_ref) {
+        start = 1;
+        end = r->length;
+    }
+
+    /*
+     * Maybe we have it cached already? If so use it.
+     *
+     * Alternatively if we don't have the sequence but we're sharing
+     * references and/or are asking for the entire length of it, then
+     * load the full reference into the refs structure and return
+     * a pointer to that one instead.
+     */
+    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
+        char *cp;
+
+        if (id >= 0) {
+            if (r->seq) {
+                cram_ref_incr_locked(fd->refs, id);
+            } else {
+                ref_entry *e;
+                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
+                    pthread_mutex_unlock(&fd->refs->lock);
+                    pthread_mutex_unlock(&fd->ref_lock);
+                    return NULL;
+                }
+
+                /* unsorted data implies cache ref indefinitely, to avoid
+                 * continually loading and unloading.
+                 */
+                if (fd->unsorted)
+                    cram_ref_incr_locked(fd->refs, id);
+            }
+
+            fd->ref = NULL; /* We never access it directly */
+            fd->ref_start = 1;
+            fd->ref_end = r->length;
+            fd->ref_id = id;
+
+            /* Offset by the caller's original start, not the widened one. */
+            cp = fd->refs->ref_id[id]->seq + ostart-1;
+        } else {
+            fd->ref = NULL;
+            cp = NULL;
+        }
+
+        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
+
+        pthread_mutex_unlock(&fd->refs->lock);
+        pthread_mutex_unlock(&fd->ref_lock);
+        return cp;
+    }
+
+    /*
+     * Otherwise we're not sharing, we don't have a copy of it already and
+     * we're only asking for a small portion of it.
+     *
+     * In this case load up just that segment ourselves, freeing any old
+     * small segments in the process.
+     */
+
+    /* Unmapped ref ID */
+    if (id < 0) {
+        if (fd->ref_free) {
+            free(fd->ref_free);
+            fd->ref_free = NULL;
+        }
+        fd->ref = NULL;
+        fd->ref_id = id;
+        pthread_mutex_unlock(&fd->refs->lock);
+        pthread_mutex_unlock(&fd->ref_lock);
+        return NULL;
+    }
+
+    /* Open file if it's not already the current open reference */
+    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
+        if (fd->refs->fp) {
+            if (bgzf_close(fd->refs->fp) != 0) {
+                /* Both locks are held here; release them before bailing. */
+                pthread_mutex_unlock(&fd->refs->lock);
+                pthread_mutex_unlock(&fd->ref_lock);
+                return NULL;
+            }
+        }
+        fd->refs->fn = r->fn;
+        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
+            pthread_mutex_unlock(&fd->refs->lock);
+            pthread_mutex_unlock(&fd->ref_lock);
+            return NULL;
+        }
+    }
+
+    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
+        pthread_mutex_unlock(&fd->refs->lock);
+        pthread_mutex_unlock(&fd->ref_lock);
+        return NULL;
+    }
+
+    /* Release the previously loaded private segment, if any. */
+    if (fd->ref_free)
+        free(fd->ref_free);
+
+    fd->ref_id = id;
+    fd->ref_start = start;
+    fd->ref_end = end;
+    fd->ref_free = fd->ref;
+    seq = fd->ref;
+
+    pthread_mutex_unlock(&fd->refs->lock);
+    pthread_mutex_unlock(&fd->ref_lock);
+
+    return seq + ostart - start;
+}
+
+/*
+ * Attaches reference sequences to fd.
+ *
+ * With a non-NULL 'fn' the references are loaded through refs_load_fai()
+ * and fd->ref_fn records the filename held by the resulting refs
+ * structure.  If fd has been opened for reading, it may be permitted to
+ * specify 'fn' as NULL and let the code auto-detect the reference by
+ * parsing the SAM header @SQ lines.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_load_reference(cram_fd *fd, char *fn) {
+    int status = 0;
+
+    if (fn != NULL) {
+        int flag = (!fd->embed_ref || fd->mode != 'r');
+
+        fd->refs = refs_load_fai(fd->refs, fn, flag);
+
+        /* From here on use the filename stored in the refs structure. */
+        if (fd->refs)
+            fn = fd->refs->fn;
+        else
+            fn = NULL;
+
+        if (fn == NULL)
+            status = -1;
+
+        sanitise_SQ_lines(fd);
+    }
+    fd->ref_fn = fn;
+
+    if (fd->header) {
+        int rebuild = !fd->refs || (fd->refs->nref == 0 && !fn);
+
+        /* Nothing usable loaded: build the refs from the header instead. */
+        if (rebuild) {
+            if (fd->refs)
+                refs_free(fd->refs);
+
+            fd->refs = refs_create();
+            if (!fd->refs)
+                return -1;
+
+            if (refs_from_header(fd->refs, fd, fd->header) == -1)
+                return -1;
+        }
+
+        if (refs2id(fd->refs, fd->header) == -1)
+            return -1;
+    }
+
+    return status;
+}
+
+/* ----------------------------------------------------------------------
+ * Containers
+ */
+
+/*
+ * Creates a new container, specifying the maximum number of slices
+ * and records permitted.
+ *
+ * nslice may be 0 (as used when writing the SAM header container); in
+ * that case a one-element slice array is still allocated because
+ * calloc(0, ...) is allowed to return NULL, which would otherwise be
+ * mistaken for an allocation failure.
+ *
+ * Returns cram_container ptr on success
+ *         NULL on failure
+ */
+cram_container *cram_new_container(int nrec, int nslice) {
+    cram_container *c = calloc(1, sizeof(*c));
+    enum cram_DS_ID id;
+
+    if (!c)
+        return NULL;
+
+    c->curr_ref = -2;
+
+    c->max_c_rec = nrec * nslice;
+    c->curr_c_rec = 0;
+
+    c->max_rec = nrec;
+    c->record_counter = 0;
+    c->num_bases = 0;
+
+    c->max_slice = nslice;
+    c->curr_slice = 0;
+
+    c->pos_sorted = 1;
+    c->max_apos = 0;
+    c->multi_seq = 0;
+
+    c->bams = NULL;
+
+    if (!(c->slices = calloc(nslice ? nslice : 1, sizeof(cram_slice *))))
+        goto err;
+    c->slice = NULL;
+
+    if (!(c->comp_hdr = cram_new_compression_header()))
+        goto err;
+    c->comp_hdr_block = NULL;
+
+    for (id = DS_RN; id < DS_TN; id++)
+        if (!(c->stats[id] = cram_stats_create())) goto err;
+
+    //c->aux_B_stats = cram_stats_create();
+
+    if (!(c->tags_used = kh_init(s_i2i)))
+        goto err;
+    c->refs_used = 0;
+
+    return c;
+
+ err:
+    /*
+     * c was calloc'd, so every member not yet created is still zero and
+     * cram_free_container() skips it.  This releases the compression
+     * header, stats and hash that the previous hand-written cleanup
+     * leaked.
+     */
+    cram_free_container(c);
+    return NULL;
+}
+
+/*
+ * Releases a container and everything it owns: the refs_used and
+ * landmark arrays, compression header (and its block), all slices,
+ * per-data-series stats and the tags_used hash.
+ *
+ * A NULL container is accepted and ignored.
+ */
+void cram_free_container(cram_container *c) {
+    enum cram_DS_ID ds;
+    int n;
+
+    if (c == NULL)
+        return;
+
+    /* free(NULL) is a no-op, so plain arrays need no guard. */
+    free(c->refs_used);
+    free(c->landmark);
+
+    if (c->comp_hdr)
+        cram_free_compression_header(c->comp_hdr);
+
+    if (c->comp_hdr_block)
+        cram_free_block(c->comp_hdr_block);
+
+    if (c->slices) {
+        for (n = 0; n < c->max_slice; n++) {
+            if (c->slices[n])
+                cram_free_slice(c->slices[n]);
+        }
+        free(c->slices);
+    }
+
+    for (ds = DS_RN; ds < DS_TN; ds++) {
+        if (c->stats[ds])
+            cram_stats_free(c->stats[ds]);
+    }
+
+    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
+
+    if (c->tags_used)
+        kh_destroy(s_i2i, c->tags_used);
+
+    free(c);
+}
+
+/*
+ * Reads a container header.
+ *
+ * The fixed fields are decoded into a stack copy first; the heap
+ * container is only allocated once those have been read, so the early
+ * error returns have nothing to free.  After allocation every error path
+ * releases the container.
+ *
+ * Returns cram_container on success
+ *         NULL on failure or no container left (fd->err == 0).
+ */
+cram_container *cram_read_container(cram_fd *fd) {
+    cram_container c2, *c;
+    int i, s;
+    size_t rd = 0;
+    uint32_t crc = 0;
+
+    fd->err = 0;
+    fd->eof = 0;
+
+    memset(&c2, 0, sizeof(c2));
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
+            fd->eof = fd->empty_container ? 1 : 2;
+            return NULL;
+        } else {
+            rd+=s;
+        }
+    } else {
+        uint32_t len;
+        if ((s = int32_decode(fd, &c2.length)) == -1) {
+            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
+                CRAM_MINOR_VERS(fd->version) == 0)
+                fd->eof = 1; // EOF blocks arrived in v2.1
+            else
+                fd->eof = fd->empty_container ? 1 : 2;
+            return NULL;
+        } else {
+            rd+=s;
+        }
+        /* The length is not itf8 encoded here, so seed the CRC by hand. */
+        len = le_int4(c2.length);
+        crc = crc32(0L, (unsigned char *)&len, 4);
+    }
+    if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s;
+    if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s;
+    if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s;
+    if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s;
+
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        c2.record_counter = 0;
+        c2.num_bases = 0;
+    } else {
+        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+            if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
+                return NULL;
+            else
+                rd += s;
+        } else {
+            int32_t i32;
+            if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
+                return NULL;
+            else
+                rd += s;
+            c2.record_counter = i32;
+        }
+
+        if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
+            return NULL;
+        else
+            rd += s;
+    }
+    if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s;
+    if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
+
+    /*
+     * num_landmarks comes straight from the file and is used both to size
+     * the landmark array and as max_slice; a negative count is never
+     * meaningful, so reject it before it reaches malloc().
+     */
+    if (c2.num_landmarks < 0)
+        return NULL;
+
+    if (!(c = calloc(1, sizeof(*c))))
+        return NULL;
+
+    *c = c2;
+
+    /* malloc(0) may return NULL; only treat NULL as failure if we need space */
+    if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
+        c->num_landmarks) {
+        fd->err = errno;
+        cram_free_container(c);
+        return NULL;
+    }
+    for (i = 0; i < c->num_landmarks; i++) {
+        if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
+            cram_free_container(c);
+            return NULL;
+        } else {
+            rd += s;
+        }
+    }
+
+    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
+            /* Previously returned without freeing c and its landmarks. */
+            cram_free_container(c);
+            return NULL;
+        }
+        rd+=4;
+
+        if (crc != c->crc32) {
+            fprintf(stderr, "Container header CRC32 failure\n");
+            cram_free_container(c);
+            return NULL;
+        }
+    }
+
+    c->offset = rd;
+    c->slices = NULL;
+    c->curr_slice = 0;
+    c->max_slice = c->num_landmarks;
+    c->slice_rec = 0;
+    c->curr_rec = 0;
+    c->max_rec = 0;
+
+    if (c->ref_seq_id == -2) {
+        c->multi_seq = 1;
+        fd->multi_seq = 1;
+    }
+
+    /* Remember whether this was the special EOF container. */
+    fd->empty_container =
+        (c->num_records == 0 &&
+         c->ref_seq_id == -1 &&
+         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
+
+    return c;
+}
+
+
+/*
+ * Upper bound on the number of bytes needed to serialise the container
+ * header: a fixed allowance of 55 bytes plus 5 bytes per landmark.
+ */
+int cram_container_size(cram_container *c) {
+    int landmark_bytes = 5 * c->num_landmarks;
+
+    return 55 + landmark_bytes;
+}
+
+
+/*
+ * Serialises the container header into dat[].
+ *
+ * On entry *size holds the capacity of dat and should be at least
+ * cram_container_size(c); on success it is overwritten with the number
+ * of bytes actually written.
+ *
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
+{
+    char *out = dat;
+    int lm = 0;
+    int major = CRAM_MAJOR_VERS(fd->version);
+
+    // The caller must supply at least our stated worst-case size, even
+    // though the encoded header may turn out smaller.
+    if (*size < cram_container_size(c))
+        return -1;
+
+    // Container length: itf8 in v1, fixed 32-bit little endian afterwards.
+    if (major == 1) {
+        out += itf8_put(out, c->length);
+    } else {
+        *(int32_t *)out = le_int4(c->length);
+        out += 4;
+    }
+
+    // Multi-ref containers store -2/0/0 rather than the real ref fields.
+    if (!c->multi_seq) {
+        out += itf8_put(out, c->ref_seq_id);
+        out += itf8_put(out, c->ref_seq_start);
+        out += itf8_put(out, c->ref_seq_span);
+    } else {
+        out += itf8_put(out, -2);
+        out += itf8_put(out, 0);
+        out += itf8_put(out, 0);
+    }
+
+    out += itf8_put(out, c->num_records);
+
+    // v1 has no counters; v2 uses itf8 for record_counter, v3+ ltf8.
+    if (major == 2) {
+        out += itf8_put(out, c->record_counter);
+        out += ltf8_put(out, c->num_bases);
+    } else if (major >= 3) {
+        out += ltf8_put(out, c->record_counter);
+        out += ltf8_put(out, c->num_bases);
+    }
+
+    out += itf8_put(out, c->num_blocks);
+    out += itf8_put(out, c->num_landmarks);
+    while (lm < c->num_landmarks) {
+        out += itf8_put(out, c->landmark[lm]);
+        lm++;
+    }
+
+    // v3+ appends a little-endian CRC32 of everything written so far.
+    if (major >= 3) {
+        c->crc32 = crc32(0L, (uc *)dat, out-dat);
+        out[0] = c->crc32 & 0xff;
+        out[1] = (c->crc32 >> 8) & 0xff;
+        out[2] = (c->crc32 >> 16) & 0xff;
+        out[3] = (c->crc32 >> 24) & 0xff;
+        out += 4;
+    }
+
+    *size = out-dat; // actual used size
+
+    return 0;
+}
+
+
+/*
+ * Writes a container structure.
+ *
+ * The header is first serialised into a buffer (on the stack when it is
+ * guaranteed to fit, otherwise on the heap) and then written to fd->fp
+ * in one hwrite() call.
+ *
+ * Returns 0 on success
+ *        -1 on failure (including failure to allocate the buffer)
+ */
+int cram_write_container(cram_fd *fd, cram_container *c) {
+    char buf_a[1024], *buf = buf_a, *cp;
+    int i, ret = 0;
+    int max_size = cram_container_size(c);
+
+    if (max_size >= (int)sizeof(buf_a)) {
+        /* Too big for the stack buffer; previously the malloc result was
+         * used unchecked. */
+        if (!(buf = malloc(max_size)))
+            return -1;
+    }
+    cp = buf;
+
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        cp += itf8_put(cp, c->length);
+    } else {
+        *(int32_t *)cp = le_int4(c->length);
+        cp += 4;
+    }
+    if (c->multi_seq) {
+        cp += itf8_put(cp, -2);
+        cp += itf8_put(cp, 0);
+        cp += itf8_put(cp, 0);
+    } else {
+        cp += itf8_put(cp, c->ref_seq_id);
+        cp += itf8_put(cp, c->ref_seq_start);
+        cp += itf8_put(cp, c->ref_seq_span);
+    }
+    cp += itf8_put(cp, c->num_records);
+    if (CRAM_MAJOR_VERS(fd->version) == 2) {
+        cp += itf8_put(cp, c->record_counter);
+        cp += ltf8_put(cp, c->num_bases);
+    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+        cp += ltf8_put(cp, c->record_counter);
+        cp += ltf8_put(cp, c->num_bases);
+    }
+
+    cp += itf8_put(cp, c->num_blocks);
+    cp += itf8_put(cp, c->num_landmarks);
+    for (i = 0; i < c->num_landmarks; i++)
+        cp += itf8_put(cp, c->landmark[i]);
+
+    /* v3+ appends a little-endian CRC32 of the header bytes so far. */
+    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
+        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
+        cp[0] = c->crc32 & 0xff;
+        cp[1] = (c->crc32 >> 8) & 0xff;
+        cp[2] = (c->crc32 >> 16) & 0xff;
+        cp[3] = (c->crc32 >> 24) & 0xff;
+        cp += 4;
+    }
+
+    if (cp-buf != hwrite(fd->fp, buf, cp-buf))
+        ret = -1;
+
+    if (buf != buf_a)
+        free(buf);
+
+    return ret;
+}
+
+// Shared tail of cram_flush_container{,_mt}: writes an already encoded
+// container (header, compression header block, then each slice's header
+// block and data blocks) and flushes the underlying file.
+//
+// Returns 0 on success, -1 on failure.
+static int cram_flush_container2(cram_fd *fd, cram_container *c) {
+    int si;
+
+    /* Slices are claimed but the array is missing: nothing sane to write */
+    if (!c->slices && c->curr_slice > 0)
+        return -1;
+
+    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
+
+    /* Container header first ... */
+    if (cram_write_container(fd, c) != 0)
+        return -1;
+
+    /* ... then the compression header ... */
+    if (cram_write_block(fd, c->comp_hdr_block) != 0)
+        return -1;
+
+    /* ... and finally every slice that has been filled so far. */
+    for (si = 0; si < c->curr_slice; si++) {
+        cram_slice *sl = c->slices[si];
+        int bi;
+
+        if (cram_write_block(fd, sl->hdr_block) != 0)
+            return -1;
+
+        for (bi = 0; bi < sl->hdr->num_blocks; bi++) {
+            if (cram_write_block(fd, sl->block[bi]) != 0)
+                return -1;
+        }
+    }
+
+    if (hflush(fd->fp) != 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Encodes a completely or partially full container (generating its
+ * compression header and blocks) and then writes the container
+ * structure, header and blocks to disk.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_flush_container(cram_fd *fd, cram_container *c) {
+    int encoded = cram_encode_container(fd, c);
+
+    /* Nothing is written unless encoding succeeded. */
+    if (encoded != 0)
+        return -1;
+
+    return cram_flush_container2(fd, c);
+}
+
+/*
+ * Work item passed to cram_flush_thread() via the thread pool: pairs a
+ * container with the cram_fd it is being encoded for.
+ */
+typedef struct {
+ cram_fd *fd; /* file handle the container is encoded/written for */
+ cram_container *c; /* container to encode */
+} cram_job;
+
+/*
+ * Thread pool entry point: encodes the container described by the
+ * cram_job passed in 'arg'.
+ *
+ * Returns the job pointer on success
+ *         NULL if encoding failed
+ */
+void *cram_flush_thread(void *arg) {
+    cram_job *job = arg;
+
+    /* Encode the container blocks and generate compression header */
+    if (cram_encode_container(job->fd, job->c) == 0)
+        return arg;
+
+    fprintf(stderr, "cram_encode_container failed\n");
+    return NULL;
+}
+
+/*
+ * Drains the results currently available on fd->rqueue.  Each result
+ * carries a cram_job whose encoded container is written out, after which
+ * the container and its slices are freed.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int cram_flush_result(cram_fd *fd) {
+    int status = 0;
+    t_pool_result *res;
+
+    for (;;) {
+        cram_job *job;
+        cram_container *cont;
+        int n;
+
+        res = t_pool_next_result(fd->rqueue);
+        if (!res)
+            break;
+
+        /* A NULL payload means the worker failed to encode. */
+        job = (cram_job *)res->data;
+        if (job == NULL) {
+            t_pool_delete_result(res, 0);
+            return -1;
+        }
+
+        fd = job->fd;
+        cont = job->c;
+
+        /* NOTE(review): this early return leaves 'res' (and the job and
+         * container it refers to) unreleased - confirm whether that is
+         * intentional. */
+        if (cram_flush_container2(fd, cont) != 0)
+            return -1;
+
+        /* Free the container */
+        for (n = 0; n < cont->max_slice; n++) {
+            cram_free_slice(cont->slices[n]);
+            cont->slices[n] = NULL;
+        }
+
+        cont->slice = NULL;
+        cont->curr_slice = 0;
+
+        cram_free_container(cont);
+
+        if (hflush(fd->fp) != 0)
+            status |= -1;
+
+        t_pool_delete_result(res, 1);
+    }
+
+    return status;
+}
+
+/*
+ * Multi-threaded variant of cram_flush_container().  Without a thread
+ * pool it simply calls cram_flush_container(); otherwise the container
+ * is dispatched to the pool for encoding and any results already
+ * available are written out.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
+    cram_job *job;
+
+    if (fd->pool == NULL)
+        return cram_flush_container(fd, c);
+
+    job = malloc(sizeof(*job));
+    if (job == NULL)
+        return -1;
+
+    job->c = c;
+    job->fd = fd;
+
+    t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, job);
+
+    return cram_flush_result(fd);
+}
+
+/* ----------------------------------------------------------------------
+ * Compression headers; the first part of the container
+ */
+
+/*
+ * Allocates a blank container compression header together with its TD
+ * block, TD hash and TD key string pool.
+ *
+ * Returns header ptr on success
+ *         NULL on failure (anything partially created is released)
+ */
+cram_block_compression_hdr *cram_new_compression_header(void) {
+    cram_block_compression_hdr *h = calloc(1, sizeof(*h));
+
+    if (h == NULL)
+        return NULL;
+
+    h->TD_blk = cram_new_block(CORE, 0);
+    if (!h->TD_blk)
+        goto fail_hdr;
+
+    h->TD_hash = kh_init(m_s2i);
+    if (!h->TD_hash)
+        goto fail_blk;
+
+    h->TD_keys = string_pool_create(8192);
+    if (!h->TD_keys)
+        goto fail_hash;
+
+    return h;
+
+    /* Unwind in reverse order of construction. */
+ fail_hash:
+    kh_destroy(m_s2i, h->TD_hash);
+ fail_blk:
+    cram_free_block(h->TD_blk);
+ fail_hdr:
+    free(h);
+    return NULL;
+}
+
+/*
+ * Releases a compression header and everything hanging off it: the
+ * landmark array, preservation map, both encoding-map hash chains (and
+ * their codecs), the per-data-series codecs, and the TL / TD storage.
+ *
+ * A NULL header is accepted and ignored, matching the other
+ * cram_free_*() functions in this file.
+ */
+void cram_free_compression_header(cram_block_compression_hdr *hdr) {
+    int i;
+
+    if (!hdr)
+        return;
+
+    if (hdr->landmark)
+        free(hdr->landmark);
+
+    if (hdr->preservation_map)
+        kh_destroy(map, hdr->preservation_map);
+
+    /* Record encoding map: walk each bucket's chain, saving 'next' before
+     * the node is freed. */
+    for (i = 0; i < CRAM_MAP_HASH; i++) {
+        cram_map *m, *m2;
+        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
+            m2 = m->next;
+            if (m->codec)
+                m->codec->free(m->codec);
+            free(m);
+        }
+    }
+
+    /* Tag encoding map: same structure as above. */
+    for (i = 0; i < CRAM_MAP_HASH; i++) {
+        cram_map *m, *m2;
+        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
+            m2 = m->next;
+            if (m->codec)
+                m->codec->free(m->codec);
+            free(m);
+        }
+    }
+
+    for (i = 0; i < DS_END; i++) {
+        if (hdr->codecs[i])
+            hdr->codecs[i]->free(hdr->codecs[i]);
+    }
+
+    if (hdr->TL)
+        free(hdr->TL);
+    if (hdr->TD_blk)
+        cram_free_block(hdr->TD_blk);
+    if (hdr->TD_hash)
+        kh_destroy(m_s2i, hdr->TD_hash);
+    if (hdr->TD_keys)
+        string_pool_destroy(hdr->TD_keys);
+
+    free(hdr);
+}
+
+
+/* ----------------------------------------------------------------------
+ * Slices and slice headers
+ */
+
+/*
+ * Frees a slice header and its block_content_ids array.
+ * A NULL header is accepted and ignored.
+ */
+void cram_free_slice_header(cram_block_slice_hdr *hdr) {
+    if (hdr != NULL) {
+        /* free(NULL) is a no-op, so no separate guard is needed. */
+        free(hdr->block_content_ids);
+        free(hdr);
+    }
+}
+
+/*
+ * Frees a slice along with its header block, data blocks, lookup table,
+ * decoded header, the per-type working blocks, record/feature/cigar
+ * arrays, and the pairing string pool and hashes.
+ *
+ * A NULL slice is accepted and ignored.
+ */
+void cram_free_slice(cram_slice *s) {
+    if (s == NULL)
+        return;
+
+    if (s->hdr_block)
+        cram_free_block(s->hdr_block);
+
+    if (s->block) {
+        /* The block count lives in the header, which is freed further
+         * down, so the blocks have to go first. */
+        if (s->hdr) {
+            int b;
+            for (b = 0; b < s->hdr->num_blocks; b++)
+                cram_free_block(s->block[b]);
+        }
+        free(s->block);
+    }
+
+    free(s->block_by_id);
+
+    if (s->hdr)
+        cram_free_slice_header(s->hdr);
+
+    {
+        /* Working blocks owned directly by the slice. */
+        cram_block *owned[] = {
+            s->seqs_blk,   s->qual_blk,   s->name_blk,   s->aux_blk,
+            s->aux_OQ_blk, s->aux_BQ_blk, s->aux_FZ_blk, s->aux_oq_blk,
+            s->aux_os_blk, s->aux_oz_blk, s->base_blk,   s->soft_blk
+        };
+        size_t k;
+
+        for (k = 0; k < sizeof(owned) / sizeof(owned[0]); k++) {
+            if (owned[k])
+                cram_free_block(owned[k]);
+        }
+    }
+
+    /* Plain heap arrays; free(NULL) is harmless. */
+    free(s->cigar);
+    free(s->crecs);
+    free(s->features);
+    free(s->TN);
+
+    if (s->pair_keys)
+        string_pool_destroy(s->pair_keys);
+
+    if (s->pair[0])
+        kh_destroy(m_s2i, s->pair[0]);
+    if (s->pair[1])
+        kh_destroy(m_s2i, s->pair[1]);
+
+    free(s);
+}
+
+/*
+ * Allocates an empty in-memory slice of the given content type, with
+ * room for nrecs records, ready to be filled and later written to disk.
+ *
+ * Returns cram_slice ptr on success
+ *         NULL on failure
+ */
+cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
+    cram_slice *sl = calloc(1, sizeof(*sl));
+
+    if (sl == NULL)
+        return NULL;
+
+    sl->hdr = calloc(1, sizeof(*sl->hdr));
+    if (!sl->hdr)
+        goto fail;
+    sl->hdr->content_type = type;
+
+    sl->hdr_block = NULL;
+    sl->block = NULL;
+    sl->block_by_id = NULL;
+    sl->last_apos = 0;
+
+    sl->crecs = malloc(nrecs * sizeof(cram_record));
+    if (!sl->crecs)
+        goto fail;
+
+    sl->cigar = NULL;
+    sl->cigar_alloc = 0;
+    sl->ncigar = 0;
+
+    /* One external working block per kind of data held by the slice. */
+    sl->seqs_blk = cram_new_block(EXTERNAL, 0);
+    if (!sl->seqs_blk)
+        goto fail;
+    sl->qual_blk = cram_new_block(EXTERNAL, DS_QS);
+    if (!sl->qual_blk)
+        goto fail;
+    sl->name_blk = cram_new_block(EXTERNAL, DS_RN);
+    if (!sl->name_blk)
+        goto fail;
+    sl->aux_blk = cram_new_block(EXTERNAL, DS_aux);
+    if (!sl->aux_blk)
+        goto fail;
+    sl->base_blk = cram_new_block(EXTERNAL, DS_IN);
+    if (!sl->base_blk)
+        goto fail;
+    sl->soft_blk = cram_new_block(EXTERNAL, DS_SC);
+    if (!sl->soft_blk)
+        goto fail;
+
+    sl->features = NULL;
+    sl->nfeatures = sl->afeatures = 0;
+
+#ifndef TN_external
+    sl->TN = NULL;
+    sl->nTN = sl->aTN = 0;
+#endif
+
+    // Volatile keys as we do realloc in dstring
+    sl->pair_keys = string_pool_create(8192);
+    if (!sl->pair_keys)
+        goto fail;
+    sl->pair[0] = kh_init(m_s2i);
+    if (!sl->pair[0])
+        goto fail;
+    sl->pair[1] = kh_init(m_s2i);
+    if (!sl->pair[1])
+        goto fail;
+
+#ifdef BA_external
+    sl->BA_len = 0;
+#endif
+
+    return sl;
+
+ fail:
+    /* cram_free_slice() copes with a partially built slice. */
+    if (sl)
+        cram_free_slice(sl);
+
+    return NULL;
+}
+
+/*
+ * Loads an entire slice: the slice header block followed by all of the
+ * data blocks it announces.
+ *
+ * FIXME: In 1.0 the native unit of slices within CRAM is broken
+ * as slices contain references to objects in other slices.
+ * To work around this while keeping the slice oriented outer loop
+ * we read all slices and stitch them together into a fake large
+ * slice instead.
+ *
+ * Returns cram_slice ptr on success
+ *         NULL on failure
+ */
+cram_slice *cram_read_slice(cram_fd *fd) {
+    cram_block *hdr_blk = cram_read_block(fd);
+    cram_slice *sl = calloc(1, sizeof(*sl));
+    int k, nblocks;
+    int lowest_id = INT_MAX, highest_id = 0;
+
+    if (!hdr_blk || !sl)
+        goto fail;
+
+    sl->hdr_block = hdr_blk;
+
+    /* Only mapped/unmapped slice header blocks are acceptable here. */
+    if (hdr_blk->content_type != MAPPED_SLICE &&
+        hdr_blk->content_type != UNMAPPED_SLICE) {
+        fprintf(stderr, "Unexpected block of type %s\n",
+                cram_content_type2str(hdr_blk->content_type));
+        goto fail;
+    }
+
+    sl->hdr = cram_decode_slice_header(fd, hdr_blk);
+    if (!sl->hdr)
+        goto fail;
+
+    if (sl->hdr->num_blocks < 1) {
+        fprintf(stderr, "Slice does not include any data blocks.\n");
+        goto fail;
+    }
+
+    nblocks = sl->hdr->num_blocks;
+    sl->block = calloc(nblocks, sizeof(*sl->block));
+    if (!sl->block)
+        goto fail;
+
+    /* Read every block, tracking the range of EXTERNAL content ids. */
+    for (k = 0; k < nblocks; k++) {
+        cram_block *blk = cram_read_block(fd);
+
+        sl->block[k] = blk;
+        if (!blk)
+            goto fail;
+
+        if (blk->content_type != EXTERNAL)
+            continue;
+
+        if (blk->content_id > highest_id)
+            highest_id = blk->content_id;
+        if (blk->content_id < lowest_id)
+            lowest_id = blk->content_id;
+    }
+
+    /* Build a direct id -> block table when all ids fit in 0..1023. */
+    if (lowest_id >= 0 && highest_id < 1024) {
+        sl->block_by_id = calloc(1024, sizeof(sl->block[0]));
+        if (!sl->block_by_id)
+            goto fail;
+
+        for (k = 0; k < nblocks; k++) {
+            if (sl->block[k]->content_type == EXTERNAL)
+                sl->block_by_id[sl->block[k]->content_id] = sl->block[k];
+        }
+    }
+
+    /* Initialise encoding/decoding tables */
+    sl->cigar = NULL;
+    sl->cigar_alloc = 0;
+    sl->ncigar = 0;
+
+    sl->seqs_blk = cram_new_block(EXTERNAL, 0);
+    if (!sl->seqs_blk)
+        goto fail;
+    sl->qual_blk = cram_new_block(EXTERNAL, DS_QS);
+    if (!sl->qual_blk)
+        goto fail;
+    sl->name_blk = cram_new_block(EXTERNAL, DS_RN);
+    if (!sl->name_blk)
+        goto fail;
+    sl->aux_blk = cram_new_block(EXTERNAL, DS_aux);
+    if (!sl->aux_blk)
+        goto fail;
+    sl->base_blk = cram_new_block(EXTERNAL, DS_IN);
+    if (!sl->base_blk)
+        goto fail;
+    sl->soft_blk = cram_new_block(EXTERNAL, DS_SC);
+    if (!sl->soft_blk)
+        goto fail;
+
+    sl->crecs = NULL;
+
+    sl->last_apos = sl->hdr->ref_seq_start;
+
+    return sl;
+
+ fail:
+    /* The header block is freed here, so detach it from the slice first
+     * to stop cram_free_slice() freeing it a second time. */
+    if (hdr_blk)
+        cram_free_block(hdr_blk);
+    if (sl) {
+        sl->hdr_block = NULL;
+        cram_free_slice(sl);
+    }
+    return NULL;
+}
+
+
+/* ----------------------------------------------------------------------
+ * CRAM file definition (header)
+ */
+
+/*
+ * Reads the 26-byte CRAM file definition from the start of the file and
+ * checks its "CRAM" magic and major version (at most 3).
+ *
+ * Returns file_def ptr on success
+ *         NULL on failure
+ */
+cram_file_def *cram_read_file_def(cram_fd *fd) {
+    cram_file_def *fdef = malloc(sizeof(*fdef));
+
+    if (fdef == NULL)
+        return NULL;
+
+    /* Short read or wrong magic: not a CRAM file. */
+    if (hread(fd->fp, &fdef->magic[0], 26) != 26 ||
+        memcmp(fdef->magic, "CRAM", 4) != 0)
+        goto fail;
+
+    if (fdef->major_version > 3) {
+        fprintf(stderr, "CRAM version number mismatch\n"
+                "Expected 1.x, 2.x or 3.x, got %d.%d\n",
+                fdef->major_version, fdef->minor_version);
+        goto fail;
+    }
+
+    /* Account for the bytes consumed so far. */
+    fd->first_container += 26;
+    fd->last_slice = 0;
+
+    return fdef;
+
+ fail:
+    free(fdef);
+    return NULL;
+}
+
+/*
+ * Writes the 26-byte cram_file_def structure to the cram_fd.
+ *
+ * Returns 0 on success
+ *        -1 on failure (short write)
+ */
+int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
+    if (hwrite(fd->fp, &def->magic[0], 26) != 26)
+        return -1;
+
+    return 0;
+}
+
+/* Frees a cram_file_def; a NULL pointer is accepted and ignored. */
+void cram_free_file_def(cram_file_def *def) {
+    free(def);
+}
+
+/* ----------------------------------------------------------------------
+ * SAM header I/O
+ */
+
+
+/*
+ * Reads the SAM header from the first CRAM data block.
+ * Also performs minimal parsing to extract read-group
+ * and sample information.
+ *
+ * In version 1 files the header is a length-prefixed text blob; from
+ * 2.0 onwards it lives in the first block of a dedicated container, and
+ * any further blocks / padding in that container are read and discarded.
+ *
+ * Every error path releases the header text, block, padding buffer and
+ * container it has acquired so far.
+ *
+ * Returns SAM hdr ptr on success
+ *         NULL on failure
+ */
+SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
+    int32_t header_len;
+    char *header;
+    SAM_hdr *hdr;
+
+    /* 1.1 onwards stores the header in the first block of a container */
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        /* Length */
+        if (-1 == int32_decode(fd, &header_len))
+            return NULL;
+
+        /* Alloc and read */
+        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
+            return NULL;
+
+        if (header_len != hread(fd->fp, header, header_len)) {
+            free(header);
+            return NULL;
+        }
+        header[header_len] = '\0';
+
+        fd->first_container += 4 + header_len;
+    } else {
+        cram_container *c = cram_read_container(fd);
+        cram_block *b;
+        int i, len;
+
+        if (!c)
+            return NULL;
+
+        if (c->num_blocks < 1) {
+            cram_free_container(c);
+            return NULL;
+        }
+
+        if (!(b = cram_read_block(fd))) {
+            cram_free_container(c);
+            return NULL;
+        }
+        if (cram_uncompress_block(b) != 0) {
+            cram_free_container(c);
+            cram_free_block(b);
+            return NULL;
+        }
+
+        /* Bytes this block occupied on disk; used to size the padding. */
+        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
+            itf8_size(b->content_id) +
+            itf8_size(b->uncomp_size) +
+            itf8_size(b->comp_size);
+
+        /* Extract header from 1st block */
+        if (-1 == int32_get_blk(b, &header_len) ||
+            header_len < 0 || /* Spec. says signed... why? */
+            b->uncomp_size - 4 < header_len) {
+            cram_free_container(c);
+            cram_free_block(b);
+            return NULL;
+        }
+        if (NULL == (header = malloc((size_t) header_len+1))) {
+            cram_free_container(c);
+            cram_free_block(b);
+            return NULL;
+        }
+        memcpy(header, BLOCK_END(b), header_len);
+        header[header_len]='\0';
+        cram_free_block(b);
+
+        /* Consume any remaining blocks */
+        for (i = 1; i < c->num_blocks; i++) {
+            if (!(b = cram_read_block(fd))) {
+                free(header);
+                cram_free_container(c);
+                return NULL;
+            }
+            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
+                itf8_size(b->content_id) +
+                itf8_size(b->uncomp_size) +
+                itf8_size(b->comp_size);
+            cram_free_block(b);
+        }
+
+        if (c->length > 0 && len > 0 && c->length > len) {
+            // Consume padding
+            char *pads = malloc(c->length - len);
+            if (!pads) {
+                free(header);
+                cram_free_container(c);
+                return NULL;
+            }
+
+            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
+                free(pads);
+                free(header);
+                cram_free_container(c);
+                return NULL;
+            }
+            free(pads);
+        }
+
+        cram_free_container(c);
+    }
+
+    /* Parse */
+    hdr = sam_hdr_parse_(header, header_len);
+    free(header);
+
+    return hdr;
+}
+
+/*
+ * Converts 'in' to a full pathname to store in out.
+ * Out must be at least PATH_MAX bytes long.
+ *
+ * Absolute paths are copied as-is.  Relative paths are prefixed with the
+ * current working directory; if that cannot be obtained, or the joined
+ * path would not fit, 'in' is copied unchanged instead.
+ */
+static void full_path(char *out, char *in) {
+    int cwd_len = 0;
+    int verbatim = (*in == '/');
+
+    if (!verbatim) {
+        if (!getcwd(out, PATH_MAX)) {
+            // unable to get dir
+            verbatim = 1;
+        } else {
+            cwd_len = strlen(out);
+            // out+in is too long
+            if (cwd_len + 1 + strlen(in) >= PATH_MAX)
+                verbatim = 1;
+        }
+    }
+
+    if (verbatim) {
+        strncpy(out, in, PATH_MAX);
+        out[PATH_MAX-1] = 0;
+        return;
+    }
+
+    sprintf(out + cwd_len, "/%.*s", PATH_MAX - cwd_len, in);
+
+    // FIXME: cope with `pwd`/../../../foo.fa ?
+}
+
+/*
+ * Writes a CRAM SAM header.
+ *
+ * Steps, in order:
+ *  - write the file definition if its major_version is still 0;
+ *  - for version 1, make sure an "UNKNOWN" read-group exists;
+ *  - when references are available, add missing M5 tags (computed from
+ *    the reference sequence) and a UR tag to every @SQ line;
+ *  - rebuild the header text and write it, either as a length-prefixed
+ *    blob (version 1) or inside its own container (later versions), with
+ *    extra blank space after it;
+ *  - rebuild the refs tables from fd->header and flush.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
+    int header_len;
+    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
+
+    /* Write CRAM MAGIC if not yet written. */
+    if (fd->file_def->major_version == 0) {
+        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
+        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
+        if (0 != cram_write_file_def(fd, fd->file_def))
+            return -1;
+    }
+
+    /* 1.0 requires an UNKNOWN read-group */
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
+            if (sam_hdr_add(hdr, "RG",
+                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
+                return -1;
+    }
+
+    /* Fix M5 strings */
+    if (fd->refs && !fd->no_ref) {
+        int i;
+        for (i = 0; i < hdr->nref; i++) {
+            SAM_hdr_type *ty;
+            char *ref;
+
+            if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
+                return -1;
+
+            if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
+                char unsigned buf[16];
+                char buf2[33];
+                int rlen;
+                hts_md5_context *md5;
+
+                if (!fd->refs ||
+                    !fd->refs->ref_id ||
+                    !fd->refs->ref_id[i]) {
+                    return -1;
+                }
+                rlen = fd->refs->ref_id[i]->length;
+                if (!(md5 = hts_md5_init()))
+                    return -1;
+                ref = cram_get_ref(fd, i, 1, rlen);
+                if (NULL == ref) {
+                    /* Don't leak the md5 context on failure. */
+                    hts_md5_destroy(md5);
+                    return -1;
+                }
+                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
+                hts_md5_update(md5, ref, rlen);
+                hts_md5_final(buf, md5);
+                hts_md5_destroy(md5);
+                cram_ref_decr(fd->refs, i);
+
+                hts_md5_hex(buf2, buf);
+                if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
+                    return -1;
+            }
+
+            if (fd->ref_fn) {
+                char ref_fn[PATH_MAX];
+                full_path(ref_fn, fd->ref_fn);
+                if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
+                    return -1;
+            }
+        }
+    }
+
+    if (sam_hdr_rebuild(hdr))
+        return -1;
+
+    /* Length */
+    header_len = sam_hdr_length(hdr);
+    if (CRAM_MAJOR_VERS(fd->version) == 1) {
+        if (-1 == int32_encode(fd, header_len))
+            return -1;
+
+        /* Text data */
+        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
+            return -1;
+    } else {
+        /* Create block(s) inside a container */
+        cram_block *b = cram_new_block(FILE_HEADER, 0);
+        cram_container *c = cram_new_container(0, 0);
+        int padded_length;
+        char *pads;
+        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
+
+        if (!b || !c) {
+            if (b) cram_free_block(b);
+            if (c) cram_free_container(c);
+            return -1;
+        }
+
+        int32_put_blk(b, header_len);
+        BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
+        BLOCK_UPLEN(b);
+
+        // Compress header block if V3.0 and above
+        if (CRAM_MAJOR_VERS(fd->version) >= 3)
+            cram_compress_block(fd, b, NULL, -1, -1);
+
+        if (blank_block) {
+            c->length = b->comp_size + 2 + 4*is_cram_3 +
+                itf8_size(b->content_id) +
+                itf8_size(b->uncomp_size) +
+                itf8_size(b->comp_size);
+
+            c->num_blocks = 2;
+            c->num_landmarks = 2;
+            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
+                cram_free_block(b);
+                cram_free_container(c);
+                return -1;
+            }
+            c->landmark[0] = 0;
+            c->landmark[1] = c->length;
+
+            // Plus extra storage for uncompressed secondary blank block
+            padded_length = MIN(c->length*.5, 10000);
+            c->length += padded_length + 2 + 4*is_cram_3 +
+                itf8_size(b->content_id) +
+                itf8_size(padded_length)*2;
+        } else {
+            // Pad the block instead.
+            c->num_blocks = 1;
+            c->num_landmarks = 1;
+            if (!(c->landmark = malloc(sizeof(*c->landmark)))) {
+                /* Release the block and container, as the other error
+                 * paths in this function do. */
+                cram_free_block(b);
+                cram_free_container(c);
+                return -1;
+            }
+            c->landmark[0] = 0;
+
+            /* NOTE(review): c->length has not been assigned yet in this
+             * branch (the container was freshly calloc'd), so this
+             * evaluates to 10000 - confirm that is intended. */
+            padded_length = MAX(c->length*1.5, 10000) - c->length;
+
+            c->length = b->comp_size + padded_length +
+                2 + 4*is_cram_3 +
+                itf8_size(b->content_id) +
+                itf8_size(b->uncomp_size) +
+                itf8_size(b->comp_size);
+
+            if (NULL == (pads = calloc(1, padded_length))) {
+                cram_free_block(b);
+                cram_free_container(c);
+                return -1;
+            }
+            BLOCK_APPEND(b, pads, padded_length);
+            BLOCK_UPLEN(b);
+            free(pads);
+        }
+
+        if (-1 == cram_write_container(fd, c)) {
+            cram_free_block(b);
+            cram_free_container(c);
+            return -1;
+        }
+
+        if (-1 == cram_write_block(fd, b)) {
+            cram_free_block(b);
+            cram_free_container(c);
+            return -1;
+        }
+
+        if (blank_block) {
+            /* Reuse b as the zero-filled, uncompressed second block. */
+            BLOCK_RESIZE(b, padded_length);
+            memset(BLOCK_DATA(b), 0, padded_length);
+            BLOCK_SIZE(b) = padded_length;
+            BLOCK_UPLEN(b);
+            b->method = RAW;
+            if (-1 == cram_write_block(fd, b)) {
+                cram_free_block(b);
+                cram_free_container(c);
+                return -1;
+            }
+        }
+
+        cram_free_block(b);
+        cram_free_container(c);
+    }
+
+    if (-1 == refs_from_header(fd->refs, fd, fd->header))
+        return -1;
+    if (-1 == refs2id(fd->refs, fd->header))
+        return -1;
+
+    if (0 != hflush(fd->fp))
+        return -1;
+
+    RP("=== Finishing saving header ===\n");
+
+    return 0;
+}
+
+/* ----------------------------------------------------------------------
+ * The top-level cram opening, closing and option handling
+ */
+
+/*
+ * Fills in the lookup tables stored inside the cram_fd: the base-to-code
+ * maps (L1 and L2), the BAM <-> CRAM flag translation tables and the
+ * substitution matrix.
+ *
+ * These could be global statics, but they're clumsy to set up in a
+ * multi-threaded environment unless we generate verbatim code and include
+ * that, so each descriptor carries its own copy.
+ */
+static void cram_init_tables(cram_fd *fd) {
+    static const char upper[] = "ACGTN";
+    static const char lower[] = "acgtn";
+    const struct {
+        int cram_bit;
+        int bam_bit;
+    } flag_map[] = {
+        { CRAM_FPAIRED,      BAM_FPAIRED      },
+        { CRAM_FPROPER_PAIR, BAM_FPROPER_PAIR },
+        { CRAM_FUNMAP,       BAM_FUNMAP       },
+        { CRAM_FREVERSE,     BAM_FREVERSE     },
+        { CRAM_FREAD1,       BAM_FREAD1       },
+        { CRAM_FREAD2,       BAM_FREAD2       },
+        { CRAM_FSECONDARY,   BAM_FSECONDARY   },
+        { CRAM_FQCFAIL,      BAM_FQCFAIL      },
+        { CRAM_FDUP,         BAM_FDUP         },
+    };
+    const int nflags = (int)(sizeof(flag_map) / sizeof(flag_map[0]));
+    int b, k, v;
+
+    /* L1: A,C,G,T (either case) -> 0..3, everything else -> 4 */
+    memset(fd->L1, 4, 256);
+    for (b = 0; b < 4; b++) {
+        fd->L1[(int)upper[b]] = b;
+        fd->L1[(int)lower[b]] = b;
+    }
+
+    /* L2: A,C,G,T,N (either case) -> 0..4, everything else -> 5 */
+    memset(fd->L2, 5, 256);
+    for (b = 0; b < 5; b++) {
+        fd->L2[(int)upper[b]] = b;
+        fd->L2[(int)lower[b]] = b;
+    }
+
+    if (CRAM_MAJOR_VERS(fd->version) != 1) {
+        /* Later versions: both translations are the identity */
+        for (v = 0; v < 0x1000; v++)
+            fd->bam_flag_swap[v] = v;
+        for (v = 0; v < 0x1000; v++)
+            fd->cram_flag_swap[v] = v;
+    } else {
+        /* Version 1: translate bit by bit; only 0x200 CRAM values are set */
+        for (v = 0; v < 0x200; v++) {
+            int bam = 0;
+
+            for (k = 0; k < nflags; k++) {
+                if (v & flag_map[k].cram_bit)
+                    bam |= flag_map[k].bam_bit;
+            }
+            fd->bam_flag_swap[v] = bam;
+        }
+
+        for (v = 0; v < 0x1000; v++) {
+            int cram = 0;
+
+            for (k = 0; k < nflags; k++) {
+                if (v & flag_map[k].bam_bit)
+                    cram |= flag_map[k].cram_bit;
+            }
+            fd->cram_flag_swap[v] = cram;
+        }
+    }
+
+    /* Substitution matrix: default every cell to 4, then the base columns */
+    memset(fd->cram_sub_matrix, 4, 32*32);
+    for (v = 0; v < 32; v++) {
+        for (b = 0; b < 5; b++)
+            fd->cram_sub_matrix[v][upper[b] & 0x1f] = b;
+    }
+
+    /* Rows for A,C,G,T,N: first 20 columns to 3, then the ordering taken
+     * from CRAM_SUBST_MATRIX (four entries per reference base). */
+    for (b = 0; b < 5; b++) {
+        const int row = upper[b] & 0x1f;
+
+        for (k = 0; k < 20; k++)
+            fd->cram_sub_matrix[row][k] = 3;
+
+        for (k = 0; k < 4; k++)
+            fd->cram_sub_matrix[row][CRAM_SUBST_MATRIX[4*b + k] & 0x1f] = k;
+    }
+}
+
+// Default version numbers for CRAM.
+// cram_dopen() combines them as major_version * 256 + minor_version to set
+// fd->version on descriptors that are not opened for reading.
+static int major_version = 3;
+static int minor_version = 0;
+
+/*
+ * Opens a CRAM file for read (mode "rb") or write ("wb").
+ * The filename may be "-" to indicate stdin or stdout.
+ *
+ * Only the first mode character, plus a 'b' when the second character is
+ * 'b' or 'c', is handed to hopen(); the complete mode string is passed on
+ * to cram_dopen().
+ *
+ * Returns file handle on success
+ *         NULL on failure.
+ */
+cram_fd *cram_open(const char *filename, const char *mode) {
+    char hmode[3] = { '\0', '\0', '\0' };
+    hFILE *stream;
+    cram_fd *cram;
+
+    hmode[0] = mode[0];
+    if (mode[0] != '\0' && (mode[1] == 'b' || mode[1] == 'c'))
+        hmode[1] = 'b';
+
+    stream = hopen(filename, hmode);
+    if (stream == NULL)
+        return NULL;
+
+    cram = cram_dopen(stream, filename, mode);
+    if (cram == NULL) {
+        /* cram_dopen() failed, so the stream is still ours to dispose of */
+        hclose_abruptly(stream);
+    }
+
+    return cram;
+}
+
+/* Opens an existing stream for reading or writing.
+ *
+ * fp       - already opened hFILE; on failure it is NOT closed here, the
+ *            caller keeps ownership (cram_open() closes it itself).
+ * filename - used for the file_def file_id (writers) and for the default
+ *            prefix (basename after the last '/').
+ * mode     - first character selects reader ('r') or writer (anything
+ *            else); the first digit found, if any, sets the compression
+ *            level (default 5).
+ *
+ * On any failure everything allocated so far is released before returning.
+ *
+ * Returns file handle on success;
+ *         NULL on failure.
+ */
+cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
+    int i;
+    char *cp;
+    cram_fd *fd = calloc(1, sizeof(*fd));
+    if (!fd)
+        return NULL;
+
+    fd->level = 5;
+    for (i = 0; mode[i]; i++) {
+        if (mode[i] >= '0' && mode[i] <= '9') {
+            fd->level = mode[i] - '0';
+            break;
+        }
+    }
+
+    fd->fp = fp;
+    fd->mode = *mode;
+    fd->first_container = 0;
+
+    if (fd->mode == 'r') {
+        /* Reader */
+
+        if (!(fd->file_def = cram_read_file_def(fd)))
+            goto err;
+
+        fd->version = fd->file_def->major_version * 256 +
+            fd->file_def->minor_version;
+
+        if (!(fd->header = cram_read_SAM_hdr(fd)))
+            goto err;
+
+    } else {
+        /* Writer */
+        cram_file_def *def = calloc(1, sizeof(*def));
+        if (!def)
+            goto err;   /* previously returned directly, leaking fd */
+
+        fd->file_def = def;
+
+        def->magic[0] = 'C';
+        def->magic[1] = 'R';
+        def->magic[2] = 'A';
+        def->magic[3] = 'M';
+        def->major_version = 0; // Indicator to write file def later.
+        def->minor_version = 0;
+        /* file_id is a fixed 20 byte field: zero padded, and not
+         * NUL-terminated when the filename is 20 bytes or longer. */
+        memset(def->file_id, 0, 20);
+        strncpy(def->file_id, filename, 20);
+
+        fd->version = major_version * 256 + minor_version;
+
+        /* SAM header written later along with this file_def */
+    }
+
+    cram_init_tables(fd);
+
+    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
+    if (!fd->prefix)
+        goto err;
+    fd->first_base = fd->last_base = -1;
+    fd->record_counter = 0;
+
+    fd->ctr = NULL;
+    fd->refs = refs_create();
+    if (!fd->refs)
+        goto err;
+    fd->ref_id = -2;
+    fd->ref = NULL;
+
+    fd->decode_md = 0;
+    fd->verbose = 0;
+    fd->seqs_per_slice = SEQS_PER_SLICE;
+    fd->slices_per_container = SLICE_PER_CNT;
+    fd->embed_ref = 0;
+    fd->no_ref = 0;
+    fd->ignore_md5 = 0;
+    fd->use_bz2 = 0;
+    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
+    fd->use_lzma = 0;
+    fd->multi_seq = -1;
+    fd->unsorted = 0;
+    fd->shared_ref = 0;
+
+    fd->index = NULL;
+    fd->own_pool = 0;
+    fd->pool = NULL;
+    fd->rqueue = NULL;
+    fd->job_pending = NULL;
+    fd->ooc = 0;
+    fd->required_fields = INT_MAX;
+
+    for (i = 0; i < DS_END; i++) {
+        if (!(fd->m[i] = cram_new_metrics()))
+            goto err;
+    }
+
+    fd->range.refid = -2; // no ref.
+    fd->eof = 1;          // See samtools issue #150
+    fd->ref_fn = NULL;
+
+    fd->bl = NULL;
+
+    /* Initialise dummy refs from the @SQ headers */
+    if (-1 == refs_from_header(fd->refs, fd, fd->header))
+        goto err;
+
+    return fd;
+
+ err:
+    /*
+     * fd came from calloc(), so any member not yet assigned is still
+     * NULL.  Release in the same order cram_close() uses.  fd->fp is
+     * deliberately left open: it belongs to the caller.
+     */
+    if (fd->file_def)
+        cram_free_file_def(fd->file_def);
+
+    if (fd->header)
+        sam_hdr_free(fd->header);
+
+    free(fd->prefix);
+
+    if (fd->refs)
+        refs_free(fd->refs);
+
+    for (i = 0; i < DS_END; i++)
+        free(fd->m[i]);
+
+    free(fd);
+
+    return NULL;
+}
+
+/*
+ * Seek within a CRAM file.
+ *
+ * Tries hseek() first.  If that fails and the request was a forward
+ * relative seek (SEEK_CUR with a non-negative offset), the bytes are
+ * read and discarded instead, so non-seekable streams can still skip
+ * ahead.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_seek(cram_fd *fd, off_t offset, int whence) {
+    char scratch[65536];
+
+    fd->ooc = 0;
+
+    if (hseek(fd->fp, offset, whence) >= 0)
+        return 0;
+
+    /* Only a forward SEEK_CUR can be emulated by reading */
+    if (whence != SEEK_CUR || offset < 0)
+        return -1;
+
+    for (; offset > 0; ) {
+        int chunk = MIN(65536, offset);
+
+        if (hread(fd->fp, scratch, chunk) != chunk)
+            return -1;
+
+        offset -= chunk;
+    }
+
+    return 0;
+}
+
+/*
+ * Flushes a CRAM file.
+ * Useful for when writing to stdout without wishing to close the stream.
+ *
+ * Does nothing unless the descriptor is in write mode with a current
+ * container.
+ *
+ * Returns 0 on success
+ *        -1 on failure (including a NULL fd)
+ */
+int cram_flush(cram_fd *fd) {
+    if (!fd)
+        return -1;
+
+    /* No pending container to push out */
+    if (fd->mode != 'w' || !fd->ctr)
+        return 0;
+
+    /* Account for the slice in progress before flushing */
+    if (fd->ctr->slice)
+        fd->ctr->curr_slice++;
+
+    return (-1 == cram_flush_container_mt(fd, fd->ctr)) ? -1 : 0;
+}
+
+/*
+ * Closes a CRAM file.
+ *
+ * In order: flushes any pending container (writers), drains the thread
+ * pool and its result queue if one is attached, writes the EOF block
+ * (writers), frees the spare bam lists, closes the underlying stream and
+ * finally releases everything owned by fd, including fd itself.
+ *
+ * Note that every early "return -1" below happens before free(fd), so on
+ * failure the descriptor and whatever it still owns are not released here.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int cram_close(cram_fd *fd) {
+ spare_bams *bl, *next;
+ int i;
+
+ if (!fd)
+ return -1;
+
+ /* Writer with a current container: same flush sequence as cram_flush() */
+ if (fd->mode == 'w' && fd->ctr) {
+ if(fd->ctr->slice)
+ fd->ctr->curr_slice++;
+ if (-1 == cram_flush_container_mt(fd, fd->ctr))
+ return -1;
+ }
+
+ /* Thread pool attached: wait for outstanding jobs, collect their
+ * results, then tear down the locks and the results queue. */
+ if (fd->pool && fd->eof >= 0) {
+ t_pool_flush(fd->pool);
+
+ if (0 != cram_flush_result(fd))
+ return -1;
+
+ pthread_mutex_destroy(&fd->metrics_lock);
+ pthread_mutex_destroy(&fd->ref_lock);
+ pthread_mutex_destroy(&fd->bam_list_lock);
+
+ fd->ctr = NULL; // prevent double freeing
+
+ //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
+
+ t_results_queue_destroy(fd->rqueue);
+ }
+
+ if (fd->mode == 'w') {
+ /* Write EOF block */
+ /* Fixed byte sequences: 38 bytes (with CRC32 fields) for major
+ * version 3, 30 bytes for all other versions. */
+ if (CRAM_MAJOR_VERS(fd->version) == 3) {
+ if (38 != hwrite(fd->fp,
+ "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
+ "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
+ "\x00\x01\x00" // Cont HDR
+ "\x05\xbd\xd9\x4f" // CRC32
+ "\x00\x01\x00\x06\x06" // Comp.HDR blk
+ "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
+ "\xee\x63\x01\x4b", // CRC32
+ 38))
+ return -1;
+ } else {
+ if (30 != hwrite(fd->fp,
+ "\x0b\x00\x00\x00\xff\xff\xff\xff"
+ "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
+ "\x00\x01\x00\x00\x01\x00\x06\x06"
+ "\x01\x00\x01\x00\x01\x00", 30))
+ return -1;
+ }
+ }
+
+ /* Free the linked list of spare bam arrays.  Each array is walked for
+ * seqs_per_slice * slices_per_container entries. */
+ for (bl = fd->bl; bl; bl = next) {
+ /* NB: this 'i' shadows the function-level 'i' declared above */
+ int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
+
+ next = bl->next;
+ for (i = 0; i < max_rec; i++) {
+ if (bl->bams[i])
+ bam_free(bl->bams[i]);
+ }
+ free(bl->bams);
+ free(bl);
+ }
+
+ /* Close the stream before freeing; a failed close returns early */
+ if (hclose(fd->fp) != 0)
+ return -1;
+
+ if (fd->file_def)
+ cram_free_file_def(fd->file_def);
+
+ if (fd->header)
+ sam_hdr_free(fd->header);
+
+ free(fd->prefix);
+
+ if (fd->ctr)
+ cram_free_container(fd->ctr);
+
+ if (fd->refs)
+ refs_free(fd->refs);
+ if (fd->ref_free)
+ free(fd->ref_free);
+
+ for (i = 0; i < DS_END; i++)
+ if (fd->m[i])
+ free(fd->m[i]);
+
+ if (fd->index)
+ cram_index_free(fd);
+
+ /* Only destroy a pool this descriptor created itself (own_pool) */
+ if (fd->own_pool && fd->pool)
+ t_pool_destroy(fd->pool, 0);
+
+ free(fd);
+ return 0;
+}
+
+/*
+ * Returns the descriptor's end-of-file status, i.e. fd->eof unchanged.
+ *
+ * The declaration in cram_io.h documents the values as 0 (not at end of
+ * file), 1 (expected EOF: end of range or EOF block) and 2 (end of stream
+ * without an EOF block).  cram_dopen() initialises the field to 1.
+ * NOTE(review): cram_close() tests fd->eof >= 0, so negative values
+ * presumably occur as well - confirm against the code that sets fd->eof.
+ */
+int cram_eof(cram_fd *fd) {
+ return fd->eof;
+}
+
+
+/*
+ * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
+ * Use this immediately after opening.
+ *
+ * Variadic front end: the option's value is collected into a va_list and
+ * handed to cram_set_voption(), whose result is returned as is.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
+    va_list ap;
+    int status;
+
+    va_start(ap, opt);
+    status = cram_set_voption(fd, opt, ap);
+    va_end(ap);
+
+    return status;
+}
+
+/*
+ * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
+ * Use this immediately after opening.
+ *
+ * fd   - descriptor to modify; NULL fails with errno = EBADF.
+ * opt  - option code selecting which field(s) to change.
+ * args - va_list holding the option's single argument; its type depends
+ *        on opt (int, char *, refs_t *, cram_range * or t_pool *).
+ *
+ * NULL pointer arguments for CRAM_OPT_PREFIX, CRAM_OPT_SHARED_REF,
+ * CRAM_OPT_RANGE and CRAM_OPT_VERSION are rejected with errno = EINVAL
+ * rather than being dereferenced.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
+    refs_t *refs;
+
+    if (!fd) {
+        errno = EBADF;
+        return -1;
+    }
+
+    switch (opt) {
+    case CRAM_OPT_DECODE_MD:
+        fd->decode_md = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_PREFIX: {
+        const char *prefix = va_arg(args, char *);
+        char *copy;
+
+        if (!prefix) {
+            errno = EINVAL;
+            return -1;
+        }
+        /* Duplicate first so the old prefix survives an allocation failure */
+        if (!(copy = strdup(prefix)))
+            return -1;
+        free(fd->prefix);
+        fd->prefix = copy;
+        break;
+    }
+
+    case CRAM_OPT_VERBOSITY:
+        fd->verbose = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_SEQS_PER_SLICE:
+        fd->seqs_per_slice = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_SLICES_PER_CONTAINER:
+        fd->slices_per_container = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_EMBED_REF:
+        fd->embed_ref = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_NO_REF:
+        fd->no_ref = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_IGNORE_MD5:
+        fd->ignore_md5 = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_USE_BZIP2:
+        fd->use_bz2 = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_USE_RANS:
+        fd->use_rans = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_USE_LZMA:
+        fd->use_lzma = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_SHARED_REF:
+        refs = va_arg(args, refs_t *);
+        if (!refs) {
+            /* Would otherwise be dereferenced by refs->count++ below */
+            errno = EINVAL;
+            return -1;
+        }
+        fd->shared_ref = 1;
+        if (refs != fd->refs) {
+            if (fd->refs)
+                refs_free(fd->refs);
+            fd->refs = refs;
+            fd->refs->count++;
+        }
+        break;
+
+    case CRAM_OPT_RANGE: {
+        cram_range *range = va_arg(args, cram_range *);
+
+        if (!range) {
+            errno = EINVAL;
+            return -1;
+        }
+        fd->range = *range;
+        return cram_seek_to_refpos(fd, &fd->range);
+    }
+
+    case CRAM_OPT_REFERENCE:
+        return cram_load_reference(fd, va_arg(args, char *));
+
+    case CRAM_OPT_VERSION: {
+        int major, minor;
+        const char *s = va_arg(args, char *);
+
+        if (!s || 2 != sscanf(s, "%d.%d", &major, &minor)) {
+            fprintf(stderr, "Malformed version string %s\n",
+                    s ? s : "(null)");
+            errno = EINVAL;
+            return -1;
+        }
+        if (!((major == 1 &&  minor == 0) ||
+              (major == 2 && (minor == 0 || minor == 1)) ||
+              (major == 3 &&  minor == 0))) {
+            fprintf(stderr, "Unknown version string; "
+                    "use 1.0, 2.0, 2.1 or 3.0\n");
+            errno = EINVAL;
+            return -1;
+        }
+        fd->version = major*256 + minor;
+
+        if (CRAM_MAJOR_VERS(fd->version) >= 3)
+            fd->use_rans = 1;
+        break;
+    }
+
+    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
+        fd->multi_seq = va_arg(args, int);
+        break;
+
+    case CRAM_OPT_NTHREADS: {
+        int nthreads = va_arg(args, int);
+        if (nthreads > 1) {
+            if (!(fd->pool = t_pool_init(nthreads*2, nthreads)))
+                return -1;
+
+            fd->rqueue = t_results_queue_init();
+            pthread_mutex_init(&fd->metrics_lock, NULL);
+            pthread_mutex_init(&fd->ref_lock, NULL);
+            pthread_mutex_init(&fd->bam_list_lock, NULL);
+            fd->shared_ref = 1;
+            fd->own_pool = 1;   /* cram_close() destroys pools we created */
+        }
+        break;
+    }
+
+    case CRAM_OPT_THREAD_POOL:
+        fd->pool = va_arg(args, t_pool *);
+        if (fd->pool) {
+            fd->rqueue = t_results_queue_init();
+            pthread_mutex_init(&fd->metrics_lock, NULL);
+            pthread_mutex_init(&fd->ref_lock, NULL);
+            pthread_mutex_init(&fd->bam_list_lock, NULL);
+        }
+        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
+        fd->own_pool = 0;   // Caller-supplied pool; not ours to destroy
+
+        //fd->qsize = 1;
+        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
+        //t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
+        break;
+
+    case CRAM_OPT_REQUIRED_FIELDS:
+        fd->required_fields = va_arg(args, int);
+        break;
+
+    case HTS_OPT_COMPRESSION_LEVEL:
+        fd->level = va_arg(args, int);
+        break;
+
+    default:
+        fprintf(stderr, "Unknown CRAM option code %d\n", (int)opt);
+        errno = EINVAL;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/htslib/cram/cram_io.h b/htslib/cram/cram_io.h
new file mode 100644
index 0000000..d3fe90e
--- /dev/null
+++ b/htslib/cram/cram_io.h
@@ -0,0 +1,669 @@
+/*
+Copyright (c) 2012-2014 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * Include cram.h instead.
+ *
+ * This is an internal part of the CRAM system and is automatically included
+ * when you #include cram.h.
+ *
+ * Implements the low level CRAM I/O primitives.
+ * This includes basic data types such as byte, int, ITF-8,
+ * maps, bitwise I/O, etc.
+ */
+
+#ifndef _CRAM_IO_H_
+#define _CRAM_IO_H_
+
+#define ITF8_MACROS
+
+#include <stdint.h>
+#include <cram/misc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@{ ----------------------------------------------------------------------
+ * ITF8 encoding and decoding.
+ *
+ * Also see the itf8_get and itf8_put macros.
+ */
+
+/*! INTERNAL: Converts two characters into an integer for use in switch{} */
+#define CRAM_KEY(a,b) (((a)<<8)|((b)))
+
+/*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
+ * *val.
+ *
+ * @return
+ * Returns the number of bytes read on success;
+ * -1 on failure
+ */
+int itf8_decode(cram_fd *fd, int32_t *val);
+
+#ifndef ITF8_MACROS
+/*! Reads an integer in ITF-8 encoding from 'cp' and stores it in
+ * *val.
+ *
+ * @return
+ * Returns the number of bytes read on success;
+ * -1 on failure
+ */
+int itf8_get(char *cp, int32_t *val_p);
+
+/*! Stores a value to memory in ITF-8 format.
+ *
+ * @return
+ * Returns the number of bytes required to store the number.
+ * This is a maximum of 5 bytes.
+ */
+int itf8_put(char *cp, int32_t val);
+
+#else
+
+/*
+ * Macro implementations of the above
+ */
+#define itf8_get(c,v) (((uc)(c)[0]<0x80)?(*(v)=(uc)(c)[0],1):(((uc)(c)[0]<0xc0)?(*(v)=(((uc)(c)[0]<<8)|(uc)(c)[1])&0x3fff,2):(((uc)(c)[0]<0xe0)?(*(v)=(((uc)(c)[0]<<16)|((uc)(c)[1]<<8)|(uc)(c)[2])&0x1fffff,3):(((uc)(c)[0]<0xf0)?(*(v)=(((uc)(c)[0]<<24)|((uc)(c)[1]<<16)|((uc)(c)[2]<<8)|(uc)(c)[3])&0x0fffffff,4):(*(v)=(((uc)(c)[0]&0x0f)<<28)|((uc)(c)[1]<<20)|((uc)(c)[2]<<12)|((uc)(c)[3]<<4)|((uc)(c)[4]&0x0f),5)))))
+
+#define itf8_put(c,v) ((!((v)&~0x7f))?((c)[0]=(v),1):(!((v)&~0x3fff))?((c)[0]=((v)>>8)|0x80,(c)[1]=(v)&0xff,2):(!((v)&~0x1fffff))?((c)[0]=((v)>>16)|0xc0,(c)[1]=((v)>>8)&0xff,(c)[2]=(v)&0xff,3):(!((v)&~0xfffffff))?((c)[0]=((v)>>24)|0xe0,(c)[1]=((v)>>16)&0xff,(c)[2]=((v)>>8)&0xff,(c)[3]=(v)&0xff,4):((c)[0]=0xf0|(((v)>>28)&0xff),(c)[1]=((v)>>20)&0xff,(c)[2]=((v)>>12)&0xff,(c)[3]=((v)>>4)&0xff,(c)[4]=(v)&0xf,5))
+
+#define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
+
+#endif
+
+int ltf8_get(char *cp, int64_t *val_p);
+int ltf8_put(char *cp, int64_t val);
+
+/* Version of itf8_get that checks it hasn't run out of input */
+
+extern const int itf8_bytes[16];
+
+/*
+ * Decodes one ITF-8 integer from the buffer [cp, endp).
+ *
+ * cp    - start of the encoded value.
+ * endp  - one past the last readable byte.
+ * val_p - receives the decoded value, or 0 if there is not enough input.
+ *
+ * When fewer than 5 bytes remain, itf8_bytes[] (indexed by the top four
+ * bits of the first byte) gives the number of bytes the encoding needs;
+ * if the buffer is empty or shorter than that, nothing is decoded.
+ *
+ * The bytes are combined in uint32_t so the shifts by 24 and 28 never
+ * move a bit into the sign position of an int.
+ *
+ * Returns the number of bytes consumed (1 to 5) on success;
+ *         0 if the input is too short.
+ */
+static inline int safe_itf8_get(const char *cp, const char *endp,
+                                int32_t *val_p) {
+    const unsigned char *up = (const unsigned char *)cp;
+    uint32_t v;
+    int used;
+
+    if (endp - cp < 5 &&
+        (cp >= endp || endp - cp < itf8_bytes[up[0]>>4])) {
+        *val_p = 0;
+        return 0;
+    }
+
+    if (up[0] < 0x80) {
+        v = up[0];
+        used = 1;
+    } else if (up[0] < 0xc0) {
+        v = (((uint32_t)up[0] << 8) | up[1]) & 0x3fff;
+        used = 2;
+    } else if (up[0] < 0xe0) {
+        v = (((uint32_t)up[0] << 16) |
+             ((uint32_t)up[1] <<  8) |
+              (uint32_t)up[2]) & 0x1fffff;
+        used = 3;
+    } else if (up[0] < 0xf0) {
+        v = (((uint32_t)up[0] << 24) |
+             ((uint32_t)up[1] << 16) |
+             ((uint32_t)up[2] <<  8) |
+              (uint32_t)up[3]) & 0x0fffffff;
+        used = 4;
+    } else {
+        v = (((uint32_t)up[0] & 0x0f) << 28) |
+             ((uint32_t)up[1] << 20) |
+             ((uint32_t)up[2] << 12) |
+             ((uint32_t)up[3] <<  4) |
+             ((uint32_t)up[4] & 0x0f);
+        used = 5;
+    }
+
+    *val_p = (int32_t)v;
+    return used;
+}
+
+/*! Pushes a value in ITF8 format onto the end of a block.
+ *
+ * This shouldn't be used for high-volume data as it is not the fastest
+ * method.
+ *
+ * @return
+ * Returns the number of bytes written
+ */
+int itf8_put_blk(cram_block *blk, int val);
+
+/*! Pulls a literal 32-bit value from a block.
+ *
+ * @returns the number of bytes decoded;
+ * -1 on failure.
+ */
+int int32_get_blk(cram_block *b, int32_t *val);
+
+/*! Pushes a literal 32-bit value onto the end of a block.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure.
+ */
+int int32_put_blk(cram_block *blk, int32_t val);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * CRAM blocks - the dynamically growable data block. We have code to
+ * create, update, (un)compress and read/write.
+ *
+ * These are derived from the deflate_interlaced.c blocks, but with the
+ * CRAM extension of content types and IDs.
+ */
+
+/*! Allocates a new cram_block structure with a specified content_type and
+ * id.
+ *
+ * @return
+ * Returns block pointer on success;
+ * NULL on failure
+ */
+cram_block *cram_new_block(enum cram_content_type content_type,
+ int content_id);
+
+/*! Reads a block from a cram file.
+ *
+ * @return
+ * Returns cram_block pointer on success;
+ * NULL on failure
+ */
+cram_block *cram_read_block(cram_fd *fd);
+
+/*! Writes a CRAM block.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_write_block(cram_fd *fd, cram_block *b);
+
+/*! Frees a CRAM block, deallocating internal data too.
+ */
+void cram_free_block(cram_block *b);
+
+/*! Uncompress a memory block using Zlib.
+ *
+ * Takes the compressed bytes in cdata (csize bytes long) and a pointer
+ * "size".
+ *
+ * @return
+ * Returns a char pointer.
+ * NOTE(review): the declared return type is char *, not an int status.
+ * Presumably this is the inflated buffer on success and NULL on failure,
+ * with the inflated length stored in *size - confirm against the
+ * definition, including who is responsible for freeing the result.
+ */
+char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
+
+/*! Uncompresses a CRAM block, if compressed.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_uncompress_block(cram_block *b);
+
+/*! Compresses a block.
+ *
+ * Compresses a block using one of two different zlib strategies. If we only
+ * want one choice set strat2 to be -1.
+ *
+ * NOTE(review): the paragraph above refers to a "strat2" argument, but this
+ * declaration has no such parameter; it takes fd, b, metrics, method and
+ * level. The text presumably describes an earlier signature - confirm
+ * against the definition.
+ *
+ * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
+ * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
+ * significantly faster.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
+ int method, int level);
+
+cram_metrics *cram_new_metrics(void);
+char *cram_block_method2str(enum cram_block_method m);
+char *cram_content_type2str(enum cram_content_type t);
+
+/*
+ * Find an external block by its content_id.
+ *
+ * If the slice has a block_by_id table and id lies in 0..1023, the table
+ * entry is returned directly (it may be NULL).  Otherwise the slice's
+ * blocks are scanned for the first EXTERNAL block with a matching
+ * content_id.
+ *
+ * Returns the block, or NULL if none was found.
+ */
+
+static inline cram_block *cram_get_block_by_id(cram_slice *slice, int id) {
+    int idx;
+
+    /* Fast path: direct table lookup */
+    if (slice->block_by_id && id >= 0 && id < 1024)
+        return slice->block_by_id[id];
+
+    /* Slow path: linear search over all blocks in the slice */
+    for (idx = 0; idx < slice->hdr->num_blocks; idx++) {
+        cram_block *cand = slice->block[idx];
+
+        if (!cand)
+            continue;
+        if (cand->content_type == EXTERNAL && cand->content_id == id)
+            return cand;
+    }
+
+    return NULL;
+}
+
+/* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
+
+/* Block size and data pointer. */
+#define BLOCK_SIZE(b) ((b)->byte)
+#define BLOCK_DATA(b) ((b)->data)
+
+/* Returns the address one past the end of the block */
+#define BLOCK_END(b) (&(b)->data[(b)->byte])
+
+/*
+ * Request block to be at least 'l' bytes long.
+ *
+ * The loop runs while alloc <= l, so on exit the allocation is strictly
+ * greater than 'l'.  An allocation of 0 becomes 1024; otherwise it is
+ * multiplied by 1.5 on each pass.
+ * The realloc() result is stored straight into (b)->data with no NULL
+ * check.  Both 'b' and 'l' are evaluated more than once.
+ */
+#define BLOCK_RESIZE(b,l) \
+ do { \
+ while((b)->alloc <= (l)) { \
+ (b)->alloc = (b)->alloc ? (b)->alloc*1.5 : 1024; \
+ (b)->data = realloc((b)->data, (b)->alloc); \
+ } \
+ } while(0)
+
+/* Make block exactly 'l' bytes long */
+#define BLOCK_RESIZE_EXACT(b,l) \
+ do { \
+ (b)->alloc = (l); \
+ (b)->data = realloc((b)->data, (b)->alloc); \
+ } while(0)
+
+/* Ensure the block can hold at least another 'l' bytes */
+#define BLOCK_GROW(b,l) BLOCK_RESIZE((b), BLOCK_SIZE((b)) + (l))
+
+/* Append string 's' of length 'l' */
+#define BLOCK_APPEND(b,s,l) \
+ do { \
+ BLOCK_GROW((b),(l)); \
+ memcpy(BLOCK_END((b)), (s), (l)); \
+ BLOCK_SIZE((b)) += (l); \
+ } while (0)
+
+/* Append as single character 'c' */
+#define BLOCK_APPEND_CHAR(b,c) \
+ do { \
+ BLOCK_GROW((b),1); \
+ (b)->data[(b)->byte++] = (c); \
+ } while (0)
+
+/* Append a single unsigned integer */
+#define BLOCK_APPEND_UINT(b,i) \
+ do { \
+ unsigned char *cp; \
+ BLOCK_GROW((b),11); \
+ cp = &(b)->data[(b)->byte]; \
+ (b)->byte += append_uint32(cp, (i)) - cp; \
+ } while (0)
+
+/*
+ * Writes the decimal representation of i at cp, most significant digit
+ * first, with no leading zeros and no terminating NUL (0 is written as
+ * the single character '0').  At most 10 bytes are written.
+ *
+ * Returns a pointer to the byte after the last digit written.
+ */
+static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
+    unsigned char digits[10];   /* a 32-bit value has at most 10 digits */
+    int n = 0;
+
+    /* Peel digits off least significant first... */
+    do {
+        digits[n++] = (unsigned char)('0' + i % 10);
+        i /= 10;
+    } while (i != 0);
+
+    /* ...then emit them in reading order */
+    while (n > 0)
+        *cp++ = digits[--n];
+
+    return cp;
+}
+
+/*
+ * Writes i at cp as exactly nine characters, zero padded on the left and
+ * without a terminating NUL.  Used for the lower groups of a longer
+ * number, where leading zeros must be kept.
+ *
+ * Returns a pointer to the byte after the ninth character.
+ */
+static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) {
+    uint32_t scale;
+
+    /* Hundred-millions down to tens */
+    for (scale = 100000000; scale > 1; scale /= 10) {
+        *cp++ = (unsigned char)(i / scale + '0');
+        i %= scale;
+    }
+
+    /* Units */
+    *cp++ = (unsigned char)(i + '0');
+
+    return cp;
+}
+
+/*
+ * Writes the decimal representation of a 64-bit value at cp, without a
+ * terminating NUL.
+ *
+ * Values that fit in 32 bits go straight to append_uint32().  Larger ones
+ * are split into groups of nine digits: the leading group is written
+ * without padding by append_uint32(), the remaining group(s) zero padded
+ * by append_sub32().
+ *
+ * Returns a pointer to the byte after the last digit written.
+ */
+static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) {
+    const uint64_t billion = 1000000000;
+    uint64_t high;
+
+    if (i <= 0xffffffff)
+        return append_uint32(cp, i);
+
+    high = i / billion;
+    if (high > billion) {
+        /* Three groups: top, middle nine digits, bottom nine digits */
+        cp = append_uint32(cp, high / billion);
+        cp = append_sub32(cp, high % billion);
+    } else {
+        /* Two groups: top, bottom nine digits */
+        cp = append_uint32(cp, high);
+    }
+
+    return append_sub32(cp, i % billion);
+}
+
+#define BLOCK_UPLEN(b) \
+ (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Reference sequence handling
+ */
+
+/*! Loads a reference set from fn and stores in the cram_fd.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_load_reference(cram_fd *fd, char *fn);
+
+/*! Generates a lookup table in refs based on the SQ headers in SAM_hdr.
+ *
+ * Indexes references by the order they appear in a BAM file. This may not
+ * necessarily be the same order they appear in the fasta reference file.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int refs2id(refs_t *r, SAM_hdr *bfd);
+
+void refs_free(refs_t *r);
+
+/*! Returns a portion of a reference sequence from start to end inclusive.
+ *
+ * The returned pointer is owned by the cram_file fd and should not be freed
+ * by the caller. It is valid only until the next cram_get_ref is called
+ * with the same fd parameter (so is thread-safe if given multiple files).
+ *
+ * To return the entire reference sequence, specify start as 1 and end
+ * as 0.
+ *
+ * @return
+ * Returns reference on success;
+ * NULL on failure
+ */
+char *cram_get_ref(cram_fd *fd, int id, int start, int end);
+void cram_ref_incr(refs_t *r, int id);
+void cram_ref_decr(refs_t *r, int id);
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Containers
+ */
+
+/*! Creates a new container, specifying the maximum number of slices
+ * and records permitted.
+ *
+ * @return
+ * Returns cram_container ptr on success;
+ * NULL on failure
+ */
+cram_container *cram_new_container(int nrec, int nslice);
+void cram_free_container(cram_container *c);
+
+/*! Reads a container header.
+ *
+ * @return
+ * Returns cram_container on success;
+ * NULL on failure or no container left (fd->err == 0).
+ */
+cram_container *cram_read_container(cram_fd *fd);
+
+/*! Writes a container structure.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_write_container(cram_fd *fd, cram_container *h);
+
+/*! Flushes a container to disk.
+ *
+ * Flushes a completely or partially full container to disk, writing
+ * container structure, header and blocks. This also calls the encoder
+ * functions.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_flush_container(cram_fd *fd, cram_container *c);
+int cram_flush_container_mt(cram_fd *fd, cram_container *c);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Compression headers; the first part of the container
+ */
+
+/*! Creates a new blank container compression header
+ *
+ * @return
+ * Returns header ptr on success;
+ * NULL on failure
+ */
+cram_block_compression_hdr *cram_new_compression_header(void);
+
+/*! Frees a cram_block_compression_hdr */
+void cram_free_compression_header(cram_block_compression_hdr *hdr);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Slices and slice headers
+ */
+
+/*! Frees a slice header */
+void cram_free_slice_header(cram_block_slice_hdr *hdr);
+
+/*! Frees a slice */
+void cram_free_slice(cram_slice *s);
+
+/*! Creates a new empty slice in memory, for subsequent writing to
+ * disk.
+ *
+ * @return
+ * Returns cram_slice ptr on success;
+ * NULL on failure
+ */
+cram_slice *cram_new_slice(enum cram_content_type type, int nrecs);
+
+/*! Loads an entire slice.
+ *
+ * FIXME: In 1.0 the native unit of slices within CRAM is broken
+ * as slices contain references to objects in other slices.
+ * To work around this while keeping the slice oriented outer loop
+ * we read all slices and stitch them together into a fake large
+ * slice instead.
+ *
+ * @return
+ * Returns cram_slice ptr on success;
+ * NULL on failure
+ */
+cram_slice *cram_read_slice(cram_fd *fd);
+
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * CRAM file definition (header)
+ */
+
+/*! Reads a CRAM file definition structure.
+ *
+ * @return
+ * Returns file_def ptr on success;
+ * NULL on failure
+ */
+cram_file_def *cram_read_file_def(cram_fd *fd);
+
+/*! Writes a cram_file_def structure to cram_fd.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_write_file_def(cram_fd *fd, cram_file_def *def);
+
+/*! Frees a cram_file_def structure. */
+void cram_free_file_def(cram_file_def *def);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * SAM header I/O
+ */
+
+/*! Reads the SAM header from the first CRAM data block.
+ *
+ * Also performs minimal parsing to extract read-group
+ * and sample information.
+ *
+ * @return
+ * Returns SAM hdr ptr on success;
+ * NULL on failure
+ */
+SAM_hdr *cram_read_SAM_hdr(cram_fd *fd);
+
+/*! Writes a CRAM SAM header.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * The top-level cram opening, closing and option handling
+ */
+
+/*! Opens a CRAM file for read (mode "rb") or write ("wb").
+ *
+ * The filename may be "-" to indicate stdin or stdout.
+ *
+ * @return
+ * Returns file handle on success;
+ * NULL on failure.
+ */
+cram_fd *cram_open(const char *filename, const char *mode);
+
+/*! Opens an existing stream for reading or writing.
+ *
+ * @return
+ * Returns file handle on success;
+ * NULL on failure.
+ */
+cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
+
+/*! Closes a CRAM file.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_close(cram_fd *fd);
+
+/*
+ * Seek within a CRAM file.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int cram_seek(cram_fd *fd, off_t offset, int whence);
+
+/*
+ * Flushes a CRAM file.
+ * Useful for when writing to stdout without wishing to close the stream.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int cram_flush(cram_fd *fd);
+
+/*! Checks for end of file on a cram_fd stream.
+ *
+ * @return
+ * Returns 0 if not at end of file
+ * 1 if we hit an expected EOF (end of range or EOF block)
+ * 2 for other EOF (end of stream without EOF block)
+ */
+int cram_eof(cram_fd *fd);
+
+/*! Sets options on the cram_fd.
+ *
+ * See CRAM_OPT_* definitions in cram_structs.h.
+ * Use this immediately after opening.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...);
+
+/*! Sets options on the cram_fd.
+ *
+ * See CRAM_OPT_* definitions in cram_structs.h.
+ * Use this immediately after opening.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args);
+
+/*!
+ * Attaches a header to a cram_fd.
+ *
+ * This should be used when creating a new cram_fd for writing where
+ * we have an SAM_hdr already constructed (eg from a file we've read
+ * in).
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_set_header(cram_fd *fd, SAM_hdr *hdr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CRAM_IO_H_ */
diff --git a/htslib/cram/cram_samtools.c b/htslib/cram/cram_samtools.c
new file mode 100644
index 0000000..9e95c7b
--- /dev/null
+++ b/htslib/cram/cram_samtools.c
@@ -0,0 +1,149 @@
+/*
+Copyright (c) 2010-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "cram/cram.h"
+#include "htslib/sam.h"
+
+/*---------------------------------------------------------------------------
+ * Samtools compatibility portion
+ */
+/*
+ * Fills out the bam_seq_t in *bp from the individual alignment fields.
+ *
+ * The record's data buffer is grown if necessary to hold the query name,
+ * CIGAR, packed sequence and qualities plus extra_len further bytes, which
+ * are left unwritten for the caller to fill in.  pos and mpos are taken as
+ * 1-based and stored in the record as pos-1 and mpos-1.  The sequence is
+ * packed two bases per byte (first base in the high nibble) using the
+ * lookup table below; characters not in the table are stored as 15.
+ * If qual is NULL every quality byte is set to 0xff.
+ *
+ * Returns 0 on success;
+ *        -1 on memory allocation failure, with the record's existing
+ *           buffer and size left untouched
+ */
+int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
+                      const char *qname, size_t qname_len,
+                      int flag,
+                      int rname,      // Ref ID
+                      int pos,
+                      int end,        // aligned start/end coords
+                      int mapq,
+                      uint32_t ncigar, const uint32_t *cigar,
+                      int mrnm,       // Mate Ref ID
+                      int mpos,
+                      int isize,
+                      int len,
+                      const char *seq,
+                      const char *qual) {
+    static const char L[256] = {
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15,
+        15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
+        15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
+        15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
+        15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+    };
+    bam1_t *b = (bam1_t *)*bp;
+    uint8_t *cp;
+    int i, bam_len;
+
+    bam_len = qname_len + 1 + ncigar*4 + (len+1)/2 + len + extra_len;
+    if (b->m_data < bam_len) {
+        // Grow via temporaries so that a failed realloc neither leaks the
+        // old buffer nor leaves m_data describing memory we do not own.
+        uint32_t new_size = bam_len;
+        uint8_t *new_data;
+
+        kroundup32(new_size);
+        new_data = (uint8_t *)realloc(b->data, new_size);
+        if (!new_data)
+            return -1;
+        b->data = new_data;
+        b->m_data = new_size;
+    }
+    b->l_data = bam_len;
+
+    b->core.tid     = rname;
+    b->core.pos     = pos-1;
+    // hts_reg2bin() works on a 0-based start, so convert pos in the same
+    // way as for core.pos above.
+    b->core.bin     = bam_reg2bin(pos-1, end);
+    b->core.qual    = mapq;
+    b->core.l_qname = qname_len+1;
+    b->core.flag    = flag;
+    b->core.n_cigar = ncigar;
+    b->core.l_qseq  = len;
+    b->core.mtid    = mrnm;
+    b->core.mpos    = mpos-1;
+    b->core.isize   = isize;
+
+    cp = b->data;
+
+    // Query name, always NUL terminated
+    strncpy((char *)cp, qname, qname_len);
+    cp[qname_len] = 0;
+    cp += qname_len+1;
+
+    // CIGAR, 4 bytes per operation
+    memcpy(cp, cigar, ncigar*4);
+    cp += ncigar*4;
+
+    // Sequence, two bases per byte; an odd final base fills the high nibble
+    for (i = 0; i+1 < len; i+=2) {
+        *cp++ = (L[(uc)seq[i]]<<4) + L[(uc)seq[i+1]];
+    }
+    if (i < len)
+        *cp++ = L[(uc)seq[i]]<<4;
+
+    // Qualities
+    if (qual)
+        memcpy(cp, qual, len);
+    else
+        memset(cp, '\xff', len);
+
+    return 0;
+}
+
+/*
+ * Builds a new bam_hdr_t from a CRAM SAM_hdr by copying the header text
+ * and the name and length of each of its h->nref references.
+ *
+ * Returns the newly allocated header on success;
+ *         NULL on memory allocation failure
+ */
+bam_hdr_t *cram_header_to_bam(SAM_hdr *h) {
+    int i;
+    bam_hdr_t *header = bam_hdr_init();
+
+    if (!header)
+        return NULL;
+
+    header->l_text = ks_len(&h->text);
+    header->text = malloc(header->l_text+1);
+    if (!header->text)
+        goto err;
+    memcpy(header->text, ks_str(&h->text), header->l_text);
+    header->text[header->l_text] = 0;
+
+    header->target_name = (char **)calloc(h->nref, sizeof(char *));
+    header->target_len = (uint32_t *)calloc(h->nref, 4);
+    // calloc(0, ...) may legitimately return NULL, so only treat NULL as
+    // a failure when there are references to store.
+    if (h->nref > 0 && (!header->target_name || !header->target_len))
+        goto err;
+
+    // Only publish the count once both arrays exist.  The names start out
+    // as NULL (calloc), so a partially filled table is safe to free.
+    header->n_targets = h->nref;
+
+    for (i = 0; i < h->nref; i++) {
+        header->target_name[i] = strdup(h->ref[i].name);
+        if (!header->target_name[i])
+            goto err;
+        header->target_len[i] = h->ref[i].len;
+    }
+
+    return header;
+
+ err:
+    // NOTE(review): relies on bam_hdr_init() returning a zeroed header and
+    // on bam_hdr_destroy() coping with NULL members - confirm against
+    // htslib/sam.h.  target_len is released here when target_name is
+    // missing in case bam_hdr_destroy() only frees the two together.
+    if (!header->target_name) {
+        free(header->target_len);
+        header->target_len = NULL;
+    }
+    bam_hdr_destroy(header);
+    return NULL;
+}
+
+/*
+ * Builds a SAM_hdr from a bam_hdr_t by handing its text (h->text, of
+ * length h->l_text) to sam_hdr_parse_().
+ *
+ * Returns whatever sam_hdr_parse_() returns.
+ */
+SAM_hdr *bam_header_to_cram(bam_hdr_t *h) {
+    SAM_hdr *parsed = sam_hdr_parse_(h->text, h->l_text);
+
+    return parsed;
+}
diff --git a/htslib/cram/cram_samtools.h b/htslib/cram/cram_samtools.h
new file mode 100644
index 0000000..635e2e0
--- /dev/null
+++ b/htslib/cram/cram_samtools.h
@@ -0,0 +1,105 @@
+/*
+Copyright (c) 2010-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _CRAM_SAMTOOLS_H_
+#define _CRAM_SAMTOOLS_H_
+
+/* Samtools compatible API */
+/* Get / set the record's data length; both operate on the l_data member */
+#define bam_blk_size(b) ((b)->l_data)
+#define bam_set_blk_size(b,v) ((b)->l_data = (v))
+
+#define bam_ref(b) (b)->core.tid
+#define bam_pos(b) (b)->core.pos
+#define bam_mate_pos(b) (b)->core.mpos
+#define bam_mate_ref(b) (b)->core.mtid
+#define bam_ins_size(b) (b)->core.isize
+#define bam_seq_len(b) (b)->core.l_qseq
+#define bam_cigar_len(b) (b)->core.n_cigar
+#define bam_flag(b) (b)->core.flag
+#define bam_bin(b) (b)->core.bin
+#define bam_map_qual(b) (b)->core.qual
+#define bam_name_len(b) (b)->core.l_qname
+#define bam_name(b) bam_get_qname((b))
+#define bam_qual(b) bam_get_qual((b))
+#define bam_seq(b) bam_get_seq((b))
+#define bam_cigar(b) bam_get_cigar((b))
+#define bam_aux(b) bam_get_aux((b))
+
+#define bam_dup(b) bam_copy1(bam_init1(), (b))
+
+#define bam_free(b) bam_destroy1((b))
+
+#define bam_reg2bin(beg,end) hts_reg2bin((beg),(end),14,5)
+
+#include "htslib/sam.h"
+
+enum cigar_op {
+ BAM_CMATCH_=BAM_CMATCH,
+ BAM_CINS_=BAM_CINS,
+ BAM_CDEL_=BAM_CDEL,
+ BAM_CREF_SKIP_=BAM_CREF_SKIP,
+ BAM_CSOFT_CLIP_=BAM_CSOFT_CLIP,
+ BAM_CHARD_CLIP_=BAM_CHARD_CLIP,
+ BAM_CPAD_=BAM_CPAD,
+ BAM_CBASE_MATCH=BAM_CEQUAL,
+ BAM_CBASE_MISMATCH=BAM_CDIFF
+};
+
+typedef bam1_t bam_seq_t;
+
+#include "cram/sam_header.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bam_hdr_t *cram_header_to_bam(SAM_hdr *h);
+SAM_hdr *bam_header_to_cram(bam_hdr_t *h);
+
+int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
+ const char *qname, size_t qname_len,
+ int flag,
+ int rname, // Ref ID
+ int pos,
+ int end, // aligned start/end coords
+ int mapq,
+ uint32_t ncigar, const uint32_t *cigar,
+ int mrnm, // Mate Ref ID
+ int mpos,
+ int isize,
+ int len,
+ const char *seq,
+ const char *qual);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CRAM_SAMTOOLS_H_ */
diff --git a/htslib/cram/cram_stats.c b/htslib/cram/cram_stats.c
new file mode 100644
index 0000000..c627d3e
--- /dev/null
+++ b/htslib/cram/cram_stats.c
@@ -0,0 +1,448 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "cram/cram.h"
+#include "cram/os.h"
+
+/*
+ * Allocates a new cram_stats with all fields zeroed (calloc).
+ *
+ * Returns the new cram_stats on success;
+ *         NULL if the allocation fails
+ */
+cram_stats *cram_stats_create(void) {
+    cram_stats *st = calloc(1, sizeof(cram_stats));
+
+    return st;
+}
+
+/*
+ * Records one occurrence of val in the statistics.
+ *
+ * Values from 0 to MAX_STAT_VAL-1 are counted in the freqs[] array; all
+ * other values are counted in the hash table st->h, which is created the
+ * first time it is needed.
+ *
+ * If the hash table cannot be created or grown the value is not recorded
+ * and a message is written to stderr.  nsamp is only incremented for
+ * values that were actually recorded, so it stays equal to the sum of the
+ * stored frequencies.
+ */
+void cram_stats_add(cram_stats *st, int32_t val) {
+    khint_t k;
+    int r;
+
+    //assert(val >= 0);
+
+    if (val < MAX_STAT_VAL && val >= 0) {
+        st->freqs[val]++;
+        st->nsamp++;
+        return;
+    }
+
+    if (!st->h) {
+        st->h = kh_init(m_i2i);
+        if (!st->h) {
+            fprintf(stderr, "Failed to add val %d to cram_stats\n", val);
+            return;
+        }
+    }
+
+    k = kh_put(m_i2i, st->h, val, &r);
+    if (r == -1) {
+        // kh_put could not insert the key
+        fprintf(stderr, "Failed to add val %d to cram_stats\n", val);
+        return;
+    }
+
+    if (r == 0)
+        kh_val(st->h, k)++;     // key already present
+    else
+        kh_val(st->h, k) = 1;   // newly inserted key
+    st->nsamp++;
+}
+
+/*
+ * Removes one occurrence of val from the statistics.
+ *
+ * Values from 0 to MAX_STAT_VAL-1 are taken from the freqs[] array; other
+ * values are looked up in the hash table, and an entry whose count drops
+ * to zero is deleted from it.  If val is not in the hash table (or there
+ * is no hash table) a message is written to stderr and nsamp is left
+ * unchanged.
+ */
+void cram_stats_del(cram_stats *st, int32_t val) {
+    //assert(val >= 0);
+
+    if (val >= 0 && val < MAX_STAT_VAL) {
+        st->nsamp--;
+        st->freqs[val]--;
+        assert(st->freqs[val] >= 0);
+        return;
+    }
+
+    if (st->h) {
+        khint_t it = kh_get(m_i2i, st->h, val);
+
+        if (it != kh_end(st->h)) {
+            st->nsamp--;
+            if (--kh_val(st->h, it) == 0)
+                kh_del(m_i2i, st->h, it);
+            return;
+        }
+    }
+
+    // Not found: nothing was removed, so nsamp keeps its value
+    fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
+}
+
+/*
+ * Writes the statistics to stderr: a "cram_stats:" heading, then one
+ * tab-indented "value<TAB>count" line for each freqs[] entry with a
+ * non-zero count, followed by one such line per hash table entry.
+ */
+void cram_stats_dump(cram_stats *st) {
+    int val;
+    khint_t it;
+
+    fprintf(stderr, "cram_stats:\n");
+
+    for (val = 0; val < MAX_STAT_VAL; val++) {
+        if (st->freqs[val])
+            fprintf(stderr, "\t%d\t%d\n", val, st->freqs[val]);
+    }
+
+    if (!st->h)
+        return;
+
+    for (it = kh_begin(st->h); it != kh_end(st->h); it++) {
+        if (kh_exist(st->h, it))
+            fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, it), kh_val(st->h, it));
+    }
+}
+
+/*
+ * Picks an encoding for a data series from its value statistics.
+ *
+ * Counts the distinct values (nvals, also stored in st->nvals) and the
+ * total number of samples (ntot) recorded in st, then applies a simple
+ * heuristic:
+ *
+ *   at most one distinct value         E_HUFFMAN
+ *   more than 256 samples              E_EXTERNAL
+ *   otherwise, fewer than 200 values   E_HUFFMAN
+ *   otherwise                          E_BETA
+ *
+ * If fd->verbose > 1 the value range and the two counts are written to
+ * stderr.
+ *
+ * Returns the codec to use.
+ */
+enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
+    int nvals = 0, ntot = 0, max_val = 0, min_val = INT_MAX, i;
+
+    //cram_stats_dump(st);
+
+    /* Values counted in the fixed-size array */
+    for (i = 0; i < MAX_STAT_VAL; i++) {
+        if (!st->freqs[i])
+            continue;
+
+        ntot += st->freqs[i];
+        if (max_val < i) max_val = i;
+        if (min_val > i) min_val = i;
+        nvals++;
+    }
+
+    /* All other values are counted in the hash table, if there is one */
+    if (st->h) {
+        khint_t k;
+
+        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
+            if (!kh_exist(st->h, k))
+                continue;
+
+            i = kh_key(st->h, k);
+            ntot += kh_val(st->h, k);
+            if (max_val < i) max_val = i;
+            if (min_val > i) min_val = i;
+            nvals++;
+        }
+    }
+
+    st->nvals = nvals;
+    assert(ntot == st->nsamp);
+
+    if (nvals <= 1)
+        return E_HUFFMAN;
+
+    if (fd->verbose > 1)
+        fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
+                min_val, max_val, nvals, ntot);
+
+    if (ntot > 256)
+        return E_EXTERNAL;
+
+    /*
+     * Avoid complex stats for now, just do heuristic of HUFFMAN for small
+     * alphabets and BETA for anything large.
+     */
+    return nvals < 200 ? E_HUFFMAN : E_BETA;
+}
+
+/*
+ * Frees st together with its hash table, if it has one.
+ * Passing NULL is a no-op, as with free().
+ */
+void cram_stats_free(cram_stats *st) {
+    if (!st)
+        return;
+
+    if (st->h)
+        kh_destroy(m_i2i, st->h);
+    free(st);
+}
diff --git a/htslib/cram/cram_stats.h b/htslib/cram/cram_stats.h
new file mode 100644
index 0000000..b471e68
--- /dev/null
+++ b/htslib/cram/cram_stats.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _CRAM_STATS_H_
+#define _CRAM_STATS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Allocates a zero-initialised cram_stats; returns NULL if allocation fails. */
+cram_stats *cram_stats_create(void);
+/* Records one occurrence of val in st. */
+void cram_stats_add(cram_stats *st, int32_t val);
+/* Removes one occurrence of val from st. */
+void cram_stats_del(cram_stats *st, int32_t val);
+/* Writes the values recorded in st and their counts to stderr. */
+void cram_stats_dump(cram_stats *st);
+/* Frees st and any hash table it holds. */
+void cram_stats_free(cram_stats *st);
+
+/*
+ * Computes entropy from integer frequencies for various encoding methods and
+ * picks the best encoding.
+ *
+ * FIXME: we could reuse some of the code here for the actual encoding
+ * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
+ *
+ * Returns the best codec to use.
+ */
+enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/cram/cram_structs.h b/htslib/cram/cram_structs.h
new file mode 100644
index 0000000..d427a62
--- /dev/null
+++ b/htslib/cram/cram_structs.h
@@ -0,0 +1,821 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _CRAM_STRUCTS_H_
+#define _CRAM_STRUCTS_H_
+
+/*
+ * Defines in-memory structs for the basic file-format objects in the
+ * CRAM format.
+ *
+ * The basic file format is:
+ * File-def SAM-hdr Container Container ...
+ *
+ * Container:
+ * Service-block data-block data-block ...
+ *
+ * Multiple blocks in a container are grouped together as slices,
+ * also sometimes referred to as landmarks in the spec.
+ */
+
+
+#include <stdint.h>
+
+#include "cram/thread_pool.h"
+#include "cram/string_alloc.h"
+#include "cram/mFILE.h"
+#include "htslib/khash.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Generic hash-map integer -> integer
+KHASH_MAP_INIT_INT(m_i2i, int)
+
+// Generic hash-set integer -> (existence)
+KHASH_SET_INIT_INT(s_i2i)
+
+// For brevity
+typedef unsigned char uc;
+
+/*
+ * A union for the preservation map. Required for khash.
+ */
+typedef union {
+ int i;
+ char *p;
+} pmap_t;
+
+// Generates static functions here which isn't ideal, but we have no way
+// currently to declare the kh_map_t structure here without also declaring a
+// duplicate in the .c files due to the nature of the KHASH macros.
+KHASH_MAP_INIT_STR(map, pmap_t)
+
+struct hFILE;
+
+#define SEQS_PER_SLICE 10000
+#define SLICE_PER_CNT 1
+
+#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
+
+#define MAX_STAT_VAL 1024
+//#define MAX_STAT_VAL 16
+/*
+ * Frequency counts for the values seen in one data series.
+ * Values from 0 to MAX_STAT_VAL-1 are counted in freqs[]; any other value
+ * is counted in the hash table h.
+ */
+typedef struct cram_stats {
+ int freqs[MAX_STAT_VAL]; // freqs[v] = number of times value v was added
+ khash_t(m_i2i) *h; // value -> count for values outside freqs[]; NULL until needed
+ int nsamp; // total number of values added
+ int nvals; // total number of unique values added (set by cram_stats_encoding)
+} cram_stats;
+
+/* NB: matches java impl, not the spec */
+enum cram_encoding {
+ E_NULL = 0,
+ E_EXTERNAL = 1,
+ E_GOLOMB = 2,
+ E_HUFFMAN = 3,
+ E_BYTE_ARRAY_LEN = 4,
+ E_BYTE_ARRAY_STOP = 5,
+ E_BETA = 6,
+ E_SUBEXP = 7,
+ E_GOLOMB_RICE = 8,
+ E_GAMMA = 9,
+ E_NUM_CODECS = 10, /* Number of codecs, not a real one. */
+};
+
+enum cram_external_type {
+ E_INT = 1,
+ E_LONG = 2,
+ E_BYTE = 3,
+ E_BYTE_ARRAY = 4,
+ E_BYTE_ARRAY_BLOCK = 5,
+};
+
+/* External IDs used by this implementation (only assumed during writing) */
+enum cram_DS_ID {
+ DS_CORE = 0,
+ DS_aux = 1, // aux_blk
+ DS_aux_OQ = 2,
+ DS_aux_BQ = 3,
+ DS_aux_BD = 4,
+ DS_aux_BI = 5,
+ DS_aux_FZ = 6, // also ZM:B
+ DS_aux_oq = 7, // other qualities
+ DS_aux_os = 8, // other sequences
+ DS_aux_oz = 9, // other strings
+ DS_ref,
+ DS_RN, // name_blk
+ DS_QS, // qual_blk
+ DS_IN, // base_blk
+ DS_SC, // soft_blk
+
+ DS_BF, // start loop
+ DS_CF,
+ DS_AP,
+ DS_RG,
+ DS_MQ,
+ DS_NS,
+ DS_MF,
+ DS_TS,
+ DS_NP,
+ DS_NF,
+ DS_RL,
+ DS_FN,
+ DS_FC,
+ DS_FP,
+ DS_DL,
+ DS_BA,
+ DS_BS,
+ DS_TL,
+ DS_RI,
+ DS_RS,
+ DS_PD,
+ DS_HC,
+ DS_BB,
+ DS_QQ,
+
+ DS_TN, // end loop
+
+ DS_RN_len,
+ DS_SC_len,
+ DS_BB_len,
+ DS_QQ_len,
+
+ DS_TC, // CRAM v1.0 tags
+ DS_TM, // test
+ DS_TV, // test
+
+ DS_END,
+};
+
+/* "File Definition Structure" */
+typedef struct cram_file_def {
+ char magic[4];
+ uint8_t major_version;
+ uint8_t minor_version;
+ char file_id[20]; // Filename or SHA1 checksum
+} cram_file_def;
+
+#define CRAM_MAJOR_VERS(v) ((v) >> 8)
+#define CRAM_MINOR_VERS(v) ((v) & 0xff)
+
+struct cram_slice;
+
+enum cram_block_method {
+ ERROR = -1,
+ RAW = 0,
+ GZIP = 1,
+ BZIP2 = 2,
+ LZMA = 3,
+ RANS = 4, // Generic; either order
+ RANS0 = 4,
+ RANS1 = 10, // Not externalised; stored as RANS (generic)
+ GZIP_RLE = 11, // NB: not externalised in CRAM
+};
+
+enum cram_content_type {
+ CT_ERROR = -1,
+ FILE_HEADER = 0,
+ COMPRESSION_HEADER = 1,
+ MAPPED_SLICE = 2,
+ UNMAPPED_SLICE = 3, // CRAM V1.0 only
+ EXTERNAL = 4,
+ CORE = 5,
+};
+
+/* Compression metrics */
+typedef struct {
+ // number of trials and time to next trial
+ int trial;
+ int next_trial;
+
+ // aggregate sizes during trials
+ int sz_gz_rle;
+ int sz_gz_def;
+ int sz_rans0;
+ int sz_rans1;
+ int sz_bzip2;
+ int sz_lzma;
+
+ // resultant method from trials
+ int method;
+ int strat;
+
+ // Revisions of method, to allow culling of continually failing ones.
+ int gz_rle_cnt;
+ int gz_def_cnt;
+ int rans0_cnt;
+ int rans1_cnt;
+ int bzip2_cnt;
+ int lzma_cnt;
+ int revised_method;
+
+ double gz_rle_extra;
+ double gz_def_extra;
+ double rans0_extra;
+ double rans1_extra;
+ double bzip2_extra;
+ double lzma_extra;
+} cram_metrics;
+
+/* Block */
+typedef struct cram_block {
+ enum cram_block_method method, orig_method;
+ enum cram_content_type content_type;
+ int32_t content_id;
+ int32_t comp_size;
+ int32_t uncomp_size;
+ uint32_t crc32;
+ int32_t idx; /* offset into data */
+ unsigned char *data;
+
+ // For bit I/O
+ size_t alloc;
+ size_t byte;
+ int bit;
+} cram_block;
+
+struct cram_codec; /* defined in cram_codecs.h */
+struct cram_map;
+
+#define CRAM_MAP_HASH 32
+#define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
+
+/* Compression header block */
+typedef struct cram_block_compression_hdr {
+ int32_t ref_seq_id;
+ int32_t ref_seq_start;
+ int32_t ref_seq_span;
+ int32_t num_records;
+ int32_t num_landmarks;
+ int32_t *landmark;
+
+ /* Flags from preservation map */
+ int mapped_qs_included;
+ int unmapped_qs_included;
+ int unmapped_placed;
+ int qs_included;
+ int read_names_included;
+ int AP_delta;
+ // indexed by ref-base and subst. code
+ char substitution_matrix[5][4];
+
+ // TD Dictionary as a concatenated block
+ cram_block *TD_blk; // Tag Dictionary
+ int nTL; // number of TL entries in TD
+ unsigned char **TL; // array of size nTL, pointer into TD_blk.
+ khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices
+ string_alloc_t *TD_keys; // Pooled keys for TD hash.
+
+ khash_t(map) *preservation_map;
+ struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
+ struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
+
+ struct cram_codec *codecs[DS_END];
+
+ char *uncomp; // A single block of uncompressed data
+ size_t uncomp_size, uncomp_alloc;
+
+ unsigned int data_series; // See cram_fields enum below
+} cram_block_compression_hdr;
+
+typedef struct cram_map {
+ int key; /* 0xe0 + 3 bytes */
+ enum cram_encoding encoding;
+ int offset; /* Offset into a single block of memory */
+ int size; /* Size */
+ struct cram_codec *codec;
+ struct cram_map *next; // for noddy internal hash
+} cram_map;
+
+/* Mapped or unmapped slice header block */
+typedef struct cram_block_slice_hdr {
+ enum cram_content_type content_type;
+ int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */
+ int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */
+ int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */
+ int32_t num_records;
+ int64_t record_counter;
+ int32_t num_blocks;
+ int32_t num_content_ids;
+ int32_t *block_content_ids;
+ int32_t ref_base_id; /* if content_type == MAPPED_SLICE */
+ unsigned char md5[16];
+} cram_block_slice_hdr;
+
+struct ref_entry;
+
+/*
+ * Container.
+ *
+ * Conceptually a container is split into slices, and slices into blocks.
+ * However on disk it's just a list of blocks and we need to query the
+ * block types to identify the start/end points of the slices.
+ *
+ * OR... are landmarks the start/end points of slices?
+ */
+typedef struct cram_container {
+ int32_t length;
+ int32_t ref_seq_id;
+ int32_t ref_seq_start;
+ int32_t ref_seq_span;
+ int64_t record_counter;
+ int64_t num_bases;
+ int32_t num_records;
+ int32_t num_blocks;
+ int32_t num_landmarks;
+ int32_t *landmark;
+
+ /* Size of container header above */
+ size_t offset;
+
+ /* Compression header is always the first block? */
+ cram_block_compression_hdr *comp_hdr;
+ cram_block *comp_hdr_block;
+
+ /* For construction purposes */
+ int max_slice, curr_slice; // maximum number of slices
+ int max_rec, curr_rec; // current and max recs per slice
+ int max_c_rec, curr_c_rec; // current and max recs per container
+ int slice_rec; // rec no. for start of this slice
+ int curr_ref; // current ref ID. -2 for no previous
+ int last_pos; // last record position
+ struct cram_slice **slices, *slice;
+ int pos_sorted; // boolean, 1=>position sorted data
+ int max_apos; // maximum position, used if pos_sorted==0
+ int last_slice; // number of reads in last slice (0 for 1st)
+ int multi_seq; // true if packing multi seqs per cont/slice
+ int unsorted; // true is AP_delta is 0.
+
+ /* Copied from fd before encoding, to allow multi-threading */
+ int ref_start, first_base, last_base, ref_id, ref_end;
+ char *ref;
+ //struct ref_entry *ref;
+
+ /* For multi-threading */
+ bam_seq_t **bams;
+
+ /* Statistics for encoding */
+ cram_stats *stats[DS_END];
+
+ khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map
+ int *refs_used; // array of frequency of ref seq IDs
+
+ uint32_t crc32; // CRC32
+} cram_container;
+
+/*
+ * A single cram record
+ */
+typedef struct cram_record {
+ struct cram_slice *s; // Filled out by cram_decode only
+
+ int32_t ref_id; // fixed for all recs in slice?
+ int32_t flags; // BF
+ int32_t cram_flags; // CF
+ int32_t len; // RL
+ int32_t apos; // AP
+ int32_t rg; // RG
+ int32_t name; // RN; idx to s->names_blk
+ int32_t name_len;
+ int32_t mate_line; // index to another cram_record
+ int32_t mate_ref_id;
+ int32_t mate_pos; // NP
+ int32_t tlen; // TS
+
+ // Auxiliary data
+ int32_t ntags; // TC
+ int32_t aux; // idx to s->aux_blk
+ int32_t aux_size; // total size of packed ntags in aux_blk
+#ifndef TN_external
+ int32_t TN_idx; // TN; idx to s->TN;
+#else
+ int32_t tn; // idx to s->tn_blk
+#endif
+ int TL;
+
+ int32_t seq; // idx to s->seqs_blk
+ int32_t qual; // idx to s->qual_blk
+ int32_t cigar; // idx to s->cigar
+ int32_t ncigar;
+ int32_t aend; // alignment end
+ int32_t mqual; // MQ
+
+ int32_t feature; // idx to s->feature
+ int32_t nfeature; // number of features
+ int32_t mate_flags; // MF
+} cram_record;
+
+// Accessor macros as an analogue of the bam ones
+#define cram_qname(c) (&(c)->s->name_blk->data[(c)->name])
+#define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq])
+#define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual])
+#define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux])
+#define cram_seqi(c,i) (cram_seq((c))[(i)])
+#define cram_name_len(c) ((c)->name_len)
+#define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0)
+#define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0)
+#define cram_cigar(c) (&((c)->s->cigar)[(c)->cigar]) /* ptr to record c's first CIGAR op in its slice's cigar[] */
+
+/*
+ * A feature is a base difference, used for the sequence reference encoding.
+ * (We generate these internally when writing CRAM.)
+ */
+typedef struct cram_feature {
+ union {
+ struct {
+ int pos;
+ int code;
+ int base; // substitution code
+ } X;
+ struct {
+ int pos;
+ int code;
+ int base; // actual base & qual
+ int qual;
+ } B;
+ struct {
+ int pos;
+ int code;
+ int seq_idx; // index to s->seqs_blk
+ int len;
+ } b;
+ struct {
+ int pos;
+ int code;
+ int qual;
+ } Q;
+ struct {
+ int pos;
+ int code;
+ int len;
+ int seq_idx; // soft-clip multiple bases
+ } S;
+ struct {
+ int pos;
+ int code;
+ int len;
+ int seq_idx; // insertion multiple bases
+ } I;
+ struct {
+ int pos;
+ int code;
+ int base; // insertion single base
+ } i;
+ struct {
+ int pos;
+ int code;
+ int len;
+ } D;
+ struct {
+ int pos;
+ int code;
+ int len;
+ } N;
+ struct {
+ int pos;
+ int code;
+ int len;
+ } P;
+ struct {
+ int pos;
+ int code;
+ int len;
+ } H;
+ };
+} cram_feature;
+
+/*
+ * A slice is really just a set of blocks, but it
+ * is the logical unit for decoding a number of
+ * sequences.
+ */
+typedef struct cram_slice {
+ cram_block_slice_hdr *hdr;
+ cram_block *hdr_block;
+ cram_block **block;
+ cram_block **block_by_id;
+
+ /* State used during encoding/decoding */
+ int last_apos, max_apos;
+
+ /* Array of decoded cram records */
+ cram_record *crecs;
+
+ /* Dynamically growing buffers for data pointed
+ * to by the crecs[] array.
+ */
+ uint32_t *cigar;
+ uint32_t cigar_alloc;
+ uint32_t ncigar;
+
+ cram_feature *features;
+ int nfeatures;
+ int afeatures; // allocated size of features
+
+#ifndef TN_external
+ // TN field (Tag Name)
+ uint32_t *TN;
+ int nTN, aTN; // used and allocated size for TN[]
+#else
+ cram_block *tn_blk;
+ int tn_id;
+#endif
+
+ // For variable sized elements which are always external blocks.
+ cram_block *name_blk;
+ cram_block *seqs_blk;
+ cram_block *qual_blk;
+ cram_block *base_blk;
+ cram_block *soft_blk;
+ cram_block *aux_blk;
+ cram_block *aux_OQ_blk;
+ cram_block *aux_BQ_blk;
+ cram_block *aux_BD_blk;
+ cram_block *aux_BI_blk;
+ cram_block *aux_FZ_blk;
+ cram_block *aux_oq_blk;
+ cram_block *aux_os_blk;
+ cram_block *aux_oz_blk;
+
+ string_alloc_t *pair_keys; // Pooled keys for pair hash.
+ khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice.
+
+ char *ref; // slice of current reference
+ int ref_start; // start position of current reference;
+ int ref_end; // end position of current reference;
+ int ref_id;
+} cram_slice;
+
+/*-----------------------------------------------------------------------------
+ * Consider moving reference handling to cram_refs.[ch]
+ */
+// from fa.fai / samtools faidx files
+typedef struct ref_entry {
+ char *name;
+ char *fn;
+ int64_t length;
+ int64_t offset;
+ int bases_per_line;
+ int line_length;
+ int64_t count; // for shared references so we know to dealloc seq
+ char *seq;
+ mFILE *mf;
+ int is_md5; // Reference comes from a raw seq found by MD5
+} ref_entry;
+
+KHASH_MAP_INIT_STR(refs, ref_entry*)
+
+// References structure.
+typedef struct {
+ string_alloc_t *pool; // String pool for holding filenames and SN vals
+
+ khash_t(refs) *h_meta; // ref_entry*, index by name
+ ref_entry **ref_id; // ref_entry*, index by ID
+ int nref; // number of ref_entry
+
+ char *fn; // current file opened
+ BGZF *fp; // and the hFILE* to go with it.
+
+ int count; // how many cram_fd sharing this refs struct
+
+ pthread_mutex_t lock; // Mutex for multi-threaded updating
+ ref_entry *last; // Last queried sequence
+ int last_id; // Used in cram_ref_decr_locked to delay free
+} refs_t;
+
+/*-----------------------------------------------------------------------------
+ * CRAM index
+ *
+ * Detect format by number of entries per line.
+ * 5 => 1.0 (refid, start, nseq, C offset, slice)
+ * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
+ *
+ * Indices are stored in a nested containment list, which is trivial to set
+ * up as the indices are on sorted data so we're appending to the nclist
+ * in sorted order. Basically if a slice entirely fits within a previous
+ * slice then we append to that slices list. This is done recursively.
+ *
+ * Lists are sorted on two dimensions: ref id + slice coords.
+ */
+typedef struct cram_index {
+ int nslice, nalloc; // total number of slices
+ struct cram_index *e; // array of size nslice
+
+ int refid; // 1.0 1.1
+ int start; // 1.0 1.1
+ int end; // 1.1
+ int nseq; // 1.0 - undocumented
+ int slice; // 1.0 landmark index, 1.1 landmark value
+ int len; // 1.1 - size of slice in bytes
+ int64_t offset; // 1.0 1.1
+} cram_index;
+
+typedef struct {
+ int refid;
+ int start;
+ int end;
+} cram_range;
+
+/*-----------------------------------------------------------------------------
+ */
+/* CRAM File handle */
+
+typedef struct spare_bams {
+ bam_seq_t **bams;
+ struct spare_bams *next;
+} spare_bams;
+
+typedef struct cram_fd {
+ struct hFILE *fp;
+ int mode; // 'r' or 'w'
+ int version;
+ cram_file_def *file_def;
+ SAM_hdr *header;
+
+ char *prefix;
+ int64_t record_counter;
+ int err;
+
+ // Most recent compression header decoded
+ //cram_block_compression_hdr *comp_hdr;
+ //cram_block_slice_hdr *slice_hdr;
+
+ // Current container being processed.
+ cram_container *ctr;
+
+ // positions for encoding or decoding
+ int first_base, last_base;
+
+ // cached reference portion
+ refs_t *refs; // ref meta-data structure
+ char *ref, *ref_free; // current portion held in memory
+ int ref_id;
+ int ref_start;
+ int ref_end;
+ char *ref_fn; // reference fasta filename
+
+ // compression level and metrics
+ int level;
+ cram_metrics *m[DS_END];
+
+ // options
+ int decode_md; // Whether to export MD and NM tags
+ int verbose;
+ int seqs_per_slice;
+ int slices_per_container;
+ int embed_ref;
+ int no_ref;
+ int ignore_md5;
+ int use_bz2;
+ int use_rans;
+ int use_lzma;
+ int shared_ref;
+ unsigned int required_fields;
+ cram_range range;
+
+ // lookup tables, stored here so we can be trivially multi-threaded
+ unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
+ unsigned int cram_flag_swap[0x1000];// bam -> cram flags
+ unsigned char L1[256]; // ACGT{*} ->0123{4}
+ unsigned char L2[256]; // ACGTN{*}->01234{5}
+ char cram_sub_matrix[32][32]; // base substituion codes
+
+ int index_sz;
+ cram_index *index; // array, sizeof index_sz
+ off_t first_container;
+ int eof;
+ int last_slice; // number of recs encoded in last slice
+ int multi_seq;
+ int unsorted;
+ int empty_container; // Marker for EOF block
+
+ // thread pool
+ int own_pool;
+ t_pool *pool;
+ t_results_queue *rqueue;
+ pthread_mutex_t metrics_lock;
+ pthread_mutex_t ref_lock;
+ spare_bams *bl;
+ pthread_mutex_t bam_list_lock;
+ void *job_pending;
+ int ooc; // out of containers.
+} cram_fd;
+
+// Translation of required fields to cram data series
+enum cram_fields {
+ CRAM_BF = 0x00000001,
+ CRAM_AP = 0x00000002,
+ CRAM_FP = 0x00000004,
+ CRAM_RL = 0x00000008,
+ CRAM_DL = 0x00000010,
+ CRAM_NF = 0x00000020,
+ CRAM_BA = 0x00000040,
+ CRAM_QS = 0x00000080,
+ CRAM_FC = 0x00000100,
+ CRAM_FN = 0x00000200,
+ CRAM_BS = 0x00000400,
+ CRAM_IN = 0x00000800,
+ CRAM_RG = 0x00001000,
+ CRAM_MQ = 0x00002000,
+ CRAM_TL = 0x00004000,
+ CRAM_RN = 0x00008000,
+ CRAM_NS = 0x00010000,
+ CRAM_NP = 0x00020000,
+ CRAM_TS = 0x00040000,
+ CRAM_MF = 0x00080000,
+ CRAM_CF = 0x00100000,
+ CRAM_RI = 0x00200000,
+ CRAM_RS = 0x00400000,
+ CRAM_PD = 0x00800000,
+ CRAM_HC = 0x01000000,
+ CRAM_SC = 0x02000000,
+ CRAM_BB = 0x04000000,
+ CRAM_BB_len = 0x08000000,
+ CRAM_QQ = 0x10000000,
+ CRAM_QQ_len = 0x20000000,
+ CRAM_aux= 0x40000000,
+ CRAM_ALL= 0x7fffffff,
+};
+
+// A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
+// encode a base difference, but we don't need to know what it is for CIGAR.
+// If we have a soft-clip or insertion, we do need SC/IN though to know how
+// long that array is.
+#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
+ CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
+
+#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \
+ CRAM_RL | CRAM_AP | CRAM_BB)
+
+#define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ)
+
+/* BF bitfields */
+/* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
+#define CRAM_FPAIRED 256
+#define CRAM_FPROPER_PAIR 128
+#define CRAM_FUNMAP 64
+#define CRAM_FREVERSE 32
+#define CRAM_FREAD1 16
+#define CRAM_FREAD2 8
+#define CRAM_FSECONDARY 4
+#define CRAM_FQCFAIL 2
+#define CRAM_FDUP 1
+
+#define DS_aux_S "\001"
+#define DS_aux_OQ_S "\002"
+#define DS_aux_BQ_S "\003"
+#define DS_aux_BD_S "\004"
+#define DS_aux_BI_S "\005"
+#define DS_aux_FZ_S "\006"
+#define DS_aux_oq_S "\007"
+#define DS_aux_os_S "\010"
+#define DS_aux_oz_S "\011"
+
+#define CRAM_M_REVERSE 1
+#define CRAM_M_UNMAP 2
+
+
+/* CF bitfields */
+#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
+#define CRAM_FLAG_DETACHED (1<<1)
+#define CRAM_FLAG_MATE_DOWNSTREAM (1<<2)
+#define CRAM_FLAG_NO_SEQ (1<<3)
+#define CRAM_FLAG_MASK ((1<<4)-1)
+
+/* Internal only */
+#define CRAM_FLAG_STATS_ADDED (1<<30)
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CRAM_STRUCTS_H_ */
diff --git a/htslib/cram/files.c b/htslib/cram/files.c
new file mode 100644
index 0000000..bed5406
--- /dev/null
+++ b/htslib/cram/files.c
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 1994, 1996-1997, 2000, 2003 MEDICAL RESEARCH COUNCIL
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1 Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2 Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
+MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
+promote products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <config.h>
+
+#include "cram/misc.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Alliant's Concentrix <sys/stat.h> is hugely deficient */
+/* Define things we require in this program */
+/* Methinks S_IFMT and S_IFDIR aren't defined in POSIX */
+#ifndef S_ISDIR
+#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
+#endif /*!S_ISDIR*/
+#ifndef S_ISREG
+#define S_ISREG(m) (((m)&S_IFMT) == S_IFREG)
+#endif /*!S_ISREG*/
+
+int is_directory(char * fn)
+{
+ struct stat buf;
+ if ( stat(fn,&buf) ) return 0;
+ return S_ISDIR(buf.st_mode);
+}
+
+int is_file(char * fn)
+{
+ struct stat buf;
+ if ( stat(fn,&buf) ) return 0;
+ return S_ISREG(buf.st_mode);
+}
+
+int file_exists(char * fn)
+{
+ struct stat buf;
+ return ( stat(fn,&buf) == 0);
+}
+
+int file_size(char * fn)
+{
+ struct stat buf;
+ if ( stat(fn,&buf) != 0) return 0;
+ return buf.st_size;
+}
+
diff --git a/htslib/cram/mFILE.c b/htslib/cram/mFILE.c
new file mode 100644
index 0000000..0d4bd72
--- /dev/null
+++ b/htslib/cram/mFILE.c
@@ -0,0 +1,694 @@
+/*
+Copyright (c) 2005-2006, 2008-2009, 2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "cram/os.h"
+#include "cram/mFILE.h"
+#include "cram/vlen.h"
+
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+
+/*
+ * This file contains memory-based versions of the most commonly used
+ * (by io_lib) stdio functions.
+ *
+ * Actual file IO takes place either on opening or closing an mFILE.
+ *
+ * Coupled to this are a bunch of rather scary macros which can be obtained
+ * by including stdio_hack.h. It is recommended though that you use mFILE.h
+ * instead and replace fopen with mfopen (etc). This is more or less
+ * mandatory if you wish to use both FILE and mFILE structs in a single file.
+ */
+
+static mFILE *m_channel[3]; /* stdin, stdout and stderr fakes */
+
+/*
+ * Reads the entirety of fp into memory. If 'fn' exists it is the filename
+ * associated with fp. This will be used for more optimal reading (via a
+ * stat to identify the size and a single read). Otherwise we use successive
+ * reads until EOF.
+ *
+ * Returns a malloced buffer on success of length *size
+ * NULL on failure
+ */
+static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) {
+ struct stat sb;
+ char *data = NULL;
+ size_t allocated = 0, used = 0;
+ int bufsize = 8192;
+
+#ifdef _WIN32
+ if (binary)
+ _setmode(_fileno(fp), _O_BINARY);
+ else
+ _setmode(_fileno(fp), _O_TEXT);
+#endif
+
+ *size = 0; /* callers see an empty buffer if we bail out with NULL below */
+ if (fn && -1 != stat(fn, &sb)) {
+ data = malloc(allocated = sb.st_size);
+ bufsize = sb.st_size;
+ if (!data && allocated) return NULL; /* malloc(0) may validly give NULL */
+ } else {
+ fn = NULL;
+ }
+ do {
+ if (used + bufsize > allocated) {
+ char *grown = realloc(data, allocated + bufsize);
+ if (!grown) { free(data); return NULL; } /* don't leak the old buffer */
+ data = grown;
+ allocated += bufsize;
+ }
+ /* A short count means EOF or a read error; either one ends the loop. */
+ used += fread(data + used, 1, allocated - used, fp);
+ } while (!feof(fp) && !ferror(fp) && (fn == NULL || used < sb.st_size));
+
+ *size = used;
+ return data;
+}
+
+
+#ifdef HAVE_MMAP
+/*
+ * mmaps the file 'fn' (already open as 'fp') read-only and points mf->data
+ * at the mapping, with mf->size taken from stat() and mf->alloced set to 0.
+ * Returns 0 on success
+ * -1 on failure (stat() or mmap() failed; after mmap() fails mf->data is NULL)
+ */
+int mfmmap(mFILE *mf, FILE *fp, const char *fn) {
+ struct stat sb;
+
+ if (stat(fn, &sb) != 0)
+ return -1;
+
+ mf->size = sb.st_size;
+ mf->data = mmap(NULL, mf->size, PROT_READ, MAP_SHARED,
+ fileno(fp), 0);
+ if (mf->data == MAP_FAILED) { /* mmap() reports failure as MAP_FAILED, not NULL */
+ mf->data = NULL; /* never leave the sentinel where it could be dereferenced */
+ return -1;
+ }
+ mf->alloced = 0;
+ return 0;
+}
+#endif
+
+
+/*
+ * Creates and returns m_channel[0].
+ * We initialise this on the first attempted read, which then slurps in
+ * all of stdin until EOF is met.
+ */
+mFILE *mstdin(void) {
+ if (m_channel[0])
+ return m_channel[0];
+
+ m_channel[0] = mfcreate(NULL, 0);
+ if (NULL == m_channel[0]) return NULL;
+ m_channel[0]->fp = stdin;
+ return m_channel[0];
+}
+
+static void init_mstdin(void) {
+ static int done_stdin = 0;
+ if (done_stdin)
+ return;
+
+ m_channel[0]->data = mfload(stdin, NULL, &m_channel[0]->size, 1);
+ m_channel[0]->mode = MF_READ;
+ done_stdin = 1;
+}
+
+/*
+ * Creates and returns m_channel[1]. This is the fake for stdout. It starts as
+ * an empty buffer which is physically written out only when mfflush or
+ * mfclose are called.
+ */
+mFILE *mstdout(void) {
+ if (m_channel[1])
+ return m_channel[1];
+
+ m_channel[1] = mfcreate(NULL, 0);
+ if (NULL == m_channel[1]) return NULL;
+ m_channel[1]->fp = stdout;
+ m_channel[1]->mode = MF_WRITE;
+ return m_channel[1];
+}
+
+/*
+ * Stderr as an mFILE.
+ * The code handles stderr by returning m_channel[2], but also checking
+ * for stderr in fprintf (the common usage of it) to auto-flush.
+ */
+mFILE *mstderr(void) {
+ if (m_channel[2])
+ return m_channel[2];
+
+ m_channel[2] = mfcreate(NULL, 0);
+ if (NULL == m_channel[2]) return NULL;
+ m_channel[2]->fp = stderr;
+ m_channel[2]->mode = MF_WRITE;
+ return m_channel[2];
+}
+
+
+/*
+ * For creating existing mFILE pointers directly from memory buffers.
+ */
+mFILE *mfcreate(char *data, int size) {
+ mFILE *mf = (mFILE *)malloc(sizeof(*mf));
+ if (NULL == mf) return NULL;
+ mf->fp = NULL;
+ mf->data = data;
+ mf->alloced = size;
+ mf->size = size;
+ mf->eof = 0;
+ mf->offset = 0;
+ mf->flush_pos = 0;
+ mf->mode = MF_READ | MF_WRITE;
+ return mf;
+}
+
+/*
+ * Recreate an existing mFILE to house new data/size.
+ * It also rewinds the file.
+ */
+void mfrecreate(mFILE *mf, char *data, int size) {
+ if (mf->data)
+ free(mf->data);
+ mf->data = data;
+ mf->size = size;
+ mf->alloced = size;
+ mf->eof = 0;
+ mf->offset = 0;
+ mf->flush_pos = 0;
+}
+
+
+/*
+ * Creates a new mFILE to contain the contents of the FILE pointer.
+ * This mFILE is purely for in-memory operations and has no links to the
+ * original FILE* it came from. It also doesn't close the FILE pointer.
+ * Consider using mfreopen() if you need different behaviour.
+ *
+ * Returns mFILE * on success
+ * NULL on failure.
+ */
+mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp) {
+ mFILE *mf;
+
+ /* Open using mfreopen() */
+ if (NULL == (mf = mfreopen(path, mode_str, fp)))
+ return NULL;
+
+ /* Disassociate from the input stream */
+ mf->fp = NULL;
+
+ return mf;
+}
+
+/*
+ * Converts a FILE * to an mFILE *.
+ * Use this for wrapper functions to turn external prototypes requiring
+ * FILE * as an argument into internal code using mFILE *.
+ */
+mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) {
+ mFILE *mf;
+ int r = 0, w = 0, a = 0, b = 0, x = 0, mode = 0;
+
+ /* Parse mode:
+ * r = read file contents (if truncated => don't read)
+ * w = write on close
+ * a = position at end of buffer
+ * x = position at same location as the original fp, don't seek on flush
+ * + = for update (read and write)
+ * m = mmap (read only)
+ */
+ if (strchr(mode_str, 'r'))
+ r = 1, mode |= MF_READ;
+ if (strchr(mode_str, 'w'))
+ w = 1, mode |= MF_WRITE | MF_TRUNC;
+ if (strchr(mode_str, 'a'))
+ w = a = 1, mode |= MF_WRITE | MF_APPEND;
+ if (strchr(mode_str, 'b'))
+ b = 1, mode |= MF_BINARY;
+ if (strchr(mode_str, 'x'))
+ x = 1;
+ if (strchr(mode_str, '+')) {
+ w = 1, mode |= MF_READ | MF_WRITE;
+ if (a)
+ r = 1;
+ }
+#ifdef HAVE_MMAP
+ if (strchr(mode_str, 'm'))
+ if (!w) mode |= MF_MMAP;
+#endif
+
+ if (r) {
+ mf = mfcreate(NULL, 0);
+ if (NULL == mf) return NULL;
+ if (!(mode & MF_TRUNC)) {
+#ifdef HAVE_MMAP
+ if (mode & MF_MMAP) {
+ if (mfmmap(mf, fp, path) == -1) {
+ mf->data = NULL;
+ mode &= ~MF_MMAP;
+ }
+ }
+#endif
+ if (!mf->data) {
+ mf->data = mfload(fp, path, &mf->size, b);
+ mf->alloced = mf->size;
+ if (!a)
+ fseek(fp, 0, SEEK_SET);
+ }
+ }
+ } else if (w) {
+ /* Write - initialise the data structures */
+ mf = mfcreate(NULL, 0);
+ if (NULL == mf) return NULL;
+ } else {
+ fprintf(stderr, "Must specify either r, w or a for mode\n");
+ return NULL;
+ }
+ mf->fp = fp;
+ mf->mode = mode;
+
+ if (x) {
+ mf->mode |= MF_MODEX;
+ }
+
+ if (a) {
+ mf->flush_pos = mf->size;
+ fseek(fp, 0, SEEK_END);
+ }
+
+ return mf;
+}
+
+/*
+ * Opens a file. If we have read access (r or a+) then it loads the entire
+ * file into memory. If we have write access then the pathname is stored.
+ * We do not actually write until an mfclose, which then checks this pathname.
+ */
+mFILE *mfopen(const char *path, const char *mode) {
+ FILE *fp;
+
+ if (NULL == (fp = fopen(path, mode)))
+ return NULL;
+ return mfreopen(path, mode, fp);
+}
+
+/*
+ * Closes an mFILE. If the filename is known (implying write access) then this
+ * also writes the data to disk.
+ *
+ * Stdout is handled by calling mfflush which writes to stdout if appropriate.
+ */
+int mfclose(mFILE *mf) {
+ if (!mf)
+ return -1;
+
+ mfflush(mf);
+
+#ifdef HAVE_MMAP
+ if ((mf->mode & MF_MMAP) && mf->data) {
+ /* Mmaped */
+ munmap(mf->data, mf->size);
+ mf->data = NULL;
+ }
+#endif
+
+ if (mf->fp)
+ fclose(mf->fp);
+
+ mfdestroy(mf);
+
+ return 0;
+}
+
+/*
+ * Closes the file pointer contained within the mFILE without destroying
+ * the in-memory data.
+ *
+ * Attempting to do this on an mmaped buffer is an error.
+ */
+int mfdetach(mFILE *mf) {
+ if (!mf)
+ return -1;
+
+ mfflush(mf);
+ if (mf->mode & MF_MMAP)
+ return -1;
+
+ if (mf->fp) {
+ fclose(mf->fp);
+ mf->fp = NULL;
+ }
+
+ return 0;
+}
+
+/*
+ * Destroys an mFILE structure but does not flush or close it
+ */
+int mfdestroy(mFILE *mf) {
+ if (!mf)
+ return -1;
+
+ if (mf->data)
+ free(mf->data);
+ free(mf);
+
+ return 0;
+}
+
+/*
+ * Steals the data out of an mFILE. The mFILE itself will be closed.
+ * It is up to the caller to free the stolen buffer. If size_out is
+ * not NULL, mf->size will be stored in it.
+ * This is more-or-less the opposite of mfcreate().
+ *
+ * Note, we cannot steal the allocated buffer from an mmaped mFILE.
+ */
+
+void *mfsteal(mFILE *mf, size_t *size_out) {
+ void *data;
+
+ if (!mf) return NULL;
+
+ data = mf->data;
+
+ if (NULL != size_out) *size_out = mf->size;
+
+ if (mfdetach(mf) != 0)
+ return NULL;
+
+ mf->data = NULL;
+ mfdestroy(mf);
+
+ return data;
+}
+
+/*
+ * Seek/tell functions. Nothing more than updating and reporting an
+ * in-memory index. NB we can seek on stdin or stdout even provided we
+ * haven't been flushing.
+ */
+int mfseek(mFILE *mf, long offset, int whence) {
+ switch (whence) {
+ case SEEK_SET:
+ mf->offset = offset;
+ break;
+ case SEEK_CUR:
+ mf->offset += offset;
+ break;
+ case SEEK_END:
+ mf->offset = mf->size + offset;
+ break;
+ default:
+ errno = EINVAL;
+ return -1;
+ }
+
+ mf->eof = 0;
+ return 0;
+}
+
+long mftell(mFILE *mf) {
+ return mf->offset;
+}
+
+void mrewind(mFILE *mf) {
+ mf->offset = 0;
+ mf->eof = 0;
+}
+
+/*
+ * mftruncate is not directly a translation of ftruncate as the latter
+ * takes a file descriptor instead of a FILE *. It performs the analogous
+ * role though.
+ *
+ * If offset is -1 then the file is truncated to be the current file
+ * offset.
+ */
+void mftruncate(mFILE *mf, long offset) {
+ mf->size = offset != -1 ? offset : mf->offset;
+ if (mf->offset > mf->size)
+ mf->offset = mf->size;
+}
+
+int mfeof(mFILE *mf) {
+ return mf->eof;
+}
+
+/*
+ * mFILE read/write functions. Basically these turn fread/fwrite syntax
+ * into memcpy statements, with appropriate memory handling for writing.
+ */
+size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
+ size_t len;
+ char *cptr = (char *)ptr;
+
+ if (mf == m_channel[0]) init_mstdin();
+
+ if (mf->size <= mf->offset)
+ return 0;
+
+ len = size * nmemb <= mf->size - mf->offset
+ ? size * nmemb
+ : mf->size - mf->offset;
+ if (!size)
+ return 0;
+
+ memcpy(cptr, &mf->data[mf->offset], len);
+ mf->offset += len;
+
+ if (len != size * nmemb) {
+ mf->eof = 1;
+ }
+
+ return len / size;
+}
+
+size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
+ if (!(mf->mode & MF_WRITE))
+ return 0;
+
+ /* Append mode => forced all writes to end of file */
+ if (mf->mode & MF_APPEND)
+ mf->offset = mf->size;
+
+ /* Make sure we have enough room */
+ while (size * nmemb + mf->offset > mf->alloced) {
+ size_t new_alloced = mf->alloced ? mf->alloced * 2 : 1024;
+ void * new_data = realloc(mf->data, new_alloced);
+ if (NULL == new_data) return 0;
+ mf->alloced = new_alloced;
+ mf->data = new_data;
+ }
+
+ /* Record where we need to reflush from */
+ if (mf->offset < mf->flush_pos)
+ mf->flush_pos = mf->offset;
+
+ /* Copy the data over */
+ memcpy(&mf->data[mf->offset], ptr, size * nmemb);
+ mf->offset += size * nmemb;
+ if (mf->size < mf->offset)
+ mf->size = mf->offset;
+
+ return nmemb;
+}
+
+int mfgetc(mFILE *mf) {
+ if (mf == m_channel[0]) init_mstdin();
+ if (mf->offset < mf->size) {
+ return (unsigned char)mf->data[mf->offset++];
+ }
+
+ mf->eof = 1;
+ return -1;
+}
+
+int mungetc(int c, mFILE *mf) {
+ if (mf->offset > 0) {
+ mf->data[--mf->offset] = c;
+ return c;
+ }
+
+ mf->eof = 1;
+ return -1;
+}
+
+char *mfgets(char *s, int size, mFILE *mf) {
+ int i;
+
+ if (mf == m_channel[0]) init_mstdin();
+ *s = 0;
+ for (i = 0; i < size-1;) {
+ if (mf->offset < mf->size) {
+ s[i] = mf->data[mf->offset++];
+ if (s[i++] == '\n')
+ break;
+ } else {
+ mf->eof = 1;
+ break;
+ }
+ }
+
+ s[i] = 0;
+ return i ? s : NULL;
+}
+
+/*
+ * Flushes an mFILE. If this is a real open of a file in write mode then
+ * mFILE->fp will be set. We then write out any new data in mFILE since the
+ * last flush. We cannot tell what may have been modified as we don't keep
+ * track of that, so we typically rewrite out the entire file contents between
+ * the last flush_pos and the end of file.
+ *
+ * For stderr/stdout we also reset the offsets so we cannot modify things
+ * we've already output.
+ */
+int mfflush(mFILE *mf) {
+ if (!mf->fp)
+ return 0;
+
+ /* FIXME: only do this when opened in write mode */
+ if (mf == m_channel[1] || mf == m_channel[2]) {
+ if (mf->flush_pos < mf->size) {
+ size_t bytes = mf->size - mf->flush_pos;
+ if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
+ return -1;
+ if (0 != fflush(mf->fp))
+ return -1;
+ }
+
+ /* Stdout & stderr are non-seekable streams so throw away the data */
+ mf->offset = mf->size = mf->flush_pos = 0;
+ }
+
+ /* only flush when opened in write mode */
+ if (mf->mode & MF_WRITE) {
+ if (mf->flush_pos < mf->size) {
+ size_t bytes = mf->size - mf->flush_pos;
+ if (!(mf->mode & MF_MODEX)) {
+ fseek(mf->fp, mf->flush_pos, SEEK_SET);
+ }
+ if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
+ return -1;
+ if (0 != fflush(mf->fp))
+ return -1;
+ }
+ if (ftell(mf->fp) != -1 &&
+ ftruncate(fileno(mf->fp), ftell(mf->fp)) == -1)
+ return -1;
+ mf->flush_pos = mf->size;
+ }
+
+ return 0;
+}
+
+/*
+ * A wrapper around vsprintf() to write to an mFILE. This also uses vflen() to
+ * estimate how many additional bytes of storage will be required for the
+ * vsprintf to work.
+ */
+int mfprintf(mFILE *mf, char *fmt, ...) {
+ int ret;
+ size_t est_length;
+ va_list args;
+
+ va_start(args, fmt);
+ est_length = vflen(fmt, args);
+ va_end(args);
+ while (est_length + mf->offset > mf->alloced) {
+ size_t new_alloced = mf->alloced ? mf->alloced * 2 : 1024;
+ void * new_data = realloc(mf->data, new_alloced);
+ if (NULL == new_data) return -1;
+ mf->alloced = new_alloced;
+ mf->data = new_data;
+ }
+
+ va_start(args, fmt);
+ ret = vsprintf(&mf->data[mf->offset], fmt, args);
+ va_end(args);
+
+ if (ret > 0) {
+ mf->offset += ret;
+ if (mf->size < mf->offset)
+ mf->size = mf->offset;
+ }
+
+ if (mf->fp == stderr) {
+ /* Auto-flush for stderr */
+ if (0 != mfflush(mf)) return -1;
+ }
+
+ return ret;
+}
+
+/*
+ * Converts an mFILE from binary to ascii mode by replacing all
+ * cr-nl with nl.
+ *
+ * Primarily used on windows when we've uncompressed a binary file which
+ * happens to be a text file (eg Experiment File). Previously we would have
+ * seeked back to the start and used _setmode(fileno(fp), _O_TEXT).
+ *
+ * Side effect: resets offset and flush_pos back to the start.
+ */
+void mfascii(mFILE *mf) {
+ size_t p1, p2;
+
+ for (p1 = p2 = 1; p1 < mf->size; p1++, p2++) {
+ if (mf->data[p1] == '\n' && mf->data[p1-1] == '\r') {
+ p2--; /* delete the \r */
+ }
+ mf->data[p2] = mf->data[p1];
+ }
+ if (mf->size) mf->size = p2; /* an empty buffer must stay empty, not grow to 1 */
+
+ mf->offset = mf->flush_pos = 0;
+}
diff --git a/htslib/cram/mFILE.h b/htslib/cram/mFILE.h
new file mode 100644
index 0000000..05a3a88
--- /dev/null
+++ b/htslib/cram/mFILE.h
@@ -0,0 +1,89 @@
+/*
+Copyright (c) 2005-2006, 2008-2009 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _MFILE_H_
+#define _MFILE_H_
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * An in-memory file: a growable byte buffer (data) together with the stdio
+ * stream (fp) that mfflush() writes the buffered bytes to.
+ */
+typedef struct {
+ FILE *fp; /* stdio stream that buffered data is flushed to */
+ char *data; /* the in-memory contents */
+ size_t alloced; /* number of bytes currently allocated for data */
+ int eof; /* presumably an end-of-file indicator (see mfeof) -- TODO confirm */
+ int mode; /* open mode in MF_?? define bit pattern */
+ size_t size; /* number of valid bytes held in data */
+ size_t offset; /* current position within data; mfprintf() writes here */
+ size_t flush_pos; /* start of the data mfflush() has still to write to fp */
+} mFILE;
+
+/*
+ * Bit flags for mFILE.mode. Each value is a distinct power of two so that
+ * they can be OR-ed together.
+ * NOTE(review): the mapping from fopen-style mode letters to these flags is
+ * not visible in this header -- confirm against mfopen()/mfreopen().
+ */
+#define MF_READ 1
+#define MF_WRITE 2
+#define MF_APPEND 4
+#define MF_BINARY 8
+#define MF_TRUNC 16
+#define MF_MODEX 32
+#define MF_MMAP 64
+
+/*
+ * The mFILE API. The names mirror their stdio counterparts (fopen, fseek,
+ * fread, ...) but take an mFILE instead of a FILE.
+ * NOTE(review): only mfflush, mfprintf and mfascii are documented from their
+ * implementations; the remaining contracts should be confirmed in mFILE.c.
+ */
+mFILE *mfreopen(const char *path, const char *mode, FILE *fp);
+mFILE *mfopen(const char *path, const char *mode);
+int mfdetach(mFILE *mf);
+int mfclose(mFILE *mf);
+int mfdestroy(mFILE *mf);
+int mfseek(mFILE *mf, long offset, int whence);
+long mftell(mFILE *mf);
+void mrewind(mFILE *mf);
+void mftruncate(mFILE *mf, long offset);
+int mfeof(mFILE *mf);
+size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf);
+size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf);
+int mfgetc(mFILE *mf);
+int mungetc(int c, mFILE *mf);
+/* mfcreate(NULL, 0) is how callers obtain an empty mFILE to mfwrite() into. */
+mFILE *mfcreate(char *data, int size);
+mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp);
+void mfrecreate(mFILE *mf, char *data, int size);
+void *mfsteal(mFILE *mf, size_t *size_out);
+char *mfgets(char *s, int size, mFILE *mf);
+/* Returns 0 on success, -1 if writing to or truncating the stream fails. */
+int mfflush(mFILE *mf);
+/*
+ * printf-style output into the mFILE at its current offset. Returns the
+ * vsprintf() result, or -1 on allocation or flush failure.
+ */
+int mfprintf(mFILE *mf, char *fmt, ...);
+mFILE *mstdin(void);
+mFILE *mstdout(void);
+mFILE *mstderr(void);
+/* Replaces cr-nl with nl in place; resets offset and flush_pos to 0. */
+void mfascii(mFILE *mf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MFILE_H_ */
diff --git a/htslib/cram/misc.h b/htslib/cram/misc.h
new file mode 100644
index 0000000..681b28c
--- /dev/null
+++ b/htslib/cram/misc.h
@@ -0,0 +1,110 @@
+/*
+Copyright (c) 1994-1997, 2001-2002 MEDICAL RESEARCH COUNCIL
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1 Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2 Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
+MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
+promote products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+Copyright (c) 2003-2013 Genome Research Ltd.
+
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _misc_h
+#define _misc_h
+
+#include "cram/os.h"
+
+#include <stdio.h>
+#include <stdarg.h> /* varargs needed for v*printf() prototypes */
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This informs gcc that crash() doesn't return, so it doesn't need to
+ * concern itself that code paths going via crash could mean some variables
+ * being undefined and then issuing uninitialised variable warnings.
+ * This particularly affected convert.
+ *
+ * When __GNUC__ is not defined the macro expands to nothing, so it can be
+ * attached to declarations unconditionally.
+ */
+#ifdef __GNUC__
+# define __NORETURN__ __attribute__ ((__noreturn__))
+#else
+# define __NORETURN__
+#endif
+
+/*
+ * Used for printf style argument checking. We can request a function such
+ * as vTcl_SetResult does argument checking, avoiding bugs with using
+ * %d and passing in a 64-bit record.
+ *
+ * a and b are passed straight through to gcc's format(printf, a, b)
+ * attribute: a is the position of the format-string parameter and b the
+ * position of the first argument to check against it.
+ * When __GNUC__ is not defined the macro expands to nothing.
+ */
+#ifdef __GNUC__
+# define __PRINTF_FORMAT__(a,b) __attribute__ ((format (printf, a, b)))
+#else
+# define __PRINTF_FORMAT__(a,b)
+#endif
+
+/*
+ * File-system queries on the path fn, implemented outside this header.
+ * NOTE(review): the exact return conventions are not visible here;
+ * open_trace_file.c uses is_file() as a boolean test -- confirm the others.
+ */
+extern int is_directory(char * fn);
+extern int is_file(char * fn);
+extern int file_size(char * fn);
+
+/*
+ * Smaller / larger of two values. Each argument appears twice in the
+ * expansion, so do not pass expressions with side effects (e.g. i++).
+ */
+#define MIN(A,B) ( ( (A) < (B) ) ? (A) : (B) )
+#define MAX(A,B) ( ( (A) > (B) ) ? (A) : (B) )
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_misc_h*/
diff --git a/htslib/cram/open_trace_file.c b/htslib/cram/open_trace_file.c
new file mode 100644
index 0000000..62f4087
--- /dev/null
+++ b/htslib/cram/open_trace_file.c
@@ -0,0 +1,414 @@
+/*
+Author: James Bonfield
+
+Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
+MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
+promote products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+Copyright (c) 2008, 2009, 2013, 2014 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "cram/os.h"
+#ifndef PATH_MAX
+# define PATH_MAX 1024
+#endif
+
+#include "cram/open_trace_file.h"
+#include "cram/misc.h"
+#include "htslib/hfile.h"
+
+/*
+ * Tokenises the search path splitting on colons (unix) or semicolons
+ * (windows).
+ * We also explicitly add a "./" to the end of the search path
+ *
+ * A doubled colon "::" is an escape for a literal ':'. When the separator
+ * is a colon, elements starting with http:, ftp:, |http:, |ftp:, URL=http:
+ * or URL=ftp: keep the colon after the scheme and the one following the
+ * host name (host:port) without needing the "::" escape.
+ * A NULL searchpath is treated as an empty one.
+ *
+ * Returns: A new search path with items separated by nul chars. Two nul
+ *          chars in a row represent the end of the tokenised path.
+ *          Returns NULL for a failure.
+ *
+ * The returned data has been malloced. It is up to the caller to free this
+ * memory.
+ */
+char *tokenise_search_path(char *searchpath) {
+    char *newsearch;
+    size_t i, j;
+    size_t len;
+#ifdef _WIN32
+    char path_sep = ';';
+#else
+    char path_sep = ':';
+#endif
+
+    if (!searchpath)
+        searchpath = "";
+
+    /*
+     * Every output byte consumes at least one input byte, so len bytes plus
+     * the 5-byte trailer ("\0./\0\0") is always sufficient.
+     */
+    len = strlen(searchpath);
+    newsearch = malloc(len + 5);
+    if (!newsearch)
+        return NULL;
+
+    for (i = 0, j = 0; i < len; i++) {
+        /* "::" => ":".  Used for escaping colons in http://foo */
+        if (i + 1 < len && searchpath[i] == ':' && searchpath[i+1] == ':') {
+            newsearch[j++] = ':';
+            i++;
+            continue;
+        }
+
+        /* Handle http:// and ftp:// too without :: */
+        if (path_sep == ':') {
+            if ((i == 0 || searchpath[i-1] == ':') &&
+                (!strncmp(&searchpath[i], "http:",     5) ||
+                 !strncmp(&searchpath[i], "ftp:",      4) ||
+                 !strncmp(&searchpath[i], "|http:",    6) ||
+                 !strncmp(&searchpath[i], "|ftp:",     5) ||
+                 !strncmp(&searchpath[i], "URL=http:", 9) ||
+                 !strncmp(&searchpath[i], "URL=ftp:",  8))) {
+                /* Copy the scheme up to and including its ':' */
+                do {
+                    newsearch[j++] = searchpath[i];
+                } while (i < len && searchpath[i++] != ':');
+                /* Tolerate an escaped "::" after the scheme */
+                if (searchpath[i] == ':')
+                    i++;
+                if (searchpath[i] == '/')
+                    newsearch[j++] = searchpath[i++];
+                if (searchpath[i] == '/')
+                    newsearch[j++] = searchpath[i++];
+                // Look for host:port
+                if (i < len) {
+                    do {
+                        newsearch[j++] = searchpath[i++];
+                    } while (i < len && searchpath[i] != ':' &&
+                             searchpath[i] != '/');
+                }
+                /* Keep the ':' or '/' that ended the host name */
+                if (i < len)
+                    newsearch[j++] = searchpath[i++];
+                if (searchpath[i] == ':')
+                    i++;
+
+                /*
+                 * The URL ran to the end of the input: stop here rather
+                 * than reading (and copying) bytes beyond the terminator.
+                 */
+                if (i >= len)
+                    break;
+            }
+        }
+
+        if (searchpath[i] == path_sep) {
+            /* Skip blank path components */
+            if (j && newsearch[j-1] != 0)
+                newsearch[j++] = 0;
+        } else {
+            newsearch[j++] = searchpath[i];
+        }
+    }
+
+    /* Terminate the last element, then append "./" and the end marker */
+    if (j)
+        newsearch[j++] = 0;
+    newsearch[j++] = '.';
+    newsearch[j++] = '/';
+    newsearch[j++] = 0;
+    newsearch[j++] = 0;
+
+    return newsearch;
+}
+
+/*
+ * Fetches 'url' into a new in-memory mFILE. Every "%s" in url is replaced
+ * by 'file' first; expansion stops silently once the expanded URL would no
+ * longer fit in the local 8192-byte buffer.
+ *
+ * Returns an mFILE holding the entire contents of the url, rewound to the
+ *         start;
+ *         NULL on failure (open, allocation, read or close error).
+ */
+mFILE *find_file_url(char *file, char *url) {
+    char buf[8192], *cp;
+    mFILE *mf = NULL;
+    int maxlen = 8190 - strlen(file), len;
+    hFILE *hf;
+
+    /* Expand %s for the trace name */
+    for (cp = buf; *url && cp - buf < maxlen; url++) {
+        if (*url == '%' && *(url+1) == 's') {
+            url++;
+            cp += strlen(strcpy(cp, file));
+        } else {
+            *cp++ = *url;
+        }
+    }
+    *cp++ = 0;
+
+    if (!(hf = hopen(buf, "r")))
+        return NULL;
+
+    if (NULL == (mf = mfcreate(NULL, 0))) {
+        /* Don't leak the open handle when the mFILE can't be created */
+        hclose_abruptly(hf);
+        return NULL;
+    }
+
+    /* buf is reused as the transfer buffer now that the URL is opened */
+    while ((len = hread(hf, buf, 8192)) > 0) {
+        if (mfwrite(buf, len, 1, mf) <= 0) {
+            hclose_abruptly(hf);
+            mfdestroy(mf);
+            return NULL;
+        }
+    }
+    if (hclose(hf) < 0 || len < 0) {
+        mfdestroy(mf);
+        return NULL;
+    }
+
+    mrewind(mf);
+    return mf;
+}
+
+/*
+ * Takes a dirname possibly including % rules and appends the filename
+ * to it.
+ *
+ * One trailing '/' on dirname is ignored. If file is absolute, or dirname
+ * is "." or "./", the result is simply a copy of file.
+ * Otherwise each "%Ns" in dirname is replaced by the next N characters of
+ * file (fewer if file runs out) and "%s" by all that remains of it; a
+ * count that is zero or negative is treated like "%s". Any other '%'
+ * sequence is copied literally. Whatever is left of file afterwards is
+ * appended after a '/'.
+ *
+ * Returns expanded pathname or NULL for malloc failure. The caller frees
+ * the result.
+ */
+static char *expand_path(char *file, char *dirname) {
+    size_t len = strlen(dirname);
+    size_t lenf = strlen(file);
+    char *cp, *path, *path_end, *dir_end;
+
+    /*
+     * Worst expansion is DIR/FILE: every byte written below comes either
+     * from dirname (at most len) or from file (at most lenf in total, as
+     * file advances past what has been used), plus one '/' and the nul.
+     */
+    path = malloc(len + lenf + 2);
+    if (!path)
+        return NULL;
+
+    /* Guard len > 0 so an empty dirname doesn't index dirname[-1] */
+    if (len > 0 && dirname[len-1] == '/')
+        len--;
+    dir_end = dirname + len;
+
+    /* Special case for "./" or absolute filenames */
+    if (*file == '/' || (len == 1 && *dirname == '.')) {
+        memcpy(path, file, lenf + 1);
+        return path;
+    }
+
+    /* Handle %[0-9]*s expansions, if required */
+    path_end = path;
+    while (dirname < dir_end &&
+           (cp = memchr(dirname, '%', dir_end - dirname))) {
+        char *endp;
+        long l = strtol(cp + 1, &endp, 10);
+        size_t n;
+
+        if (endp >= dir_end || *endp != 's') {
+            /* Not a %s rule: copy it through as-is, never past dir_end */
+            char *stop = (endp < dir_end) ? endp + 1 : dir_end;
+            memcpy(path_end, dirname, stop - dirname);
+            path_end += stop - dirname;
+            dirname = stop;
+            continue;
+        }
+
+        /* Literal text preceding the '%' */
+        memcpy(path_end, dirname, cp - dirname);
+        path_end += cp - dirname;
+
+        /* Take up to l characters of the name, or all of it */
+        n = strlen(file);
+        if (l > 0 && (size_t)l < n)
+            n = (size_t)l;
+        memcpy(path_end, file, n);
+        path_end += n;
+        file += n;
+
+        dirname = endp + 1;
+    }
+
+    /* Remainder of dirname, then "/" plus any unused part of file */
+    memcpy(path_end, dirname, dir_end - dirname);
+    path_end += dir_end - dirname;
+    *path_end = 0;
+    if (*file) {
+        *path_end++ = '/';
+        strcpy(path_end, file);
+    }
+
+    return path;
+}
+
+/*
+ * Searches for file in the directory 'dirname' (which may contain the %
+ * rules understood by expand_path). If it finds it, it opens it with
+ * mfopen() in mode "rbm".
+ *
+ * Returns mFILE pointer if found
+ *         NULL if not (including when the path could not be allocated)
+ */
+static mFILE *find_file_dir(char *file, char *dirname) {
+    char *path;
+    mFILE *mf = NULL;
+
+    path = expand_path(file, dirname);
+
+    /* expand_path returns NULL on malloc failure; don't probe that */
+    if (path && is_file(path))
+        mf = mfopen(path, "rbm");
+
+    free(path);
+    return mf;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * Public functions below.
+ */
+
+/*
+ * Opens a trace file named 'file'.
+ *
+ * 'file' is looked for in each of the locations listed in 'path' (a colon
+ * separated list, see tokenise_search_path), in order. The tokeniser always
+ * appends "./", so the current directory is tried after the listed
+ * locations. If 'path' is NULL it uses the RAWDATA environment variable
+ * instead. Elements starting with "URL=", "http:" or "ftp:" are fetched
+ * with find_file_url; all others are treated as directories.
+ *
+ * If none of those succeed and relative_to is non-NULL, the directory part
+ * of relative_to is tried last. This may (for example) be the name of an
+ * experiment file referencing the trace file, so that the trace file is
+ * picked up in the same directory as the experiment file. A relative_to
+ * longer than PATH_MAX is ignored.
+ *
+ * Returns a mFILE pointer when found.
+ *         NULL otherwise.
+ */
+mFILE *open_path_mfile(char *file, char *path, char *relative_to) {
+    char *newsearch;
+    char *ele;
+    mFILE *fp;
+
+    /* Use path first */
+    if (!path)
+        path = getenv("RAWDATA");
+    if (NULL == (newsearch = tokenise_search_path(path)))
+        return NULL;
+
+    /*
+     * Step through the search path testing out each component.
+     * We now look through each path element treating some prefixes as
+     * special, otherwise we treat the element as a directory.
+     */
+    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
+        /*
+         * '|' prefixing a path component indicates that we do not
+         * wish to perform the compression extension searching in that
+         * location.
+         *
+         * NB: this has been removed from the htslib implementation, so
+         * the prefix is simply skipped.
+         */
+        char *ele2 = (*ele == '|') ? ele+1 : ele;
+
+        if (0 == strncmp(ele2, "URL=", 4))
+            fp = find_file_url(file, ele2+4);
+        else if (!strncmp(ele2, "http:", 5) || !strncmp(ele2, "ftp:", 4))
+            fp = find_file_url(file, ele2);
+        else
+            fp = find_file_dir(file, ele2);
+
+        if (fp) {
+            free(newsearch);
+            return fp;
+        }
+    }
+
+    free(newsearch);
+
+    /* Look in the same location as the incoming 'relative_to' filename */
+    if (relative_to) {
+        size_t rlen = strlen(relative_to);
+
+        /* Only copy when it fits; an over-long name cannot be searched */
+        if (rlen <= PATH_MAX) {
+            char relative_path[PATH_MAX+1];
+            char *cp;
+
+            memcpy(relative_path, relative_to, rlen + 1);
+            if ((cp = strrchr(relative_path, '/')))
+                *cp = 0;
+            if ((fp = find_file_dir(file, relative_path)))
+                return fp;
+        }
+    }
+
+    return NULL;
+}
+
+
+/*
+ * As per open_path_mfile, but searching only for local filenames: search
+ * path elements starting with "URL=", "http:" or "ftp:" are skipped.
+ * This is useful as we may avoid doing a full mfopen and loading
+ * the entire file into memory.
+ *
+ * If 'path' is NULL the RAWDATA environment variable is used instead.
+ *
+ * Returns the expanded pathname if found (malloced; the caller frees it).
+ *         NULL if not
+ */
+char *find_path(char *file, char *path) {
+    char *newsearch;
+    char *ele;
+
+    /* Use path first */
+    if (!path)
+        path = getenv("RAWDATA");
+    if (NULL == (newsearch = tokenise_search_path(path)))
+        return NULL;
+
+    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
+        char *ele2 = (*ele == '|') ? ele+1 : ele;
+        char *outpath;
+
+        /* Remote locations are never candidates here */
+        if (!strncmp(ele2, "URL=", 4) ||
+            !strncmp(ele2, "http:", 5) ||
+            !strncmp(ele2, "ftp:", 4))
+            continue;
+
+        /* expand_path returns NULL on malloc failure; don't probe that */
+        outpath = expand_path(file, ele2);
+        if (outpath && is_file(outpath)) {
+            free(newsearch);
+            return outpath;
+        }
+        free(outpath);
+    }
+
+    free(newsearch);
+
+    return NULL;
+}
diff --git a/htslib/cram/open_trace_file.h b/htslib/cram/open_trace_file.h
new file mode 100644
index 0000000..a067dc6
--- /dev/null
+++ b/htslib/cram/open_trace_file.h
@@ -0,0 +1,125 @@
+/*
+Author: James Bonfield
+
+Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ . Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ . Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ . Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
+MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
+promote products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+Copyright (c) 2008, 2009, 2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _OPEN_TRACE_FILE_H_
+#define _OPEN_TRACE_FILE_H_
+
+#include "cram/mFILE.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Tokenises the search path splitting on colons (unix) or semicolons
+ * (windows).
+ * We also explicitly add a "./" to the end of the search path
+ *
+ * "::" in the input stands for a literal ':'. A NULL searchpath is treated
+ * as an empty one.
+ *
+ * Returns: A new search path with items separated by nul chars. Two nul
+ * chars in a row represent the end of the tokenised path.
+ * Returns NULL for a failure.
+ *
+ * The returned data has been malloced. It is up to the caller to free this
+ * memory.
+ */
+char *tokenise_search_path(char *searchpath);
+
+/*
+ * Opens a trace file named 'file'.
+ *
+ * 'file' is looked for in each of the locations listed in 'path' (which is
+ * a colon separated list), then in the current directory (the tokenised
+ * path always ends with "./"), and finally, if relative_to is non-NULL, in
+ * the directory part of relative_to.
+ * If 'path' is NULL it uses the RAWDATA environment variable instead.
+ *
+ * relative_to may (for example) be the name of an experiment file
+ * referencing the trace file. In this case by passing relative_to as the
+ * experiment file filename the trace file will be picked up in the same
+ * directory as the experiment file. Relative_to may be supplied as NULL.
+ *
+ * Returns a mFILE pointer when found.
+ * NULL otherwise.
+ */
+mFILE *open_path_mfile(char *file, char *path, char *relative_to);
+
+/*
+ * Fetches 'url' after replacing each "%s" in it with 'file'.
+ *
+ * Returns a mFILE containing the entire contents of the url;
+ * NULL on failure.
+ */
+mFILE *find_file_url(char *file, char *url);
+
+
+/*
+ * As per open_path_mfile, but searching only for local filenames.
+ * This is useful as we may avoid doing a full mfopen and loading
+ * the entire file into memory.
+ *
+ * Returns the expanded pathname if found. It has been malloced; it is up
+ * to the caller to free it.
+ * NULL if not
+ */
+char *find_path(char *file, char *path);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OPEN_TRACE_FILE_H_ */
diff --git a/htslib/cram/os.h b/htslib/cram/os.h
new file mode 100644
index 0000000..22d8096
--- /dev/null
+++ b/htslib/cram/os.h
@@ -0,0 +1,308 @@
+/*
+Copyright (c) 1993, 1995-2002 MEDICAL RESEARCH COUNCIL
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1 Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2 Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
+MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
+promote products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+Copyright (c) 2004, 2006, 2009-2011, 2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * File: os.h
+ *
+ * Author:
+ * MRC Laboratory of Molecular Biology
+ * Hills Road
+ * Cambridge CB2 2QH
+ * United Kingdom
+ *
+ * Description: operating system specific type definitions
+ *
+ */
+
+#ifndef _OS_H_
+#define _OS_H_
+
+#include <limits.h>
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*-----------------------------------------------------------------------------
+ * Detection of endianness. The main part of this is done in autoconf, but
+ * for the case of MacOS FAT binaries we fall back on auto-sensing based on
+ * processor type too.
+ *
+ * Outcome: exactly one of SP_BIG_ENDIAN or SP_LITTLE_ENDIAN is defined.
+ *
+ * NOTE(review): SP_LITTLE_ENDIAN is defined unconditionally just below, so
+ * the whole auto-detect section guarded by
+ * "!defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN)" is never compiled
+ * as the file stands, and a big-endian build would still be treated as
+ * little-endian. Confirm whether this line is meant to be generated by
+ * configure.
+ */
+
+/* Set by autoconf */
+#define SP_LITTLE_ENDIAN
+
+/* Mac FAT binaries or unknown. Auto detect based on CPU type */
+#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN)
+
+/*
+ * x86 equivalents
+ */
+#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686)
+# if defined(SP_BIG_ENDIAN)
+# undef SP_BIG_ENDIAN
+# endif
+# define SP_LITTLE_ENDIAN
+#endif
+
+/*
+ * DEC Alpha
+ * NOTE(review): this selects big endian for Alpha -- verify, as Alpha
+ * systems are commonly run little-endian.
+ */
+#if defined(__alpha__) || defined(__alpha)
+# if defined(SP_LITTLE_ENDIAN)
+# undef SP_LITTLE_ENDIAN
+# endif
+# define SP_BIG_ENDIAN
+#endif
+
+/*
+ * SUN Sparc
+ */
+#if defined(__sparc__) || defined(__sparc)
+# if defined(SP_LITTLE_ENDIAN)
+# undef SP_LITTLE_ENDIAN
+# endif
+# define SP_BIG_ENDIAN
+#endif
+
+/*
+ * PowerPC
+ */
+#if defined(__ppc__) || defined(__ppc)
+# if defined(SP_LITTLE_ENDIAN)
+# undef SP_LITTLE_ENDIAN
+# endif
+# define SP_BIG_ENDIAN
+#endif
+
+/* Some catch-alls */
+#if defined(__LITTLE_ENDIAN__) || defined(__LITTLEENDIAN__)
+# define SP_LITTLE_ENDIAN
+#endif
+
+#if defined(__BIG_ENDIAN__) || defined(__BIGENDIAN__)
+# define SP_BIG_ENDIAN
+#endif
+
+/* Sanity checks: the detection above must settle on exactly one order */
+#if defined(SP_BIG_ENDIAN) && defined(SP_LITTLE_ENDIAN)
+# error Both BIG and LITTLE endian defined. Fix os.h and/or Makefile
+#endif
+
+#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN)
+# error Neither BIG nor LITTLE endian defined. Fix os.h and/or Makefile
+#endif
+
+#endif
+
+/*-----------------------------------------------------------------------------
+ * Allow for unaligned memory access. This is used in BAM code as the packed
+ * structure has 4-byte cigar ints after the variable length name.
+ *
+ * ALLOW_UAC is only defined for the x86 / x86-64 compiler macros listed
+ * below; on every other target it is left undefined.
+ *
+ * Consider using AX_CHECK_ALIGNED_ACCESS_REQUIRED in autoconf.
+ */
+#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686)
+# define ALLOW_UAC
+#endif
+
+/*-----------------------------------------------------------------------------
+ * Byte swapping macros
+ */
+
+/*
+ * Our new swap runs at the same speed on Ultrix, but substantially faster
+ * (300% for swap_int4, ~50% for swap_int2) on an Alpha (due to the lack of
+ * decent 'char' support).
+ *
+ * They also have the ability to swap in situ (src == dst). Newer code now
+ * relies on this so don't change back!
+ *
+ * Each macro yields x with its bytes in reverse order (8, 4 or 2 bytes).
+ * x is parenthesised in the expansion so that arguments built from low
+ * precedence operators (eg "a | b") are swapped as a whole. x is evaluated
+ * more than once, so it must not have side effects.
+ */
+#define iswap_int8(x) \
+    ((((x) & 0x00000000000000ffLL) << 56) + \
+     (((x) & 0x000000000000ff00LL) << 40) + \
+     (((x) & 0x0000000000ff0000LL) << 24) + \
+     (((x) & 0x00000000ff000000LL) <<  8) + \
+     (((x) & 0x000000ff00000000LL) >>  8) + \
+     (((x) & 0x0000ff0000000000LL) >> 24) + \
+     (((x) & 0x00ff000000000000LL) >> 40) + \
+     (((x) & 0xff00000000000000LL) >> 56))
+
+#define iswap_int4(x) \
+    ((((x) & 0x000000ff) << 24) + \
+     (((x) & 0x0000ff00) <<  8) + \
+     (((x) & 0x00ff0000) >>  8) + \
+     (((x) & 0xff000000) >> 24))
+
+#define iswap_int2(x) \
+    ((((x) & 0x00ff) << 8) + \
+     (((x) & 0xff00) >> 8))
+
+/*
+ * Linux systems may use byteswap.h to get assembly versions of byte-swap
+ * on intel systems. This can be as trivial as the bswap opcode, which works
+ * out at over 2-times faster than iswap_int4 above.
+ *
+ * This override is currently switched off by the enclosing "#if 0", so the
+ * portable iswap_int* macros above are the ones in effect.
+ */
+#if 0
+#if defined(__linux__)
+# include <byteswap.h>
+# undef iswap_int8
+# undef iswap_int4
+# undef iswap_int2
+# define iswap_int8 bswap_64
+# define iswap_int4 bswap_32
+# define iswap_int2 bswap_16
+#endif
+#endif
+
+
+/*
+ * Macros to specify that data read in is of a particular endianness.
+ * The macros here swap to the appropriate order for the particular machine
+ * running the macro and return the new answer. These may also be used when
+ * writing to a file to specify that we wish to write in (eg) big endian
+ * format.
+ *
+ * This leads to efficient code as most of the time these macros are
+ * trivial.
+ *
+ * le_int4 / le_int2 convert a 4- or 2-byte little-endian value to or from
+ * host order: a byte swap on big-endian hosts, the identity otherwise.
+ */
+#ifdef SP_BIG_ENDIAN
+#define le_int4(x) iswap_int4((x))
+#define le_int2(x) iswap_int2((x))
+#endif
+
+#ifdef SP_LITTLE_ENDIAN
+#define le_int4(x) (x)
+#define le_int2(x) (x)
+#endif
+
+/*-----------------------------------------------------------------------------
+ * <inttypes.h> definitions, in case they're not present.
+ *
+ * Only used when PRId64 is not already defined.
+ * NOTE(review): the 64-bit fallbacks use the "l" length modifier, which
+ * assumes long is 64 bits wide -- confirm for targets where it is not.
+ */
+
+#ifndef PRId64
+#define __PRI64__ "l"
+#define PRId64 __PRI64__ "d"
+#define PRId32 "d"
+#define PRId16 "d"
+#define PRId8 "d"
+#define PRIu64 __PRI64__ "u"
+#define PRIu32 "u"
+#define PRIu16 "u"
+#define PRIu8 "u"
+#endif
+
+/*-----------------------------------------------------------------------------
+ * Operating system specifics.
+ * These ought to be done by autoconf, but are legacy code.
+ */
+/*
+ * SunOS 4.x
+ * Even though we use the ANSI gcc, we make use of the standard SunOS 4.x
+ * libraries and include files, which are non-ansi, so the fseek() whence
+ * constants are supplied here.
+ */
+#if defined(__sun__) && !defined(__svr4__)
+#define SEEK_SET 0
+#define SEEK_CUR 1
+#define SEEK_END 2
+#endif
+
+/*
+ * Microsoft Visual C++
+ * Windows
+ * Maps the POSIX names used by the code onto the underscore-prefixed
+ * equivalents.
+ */
+#if defined(_MSC_VER)
+#define popen _popen
+#define pclose _pclose
+#define ftruncate(fd,len) _chsize(fd,len)
+#endif
+
+
+/*
+ * Microsoft Windows running MinGW
+ * Note that sysconf() is replaced wholesale: every query yields 512,
+ * whatever its argument.
+ */
+#if defined(__MINGW32__)
+/* #define mkdir(filename,mode) mkdir((filename)) */
+#define sysconf(x) 512
+#define ftruncate(fd,len) _chsize(fd,len)
+#endif
+
+/*
+ * Generic WIN32 API issues
+ * Provides off_t / fseeko / ftello when the build has not defined
+ * HAVE_FSEEKO.
+ */
+#ifdef _WIN32
+# ifndef HAVE_FSEEKO
+# if __MSVCRT_VERSION__ >= 0x800
+ /* if you have MSVCR80 installed then you can use these definitions: */
+# define off_t __int64
+# define fseeko _fseeki64
+# define ftello _ftelli64
+# else
+ /* otherwise we're stuck with 32-bit file support */
+# define off_t long
+# define fseeko fseek
+# define ftello ftell
+# endif
+# endif /* !HAVE_FSEEKO */
+#endif /* _WIN32 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_OS_H_*/
diff --git a/htslib/cram/pooled_alloc.c b/htslib/cram/pooled_alloc.c
new file mode 100644
index 0000000..b15f88e
--- /dev/null
+++ b/htslib/cram/pooled_alloc.c
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2009 Genome Research Ltd.
+Author: Rob Davies <rmd at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include "cram/pooled_alloc.h"
+#include "cram/misc.h"
+
+//#define TEST_MAIN
+
+/*
+ * Upper bound, in bytes, on the size of a single pool (1 MiB).
+ * Parenthesised so the macro expands correctly inside larger expressions.
+ */
+#define PSIZE (1024*1024)
+
+// credit to http://graphics.stanford.edu/~seander/bithacks.html
+/*
+ * Rounds v up to a power of two: the highest set bit of v-1 is smeared into
+ * every lower bit position and one is added back. A value that is already a
+ * power of two comes back unchanged. The result is returned as an int.
+ */
+static int next_power_2(unsigned int v) {
+    unsigned int shift;
+
+    v -= 1;
+    for (shift = 1; shift <= 16; shift *= 2)
+        v |= v >> shift;
+    v += 1;
+
+    return v;
+}
+
+/*
+ * Creates a pool allocator that hands out fixed-size items.
+ *
+ * dsize is the item size in bytes. It is rounded up to a multiple of
+ * sizeof(void *), and to at least sizeof(void *), so that a freed item can
+ * hold the free-list link.
+ *
+ * Each pool is the smallest power of two >= 1024*dsize bytes, capped at
+ * PSIZE, but never smaller than one item, so items larger than PSIZE still
+ * fit in their pool.
+ *
+ * Returns the new allocator, or NULL if dsize is too large to round up or
+ * memory cannot be allocated. Release it with pool_destroy().
+ */
+pool_alloc_t *pool_create(size_t dsize) {
+    const size_t align = sizeof(void *);
+    const size_t max_psize = (size_t)(PSIZE);
+    pool_alloc_t *p;
+
+    /* Rounding up must not wrap around to a tiny size */
+    if (dsize > SIZE_MAX - (align - 1))
+        return NULL;
+
+    if (NULL == (p = (pool_alloc_t *)malloc(sizeof(*p))))
+        return NULL;
+
+    /* Minimum size is a pointer, for free list */
+    dsize = (dsize + align - 1) & ~(align - 1);
+    if (dsize < align)
+        dsize = align;
+    p->dsize = dsize;
+
+    /*
+     * 1024*dsize is only passed to next_power_2() when it is known to be
+     * below PSIZE, so it can neither be truncated to unsigned int nor come
+     * back as a negative int.
+     */
+    if (dsize >= max_psize / 1024)
+        p->psize = max_psize;
+    else
+        p->psize = (size_t)next_power_2((unsigned int)(dsize * 1024));
+
+    /* A pool must be able to hold at least one item */
+    if (p->psize < dsize)
+        p->psize = dsize;
+
+    p->npools = 0;
+    p->pools  = NULL;
+    p->free   = NULL;
+
+    return p;
+}
+
+/*
+ * Appends one more pool to the allocator: grows the pools array by one entry
+ * and allocates storage for as many whole items as fit in psize bytes.
+ * Returns the new, still empty, pool entry, or NULL if either allocation
+ * fails. npools is only incremented on success.
+ */
+static pool_t *new_pool(pool_alloc_t *p) {
+    size_t nitems = p->psize / p->dsize;
+    pool_t *grown, *slot;
+
+    grown = realloc(p->pools, (p->npools + 1) * sizeof(*p->pools));
+    if (grown == NULL)
+        return NULL;
+    p->pools = grown;
+
+    slot = &p->pools[p->npools];
+    slot->pool = malloc(nitems * p->dsize);
+    if (slot->pool == NULL)
+        return NULL;
+    slot->used = 0;
+
+    p->npools += 1;
+    return slot;
+}
+
+/*
+ * Releases every pool owned by the allocator, then the allocator itself.
+ * All items obtained from it become invalid. Passing NULL is a no-op, in
+ * the same way as free(NULL).
+ */
+void pool_destroy(pool_alloc_t *p) {
+    size_t i;
+
+    if (NULL == p)
+        return;
+
+    for (i = 0; i < p->npools; i++) {
+        free(p->pools[i].pool);
+    }
+    free(p->pools);
+    free(p);
+}
+
+/*
+ * Hands out one item of p->dsize bytes.
+ * Freed items are reused first; otherwise the item is carved from the
+ * newest pool, and a further pool is created when that one has no room.
+ * Returns NULL if a new pool could not be allocated.
+ */
+void *pool_alloc(pool_alloc_t *p) {
+    pool_t *pool;
+
+    /* Reuse the most recently freed item, if any */
+    if (p->free != NULL) {
+        void *item = p->free;
+        p->free = *((void **)item);
+        return item;
+    }
+
+    /* Otherwise take the next slot of the newest pool */
+    if (p->npools > 0) {
+        pool = &p->pools[p->npools - 1];
+        if (pool->used + p->dsize < p->psize) {
+            void *item = ((char *) pool->pool) + pool->used;
+            pool->used += p->dsize;
+            return item;
+        }
+    }
+
+    /* No pool yet, or no room left in the newest one */
+    pool = new_pool(p);
+    if (pool == NULL)
+        return NULL;
+
+    pool->used = p->dsize;
+    return pool->pool;
+}
+
+/*
+ * Returns an item to the allocator by pushing it on the head of the free
+ * list. The first pointer-sized bytes of the item are overwritten with the
+ * link to the previous head.
+ */
+void pool_free(pool_alloc_t *p, void *ptr) {
+    void **link = (void **)ptr;
+
+    *link = p->free;
+    p->free = ptr;
+}
+
+#ifdef TEST_MAIN
+typedef struct {
+    int x, y, z;
+} xyz;
+
+#define NP 10000
+/*
+ * Self-test driver, built only when TEST_MAIN is defined.
+ * Allocates NP items, frees those whose index is not a multiple of 3,
+ * allocates NP more (reusing the freed ones first), then prints the contents
+ * seen through the original pointers so the reuse is visible.
+ * Returns 0 on success and 1 if any allocation fails.
+ */
+int main(void) {
+    int i, ret = 1;
+    xyz *item;
+    xyz **items = NULL;
+    pool_alloc_t *p = pool_create(sizeof(xyz));
+
+    if (NULL == p)
+        return 1;
+
+    items = (xyz **)malloc(NP * sizeof(*items));
+    if (NULL == items)
+        goto cleanup;
+
+    for (i = 0; i < NP; i++) {
+        item = pool_alloc(p);
+        if (NULL == item)
+            goto cleanup;
+        item->x = i;
+        item->y = i+1;
+        item->z = i+2;
+        items[i] = item;
+    }
+
+    for (i = 0; i < NP; i++) {
+        item = items[i];
+        if (i % 3)
+            pool_free(p, item);
+    }
+
+    for (i = 0; i < NP; i++) {
+        item = pool_alloc(p);
+        if (NULL == item)
+            goto cleanup;
+        item->x = 1000000+i;
+        item->y = 1000000+i+1;
+        item->z = 1000000+i+2;
+    }
+
+    for (i = 0; i < NP; i++) {
+        item = items[i];
+        printf("%d\t%d\t%d\t%d\n", i, item->x, item->y, item->z);
+        pool_free(p, item);
+    }
+
+    ret = 0;
+
+ cleanup:
+    /* The pool owns every item, so destroying it releases them all */
+    free(items);
+    pool_destroy(p);
+
+    return ret;
+}
+#endif
diff --git a/htslib/cram/pooled_alloc.h b/htslib/cram/pooled_alloc.h
new file mode 100644
index 0000000..e19e320
--- /dev/null
+++ b/htslib/cram/pooled_alloc.h
@@ -0,0 +1,64 @@
+/*
+Copyright (c) 2009 Genome Research Ltd.
+Author: Rob Davies <rmd at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _POOLED_ALLOC_H_
+#define _POOLED_ALLOC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Implements a pooled block allocator where all items are the same size,
+ * but we need many of them.
+ */
+/* One contiguous chunk of item storage owned by a pool_alloc_t. */
+typedef struct {
+ void *pool; /* Start of the chunk's storage */
+ size_t used; /* Bytes of the chunk already handed out */
+} pool_t;
+
+/* Allocator state shared by all pools that hold items of one size. */
+typedef struct {
+ size_t dsize; /* Size in bytes of each item */
+ size_t psize; /* Nominal size in bytes of each pool */
+ size_t npools; /* Number of entries in pools[] */
+ pool_t *pools; /* Array of pools; new items are carved from the last one */
+ void *free; /* Head of the list of freed items, or NULL when empty */
+} pool_alloc_t;
+
+/* Creates an allocator for items of dsize bytes; returns NULL on failure. */
+pool_alloc_t *pool_create(size_t dsize);
+/* Frees every pool and the allocator itself. */
+void pool_destroy(pool_alloc_t *p);
+/* Returns one item, or NULL if memory could not be obtained. */
+void *pool_alloc(pool_alloc_t *p);
+/* Puts an item obtained from pool_alloc() on the free list for reuse. */
+void pool_free(pool_alloc_t *p, void *ptr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_POOLED_ALLOC_H_*/
diff --git a/htslib/cram/rANS_byte.h b/htslib/cram/rANS_byte.h
new file mode 100644
index 0000000..c61ed9d
--- /dev/null
+++ b/htslib/cram/rANS_byte.h
@@ -0,0 +1,336 @@
+/* rans_byte.h originally from https://github.com/rygorous/ryg_rans
+ *
+ * This is a public-domain implementation of several rANS variants. rANS is an
+ * entropy coder from the ANS family, as described in Jarek Duda's paper
+ * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540).
+ */
+
+/*-------------------------------------------------------------------------- */
+
+// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014
+//
+// Not intended to be "industrial strength"; just meant to illustrate the general
+// idea.
+
+#ifndef RANS_BYTE_HEADER
+#define RANS_BYTE_HEADER
+
+#include <stdint.h>
+
+// RansAssert forwards to assert() only when the assert macro is already
+// defined at this point (i.e. <assert.h> was included before this header);
+// otherwise it expands to nothing.
+#ifdef assert
+#define RansAssert assert
+#else
+#define RansAssert(x)
+#endif
+
+// READ ME FIRST:
+//
+// This is designed like a typical arithmetic coder API, but there's three
+// twists you absolutely should be aware of before you start hacking:
+//
+// 1. You need to encode data in *reverse* - last symbol first. rANS works
+// like a stack: last in, first out.
+// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give
+// it a pointer to the *end* of your buffer (exclusive), and it will
+// slowly move towards the beginning as more bytes are emitted.
+// 3. Unlike basically any other entropy coder implementation you might
+// have used, you can interleave data from multiple independent rANS
+// encoders into the same bytestream without any extra signaling;
+// you can also just write some bytes by yourself in the middle if
+// you want to. This is in addition to the usual arithmetic encoder
+// property of being able to switch models on the fly. Writing raw
+// bytes can be useful when you have some data that you know is
+// incompressible, and is cheaper than going through the rANS encode
+// function. Using multiple rANS coders on the same byte stream wastes
+// a few bytes compared to using just one, but execution of two
+// independent encoders can happen in parallel on superscalar and
+// Out-of-Order CPUs, so this can be *much* faster in tight decoding
+// loops.
+//
+// This is why all the rANS functions take the write pointer as an
+// argument instead of just storing it in some context struct.
+
+// --------------------------------------------------------------------------
+
+// L ('l' in the paper) is the lower bound of our normalization interval.
+// Between this and our byte-aligned emission, we use 31 (not 32!) bits.
+// This is done intentionally because exact reciprocals for 31-bit uints
+// fit in 32-bit uints: this permits some optimizations during encoding.
+#define RANS_BYTE_L (1u << 23) // lower bound of our normalization interval
+
+// State for a rANS encoder. Yep, that's all there is to it.
+typedef uint32_t RansState;
+
+// Puts a rANS encoder into its starting state, which is the lower bound of
+// the normalization interval.
+static inline void RansEncInit(RansState* r)
+{
+    r[0] = RANS_BYTE_L;
+}
+
+// Internal helper: renormalizes encoder state x ahead of encoding a symbol of
+// frequency "freq". While x is at or above the symbol's threshold, its low
+// byte is stored just below *pptr (which moves down) and x is shifted right
+// by 8. Returns the renormalized state.
+static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits)
+{
+    const uint32_t limit = ((RANS_BYTE_L >> scale_bits) << 8) * freq; // this turns into a shift.
+    uint8_t* out = *pptr;
+
+    while (x >= limit) {
+        out[-1] = (uint8_t) (x & 0xff);
+        out--;
+        x >>= 8;
+    }
+    *pptr = out;
+
+    return x;
+}
+
+// Encodes one symbol whose cumulative range starts at "start" and whose
+// frequency is "freq". All frequencies are assumed to sum to
+// "1 << scale_bits". Bytes emitted while renormalizing are written below
+// *pptr, which is updated.
+//
+// NOTE: rANS works like a stack, so symbols must be fed in *reverse order*
+// (last symbol first). The output bytestream is written *backwards* too:
+// ptr starts at the end of the output buffer and keeps decrementing.
+static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+    // renormalize
+    const RansState x = RansEncRenorm(*r, pptr, freq, scale_bits);
+    const uint32_t quot = x / freq;
+    const uint32_t rem = x % freq;
+
+    // x = C(s,x)
+    *r = (quot << scale_bits) + rem + start;
+}
+
+// Flushes the rANS encoder: stores the 32-bit state in the four bytes just
+// below *pptr, least significant byte at the lowest address, and moves *pptr
+// down to the first of them.
+static inline void RansEncFlush(RansState* r, uint8_t** pptr)
+{
+    const uint32_t state = *r;
+    uint8_t* out = *pptr - 4;
+    int k;
+
+    for (k = 0; k < 4; k++)
+        out[k] = (uint8_t) (state >> (8 * k));
+
+    *pptr = out;
+}
+
+// Initializes a rANS decoder.
+// Unlike the encoder, the decoder works forwards as you'd expect.
+//
+// Reads the initial 32-bit state from the four bytes at *pptr (least
+// significant byte first, matching RansEncFlush) and advances *pptr past
+// them. The caller must guarantee that four bytes are readable.
+static inline void RansDecInit(RansState* r, uint8_t** pptr)
+{
+    const uint8_t* ptr = *pptr;
+    uint32_t x;
+
+    // Widen each byte to uint32_t before shifting: a plain uint8_t promotes
+    // to signed int, and shifting a value >= 0x80 left by 24 would overflow it.
+    x  = (uint32_t) ptr[0] << 0;
+    x |= (uint32_t) ptr[1] << 8;
+    x |= (uint32_t) ptr[2] << 16;
+    x |= (uint32_t) ptr[3] << 24;
+
+    *pptr = *pptr + 4;
+    *r = x;
+}
+
+// Returns the cumulative frequency encoded in the low "scale_bits" bits of
+// the decoder state; mapping it to a symbol is left to the caller.
+static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits)
+{
+    const uint32_t low_mask = (1u << scale_bits) - 1;
+
+    return r[0] & low_mask;
+}
+
+// Advances in the bit stream by "popping" a single symbol with range start
+// "start" and frequency "freq". All frequencies are assumed to sum to
+// "1 << scale_bits". Bytes needed to renormalize the state are read from
+// *pptr, which is advanced past them.
+static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+    const uint32_t low_mask = (1u << scale_bits) - 1;
+    uint32_t x = *r;
+    uint8_t* in = *pptr;
+
+    // s, x = D(x)
+    x = freq * (x >> scale_bits) + (x & low_mask) - start;
+
+    // renormalize
+    while (x < RANS_BYTE_L)
+        x = (x << 8) | *in++;
+    *pptr = in;
+
+    *r = x;
+}
+
+// --------------------------------------------------------------------------
+
+// That's all you need for a full encoder; below here are some utility
+// functions with extra convenience or optimizations.
+
+// Encoder symbol description
+// This (admittedly odd) selection of parameters was chosen to make
+// RansEncPutSymbol as cheap as possible.
+// The fields are filled in by RansEncSymbolInit.
+typedef struct {
+ uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval
+ uint32_t rcp_freq; // Fixed-point reciprocal frequency
+ uint32_t bias; // Bias
+ uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq
+ uint16_t rcp_shift; // Reciprocal shift (stored with 32 already added)
+} RansEncSymbol;
+
+// Decoder symbols are straightforward.
+// Both fields are 16 bits wide and are filled in by RansDecSymbolInit.
+typedef struct {
+ uint16_t start; // Start of range.
+ uint16_t freq; // Symbol frequency.
+} RansDecSymbol;
+
+// Initializes an encoder symbol to start "start" and frequency "freq"
+//
+// Precomputes x_max, cmpl_freq, rcp_freq, rcp_shift and bias so that
+// RansEncPutSymbol can replace the division by freq with a multiply and a
+// shift. The asserts (active only when RansAssert maps to assert) require
+// scale_bits <= 16 and start + freq <= 1 << scale_bits.
+static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+ RansAssert(scale_bits <= 16);
+ RansAssert(start <= (1u << scale_bits));
+ RansAssert(freq <= (1u << scale_bits) - start);
+
+ // Say M := 1 << scale_bits.
+ //
+ // The original encoder does:
+ // x_new = (x/freq)*M + start + (x%freq)
+ //
+ // The fast encoder does (schematically):
+ // q = mul_hi(x, rcp_freq) >> rcp_shift (division)
+ // r = x - q*freq (remainder)
+ // x_new = q*M + bias + r (new x)
+ // plugging in r into x_new yields:
+ // x_new = bias + x + q*(M - freq)
+ // =: bias + x + q*cmpl_freq (*)
+ //
+ // and we can just precompute cmpl_freq. Now we just need to
+ // set up our parameters such that the original encoder and
+ // the fast encoder agree.
+
+ // Same renormalization threshold as computed in RansEncRenorm.
+ s->x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq;
+ s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq);
+ if (freq < 2) {
+ // freq=0 symbols are never valid to encode, so it doesn't matter what
+ // we set our values to.
+ //
+ // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately,
+ // our fixed-point reciprocal approximation can only multiply by values
+ // smaller than 1.
+ //
+ // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0.
+ // This gives:
+ // q = mul_hi(x, rcp_freq) >> rcp_shift
+ // = mul_hi(x, (1<<32) - 1)) >> 0
+ // = floor(x - x/(2^32))
+ // = x - 1 if 1 <= x < 2^32
+ // and we know that x>0 (x=0 is never in a valid normalization interval).
+ //
+ // So we now need to choose the other parameters such that
+ // x_new = x*M + start
+ // plug it in:
+ // x*M + start (desired result)
+ // = bias + x + q*cmpl_freq (*)
+ // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq)
+ // = bias + 1 + (x - 1)*M
+ // = x*M + (bias + 1 - M)
+ //
+ // so we have start = bias + 1 - M, or equivalently
+ // bias = start + M - 1.
+ s->rcp_freq = ~0u;
+ s->rcp_shift = 0;
+ s->bias = start + (1 << scale_bits) - 1;
+ } else {
+ // Alverson, "Integer Division using reciprocals"
+ // shift=ceil(log2(freq))
+ uint32_t shift = 0;
+ while (freq > (1u << shift))
+ shift++;
+
+ // 64-bit numerator: 1 << (shift + 31) does not fit in 32 bits.
+ s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq);
+ s->rcp_shift = shift - 1;
+
+ // With these values, 'q' is the correct quotient, so we
+ // have bias=start.
+ s->bias = start;
+ }
+
+ s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol
+}
+
+// Fills in a decoder symbol with range start "start" and frequency "freq".
+// Both values are stored truncated to 16 bits.
+static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq)
+{
+    RansAssert(start <= (1 << 16));
+    RansAssert(freq <= (1 << 16) - start);
+
+    s->freq  = (uint16_t) freq;
+    s->start = (uint16_t) start;
+}
+
+// Encodes a symbol described by a precomputed RansEncSymbol. Faster than
+// RansEncPut because the division by the frequency becomes a multiply and
+// a shift.
+//
+// See RansEncSymbolInit for a description of how this works.
+static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym)
+{
+    uint32_t x = *r;
+    const uint32_t limit = sym->x_max;
+    uint8_t* out = *pptr;
+    uint32_t quot;
+
+    RansAssert(sym->x_max != 0); // can't encode symbol with freq=0
+
+    // renormalize: spill low bytes downwards until x is below the limit
+    while (x >= limit) {
+        *--out = (uint8_t) (x & 0xff);
+        x >>= 8;
+    }
+    *pptr = out;
+
+    // x = C(s,x)
+    // The 64-bit product is shifted by rcp_shift, which already contains
+    // the extra 32 added by RansEncSymbolInit, so a single shift yields
+    // the quotient.
+    quot = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift);
+    *r = x + sym->bias + quot * sym->cmpl_freq;
+}
+
+// Same as RansDecAdvance, with start and frequency taken from a decoder
+// symbol.
+static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits)
+{
+    const uint32_t start = sym->start;
+    const uint32_t freq = sym->freq;
+
+    RansDecAdvance(r, pptr, start, freq, scale_bits);
+}
+
+// Pops a single symbol with range start "start" and frequency "freq" from
+// the decoder state. All frequencies are assumed to sum to "1 << scale_bits".
+// Only the state update is performed: no renormalization, no input read.
+static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits)
+{
+    const uint32_t x = *r;
+    const uint32_t low_bits = x & ((1u << scale_bits) - 1);
+
+    // s, x = D(x)
+    *r = freq * (x >> scale_bits) + low_bits - start;
+}
+
+// Same as RansDecAdvanceStep, with start and frequency taken from a decoder
+// symbol.
+static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits)
+{
+    const uint32_t start = sym->start;
+    const uint32_t freq = sym->freq;
+
+    RansDecAdvanceStep(r, start, freq, scale_bits);
+}
+
+// Renormalizes the decoder state: while it is below RANS_BYTE_L, one more
+// byte is shifted in from *pptr, which is advanced past the bytes consumed.
+static inline void RansDecRenorm(RansState* r, uint8_t** pptr)
+{
+    uint32_t x = *r;
+    uint8_t* in = *pptr;
+
+    while (x < RANS_BYTE_L)
+        x = (x << 8) | *in++;
+
+    *pptr = in;
+    *r = x;
+}
+
+#endif // RANS_BYTE_HEADER
diff --git a/htslib/cram/rANS_static.c b/htslib/cram/rANS_static.c
new file mode 100644
index 0000000..00eda5a
--- /dev/null
+++ b/htslib/cram/rANS_static.c
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2014 Genome Research Ltd.
+ * Author(s): James Bonfield
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ * Institute nor the names of its contributors may be used to endorse
+ * or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Author: James Bonfield, Wellcome Trust Sanger Institute. 2014
+ */
+
+#include <config.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "cram/rANS_static.h"
+#include "cram/rANS_byte.h"
+
+#define TF_SHIFT 12
+#define TOTFREQ (1<<TF_SHIFT)
+
+#define ABS(a) ((a)>0?(a):-(a))
+/*
+ * Default block size in bytes (1 MiB) unless the build overrides it.
+ * Parenthesised so the macro expands correctly inside larger expressions.
+ */
+#ifndef BLK_SIZE
+#  define BLK_SIZE (1024*1024)
+#endif
+
+// Room to allow for expanded BLK_SIZE on worst case compression.
+#define BLK_SIZE2 ((int)(1.05*BLK_SIZE))
+
+/*-----------------------------------------------------------------------------
+ * Memory to memory compression functions.
+ *
+ * These are original versions without any manual loop unrolling. They
+ * are easier to understand, but can be up to 2x slower.
+ */
+
+/*
+ * Order-0 rANS compression of in[0..in_size-1].
+ *
+ * Output layout, as written below:
+ *   byte 0      order (0)
+ *   bytes 1-4   size of everything after the 9-byte header, little endian
+ *   bytes 5-8   in_size, little endian
+ *   then        the frequency table, terminated by a 0 byte
+ *   then        the rANS byte stream of four interleaved encoder states
+ *
+ * Returns a malloc()ed buffer and stores its used length in *out_size, or
+ * returns NULL if the buffer cannot be allocated.
+ *
+ * NOTE(review): in_size == 0 divides by zero when 'tr' is computed - confirm
+ * that callers never pass an empty input.
+ */
+unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size) {
+ unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9);
+ unsigned char *cp, *out_end;
+ RansEncSymbol syms[256];
+ RansState rans0, rans1, rans2, rans3;
+ uint8_t* ptr;
+ int F[256] = {0}, i, j, tab_size, rle, x, fsum = 0;
+ int m = 0, M = 0;
+ uint64_t tr;
+
+ if (!out_buf)
+ return NULL;
+
+ // The encoder writes backwards, so start at the end of the buffer.
+ ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
+
+ // Compute statistics
+ for (i = 0; i < in_size; i++) {
+ F[in[i]]++;
+ }
+ // Fixed-point scale factor that maps counts onto a total of TOTFREQ.
+ tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size;
+
+ // Normalise so T[i] == TOTFREQ
+ // M tracks the most frequent symbol; symbols that are present never
+ // drop below a frequency of 1.
+ for (m = M = j = 0; j < 256; j++) {
+ if (!F[j])
+ continue;
+
+ if (m < F[j])
+ m = F[j], M = j;
+
+ if ((F[j] = (F[j]*tr)>>31) == 0)
+ F[j] = 1;
+ fsum += F[j];
+ }
+
+ // The rounding error is absorbed by the most frequent symbol; because of
+ // the increment the frequencies end up summing to TOTFREQ-1.
+ fsum++;
+ if (fsum < TOTFREQ)
+ F[M] += TOTFREQ-fsum;
+ else
+ F[M] -= fsum-TOTFREQ;
+
+ //printf("F[%d]=%d\n", M, F[M]);
+ assert(F[M]>0);
+
+ // Encode statistics.
+ // The table starts after the 9-byte header, which is filled in last.
+ cp = out_buf+9;
+
+ for (x = rle = j = 0; j < 256; j++) {
+ if (F[j]) {
+ // j
+ // Symbol number; runs of consecutive symbols are stored as a count
+ // after the second symbol of the run.
+ if (rle) {
+ rle--;
+ } else {
+ *cp++ = j;
+ if (!rle && j && F[j-1]) {
+ for(rle=j+1; rle<256 && F[rle]; rle++)
+ ;
+ rle -= j+1;
+ *cp++ = rle;
+ }
+ //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
+ }
+
+ // F[j]
+ // One byte for values below 128, otherwise two bytes with the top
+ // bit of the first one set.
+ if (F[j]<128) {
+ *cp++ = F[j];
+ } else {
+ *cp++ = 128 | (F[j]>>8);
+ *cp++ = F[j]&0xff;
+ }
+ RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
+ x += F[j];
+ }
+ }
+ *cp++ = 0;
+
+ //write(1, out_buf+4, cp-(out_buf+4));
+ // Header plus frequency table, in bytes.
+ tab_size = cp-out_buf;
+
+ RansEncInit(&rans0);
+ RansEncInit(&rans1);
+ RansEncInit(&rans2);
+ RansEncInit(&rans3);
+
+ // Encode the 1-3 trailing bytes first; the cases fall through on purpose.
+ switch (i=(in_size&3)) {
+ case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
+ case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
+ case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
+ case 0:
+ break;
+ }
+ // Then the rest, four bytes per iteration, walking backwards.
+ for (i=(in_size &~3); i>0; i-=4) {
+ RansEncSymbol *s3 = &syms[in[i-1]];
+ RansEncSymbol *s2 = &syms[in[i-2]];
+ RansEncSymbol *s1 = &syms[in[i-3]];
+ RansEncSymbol *s0 = &syms[in[i-4]];
+
+ RansEncPutSymbol(&rans3, &ptr, s3);
+ RansEncPutSymbol(&rans2, &ptr, s2);
+ RansEncPutSymbol(&rans1, &ptr, s1);
+ RansEncPutSymbol(&rans0, &ptr, s0);
+ }
+
+ // Flushed in reverse so that state 0 comes first in the stream.
+ RansEncFlush(&rans3, &ptr);
+ RansEncFlush(&rans2, &ptr);
+ RansEncFlush(&rans1, &ptr);
+ RansEncFlush(&rans0, &ptr);
+
+ // Finalise block size and return it
+ *out_size = (out_end - ptr) + tab_size;
+
+ cp = out_buf;
+
+ *cp++ = 0; // order
+ *cp++ = ((*out_size-9)>> 0) & 0xff;
+ *cp++ = ((*out_size-9)>> 8) & 0xff;
+ *cp++ = ((*out_size-9)>>16) & 0xff;
+ *cp++ = ((*out_size-9)>>24) & 0xff;
+
+ *cp++ = (in_size>> 0) & 0xff;
+ *cp++ = (in_size>> 8) & 0xff;
+ *cp++ = (in_size>>16) & 0xff;
+ *cp++ = (in_size>>24) & 0xff;
+
+ // Close the gap between the table and the encoded bytes at the buffer end.
+ memmove(out_buf + tab_size, ptr, out_end-ptr);
+
+ return out_buf;
+}
+
+/* Decoding tables for one frequency distribution. */
+typedef struct {
+ struct {
+ int F; /* Frequency of the symbol */
+ int C; /* Cumulative frequency at which the symbol's range starts */
+ } fc[256];
+ unsigned char *R; /* Lookup from cumulative-frequency slot to symbol; TOTFREQ bytes, allocated by the decoders */
+} ari_decoder;
+
+/*
+ * Decodes a buffer produced by rans_compress_O0.
+ *
+ * Checks the order byte and that the stored compressed size equals
+ * in_size-9, reads the frequency table, then decodes four interleaved rANS
+ * states followed by up to three trailing bytes.
+ *
+ * Returns a malloc()ed buffer and stores its length in *out_size, or returns
+ * NULL if the header check or the output allocation fails.
+ *
+ * NOTE(review): the table and stream are read without bounds checks against
+ * in_size, and the malloc() of D.R is not checked - confirm that input is
+ * trusted or add validation.
+ */
+unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size) {
+ /* Load in the static tables */
+ unsigned char *cp = in + 9;
+ int i, j, x, out_sz, in_sz, rle;
+ char *out_buf;
+ ari_decoder D;
+ RansDecSymbol syms[256];
+
+ memset(&D, 0, sizeof(D));
+
+ if (*in++ != 0) // Order-0 check
+ return NULL;
+
+ // 'in' now points just past the order byte: two little-endian 32-bit sizes.
+ in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24);
+ out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24);
+ if (in_sz != in_size-9)
+ return NULL;
+
+ out_buf = malloc(out_sz);
+ if (!out_buf)
+ return NULL;
+
+ //fprintf(stderr, "out_sz=%d\n", out_sz);
+
+ // Precompute reverse lookup of frequency.
+ rle = x = 0;
+ j = *cp++;
+ do {
+ // Frequencies of 128 and above occupy two bytes, top bit set on the first.
+ if ((D.fc[j].F = *cp++) >= 128) {
+ D.fc[j].F &= ~128;
+ D.fc[j].F = ((D.fc[j].F & 127) << 8) | *cp++;
+ }
+ D.fc[j].C = x;
+
+ RansDecSymbolInit(&syms[j], D.fc[j].C, D.fc[j].F);
+
+ /* Build reverse lookup table */
+ if (!D.R) D.R = (unsigned char *)malloc(TOTFREQ);
+ memset(&D.R[x], j, D.fc[j].F);
+
+ x += D.fc[j].F;
+
+ // Next symbol: a run start (symbol followed by a count), the next
+ // symbol of a run in progress, or an explicit symbol byte.
+ if (!rle && j+1 == *cp) {
+ j = *cp++;
+ rle = *cp++;
+ } else if (rle) {
+ rle--;
+ j++;
+ } else {
+ j = *cp++;
+ }
+ } while(j);
+
+ assert(x < TOTFREQ);
+
+ // The four initial states follow the table.
+ RansState rans0, rans1, rans2, rans3;
+ uint8_t *ptr = cp;
+ RansDecInit(&rans0, &ptr);
+ RansDecInit(&rans1, &ptr);
+ RansDecInit(&rans2, &ptr);
+ RansDecInit(&rans3, &ptr);
+
+ // Largest multiple of four not exceeding out_sz.
+ int out_end = (out_sz&~3);
+
+ RansState R[4];
+ R[0] = rans0;
+ R[1] = rans1;
+ R[2] = rans2;
+ R[3] = rans3;
+ uint32_t mask = (1u << TF_SHIFT)-1;
+
+ for (i=0; i < out_end; i+=4) {
+ uint32_t m[4] = {R[0] & mask,
+ R[1] & mask,
+ R[2] & mask,
+ R[3] & mask};
+ uint8_t c[4] = {D.R[m[0]],
+ D.R[m[1]],
+ D.R[m[2]],
+ D.R[m[3]]};
+ out_buf[i+0] = c[0];
+ out_buf[i+1] = c[1];
+ out_buf[i+2] = c[2];
+ out_buf[i+3] = c[3];
+
+ // Inlined form of the RansDecAdvanceSymbolStep calls kept below.
+ // RansDecAdvanceSymbolStep(&R[0], &syms[c[0]], TF_SHIFT);
+ // RansDecAdvanceSymbolStep(&R[1], &syms[c[1]], TF_SHIFT);
+ // RansDecAdvanceSymbolStep(&R[2], &syms[c[2]], TF_SHIFT);
+ // RansDecAdvanceSymbolStep(&R[3], &syms[c[3]], TF_SHIFT);
+ R[0] = syms[c[0]].freq * (R[0]>>TF_SHIFT);
+ R[1] = syms[c[1]].freq * (R[1]>>TF_SHIFT);
+ R[2] = syms[c[2]].freq * (R[2]>>TF_SHIFT);
+ R[3] = syms[c[3]].freq * (R[3]>>TF_SHIFT);
+
+ R[0] += m[0] - syms[c[0]].start;
+ R[1] += m[1] - syms[c[1]].start;
+ R[2] += m[2] - syms[c[2]].start;
+ R[3] += m[3] - syms[c[3]].start;
+
+ RansDecRenorm(&R[0], &ptr);
+ RansDecRenorm(&R[1], &ptr);
+ RansDecRenorm(&R[2], &ptr);
+ RansDecRenorm(&R[3], &ptr);
+ }
+
+ rans0 = R[0];
+ rans1 = R[1];
+ rans2 = R[2];
+ rans3 = R[3];
+
+ // Trailing 1-3 bytes, one from each of the first states in turn.
+ switch(out_sz&3) {
+ unsigned char c;
+ case 0:
+ break;
+ case 1:
+ c = D.R[RansDecGet(&rans0, TF_SHIFT)];
+ RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT);
+ out_buf[out_end] = c;
+ break;
+
+ case 2:
+ c = D.R[RansDecGet(&rans0, TF_SHIFT)];
+ RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT);
+ out_buf[out_end] = c;
+
+ c = D.R[RansDecGet(&rans1, TF_SHIFT)];
+ RansDecAdvanceSymbol(&rans1, &ptr, &syms[c], TF_SHIFT);
+ out_buf[out_end+1] = c;
+ break;
+
+ case 3:
+ c = D.R[RansDecGet(&rans0, TF_SHIFT)];
+ RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT);
+ out_buf[out_end] = c;
+
+ c = D.R[RansDecGet(&rans1, TF_SHIFT)];
+ RansDecAdvanceSymbol(&rans1, &ptr, &syms[c], TF_SHIFT);
+ out_buf[out_end+1] = c;
+
+ c = D.R[RansDecGet(&rans2, TF_SHIFT)];
+ RansDecAdvanceSymbol(&rans2, &ptr, &syms[c], TF_SHIFT);
+ out_buf[out_end+2] = c;
+ break;
+ }
+
+ *out_size = out_sz;
+
+ if (D.R) free(D.R);
+
+ return (unsigned char *)out_buf;
+}
+
+/*
+ * Order-1 rANS compression of in[0..in_size-1]: each byte is coded with a
+ * frequency table selected by the byte that precedes it.
+ *
+ * Inputs shorter than 4 bytes are passed to rans_compress_O0 instead.
+ * The output starts with the same 9-byte header as rans_compress_O0, with
+ * an order byte of 1, followed by the per-context frequency tables and the
+ * rANS byte stream of four encoder states, each covering one quarter of
+ * the input.
+ *
+ * Returns a malloc()ed buffer and stores its used length in *out_size, or
+ * returns NULL if an allocation fails.
+ */
+unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size) {
+ unsigned char *out_buf = NULL, *out_end, *cp;
+ unsigned int last_i, tab_size, rle_i, rle_j;
+ RansEncSymbol (*syms)[256] = NULL; /* syms[256][256] */
+ int (*F)[256] = NULL; /* F[256][256] */
+ int *T = NULL; /* T[256] */
+ int i, j;
+ unsigned char c;
+
+ if (in_size < 4)
+ return rans_compress_O0(in, in_size, out_size);
+
+ // The tables are heap allocated; every exit goes through 'cleanup'.
+ syms = malloc(256 * sizeof(*syms));
+ if (!syms) goto cleanup;
+ F = calloc(256, sizeof(*F));
+ if (!F) goto cleanup;
+ T = calloc(256, sizeof(*T));
+ if (!T) goto cleanup;
+ out_buf = malloc(1.05*in_size + 257*257*3 + 9);
+ if (!out_buf) goto cleanup;
+
+ out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
+ cp = out_buf+9;
+
+ //for (last = 0, i=in_size-1; i>=0; i--) {
+ // F[last][c = in[i]]++;
+ // T[last]++;
+ // last = c;
+ //}
+
+ // F[prev][cur] counts byte pairs, T[prev] is the total for each context.
+ for (last_i=i=0; i<in_size; i++) {
+ F[last_i][c = in[i]]++;
+ T[last_i]++;
+ last_i = c;
+ }
+ // The bytes at the quarter boundaries are also counted in context 0.
+ F[0][in[1*(in_size>>2)]]++;
+ F[0][in[2*(in_size>>2)]]++;
+ F[0][in[3*(in_size>>2)]]++;
+ T[0]+=3;
+
+ // Normalise so T[i] == TOTFREQ
+ for (rle_i = i = 0; i < 256; i++) {
+ int t2, m, M;
+ unsigned int x;
+
+ if (T[i] == 0)
+ continue;
+
+ //uint64_t p = (TOTFREQ * TOTFREQ) / t;
+ double p = ((double)TOTFREQ)/T[i];
+ for (t2 = m = M = j = 0; j < 256; j++) {
+ if (!F[i][j])
+ continue;
+
+ if (m < F[i][j])
+ m = F[i][j], M = j;
+
+ //if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0)
+ if ((F[i][j] *= p) == 0)
+ F[i][j] = 1;
+ t2 += F[i][j];
+ }
+
+ // The rounding error is absorbed by the context's most frequent symbol.
+ t2++;
+ if (t2 < TOTFREQ)
+ F[i][M] += TOTFREQ-t2;
+ else
+ F[i][M] -= t2-TOTFREQ;
+
+ // Store frequency table
+ // i
+ if (rle_i) {
+ rle_i--;
+ } else {
+ *cp++ = i;
+ // FIXME: could use order-0 statistics to observe which alphabet
+ // symbols are present and base RLE on that ordering instead.
+ if (i && T[i-1]) {
+ for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++)
+ ;
+ rle_i -= i+1;
+ *cp++ = rle_i;
+ }
+ }
+
+ int *F_i_ = F[i];
+ x = 0;
+ rle_j = 0;
+ for (j = 0; j < 256; j++) {
+ if (F_i_[j]) {
+ //fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x);
+
+ // j
+ if (rle_j) {
+ rle_j--;
+ } else {
+ *cp++ = j;
+ if (!rle_j && j && F_i_[j-1]) {
+ for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++)
+ ;
+ rle_j -= j+1;
+ *cp++ = rle_j;
+ }
+ }
+
+ // F_i_[j]
+ // One byte for values below 128, otherwise two bytes with the
+ // top bit of the first one set.
+ if (F_i_[j]<128) {
+ *cp++ = F_i_[j];
+ } else {
+ *cp++ = 128 | (F_i_[j]>>8);
+ *cp++ = F_i_[j]&0xff;
+ }
+
+ RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT);
+ x += F_i_[j];
+ }
+ }
+ // End of this context's table.
+ *cp++ = 0;
+ }
+ // End of all tables.
+ *cp++ = 0;
+
+ //write(1, out_buf+4, cp-(out_buf+4));
+ tab_size = cp - out_buf;
+ assert(tab_size < 257*257*3);
+
+ RansState rans0, rans1, rans2, rans3;
+ RansEncInit(&rans0);
+ RansEncInit(&rans1);
+ RansEncInit(&rans2);
+ RansEncInit(&rans3);
+
+ // The encoder writes backwards from the end of the buffer.
+ uint8_t* ptr = out_end;
+
+ // i0..i3 start at the second-to-last byte of each quarter; l0..l3 hold
+ // the byte that follows, i.e. the one to be encoded next.
+ int isz4 = in_size>>2;
+ int i0 = 1*isz4-2;
+ int i1 = 2*isz4-2;
+ int i2 = 3*isz4-2;
+ int i3 = 4*isz4-2;
+
+ unsigned char l0 = in[i0+1];
+ unsigned char l1 = in[i1+1];
+ unsigned char l2 = in[i2+1];
+ unsigned char l3 = in[i3+1];
+
+ // Deal with the remainder
+ // Bytes beyond the last full quarter all go to the fourth state.
+ l3 = in[in_size-1];
+ for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
+ unsigned char c3 = in[i3];
+ RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
+ l3 = c3;
+ }
+
+ // Encode l* in the context of the byte before it, then step back.
+ for (; i0 >= 0; i0--, i1--, i2--, i3--) {
+ unsigned char c0, c1, c2, c3;
+ RansEncSymbol *s3 = &syms[c3 = in[i3]][l3];
+ RansEncSymbol *s2 = &syms[c2 = in[i2]][l2];
+ RansEncSymbol *s1 = &syms[c1 = in[i1]][l1];
+ RansEncSymbol *s0 = &syms[c0 = in[i0]][l0];
+
+ RansEncPutSymbol(&rans3, &ptr, s3);
+ RansEncPutSymbol(&rans2, &ptr, s2);
+ RansEncPutSymbol(&rans1, &ptr, s1);
+ RansEncPutSymbol(&rans0, &ptr, s0);
+
+ l0 = c0;
+ l1 = c1;
+ l2 = c2;
+ l3 = c3;
+ }
+
+ // The first byte of each quarter is encoded in context 0.
+ RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
+ RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
+ RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
+ RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
+
+ RansEncFlush(&rans3, &ptr);
+ RansEncFlush(&rans2, &ptr);
+ RansEncFlush(&rans1, &ptr);
+ RansEncFlush(&rans0, &ptr);
+
+ *out_size = (out_end - ptr) + tab_size;
+
+ cp = out_buf;
+ *cp++ = 1; // order
+
+ *cp++ = ((*out_size-9)>> 0) & 0xff;
+ *cp++ = ((*out_size-9)>> 8) & 0xff;
+ *cp++ = ((*out_size-9)>>16) & 0xff;
+ *cp++ = ((*out_size-9)>>24) & 0xff;
+
+ *cp++ = (in_size>> 0) & 0xff;
+ *cp++ = (in_size>> 8) & 0xff;
+ *cp++ = (in_size>>16) & 0xff;
+ *cp++ = (in_size>>24) & 0xff;
+
+ // Close the gap between the tables and the encoded bytes at the buffer end.
+ memmove(out_buf + tab_size, ptr, out_end-ptr);
+
+ // Reached on success and on allocation failure (out_buf is then NULL).
+ cleanup:
+ free(syms);
+ free(F);
+ free(T);
+
+ return out_buf;
+}
+
+/*
+ * Decodes one order-1 rANS block.
+ *
+ * Layout as read here: byte 0 is the order marker (must be 1), bytes 1-4
+ * hold the little-endian compressed payload size (must equal in_size-9),
+ * bytes 5-8 the little-endian uncompressed size.  The run-length encoded
+ * per-context frequency tables follow, then the four interleaved rANS
+ * streams.
+ *
+ * Returns a malloc()ed buffer of *out_size bytes on success (caller frees);
+ *         NULL on failure, in which case *out_size is not written.
+ *
+ * NOTE(review): reads through 'cp' and 'ptr' are still not bounded by
+ * in_size, and a context that never received a frequency table keeps
+ * D[ctx].R == NULL, so hostile input can still trigger out-of-range reads.
+ * Only the R[] write overflow is closed off below.
+ */
+unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size,
+                                  unsigned int *out_size) {
+    /* Load in the static tables */
+    unsigned char *cp = in + 9;
+    int i, j = -999, x, out_sz, in_sz, rle_i, rle_j;
+    char *out_buf = NULL;
+    ari_decoder *D = NULL;              /* D[256] */
+    RansDecSymbol (*syms)[256] = NULL;  /* syms[256][256] */
+
+    if (*in++ != 1) // Order-1 check
+        return NULL;
+
+    in_sz  = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24);
+    out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24);
+    if (in_sz != in_size-9)
+        return NULL;
+
+    D = calloc(256, sizeof(*D));
+    if (!D) goto cleanup;
+    syms = malloc(256 * sizeof(*syms));
+    if (!syms) goto cleanup;
+
+    /* Outer loop walks the contexts (previous symbol, i), inner loop the
+     * symbols (j) seen in that context.  Both lists are zero-terminated
+     * and use a run-length shortcut for consecutive values. */
+    rle_i = 0;
+    i = *cp++;
+    do {
+        rle_j = x = 0;
+        j = *cp++;
+        do {
+            /* Frequencies >= 128 are stored in two bytes, top bit set */
+            if ((D[i].fc[j].F = *cp++) >= 128) {
+                D[i].fc[j].F &= ~128;
+                D[i].fc[j].F = ((D[i].fc[j].F & 127) << 8) | *cp++;
+            }
+            D[i].fc[j].C = x;
+
+            /* A stored zero stands for the full range */
+            if (!D[i].fc[j].F)
+                D[i].fc[j].F = TOTFREQ;
+
+            /* Reject tables whose frequencies sum past TOTFREQ *before*
+             * touching R[]: the memset below would otherwise run off the
+             * end of the TOTFREQ-byte lookup table. */
+            if (x + D[i].fc[j].F > TOTFREQ)
+                goto cleanup;
+
+            RansDecSymbolInit(&syms[i][j], D[i].fc[j].C, D[i].fc[j].F);
+
+            /* Build reverse lookup table */
+            if (!D[i].R) {
+                D[i].R = (unsigned char *)malloc(TOTFREQ);
+                if (!D[i].R)
+                    goto cleanup;
+            }
+            memset(&D[i].R[x], j, D[i].fc[j].F);
+
+            x += D[i].fc[j].F;
+
+            if (!rle_j && j+1 == *cp) {
+                j = *cp++;
+                rle_j = *cp++;
+            } else if (rle_j) {
+                rle_j--;
+                j++;
+            } else {
+                j = *cp++;
+            }
+        } while(j);
+
+        if (!rle_i && i+1 == *cp) {
+            i = *cp++;
+            rle_i = *cp++;
+        } else if (rle_i) {
+            rle_i--;
+            i++;
+        } else {
+            i = *cp++;
+        }
+    } while (i);
+
+    RansState rans0, rans1, rans2, rans3;
+    uint8_t *ptr = cp;
+    RansDecInit(&rans0, &ptr);
+    RansDecInit(&rans1, &ptr);
+    RansDecInit(&rans2, &ptr);
+    RansDecInit(&rans3, &ptr);
+
+    /* The output is decoded as four equal quarters in lock-step, each with
+     * its own state and its own "previous symbol" context (starting at 0). */
+    int isz4 = out_sz>>2;
+    int l0 = 0;
+    int l1 = 0;
+    int l2 = 0;
+    int l3 = 0;
+    int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
+
+    RansState R[4];
+    R[0] = rans0;
+    R[1] = rans1;
+    R[2] = rans2;
+    R[3] = rans3;
+
+    /* Allocate output buffer */
+    out_buf = malloc(out_sz);
+    if (!out_buf) goto cleanup;
+
+    for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
+        uint32_t m[4] = {R[0] & ((1u << TF_SHIFT)-1),
+                         R[1] & ((1u << TF_SHIFT)-1),
+                         R[2] & ((1u << TF_SHIFT)-1),
+                         R[3] & ((1u << TF_SHIFT)-1)};
+
+        uint8_t c[4] = {D[l0].R[m[0]],
+                        D[l1].R[m[1]],
+                        D[l2].R[m[2]],
+                        D[l3].R[m[3]]};
+
+        out_buf[i4[0]] = c[0];
+        out_buf[i4[1]] = c[1];
+        out_buf[i4[2]] = c[2];
+        out_buf[i4[3]] = c[3];
+
+        /* Open-coded state advance: freq * (state >> shift) + slot - start */
+        R[0] = syms[l0][c[0]].freq * (R[0]>>TF_SHIFT);
+        R[1] = syms[l1][c[1]].freq * (R[1]>>TF_SHIFT);
+        R[2] = syms[l2][c[2]].freq * (R[2]>>TF_SHIFT);
+        R[3] = syms[l3][c[3]].freq * (R[3]>>TF_SHIFT);
+
+        R[0] += m[0] - syms[l0][c[0]].start;
+        R[1] += m[1] - syms[l1][c[1]].start;
+        R[2] += m[2] - syms[l2][c[2]].start;
+        R[3] += m[3] - syms[l3][c[3]].start;
+
+        RansDecRenorm(&R[0], &ptr);
+        RansDecRenorm(&R[1], &ptr);
+        RansDecRenorm(&R[2], &ptr);
+        RansDecRenorm(&R[3], &ptr);
+
+        l0 = c[0];
+        l1 = c[1];
+        l2 = c[2];
+        l3 = c[3];
+    }
+
+    rans0 = R[0];
+    rans1 = R[1];
+    rans2 = R[2];
+    rans3 = R[3];
+
+    // Remainder: the last (out_sz % 4) bytes all belong to stream 3
+    for (; i4[3] < out_sz; i4[3]++) {
+        unsigned char c3 = D[l3].R[RansDecGet(&rans3, TF_SHIFT)];
+        out_buf[i4[3]] = c3;
+        RansDecAdvanceSymbol(&rans3, &ptr, &syms[l3][c3], TF_SHIFT);
+        l3 = c3;
+    }
+
+    *out_size = out_sz;
+
+ cleanup:
+    if (D) {
+        for (i = 0; i < 256; i++)
+            if (D[i].R) free(D[i].R);
+        free(D);
+    }
+    free(syms);
+
+    return (unsigned char *)out_buf;
+}
+
+/*-----------------------------------------------------------------------------
+ * Simple interface to the order-0 vs order-1 encoders and decoders.
+ */
+/*
+ * Compresses in[0..in_size-1].  A zero 'order' selects the order-0 encoder,
+ * any other value the order-1 encoder.  The encoder's return value and
+ * *out_size are passed straight through.
+ */
+unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
+                             unsigned int *out_size, int order) {
+    if (order == 0)
+        return rans_compress_O0(in, in_size, out_size);
+
+    return rans_compress_O1(in, in_size, out_size);
+}
+
+/*
+ * Decompresses one block.  The first input byte picks the decoder: zero for
+ * order-0, anything else for order-1.
+ *
+ * Returns the decoder's result, or NULL when the input is shorter than the
+ * 9 byte header that both decoders read.
+ */
+unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
+                               unsigned int *out_size) {
+    /* Neither decoder may be entered with fewer than 9 readable bytes. */
+    if (in_size < 9)
+        return NULL;
+
+    if (in[0] == 0)
+        return rans_uncompress_O0(in, in_size, out_size);
+
+    return rans_uncompress_O1(in, in_size, out_size);
+}
+
+
+#ifdef TEST_MAIN
+/*-----------------------------------------------------------------------------
+ * Main.
+ *
+ * This is a simple command line tool for testing order-0 and order-1
+ * compression using the rANS codec. Simply compile with
+ *
+ * gcc -DTEST_MAIN -O3 -I. cram/rANS_static.c -o cram/rANS_static
+ *
+ * Usage: cram/rANS_static -o0 < file > file.o0
+ * cram/rANS_static -d < file.o0 > file2
+ *
+ * cram/rANS_static -o1 < file > file.o1
+ * cram/rANS_static -d < file.o1 > file2
+ */
+/*
+ * Test driver entry point.  Compresses by default (-o selects the order) or
+ * decompresses with -d, block by block, from the optional input file (else
+ * stdin) to the optional output file (else stdout).  Timing is reported on
+ * stderr as elapsed microseconds and bytes per microsecond.
+ *
+ * Returns 0 on completion, 1 if a named file cannot be opened.  Truncated
+ * or oversized input blocks exit(1); a failed codec call aborts.
+ */
+int main(int argc, char **argv) {
+    int opt, order = 0;
+    unsigned char in_buf[BLK_SIZE2+257*257*3];
+    int decode = 0;
+    FILE *infp = stdin, *outfp = stdout;
+    struct timeval tv1, tv2;
+    size_t bytes = 0;
+
+    extern char *optarg;
+    extern int optind;
+
+    while ((opt = getopt(argc, argv, "o:d")) != -1) {
+        switch (opt) {
+        case 'o':
+            order = atoi(optarg);
+            break;
+
+        case 'd':
+            decode = 1;
+            break;
+        }
+    }
+
+    order = order ? 1 : 0; // Only support O(0) and O(1)
+
+    if (optind < argc) {
+        if (!(infp = fopen(argv[optind], "rb"))) {
+            perror(argv[optind]);
+            return 1;
+        }
+        optind++;
+    }
+
+    if (optind < argc) {
+        if (!(outfp = fopen(argv[optind], "wb"))) {
+            perror(argv[optind]);
+            return 1;
+        }
+        optind++;
+    }
+
+    gettimeofday(&tv1, NULL);
+
+    if (decode) {
+        for (;;) {
+            uint32_t in_size, out_size;
+            unsigned char *out;
+
+            if (9 != fread(in_buf, 1, 9, infp))
+                break;
+
+            /* The payload size sits at bytes 1-4, least significant byte
+             * first.  Assemble it bytewise rather than through a cast so
+             * the result does not depend on host alignment or byte order. */
+            in_size = (uint32_t)in_buf[1]
+                    | (uint32_t)in_buf[2] << 8
+                    | (uint32_t)in_buf[3] << 16
+                    | (uint32_t)in_buf[4] << 24;
+
+            /* The size comes from the file; never read past in_buf. */
+            if (in_size > sizeof(in_buf) - 9) {
+                fprintf(stderr, "Block too large\n");
+                exit(1);
+            }
+
+            if (in_size != fread(in_buf+9, 1, in_size, infp)) {
+                fprintf(stderr, "Truncated input\n");
+                exit(1);
+            }
+            out = rans_uncompress(in_buf, in_size+9, &out_size);
+            if (!out)
+                abort();
+
+            fwrite(out, 1, out_size, outfp);
+            free(out);
+
+            bytes += out_size;
+        }
+    } else {
+        for (;;) {
+            uint32_t in_size, out_size;
+            unsigned char *out;
+
+            in_size = fread(in_buf, 1, BLK_SIZE, infp);
+            if (in_size == 0)
+                break;
+
+            out = rans_compress(in_buf, in_size, &out_size, order);
+            /* Mirror the decode path: do not hand a NULL buffer to fwrite */
+            if (!out)
+                abort();
+
+            fwrite(out, 1, out_size, outfp);
+            free(out);
+
+            bytes += in_size;
+        }
+    }
+
+    gettimeofday(&tv2, NULL);
+
+    fprintf(stderr, "Took %ld microseconds, %5.1f MB/s\n",
+            (long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
+            tv2.tv_usec - tv1.tv_usec,
+            (double)bytes / ((long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
+                             tv2.tv_usec - tv1.tv_usec));
+    return 0;
+}
+#endif
diff --git a/htslib/cram/rANS_static.h b/htslib/cram/rANS_static.h
new file mode 100644
index 0000000..11f20b8
--- /dev/null
+++ b/htslib/cram/rANS_static.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2014 Genome Research Ltd.
+ * Author(s): James Bonfield
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ * Institute nor the names of its contributors may be used to endorse
+ * or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef RANS_STATIC_H
+#define RANS_STATIC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size, int order);
+unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
+ unsigned int *out_size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RANS_STATIC_H */
diff --git a/htslib/cram/sam_header.c b/htslib/cram/sam_header.c
new file mode 100644
index 0000000..cc13d46
--- /dev/null
+++ b/htslib/cram/sam_header.c
@@ -0,0 +1,1268 @@
+/*
+Copyright (c) 2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <string.h>
+#include <assert.h>
+
+#include "cram/sam_header.h"
+#include "cram/string_alloc.h"
+
+/*
+ * Reports a header parsing problem on stderr, quoting the offending line.
+ * At most 'len' characters of 'line' are shown and the quote stops at the
+ * first newline.
+ */
+static void sam_hdr_error(char *msg, char *line, int len, int lno) {
+    int shown = 0;
+
+    while (shown < len && line[shown] != '\n')
+        shown++;
+
+    fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, shown, line);
+}
+
+/*
+ * Debugging aid: prints the parsed header to stdout.
+ *
+ * For every record type held in hdr->h it prints the two-letter type code
+ * and a count (the 'order' of the last line in the ring, plus one), then
+ * each line's tags as "KEY":"VALUE" pairs.  The @PG chains follow, each one
+ * walked backwards from its pg_end[] entry through prev_id.
+ */
+void sam_hdr_dump(SAM_hdr *hdr) {
+    khint_t k;
+    int i;
+
+    printf("===DUMP===\n");
+    for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
+        SAM_hdr_type *t1, *t2;
+        char c[2];
+
+        if (!kh_exist(hdr->h, k))
+            continue;
+
+        t1 = t2 = kh_val(hdr->h, k);
+        /* Unpack the (first<<8)|second key back into its two characters */
+        c[0] = kh_key(hdr->h, k)>>8;
+        c[1] = kh_key(hdr->h, k)&0xff;
+        printf("Type %.2s, count %d\n", c, t1->prev->order+1);
+
+        /* Lines of one type form a circular list; stop back at the start */
+        do {
+            SAM_hdr_tag *tag;
+            printf(">>>%d ", t1->order);
+            for (tag = t1->tag; tag; tag=tag->next) {
+                printf("\"%.2s\":\"%.*s\"\t",
+                       tag->str, tag->len-3, tag->str+3);
+            }
+            putchar('\n');
+            t1 = t1->next;
+        } while (t1 != t2);
+    }
+
+    /* Dump out PG chains */
+    printf("\n@PG chains:\n");
+    for (i = 0; i < hdr->npg_end; i++) {
+        int j;
+        printf("  %d:", i);
+        for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
+            printf("%s%d(%.*s)",
+                   j == hdr->pg_end[i] ? " " : "->",
+                   j, hdr->pg[j].name_len, hdr->pg[j].name);
+        }
+        printf("\n");
+    }
+
+    puts("===END DUMP===");
+}
+
+/* Updates the hash tables in the SAM_hdr structure.
+ *
+ * Called with a newly parsed header line.  'type' is the two-character
+ * record code packed as (first<<8)|second and h_type is the parsed line.
+ * Only SQ, RG and PG lines are acted upon; any other type returns 0 with
+ * nothing changed.
+ *
+ *  - SQ: appends an entry to sh->ref[] (name from SN, length from LN) and
+ *        maps the name to its index in sh->ref_hash.
+ *  - RG: appends an entry to sh->rg[] (name from ID) and maps the name to
+ *        its index in sh->rg_hash.
+ *  - PG: appends an entry to sh->pg[] (name from ID), resolves a PP tag
+ *        against the IDs already present in sh->pg_hash, and maintains
+ *        sh->pg_end[], the PG entries that no later PG has named via PP.
+ *
+ * The relevant counter (nref/nrg/npg) is only incremented once the entry
+ * has been fully recorded, so a -1 return leaves the count unchanged.
+ *
+ * NOTE(review): a line carrying its naming tag (SN or ID) twice overwrites
+ * the earlier malloc()ed name without freeing it.
+ *
+ * Returns 0 on success;
+ * -1 on failure
+ */
+static int sam_hdr_update_hashes(SAM_hdr *sh,
+ int type,
+ SAM_hdr_type *h_type) {
+ /* Add to reference hash? */
+ if ((type>>8) == 'S' && (type&0xff) == 'Q') {
+ SAM_hdr_tag *tag;
+ SAM_SQ *new_ref;
+ int nref = sh->nref;
+
+ new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
+ if (!new_ref)
+ return -1;
+ sh->ref = new_ref;
+
+ tag = h_type->tag;
+ sh->ref[nref].name = NULL;
+ sh->ref[nref].len = 0;
+ sh->ref[nref].ty = h_type;
+ sh->ref[nref].tag = tag;
+
+ while (tag) {
+ if (tag->str[0] == 'S' && tag->str[1] == 'N') {
+ if (!(sh->ref[nref].name = malloc(tag->len))) /* len covers "SN:", so len-3 chars + NUL fit */
+ return -1;
+ strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
+ sh->ref[nref].name[tag->len-3] = 0;
+ } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
+ sh->ref[nref].len = atoi(tag->str+3); /* atoi: a malformed LN is not reported */
+ }
+ tag = tag->next;
+ }
+
+ if (sh->ref[nref].name) {
+ khint_t k;
+ int r;
+ k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
+ if (-1 == r) return -1;
+ kh_val(sh->ref_hash, k) = nref;
+ } else {
+ return -1; // SN should be present, according to spec.
+ }
+
+ sh->nref++;
+ }
+
+ /* Add to read-group hash? */
+ if ((type>>8) == 'R' && (type&0xff) == 'G') {
+ SAM_hdr_tag *tag;
+ SAM_RG *new_rg;
+ int nrg = sh->nrg;
+
+ new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
+ if (!new_rg)
+ return -1;
+ sh->rg = new_rg;
+
+ tag = h_type->tag;
+ sh->rg[nrg].name = NULL;
+ sh->rg[nrg].name_len = 0;
+ sh->rg[nrg].ty = h_type;
+ sh->rg[nrg].tag = tag;
+ sh->rg[nrg].id = nrg;
+
+ while (tag) {
+ if (tag->str[0] == 'I' && tag->str[1] == 'D') {
+ if (!(sh->rg[nrg].name = malloc(tag->len)))
+ return -1;
+ strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
+ sh->rg[nrg].name[tag->len-3] = 0;
+ sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
+ }
+ tag = tag->next;
+ }
+
+ if (sh->rg[nrg].name) {
+ khint_t k;
+ int r;
+ k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
+ if (-1 == r) return -1;
+ kh_val(sh->rg_hash, k) = nrg;
+ } else {
+ return -1; // ID should be present, according to spec.
+ }
+
+ sh->nrg++;
+ }
+
+ /* Add to program hash? */
+ if ((type>>8) == 'P' && (type&0xff) == 'G') {
+ SAM_hdr_tag *tag;
+ SAM_PG *new_pg;
+ int npg = sh->npg;
+
+ new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
+ if (!new_pg)
+ return -1;
+ sh->pg = new_pg;
+
+ tag = h_type->tag;
+ sh->pg[npg].name = NULL;
+ sh->pg[npg].name_len = 0;
+ sh->pg[npg].ty = h_type;
+ sh->pg[npg].tag = tag;
+ sh->pg[npg].id = npg;
+ sh->pg[npg].prev_id = -1;
+
+ while (tag) {
+ if (tag->str[0] == 'I' && tag->str[1] == 'D') {
+ if (!(sh->pg[npg].name = malloc(tag->len)))
+ return -1;
+ strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
+ sh->pg[npg].name[tag->len-3] = 0;
+ sh->pg[npg].name_len = strlen(sh->pg[npg].name);
+ } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
+ // Resolve later if needed
+ khint_t k;
+ char tmp = tag->str[tag->len]; tag->str[tag->len] = 0; /* temporarily terminate the value; assumes str[len] is writable - TODO confirm against string_ndup */
+ k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
+ tag->str[tag->len] = tmp;
+
+ if (k != kh_end(sh->pg_hash)) {
+ int p_id = kh_val(sh->pg_hash, k);
+ sh->pg[npg].prev_id = sh->pg[p_id].id;
+
+ /* Unmark previous entry as a PG termination */
+ if (sh->npg_end > 0 &&
+ sh->pg_end[sh->npg_end-1] == p_id) {
+ sh->npg_end--; /* common case: it is the most recent chain end */
+ } else {
+ int i;
+ for (i = 0; i < sh->npg_end; i++) {
+ if (sh->pg_end[i] == p_id) {
+ memmove(&sh->pg_end[i], &sh->pg_end[i+1],
+ (sh->npg_end-i-1)*sizeof(*sh->pg_end));
+ sh->npg_end--; /* NOTE(review): the scan continues after the removal rather than breaking */
+ }
+ }
+ }
+ } else {
+ sh->pg[npg].prev_id = -1;
+ }
+ }
+ tag = tag->next;
+ }
+
+ if (sh->pg[npg].name) {
+ khint_t k;
+ int r;
+ k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
+ if (-1 == r) return -1;
+ kh_val(sh->pg_hash, k) = npg;
+ } else {
+ return -1; // ID should be present, according to spec.
+ }
+
+ /* Add to npg_end[] array. Remove later if we find a PP line */
+ if (sh->npg_end >= sh->npg_end_alloc) {
+ int *new_pg_end;
+ int new_alloc = sh->npg_end_alloc ? sh->npg_end_alloc*2 : 4; /* start at 4 slots, then double */
+
+ new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int));
+ if (!new_pg_end)
+ return -1;
+ sh->npg_end_alloc = new_alloc;
+ sh->pg_end = new_pg_end;
+ }
+ sh->pg_end[sh->npg_end++] = npg;
+
+ sh->npg++;
+ }
+
+ return 0;
+}
+
+/*
+ * Appends a formatted line to an existing SAM header.
+ * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ * optional new-line. If it contains more than 1 line then multiple lines
+ * will be added in order.
+ *
+ * Input text is of maximum length len or as terminated earlier by a NUL.
+ * Len may be 0 if unknown, in which case lines must be NUL-terminated.
+ *
+ * The text is first appended to sh->text and then parsed from that copy.
+ * Each line is linked into the circular list for its record type, its
+ * tab-separated tags are chained onto it, and the SQ/RG/PG lookup tables
+ * are updated via sam_hdr_update_hashes().  @CO lines keep everything
+ * after the first tab as a single tag.  Lines already processed stay in
+ * the header when a later line fails.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
+    int i, lno, text_offset;
+    char *hdr;
+
+    if (!len)
+        len = strlen(lines);
+
+    text_offset = ks_len(&sh->text);
+    if (EOF == kputsn(lines, len, &sh->text))
+        return -1;
+    hdr = ks_str(&sh->text) + text_offset;
+
+    for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) {
+        khint32_t type;
+        khint_t k;
+
+        int l_start = i, new;
+        SAM_hdr_type *h_type;
+        SAM_hdr_tag *h_tag, *last;
+
+        if (hdr[i] != '@') {
+            sam_hdr_error("Header line does not start with '@'",
+                          &hdr[l_start], len - l_start, lno);
+            return -1;
+        }
+
+        /* Validate both key characters before combining them, so that a
+         * line cut short after '@' is rejected on its first missing
+         * character without the byte beyond it ever being read. */
+        if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
+            hdr[i+2] < 'A' || hdr[i+2] > 'z') {
+            sam_hdr_error("Header line does not have a two character key",
+                          &hdr[l_start], len - l_start, lno);
+            return -1;
+        }
+        type = (hdr[i+1]<<8) | hdr[i+2];
+
+        i += 3;
+        if (hdr[i] == '\n')
+            continue;
+
+        // Add the header line type
+        if (!(h_type = pool_alloc(sh->type_pool)))
+            return -1;
+        /* kh_put reports failure through its status flag, the same way
+         * sam_hdr_update_hashes() checks it. */
+        k = kh_put(sam_hdr, sh->h, type, &new);
+        if (-1 == new)
+            return -1;
+
+        // Form the ring, either with self or other lines of this type
+        if (!new) {
+            SAM_hdr_type *t = kh_val(sh->h, k), *p;
+            p = t->prev;
+
+            assert(p->next == t);
+            p->next = h_type;
+            h_type->prev = p;
+
+            t->prev = h_type;
+            h_type->next = t;
+            h_type->order = p->order+1;
+        } else {
+            kh_val(sh->h, k) = h_type;
+            h_type->prev = h_type->next = h_type;
+            h_type->order = 0;
+        }
+
+        // Parse the tags on this line
+        last = NULL;
+        if ((type>>8) == 'C' && (type&0xff) == 'O') {
+            /* Comment lines: the remainder of the line is one tag */
+            int j;
+            if (hdr[i] != '\t') {
+                sam_hdr_error("Missing tab",
+                              &hdr[l_start], len - l_start, lno);
+                return -1;
+            }
+
+            for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
+                ;
+
+            if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
+                return -1;
+            h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
+            h_tag->len = j-i;
+            h_tag->next = NULL;
+            if (!h_tag->str)
+                return -1;
+
+            i = j;
+
+        } else {
+            /* Every other record: one tag per tab-separated KEY:VALUE */
+            do {
+                int j;
+                if (hdr[i] != '\t') {
+                    sam_hdr_error("Missing tab",
+                                  &hdr[l_start], len - l_start, lno);
+                    return -1;
+                }
+
+                for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
+                    ;
+
+                if (!(h_tag = pool_alloc(sh->tag_pool)))
+                    return -1;
+                h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
+                h_tag->len = j-i;
+                h_tag->next = NULL;
+                if (!h_tag->str)
+                    return -1;
+
+                if (h_tag->len < 3 || h_tag->str[2] != ':') {
+                    sam_hdr_error("Malformed key:value pair",
+                                  &hdr[l_start], len - l_start, lno);
+                    return -1;
+                }
+
+                if (last)
+                    last->next = h_tag;
+                else
+                    h_type->tag = h_tag;
+
+                last = h_tag;
+                i = j;
+            } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
+        }
+
+        /* Update RG/SQ hashes */
+        if (-1 == sam_hdr_update_hashes(sh, type, h_type))
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Adds a single line to a SAM header.
+ * Specify type and one or more key,value pairs, ending with the NULL key.
+ * Eg. sam_hdr_add(h, "SQ", "SN", "foo", "LN", "100", NULL).
+ *
+ * Returns index for specific entry on success (eg 2nd SQ, 4th RG),
+ *         counted from zero among lines of the same type
+ *        -1 on failure
+ */
+int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
+    va_list args;
+    int ret;
+
+    va_start(args, type);
+    ret = sam_hdr_vadd(sh, type, args, NULL);
+    /* sam_hdr_vadd's documented contract leaves va_end to its caller */
+    va_end(args);
+
+    return ret;
+}
+
+/*
+ * Helper for sam_hdr_vadd: consumes key,value pairs from 'ap' up to the
+ * terminating NULL key.  Each pair is appended to sh->text as "\tKEY:VALUE"
+ * and chained onto h_type as a new tag.  *last tracks the most recently
+ * linked tag and is NULL while the line has no tags yet.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int sam_hdr_vadd_tags(SAM_hdr *sh, SAM_hdr_type *h_type,
+                             SAM_hdr_tag **last, va_list ap) {
+    for (;;) {
+        SAM_hdr_tag *h_tag;
+        char *k, *v;
+        int idx;
+
+        if (!(k = (char *)va_arg(ap, char *)))
+            return 0;
+        v = va_arg(ap, char *);
+
+        if (EOF == kputc_('\t', &sh->text))
+            return -1;
+
+        if (!(h_tag = pool_alloc(sh->tag_pool)))
+            return -1;
+        idx = ks_len(&sh->text);
+
+        if (EOF == kputs(k, &sh->text))
+            return -1;
+        if (EOF == kputc_(':', &sh->text))
+            return -1;
+        if (EOF == kputs(v, &sh->text))
+            return -1;
+
+        /* The tag owns a copy of the "KEY:VALUE" text just written */
+        h_tag->len = ks_len(&sh->text) - idx;
+        h_tag->str = string_ndup(sh->str_pool,
+                                 ks_str(&sh->text) + idx,
+                                 h_tag->len);
+        h_tag->next = NULL;
+        if (!h_tag->str)
+            return -1;
+
+        if (*last)
+            (*last)->next = h_tag;
+        else
+            h_type->tag = h_tag;
+
+        *last = h_tag;
+    }
+}
+
+/*
+ * sam_hdr_add with a va_list interface.
+ *
+ * Tags are taken first from this function's own trailing arguments (which
+ * must end with a NULL key) and then from 'ap'.
+ *
+ * Note: this function invokes va_arg at least once, making the value
+ * of ap indeterminate after the return. The caller should call
+ * va_start/va_end before/after calling this function or use va_copy.
+ * This function does not call va_end on ap itself.
+ *
+ * Returns the order of the new line among lines of its type on success
+ *        -1 on failure
+ */
+int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
+    va_list args;
+    SAM_hdr_type *h_type;
+    SAM_hdr_tag *last = NULL;
+    int new, r;
+    khint32_t type_i = (type[0]<<8) | type[1], k;
+
+    if (EOF == kputc_('@', &sh->text))
+        return -1;
+    if (EOF == kputsn(type, 2, &sh->text))
+        return -1;
+
+    if (!(h_type = pool_alloc(sh->type_pool)))
+        return -1;
+    /* kh_put reports failure through its status flag, the same way
+     * sam_hdr_update_hashes() checks it. */
+    k = kh_put(sam_hdr, sh->h, type_i, &new);
+    if (-1 == new)
+        return -1;
+
+    // Form the ring, either with self or other lines of this type
+    if (!new) {
+        SAM_hdr_type *t = kh_val(sh->h, k), *p;
+        p = t->prev;
+
+        assert(p->next == t);
+        p->next = h_type;
+        h_type->prev = p;
+
+        t->prev = h_type;
+        h_type->next = t;
+        h_type->order = p->order + 1;
+    } else {
+        kh_val(sh->h, k) = h_type;
+        h_type->prev = h_type->next = h_type;
+        h_type->order = 0;
+    }
+
+    /* A line given no pairs at all must still present an empty tag list */
+    h_type->tag = NULL;
+
+    // Any ... varargs
+    va_start(args, ap);
+    r = sam_hdr_vadd_tags(sh, h_type, &last, args);
+    va_end(args);   /* released on the failure path too */
+    if (r < 0)
+        return -1;
+
+    // Plus the specified va_list params
+    if (sam_hdr_vadd_tags(sh, h_type, &last, ap) < 0)
+        return -1;
+
+    if (EOF == kputc('\n', &sh->text))
+        return -1;
+
+    if (-1 == sam_hdr_update_hashes(sh, type_i, h_type))
+        return -1;
+
+    return h_type->order;
+}
+
+/*
+ * Locates a header line of the given two-letter 'type'.
+ *
+ * With a NULL ID_key the first line of that type is returned.  Otherwise
+ * the line must carry a tag whose key is ID_key and whose value equals
+ * ID_value.  SQ/SN, RG/ID and PG/ID queries are answered from the prebuilt
+ * name hashes; everything else walks the lines of that type.
+ *
+ * Returns NULL if no type/ID is found
+ */
+SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
+                           char *ID_key, char *ID_value) {
+    SAM_hdr_type *first, *cur;
+    int itype = (type[0]<<8)|(type[1]);
+    khint_t k;
+
+    /* Special case for types we have prebuilt hashes on */
+    if (ID_key) {
+        if (type[0] == 'S' && type[1] == 'Q' &&
+            ID_key[0] == 'S' && ID_key[1] == 'N') {
+            k = kh_get(m_s2i, hdr->ref_hash, ID_value);
+            if (k == kh_end(hdr->ref_hash))
+                return NULL;
+            return hdr->ref[kh_val(hdr->ref_hash, k)].ty;
+        }
+
+        if (type[0] == 'R' && type[1] == 'G' &&
+            ID_key[0] == 'I' && ID_key[1] == 'D') {
+            k = kh_get(m_s2i, hdr->rg_hash, ID_value);
+            if (k == kh_end(hdr->rg_hash))
+                return NULL;
+            return hdr->rg[kh_val(hdr->rg_hash, k)].ty;
+        }
+
+        if (type[0] == 'P' && type[1] == 'G' &&
+            ID_key[0] == 'I' && ID_key[1] == 'D') {
+            k = kh_get(m_s2i, hdr->pg_hash, ID_value);
+            if (k == kh_end(hdr->pg_hash))
+                return NULL;
+            return hdr->pg[kh_val(hdr->pg_hash, k)].ty;
+        }
+    }
+
+    k = kh_get(sam_hdr, hdr->h, itype);
+    if (k == kh_end(hdr->h))
+        return NULL;
+
+    first = kh_val(hdr->h, k);
+    if (!ID_key)
+        return first;
+
+    /* Walk the circular list of lines of this type */
+    cur = first;
+    do {
+        SAM_hdr_tag *tag;
+        for (tag = cur->tag; tag; tag = tag->next) {
+            if (tag->str[0] != ID_key[0] || tag->str[1] != ID_key[1])
+                continue;
+            /* Skip the "XX:" prefix and compare the whole value */
+            if (strcmp(tag->str+3, ID_value) == 0)
+                return cur;
+        }
+        cur = cur->next;
+    } while (cur != first);
+
+    return NULL;
+}
+
+/*
+ * As per sam_hdr_find, but returns a complete line of formatted text
+ * for a specific header type/ID combination. If ID is NULL then it returns
+ * the first line of the specified type.
+ *
+ * The line is rebuilt as '@' + type + one "\tTAG" per tag, without a
+ * trailing newline.
+ *
+ * The returned string is malloced and should be freed by the calling
+ * function with free().
+ *
+ * Returns NULL if no type/ID is found, or if building the text fails.
+ */
+char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
+                        char *ID_key, char *ID_value) {
+    SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
+    kstring_t ks = KS_INITIALIZER;
+    SAM_hdr_tag *tag;
+    int failed = 0;
+
+    if (!ty)
+        return NULL;
+
+    // Paste together the line from the hashed copy
+    if (kputc_('@', &ks) == EOF)
+        failed = 1;
+    if (kputs(type, &ks) == EOF)
+        failed = 1;
+
+    tag = ty->tag;
+    while (tag) {
+        if (kputc_('\t', &ks) == EOF)
+            failed = 1;
+        if (kputsn(tag->str, tag->len, &ks) == EOF)
+            failed = 1;
+        tag = tag->next;
+    }
+
+    if (!failed)
+        return ks_str(&ks);
+
+    KS_FREE(&ks);
+    return NULL;
+}
+
+
+/*
+ * Searches one header line for the tag whose two-character key is 'key'.
+ *
+ * When prev is non-NULL, *prev receives the tag preceding the match so
+ * that the caller can unlink it: NULL if the match is the first tag, and
+ * the last tag of the list (NULL for an empty list) when nothing matches.
+ *
+ * Returns the tag pointer on success
+ *         NULL on failure
+ */
+SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
+                              SAM_hdr_type *type,
+                              char *key,
+                              SAM_hdr_tag **prev) {
+    SAM_hdr_tag *cur = type->tag, *before = NULL;
+
+    /* Advance until the key matches or the list runs out */
+    while (cur && !(cur->str[0] == key[0] && cur->str[1] == key[1])) {
+        before = cur;
+        cur = cur->next;
+    }
+
+    if (prev)
+        *prev = before;
+
+    return cur;
+}
+
+
+/*
+ * Adds or updates tag key,value pairs in a header line.
+ * Eg for adding M5 tags to @SQ lines or updating sort order for the
+ * @HD line.
+ *
+ * Specify multiple key,value pairs ending in NULL.
+ *
+ * Each "KK:value" string is formatted by appending it to hdr->text and
+ * copying it from there into the string pool; hdr->text itself is not
+ * otherwise rewritten here.  A tag is only modified or linked in once its
+ * new text has been built, so a failure leaves the tag list as it was
+ * before the failing pair.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
+    va_list ap;
+    int ret = 0;
+
+    va_start(ap, type);
+
+    for (;;) {
+        char *k, *v, *str;
+        int idx, len, is_new;
+        SAM_hdr_tag *tag, *prev;
+
+        if (!(k = (char *)va_arg(ap, char *)))
+            break;
+        v = va_arg(ap, char *);
+
+        tag = sam_hdr_find_key(hdr, type, k, &prev);
+        is_new = !tag;
+        if (is_new && !(tag = pool_alloc(hdr->tag_pool))) {
+            ret = -1;
+            break;
+        }
+
+        idx = ks_len(&hdr->text);
+        if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0) {
+            ret = -1;
+            break;
+        }
+        len = ks_len(&hdr->text) - idx;
+        str = string_ndup(hdr->str_pool,
+                          ks_str(&hdr->text) + idx,
+                          len);
+        if (!str) {
+            ret = -1;
+            break;
+        }
+
+        tag->len = len;
+        tag->str = str;
+
+        /* Link a fresh tag only now that it is fully initialised */
+        if (is_new) {
+            tag->next = NULL;
+            if (prev)
+                prev->next = tag;
+            else
+                type->tag = tag;
+        }
+    }
+
+    /* Reached on every path, including failures */
+    va_end(ap);
+
+    return ret;
+}
+
+#define K(a) (((a)[0]<<8)|((a)[1]))
+
+/*
+ * Accessor for the sort order stored in the header structure.
+ */
+enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) {
+    enum sam_sort_order so = hdr->sort_order;
+
+    return so;
+}
+
+/*
+ * Derives the sort order from the SO tag of the @HD line.
+ *
+ * Returns ORDER_UNKNOWN when there is no @HD line or no SO tag.  An SO
+ * value other than unsorted/queryname/coordinate/unknown is reported on
+ * stderr and leaves the result as it stood.
+ */
+static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) {
+    enum sam_sort_order so = ORDER_UNKNOWN;
+    khint_t k = kh_get(sam_hdr, hdr->h, K("HD"));
+    SAM_hdr_type *ty;
+    SAM_hdr_tag *tag;
+
+    if (k == kh_end(hdr->h))
+        return so;
+
+    ty = kh_val(hdr->h, k);
+    for (tag = ty->tag; tag; tag = tag->next) {
+        char *value;
+
+        if (tag->str[0] != 'S' || tag->str[1] != 'O')
+            continue;
+
+        value = tag->str+3;
+        if (strcmp(value, "unsorted") == 0) {
+            so = ORDER_UNSORTED;
+        } else if (strcmp(value, "queryname") == 0) {
+            so = ORDER_NAME;
+        } else if (strcmp(value, "coordinate") == 0) {
+            so = ORDER_COORD;
+        } else if (strcmp(value, "unknown") != 0) {
+            fprintf(stderr, "Unknown sort order field: %s\n",
+                    value);
+        }
+    }
+
+    return so;
+}
+
+
+/*
+ * Reconstructs the kstring from the header hash table.
+ *
+ * The first @HD line (if any) is written first, then every other record
+ * type in hash iteration order, each type's lines in ring order.  The new
+ * text replaces hdr->text only once it is complete; on failure the partial
+ * text is freed and hdr->text is left untouched.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_rebuild(SAM_hdr *hdr) {
+    /* Order: HD then others */
+    kstring_t ks = KS_INITIALIZER;
+    khint_t k;
+
+    k = kh_get(sam_hdr, hdr->h, K("HD"));
+    if (k != kh_end(hdr->h)) {
+        SAM_hdr_type *ty = kh_val(hdr->h, k);
+        SAM_hdr_tag *tag;
+        if (EOF == kputs("@HD", &ks))
+            goto err;
+        for (tag = ty->tag; tag; tag = tag->next) {
+            if (EOF == kputc_('\t', &ks))
+                goto err;
+            if (EOF == kputsn_(tag->str, tag->len, &ks))
+                goto err;
+        }
+        if (EOF == kputc('\n', &ks))
+            goto err;
+    }
+
+    for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
+        SAM_hdr_type *t1, *t2;
+
+        if (!kh_exist(hdr->h, k))
+            continue;
+
+        /* Already emitted above */
+        if (kh_key(hdr->h, k) == K("HD"))
+            continue;
+
+        t1 = t2 = kh_val(hdr->h, k);
+        do {
+            SAM_hdr_tag *tag;
+            char c[2];
+
+            if (EOF == kputc_('@', &ks))
+                goto err;
+            /* Unpack the (first<<8)|second key into its two characters */
+            c[0] = kh_key(hdr->h, k)>>8;
+            c[1] = kh_key(hdr->h, k)&0xff;
+            if (EOF == kputsn_(c, 2, &ks))
+                goto err;
+            for (tag = t1->tag; tag; tag=tag->next) {
+                if (EOF == kputc_('\t', &ks))
+                    goto err;
+                if (EOF == kputsn_(tag->str, tag->len, &ks))
+                    goto err;
+            }
+            if (EOF == kputc('\n', &ks))
+                goto err;
+            t1 = t1->next;
+        } while (t1 != t2);
+    }
+
+    if (ks_str(&hdr->text))
+        KS_FREE(&hdr->text);
+
+    hdr->text = ks;
+
+    return 0;
+
+ err:
+    /* Do not leak whatever had been assembled before the failure */
+    if (ks_str(&ks))
+        KS_FREE(&ks);
+
+    return -1;
+}
+
+
+/*
+ * Creates an empty SAM header, ready to be populated.
+ *
+ * The new header starts with a reference count of 1, empty ref/rg/pg
+ * tables with their name hashes, empty text, and fresh tag, type and
+ * string pools.  If any allocation fails, everything created up to that
+ * point is released again.
+ *
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ *         NULL on failure
+ */
+SAM_hdr *sam_hdr_new() {
+    SAM_hdr *sh = calloc(1, sizeof(*sh));
+
+    if (!sh)
+        return NULL;
+
+    sh->h = kh_init(sam_hdr);
+    if (!sh->h)
+        goto err;
+
+    sh->ID_cnt = 1;
+    sh->ref_count = 1;
+
+    sh->nref = 0;
+    sh->ref = NULL;
+    if (!(sh->ref_hash = kh_init(m_s2i)))
+        goto err;
+
+    sh->nrg = 0;
+    sh->rg = NULL;
+    if (!(sh->rg_hash = kh_init(m_s2i)))
+        goto err;
+
+    sh->npg = 0;
+    sh->pg = NULL;
+    sh->npg_end = sh->npg_end_alloc = 0;
+    sh->pg_end = NULL;
+    if (!(sh->pg_hash = kh_init(m_s2i)))
+        goto err;
+
+    KS_INIT(&sh->text);
+
+    if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
+        goto err;
+
+    if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
+        goto err;
+
+    if (!(sh->str_pool = string_pool_create(8192)))
+        goto err;
+
+    return sh;
+
+ err:
+    /* calloc() zeroed the struct, so anything not yet created is NULL */
+    if (sh->h)
+        kh_destroy(sam_hdr, sh->h);
+
+    if (sh->ref_hash)
+        kh_destroy(m_s2i, sh->ref_hash);
+
+    if (sh->rg_hash)
+        kh_destroy(m_s2i, sh->rg_hash);
+
+    if (sh->pg_hash)
+        kh_destroy(m_s2i, sh->pg_hash);
+
+    if (sh->tag_pool)
+        pool_destroy(sh->tag_pool);
+
+    if (sh->type_pool)
+        pool_destroy(sh->type_pool);
+
+    if (sh->str_pool)
+        string_pool_destroy(sh->str_pool);
+
+    free(sh);
+
+    return NULL;
+}
+
+
+/*
+ * Builds a SAM_hdr from header text.
+ *
+ * A NULL 'hdr' is permitted and yields an empty header.  Otherwise the
+ * text is added line by line, the sort order is taken from the @HD line
+ * and the @PG records are linked.
+ *
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ *         NULL on failure
+ */
+SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
+    SAM_hdr *sh = sam_hdr_new();
+
+    if (!sh)
+        return NULL;
+
+    if (hdr) {
+        /* Parse the header, line by line */
+        if (sam_hdr_add_lines(sh, hdr, len) == -1) {
+            sam_hdr_free(sh);
+            return NULL;
+        }
+
+        /* Obtain sort order */
+        sh->sort_order = sam_hdr_parse_sort_order(sh);
+
+        sam_hdr_link_pg(sh);
+    }
+
+    return sh;
+}
+
+/*
+ * Produces a duplicate copy of hdr and returns it.
+ * The source header's text is rebuilt first and the copy is parsed from
+ * that text.
+ * Returns NULL on failure
+ */
+SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
+    int rc = sam_hdr_rebuild(hdr);
+
+    if (rc == -1)
+        return NULL;
+
+    return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
+}
+
+/*! Adds one to the reference count on hdr.
+ *
+ * Several files may then share one header: each calls sam_hdr_free when
+ * done, and the header stays valid for the files that are still open.
+ */
+void sam_hdr_incr_ref(SAM_hdr *hdr) {
+    hdr->ref_count += 1;
+}
+
+/*! Decrements a reference count on hdr.
+ *
+ * This permits multiple files to share the same header, all calling
+ * sam_hdr_free when done, without causing errors for other open files.
+ *
+ * If the reference count hits zero then the header is automatically
+ * freed. This makes it a synonym for sam_hdr_free().
+ */
+void sam_hdr_decr_ref(SAM_hdr *hdr) {
+ sam_hdr_free(hdr);
+}
+
+/*! Deallocates all storage used by a SAM_hdr struct.
+ *
+ * This also decrements the header reference count. If after decrementing
+ * it is still non-zero then the header is assumed to be in use by another
+ * caller and the free is not done.
+ *
+ * This is a synonym for sam_hdr_decr_ref().
+ */
+void sam_hdr_free(SAM_hdr *hdr) {
+    int idx;
+
+    /* A NULL header is accepted and ignored. */
+    if (hdr == NULL)
+        return;
+
+    /* Give up this caller's reference; other users may still hold one. */
+    hdr->ref_count -= 1;
+    if (hdr->ref_count > 0)
+        return;
+
+    /* Text form of the header. */
+    if (ks_str(&hdr->text) != NULL)
+        KS_FREE(&hdr->text);
+
+    /* Hash table keyed on header line type. */
+    if (hdr->h != NULL)
+        kh_destroy(sam_hdr, hdr->h);
+
+    /* @SQ: lookup hash, then every stored name, then the array itself. */
+    if (hdr->ref_hash != NULL)
+        kh_destroy(m_s2i, hdr->ref_hash);
+    if (hdr->ref != NULL) {
+        for (idx = 0; idx < hdr->nref; idx++)
+            free(hdr->ref[idx].name); /* free(NULL) is a no-op */
+        free(hdr->ref);
+    }
+
+    /* @RG: same three steps. */
+    if (hdr->rg_hash != NULL)
+        kh_destroy(m_s2i, hdr->rg_hash);
+    if (hdr->rg != NULL) {
+        for (idx = 0; idx < hdr->nrg; idx++)
+            free(hdr->rg[idx].name);
+        free(hdr->rg);
+    }
+
+    /* @PG: same three steps, plus the array of chain ends. */
+    if (hdr->pg_hash != NULL)
+        kh_destroy(m_s2i, hdr->pg_hash);
+    if (hdr->pg != NULL) {
+        for (idx = 0; idx < hdr->npg; idx++)
+            free(hdr->pg[idx].name);
+        free(hdr->pg);
+    }
+
+    free(hdr->pg_end);
+
+    /* Allocation pools backing the type, tag and string storage. */
+    if (hdr->type_pool != NULL)
+        pool_destroy(hdr->type_pool);
+    if (hdr->tag_pool != NULL)
+        pool_destroy(hdr->tag_pool);
+    if (hdr->str_pool != NULL)
+        string_pool_destroy(hdr->str_pool);
+
+    /* Finally the header structure itself. */
+    free(hdr);
+}
+
+int sam_hdr_length(SAM_hdr *hdr) {
+ return ks_len(&hdr->text); // length of the stored text form; nothing is rebuilt here
+}
+
+char *sam_hdr_str(SAM_hdr *hdr) {
+ return ks_str(&hdr->text); // the stored text form itself, not a copy; nothing is rebuilt here
+}
+
+/*
+ * Looks up a reference sequence by name and returns the numerical ID.
+ * Returns -1 if unknown reference.
+ */
+int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
+    khint_t slot = kh_get(m_s2i, hdr->ref_hash, ref);
+    return slot != kh_end(hdr->ref_hash) ? kh_val(hdr->ref_hash, slot) : -1;
+}
+
+/*
+ * Looks up a read-group by name and returns a pointer to the start of the
+ * associated tag list.
+ *
+ * Returns NULL on failure
+ */
+SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
+    khint_t slot = kh_get(m_s2i, hdr->rg_hash, rg);
+    if (slot == kh_end(hdr->rg_hash))
+        return NULL; /* no read-group stored under this name */
+    return hdr->rg + kh_val(hdr->rg_hash, slot);
+}
+
+
+/*
+ * Fixes any PP links in @PG headers.
+ * If the entries are in order then this doesn't need doing, but in case
+ * our header is out of order this goes through the sh->pg[] array
+ * setting the prev_id field.
+ *
+ * Note we can have multiple complete chains. This code should identify the
+ * tails of these chains as these are the entries we have to link to in
+ * subsequent PP records.
+ *
+ * Returns 0 on success
+ * -1 on failure (indicating broken PG/PP records)
+ */
+int sam_hdr_link_pg(SAM_hdr *hdr) {
+    int i, j, ret = 0, *ends;
+
+    if (hdr->npg <= 0) { /* nothing to link; avoids a zero-sized realloc */
+        hdr->npg_end = 0;
+        return 0;
+    }
+
+    /* Resize via a temporary so the old array is not lost on failure. */
+    ends = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
+    if (!ends)
+        return -1;
+    hdr->pg_end = ends;
+    hdr->npg_end_alloc = hdr->npg;
+    for (i = 0; i < hdr->npg; i++) /* every line starts as a chain end */
+        ends[i] = i;
+
+    for (i = 0; i < hdr->npg; i++) {
+        khint_t k;
+        SAM_hdr_tag *tag;
+        char tmp;
+
+        for (tag = hdr->pg[i].tag; tag; tag = tag->next)
+            if (tag->str[0] == 'P' && tag->str[1] == 'P')
+                break;
+        if (!tag)
+            continue; /* no PP tag: chain start point */
+
+        /* Look up the text from str+3 on, NUL-terminated temporarily. */
+        tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
+        k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
+        tag->str[tag->len] = tmp;
+        if (k == kh_end(hdr->pg_hash)) {
+            ret = -1; /* PP names an ID that is not in pg_hash */
+            continue;
+        }
+        hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
+        ends[kh_val(hdr->pg_hash, k)] = -1; /* has a successor */
+    }
+
+    for (i = j = 0; i < hdr->npg; i++) /* compact the surviving ends */
+        if (ends[i] != -1)
+            ends[j++] = ends[i];
+    hdr->npg_end = j;
+    return ret;
+}
+
+/*
+ * Returns a unique ID from a base name.
+ *
+ * The value returned is valid until the next call to
+ * this function.
+ */
+const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
+    /* The base name is returned as-is when no @PG ID uses it yet. */
+    if (kh_get(m_s2i, sh->pg_hash, name) == kh_end(sh->pg_hash))
+        return name;
+
+    /* Otherwise append ".<counter>" until the result is not in the hash. */
+    for (;;) {
+        sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
+        if (kh_get(m_s2i, sh->pg_hash, sh->ID_buf) == kh_end(sh->pg_hash))
+            return sh->ID_buf;
+    }
+}
+
+/*
+ * Add an @PG line.
+ *
+ * If we wish complete control over this use sam_hdr_add() directly. This
+ * function uses that, but attempts to do a lot of tedious house work for
+ * you too.
+ *
+ * - It will generate a suitable ID if the supplied one clashes.
+ * - It will generate multiple @PG records if we have multiple PG chains.
+ *
+ * Call it as per sam_hdr_add() with a series of key,value pairs ending
+ * in NULL.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
+    va_list args;
+    int status;
+
+    if (sh->npg_end) {
+        /* Copy ends array to avoid us looping while modifying it */
+        int *end = malloc(sh->npg_end * sizeof(int));
+        int i, nends = sh->npg_end;
+
+        if (!end)
+            return -1;
+        memcpy(end, sh->pg_end, nends * sizeof(*end));
+
+        /* One new @PG line per chain end, with PP naming that end. */
+        for (i = 0; i < nends; i++) {
+            va_start(args, name);
+            status = sam_hdr_vadd(sh, "PG", args,
+                                  "ID", sam_hdr_PG_ID(sh, name),
+                                  "PN", name,
+                                  "PP", sh->pg[end[i]].name,
+                                  NULL);
+            va_end(args); /* paired with va_start on every path */
+            if (status == -1) {
+                free(end);
+                return -1;
+            }
+        }
+        free(end);
+    } else {
+        /* No chain ends recorded: add a single @PG line without PP. */
+        va_start(args, name);
+        status = sam_hdr_vadd(sh, "PG", args,
+                              "ID", sam_hdr_PG_ID(sh, name),
+                              "PN", name,
+                              NULL);
+        va_end(args);
+        if (status == -1)
+            return -1;
+    }
+    return 0;
+}
+
+/*
+ * A function to help with construction of CL tags in @PG records.
+ * Takes an argc, argv pair and returns a single space-separated string.
+ * This string should be deallocated by the calling function.
+ *
+ * Returns malloced char * on success
+ * NULL on failure
+ */
+char *stringify_argv(int argc, char *argv[]) {
+    char *str, *cp;
+    size_t nbytes = 1; /* room for the terminating NUL */
+    int i, j;
+
+    /* Allocate: every argument plus one separator between neighbours */
+    for (i = 0; i < argc; i++) {
+        if (i > 0)
+            nbytes += 1;
+        nbytes += strlen(argv[i]);
+    }
+    if (!(str = malloc(nbytes)))
+        return NULL;
+
+    /* Copy, writing the separator before all but the first argument so
+     * that the result has no trailing space */
+    cp = str;
+    for (i = 0; i < argc; i++) {
+        if (i > 0)
+            *cp++ = ' ';
+        for (j = 0; argv[i][j]; j++) {
+            /* tabs are replaced by spaces */
+            *cp++ = (argv[i][j] == '\t') ? ' ' : argv[i][j];
+        }
+    }
+    *cp = 0;
+
+    return str;
+}
diff --git a/htslib/cram/sam_header.h b/htslib/cram/sam_header.h
new file mode 100644
index 0000000..e312df4
--- /dev/null
+++ b/htslib/cram/sam_header.h
@@ -0,0 +1,459 @@
+/*
+Copyright (c) 2013-2014 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * SAM header parsing.
+ *
+ * These functions can be shared between SAM, BAM and CRAM file
+ * formats as all three internally use the same string encoding for
+ * header fields.
+ */
+
+/*
+ * TODO.
+ *
+ * - Sort order (parse to struct, enum type, updating funcs)
+ * - Removal of lines.
+ * - Updating of lines
+ */
+
+#ifndef _SAM_HDR_H_
+#define _SAM_HDR_H_
+
+#include <stdarg.h>
+
+#include "cram/string_alloc.h"
+#include "cram/pooled_alloc.h"
+
+#include "htslib/khash.h"
+#include "htslib/kstring.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// For structure assignment. Eg kstring_t s = KS_INITIALIZER;
+#define KS_INITIALIZER {0,0,0}
+
+// For initialisation elsewhere. Eg KS_INIT(x->str);
+#define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL)
+
+// Frees the string subfield only (l, m and s keep their old values). Assumes 's' itself is static.
+#define KS_FREE(ks) do { if ((ks)->s) free((ks)->s); } while(0)
+
+/*
+ * Proposed new SAM header parsing
+
+1 @SQ ID:foo LN:100
+2 @SQ ID:bar LN:200
+3 @SQ ID:ram LN:300 UR:xyz
+4 @RG ID:r ...
+5 @RG ID:s ...
+
+Hash table for 2-char @keys without dup entries.
+If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
+
+HASH("SQ")--\
+ |
+ (3) <-> 1 <-> 2 <-> 3 <-> (1)
+
+HASH("RG")--\
+ |
+ (5) <-> 4 <-> 5 <-> (4)
+
+Items stored in the hash values also form their own linked lists:
+Ie SQ->ID(foo)->LN(100)
+ SQ->ID(bar)->LN(200)
+ SQ->ID(ram)->LN(300)->UR(xyz)
+ RG->ID(r)
+ */
+
+/*! A single key:value pair on a header line
+ *
+ * These form a linked list and hold strings. The strings are
+ * allocated from a string_alloc_t pool referenced in the master
+ * SAM_hdr structure. Do not attempt to free, malloc or manipulate
+ * these strings directly.
+ */
+typedef struct SAM_hdr_tag_s {
+ struct SAM_hdr_tag_s *next; // next tag on the same header line; NULL ends the list
+ char *str; // tag text, starting with the two-character key (e.g. "PP")
+ int len; // length of str; sam_hdr_link_pg() NUL-terminates at str[len]
+} SAM_hdr_tag;
+
+/*! The parsed version of the SAM header string.
+ *
+ * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type
+ * struct via the main hash table h in the SAM_hdr struct.
+ *
+ * These in turn consist of circular bi-directional linked lists (ie
+ * rings) to hold the multiple instances of the same header type
+ * code. For example if we have 5 \@SQ lines the primary hash table
+ * will key on \@SQ pointing to the first SAM_hdr_type and that in turn
+ * will be part of a ring of 5 elements.
+ *
+ * For each SAM_hdr_type structure we also point to a SAM_hdr_tag
+ * structure which holds the tokenised attributes; the tab separated
+ * key:value pairs per line.
+ */
+typedef struct SAM_hdr_item_s {
+ struct SAM_hdr_item_s *next; // circular
+ struct SAM_hdr_item_s *prev; // previous item in the same ring
+ SAM_hdr_tag *tag; // first tag
+ int order; // 0 upwards
+} SAM_hdr_type;
+
+/*! Parsed \@SQ lines */
+typedef struct {
+ char *name; // released with free() by sam_hdr_free()
+ uint32_t len; // presumably the LN: value -- TODO confirm against the parser
+ SAM_hdr_type *ty; // presumably the header item for this line -- TODO confirm
+ SAM_hdr_tag *tag; // presumably a tag on this line -- TODO confirm which one
+} SAM_SQ;
+
+/*! Parsed \@RG lines */
+typedef struct {
+ char *name; // released with free() by sam_hdr_free()
+ SAM_hdr_type *ty; // presumably the header item for this line -- TODO confirm
+ SAM_hdr_tag *tag; // presumably a tag on this line -- TODO confirm which one
+ int name_len; // presumably the length of name -- TODO confirm
+ int id; // numerical ID
+} SAM_RG;
+
+/*! Parsed \@PG lines */
+typedef struct {
+ char *name; // released with free() by sam_hdr_free(); used as the PP value by sam_hdr_add_PG()
+ SAM_hdr_type *ty; // presumably the header item for this line -- TODO confirm
+ SAM_hdr_tag *tag; // start of the tag list that sam_hdr_link_pg() scans for PP
+ int name_len; // presumably the length of name -- TODO confirm
+ int id; // numerical ID
+ int prev_id; // -1 if none
+} SAM_PG;
+
+/*! Sort order parsed from @HD line */
+enum sam_sort_order {
+ ORDER_UNKNOWN =-1, // presumably: no usable SO: value -- TODO confirm in the parser
+ ORDER_UNSORTED = 0,
+ ORDER_NAME = 1, // presumably sorted by read name -- TODO confirm
+ ORDER_COORD = 2, // presumably sorted by coordinate -- TODO confirm
+ //ORDER_COLLATE = 3 // maybe one day!
+};
+
+KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*)
+KHASH_MAP_INIT_STR(m_s2i, int)
+
+/*! Primary structure for header manipulation
+ *
+ * The initial header text is held in the text kstring_t, but is also
+ * parsed out into SQ, RG and PG arrays. These have a hash table
+ * associated with each to allow lookup by ID or SN fields instead of
+ * their numeric array indices. Additionally PG has an array to hold
+ * the linked list start points (the last in a PP chain).
+ *
+ * Use the appropriate sam_hdr_* functions to edit the header, and
+ * call sam_hdr_rebuild() any time the textual form needs to be
+ * updated again.
+ */
+typedef struct {
+ kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag
+ khash_t(sam_hdr) *h; //!< Main hash: header type code to its SAM_hdr_type ring
+ string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings
+ pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs
+ pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs
+
+ // @SQ lines / references
+ int nref; //!< Number of \@SQ lines
+ SAM_SQ *ref; //!< Array of parsed \@SQ lines
+ khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index
+
+ // @RG lines / read-groups
+ int nrg; //!< Number of \@RG lines
+ SAM_RG *rg; //!< Array of parsed \@RG lines
+ khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index
+
+ // @PG lines / programs
+ int npg; //!< Number of \@PG lines
+ int npg_end; //!< Number of terminating \@PG lines
+ int npg_end_alloc; //!< Size of pg_end field
+ SAM_PG *pg; //!< Array of parsed \@PG lines
+ khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index
+ int *pg_end; //!< \@PG chain termination IDs
+
+ // @HD data
+ enum sam_sort_order sort_order; //!< @HD SO: field
+
+ // @cond internal
+ char ID_buf[1024]; // temporary buffer; holds the ID built by sam_hdr_PG_ID()
+ int ID_cnt; // counter sam_hdr_PG_ID() appends as the ID suffix
+ int ref_count; // number of uses of this SAM_hdr
+ // @endcond
+} SAM_hdr;
+
+/*! Creates an empty SAM header, ready to be populated.
+ *
+ * @return
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ * NULL on failure
+ */
+SAM_hdr *sam_hdr_new(void);
+
+/*! Tokenises a SAM header into a hash table.
+ *
+ * Also extracts a few bits on specific data types, such as @RG lines.
+ *
+ * @return
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free());
+ * NULL on failure
+ */
+SAM_hdr *sam_hdr_parse_(const char *hdr, int len);
+
+
+/*! Produces a duplicate copy of hdr and returns it.
+ * @return
+ * Returns NULL on failure
+ */
+SAM_hdr *sam_hdr_dup(SAM_hdr *hdr);
+
+
+/*! Increments a reference count on hdr.
+ *
+ * This permits multiple files to share the same header, all calling
+ * sam_hdr_free when done, without causing errors for other open files.
+ */
+void sam_hdr_incr_ref(SAM_hdr *hdr);
+
+
+/*! Decrements a reference count on hdr.
+ *
+ * This permits multiple files to share the same header, all calling
+ * sam_hdr_free when done, without causing errors for other open files.
+ *
+ * If the reference count hits zero then the header is automatically
+ * freed. This makes it a synonym for sam_hdr_free().
+ */
+void sam_hdr_decr_ref(SAM_hdr *hdr);
+
+
+/*! Deallocates all storage used by a SAM_hdr struct.
+ *
+ * This also decrements the header reference count. If after decrementing
+ * it is still non-zero then the header is assumed to be in use by another
+ * caller and the free is not done.
+ *
+ * This is a synonym for sam_hdr_decr_ref().
+ */
+void sam_hdr_free(SAM_hdr *hdr);
+
+/*! Returns the current length of the SAM_hdr in text form.
+ *
+ * Call sam_hdr_rebuild() first if editing has taken place.
+ */
+int sam_hdr_length(SAM_hdr *hdr);
+
+/*! Returns the string form of the SAM_hdr.
+ *
+ * Call sam_hdr_rebuild() first if editing has taken place.
+ */
+char *sam_hdr_str(SAM_hdr *hdr);
+
+/*! Appends a formatted line to an existing SAM header.
+ *
+ * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ * optional new-line. If it contains more than 1 line then multiple lines
+ * will be added in order.
+ *
+ * Input text is of maximum length len or as terminated earlier by a NUL.
+ * Len may be 0 if unknown, in which case lines must be NUL-terminated.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len);
+
+/*! Adds a single line to a SAM header.
+ *
+ * Specify type and one or more key,value pairs, ending with the NULL key.
+ * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int sam_hdr_add(SAM_hdr *sh, const char *type, ...);
+
+/*! Adds a single line to a SAM header.
+ *
+ * This is much like sam_hdr_add() but with the additional va_list
+ * argument. This is followed by specifying type and one or more
+ * key,value pairs, ending with the NULL key.
+ *
+ * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
+ *
+ * The purpose of the additional va_list parameter is to permit other
+ * varargs functions to call this while including their own additional
+ * parameters; an example is in sam_hdr_add_PG().
+ *
+ * Note: this function invokes va_arg at least once, making the value
+ * of ap indeterminate after the return. The caller should call
+ * va_start/va_end before/after calling this function or use va_copy.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...);
+
+/*!
+ * @return
+ * Returns the first header item matching 'type'. If ID is non-NULL it checks
+ * for the tag ID: and compares against the specified ID.
+ *
+ * Returns NULL if no type/ID is found
+ */
+SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
+ char *ID_key, char *ID_value);
+
+/*!
+ *
+ * As per sam_hdr_find(), but returns a complete line of formatted text
+ * for a specific header type/ID combination. If ID is NULL then it returns
+ * the first line of the specified type.
+ *
+ * The returned string is malloced and should be freed by the calling
+ * function with free().
+ *
+ * @return
+ * Returns NULL if no type/ID is found.
+ */
+char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
+ char *ID_key, char *ID_value);
+
+/*! Looks for a specific key in a single sam header line.
+ *
+ * If prev is non-NULL it also fills this out with the previous tag, to
+ * permit use in key removal. *prev is set to NULL when the tag is the first
+ * key in the list. When a tag isn't found, prev (if non NULL) will be the last
+ * tag in the existing list.
+ *
+ * @return
+ * Returns the tag pointer on success;
+ * NULL on failure
+ */
+SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
+ SAM_hdr_type *type,
+ char *key,
+ SAM_hdr_tag **prev);
+
+/*! Adds or updates tag key,value pairs in a header line.
+ *
+ * Eg for adding M5 tags to @SQ lines or updating sort order for the
+ * @HD line (although use the sam_hdr_sort_order() function for
+ * HD manipulation, which is a wrapper around this function).
+ *
+ * Specify multiple key,value pairs ending in NULL.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...);
+
+/*! Returns the sort order from the @HD SO: field */
+enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr);
+
+/*! Reconstructs the kstring from the header hash table.
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int sam_hdr_rebuild(SAM_hdr *hdr);
+
+/*! Looks up a reference sequence by name and returns the numerical ID.
+ * @return
+ * Returns -1 if unknown reference.
+ */
+int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref);
+
+/*! Looks up a read-group by name and returns a pointer to the start of the
+ * associated tag list.
+ *
+ * @return
+ * Returns NULL on failure
+ */
+SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg);
+
+/*! Fixes any PP links in @PG headers.
+ *
+ * If the entries are in order then this doesn't need doing, but in case
+ * our header is out of order this goes through the sh->pg[] array
+ * setting the prev_id field.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure (indicating broken PG/PP records)
+ */
+int sam_hdr_link_pg(SAM_hdr *hdr);
+
+
+/*! Add an @PG line.
+ *
+ * If we wish complete control over this use sam_hdr_add() directly. This
+ * function uses that, but attempts to do a lot of tedious house work for
+ * you too.
+ *
+ * - It will generate a suitable ID if the supplied one clashes.
+ * - It will generate multiple @PG records if we have multiple PG chains.
+ *
+ * Call it as per sam_hdr_add() with a series of key,value pairs ending
+ * in NULL.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...);
+
+/*!
+ * A function to help with construction of CL tags in @PG records.
+ * Takes an argc, argv pair and returns a single space-separated string.
+ * This string should be deallocated by the calling function.
+ *
+ * @return
+ * Returns malloced char * on success;
+ * NULL on failure
+ */
+char *stringify_argv(int argc, char *argv[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SAM_HDR_H_ */
diff --git a/htslib/cram/string_alloc.c b/htslib/cram/string_alloc.c
new file mode 100644
index 0000000..3e0e4e2
--- /dev/null
+++ b/htslib/cram/string_alloc.c
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2010 Genome Research Ltd.
+Author: Andrew Whitwham <aw7 at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+/*
+ A pooled string allocator intended to cut down on the
+ memory overhead of many small string allocations.
+
+ Andrew Whitwham, September 2010.
+*/
+
+#include <config.h>
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "string_alloc.h"
+
+#define MIN_STR_SIZE 1024
+
+
+/* creates the string pool. max_length is the initial size
+ a single string can be. The max_length can grow as
+ needed */
+
+string_alloc_t *string_pool_create(size_t max_length) {
+    string_alloc_t *pool = malloc(sizeof(*pool));
+
+    if (!pool)
+        return NULL;
+
+    /* No storage is allocated yet; chunks are added by string_alloc()
+     * on demand.  The requested size is raised to MIN_STR_SIZE when it
+     * is smaller than that. */
+    pool->strings    = NULL;
+    pool->nstrings   = 0;
+    pool->max_length = max_length < MIN_STR_SIZE ? MIN_STR_SIZE : max_length;
+
+    return pool;
+}
+
+
+/* internal function to do the actual memory allocation */
+
+static string_t *new_string_pool(string_alloc_t *a_str) {
+    size_t count = a_str->nstrings;
+    string_t *table, *chunk;
+
+    /* Make room for one more entry at the end of the chunk table. */
+    table = realloc(a_str->strings, (count + 1) * sizeof(*a_str->strings));
+    if (!table)
+        return NULL;
+    a_str->strings = table;
+
+    /* The new entry only counts once its max_length-byte buffer exists. */
+    chunk = table + count;
+    chunk->str = malloc(a_str->max_length);
+    if (!chunk->str)
+        return NULL;
+    chunk->used = 0;
+    a_str->nstrings = count + 1;
+    return chunk;
+}
+
+
+/* free allocated memory */
+
+void string_pool_destroy(string_alloc_t *a_str) {
+    string_t *chunk = a_str->strings;
+    size_t remaining = a_str->nstrings;
+
+    /* Release each chunk's buffer, then the chunk table, then the pool. */
+    for (; remaining > 0; remaining--, chunk++)
+        free(chunk->str);
+    free(a_str->strings);
+    free(a_str);
+}
+
+
+/* allocate space for a string */
+
+char *string_alloc(string_alloc_t *a_str, size_t length) {
+    string_t *chunk;
+
+    /* Zero-length requests are refused. */
+    if (length == 0)
+        return NULL;
+
+    /* Carve the space from the most recent chunk when the request fits
+     * strictly below max_length. */
+    if (a_str->nstrings > 0) {
+        chunk = a_str->strings + (a_str->nstrings - 1);
+        if (chunk->used + length < a_str->max_length) {
+            char *space = chunk->str + chunk->used;
+            chunk->used += length;
+            return space;
+        }
+    }
+
+    /* Otherwise start a new chunk, first widening max_length so that the
+     * chunk is large enough for this request. */
+    if (a_str->max_length < length)
+        a_str->max_length = length;
+    chunk = new_string_pool(a_str);
+    if (!chunk)
+        return NULL;
+    chunk->used = length;
+    return chunk->str;
+}
+
+
+/* equivalent to strdup */
+
+char *string_dup(string_alloc_t *a_str, char *instr) {
+ return string_ndup(a_str, instr, strlen(instr)); /* whole string; string_ndup() adds the NUL */
+}
+
+char *string_ndup(string_alloc_t *a_str, char *instr, size_t len) {
+    /* One extra byte is requested for the terminator written below. */
+    char *copy = string_alloc(a_str, len + 1);
+    if (copy) {
+        /* At most len characters are taken from instr. */
+        strncpy(copy, instr, len);
+        copy[len] = '\0';
+    }
+    return copy;
+}
diff --git a/htslib/cram/string_alloc.h b/htslib/cram/string_alloc.h
new file mode 100644
index 0000000..e044673
--- /dev/null
+++ b/htslib/cram/string_alloc.h
@@ -0,0 +1,68 @@
+/*
+Copyright (c) 2010 Genome Research Ltd.
+Author: Andrew Whitwham <aw7 at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _STRING_ALLOC_H_
+#define _STRING_ALLOC_H_
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A pooled string allocator intended to cut down on the
+ * memory overhead of many small string allocations.
+ *
+ * Andrew Whitwham, September 2010.
+ */
+
+typedef struct {
+ char *str; /* chunk buffer, malloc()ed in new_string_pool() */
+ size_t used; /* bytes of str already handed out by string_alloc() */
+} string_t;
+
+typedef struct {
+ size_t max_length; /* size of newly allocated chunks; raised by string_alloc() for larger requests */
+ size_t nstrings; /* number of entries in strings */
+ string_t *strings; /* chunk table, grown with realloc(); NULL until the first chunk */
+} string_alloc_t;
+
+string_alloc_t *string_pool_create(size_t max_length);
+void string_pool_destroy(string_alloc_t *a_str);
+char *string_alloc(string_alloc_t *a_str, size_t length);
+char *string_dup(string_alloc_t *a_str, char *instr);
+char *string_ndup(string_alloc_t *a_str, char *instr, size_t len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/cram/thread_pool.c b/htslib/cram/thread_pool.c
new file mode 100644
index 0000000..cf80edb
--- /dev/null
+++ b/htslib/cram/thread_pool.c
@@ -0,0 +1,757 @@
+/*
+Copyright (c) 2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+
+#include <signal.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <assert.h>
+
+#include "cram/thread_pool.h"
+
+//#define DEBUG
+//#define DEBUG_TIME
+
+#define IN_ORDER
+
+#ifdef DEBUG
+/*
+ * Debug helper: finds the calling thread's slot in the pool's worker array
+ * by comparing pthread IDs.
+ *
+ * Returns the worker index on a match;
+ *         -1 if the caller is not one of the pool's workers.
+ */
+static int worker_id(t_pool *p) {
+    pthread_t self = pthread_self();
+    int slot = 0;
+
+    while (slot < p->tsize) {
+        if (pthread_equal(self, p->t[slot].tid))
+            return slot;
+        slot++;
+    }
+
+    return -1;
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * A queue to hold results from the thread pool.
+ *
+ * Each thread pool may have jobs of multiple types being queued up and
+ * interleaved, so we allow several results queues per pool.
+ *
+ * The jobs themselves are expected to push their results onto their
+ * appropriate results queue.
+ */
+
+/*
+ * Adds a result to the end of the result queue.
+ *
+ * Returns 0 on success;
+ * -1 on failure
+ */
+static int t_pool_add_result(t_pool_job *j, void *data) {
+    t_results_queue *queue = j->q;
+    t_pool_result *res;
+
+#ifdef DEBUG
+    fprintf(stderr, "%d: Adding resulting to queue %p, serial %d\n",
+            worker_id(j->p), queue, j->serial);
+#endif
+
+    /* A job dispatched without a results queue just discards its output */
+    if (queue == NULL)
+        return 0;
+
+    res = malloc(sizeof(*res));
+    if (res == NULL)
+        return -1;
+
+    /* The result inherits the job's serial so readers can re-order it */
+    res->serial = j->serial;
+    res->data   = data;
+    res->next   = NULL;
+
+    pthread_mutex_lock(&queue->result_m);
+
+    /* Append to the tail of the singly linked result list */
+    if (queue->result_tail == NULL)
+        queue->result_head = res;
+    else
+        queue->result_tail->next = res;
+    queue->result_tail = res;
+
+    /* The job moves from "pending" to "completed" */
+    queue->pending--;
+    queue->queue_len++;
+
+#ifdef DEBUG
+    fprintf(stderr, "%d: Broadcasting result_avail (id %d)\n",
+            worker_id(j->p), res->serial);
+#endif
+    /* Wake a reader blocked in t_pool_next_result_wait() */
+    pthread_cond_signal(&queue->result_avail_c);
+#ifdef DEBUG
+    fprintf(stderr, "%d: Broadcast complete\n", worker_id(j->p));
+#endif
+
+    pthread_mutex_unlock(&queue->result_m);
+
+    return 0;
+}
+
+/*
+ * Core of t_pool_next_result(). Both callers in this file hold q->result_m
+ * around the call.
+ *
+ * Walks the result list looking for the entry whose serial equals
+ * q->next_serial. If it is present it is unlinked, next_serial is advanced
+ * and queue_len is decremented.
+ *
+ * Returns the unlinked result;
+ *         NULL if that serial has not been queued yet.
+ */
+static t_pool_result *t_pool_next_result_locked(t_results_queue *q) {
+    t_pool_result *prev = NULL;
+    t_pool_result *cur  = q->result_head;
+
+    while (cur && cur->serial != q->next_serial) {
+        prev = cur;
+        cur  = cur->next;
+    }
+
+    if (!cur)
+        return NULL;
+
+    /* Unlink: prev is NULL exactly when cur is the list head */
+    if (prev)
+        prev->next = cur->next;
+    else
+        q->result_head = cur->next;
+
+    if (q->result_tail == cur)
+        q->result_tail = prev;
+
+    if (!q->result_head)
+        q->result_tail = NULL;
+
+    q->queue_len--;
+    q->next_serial++;
+
+    return cur;
+}
+
+/*
+ * Pulls a result off the head of the result queue. Caller should
+ * free it (and any internals as appropriate) after use. This doesn't
+ * wait for a result to be present.
+ *
+ * Results will be returned in strict order.
+ *
+ * Returns t_pool_result pointer if a result is ready.
+ * NULL if not.
+ */
+t_pool_result *t_pool_next_result(t_results_queue *q) {
+    t_pool_result *found;
+
+#ifdef DEBUG
+    fprintf(stderr, "Requesting next result on queue %p\n", q);
+#endif
+
+    /* Non-blocking: one locked lookup, then report whatever it found */
+    pthread_mutex_lock(&q->result_m);
+    found = t_pool_next_result_locked(q);
+    pthread_mutex_unlock(&q->result_m);
+
+#ifdef DEBUG
+    fprintf(stderr, "(q=%p) Found %p\n", q, found);
+#endif
+
+    return found;
+}
+
+/*
+ * Blocking variant of t_pool_next_result(): does not return until the
+ * result with serial q->next_serial has been queued.
+ *
+ * Returns the t_pool_result pointer; the loop has no failure exit.
+ */
+t_pool_result *t_pool_next_result_wait(t_results_queue *q) {
+    t_pool_result *found;
+
+#ifdef DEBUG
+    fprintf(stderr, "Waiting for result %d...\n", q->next_serial);
+#endif
+
+    pthread_mutex_lock(&q->result_m);
+    for (;;) {
+        struct timeval now;
+        struct timespec deadline;
+
+        found = t_pool_next_result_locked(q);
+        if (found)
+            break;
+
+        /*
+         * The lookup and the wait share one lock, so a wake-up should not
+         * be missed; the 10 second timeout is a safety net that forces a
+         * re-check anyway.
+         */
+        gettimeofday(&now, NULL);
+        deadline.tv_nsec = now.tv_usec * 1000;
+        deadline.tv_sec  = now.tv_sec + 10;
+
+        pthread_cond_timedwait(&q->result_avail_c, &q->result_m, &deadline);
+    }
+    pthread_mutex_unlock(&q->result_m);
+
+    return found;
+}
+
+/*
+ * Returns true if there are no items on the finished results queue and
+ * also none still pending.
+ */
+int t_pool_results_queue_empty(t_results_queue *q) {
+    int nothing_left;
+
+    /* Both counters are sampled under the lock so they are consistent */
+    pthread_mutex_lock(&q->result_m);
+    nothing_left = !(q->queue_len != 0 || q->pending != 0);
+    pthread_mutex_unlock(&q->result_m);
+
+    return nothing_left;
+}
+
+
+/*
+ * Returns the number of completed jobs on the results queue.
+ */
+int t_pool_results_queue_len(t_results_queue *q) {
+    int completed;
+
+    /* Snapshot of the finished-but-unclaimed count, taken under the lock */
+    pthread_mutex_lock(&q->result_m);
+    completed = q->queue_len;
+    pthread_mutex_unlock(&q->result_m);
+
+    return completed;
+}
+
+/*
+ * Returns the number of completed results still on the queue plus the
+ * number of jobs dispatched to this queue that have not yet delivered a
+ * result.
+ */
+int t_pool_results_queue_sz(t_results_queue *q) {
+    int total;
+
+    pthread_mutex_lock(&q->result_m);
+    total = q->pending + q->queue_len;
+    pthread_mutex_unlock(&q->result_m);
+
+    return total;
+}
+
+/*
+ * Frees a result 'r' and if free_data is true also frees
+ * the internal r->data result too.
+ */
+void t_pool_delete_result(t_pool_result *r, int free_data) {
+    if (r == NULL)
+        return;
+
+    /* free(NULL) is a no-op, so r->data needs no separate NULL test */
+    if (free_data)
+        free(r->data);
+
+    free(r);
+}
+
+/*
+ * Initialises a results queue.
+ *
+ * Results queue pointer on success;
+ * NULL on failure
+ */
+t_results_queue *t_results_queue_init(void) {
+    t_results_queue *q = malloc(sizeof(*q));
+
+    /*
+     * Honour the documented "NULL on failure" contract instead of
+     * dereferencing a failed allocation below.
+     */
+    if (!q)
+        return NULL;
+
+    pthread_mutex_init(&q->result_m, NULL);
+    pthread_cond_init(&q->result_avail_c, NULL);
+
+    /* Empty list; no serials handed out or consumed yet */
+    q->result_head = NULL;
+    q->result_tail = NULL;
+    q->next_serial = 0;   /* serial the reader expects next */
+    q->curr_serial = 0;   /* serial given to the next dispatched job */
+    q->queue_len   = 0;
+    q->pending     = 0;
+
+    return q;
+}
+
+/* Deallocates memory for a results queue */
+void t_results_queue_destroy(t_results_queue *q) {
+#ifdef DEBUG
+    fprintf(stderr, "Destroying results queue %p\n", q);
+#endif
+
+    /* Destroying a NULL queue is a no-op */
+    if (q) {
+        pthread_mutex_destroy(&q->result_m);
+        pthread_cond_destroy(&q->result_avail_c);
+
+        /*
+         * Overwrite the struct with a 0xbb pattern before freeing it
+         * (presumably so that use-after-free is easier to spot).
+         * Results still linked on the queue are not freed here.
+         */
+        memset(q, 0xbb, sizeof(*q));
+        free(q);
+
+#ifdef DEBUG
+        fprintf(stderr, "Destroyed results queue %p\n", q);
+#endif
+    }
+}
+
+/* ----------------------------------------------------------------------------
+ * The thread pool.
+ */
+
+/*
+ * Elapsed time from t1 to t2 (both struct timeval) in microseconds.
+ * Arguments are parenthesised so any lvalue expression can be passed, and
+ * the seconds term is widened to long long before scaling so that a 32-bit
+ * time_t cannot overflow.
+ */
+#define TDIFF(t2,t1) (((long long)(t2).tv_sec - (t1).tv_sec) * 1000000 + \
+                      ((t2).tv_usec - (t1).tv_usec))
+
+/*
+ * A worker thread.
+ *
+ * Each thread waits for the pool to be non-empty.
+ * As soon as this applies, one of them succeeds in getting the lock
+ * and then executes the job.
+ */
+static void *t_pool_worker(void *arg) {
+ // arg is the t_pool_worker_t slot that t_pool_init() hands to pthread_create()
+ t_pool_worker_t *w = (t_pool_worker_t *)arg;
+ t_pool *p = w->p;
+ t_pool_job *j;
+#ifdef DEBUG_TIME
+ struct timeval t1, t2, t3;
+#endif
+
+ for (;;) {
+ // Pop an item off the pool queue
+#ifdef DEBUG_TIME
+ gettimeofday(&t1, NULL);
+#endif
+
+ // Everything below up to the matching unlock reads/writes shared pool
+ // state (head/tail, njobs, nwaiting, shutdown, t_stack) under pool_m.
+ pthread_mutex_lock(&p->pool_m);
+
+#ifdef DEBUG_TIME
+ gettimeofday(&t2, NULL);
+ p->wait_time += TDIFF(t2,t1);
+ w->wait_time += TDIFF(t2,t1);
+#endif
+
+ // If there is something on the job list and a higher priority
+ // thread waiting, let it handle this instead.
+// while (p->head && p->t_stack_top != -1 && p->t_stack_top < w->idx) {
+// pthread_mutex_unlock(&p->pool_m);
+// pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
+// pthread_mutex_lock(&p->pool_m);
+// }
+
+ // Park until there is a job to run or the pool is shutting down.
+ while (!p->head && !p->shutdown) {
+ p->nwaiting++;
+
+ // t_pool_flush() waits on empty_c for "no jobs and all workers
+ // waiting", so poke it each time an idle worker parks.
+ if (p->njobs == 0)
+ pthread_cond_signal(&p->empty_c);
+#ifdef DEBUG_TIME
+ gettimeofday(&t2, NULL);
+#endif
+
+#ifdef IN_ORDER
+ // Push this thread to the top of the waiting stack
+ // (t_stack_top is kept as the lowest index among parked workers;
+ // t_pool_dispatch2() wakes that worker first.)
+ if (p->t_stack_top == -1 || p->t_stack_top > w->idx)
+ p->t_stack_top = w->idx;
+
+ // t_stack[idx] is 1 exactly while this worker sleeps on its own
+ // condition variable.
+ p->t_stack[w->idx] = 1;
+ pthread_cond_wait(&w->pending_c, &p->pool_m);
+ p->t_stack[w->idx] = 0;
+
+ /* Find new t_stack_top */
+ {
+ int i;
+ p->t_stack_top = -1;
+ for (i = 0; i < p->tsize; i++) {
+ if (p->t_stack[i]) {
+ p->t_stack_top = i;
+ break;
+ }
+ }
+ }
+#else
+ pthread_cond_wait(&p->pending_c, &p->pool_m);
+#endif
+
+#ifdef DEBUG_TIME
+ gettimeofday(&t3, NULL);
+ p->wait_time += TDIFF(t3,t2);
+ w->wait_time += TDIFF(t3,t2);
+#endif
+ p->nwaiting--;
+ }
+
+ // Shutdown is checked before a job is taken, so jobs still queued at
+ // this point are not run by this worker.
+ if (p->shutdown) {
+#ifdef DEBUG_TIME
+ // NOTE(review): t3 is only assigned inside the wait loop above; if the
+ // worker never waited, this reads an uninitialised value.
+ p->total_time += TDIFF(t3,t1);
+#endif
+#ifdef DEBUG
+ fprintf(stderr, "%d: Shutting down\n", worker_id(p));
+#endif
+ pthread_mutex_unlock(&p->pool_m);
+ pthread_exit(NULL);
+ }
+
+ // Detach the head job from the pending list.
+ j = p->head;
+ if (!(p->head = j->next))
+ p->tail = NULL;
+
+ // The queue was at (or over) capacity before this pop, so a dispatcher
+ // may be blocked on full_c in t_pool_dispatch2(); wake one.
+ if (p->njobs-- >= p->qsize)
+ pthread_cond_signal(&p->full_c);
+
+ if (p->njobs == 0)
+ pthread_cond_signal(&p->empty_c);
+
+ pthread_mutex_unlock(&p->pool_m);
+
+ // We have job 'j' - now execute it.
+ // The job runs without pool_m held; its return value is appended to the
+ // job's results queue (if any). The status of t_pool_add_result() is
+ // not checked here.
+ t_pool_add_result(j, j->func(j->arg));
+#ifdef DEBUG_TIME
+ pthread_mutex_lock(&p->pool_m);
+ gettimeofday(&t3, NULL);
+ p->total_time += TDIFF(t3,t1);
+ pthread_mutex_unlock(&p->pool_m);
+#endif
+ // Overwrite the job with a 0xbb pattern before freeing it; it was
+ // allocated by t_pool_dispatch2().
+ memset(j, 0xbb, sizeof(*j));
+ free(j);
+ }
+
+ // Not reached: the loop above only ends through pthread_exit().
+ return NULL;
+}
+
+/*
+ * Creates a worker pool of length qsize with tsize worker threads.
+ *
+ * Returns pool pointer on success;
+ * NULL on failure
+ */
+t_pool *t_pool_init(int qsize, int tsize) {
+    int i, nstarted;
+    t_pool *p = malloc(sizeof(*p));
+
+    if (!p)
+        return NULL;
+
+    p->qsize = qsize;
+    p->tsize = tsize;
+    p->njobs = 0;
+    p->nwaiting = 0;
+    p->shutdown = 0;
+    p->head = p->tail = NULL;
+    p->t_stack = NULL;
+#ifdef DEBUG_TIME
+    p->total_time = p->wait_time = 0;
+#endif
+
+    /* Allocate everything before any pthread object exists, so an
+     * allocation failure only has memory to release. */
+    if (!(p->t = malloc(tsize * sizeof(p->t[0]))))
+        goto err_free;
+
+#ifdef IN_ORDER
+    if (!(p->t_stack = malloc(tsize * sizeof(*p->t_stack))))
+        goto err_free;
+    p->t_stack_top = -1;
+#endif
+
+    pthread_mutex_init(&p->pool_m, NULL);
+    pthread_cond_init(&p->empty_c, NULL);
+    pthread_cond_init(&p->full_c, NULL);
+#ifndef IN_ORDER
+    pthread_cond_init(&p->pending_c, NULL);
+#endif
+
+    /*
+     * Hold pool_m while the workers are created: each worker begins by
+     * locking pool_m, so none of them can look at the pool until set-up
+     * has finished (or failed).
+     */
+    pthread_mutex_lock(&p->pool_m);
+
+    for (nstarted = 0; nstarted < tsize; nstarted++) {
+        t_pool_worker_t *w = &p->t[nstarted];
+#ifdef IN_ORDER
+        p->t_stack[nstarted] = 0;
+#endif
+        w->p = p;
+        w->idx = nstarted;
+        w->wait_time = 0;
+        pthread_cond_init(&w->pending_c, NULL);
+        if (0 != pthread_create(&w->tid, NULL, t_pool_worker, w)) {
+            pthread_cond_destroy(&w->pending_c);
+            break;
+        }
+    }
+
+    if (nstarted < tsize) {
+        /*
+         * Thread creation failed part way. The workers already started
+         * are blocked on pool_m; flag shutdown so that they exit as soon
+         * as they obtain the lock, then reap them and tear down.
+         */
+        p->shutdown = 1;
+        pthread_mutex_unlock(&p->pool_m);
+
+        for (i = 0; i < nstarted; i++) {
+            pthread_join(p->t[i].tid, NULL);
+            pthread_cond_destroy(&p->t[i].pending_c);
+        }
+
+        pthread_mutex_destroy(&p->pool_m);
+        pthread_cond_destroy(&p->empty_c);
+        pthread_cond_destroy(&p->full_c);
+#ifndef IN_ORDER
+        pthread_cond_destroy(&p->pending_c);
+#endif
+        goto err_free;
+    }
+
+    pthread_mutex_unlock(&p->pool_m);
+
+    return p;
+
+ err_free:
+    free(p->t_stack);
+    free(p->t);
+    free(p);
+    return NULL;
+}
+
+/*
+ * Adds an item to the work pool.
+ *
+ * FIXME: Maybe return 1,0,-1 and distinguish between job dispatched vs
+ * result returned. Ie rather than blocking on full queue we're permitted
+ * to return early on "result available" event too.
+ * Caller would then have a while loop around t_pool_dispatch.
+ * Or, return -1 and set errno to EAGAIN to indicate job not yet submitted.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int t_pool_dispatch(t_pool *p, t_results_queue *q,
+ void *(*func)(void *arg), void *arg) {
+ /* Blocking form: nonblock == 0 makes t_pool_dispatch2() wait while the
+  * pool's job queue is full. */
+ return t_pool_dispatch2(p, q, func, arg, 0);
+}
+
+/*
+ * As above but optional non-block flag.
+ *
+ * nonblock 0 => block if input queue is full
+ * nonblock +1 => don't block if input queue is full, but do not add task
+ * nonblock -1 => add task regardless of whether queue is full (over-size)
+ */
+int t_pool_dispatch2(t_pool *p, t_results_queue *q,
+                     void *(*func)(void *arg), void *arg, int nonblock) {
+    t_pool_job *j;
+
+#ifdef DEBUG
+    /* q may legitimately be NULL (job with no results queue) */
+    fprintf(stderr, "Dispatching job for queue %p, serial %d\n",
+            q, q ? q->curr_serial : 0);
+#endif
+
+    pthread_mutex_lock(&p->pool_m);
+
+    /* nonblock == 1: refuse, rather than wait, when the queue is full */
+    if (p->njobs >= p->qsize && nonblock == 1) {
+        pthread_mutex_unlock(&p->pool_m);
+        errno = EAGAIN;
+        return -1;
+    }
+
+    if (!(j = malloc(sizeof(*j)))) {
+        /* Never return with pool_m held: every worker and every later
+         * dispatch would block on it forever. */
+        pthread_mutex_unlock(&p->pool_m);
+        return -1;
+    }
+    j->func = func;
+    j->arg = arg;
+    j->next = NULL;
+    j->p = p;
+    j->q = q;
+    if (q) {
+        /* Serial numbers record dispatch order, so results can be handed
+         * back in that order whatever order the jobs finish in. */
+        pthread_mutex_lock(&q->result_m);
+        j->serial = q->curr_serial++;
+        q->pending++;
+        pthread_mutex_unlock(&q->result_m);
+    } else {
+        j->serial = 0;
+    }
+
+    // Check if queue is full
+    // (nonblock == -1 skips the wait and lets the queue grow over-size)
+    if (nonblock == 0)
+        while (p->njobs >= p->qsize)
+            pthread_cond_wait(&p->full_c, &p->pool_m);
+
+    p->njobs++;
+
+    /* Append to the tail of the pending-job list */
+    if (p->tail) {
+        p->tail->next = j;
+        p->tail = j;
+    } else {
+        p->head = p->tail = j;
+    }
+
+#ifdef DEBUG
+    fprintf(stderr, "Dispatched (serial %d)\n", j->serial);
+#endif
+
+    // Let a worker know we have data.
+#ifdef IN_ORDER
+    // Keep incoming queue at 1 per running thread, so there is always
+    // something waiting when they end their current task.  If we go above
+    // this signal to start more threads (if available). This has the effect
+    // of concentrating jobs to fewer cores when we are I/O bound, which in
+    // turn benefits systems with auto CPU frequency scaling.
+    if (p->t_stack_top >= 0 && p->njobs > p->tsize - p->nwaiting)
+        pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
+#else
+    pthread_cond_signal(&p->pending_c);
+#endif
+
+    pthread_mutex_unlock(&p->pool_m);
+
+    return 0;
+}
+
+/*
+ * Flushes the pool, but doesn't exit. This simply drains the queue and
+ * ensures all worker threads have finished their current task.
+ *
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int t_pool_flush(t_pool *p) {
+#ifdef IN_ORDER
+    int i;
+#endif
+
+#ifdef DEBUG
+    fprintf(stderr, "Flushing pool %p\n", p);
+#endif
+
+    // Drains the queue
+    pthread_mutex_lock(&p->pool_m);
+
+    // Wake up everything for the final sprint!
+#ifdef IN_ORDER
+    // Only parked workers (t_stack[i] set) need a signal.
+    for (i = 0; i < p->tsize; i++)
+        if (p->t_stack[i])
+            pthread_cond_signal(&p->t[i].pending_c);
+#else
+    // Without IN_ORDER there is no t_stack (it stays NULL) and all
+    // workers wait on the single shared condition variable instead.
+    pthread_cond_broadcast(&p->pending_c);
+#endif
+
+    // Workers signal empty_c whenever they park with an empty job list;
+    // finished means no queued jobs and every worker parked.
+    while (p->njobs || p->nwaiting != p->tsize)
+        pthread_cond_wait(&p->empty_c, &p->pool_m);
+
+#ifdef DEBUG
+    // Report the counters while the lock still protects them.
+    fprintf(stderr, "Flushed complete for pool %p, njobs=%d, nwaiting=%d\n",
+            p, p->njobs, p->nwaiting);
+#endif
+
+    pthread_mutex_unlock(&p->pool_m);
+
+    return 0;
+}
+
+/*
+ * Destroys a thread pool. If 'kill' is true the threads are terminated now,
+ * otherwise they are joined into the main thread so they will finish their
+ * current work load.
+ *
+ * Use t_pool_destroy(p,0) after a t_pool_flush(p) on a normal shutdown or
+ * t_pool_destroy(p,1) to quickly exit after a fatal error.
+ */
+void t_pool_destroy(t_pool *p, int kill) {
+    int i;
+
+#ifdef DEBUG
+    fprintf(stderr, "Destroying pool %p, kill=%d\n", p, kill);
+#endif
+
+    /* Send shutdown message to worker threads */
+    if (!kill) {
+        t_pool_job *j, *next;
+
+        pthread_mutex_lock(&p->pool_m);
+        p->shutdown = 1;
+
+#ifdef DEBUG
+        fprintf(stderr, "Sending shutdown request\n");
+#endif
+
+        /* Wake every worker so that it notices the shutdown flag */
+#ifdef IN_ORDER
+        for (i = 0; i < p->tsize; i++)
+            pthread_cond_signal(&p->t[i].pending_c);
+#else
+        pthread_cond_broadcast(&p->pending_c);
+#endif
+        pthread_mutex_unlock(&p->pool_m);
+
+#ifdef DEBUG
+        fprintf(stderr, "Shutdown complete\n");
+#endif
+        for (i = 0; i < p->tsize; i++)
+            pthread_join(p->t[i].tid, NULL);
+
+        /*
+         * Workers test the shutdown flag before taking a job, so anything
+         * still queued was never run.  All workers have been joined, so
+         * the list can be released without the lock.  Only the job
+         * descriptors are freed; each job's 'arg' is left untouched.
+         */
+        for (j = p->head; j; j = next) {
+            next = j->next;
+            free(j);
+        }
+        p->head = p->tail = NULL;
+    } else {
+        /*
+         * NOTE(review): the workers are not joined on this path, so the
+         * teardown below can run while they are still alive; per the
+         * function contract this is only for quick exit after a fatal
+         * error.
+         */
+        for (i = 0; i < p->tsize; i++)
+            pthread_kill(p->t[i].tid, SIGINT);
+    }
+
+    pthread_mutex_destroy(&p->pool_m);
+    pthread_cond_destroy(&p->empty_c);
+    pthread_cond_destroy(&p->full_c);
+    /* t_pool_init() initialises a per-worker condition variable in both
+     * build variants, so always destroy them. */
+    for (i = 0; i < p->tsize; i++)
+        pthread_cond_destroy(&p->t[i].pending_c);
+#ifndef IN_ORDER
+    pthread_cond_destroy(&p->pending_c);
+#endif
+
+#ifdef DEBUG_TIME
+    fprintf(stderr, "Total time=%f\n", p->total_time / 1000000.0);
+    fprintf(stderr, "Wait time=%f\n",  p->wait_time  / 1000000.0);
+    /* Guard the ratio: total_time is 0 if no worker ever ran a job */
+    fprintf(stderr, "%d%% utilisation\n",
+            p->total_time
+            ? (int)(100 - ((100.0 * p->wait_time) / p->total_time + 0.5))
+            : 0);
+    for (i = 0; i < p->tsize; i++)
+        fprintf(stderr, "%d: Wait time=%f\n", i,
+                p->t[i].wait_time / 1000000.0);
+#endif
+
+    free(p->t_stack);   /* NULL when IN_ORDER is not defined */
+    free(p->t);
+    free(p);
+
+#ifdef DEBUG
+    fprintf(stderr, "Destroyed pool %p\n", p);
+#endif
+}
+
+
+/*-----------------------------------------------------------------------------
+ * Test app.
+ */
+
+#ifdef TEST_MAIN
+
+#include <stdio.h>
+#include <math.h>
+
+/*
+ * Test job. 'arg' points to a heap-allocated int holding the job number;
+ * it is freed here. Returns a freshly allocated int holding job*job, which
+ * the caller frees.
+ */
+void *doit(void *arg) {
+    const int burn_cpu = 0;   /* flip to exercise the busy-loop variant */
+    int job = *(int *)arg;
+    int x = 0;
+    int *res;
+
+    printf("Worker: execute job %d\n", job);
+
+    usleep(random() % 1000000); // to coerce job completion out of order
+    if (!burn_cpu) {
+        x = job*job;
+    } else {
+        int i, k;
+
+        for (k = 0; k < 100; k++) {
+            for (i = 0; i < 100000; i++) {
+                x++;
+                x += x * sin(i);
+                x += x * cos(x);
+            }
+        }
+        x *= 100;
+        x += job;
+    }
+
+    printf("Worker: job %d terminating, x=%d\n", job, x);
+
+    free(arg);
+
+    res = malloc(sizeof(*res));
+    *res = x;
+
+    return res;
+}
+
+#define NTHREADS 8
+
+/*
+ * Prints and frees the next in-order result of q, if one is ready.
+ * Returns 1 if a result was consumed, 0 otherwise.
+ */
+static int report_next_result(t_results_queue *q) {
+    t_pool_result *r = t_pool_next_result(q);
+
+    if (!r)
+        return 0;
+
+    printf("RESULT: %d\n", *(int *)r->data);
+    t_pool_delete_result(r, 1);
+    return 1;
+}
+
+/*
+ * Test driver: submits 20 jobs to a pool of NTHREADS workers, polling for
+ * one result after each submission, then flushes the pool and drains
+ * whatever results are ready.
+ */
+int main(int argc, char **argv) {
+    t_pool *p = t_pool_init(NTHREADS*2, NTHREADS);
+    t_results_queue *q = t_results_queue_init();
+    int job;
+
+    // Dispatch jobs
+    for (job = 0; job < 20; job++) {
+        int *ip = malloc(sizeof(*ip));
+        *ip = job;
+        printf("Submitting %d\n", job);
+        t_pool_dispatch(p, q, doit, ip);
+
+        // Check for results
+        report_next_result(q);
+    }
+
+    t_pool_flush(p);
+
+    while (report_next_result(q))
+        ;
+
+    t_pool_destroy(p, 0);
+    t_results_queue_destroy(q);
+
+    return 0;
+}
+#endif
diff --git a/htslib/cram/thread_pool.h b/htslib/cram/thread_pool.h
new file mode 100644
index 0000000..8158c85
--- /dev/null
+++ b/htslib/cram/thread_pool.h
@@ -0,0 +1,218 @@
+/*
+Copyright (c) 2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This file implements a thread pool for multi-threading applications.
+ * It consists of two distinct interfaces: thread pools and results queues.
+ *
+ * The pool of threads is given a function pointer and void* data to pass in.
+ * This means the pool can run jobs of multiple types, albeit first come
+ * first served with no job scheduling.
+ *
+ * Upon completion, the return value from the function pointer is added to
+ * a results queue. We may have multiple queues in use for the one pool.
+ *
+ * An example: reading from BAM and writing to CRAM with 10 threads. We'll
+ * have a pool of 10 threads and two results queues holding decoded BAM blocks
+ * and encoded CRAM blocks respectively.
+ */
+
+#ifndef _THREAD_POOL_H_
+#define _THREAD_POOL_H_
+
+#include <pthread.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct t_pool;
+struct t_results_queue;
+
+/*
+ * One unit of work on a pool's pending list. Allocated by
+ * t_pool_dispatch2() and freed by the worker thread after it has run.
+ */
+typedef struct t_pool_job {
+ void *(*func)(void *arg); // function the worker calls
+ void *arg; // argument passed to func
+ struct t_pool_job *next; // next job in the pool's pending list
+
+ struct t_pool *p; // pool the job was dispatched to
+ struct t_results_queue *q; // queue that receives func's return value; may be NULL
+ int serial; // dispatch order within q (0 when q is NULL)
+} t_pool_job;
+
+/*
+ * One completed job's return value, linked on a results queue until a
+ * reader removes it with t_pool_next_result() or t_pool_next_result_wait().
+ */
+typedef struct t_res {
+ struct t_res *next; // next result on the queue's list
+ int serial; // sequential number for ordering
+ void *data; // result itself
+} t_pool_result;
+
+struct t_pool;
+
+/* Per-thread state; a pool owns an array of tsize of these (t_pool.t). */
+typedef struct {
+ struct t_pool *p; // owning pool
+ int idx; // this worker's index in the pool's t[] array
+ pthread_t tid; // thread ID filled in by pthread_create()
+ pthread_cond_t pending_c; // signalled to wake this particular worker
+ long long wait_time; // DEBUG_TIME only: microseconds spent waiting
+} t_pool_worker_t;
+
+/*
+ * The thread pool itself. All fields other than qsize and tsize are
+ * modified while holding pool_m.
+ */
+typedef struct t_pool {
+ int qsize; // size of queue: dispatch blocks (or fails) once njobs reaches this
+ int njobs; // pending job count
+ int nwaiting; // how many workers waiting for new jobs
+ int shutdown; // true if pool is being destroyed
+
+ // queue of pending jobs
+ t_pool_job *head, *tail;
+
+ // threads
+ int tsize; // number of worker threads (length of t[])
+ t_pool_worker_t *t;
+
+ // Mutexes
+ pthread_mutex_t pool_m; // used when updating head/tail
+
+ pthread_cond_t empty_c; // signalled by workers when njobs drops to zero
+ pthread_cond_t pending_c; // not empty (only used when IN_ORDER is not defined)
+ pthread_cond_t full_c; // signalled when a job is taken from a full queue
+
+ // t_stack[i] is non-zero while worker i is waiting for work;
+ // t_stack_top is the lowest such index, or -1 if no worker is waiting
+ int *t_stack, t_stack_top;
+
+ // Debugging to check wait time
+ long long total_time, wait_time;
+} t_pool;
+
+/*
+ * A queue of completed results. Results are appended in completion order
+ * but handed back in dispatch order by matching serial numbers.
+ */
+typedef struct t_results_queue {
+ t_pool_result *result_head; // first completed result on the list
+ t_pool_result *result_tail; // last completed result on the list
+ int next_serial; // serial of the next result to hand back to the reader
+ int curr_serial; // serial to assign to the next dispatched job
+ int queue_len; // number of items in queue
+ int pending; // number of pending items (in progress or in pool list)
+ pthread_mutex_t result_m; // protects every field above
+ pthread_cond_t result_avail_c; // signalled each time a result is appended
+} t_results_queue;
+
+
+/*
+ * Creates a worker pool of length qsize with tsize worker threads.
+ *
+ * Returns pool pointer on success;
+ * NULL on failure
+ */
+t_pool *t_pool_init(int qsize, int tsize);
+
+/*
+ * Adds an item to the work pool.
+ *
+ * FIXME: Maybe return 1,0,-1 and distinguish between job dispatched vs
+ * result returned. Ie rather than blocking on full queue we're permitted
+ * to return early on "result available" event too.
+ * Caller would then have a while loop around t_pool_dispatch.
+ * Or, return -1 and set errno to EAGAIN to indicate job not yet submitted.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int t_pool_dispatch(t_pool *p, t_results_queue *q,
+ void *(*func)(void *arg), void *arg);
+/*
+ * As t_pool_dispatch() but with an optional non-block flag.
+ *
+ * nonblock 0 => block if input queue is full
+ * nonblock +1 => don't block if input queue is full, but do not add task
+ * (returns -1 with errno set to EAGAIN)
+ * nonblock -1 => add task regardless of whether queue is full (over-size)
+ */
+int t_pool_dispatch2(t_pool *p, t_results_queue *q,
+ void *(*func)(void *arg), void *arg, int nonblock);
+
+/*
+ * Flushes the pool, but doesn't exit. This simply drains the queue and
+ * ensures all worker threads have finished their current task.
+ *
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int t_pool_flush(t_pool *p);
+
+/*
+ * Destroys a thread pool. If 'kill' is true the threads are terminated now,
+ * otherwise they are joined into the main thread so they will finish their
+ * current work load.
+ *
+ * Use t_pool_destroy(p,0) after a t_pool_flush(p) on a normal shutdown or
+ * t_pool_destroy(p,1) to quickly exit after a fatal error.
+ */
+void t_pool_destroy(t_pool *p, int kill);
+
+/*
+ * Pulls a result off the head of the result queue. Caller should
+ * free it (and any internals as appropriate) after use. This doesn't
+ * wait for a result to be present.
+ *
+ * Results will be returned in strict order.
+ *
+ * Returns t_pool_result pointer if a result is ready.
+ * NULL if not.
+ */
+t_pool_result *t_pool_next_result(t_results_queue *q);
+/*
+ * As t_pool_next_result(), but blocks until the next in-order result is
+ * available instead of returning NULL.
+ */
+t_pool_result *t_pool_next_result_wait(t_results_queue *q);
+
+/*
+ * Frees a result 'r' and if free_data is true also frees
+ * the internal r->data result too.
+ */
+void t_pool_delete_result(t_pool_result *r, int free_data);
+
+/*
+ * Initialises a results queue.
+ *
+ * Results queue pointer on success;
+ * NULL on failure
+ */
+t_results_queue *t_results_queue_init(void);
+
+/* Deallocates memory for a results queue */
+void t_results_queue_destroy(t_results_queue *q);
+
+/*
+ * Returns true if there are no items on the finished results queue and
+ * also none still pending.
+ */
+int t_pool_results_queue_empty(t_results_queue *q);
+
+/*
+ * Returns the number of completed jobs on the results queue.
+ */
+int t_pool_results_queue_len(t_results_queue *q);
+
+/*
+ * Returns the number of completed jobs plus the number queued up to run.
+ */
+int t_pool_results_queue_sz(t_results_queue *q);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _THREAD_POOL_H_ */
diff --git a/htslib/cram/vlen.c b/htslib/cram/vlen.c
new file mode 100644
index 0000000..0831741
--- /dev/null
+++ b/htslib/cram/vlen.c
@@ -0,0 +1,430 @@
+/*
+Author: James Bonfield (jkb at sanger.ac.uk)
+
+Copyright (c) 1995-1996 MEDICAL RESEARCH COUNCIL
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1 Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2 Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
+MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
+promote products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+Copyright (c) 2004, 2009, 2011-2012 Genome Research Ltd.
+
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <string.h>
+
+#include "cram/vlen.h"
+#include "cram/os.h"
+
+/*
+ * Small helper macros. MAX and ABS may evaluate an argument twice, so do
+ * not pass expressions with side effects.
+ */
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef ABS
+#define ABS(a) ((a)>0?(a):-(a))
+#endif
+
+/* Debug trace hook: expands to nothing unless the printf form is enabled. */
+/* #define DEBUG_printf(a,n) printf(a,n) */
+#define DEBUG_printf(a,n)
+
+/*
+ * vlen: 27/10/95 written by James Bonfield, jkb at mrc-lmb.cam.ac.uk
+ *
+ * Given sprintf style of arguments this routine returns the maximum
+ * size of buffer needed to allocate to use with sprintf. It errs on
+ * the side of caution by being simplistic in its approach: we assume
+ * all numbers are of maximum length.
+ *
+ * Handles the usual type conversions (%[%diuaxXcfeEgGpns]), but not
+ * the 'wide' character conversions (%C and %S).
+ * Precision is handled in the correct formats, including %*.*
+ * notations.
+ * Additionally, some of the more dubious (but probably illegal) cases
+ * are supported (eg "%10%" will expand to " %" on many
+ * systems).
+ *
+ * We also assume that the largest integer and largest pointer are 64
+ * bits, which at least covers the machines we'll need it for.
+ */
+/*
+ * Variadic front end to vflen().
+ *
+ * Returns an upper bound on the buffer size (terminating NUL included)
+ * that sprintf() needs for 'fmt' and the arguments that follow.
+ */
+int flen(char *fmt, ...)
+{
+    va_list args;
+    int len;
+
+    va_start(args, fmt);
+    len = vflen(fmt, args);
+    /* Every va_start() needs a va_end() in the same function */
+    va_end(args);
+
+    return len;
+}
+
+/*
+ * Estimates the buffer size needed to sprintf() 'fmt' with arguments 'ap'.
+ * The estimate is deliberately pessimistic: numbers are assumed to be of
+ * maximum length.
+ *
+ * The caller keeps ownership of 'ap'; only a private copy made here (when
+ * va_copy is available) is ended by this function.
+ *
+ * NOTE(review): the "hh", 'L', 'z', 'j' and 't' length modifiers, and the
+ * wide 'C'/'S' conversions, are not recognised; their arguments are not
+ * consumed.  %a is sized as an integer conversion, as before.
+ *
+ * Returns the estimated size, including one byte for the terminating NUL.
+ */
+int vflen(char *fmt, va_list ap)
+{
+    int len = 0;
+    char *cp, c;
+    long long l;
+    int i;
+    double d;
+
+    /*
+     * This code modifies 'ap', but we do not know if va_list is a structure
+     * or a pointer to an array so we do not know if it is a local variable
+     * or not.
+     * C99 gets around this by defining va_copy() to make copies of ap, but
+     * this does not exist on all systems.
+     * For now, I just assume that when va_list is a pointer the system also
+     * provides a va_copy macro to work around this problem. The only system
+     * I have seen needing this so far was Linux on AMD64.
+     */
+#if defined(HAVE_VA_COPY)
+    va_list ap_local;
+    va_copy(ap_local, ap);
+#   define ap ap_local
+#endif
+
+    for (cp = fmt; *cp; cp++) {
+        switch (*cp) {
+
+        /* A format specifier */
+        case '%': {
+            char *endp;
+            long conv_len1 = 0, conv_len2 = 0, conv_len = 0;
+            signed int arg_size;
+
+            /* Firstly, strip the modifier flags (+-#0 and [space]) */
+            for (; (c = *++cp);) {
+                if ('#' == c)
+                    len += 2; /* Worst case of "0x" */
+                else if ('-' == c || '+' == c || ' ' == c)
+                    len++;
+                else
+                    break;
+            }
+
+            /* Width specifier */
+            l = strtol(cp, &endp, 10);
+            if (endp != cp) {
+                cp = endp;
+                conv_len1 = l;
+            } else if (*cp == '*') {
+                conv_len1 = (int)va_arg(ap, int);
+                cp++;
+            }
+            /* A negative '*' width means "left justify" with that width */
+            if (conv_len1 < 0)
+                conv_len1 = -conv_len1;
+            conv_len = conv_len1;
+
+            /* Precision specifier */
+            if ('.' == *cp) {
+                cp++;
+                conv_len2 = strtol(cp, &endp, 10);
+                if (endp != cp) {
+                    cp = endp;
+                } else if (*cp == '*') {
+                    conv_len2 = (int)va_arg(ap, int);
+                    cp++;
+                }
+                /* A negative precision is treated as if it were omitted */
+                if (conv_len2 < 0)
+                    conv_len2 = 0;
+                conv_len = MAX(conv_len1, conv_len2);
+            }
+
+            /* Short/long identifier */
+            if ('h' == *cp) {
+                arg_size = -1; /* short */
+                cp++;
+            } else if ('l' == *cp) {
+                arg_size = 1; /* long */
+                cp++;
+                if ('l' == *cp) {
+                    arg_size = 2; /* long long */
+                    cp++;
+                }
+            } else {
+                arg_size = 0; /* int */
+            }
+
+            /* The actual type */
+            switch (*cp) {
+            case '%':
+                /*
+                 * Not real ANSI I suspect, but we'll allow for the
+                 * completely daft "%10%" example.
+                 */
+                len += MAX(conv_len1, 1);
+                break;
+
+            case 'd':
+            case 'i':
+            case 'o': /* consumed like the other integer conversions */
+            case 'u':
+            case 'a':
+            case 'x':
+            case 'X':
+                /* Remember: char and short are sent as int on the stack */
+                if (arg_size == -1)
+                    l = (long)va_arg(ap, int);
+                else if (arg_size == 1)
+                    l = va_arg(ap, long);
+                else if (arg_size == 2)
+                    l = va_arg(ap, long long);
+                else
+                    l = (long)va_arg(ap, int);
+
+                DEBUG_printf("%d", l);
+
+                /*
+                 * No number can be more than 24 characters so we'll take
+                 * the max of conv_len and 24 (23 is len(2^64) in octal).
+                 * All that work above and we then go and estimate ;-),
+                 * but it's needed incase someone does %500d.
+                 */
+                len += MAX(conv_len, 23);
+                break;
+
+            case 'c':
+                i = va_arg(ap, int);
+                DEBUG_printf("%c", i);
+                /*
+                 * Note that %10c and %.10c act differently.
+                 * Besides, I think precision is not really allowed for %c.
+                 */
+                len += MAX(conv_len1, (long)(i >= 0x80 ? MB_CUR_MAX : 1));
+                break;
+
+            case 'f':
+                d = va_arg(ap, double);
+                DEBUG_printf("%f", d);
+                /*
+                 * Maybe "Inf" or "NaN", but we'll not worry about that.
+                 * Again, err on side of caution and take max of conv_len
+                 * and max length of a double. The worst case I can
+                 * think of is 317 characters (-1[308 zeros].000000)
+                 * without using precision codes. That's horrid. I
+                 * cheat and either use 317 or 15 depending on how
+                 * large the number is as I reckon 99% of floats
+                 * aren't that long.
+                 */
+                l = (ABS(d) > 1000000) ? 317 : 15;
+                l = MAX(l, conv_len1 + 2);
+                if (conv_len2) l += conv_len2 - 6;
+                len += l;
+                break;
+
+            case 'e':
+            case 'E':
+            case 'g':
+            case 'G':
+                d = va_arg(ap, double);
+                DEBUG_printf("%g", d);
+                /*
+                 * Maybe "Inf" or "NaN", but we'll not worry about that.
+                 * Worst case for %e is sign + digit + '.' + precision
+                 * digits + "e+xxx", i.e. 8 + precision, where the default
+                 * precision is 6 (so 14, e.g. "-1.234567e+308").  Never
+                 * assume fewer than the 6 default digits.
+                 */
+                l = 8 + MAX(conv_len2, 6);
+                len += MAX(conv_len, l);
+                break;
+
+            case 'p': {
+                void *ptr = va_arg(ap, void *);
+                /*
+                 * Max pointer is 64bits == 16 chars (on alpha),
+                 * == 20 with + "0x".
+                 */
+                DEBUG_printf("%p", ptr);
+                len += MAX(conv_len, 20);
+                break;
+            }
+
+            case 'n':
+                /*
+                 * Produces no output, but still takes a pointer argument
+                 * which must be skipped to keep later arguments aligned.
+                 */
+                (void)va_arg(ap, void *);
+                break;
+
+            case 's': {
+                char *s = (char *)va_arg(ap, char *);
+
+                /* Common printf() implementations print "(null)" here */
+                if (!s)
+                    s = "(null)";
+                DEBUG_printf("%s", s);
+
+                if (!conv_len2) {
+                    len += MAX(conv_len, (long)strlen(s));
+                } else {
+                    /* With a precision the string need not be terminated,
+                     * so do not measure it */
+                    len += conv_len;
+                }
+                break;
+            }
+
+            default:
+                /* wchar_t types of 'C' and 'S' aren't supported */
+                DEBUG_printf("Arg is %c\n", *cp);
+            }
+
+            /*
+             * If the format ended inside the conversion (e.g. "100%"),
+             * step back so the loop's cp++ lands on the terminator
+             * rather than beyond it.
+             */
+            if (!*cp)
+                cp--;
+            break;
+        }
+
+        default:
+            DEBUG_printf("%c", *cp);
+            len++;
+        }
+    }
+
+#if defined(HAVE_VA_COPY)
+    /* End only the copy made above; the caller ends its own va_list */
+    va_end(ap_local);
+#   undef ap
+#endif
+
+    return len+1; /* one for the null character */
+}
+
+#if 0
+/*
+ * Disabled (#if 0) test driver for flen().
+ *
+ * Each case formats the same arguments twice: once with sprintf() into buf
+ * and once through flen(). It then prints strlen(buf) next to the value
+ * flen() returned so the two numbers can be compared by eye. Nothing is
+ * asserted; the program always returns 0.
+ *
+ * NOTE(review): strlen() returns size_t but is printed with "%d", and the
+ * combined-format case passes the int literal 9 for "%ld". Both would need
+ * fixing before this driver is re-enabled.
+ * NOTE(review): the "%#g" case passes slightly different constants to
+ * sprintf() and flen().
+ */
+int main() {
+ int l;
+ char buf[10000];
+
+ sprintf(buf, "d: %d\n", 500);
+ l = flen("d: %d\n", 500);
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "");
+ l = flen("");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%s\n","test");
+ l = flen("%s\n", "test");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%c\n", 'a');
+ l = flen("%c\n", 'a');
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%31.30f\n", -9999.99);
+ l = flen("%31.30f\n", -9999.99);
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%f\n", -1e308);
+ l = flen("%f\n", -1e308);
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%.9f\n", -1e308);
+ l = flen("%.9f\n", -1e308);
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%10.20f\n", -1.999222333);
+ l = flen("%10.20f\n", -1.999222333);
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%#g\n", -3.14159265358e-222);
+ l = flen("%#g\n", -3.1415927e-222);
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%e\n", -123456789123456789.1);
+ l = flen("%e\n", -123456789123456789.1);
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%c %f %d %s %c %g %ld %s\n", 'a', 3.1, 9, "one", 'b', 4.2, 9, "two");
+ l = flen("%c %f %d %s %c %g %ld %s\n", 'a', 3.1, 9, "one", 'b', 4.2, 9, "two");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%*.*e %*c\n", 10, 5, 9.0, 20, 'x');
+ l = flen("%*.*e %*c\n", 10, 5, 9.0, 20, 'x');
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%10c\n", 'z');
+ l = flen("%10c\n", 'z');
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%.10c\n", 'z');
+ l = flen("%.10c\n", 'z');
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%10d\n", 'z');
+ l = flen("%10d\n", 'z');
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%.10d\n", 'z');
+ l = flen("%.10d\n", 'z');
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%10%\n");
+ l = flen("%10%\n");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%.10%\n");
+ l = flen("%.10%\n");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%s\n", "0123456789");
+ l = flen("%s\n", "0123456789");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%5s\n", "0123456789");
+ l = flen("%5s\n", "0123456789");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%50s\n", "0123456789");
+ l = flen("%50s\n", "0123456789");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%.5s\n", "0123456789");
+ l = flen("%.5s\n", "0123456789");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%.50s\n", "0123456789");
+ l = flen("%.50s\n", "0123456789");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%5.50s\n", "0123456789");
+ l = flen("%5.50s\n", "0123456789");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ sprintf(buf, "%50.5s\n", "0123456789");
+ l = flen("%50.5s\n", "0123456789");
+ printf("%d %d\n\n", strlen(buf), l);
+
+ return 0;
+}
+#endif
diff --git a/htslib/cram/vlen.h b/htslib/cram/vlen.h
new file mode 100644
index 0000000..6b9b07c
--- /dev/null
+++ b/htslib/cram/vlen.h
@@ -0,0 +1,48 @@
+/*
+Author: James Bonfield (jkb at sanger.ac.uk)
+
+Copyright (c) 1995-1996 MEDICAL RESEARCH COUNCIL
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1 Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2 Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
+MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
+promote products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VLEN_H_
+#define _VLEN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * vflen() takes a printf-style format string plus an already started
+ * va_list; flen() is the variadic form taking the arguments directly.
+ *
+ * NOTE(review): presumably both return an estimate of the buffer size
+ * needed to hold the formatted output, including the terminating NUL.
+ * Confirm against vlen.c.
+ * NOTE(review): va_list is used here but this header does not include
+ * <stdarg.h> itself, so includers have to pull it in first.
+ */
+extern int vflen(char *fmt, va_list ap);
+extern int flen(char *fmt, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VLEN_H_ */
diff --git a/htslib/cram/zfio.c b/htslib/cram/zfio.c
new file mode 100644
index 0000000..46727c5
--- /dev/null
+++ b/htslib/cram/zfio.c
@@ -0,0 +1,183 @@
+/*
+Copyright (c) 2009-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "cram/os.h"
+#include "cram/zfio.h"
+
+/* ------------------------------------------------------------------------ */
+/* Some wrappers around FILE * vs gzFile *, allowing for either */
+
+/*
+ * gzopen() works on both compressed and uncompressed data, but it has
+ * a significant performance hit even for uncompressed data (tested as
+ * 25s using FILE* to 46s via gzOpen and 66s via gzOpen when gzipped).
+ *
+ * Hence we use our own wrapper 'zfp' which is a FILE* when uncompressed
+ * and gzFile* when compressed. This also means we could hide bzopen in
+ * there too if desired.
+ */
+
+/*
+ * Current offset of the underlying FILE* stream.
+ * Returns -1 when the handle has no FILE* (fp is NULL).
+ */
+off_t zftello(zfp *zf) {
+    if (zf->fp == NULL)
+        return -1;
+
+    return ftello(zf->fp);
+}
+
+/*
+ * Repositions the underlying FILE* stream with fseeko().
+ * Returns -1 when the handle has no FILE* (fp is NULL).
+ */
+int zfseeko(zfp *zf, off_t offset, int whence) {
+    if (zf->fp == NULL)
+        return -1;
+
+    return fseeko(zf->fp, offset, whence);
+}
+
+
+/*
+ * Line reader for a zfp handle: forwards to fgets() when the handle holds
+ * a FILE*, otherwise to gzgets() on the gzFile.
+ */
+char *zfgets(char *line, int size, zfp *zf) {
+    return zf->fp
+        ? fgets(line, size, zf->fp)
+        : gzgets(zf->gz, line, size);
+}
+
+/*
+ * A wrapper for either fputs or gzputs depending on what has been
+ * opened.
+ *
+ * Returns whatever fputs() returns for a FILE* handle. For a gzFile handle
+ * it returns 0 on success and EOF on failure.
+ */
+int zfputs(char *line, zfp *zf) {
+    if (zf->fp)
+        return fputs(line, zf->fp);
+
+    /*
+     * gzputs() returns the number of characters written, or -1 on error.
+     * Testing the result for truthiness would report an error as success
+     * and an empty string as failure, so compare against zero instead.
+     */
+    return gzputs(zf->gz, line) < 0 ? EOF : 0;
+}
+
+/*
+ * Returns the next input character while leaving it in the stream: the
+ * character is read and, unless it is EOF, pushed straight back.
+ */
+int zfpeek(zfp *zf) {
+    int next;
+
+    if (!zf->fp) {
+        /* gzip-backed handle */
+        next = gzgetc(zf->gz);
+        if (next != EOF)
+            gzungetc(next, zf->gz);
+        return next;
+    }
+
+    /* plain FILE* handle */
+    next = getc(zf->fp);
+    if (next != EOF)
+        ungetc(next, zf->fp);
+    return next;
+}
+
+/* End-of-file test: feof() for a FILE* handle, gzeof() for a gzFile one */
+int zfeof(zfp *zf) {
+    if (zf->fp)
+        return feof(zf->fp);
+
+    return gzeof(zf->gz);
+}
+
+/*
+ * A replacement for either fopen or gzopen.
+ *
+ * Unless one of the first two characters of 'mode' is 'z', the path is
+ * first opened with fopen() and its first two bytes are compared with the
+ * gzip magic number (0x1f 0x8b). A file that does not start with the magic
+ * number is rewound and returned as a plain FILE* handle. Otherwise the
+ * file is reopened through gzip: via popen() when HAVE_POPEN is defined,
+ * via gzopen() otherwise, trying "<path>.gz" as a second attempt.
+ *
+ * Returns a malloc()ed handle to be released with zfclose(), or NULL on
+ * failure.
+ *
+ * NOTE(review): a file from which two bytes cannot be read makes the call
+ * fail. That covers files shorter than two bytes and presumably streams
+ * opened write-only.
+ */
+zfp *zfopen(const char *path, const char *mode) {
+    char path2[1024];
+    zfp *zf;
+
+    if (!(zf = (zfp *)malloc(sizeof(*zf))))
+        return NULL;
+    zf->fp = NULL;
+    zf->gz = NULL;
+
+    /*
+     * Try normal fopen.  mode[1] is only inspected when mode[0] is not the
+     * terminator, so an empty mode string is never read out of bounds.
+     */
+    if (mode[0] != 'z' && (mode[0] == '\0' || mode[1] != 'z') &&
+        NULL != (zf->fp = fopen(path, mode))) {
+        unsigned char magic[2];
+        if (2 != fread(magic, 1, 2, zf->fp)) {
+            /* Close the stream before giving up so it is not leaked. */
+            fclose(zf->fp);
+            free(zf);
+            return NULL;
+        }
+        if (!(magic[0] == 0x1f &&
+              magic[1] == 0x8b)) {
+            fseeko(zf->fp, 0, SEEK_SET);
+            return zf;
+        }
+
+        /* gzip data: fall through to the compressed openers below. */
+        fclose(zf->fp);
+        zf->fp = NULL;
+    }
+
+#ifdef HAVE_POPEN
+    /*
+     * I've no idea why, but gzgets is VERY slow, maybe because it handles
+     * arbitrary seeks.
+     * popen to gzip -cd is 3 times faster though.
+     *
+     * NOTE(review): 'path' is pasted unquoted into a shell command, so it
+     * must not come from an untrusted source.
+     * NOTE(review): a stream obtained from popen() should be closed with
+     * pclose(), but zfclose() calls fclose() on zf->fp.
+     */
+    if (*mode == 'w') {
+        sprintf(path2, "gzip > %.*s", 1000, path);
+        if (NULL != (zf->fp = popen(path2, "w")))
+            return zf;
+    } else {
+        if (access(path, R_OK) == 0) {
+            sprintf(path2, "gzip -cd < %.*s", 1000, path);
+            if (NULL != (zf->fp = popen(path2, "r")))
+                return zf;
+        }
+
+        sprintf(path2, "gzip -cd < %.*s.gz", 1000, path);
+        if (NULL != (zf->fp = popen(path2, "r")))
+            return zf;
+    }
+
+    printf("Failed on %s\n", path);
+#else
+    /* Gzopen instead */
+    if ((zf->gz = gzopen(path, mode)))
+        return zf;
+
+    /* 1020 characters of path + ".gz" + NUL exactly fill path2. */
+    sprintf(path2, "%.*s.gz", 1020, path);
+    if ((zf->gz = gzopen(path2, mode)))
+        return zf;
+#endif
+
+    perror(path);
+
+    free(zf);
+    return NULL;
+}
+
+/*
+ * Closes whichever stream the handle holds (fclose() for a FILE*,
+ * gzclose() otherwise), frees the handle itself and returns the close
+ * call's result.
+ */
+int zfclose(zfp *zf) {
+    int status;
+
+    if (zf->fp)
+        status = fclose(zf->fp);
+    else
+        status = gzclose(zf->gz);
+
+    free(zf);
+    return status;
+}
diff --git a/htslib/cram/zfio.h b/htslib/cram/zfio.h
new file mode 100644
index 0000000..ab9c9c9
--- /dev/null
+++ b/htslib/cram/zfio.h
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2009-2013 Genome Research Ltd.
+Author: James Bonfield <jkb at sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _ZFIO_H_
+#define _ZFIO_H_
+
+#include <stdio.h>
+#include <zlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Either a gzFile or a FILE.
+ *
+ * zfopen() clears both members and then fills in whichever one it managed
+ * to open. The zf* wrappers test fp first and use gz when fp is NULL.
+ */
+typedef struct {
+ FILE *fp;
+ gzFile gz;
+} zfp;
+
+/* ftello() on the FILE* member; -1 when the handle has no FILE*. */
+off_t zftello(zfp *zf);
+/* fseeko() on the FILE* member; -1 when the handle has no FILE*. */
+int zfseeko(zfp *zf, off_t offset, int whence);
+/* fgets() or gzgets(), depending on which stream the handle holds. */
+char *zfgets(char *line, int size, zfp *zf);
+/* fputs() or gzputs(), depending on which stream the handle holds. */
+int zfputs(char *line, zfp *zf);
+/* Opens path as a plain or gzip stream; returns NULL on failure. */
+zfp *zfopen(const char *path, const char *mode);
+/* Closes the stream and frees the handle; returns the close result. */
+int zfclose(zfp *zf);
+/* Returns the next character without consuming it (EOF at end of input). */
+int zfpeek(zfp *zf);
+/* feof() or gzeof(), depending on which stream the handle holds. */
+int zfeof(zfp *zf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFIO_H_ */
diff --git a/htslib/faidx.c b/htslib/faidx.c
new file mode 100644
index 0000000..7e3b2c6
--- /dev/null
+++ b/htslib/faidx.c
@@ -0,0 +1,525 @@
+/* faidx.c -- FASTA random access.
+
+ Copyright (C) 2008, 2009, 2013-2016 Genome Research Ltd.
+ Portions copyright (C) 2011 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <errno.h>
+
+#include "htslib/bgzf.h"
+#include "htslib/faidx.h"
+#include "htslib/hfile.h"
+#include "htslib/khash.h"
+#include "htslib/kstring.h"
+
+// Index record for one sequence, as filled in by fai_build_core() and
+// written by fai_save().
+typedef struct {
+ // Layout of the first sequence line: line_len counts every character
+ // plus the newline, line_blen counts only the isgraph() characters.
+ int32_t line_len, line_blen;
+ // Total number of isgraph() characters in the sequence.
+ int64_t len;
+ // Uncompressed offset (bgzf_utell) at which the sequence data starts.
+ uint64_t offset;
+} faidx1_t;
+// Instantiates hash type "s", used below to map sequence names to records.
+KHASH_MAP_INIT_STR(s, faidx1_t)
+
+// In-memory FASTA index.
+struct __faidx_t {
+ // FASTA file handle; set by fai_load() and closed by fai_destroy().
+ BGZF *bgzf;
+ // n: number of entries used in name[]; m: allocated capacity of name[].
+ int n, m;
+ // Sequence names in insertion order; the same pointers are the hash keys.
+ char **name;
+ // Maps a sequence name to its faidx1_t record.
+ khash_t(s) *hash;
+};
+
+// Rounds x up, in place, to the next power of two by smearing the highest
+// set bit of (x - 1) into all lower bits and adding one. x must be a
+// modifiable lvalue and is evaluated several times.
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+// Adds one sequence record to the index.
+//
+// A private copy of 'name' becomes both the hash key and the next entry of
+// idx->name[]. A name that is already present is reported and ignored.
+//
+// Returns 0 on success (including the ignored-duplicate case) and -1 on a
+// NULL name or on memory exhaustion. On failure the index is left exactly
+// as it was before the call.
+static inline int fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
+{
+    char *name_key;
+    int absent;
+    khint_t k;
+    faidx1_t *v;
+
+    if (!name) {
+        fprintf(stderr, "[fai_build_core] malformed line\n");
+        return -1;
+    }
+
+    name_key = strdup(name);
+    if (!name_key) {
+        fprintf(stderr, "[fai_build_core] out of memory\n");
+        return -1;
+    }
+
+    k = kh_put(s, idx->hash, name_key, &absent);
+    if (absent < 0) {
+        // The hash could not grow; k does not refer to a valid bucket.
+        fprintf(stderr, "[fai_build_core] out of memory\n");
+        free(name_key);
+        return -1;
+    }
+
+    if (! absent) {
+        // The existing key stays in the hash, so our copy is not needed.
+        fprintf(stderr, "[fai_build_core] ignoring duplicate sequence \"%s\" at byte offset %"PRIu64"\n", name, offset);
+        free(name_key);
+        return 0;
+    }
+
+    if (idx->n == idx->m) {
+        int new_m = idx->m? idx->m<<1 : 16;
+        char **tmp = (char**)realloc(idx->name, sizeof(char*) * new_m);
+        if (!tmp) {
+            fprintf(stderr, "[fai_build_core] out of memory\n");
+            // Undo the insertion: a key that is in the hash but not in
+            // name[] would never be freed by fai_destroy().
+            kh_del(s, idx->hash, k);
+            free(name_key);
+            return -1;
+        }
+        idx->name = tmp;
+        idx->m = new_m; // only record the new capacity once it exists
+    }
+
+    idx->name[idx->n++] = name_key;
+
+    v = &kh_value(idx->hash, k);
+    v->len = len;
+    v->line_len = line_len;
+    v->line_blen = line_blen;
+    v->offset = offset;
+
+    return 0;
+}
+
+// Scans a FASTA stream and builds an in-memory index for it.
+//
+// Returns a new index (release with fai_destroy()) or NULL on malformed
+// input, when the stream contains no header at all, or on memory
+// exhaustion.
+//
+// 'state' tracks the position within the current record:
+//   0  start of file, or inside sequence lines that all match the first
+//   1  header just read, first sequence line not seen yet
+//   2  a line differing from the first, or an empty line, has been seen
+//   3  after state 2, one more line without printable characters was read
+faidx_t *fai_build_core(BGZF *bgzf)
+{
+    kstring_t name = { 0, 0, NULL };
+    int c;
+    int line_len, line_blen, state;
+    int l1, l2;
+    faidx_t *idx;
+    uint64_t offset;
+    int64_t len;
+
+    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
+    if (!idx) {
+        fprintf(stderr, "[fai_build_core] out of memory\n");
+        return NULL;
+    }
+    idx->hash = kh_init(s);
+    if (!idx->hash) {
+        fprintf(stderr, "[fai_build_core] out of memory\n");
+        free(idx);
+        return NULL;
+    }
+
+    len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
+    while ( (c=bgzf_getc(bgzf))>=0 ) {
+        if (c == '\n') { // an empty line
+            if (state == 1) {
+                // Empty lines right after the header: the sequence data
+                // starts after them.
+                offset = bgzf_utell(bgzf);
+                continue;
+            } else if ((state == 0 && len < 0) || state == 2) continue;
+            else if (state == 0) { state = 2; continue; }
+            // state 3 falls through and is rejected below
+        }
+        if (c == '>') { // fasta header
+            // Flush the record collected for the previous header, if any.
+            if (len >= 0) {
+                if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
+                    goto fail;
+            }
+
+            // The name is the first run of non-space characters.
+            name.l = 0;
+            while ((c = bgzf_getc(bgzf)) >= 0)
+                if (! isspace(c)) kputc_(c, &name);
+                else if (name.l > 0 || c == '\n') break;
+            kputsn("", 0, &name); // NUL-terminate without changing name.l
+
+            if ( c<0 ) {
+                fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
+                goto fail;
+            }
+            // Skip the rest of the header line.
+            if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
+            state = 1; len = 0;
+            offset = bgzf_utell(bgzf);
+        } else {
+            if (state == 3) {
+                fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name.s);
+                goto fail;
+            }
+            if (state == 2) state = 3;
+            // l1 counts every character of the line, l2 only printable ones.
+            l1 = l2 = 0;
+            do {
+                ++l1;
+                if (isgraph(c)) ++l2;
+            } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
+            if (state == 3 && l2) {
+                fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name.s);
+                goto fail;
+            }
+            ++l1; len += l2; // account for the newline
+            if (state == 1) line_len = l1, line_blen = l2, state = 0;
+            else if (state == 0) {
+                if (l1 != line_len || l2 != line_blen) state = 2;
+            }
+        }
+    }
+
+    // Flush the final record; no header at all is an error.
+    if (len >= 0) {
+        if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
+            goto fail;
+    } else {
+        goto fail;
+    }
+
+    free(name.s);
+    return idx;
+
+fail:
+    free(name.s);
+    fai_destroy(idx);
+    return NULL;
+}
+
+// Writes the index to fp, one tab-separated line per sequence in insertion
+// order: name, length, offset, bases per line, characters per line.
+void fai_save(const faidx_t *fai, FILE *fp)
+{
+    int idx = 0;
+
+    while (idx < fai->n) {
+        const char *seq_name = fai->name[idx];
+        khint_t slot = kh_get(s, fai->hash, seq_name);
+        faidx1_t rec = kh_value(fai->hash, slot);
+
+#ifdef _WIN32
+        fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", seq_name, (int)rec.len, (long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
+#else
+        fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", seq_name, (int)rec.len, (long long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
+#endif
+        ++idx;
+    }
+}
+
+// Parses a .fai index from fp into a new in-memory index.
+//
+// Each line holds a sequence name followed by four integers: length,
+// offset, bases per line and characters per line. 'fname' is used only in
+// error messages.
+//
+// Returns the index (release with fai_destroy()) or NULL on a read error,
+// a malformed line, or memory exhaustion. Nothing is leaked on failure.
+static faidx_t *fai_read(FILE *fp, const char *fname)
+{
+    faidx_t *fai;
+    char *buf = NULL, *p;
+    int len, line_len, line_blen;
+#ifdef _WIN32
+    long offset;
+#else
+    long long offset;
+#endif
+
+    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
+    if (!fai) {
+        fprintf(stderr, "[fai_load] out of memory\n");
+        return NULL;
+    }
+    fai->hash = kh_init(s);
+    buf = (char*)calloc(0x10000, 1);
+    if (!fai->hash || !buf) {
+        fprintf(stderr, "[fai_load] out of memory\n");
+        goto fail;
+    }
+
+    while (fgets(buf, 0x10000, fp)) {
+        int n_fields;
+
+        // The name is the leading run of printable characters.
+        for (p = buf; *p && isgraph((unsigned char)*p); ++p);
+        // Terminate the name and step over the separator. When the name
+        // runs to the end of the line, stay on the terminator instead of
+        // moving past it.
+        if (*p) *p++ = 0;
+#ifdef _WIN32
+        n_fields = sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
+#else
+        n_fields = sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
+#endif
+        if (n_fields != 4) {
+            // Do not index a record from missing or stale values.
+            fprintf(stderr, "[fai_load] malformed index line in \"%s\"\n", fname);
+            goto fail;
+        }
+        if (fai_insert_index(fai, buf, len, line_len, line_blen, offset) != 0)
+            goto fail;
+    }
+    if (ferror(fp)) {
+        fprintf(stderr, "[fai_load] error while reading \"%s\": %s\n", fname, strerror(errno));
+        goto fail;
+    }
+
+    free(buf);
+    return fai;
+
+fail:
+    free(buf);
+    fai_destroy(fai);
+    return NULL;
+}
+
+// Releases an index: every stored name, the name array, the hash, the
+// FASTA handle if one is attached, and the structure itself.
+// Passing NULL is a no-op, as with free().
+void fai_destroy(faidx_t *fai)
+{
+    int i;
+
+    if (!fai) return;
+
+    // The names double as hash keys, so free them before the hash goes.
+    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
+    free(fai->name);
+    kh_destroy(s, fai->hash);
+    if (fai->bgzf) bgzf_close(fai->bgzf);
+    free(fai);
+}
+
+// Builds "<fn>.fai" for the FASTA file fn. For a compressed input it also
+// writes the "<fn>.gzi" block index.
+//
+// Returns 0 on success and -1 on any failure. Every resource acquired here
+// is released on all paths.
+int fai_build(const char *fn)
+{
+    char *str;
+    BGZF *bgzf = NULL;
+    FILE *fp;
+    faidx_t *fai = NULL;
+    int ret = -1;
+
+    str = (char*)calloc(strlen(fn) + 5, 1); // ".fai" plus the terminator
+    if (!str) {
+        fprintf(stderr, "[fai_build] out of memory\n");
+        return -1;
+    }
+    sprintf(str, "%s.fai", fn);
+
+    bgzf = bgzf_open(fn, "r");
+    if ( !bgzf ) {
+        fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
+        goto done;
+    }
+    if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);
+
+    fai = fai_build_core(bgzf);
+    if ( !fai )
+    {
+        if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n");
+        goto done;
+    }
+
+    if ( bgzf->is_compressed ) {
+        if (bgzf_index_dump(bgzf, fn, ".gzi") < 0) {
+            fprintf(stderr, "[fai_build] fail to make bgzf index %s.gzi\n", fn);
+            goto done; // the handle is still open and is closed below
+        }
+    }
+
+    {
+        // Clear the pointer first so the cleanup code does not close the
+        // handle a second time.
+        int close_ret = bgzf_close(bgzf);
+        bgzf = NULL;
+        if (close_ret < 0) {
+            fprintf(stderr, "[fai_build] Error on closing %s\n", fn);
+            goto done;
+        }
+    }
+
+    fp = fopen(str, "wb");
+    if ( !fp ) {
+        fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
+        goto done;
+    }
+    fai_save(fai, fp);
+    {
+        // Buffered write errors may only show up at fclose().
+        int write_failed = ferror(fp);
+        if (fclose(fp) != 0) write_failed = 1;
+        if (write_failed) {
+            fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
+            goto done;
+        }
+    }
+    ret = 0;
+
+done:
+    if (bgzf) bgzf_close(bgzf);
+    if (fai) fai_destroy(fai);
+    free(str);
+    return ret;
+}
+
+// Opens a local copy of a remote file for reading, downloading it first
+// if needed.
+//
+// The local file name is the part of 'fn' after the last '/', looked up in
+// the current working directory. If that file already exists it is opened
+// and returned as is. Otherwise the URL is fetched with hopen()/hread()
+// into that file, which is then reopened for reading.
+//
+// Returns NULL if the remote file cannot be opened, the local file cannot
+// be created, or the transfer fails. A partially written local file is
+// removed so that a later call does not mistake it for a complete copy.
+static FILE *download_and_open(const char *fn)
+{
+    const int buf_size = 1 * 1024 * 1024;
+    uint8_t *buf;
+    FILE *fp;
+    hFILE *fp_remote;
+    const char *url = fn;
+    const char *slash = strrchr(fn, '/');
+    int n = 0, failed = 0;
+
+    if (slash) fn = slash + 1;
+
+    // First try to open a local copy
+    fp = fopen(fn, "r");
+    if (fp)
+        return fp;
+
+    // If failed, download from remote and open
+    fp_remote = hopen(url, "rb");
+    if (fp_remote == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
+        return NULL;
+    }
+    if ((fp = fopen(fn, "wb")) == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
+        hclose_abruptly(fp_remote);
+        return NULL;
+    }
+
+    buf = (uint8_t*)calloc(buf_size, 1);
+    if (buf == NULL) {
+        fprintf(stderr, "[download_from_remote] out of memory\n");
+        failed = 1;
+    } else {
+        while ((n = hread(fp_remote, buf, buf_size)) > 0) {
+            if (fwrite(buf, 1, n, fp) != (size_t)n) {
+                failed = 1; // short write, e.g. disk full
+                break;
+            }
+        }
+        if (n < 0) failed = 1; // read error on the remote stream
+        free(buf);
+    }
+    if (fclose(fp) != 0) failed = 1;
+
+    if (failed) {
+        fprintf(stderr, "[download_from_remote] fail to download remote file %s\n", url);
+        hclose_abruptly(fp_remote);
+        remove(fn);
+        return NULL;
+    }
+
+    if (hclose(fp_remote) != 0)
+        fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url);
+
+    return fopen(fn, "r");
+}
+
+// Loads the index "<fn>.fai" and opens the FASTA file fn itself.
+//
+// A remote index is fetched through download_and_open(). A missing local
+// index is built with fai_build() and then read back. For a compressed
+// FASTA the "<fn>.gzi" block index is loaded as well.
+//
+// Returns the index (release with fai_destroy()) or NULL on failure.
+faidx_t *fai_load(const char *fn)
+{
+    char *str;
+    FILE *fp;
+    faidx_t *fai;
+
+    str = (char*)calloc(strlen(fn) + 5, 1); // ".fai" plus the terminator
+    if (!str) {
+        fprintf(stderr, "[fai_load] out of memory\n");
+        return NULL;
+    }
+    sprintf(str, "%s.fai", fn);
+
+    if (hisremote(str))
+    {
+        fp = download_and_open(str);
+        if ( !fp )
+        {
+            fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
+            free(str);
+            return NULL;
+        }
+    }
+    else
+        fp = fopen(str, "rb");
+
+    if (fp == NULL) {
+        // No index yet: build it next to the FASTA file and reopen it.
+        fprintf(stderr, "[fai_load] build FASTA index.\n");
+        if (fai_build(fn) < 0) {
+            free(str);
+            return NULL;
+        }
+        fp = fopen(str, "rb");
+        if (fp == NULL) {
+            fprintf(stderr, "[fai_load] failed to open FASTA index: %s\n", strerror(errno));
+            free(str);
+            return NULL;
+        }
+    }
+
+    fai = fai_read(fp, str);
+    fclose(fp);
+    free(str);
+    if (fai == NULL) {
+        return NULL;
+    }
+
+    fai->bgzf = bgzf_open(fn, "rb");
+    if (fai->bgzf == NULL) {
+        fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
+        fai_destroy(fai); // the parsed index must not be leaked
+        return NULL;
+    }
+    if ( fai->bgzf->is_compressed==1 )
+    {
+        if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 )
+        {
+            fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn);
+            fai_destroy(fai);
+            return NULL;
+        }
+    }
+    return fai;
+}
+
+// Fetches the sequence described by a region string such as "name",
+// "name:beg" or "name:beg-end". Whitespace anywhere and commas inside the
+// numbers are ignored, and beg is 1-based.
+//
+// If the text after the last ':' is not a valid interval, or the part
+// before it is not a known sequence, the whole string is tried as the
+// sequence name.
+//
+// Returns a malloc()ed, NUL-terminated string that the caller frees, with
+// *len set to its length. On failure returns NULL with *len set to:
+//    0  a string containing ':' matched no sequence under either reading
+//   -2  a string without a usable ':' matched no sequence
+//   -1  the seek failed or memory ran out
+//
+// NOTE(review): val.line_blen is used as a divisor and comes straight from
+// the index; a record with line_blen == 0 is not guarded against here.
+char *fai_fetch(const faidx_t *fai, const char *str, int *len)
+{
+    char *s;
+    int c, i, l, k, name_end;
+    khiter_t iter;
+    faidx1_t val;
+    khash_t(s) *h;
+    int beg, end;
+
+    beg = end = -1;
+    h = fai->hash;
+    name_end = l = strlen(str);
+    s = (char*)malloc(l+1);
+    if (!s) {
+        *len = -1;
+        return NULL;
+    }
+    // remove space
+    for (i = k = 0; i < l; ++i)
+        if (!isspace((unsigned char)str[i])) s[k++] = str[i];
+    s[k] = 0; l = k;
+    // determine the sequence name
+    for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+    if (i >= 0) name_end = i;
+    if (name_end < l) { // check if this is really the end
+        int n_hyphen = 0;
+        for (i = name_end + 1; i < l; ++i) {
+            if (s[i] == '-') ++n_hyphen;
+            else if (!isdigit((unsigned char)s[i]) && s[i] != ',') break;
+        }
+        if (i < l || n_hyphen > 1) name_end = l; // malformed region string; then take str as the name
+        s[name_end] = 0;
+        iter = kh_get(s, h, s);
+        if (iter == kh_end(h)) { // cannot find the sequence name
+            iter = kh_get(s, h, str); // try str as the name
+            if (iter == kh_end(h)) {
+                *len = 0;
+                free(s); return 0;
+            } else s[name_end] = ':', name_end = l;
+        }
+    } else iter = kh_get(s, h, str);
+    if(iter == kh_end(h)) {
+        fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
+        free(s);
+        *len = -2;
+        return 0;
+    }
+    val = kh_value(h, iter);
+    // parse the interval
+    if (name_end < l) {
+        // strip the commas in place, then read "beg[-end]"
+        for (i = k = name_end + 1; i < l; ++i)
+            if (s[i] != ',') s[k++] = s[i];
+        s[k] = 0;
+        beg = atoi(s + name_end + 1);
+        for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+        end = i < k? atoi(s + i + 1) : val.len;
+        if (beg > 0) --beg; // to 0-based
+    } else beg = 0, end = val.len;
+    // clamp to the sequence
+    if (beg >= val.len) beg = val.len;
+    if (end >= val.len) end = val.len;
+    if (beg > end) beg = end;
+    free(s);
+
+    // now retrieve the sequence
+    int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
+    if ( ret<0 )
+    {
+        *len = -1;
+        fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
+        return NULL;
+    }
+    l = 0;
+    s = (char*)malloc(end - beg + 2);
+    if (!s) {
+        *len = -1;
+        return NULL;
+    }
+    // copy printable characters only, which skips the line breaks
+    while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg )
+        if (isgraph(c)) s[l++] = c;
+    s[l] = '\0';
+    *len = l;
+    return s;
+}
+
+// Number of sequences held in the index (same value as faidx_nseq()).
+int faidx_fetch_nseq(const faidx_t *fai)
+{
+    const int n_seq = fai->n;
+
+    return n_seq;
+}
+
+// Number of sequences held in the index.
+int faidx_nseq(const faidx_t *fai)
+{
+    const int n_seq = fai->n;
+
+    return n_seq;
+}
+
+// Name of the i-th sequence in insertion order. No bounds check is done
+// on i. The string belongs to the index.
+const char *faidx_iseq(const faidx_t *fai, int i)
+{
+    char *const *names = fai->name;
+
+    return names[i];
+}
+
+// Length recorded for the named sequence, or -1 if the name is unknown.
+int faidx_seq_len(const faidx_t *fai, const char *seq)
+{
+    khint_t slot = kh_get(s, fai->hash, seq);
+
+    if ( slot != kh_end(fai->hash) )
+        return kh_val(fai->hash, slot).len;
+
+    return -1;
+}
+
+// Fetches the 0-based, inclusive range [p_beg_i, p_end_i] of sequence
+// c_name. Out-of-range coordinates are clamped to the sequence.
+//
+// Returns a malloc()ed, NUL-terminated string that the caller frees, with
+// *len set to its length. On failure returns NULL with *len set to -2 if
+// the sequence is unknown, or -1 if the seek failed or memory ran out.
+//
+// NOTE(review): for a record whose length is 0 the clamping below yields
+// -1 for both coordinates; confirm whether callers can hit this.
+char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
+{
+    int l, c;
+    khiter_t iter;
+    faidx1_t val;
+    char *seq=NULL;
+
+    // Adjust position
+    iter = kh_get(s, fai->hash, c_name);
+    if (iter == kh_end(fai->hash))
+    {
+        *len = -2;
+        fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
+        return NULL;
+    }
+    val = kh_value(fai->hash, iter);
+    if(p_end_i < p_beg_i) p_beg_i = p_end_i;
+    if(p_beg_i < 0) p_beg_i = 0;
+    else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
+    if(p_end_i < 0) p_end_i = 0;
+    else if(val.len <= p_end_i) p_end_i = val.len - 1;
+
+    // Now retrieve the sequence
+    int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
+    if ( ret<0 )
+    {
+        *len = -1;
+        fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
+        return NULL;
+    }
+    l = 0;
+    seq = (char*)malloc(p_end_i - p_beg_i + 2);
+    if (!seq) {
+        *len = -1;
+        return NULL;
+    }
+    // copy printable characters only, which skips the line breaks
+    while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
+        if (isgraph(c)) seq[l++] = c;
+    seq[l] = '\0';
+    *len = l;
+    return seq;
+}
+
+// Returns 1 if the index contains a sequence with this name, 0 otherwise.
+int faidx_has_seq(const faidx_t *fai, const char *seq)
+{
+    khiter_t slot = kh_get(s, fai->hash, seq);
+
+    return slot != kh_end(fai->hash) ? 1 : 0;
+}
+
diff --git a/htslib/hfile.c b/htslib/hfile.c
new file mode 100644
index 0000000..e8b05fc
--- /dev/null
+++ b/htslib/hfile.c
@@ -0,0 +1,751 @@
+/* hfile.c -- buffered low-level input/output streams.
+
+ Copyright (C) 2013-2016 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <pthread.h>
+
+#include "htslib/hfile.h"
+#include "hfile_internal.h"
+
+#ifndef ENOTSUP
+#define ENOTSUP EINVAL
+#endif
+#ifndef EOVERFLOW
+#define EOVERFLOW ERANGE
+#endif
+#ifndef EPROTONOSUPPORT
+#define EPROTONOSUPPORT ENOSYS
+#endif
+
+/* hFILE fields are used as follows:
+
+ char *buffer; // Pointer to the start of the I/O buffer
+ char *begin; // First not-yet-read character / unused position
+ char *end; // First unfilled/unfillable position
+ char *limit; // Pointer to the first position past the buffer
+
+ const hFILE_backend *backend; // Methods to refill/flush I/O buffer
+
+ off_t offset; // Offset within the stream of buffer position 0
+ unsigned at_eof:1;// For reading, whether EOF has been seen
+ int has_errno; // Error number from the last failure on this stream
+
+For reading, begin is the first unread character in the buffer and end is the
+first unfilled position:
+
+ -----------ABCDEFGHIJKLMNO---------------
+ ^buffer ^begin ^end ^limit
+
+For writing, begin is the first unused position and end is unused so remains
+equal to buffer:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ---------------
+ ^buffer ^begin ^limit
+ ^end
+
+Thus if begin > end then there is a non-empty write buffer, if begin < end
+then there is a non-empty read buffer, and if begin == end then both buffers
+are empty. In all cases, the stream's file position indicator corresponds
+to the position pointed to by begin. */
+
+/* Allocates a struct of struct_size bytes (an hFILE or a backend struct that
+   starts with one) together with its I/O buffer, and resets the buffer
+   pointers, offset, EOF flag and error number.  The backend pointer is NOT
+   set here.  A capacity of 0 selects the 32768-byte default.  Returns NULL
+   if either allocation fails. */
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
+{
+    hFILE *stream = (hFILE *) malloc(struct_size);
+    if (stream == NULL) return NULL;
+
+    if (capacity == 0) capacity = 32768;
+    // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory
+    if (strchr(mode, 'r') && capacity > 32768) capacity = 32768;
+
+    stream->buffer = (char *) malloc(capacity);
+    if (stream->buffer == NULL) {
+        // hfile_destroy() releases the struct and keeps errno intact
+        hfile_destroy(stream);
+        return NULL;
+    }
+
+    stream->begin = stream->buffer;
+    stream->end = stream->buffer;
+    stream->limit = stream->buffer + capacity;
+
+    stream->offset = 0;
+    stream->at_eof = 0;
+    stream->has_errno = 0;
+    return stream;
+}
+
+/* Frees the stream's buffer and the stream struct itself.  Accepts NULL,
+   and leaves errno exactly as it was on entry. */
+void hfile_destroy(hFILE *fp)
+{
+    const int saved_errno = errno;
+    if (fp != NULL) {
+        free(fp->buffer);
+        free(fp);
+    }
+    errno = saved_errno;
+}
+
+/* While writing, end stays at the buffer start and begin advances, so
+   pending output exists exactly when begin lies past end. */
+static inline int writebuffer_is_nonempty(hFILE *fp)
+{
+    return fp->end < fp->begin;
+}
+
+/* Tops up the read buffer with a single backend read, which may leave the
+   buffer only partially filled.  Returns the count of newly read characters
+   (0 at EOF or when the buffer is already full), or a negative value on
+   error, in which case has_errno records errno. */
+static ssize_t refill_buffer(hFILE *fp)
+{
+    ssize_t nread = 0;
+
+    // Slide any unread characters down to the front of the buffer
+    if (fp->begin > fp->buffer) {
+        const size_t unread = fp->end - fp->begin;
+        fp->offset += fp->begin - fp->buffer;
+        memmove(fp->buffer, fp->begin, unread);
+        fp->begin = fp->buffer;
+        fp->end = fp->buffer + unread;
+    }
+
+    // Fill the free space at fp->[end,limit), unless at EOF or already full
+    if (!fp->at_eof && fp->end != fp->limit) {
+        nread = fp->backend->read(fp, fp->end, fp->limit - fp->end);
+        if (nread < 0) { fp->has_errno = errno; return nread; }
+        if (nread == 0) fp->at_eof = 1;
+    }
+
+    fp->end += nread;
+    return nread;
+}
+
+/* Slow path of hgetc(), entered only when the read buffer is empty.
+   Returns the next character as an unsigned char value, or EOF when the
+   refill produced nothing or failed. */
+int hgetc2(hFILE *fp)
+{
+    if (refill_buffer(fp) <= 0) return EOF;
+    return (unsigned char) *(fp->begin++);
+}
+
+/* Copies up to nbytes of upcoming data into buffer without consuming it.
+   Refills until enough is buffered or a refill adds nothing.  Returns the
+   number of bytes copied, or a negative value if a refill failed. */
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
+{
+    size_t avail = fp->end - fp->begin;
+
+    while (avail < nbytes) {
+        ssize_t got = refill_buffer(fp);
+        if (got < 0) return got;
+        if (got == 0) break;
+        avail += got;
+    }
+
+    const size_t ncopy = (avail > nbytes) ? nbytes : avail;
+    memcpy(buffer, fp->begin, ncopy);
+    return ncopy;
+}
+
+/* Slow path of hread(): on entry the buffer is empty and nread bytes have
+   already been delivered to destv.  Returns the total number of bytes now
+   in destv, or a negative value on a backend error. */
+ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread)
+{
+    const size_t capacity = fp->limit - fp->buffer;
+    char *out = (char *) destv + nread;
+    size_t wanted = nbytes - nread;
+
+    // Requests of at least half a buffer bypass it and land in destv directly
+    while (wanted * 2 >= capacity && !fp->at_eof) {
+        ssize_t got = fp->backend->read(fp, out, wanted);
+        if (got < 0) { fp->has_errno = errno; return got; }
+        if (got == 0) fp->at_eof = 1;
+        fp->offset += got;
+        out += got;
+        wanted -= got;
+        nread += got;
+    }
+
+    // Whatever is left is small: serve it through the buffer
+    while (wanted > 0 && !fp->at_eof) {
+        ssize_t got = refill_buffer(fp);
+        if (got < 0) return got;
+
+        size_t chunk = fp->end - fp->begin;
+        if (chunk > wanted) chunk = wanted;
+        memcpy(out, fp->begin, chunk);
+        fp->begin += chunk;
+        out += chunk;
+        wanted -= chunk;
+        nread += chunk;
+    }
+
+    return nread;
+}
+
+/* Pushes the pending output, fp->[buffer,begin), through the backend,
+   repeating after short writes.  Returns 0 once everything is written, or
+   the backend's negative result on failure (has_errno records errno). */
+static ssize_t flush_buffer(hFILE *fp)
+{
+    const char *cursor;
+
+    for (cursor = fp->buffer; cursor < fp->begin; ) {
+        ssize_t nwritten = fp->backend->write(fp, cursor, fp->begin - cursor);
+        if (nwritten < 0) { fp->has_errno = errno; return nwritten; }
+        cursor += nwritten;
+        fp->offset += nwritten;
+    }
+
+    fp->begin = fp->buffer;  // Buffer is empty again
+    return 0;
+}
+
+/* Writes out buffered data and then invokes the backend's own flush
+   method when it provides one.  Returns 0 on success or EOF on failure. */
+int hflush(hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+    if (fp->backend->flush == NULL) return 0;
+
+    if (fp->backend->flush(fp) < 0) {
+        fp->has_errno = errno;
+        return EOF;
+    }
+    return 0;
+}
+
+/* Slow path of hputc(), entered only when the write buffer is full:
+   empties the buffer, then stores c.  Returns c, or EOF if flushing failed. */
+int hputc2(int c, hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+
+    *fp->begin = c;
+    fp->begin++;
+    return c;
+}
+
+/* Slow path of hwrite() and hputs2(): on entry the buffer is full and the
+   first ncopied source bytes are already in it.  Returns totalbytes on
+   success or a negative value on a backend error. */
+ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied)
+{
+    const size_t capacity = fp->limit - fp->buffer;
+    const char *in = (const char *) srcv + ncopied;
+    size_t pending = totalbytes - ncopied;
+
+    ssize_t rc = flush_buffer(fp);
+    if (rc < 0) return rc;
+
+    // Anything at least half a buffer long goes straight to the backend
+    while (pending * 2 >= capacity) {
+        ssize_t nwritten = fp->backend->write(fp, in, pending);
+        if (nwritten < 0) { fp->has_errno = errno; return nwritten; }
+        fp->offset += nwritten;
+        in += nwritten;
+        pending -= nwritten;
+    }
+
+    // The short tail is simply stored in the (now empty) buffer
+    memcpy(fp->begin, in, pending);
+    fp->begin += pending;
+
+    return totalbytes;
+}
+
+/* Slow path of hputs(), entered only when the write buffer is full.
+   Maps hwrite2()'s result onto 0 for success and EOF for failure. */
+int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp)
+{
+    if (hwrite2(fp, text, totalbytes, ncopied) < 0) return EOF;
+    return 0;
+}
+
+/* Repositions the stream.  Pending output is flushed first; on success any
+   buffered input is discarded and the EOF flag is cleared.  Returns the
+   new absolute position, or a negative value on failure. */
+off_t hseek(hFILE *fp, off_t offset, int whence)
+{
+    if (writebuffer_is_nonempty(fp)) {
+        int rc = flush_buffer(fp);
+        if (rc < 0) return rc;
+    }
+
+    const off_t here = htell(fp);
+
+    // A relative offset refers to the hFILE's logical position, which can
+    // differ from the backend's physical position because of read-ahead,
+    // so turn it into an absolute request.
+    if (whence == SEEK_CUR) {
+        const off_t target = here + offset;
+        if (target < 0) {
+            // Negative offset reaching before the start of the file, or
+            // overflow caused by a positive offset
+            const int err = (offset < 0)? EINVAL : EOVERFLOW;
+            errno = err;
+            fp->has_errno = err;
+            return -1;
+        }
+
+        offset = target;
+        whence = SEEK_SET;
+    }
+
+    // TODO Avoid seeking if the desired position is within our read buffer
+
+    const off_t landed = fp->backend->seek(fp, offset, whence);
+    if (landed < 0) { fp->has_errno = errno; return landed; }
+
+    // The seek worked, so buffered input no longer matches the position
+    fp->begin = fp->buffer;
+    fp->end = fp->buffer;
+    fp->at_eof = 0;
+
+    fp->offset = landed;
+    return landed;
+}
+
+/* Flushes pending output, closes the backend and frees the stream.
+   Returns 0 if the stream never recorded an error; otherwise sets errno to
+   the most recent error and returns EOF.  fp is invalid afterwards either
+   way. */
+int hclose(hFILE *fp)
+{
+    int err = fp->has_errno;
+
+    if (writebuffer_is_nonempty(fp)) {
+        if (hflush(fp) < 0) err = fp->has_errno;
+    }
+    if (fp->backend->close(fp) < 0) err = errno;
+    hfile_destroy(fp);
+
+    if (err == 0) return 0;
+
+    errno = err;
+    return EOF;
+}
+
+/* Closes the backend and frees the stream without flushing.  Any failure
+   from the backend is ignored and errno is left as it was on entry. */
+void hclose_abruptly(hFILE *fp)
+{
+    const int saved_errno = errno;
+    const int rc = fp->backend->close(fp);
+    (void) rc;  /* Ignore subsequent errors */
+    hfile_destroy(fp);
+    errno = saved_errno;
+}
+
+
+/***************************
+ * File descriptor backend *
+ ***************************/
+
+#ifndef _WIN32
+#include <sys/socket.h>
+#include <sys/stat.h>
+#define HAVE_STRUCT_STAT_ST_BLKSIZE
+#else
+#include <winsock2.h>
+#define HAVE_CLOSESOCKET
+#define HAVE_SETMODE
+#endif
+#include <fcntl.h>
+#include <unistd.h>
+
+/* For Unix, it doesn't matter whether a file descriptor is a socket.
+ However Windows insists on send()/recv() and its own closesocket()
+ being used when fd happens to be a socket. */
+
+typedef struct {
+ hFILE base; // First member: the fd_* functions cast hFILE * to hFILE_fd *
+ int fd; // Descriptor handed to read/write/lseek/close
+ unsigned is_socket:1; // When set, recv()/send() (and closesocket()) are used
+} hFILE_fd;
+
+/* Backend read: recv() for sockets, read() otherwise, restarted for as
+   long as the call is interrupted by a signal (EINTR). */
+static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+
+    for (;;) {
+        ssize_t got = fp->is_socket? recv(fp->fd, buffer, nbytes, 0)
+                                   : read(fp->fd, buffer, nbytes);
+        if (got >= 0 || errno != EINTR) return got;
+    }
+}
+
+/* Backend write: send() for sockets, write() otherwise, restarted for as
+   long as the call is interrupted by a signal (EINTR). */
+static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+
+    for (;;) {
+        ssize_t put = fp->is_socket? send(fp->fd, buffer, nbytes, 0)
+                                   : write(fp->fd, buffer, nbytes);
+        if (put >= 0 || errno != EINTR) return put;
+    }
+}
+
+/* Backend seek: forwards straight to lseek() on the descriptor. */
+static off_t fd_seek(hFILE *fpv, off_t offset, int whence)
+{
+    return lseek(((hFILE_fd *) fpv)->fd, offset, whence);
+}
+
+/* Backend flush: syncs the descriptor with fdatasync() when available and
+   fsync() otherwise, retrying on EINTR.  EINVAL and ENOTSUP are reported
+   as success. */
+static int fd_flush(hFILE *fpv)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+
+    for (;;) {
+#ifdef HAVE_FDATASYNC
+        int rc = fdatasync(fp->fd);
+#else
+        int rc = fsync(fp->fd);
+#endif
+        if (rc >= 0) return rc;
+
+        // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe,
+        // and operation-not-supported errors (Mac OS X)
+        if (errno == EINVAL || errno == ENOTSUP) return 0;
+
+        if (errno != EINTR) return rc;
+    }
+}
+
+/* Backend close: closes the descriptor (with closesocket() for sockets
+   where HAVE_CLOSESOCKET is defined), retrying while the call reports EINTR. */
+static int fd_close(hFILE *fpv)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+
+    for (;;) {
+#ifdef HAVE_CLOSESOCKET
+        int rc = fp->is_socket? closesocket(fp->fd) : close(fp->fd);
+#else
+        int rc = close(fp->fd);
+#endif
+        if (rc >= 0 || errno != EINTR) return rc;
+    }
+}
+
+static const struct hFILE_backend fd_backend =
+{
+ fd_read, fd_write, fd_seek, fd_flush, fd_close // Positional: read, write, seek, flush, close
+};
+
+/* Returns the st_blksize that fstat() reports for fd, for use as a buffer
+   size hint, or 0 when fstat() fails or the field is not available. */
+static size_t blksize(int fd)
+{
+#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
+    struct stat info;
+    if (fstat(fd, &info) == 0)
+        return info.st_blksize;
+#endif
+    return 0;
+}
+
+/* Opens filename with open(2), using flags derived from mode and creation
+   permissions 0666, and wraps the descriptor in an hFILE.  Returns NULL
+   with errno set on failure; a descriptor already opened is closed again. */
+static hFILE *hopen_fd(const char *filename, const char *mode)
+{
+    const int fd = open(filename, hfile_oflags(mode), 0666);
+    if (fd < 0) return NULL;
+
+    hFILE_fd *fp = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+    if (fp == NULL) {
+        // Keep hfile_init()'s errno visible to the caller across close()
+        const int saved_errno = errno;
+        (void) close(fd);
+        errno = saved_errno;
+        return NULL;
+    }
+
+    fp->base.backend = &fd_backend;
+    fp->fd = fd;
+    fp->is_socket = 0;
+    return &fp->base;
+}
+
+/* Wraps an already-open descriptor in an hFILE.  An 's' in mode marks the
+   descriptor as a socket.  Returns NULL if the stream cannot be allocated;
+   fd itself is left open in that case. */
+hFILE *hdopen(int fd, const char *mode)
+{
+    hFILE_fd *fp = (hFILE_fd*) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+    if (fp == NULL) return NULL;
+
+    fp->base.backend = &fd_backend;
+    fp->fd = fd;
+    fp->is_socket = (strchr(mode, 's') != NULL);
+    return &fp->base;
+}
+
+/* Handles "file:" URLs.  Only the forms "file://localhost/PATH" and
+   "file:///PATH" are accepted; the prefix is stripped up to, but not
+   including, the slash that starts the path.  Any other form fails with
+   EPROTONOSUPPORT. */
+static hFILE *hopen_fd_fileuri(const char *url, const char *mode)
+{
+    const char *path;
+
+    if (strncmp(url, "file://localhost/", 17) == 0) path = url + 16;
+    else if (strncmp(url, "file:///", 8) == 0) path = url + 7;
+    else { errno = EPROTONOSUPPORT; return NULL; }
+
+    return hopen_fd(path, mode);
+}
+
+/* Opens standard input when mode contains 'r' and standard output
+   otherwise, switching the descriptor to binary mode where setmode() and
+   O_BINARY are available. */
+static hFILE *hopen_fd_stdinout(const char *mode)
+{
+    const int reading = (strchr(mode, 'r') != NULL);
+    const int fd = reading ? STDIN_FILENO : STDOUT_FILENO;
+
+#if defined HAVE_SETMODE && defined O_BINARY
+    if (setmode(fd, O_BINARY) < 0) return NULL;
+#endif
+    return hdopen(fd, mode);
+}
+
+/* Converts an fopen()-style mode string into open(2) flags.  Characters
+   are processed left to right: 'r', 'w', 'a' and '+' each replace the
+   access mode, 'w'/'a' also add creation flags, 'e' and 'x' add O_CLOEXEC
+   and O_EXCL where those are defined, and anything else is ignored.
+   O_BINARY is always added on platforms that define it. */
+int hfile_oflags(const char *mode)
+{
+    int rw_mode = 0, extra = 0;
+    const char *p = mode;
+
+    while (*p != '\0') {
+        const char ch = *p++;
+
+        if (ch == 'r') rw_mode = O_RDONLY;
+        else if (ch == 'w') { rw_mode = O_WRONLY; extra |= O_CREAT | O_TRUNC; }
+        else if (ch == 'a') { rw_mode = O_WRONLY; extra |= O_CREAT | O_APPEND; }
+        else if (ch == '+') rw_mode = O_RDWR;
+#ifdef O_CLOEXEC
+        else if (ch == 'e') extra |= O_CLOEXEC;
+#endif
+#ifdef O_EXCL
+        else if (ch == 'x') extra |= O_EXCL;
+#endif
+    }
+
+#ifdef O_BINARY
+    extra |= O_BINARY;
+#endif
+
+    return rw_mode | extra;
+}
+
+
+/*********************
+ * In-memory backend *
+ *********************/
+
+typedef struct {
+ hFILE base; // First member: the mem_* functions cast hFILE * to hFILE_mem *
+ const char *buffer; // The data being read; hopen_mem() stores the caller's pointer, no copy is made
+ size_t length, pos; // Total size of the data, and the current read position within it
+} hFILE_mem;
+
+/* Backend read: copies up to nbytes from the current position, advancing
+   it.  Returns the number of bytes copied, which is 0 once the end of the
+   data has been reached. */
+static ssize_t mem_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_mem *fp = (hFILE_mem *) fpv;
+    const size_t remaining = fp->length - fp->pos;
+    const size_t ncopy = (nbytes > remaining) ? remaining : nbytes;
+
+    memcpy(buffer, fp->buffer + fp->pos, ncopy);
+    fp->pos += ncopy;
+    return ncopy;
+}
+
+/* Backend seek: moves the read position relative to the start, the
+   current position or the end of the data.  A target outside
+   [0, length], or an unknown whence, fails with EINVAL and returns -1. */
+static off_t mem_seek(hFILE *fpv, off_t offset, int whence)
+{
+    hFILE_mem *fp = (hFILE_mem *) fpv;
+    const size_t magnitude = (offset >= 0)? offset : -offset;
+    size_t base;
+
+    if (whence == SEEK_SET) base = 0;
+    else if (whence == SEEK_CUR) base = fp->pos;
+    else if (whence == SEEK_END) base = fp->length;
+    else { errno = EINVAL; return -1; }
+
+    // Moving backwards must not pass the start; forwards, not the end
+    const int out_of_range = (offset < 0) ? (magnitude > base)
+                                          : (magnitude > fp->length - base);
+    if (out_of_range) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    fp->pos = base + offset;
+    return fp->pos;
+}
+
+/* Backend close: does nothing and always reports success. */
+static int mem_close(hFILE *fpv)
+{
+    (void) fpv;
+    return 0;
+}
+
+static const struct hFILE_backend mem_backend =
+{
+ mem_read, NULL, mem_seek, NULL, mem_close // Positional: read, write, seek, flush, close; no write or flush method
+};
+
+/* Opens a read-only stream over the NUL-terminated text following an
+   optional "data:" prefix.  The text is referenced, not copied.  Modes
+   without 'r' fail with EINVAL. */
+static hFILE *hopen_mem(const char *data, const char *mode)
+{
+    hFILE_mem *fp;
+
+    if (strncmp(data, "data:", 5) == 0) data += 5;
+
+    // TODO Implement write modes, which will require memory allocation
+    if (strchr(mode, 'r') == NULL) { errno = EINVAL; return NULL; }
+
+    fp = (hFILE_mem *) hfile_init(sizeof (hFILE_mem), mode, 0);
+    if (fp == NULL) return NULL;
+
+    fp->base.backend = &mem_backend;
+    fp->buffer = data;
+    fp->length = strlen(data);
+    fp->pos = 0;
+    return &fp->base;
+}
+
+
+/*****************************************
+ * Plugin and hopen() backend dispatcher *
+ *****************************************/
+
+#include <ctype.h>
+
+#include "hts_internal.h"
+#include "htslib/khash.h"
+
+KHASH_MAP_INIT_STR(scheme_string, const struct hFILE_scheme_handler *);
+static khash_t(scheme_string) *schemes = NULL; // Scheme name -> handler; NULL until load_hfile_plugins() has run
+
+struct hFILE_plugin_list {
+ struct hFILE_plugin plugin;
+ struct hFILE_plugin_list *next; // Singly-linked; new entries are pushed at the head
+};
+
+static struct hFILE_plugin_list *plugins = NULL; // Successfully initialised plugins
+static pthread_mutex_t plugins_lock = PTHREAD_MUTEX_INITIALIZER; // Held during first-use loading and during hfile_exit()
+
+/* atexit() handler: destroys the scheme table, then for each registered
+   plugin runs its destroy hook (if any), unloads it when built with
+   ENABLE_PLUGINS, and frees its list node.  Finally destroys the lock. */
+static void hfile_exit()
+{
+    struct hFILE_plugin_list *node;
+
+    pthread_mutex_lock(&plugins_lock);
+
+    kh_destroy(scheme_string, schemes);
+
+    while ((node = plugins) != NULL) {
+        if (node->plugin.destroy != NULL) node->plugin.destroy();
+#ifdef ENABLE_PLUGINS
+        if (node->plugin.obj != NULL) close_plugin(node->plugin.obj);
+#endif
+        plugins = node->next;
+        free(node);
+    }
+
+    pthread_mutex_unlock(&plugins_lock);
+    pthread_mutex_destroy(&plugins_lock);
+}
+
+/* Registers handler for the URL scheme named by scheme.  If the scheme
+   already has a handler, it is replaced only when the new handler's
+   priority is strictly higher.  If the table cannot accept the new key
+   (kh_put() reports failure), nothing is registered and a warning is
+   printed when hts_verbose is at least 2. */
+void hfile_add_scheme_handler(const char *scheme,
+                              const struct hFILE_scheme_handler *handler)
+{
+    int absent;
+    khint_t k = kh_put(scheme_string, schemes, scheme, &absent);
+    if (absent < 0) {
+        // kh_put() failed, so k does not identify a usable slot; storing
+        // through kh_value() here would write outside the table.
+        if (hts_verbose >= 2)
+            fprintf(stderr, "[W::hfile_add_scheme_handler] "
+                    "couldn't register handler for scheme \"%s\"\n", scheme);
+        return;
+    }
+    if (absent || handler->priority > kh_value(schemes, k)->priority) {
+        kh_value(schemes, k) = handler;
+    }
+}
+
+/* Creates a plugin record, hands it to the plugin's init function, and on
+   success pushes it onto the plugins list.  Returns 0 on success; if init
+   returns non-zero the record is freed and that value is returned.
+   Aborts the program if the record cannot be allocated. */
+static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *),
+                           const char *pluginname)
+{
+    struct hFILE_plugin_list *entry = malloc (sizeof (struct hFILE_plugin_list));
+    if (entry == NULL) abort();
+
+    entry->plugin.api_version = 1;
+    entry->plugin.obj = obj;
+    entry->plugin.name = NULL;
+    entry->plugin.destroy = NULL;
+
+    const int status = (*init)(&entry->plugin);
+
+    if (status == 0) {
+        if (hts_verbose >= 5)
+            fprintf(stderr, "[M::load_hfile_plugins] loaded \"%s\"\n", pluginname);
+
+        entry->next = plugins;
+        plugins = entry;
+        return 0;
+    }
+
+    if (hts_verbose >= 4)
+        fprintf(stderr, "[W::load_hfile_plugins] "
+                "initialisation failed for plugin \"%s\": %d\n",
+                pluginname, status);
+    free(entry);
+    return status;
+}
+
+static void load_hfile_plugins() // Builds the scheme table and initialises every available plugin; its caller in this file holds plugins_lock
+{
+ static const struct hFILE_scheme_handler
+ data = { hopen_mem, hfile_always_local, "built-in", 80 },
+ file = { hopen_fd_fileuri, hfile_always_local, "built-in", 80 };
+
+ schemes = kh_init(scheme_string);
+ if (schemes == NULL) abort();
+
+ hfile_add_scheme_handler("data", &data); // Built-in handlers are registered first, at priority 80
+ hfile_add_scheme_handler("file", &file);
+ init_add_plugin(NULL, hfile_plugin_init_net, "knetfile"); // Always compiled in, never a separate plugin
+
+#ifdef ENABLE_PLUGINS
+ struct hts_path_itr path;
+ const char *pluginname;
+ hts_path_itr_setup(&path, NULL, NULL, "hfile_", 6, NULL, 0);
+ while ((pluginname = hts_path_itr_next(&path)) != NULL) {
+ void *obj;
+ int (*init)(struct hFILE_plugin *) = (int (*)(struct hFILE_plugin *))
+ load_plugin(&obj, pluginname, "hfile_plugin_init");
+
+ if (init) {
+ if (init_add_plugin(obj, init, pluginname) != 0)
+ close_plugin(obj); // Initialisation failed, so unload the plugin again
+ }
+ }
+#else
+
+#ifdef HAVE_IRODS
+ init_add_plugin(NULL, hfile_plugin_init_irods, "iRODS");
+#endif
+#ifdef HAVE_LIBCURL
+ init_add_plugin(NULL, hfile_plugin_init_libcurl, "libcurl");
+#endif
+
+#endif
+
+ // In the unlikely event atexit() fails, it's better to succeed here and
+ // carry on; then eventually when the program exits, we'll merely close
+ // down the plugins uncleanly, as if we had aborted.
+ (void) atexit(hfile_exit);
+}
+
+/* A name such as "foo:bar" with an unrecognised scheme is either an
+   ordinary file or a sign of a missing or broken plugin.  It is tried as
+   an ordinary file; if no such file exists, errno is changed from ENOENT
+   to EPROTONOSUPPORT so that the plugin problem is apparent. */
+static hFILE *hopen_unknown_scheme(const char *fname, const char *mode)
+{
+    hFILE *fp = hopen_fd(fname, mode);
+    if (fp != NULL) return fp;
+
+    if (errno == ENOENT) errno = EPROTONOSUPPORT;
+    return NULL;
+}
+
+/* Returns the appropriate handler, or NULL if the string isn't an URL.
+
+   A scheme is 1 to 11 characters drawn from letters, digits, '+', '-' and
+   '.', followed by ':'; it is lower-cased before lookup.  A well-formed
+   scheme with no registered handler yields the built-in "unknown scheme"
+   handler rather than NULL.  The first call loads the plugins. */
+static const struct hFILE_scheme_handler *find_scheme_handler(const char *s)
+{
+    static const struct hFILE_scheme_handler unknown_scheme =
+        { hopen_unknown_scheme, hfile_always_local, "built-in", 0 };
+
+    char scheme[12];
+    size_t i;
+
+    for (i = 0; i < sizeof scheme; i++) {
+        // <ctype.h> functions require an unsigned char value (or EOF);
+        // passing a negative plain char is undefined behaviour.
+        const unsigned char ch = (unsigned char) s[i];
+
+        if (isalnum(ch) || ch == '+' || ch == '-' || ch == '.')
+            scheme[i] = (char) tolower(ch);
+        else if (ch == ':') break;
+        else return NULL;
+    }
+
+    // Reject an empty scheme and one too long to fit with its terminator
+    if (i == 0 || i >= sizeof scheme) return NULL;
+    scheme[i] = '\0';
+
+    pthread_mutex_lock(&plugins_lock);
+    if (! schemes) load_hfile_plugins();
+    pthread_mutex_unlock(&plugins_lock);
+
+    khint_t k = kh_get(scheme_string, schemes, scheme);
+    return (k != kh_end(schemes))? kh_value(schemes, k) : &unknown_scheme;
+}
+
+/* Opens fname: through its scheme handler when it looks like a URL, as
+   standard input/output when it is exactly "-", and as an ordinary file
+   otherwise. */
+hFILE *hopen(const char *fname, const char *mode)
+{
+    const struct hFILE_scheme_handler *handler = find_scheme_handler(fname);
+
+    if (handler != NULL) return handler->open(fname, mode);
+    if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode);
+    return hopen_fd(fname, mode);
+}
+
+/* Ready-made isremote() answers: "local" is 0, "remote" is 1; the name
+   passed in is not examined. */
+int hfile_always_local (const char *fname)
+{
+    (void) fname;
+    return 0;
+}
+
+int hfile_always_remote(const char *fname)
+{
+    (void) fname;
+    return 1;
+}
+
+/* Returns the scheme handler's verdict on whether fname denotes remote
+   storage, or 0 when fname is not a URL. */
+int hisremote(const char *fname)
+{
+    const struct hFILE_scheme_handler *handler = find_scheme_handler(fname);
+    if (handler == NULL) return 0;
+    return handler->isremote(fname);
+}
diff --git a/htslib/hfile_internal.h b/htslib/hfile_internal.h
new file mode 100644
index 0000000..47b340a
--- /dev/null
+++ b/htslib/hfile_internal.h
@@ -0,0 +1,139 @@
+/* hfile_internal.h -- internal parts of low-level input/output streams.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HFILE_INTERNAL_H
+#define HFILE_INTERNAL_H
+
+#include "htslib/hfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct hFILE_backend {
+ /* As per read(2), returning the number of bytes read (possibly 0) or
+ negative (and setting errno) on errors. Front-end code will call this
+ repeatedly if necessary to attempt to get the desired byte count.
+ The front end in hfile.c takes a return of 0 as end-of-file. */
+ ssize_t (*read)(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+ /* As per write(2), returning the number of bytes written or negative (and
+ setting errno) on errors. Front-end code will call this repeatedly if
+ necessary until the desired block is written or an error occurs.
+ The built-in in-memory backend, which only reads, leaves this NULL. */
+ ssize_t (*write)(hFILE *fp, const void *buffer, size_t nbytes)
+ HTS_RESULT_USED;
+
+ /* As per lseek(2), returning the resulting offset within the stream or
+ negative (and setting errno) on errors. */
+ off_t (*seek)(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+ /* Performs low-level flushing, if any, e.g., fsync(2); for writing streams
+ only. Returns 0 for success or negative (and sets errno) on errors.
+ May be NULL, in which case hflush() skips this step. */
+ int (*flush)(hFILE *fp) HTS_RESULT_USED;
+
+ /* Closes the underlying stream (for output streams, the buffer will
+ already have been flushed), returning 0 for success or negative (and
+ setting errno) on errors, as per close(2). */
+ int (*close)(hFILE *fp) HTS_RESULT_USED;
+};
+
+/* May be called by hopen_*() functions to decode a fopen()-style mode into
+ open(2)-style flags. Unrecognised mode characters are ignored. */
+int hfile_oflags(const char *mode);
+
+/* Must be called by hopen_*() functions to allocate the hFILE struct and set
+ up its base. Capacity is a suggested buffer size (e.g., via fstat(2))
+ or 0 for a default-sized buffer. Returns NULL if allocation fails.
+ The backend pointer is not set; the caller must assign base.backend. */
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity);
+
+/* May be called by hopen_*() functions to undo the effects of hfile_init()
+ in the event opening the stream subsequently fails. (This is safe to use
+ even if fp is NULL. This takes care to preserve errno.) */
+void hfile_destroy(hFILE *fp);
+
+
+struct hFILE_scheme_handler {
+ /* Opens a stream when dispatched by hopen(); should call hfile_init()
+ to malloc a struct "derived" from hFILE and initialise it appropriately,
+ including setting base.backend to its own backend vector.
+ The filename is passed as given to hopen(), scheme prefix included. */
+ hFILE *(*open)(const char *filename, const char *mode) HTS_RESULT_USED;
+
+ /* Returns whether the URL denotes remote storage when dispatched by
+ hisremote(). For simple cases, use one of hfile_always_*() below. */
+ int (*isremote)(const char *filename) HTS_RESULT_USED;
+
+ /* The name of the plugin or other code providing this handler. */
+ const char *provider;
+
+ /* If multiple handlers are registered for the same scheme, the one with
+ the highest priority is used; range is 0 (lowest) to 100 (highest).
+ On equal priority the handler registered first is kept. */
+ int priority;
+};
+
+/* May be used as an isremote() function in simple cases:
+ hfile_always_local() returns 0 and hfile_always_remote() returns 1. */
+extern int hfile_always_local (const char *fname);
+extern int hfile_always_remote(const char *fname);
+
+/* Should be called by plugins for each URL scheme they wish to handle.
+ NOTE(review): the handler pointer is stored, not copied, and the scheme
+ string presumably is too, so both should outlive the registration. */
+void hfile_add_scheme_handler(const char *scheme,
+ const struct hFILE_scheme_handler *handler);
+
+struct hFILE_plugin {
+ /* On entry, HTSlib's plugin API version (currently 1). */
+ int api_version;
+
+ /* On entry, the plugin's handle as returned by dlopen() etc.
+ This is NULL for plugins compiled into the library. */
+ void *obj;
+
+ /* The plugin should fill this in with its (human-readable) name. */
+ const char *name;
+
+ /* The plugin may wish to fill in a function to be called on closing.
+ If set, hfile.c calls it from its exit-time cleanup. */
+ void (*destroy)(void);
+};
+
+#ifdef ENABLE_PLUGINS
+#define PLUGIN_GLOBAL(identifier,suffix) identifier
+
+/* Plugins must define an entry point with this signature.
+ With plugins enabled, PLUGIN_GLOBAL() drops the suffix, so every plugin
+ exports the same name. */
+extern int hfile_plugin_init(struct hFILE_plugin *self);
+
+#else
+#define PLUGIN_GLOBAL(identifier,suffix) identifier##suffix
+
+/* Only plugins distributed within the HTSlib source that might be built
+ even with --disable-plugins need to use PLUGIN_GLOBAL and be listed here;
+ others can simply define hfile_plugin_init().
+ Here PLUGIN_GLOBAL() pastes the suffix on, giving each entry point a
+ distinct name. */
+
+extern int hfile_plugin_init_irods(struct hFILE_plugin *self);
+extern int hfile_plugin_init_libcurl(struct hFILE_plugin *self);
+#endif
+
+/* This one is never built as a separate plugin. */
+extern int hfile_plugin_init_net(struct hFILE_plugin *self);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/hfile_irods.c b/htslib/hfile_irods.c
new file mode 100644
index 0000000..84f622d
--- /dev/null
+++ b/htslib/hfile_irods.c
@@ -0,0 +1,259 @@
+/* hfile_irods.c -- iRODS backend for low-level file streams.
+
+ Copyright (C) 2013, 2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "hfile_internal.h"
+#include "htslib/hts.h" // for hts_version() and hts_verbose
+#include "htslib/kstring.h"
+
+#include <rcConnect.h>
+#include <rcMisc.h>
+#include <dataObjOpen.h>
+#include <dataObjRead.h>
+#include <dataObjWrite.h>
+#include <dataObjLseek.h>
+#include <dataObjClose.h>
+
+typedef struct {
+ hFILE base; // First member: the irods_* functions cast hFILE * to hFILE_irods *
+ int descriptor; // Value returned by rcDataObjOpen(); passed as l1descInx in later calls
+} hFILE_irods;
+
+/* Maps selected iRODS status codes onto errno values; every status not
+   listed becomes EIO. */
+static int status_errno(int status)
+{
+    if (status == SYS_NO_API_PRIV) return EACCES;
+    if (status == SYS_MALLOC_ERR) return ENOMEM;
+    if (status == SYS_OUT_OF_FILE_DESC) return ENFILE;
+    if (status == SYS_BAD_FILE_DESCRIPTOR) return EBADF;
+    if (status == CAT_NO_ACCESS_PERMISSION) return EACCES;
+    if (status == CAT_INVALID_AUTHENTICATION) return EACCES;
+    if (status == CAT_NO_ROWS_FOUND) return ENOENT;
+    if (status == CATALOG_ALREADY_HAS_ITEM_BY_THAT_NAME) return EEXIST;
+    return EIO;
+}
+
+/* Sets errno from an iRODS status.  The last three decimal digits of the
+   status are used when they are non-zero (presumably iRODS places a
+   system errno there -- confirm against the iRODS documentation);
+   otherwise status_errno() supplies the value. */
+static void set_errno(int status)
+{
+    const int embedded = abs(status) % 1000;
+
+    if (embedded != 0) errno = embedded;
+    else errno = status_errno(status);
+}
+
+static struct {
+ rcComm_t *conn; // The single connection used by all iRODS streams; NULL until irods_init() succeeds
+ rodsEnv env; // Environment filled in by getRodsEnv() in irods_init()
+} irods = { NULL };
+
+/* Plugin destroy hook: disconnects from the iRODS server if a connection
+   is open and marks the connection as absent. */
+static void irods_exit()
+{
+    if (irods.conn != NULL)
+        (void) rcDisconnect(irods.conn);
+
+    irods.conn = NULL;
+}
+
+static int irods_init() // Connects and logs in to iRODS; returns 0, or -1 with errno set via set_errno()
+{
+ kstring_t useragent = { 0, 0, NULL };
+ struct sigaction pipehandler;
+ rErrMsg_t err;
+ int ret, pipehandler_ret;
+
+ if (hts_verbose >= 5) rodsLogLevel(hts_verbose);
+
+ ret = getRodsEnv(&irods.env);
+ if (ret < 0) goto error;
+
+ // Set iRODS User-Agent, if our caller hasn't already done so.
+ // (The final setenv() argument of 0 leaves an existing value untouched.)
+ kputs("htslib/", &useragent);
+ kputs(hts_version(), &useragent);
+ (void) setenv(SP_OPTION, useragent.s, 0);
+ free(useragent.s);
+
+ // Prior to iRODS 4.1, rcConnect() (even if it fails) installs its own
+ // SIGPIPE handler, which just prints a message and otherwise ignores the
+ // signal. Most actual SIGPIPEs encountered will pertain to e.g. stdout
+ // rather than iRODS's connection, so we save and restore the existing
+ // state (by default, termination; or as already set by our caller).
+ pipehandler_ret = sigaction(SIGPIPE, NULL, &pipehandler);
+
+ irods.conn = rcConnect(irods.env.rodsHost, irods.env.rodsPort,
+ irods.env.rodsUserName, irods.env.rodsZone,
+ NO_RECONN, &err);
+ if (pipehandler_ret == 0) sigaction(SIGPIPE, &pipehandler, NULL); // Restore only if the earlier query succeeded
+ if (irods.conn == NULL) { ret = err.status; goto error; }
+
+ if (strcmp(irods.env.rodsUserName, PUBLIC_USER_NAME) != 0) { // No login step for the public user
+#if defined IRODS_VERSION_INTEGER && IRODS_VERSION_INTEGER >= 4000000
+ ret = clientLogin(irods.conn, NULL, NULL);
+#else
+ ret = clientLogin(irods.conn);
+#endif
+ if (ret != 0) goto error;
+ }
+
+ return 0;
+
+error:
+ if (irods.conn) { (void) rcDisconnect(irods.conn); }
+ irods.conn = NULL; // Leaves the state such that the next hopen_irods() retries initialisation
+ set_errno(ret);
+ return -1;
+}
+
+/* Backend read: asks rcDataObjRead() for up to nbytes into buffer and
+   returns its result; errno is set when that result is negative. */
+static ssize_t irods_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_irods *fp = (hFILE_irods *) fpv;
+    openedDataObjInp_t request;
+    bytesBuf_t target;
+
+    memset(&request, 0, sizeof request);
+    request.l1descInx = fp->descriptor;
+    request.len = nbytes;
+
+    target.buf = buffer;
+    target.len = nbytes;
+
+    const int status = rcDataObjRead(irods.conn, &request, &target);
+    if (status < 0) set_errno(status);
+    return status;
+}
+
+/* Backend write: passes nbytes from buffer to rcDataObjWrite() and
+   returns its result; errno is set when that result is negative. */
+static ssize_t irods_write(hFILE *fpv, const void *buffer, size_t nbytes)
+{
+    hFILE_irods *fp = (hFILE_irods *) fpv;
+    openedDataObjInp_t request;
+    bytesBuf_t source;
+
+    memset(&request, 0, sizeof request);
+    request.l1descInx = fp->descriptor;
+    request.len = nbytes;
+
+    source.buf = (void *) buffer; // ...the iRODS API is not const-correct here
+    source.len = nbytes;
+
+    const int status = rcDataObjWrite(irods.conn, &request, &source);
+    if (status < 0) set_errno(status);
+    return status;
+}
+
+/* Backend seek: forwards to rcDataObjLseek().  Returns the offset the
+   server reports (or -1 if it reported none), and -1 with errno set when
+   the call fails.  The reply structure is freed here. */
+static off_t irods_seek(hFILE *fpv, off_t offset, int whence)
+{
+    hFILE_irods *fp = (hFILE_irods *) fpv;
+    openedDataObjInp_t request;
+    fileLseekOut_t *reply = NULL;
+    off_t result = -1;
+
+    memset(&request, 0, sizeof request);
+    request.l1descInx = fp->descriptor;
+    request.offset = offset;
+    request.whence = whence;
+
+    const int status = rcDataObjLseek(irods.conn, &request, &reply);
+
+    if (reply != NULL) {
+        result = reply->offset;
+        free(reply);
+    }
+    if (status < 0) { set_errno(status); return -1; }
+    return result;
+}
+
+/* Backend close: closes the data object with rcDataObjClose() and returns
+   its result; errno is set when that result is negative.  The shared
+   connection itself stays open. */
+static int irods_close(hFILE *fpv)
+{
+    hFILE_irods *fp = (hFILE_irods *) fpv;
+    openedDataObjInp_t request;
+
+    memset(&request, 0, sizeof request);
+    request.l1descInx = fp->descriptor;
+
+    const int status = rcDataObjClose(irods.conn, &request);
+    if (status < 0) set_errno(status);
+    return status;
+}
+
+static const struct hFILE_backend irods_backend =
+{
+ irods_read, irods_write, irods_seek, NULL, irods_close // Positional: read, write, seek, flush, close; no flush method
+};
+
+/* Opens an "irods:" URL as an hFILE.  The iRODS connection is set up on
+   first use.  Names without the "irods:" prefix fail with EINVAL.  On any
+   failure NULL is returned with errno set. */
+hFILE *hopen_irods(const char *filename, const char *mode)
+{
+    hFILE_irods *fp;
+    rodsPath_t path;
+    dataObjInp_t args;
+    int ret;
+
+    // Initialise the iRODS connection if this is the first use.
+    if (irods.conn == NULL && irods_init() < 0) return NULL;
+
+    if (strncmp(filename, "irods:", 6) != 0) { errno = EINVAL; return NULL; }
+    filename += 6;
+
+    fp = (hFILE_irods *) hfile_init(sizeof (hFILE_irods), mode, 0);
+    if (fp == NULL) return NULL;
+
+    // Copy with truncation; the terminator is written explicitly
+    strncpy(path.inPath, filename, MAX_NAME_LEN-1);
+    path.inPath[MAX_NAME_LEN-1] = '\0';
+
+    ret = parseRodsPath(&path, &irods.env);
+    if (ret >= 0) {
+        memset(&args, 0, sizeof args);
+        strcpy(args.objPath, path.outPath);
+        args.openFlags = hfile_oflags(mode);
+        if (args.openFlags & O_CREAT) {
+            args.createMode = 0666;
+            addKeyVal(&args.condInput, DEST_RESC_NAME_KW,irods.env.rodsDefResource);
+        }
+
+        ret = rcDataObjOpen(irods.conn, &args);
+    }
+
+    if (ret < 0) {
+        hfile_destroy((hFILE *) fp);
+        set_errno(ret);
+        return NULL;
+    }
+
+    fp->descriptor = ret;
+    fp->base.backend = &irods_backend;
+    return &fp->base;
+}
+
+/* Plugin entry point: registers the "irods" scheme at priority 50, names
+   the plugin and installs irods_exit() as its destroy hook.  Always
+   returns 0. */
+int PLUGIN_GLOBAL(hfile_plugin_init,_irods)(struct hFILE_plugin *self)
+{
+    static const struct hFILE_scheme_handler irods_handler =
+        { hopen_irods, hfile_always_remote, "iRODS", 50 };
+
+    hfile_add_scheme_handler("irods", &irods_handler);
+
+    self->name = "iRODS";
+    self->destroy = irods_exit;
+    return 0;
+}
diff --git a/htslib/hfile_libcurl.c b/htslib/hfile_libcurl.c
new file mode 100644
index 0000000..457c8ce
--- /dev/null
+++ b/htslib/hfile_libcurl.c
@@ -0,0 +1,917 @@
+/* hfile_libcurl.c -- libcurl backend for low-level file streams.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <sys/select.h>
+
+#include "hfile_internal.h"
+#include "htslib/hts.h" // for hts_version() and hts_verbose
+#include "htslib/kstring.h"
+
+#include <curl/curl.h>
+
+// Per-stream state for the libcurl backend.
+typedef struct {
+ hFILE base;
+ CURL *easy; // easy handle created in hopen_libcurl()
+ struct curl_slist *headers; // extra request headers built by add_header()
+ off_t file_size; // -1 until a download content length is known
+ // Window onto the caller's buffer for the read or write in progress;
+ // the libcurl callbacks advance ptr and reduce len as they copy data.
+ struct {
+ union { char *rd; const char *wr; } ptr;
+ size_t len;
+ } buffer;
+ CURLcode final_result; // easy result code for finished transfers
+ // Flags for communicating with libcurl callbacks:
+ unsigned paused : 1; // callback tells us that it has paused transfer
+ unsigned closing : 1; // informs callback that hclose() has been invoked
+ unsigned finished : 1; // wait_perform() tells us transfer is complete
+} hFILE_libcurl;
+
+/* Map an HTTP response status to an errno value.
+   Statuses below 400 map to 0.  Unlisted 4xx statuses map to EINVAL and
+   unlisted statuses of 500 or above map to EIO. */
+static int http_status_errno(int status)
+{
+    if (status < 400) return 0;
+
+    switch (status) {
+    case 401:
+    case 407:
+        return EPERM;
+    case 403:
+        return EACCES;
+    case 404:
+    case 410:
+        return ENOENT;
+    case 405:
+        return EROFS;
+    case 408:
+    case 504:
+        return ETIMEDOUT;
+    case 501:
+        return ENOSYS;
+    case 503:
+        return EBUSY;
+    default:
+        return (status >= 500)? EIO : EINVAL;
+    }
+}
+
+/* Translate a libcurl easy-interface result code into an errno value.
+
+   easy is the handle the failure relates to; it is consulted for the OS
+   errno (connection/send/receive failures) and for the HTTP response code
+   (CURLE_HTTP_RETURNED_ERROR).  Returns 0 only for CURLE_OK; any code not
+   listed explicitly maps to EIO. */
+static int easy_errno(CURL *easy, CURLcode err)
+{
+    long lval;
+
+    switch (err) {
+    case CURLE_OK:
+        return 0;
+
+    case CURLE_UNSUPPORTED_PROTOCOL:
+    case CURLE_URL_MALFORMAT:
+        return EINVAL;
+
+#if LIBCURL_VERSION_NUM >= 0x071505
+    case CURLE_NOT_BUILT_IN:
+        return ENOSYS;
+#endif
+
+    case CURLE_COULDNT_RESOLVE_PROXY:
+    case CURLE_COULDNT_RESOLVE_HOST:
+    case CURLE_FTP_CANT_GET_HOST:
+        return EDESTADDRREQ; // Lookup failure
+
+    case CURLE_COULDNT_CONNECT:
+    case CURLE_SEND_ERROR:
+    case CURLE_RECV_ERROR:
+        // The query can succeed yet report 0 when libcurl recorded no OS
+        // error; returning that would make a failure look like errno 0,
+        // so fall back to ECONNABORTED in that case too.
+        if (curl_easy_getinfo(easy, CURLINFO_OS_ERRNO, &lval) == CURLE_OK
+            && lval != 0)
+            return lval;
+        else
+            return ECONNABORTED;
+
+    case CURLE_REMOTE_ACCESS_DENIED:
+    case CURLE_LOGIN_DENIED:
+    case CURLE_TFTP_PERM:
+        return EACCES;
+
+    case CURLE_PARTIAL_FILE:
+        return EPIPE;
+
+    case CURLE_HTTP_RETURNED_ERROR:
+        if (curl_easy_getinfo(easy, CURLINFO_RESPONSE_CODE, &lval) == CURLE_OK)
+            return http_status_errno(lval);
+        else
+            return EIO;
+
+    case CURLE_OUT_OF_MEMORY:
+        return ENOMEM;
+
+    case CURLE_OPERATION_TIMEDOUT:
+        return ETIMEDOUT;
+
+    case CURLE_RANGE_ERROR:
+        return ESPIPE;
+
+    case CURLE_SSL_CONNECT_ERROR:
+        // TODO return SSL error buffer messages
+        return ECONNABORTED;
+
+    case CURLE_FILE_COULDNT_READ_FILE:
+    case CURLE_TFTP_NOTFOUND:
+        return ENOENT;
+
+    case CURLE_TOO_MANY_REDIRECTS:
+        return ELOOP;
+
+    case CURLE_FILESIZE_EXCEEDED:
+        return EFBIG;
+
+    case CURLE_REMOTE_DISK_FULL:
+        return ENOSPC;
+
+    case CURLE_REMOTE_FILE_EXISTS:
+        return EEXIST;
+
+    default:
+        return EIO;
+    }
+}
+
+/* Translate a libcurl multi-interface result code into an errno value.
+   CURLM_OK and CURLM_CALL_MULTI_PERFORM are not errors and map to 0;
+   anything not listed maps to EIO. */
+static int multi_errno(CURLMcode errm)
+{
+    if (errm == CURLM_OK || errm == CURLM_CALL_MULTI_PERFORM)
+        return 0;
+
+    if (errm == CURLM_BAD_HANDLE || errm == CURLM_BAD_EASY_HANDLE ||
+        errm == CURLM_BAD_SOCKET)
+        return EBADF;
+
+    if (errm == CURLM_OUT_OF_MEMORY)
+        return ENOMEM;
+
+    return EIO;
+}
+
+
+// File-wide libcurl state shared by every stream opened by this backend.
+static struct {
+ CURLM *multi; // multi handle that all easy handles are added to
+ kstring_t useragent; // User-Agent string built at plugin initialisation
+ int nrunning; // count of easy handles currently added to multi
+ unsigned perform_again : 1; // last perform returned CURLM_CALL_MULTI_PERFORM
+} curl = { NULL, { 0, 0, NULL }, 0, 0 };
+
+/* Plugin destroy hook: releases the shared multi handle and the
+   User-Agent string, then shuts libcurl down globally. */
+static void libcurl_exit()
+{
+    (void) curl_multi_cleanup(curl.multi);
+    curl.multi = NULL;
+
+    free(curl.useragent.s);
+    curl.useragent.s = NULL;
+    curl.useragent.l = 0;
+    curl.useragent.m = 0;
+
+    curl_global_cleanup();
+}
+
+
+/* Drain libcurl's message queue.  For each completed transfer, look up
+   the owning stream through the handle's private pointer and record that
+   it has finished together with its result code.  Other message kinds
+   are ignored. */
+static void process_messages()
+{
+    CURLMsg *msg;
+    int remaining;
+
+    for (;;) {
+        hFILE_libcurl *fp = NULL;
+
+        msg = curl_multi_info_read(curl.multi, &remaining);
+        if (msg == NULL) break;
+
+        curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char **) &fp);
+        if (msg->msg != CURLMSG_DONE) continue;
+
+        fp->finished = 1;
+        fp->final_result = msg->data.result;
+    }
+}
+
+// Drive the shared multi handle one step: optionally wait in select() for
+// socket activity or a timeout, call curl_multi_perform(), and collect
+// completion messages when libcurl reports fewer running transfers than
+// curl.nrunning.
+// Returns 0 on success, or -1 if select() or curl_multi_perform() fails
+// (in the latter case errno is set from the CURLMcode).
+static int wait_perform()
+{
+ fd_set rd, wr, ex;
+ int maxfd, nrunning;
+ long timeout;
+ CURLMcode errm;
+
+ FD_ZERO(&rd);
+ FD_ZERO(&wr);
+ FD_ZERO(&ex);
+ // Choose how long to wait (in milliseconds, see the conversion below).
+ if (curl_multi_fdset(curl.multi, &rd, &wr, &ex, &maxfd) != CURLM_OK)
+ maxfd = -1, timeout = 1000;
+ else if (maxfd < 0)
+ timeout = 100; // as recommended by curl_multi_fdset(3)
+ else {
+ if (curl_multi_timeout(curl.multi, &timeout) != CURLM_OK)
+ timeout = 1000;
+ else if (timeout < 0)
+ timeout = 10000; // as recommended by curl_multi_timeout(3)
+ }
+
+ // Skip the wait entirely when libcurl asked to be called again at once.
+ if (timeout > 0 && ! curl.perform_again) {
+ struct timeval tval;
+ tval.tv_sec = (timeout / 1000);
+ tval.tv_usec = (timeout % 1000) * 1000;
+
+ if (select(maxfd + 1, &rd, &wr, &ex, &tval) < 0) return -1;
+ }
+
+ errm = curl_multi_perform(curl.multi, &nrunning);
+ curl.perform_again = 0;
+ if (errm == CURLM_CALL_MULTI_PERFORM) curl.perform_again = 1;
+ else if (errm != CURLM_OK) { errno = multi_errno(errm); return -1; }
+
+ // Fewer transfers running than we have added: some have completed.
+ if (nrunning < curl.nrunning) process_messages();
+ return 0;
+}
+
+
+/* libcurl write callback for downloads.  Copies the delivered data into
+   the stream's current read window.  If the data would not fit, nothing
+   is copied: the stream is marked paused and libcurl is told to pause. */
+static size_t recv_callback(char *ptr, size_t size, size_t nmemb, void *fpv)
+{
+    hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
+    size_t incoming = size * nmemb;
+
+    if (incoming > fp->buffer.len) {
+        fp->paused = 1;
+        return CURL_WRITEFUNC_PAUSE;
+    }
+    if (incoming == 0) return 0;
+
+    memcpy(fp->buffer.ptr.rd, ptr, incoming);
+    fp->buffer.len -= incoming;
+    fp->buffer.ptr.rd += incoming;
+    return incoming;
+}
+
+/* Backend read method.  Exposes the caller's buffer to recv_callback(),
+   resumes the transfer, and runs the multi handle until the callback
+   pauses again or the transfer finishes.
+   Returns the number of bytes stored in the buffer, or -1 with errno set
+   when resuming, performing, or the finished transfer itself failed. */
+static ssize_t libcurl_read(hFILE *fpv, void *bufferv, size_t nbytes)
+{
+    hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
+    char *start = (char *) bufferv;
+    size_t nread;
+    CURLcode err;
+
+    fp->buffer.ptr.rd = start;
+    fp->buffer.len = nbytes;
+    fp->paused = 0;
+
+    err = curl_easy_pause(fp->easy, CURLPAUSE_CONT);
+    if (err != CURLE_OK) {
+        errno = easy_errno(fp->easy, err);
+        return -1;
+    }
+
+    while (! (fp->paused || fp->finished)) {
+        if (wait_perform() < 0) return -1;
+    }
+
+    // The callback advanced ptr.rd past everything it copied.
+    nread = fp->buffer.ptr.rd - start;
+    fp->buffer.ptr.rd = NULL;
+    fp->buffer.len = 0;
+
+    if (fp->finished && fp->final_result != CURLE_OK) {
+        errno = easy_errno(fp->easy, fp->final_result);
+        return -1;
+    }
+
+    return nread;
+}
+
+/* libcurl read callback for uploads.  Hands libcurl as much of the
+   stream's pending write window as fits in its buffer.  With nothing
+   pending it signals end of data when the stream is closing, and pauses
+   the transfer otherwise. */
+static size_t send_callback(char *ptr, size_t size, size_t nmemb, void *fpv)
+{
+    hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
+    size_t room = size * nmemb;
+    size_t chunk;
+
+    if (fp->buffer.len == 0) {
+        // Send buffer is empty; normally pause, or signal EOF if we're closing
+        if (fp->closing) return 0;
+        fp->paused = 1;
+        return CURL_READFUNC_PAUSE;
+    }
+
+    chunk = (room > fp->buffer.len)? fp->buffer.len : room;
+    memcpy(ptr, fp->buffer.ptr.wr, chunk);
+    fp->buffer.ptr.wr += chunk;
+    fp->buffer.len -= chunk;
+    return chunk;
+}
+
+/* Backend write method.  Exposes the caller's data to send_callback(),
+   resumes the transfer, and runs the multi handle until the callback
+   pauses again or the transfer finishes.
+   Returns the number of bytes consumed from the buffer, or -1 with errno
+   set when resuming, performing, or the finished transfer itself failed. */
+static ssize_t libcurl_write(hFILE *fpv, const void *bufferv, size_t nbytes)
+{
+    hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
+    const char *start = (const char *) bufferv;
+    size_t nsent;
+    CURLcode err;
+
+    fp->buffer.ptr.wr = start;
+    fp->buffer.len = nbytes;
+    fp->paused = 0;
+
+    err = curl_easy_pause(fp->easy, CURLPAUSE_CONT);
+    if (err != CURLE_OK) {
+        errno = easy_errno(fp->easy, err);
+        return -1;
+    }
+
+    while (! (fp->paused || fp->finished)) {
+        if (wait_perform() < 0) return -1;
+    }
+
+    // The callback advanced ptr.wr past everything it handed to libcurl.
+    nsent = fp->buffer.ptr.wr - start;
+    fp->buffer.ptr.wr = NULL;
+    fp->buffer.len = 0;
+
+    if (fp->finished && fp->final_result != CURLE_OK) {
+        errno = easy_errno(fp->easy, fp->final_result);
+        return -1;
+    }
+
+    return nsent;
+}
+
+// Backend seek method.  Repositions the stream by taking the easy handle
+// out of the multi handle, setting a resume-from offset, adding it back,
+// and performing until the transfer pauses or finishes.
+// SEEK_SET is always accepted; SEEK_END needs a known file size (ESPIPE
+// otherwise); SEEK_CUR is not implemented (ENOSYS); any other whence is
+// EINVAL.  Returns the new absolute position, or -1 with errno set.
+static off_t libcurl_seek(hFILE *fpv, off_t offset, int whence)
+{
+ hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
+
+ CURLcode err;
+ CURLMcode errm;
+ off_t origin, pos;
+
+ switch (whence) {
+ case SEEK_SET:
+ origin = 0;
+ break;
+ case SEEK_CUR:
+ errno = ENOSYS;
+ return -1;
+ case SEEK_END:
+ if (fp->file_size < 0) { errno = ESPIPE; return -1; }
+ origin = fp->file_size;
+ break;
+ default:
+ errno = EINVAL;
+ return -1;
+ }
+
+ // Check 0 <= origin+offset <= fp->file_size carefully, avoiding overflow
+ // (the upper bound is only enforced when the file size is known)
+ if ((offset < 0)? origin + offset < 0
+ : (fp->file_size >= 0 && offset > fp->file_size - origin)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ pos = origin + offset;
+
+ // NOTE(review): the early returns after this point leave the handle
+ // removed from curl.multi with curl.nrunning already decremented --
+ // confirm that a later close of the stream copes with that state.
+ errm = curl_multi_remove_handle(curl.multi, fp->easy);
+ if (errm != CURLM_OK) { errno = multi_errno(errm); return -1; }
+ curl.nrunning--;
+
+ // TODO If we seem to be doing random access, use CURLOPT_RANGE to do
+ // limited reads (e.g. about a BAM block!) so seeking can reuse the
+ // existing connection more often.
+
+ // Positions that do not fit the long-valued option use the _LARGE form.
+ if (pos <= 2147483647) err = curl_easy_setopt(fp->easy, CURLOPT_RESUME_FROM, (long) pos);
+ else err = curl_easy_setopt(fp->easy, CURLOPT_RESUME_FROM_LARGE, (curl_off_t) pos);
+ if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); return -1; }
+
+ // Reset the per-transfer state before the handle is started again.
+ fp->buffer.len = 0;
+ fp->paused = fp->finished = 0;
+
+ errm = curl_multi_add_handle(curl.multi, fp->easy);
+ if (errm != CURLM_OK) { errno = multi_errno(errm); return -1; }
+ curl.nrunning++;
+
+ err = curl_easy_pause(fp->easy, CURLPAUSE_CONT);
+ if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); return -1; }
+
+ while (! fp->paused && ! fp->finished)
+ if (wait_perform() < 0) return -1;
+
+ if (fp->finished && fp->final_result != CURLE_OK) {
+ errno = easy_errno(fp->easy, fp->final_result);
+ return -1;
+ }
+
+ return pos;
+}
+
+/* Backend close method.  Gives a pending upload the chance to signal EOF,
+   detaches the easy handle from the shared multi handle, and releases the
+   easy handle together with the header list attached to it.
+   Returns 0 on success, or -1 with errno set to the first error seen
+   (a failed final transfer result takes precedence).  The handles are
+   released even when an error is reported. */
+static int libcurl_close(hFILE *fpv)
+{
+    hFILE_libcurl *fp = (hFILE_libcurl *) fpv;
+    CURLcode err;
+    CURLMcode errm;
+    int save_errno = 0;
+
+    // Before closing the file, unpause it and perform on it so that uploads
+    // have the opportunity to signal EOF to the server -- see send_callback().
+
+    fp->buffer.len = 0;
+    fp->closing = 1;
+    fp->paused = 0;
+    err = curl_easy_pause(fp->easy, CURLPAUSE_CONT);
+    if (err != CURLE_OK) save_errno = easy_errno(fp->easy, err);
+
+    while (save_errno == 0 && ! fp->paused && ! fp->finished)
+        if (wait_perform() < 0) save_errno = errno;
+
+    if (fp->finished && fp->final_result != CURLE_OK)
+        save_errno = easy_errno(fp->easy, fp->final_result);
+
+    errm = curl_multi_remove_handle(curl.multi, fp->easy);
+    if (errm != CURLM_OK && save_errno == 0) save_errno = multi_errno(errm);
+    curl.nrunning--;
+
+    curl_easy_cleanup(fp->easy);
+
+    // The header list was handed to the easy handle by reference, so it is
+    // only safe to free once the handle is gone; without this the list
+    // leaked on every close.  curl_slist_free_all() accepts NULL.
+    curl_slist_free_all(fp->headers);
+    fp->headers = NULL;
+
+    if (save_errno) { errno = save_errno; return -1; }
+    else return 0;
+}
+
+// Backend method table installed on streams returned by hopen_libcurl().
+// The fourth slot is deliberately left NULL.
+static const struct hFILE_backend libcurl_backend =
+{
+ libcurl_read, libcurl_write, libcurl_seek, NULL, libcurl_close
+};
+
+/* Append one header line to the stream's request header list.
+   Returns 0 on success; on failure returns -1 with errno set to ENOMEM
+   and leaves the existing list untouched. */
+static int add_header(hFILE_libcurl *fp, const char *header)
+{
+    struct curl_slist *extended = curl_slist_append(fp->headers, header);
+
+    if (extended != NULL) {
+        fp->headers = extended;
+        return 0;
+    }
+
+    errno = ENOMEM;
+    return -1;
+}
+
+// Defined in the S3 section later in this file; declared here so that
+// hopen_libcurl() can call it.
+static int
+add_s3_settings(hFILE_libcurl *fp, const char *url, kstring_t *message);
+
+/* Open url through libcurl as an hFILE.
+
+   modes must select exactly one of reading ('r') or writing ('w');
+   append, update, and combined modes are rejected with EINVAL.  URLs that
+   begin with "s3" (first letter case-insensitive) are rewritten and
+   signed by add_s3_settings(); any other URL is passed to libcurl as is.
+
+   The transfer is started and run until the first callback pauses it or
+   it finishes, so connection and HTTP errors are reported here.  For
+   reads, the file size is recorded when libcurl knows the content length.
+
+   Returns the new stream, or NULL with errno set on failure. */
+hFILE *hopen_libcurl(const char *url, const char *modes)
+{
+    hFILE_libcurl *fp;
+    char mode;
+    const char *s;
+    CURLcode err;
+    CURLMcode errm;
+    int save;
+
+    // Reduce the mode string to a single letter; a second mode character
+    // (e.g. "r+") turns it into the invalid marker 'e'.
+    if ((s = strpbrk(modes, "rwa+")) != NULL) {
+        mode = *s;
+        if (strpbrk(&s[1], "rwa+")) mode = 'e';
+    }
+    else mode = '\0';
+
+    if (mode != 'r' && mode != 'w') { errno = EINVAL; return NULL; }
+
+    fp = (hFILE_libcurl *) hfile_init(sizeof (hFILE_libcurl), modes, 0);
+    if (fp == NULL) return NULL;
+
+    // Initialise every field before anything can fail: the error path
+    // below inspects fp->headers, which previously was still unset when
+    // curl_easy_init() failed.  (hfile_init() is not assumed to zero the
+    // backend-specific part of the structure.)
+    fp->easy = NULL;
+    fp->headers = NULL;
+    fp->file_size = -1;
+    fp->buffer.ptr.rd = NULL;
+    fp->buffer.len = 0;
+    fp->final_result = (CURLcode) -1;
+    fp->paused = fp->closing = fp->finished = 0;
+
+    fp->easy = curl_easy_init();
+    if (fp->easy == NULL) { errno = ENOMEM; goto error; }
+
+    // Make a route to the hFILE_libcurl* given just a CURL* easy handle
+    err = curl_easy_setopt(fp->easy, CURLOPT_PRIVATE, fp);
+
+    if (mode == 'r') {
+        err |= curl_easy_setopt(fp->easy, CURLOPT_WRITEFUNCTION, recv_callback);
+        err |= curl_easy_setopt(fp->easy, CURLOPT_WRITEDATA, fp);
+    }
+    else {
+        err |= curl_easy_setopt(fp->easy, CURLOPT_READFUNCTION, send_callback);
+        err |= curl_easy_setopt(fp->easy, CURLOPT_READDATA, fp);
+        err |= curl_easy_setopt(fp->easy, CURLOPT_UPLOAD, 1L);
+        if (add_header(fp, "Transfer-Encoding: chunked") < 0) goto error;
+    }
+
+    if (tolower(url[0]) == 's' && url[1] == '3') {
+        // Construct the HTTP-Method/Content-MD5/Content-Type part of the
+        // message to be signed.  This will be destroyed by add_s3_settings().
+        kstring_t message = { 0, 0, NULL };
+        kputs((mode == 'r')? "GET\n" : "PUT\n", &message);
+        kputc('\n', &message);
+        kputc('\n', &message);
+        if (add_s3_settings(fp, url, &message) < 0) goto error;
+    }
+    else
+        err |= curl_easy_setopt(fp->easy, CURLOPT_URL, url);
+
+    err |= curl_easy_setopt(fp->easy, CURLOPT_USERAGENT, curl.useragent.s);
+    if (fp->headers)
+        err |= curl_easy_setopt(fp->easy, CURLOPT_HTTPHEADER, fp->headers);
+    err |= curl_easy_setopt(fp->easy, CURLOPT_FOLLOWLOCATION, 1L);
+    err |= curl_easy_setopt(fp->easy, CURLOPT_FAILONERROR, 1L);
+    if (hts_verbose >= 8)
+        err |= curl_easy_setopt(fp->easy, CURLOPT_VERBOSE, 1L);
+
+    // The setopt results were OR-ed together, so only "all succeeded" can
+    // be distinguished from "something failed".
+    if (err != 0) { errno = ENOSYS; goto error; }
+
+    errm = curl_multi_add_handle(curl.multi, fp->easy);
+    if (errm != CURLM_OK) { errno = multi_errno(errm); goto error; }
+    curl.nrunning++;
+
+    while (! fp->paused && ! fp->finished)
+        if (wait_perform() < 0) goto error_remove;
+
+    if (fp->finished && fp->final_result != CURLE_OK) {
+        errno = easy_errno(fp->easy, fp->final_result);
+        goto error_remove;
+    }
+
+    if (mode == 'r') {
+        double dval;
+        if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD,
+                              &dval) == CURLE_OK && dval >= 0.0)
+            fp->file_size = (off_t) (dval + 0.1);
+    }
+
+    fp->base.backend = &libcurl_backend;
+    return &fp->base;
+
+error_remove:
+    save = errno;
+    (void) curl_multi_remove_handle(curl.multi, fp->easy);
+    curl.nrunning--;
+    errno = save;
+
+error:
+    save = errno;
+    if (fp->easy) curl_easy_cleanup(fp->easy);
+    if (fp->headers) curl_slist_free_all(fp->headers);
+    hfile_destroy((hFILE *) fp);
+    errno = save;
+    return NULL;
+}
+
+/* Plugin entry point for the libcurl backend.
+
+   Initialises libcurl globally, creates the shared multi handle, builds
+   the User-Agent string, and registers hopen_libcurl() for every protocol
+   this libcurl build reports plus the "s3" and "s3+http" schemes
+   ("s3+https" as well when libcurl has SSL support).
+
+   Returns 0 on success, or -1 with errno set if libcurl could not be
+   initialised. */
+int PLUGIN_GLOBAL(hfile_plugin_init,_libcurl)(struct hFILE_plugin *self)
+{
+    static const struct hFILE_scheme_handler handler =
+        { hopen_libcurl, hfile_always_remote, "libcurl", 50 };
+
+    const curl_version_info_data *vinfo;
+    const char * const *proto;
+    CURLcode rc;
+
+    rc = curl_global_init(CURL_GLOBAL_ALL);
+    if (rc != CURLE_OK) {
+        errno = easy_errno(NULL, rc);
+        return -1;
+    }
+
+    curl.multi = curl_multi_init();
+    if (curl.multi == NULL) {
+        curl_global_cleanup();
+        errno = EIO;
+        return -1;
+    }
+
+    vinfo = curl_version_info(CURLVERSION_NOW);
+    ksprintf(&curl.useragent, "htslib/%s libcurl/%s",
+             hts_version(), vinfo->version);
+
+    curl.nrunning = 0;
+    curl.perform_again = 0;
+    self->name = "libcurl";
+    self->destroy = libcurl_exit;
+
+    // The protocol list is a NULL-terminated array of scheme names.
+    proto = vinfo->protocols;
+    while (*proto) {
+        hfile_add_scheme_handler(*proto, &handler);
+        proto++;
+    }
+
+    hfile_add_scheme_handler("s3", &handler);
+    hfile_add_scheme_handler("s3+http", &handler);
+    if (vinfo->features & CURL_VERSION_SSL)
+        hfile_add_scheme_handler("s3+https", &handler);
+
+    return 0;
+}
+
+
+/*******************
+ * Rewrite S3 URLs *
+ *******************/
+
+// s3_sign() computes the HMAC-SHA1 of message keyed by key, writes it to
+// digest (which must have room for DIGEST_BUFSIZ bytes), and returns the
+// digest length in bytes.  One of two implementations is selected at
+// configure time; the build fails if neither is available.
+
+#if defined HAVE_COMMONCRYPTO
+
+#include <CommonCrypto/CommonHMAC.h>
+
+#define DIGEST_BUFSIZ CC_SHA1_DIGEST_LENGTH
+
+// CommonCrypto implementation.
+static size_t
+s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message)
+{
+ CCHmac(kCCHmacAlgSHA1, key->s, key->l, message->s, message->l, digest);
+ return CC_SHA1_DIGEST_LENGTH;
+}
+
+#elif defined HAVE_HMAC
+
+#include <openssl/hmac.h>
+
+#define DIGEST_BUFSIZ EVP_MAX_MD_SIZE
+
+// OpenSSL implementation; HMAC() reports the digest length through len.
+static size_t
+s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message)
+{
+ unsigned int len;
+ HMAC(EVP_sha1(), key->s, key->l,
+ (unsigned char *) message->s, message->l, digest, &len);
+ return len;
+}
+
+#else
+#error No HMAC() routine found by configure
+#endif
+
+/* Append the first len bytes of s to str, URL-decoding them first when
+   they contain a '%'.  Aborts the program if libcurl cannot produce the
+   decoded copy. */
+static void
+urldecode_kput(const char *s, int len, hFILE_libcurl *fp, kstring_t *str)
+{
+    char *decoded;
+    int decoded_len;
+
+    // Nothing escaped: copy the bytes through unchanged.
+    if (memchr(s, '%', len) == NULL) {
+        kputsn(s, len, str);
+        return;
+    }
+
+    decoded = curl_easy_unescape(fp->easy, s, len, &decoded_len);
+    if (decoded == NULL) abort();
+
+    kputsn(decoded, decoded_len, str);
+    curl_free(decoded);
+}
+
+// Append the base64 encoding of the len bytes at data to str, padding
+// the output with '=' characters.
+static void base64_kput(const unsigned char *data, size_t len, kstring_t *str)
+{
+ static const char base64[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+ size_t i = 0;
+ unsigned x = 0; // bit accumulator; only its low `bits` bits are pending
+ int bits = 0, pad = 0;
+
+ while (bits || i < len) {
+ // Refill with the next input byte, or with zero bits (counted in pad)
+ // once the input is exhausted.
+ if (bits < 6) {
+ x <<= 8, bits += 8;
+ if (i < len) x |= data[i++];
+ else pad++;
+ }
+
+ // Emit the top six pending bits as one output character.
+ bits -= 6;
+ kputc(base64[(x >> bits) & 63], str);
+ }
+
+ // The last `pad` characters encode only padding bits: replace them with
+ // the same number of '=' characters.
+ str->l -= pad;
+ kputsn("==", pad, str);
+}
+
+/* Decide whether the bucket name in [s0, slim) may be used as a DNS label
+   for virtual hosted-style access.
+
+   Accepted names are 3 to 63 characters long, consist only of lower-case
+   letters, digits, '-' and '.', contain at least one letter or '-', do
+   not start or end with '-', and have an alphanumeric character on both
+   sides of every '.'.  Returns non-zero if the name qualifies, else 0. */
+static int is_dns_compliant(const char *s0, const char *slim)
+{
+    int has_nondigit = 0, len = 0;
+    const char *s;
+
+    for (s = s0; s < slim; len++, s++) {
+        // <ctype.h> functions require an unsigned char value (or EOF);
+        // passing a plain char is undefined for bytes >= 0x80 where char
+        // is signed, and this text comes straight from the URL.
+        unsigned char c = (unsigned char) *s;
+
+        if (islower(c))
+            has_nondigit = 1;
+        else if (c == '-') {
+            has_nondigit = 1;
+            if (s == s0 || s+1 == slim) return 0;
+        }
+        else if (isdigit(c))
+            ;
+        else if (c == '.') {
+            if (s == s0 || ! isalnum((unsigned char) s[-1])) return 0;
+            if (s+1 == slim || ! isalnum((unsigned char) s[1])) return 0;
+        }
+        else return 0;
+    }
+
+    return has_nondigit && len >= 3 && len <= 63;
+}
+
+/* fopen() wrapper that expands a leading "~/" using the HOME environment
+   variable.  Names without that prefix are opened unchanged.
+   Returns the opened stream, or NULL if HOME is unset or fopen() fails. */
+static FILE *expand_tilde_open(const char *fname, const char *mode)
+{
+    kstring_t expanded = { 0, 0, NULL };
+    const char *home;
+    FILE *fp;
+
+    if (strncmp(fname, "~/", 2) != 0)
+        return fopen(fname, mode);
+
+    home = getenv("HOME");
+    if (home == NULL) return NULL;
+
+    // Replace the '~' with $HOME, keeping the '/' and everything after it.
+    kputs(home, &expanded);
+    kputs(&fname[1], &expanded);
+
+    fp = fopen(expanded.s, mode);
+    free(expanded.s);
+    return fp;
+}
+
+/* Read selected settings from an INI-style file.
+
+   fname is opened via expand_tilde_open(); a file that cannot be opened
+   is silently ignored.  Settings are taken from lines of the form
+   "key = value" or "key: value" that appear either before any
+   "[section]" header or inside the section named by section.
+
+   The variable arguments are (const char *key, kstring_t *var) pairs
+   terminated by a NULL key.  When a line's key matches, its value (with
+   surrounding whitespace removed) is appended to the paired kstring. */
+static void parse_ini(const char *fname, const char *section, ...)
+{
+    kstring_t line = { 0, 0, NULL };
+    int active = 1;  // Start active, so global properties are accepted
+    char *s;
+
+    FILE *fp = expand_tilde_open(fname, "r");
+    if (fp == NULL) return;
+
+    while (line.l = 0, kgetline(&line, (kgets_func *) fgets, fp) >= 0)
+        if (line.s[0] == '[' && (s = strchr(line.s, ']')) != NULL) {
+            *s = '\0';
+            active = (strcmp(&line.s[1], section) == 0);
+        }
+        else if (active && (s = strpbrk(line.s, ":=")) != NULL) {
+            const char *key = line.s, *value = &s[1], *akey;
+            va_list args;
+
+            // The casts matter: <ctype.h> functions are undefined for
+            // negative char values, which file contents can contain.
+            while (isspace((unsigned char) *key)) key++;
+            while (s > key && isspace((unsigned char) s[-1])) s--;
+            *s = '\0';
+
+            while (isspace((unsigned char) *value)) value++;
+            while (line.l > 0 && isspace((unsigned char) line.s[line.l-1]))
+                line.s[--line.l] = '\0';
+
+            va_start(args, section);
+            while ((akey = va_arg(args, const char *)) != NULL) {
+                kstring_t *avar = va_arg(args, kstring_t *);
+                if (strcmp(key, akey) == 0) { kputs(value, avar); break; }
+            }
+            va_end(args);
+        }
+
+    fclose(fp);
+    free(line.s);
+}
+
+/* Read an access id and a secret from a plain-text credentials file.
+
+   The file's lines are joined with spaces; the first whitespace-delimited
+   word is appended to id and the second to secret.  A file that cannot be
+   opened, or that yields no text, leaves both untouched. */
+static void parse_simple(const char *fname, kstring_t *id, kstring_t *secret)
+{
+    kstring_t text = { 0, 0, NULL };
+    char *s;
+    size_t len;
+
+    FILE *fp = expand_tilde_open(fname, "r");
+    if (fp == NULL) return;
+
+    while (kgetline(&text, (kgets_func *) fgets, fp) >= 0)
+        kputc(' ', &text);
+    fclose(fp);
+
+    // Nothing was appended (e.g. an empty file): text.s does not hold a
+    // string that can be scanned, so stop before dereferencing it.
+    if (text.l == 0) { free(text.s); return; }
+
+    // <ctype.h> functions need unsigned char values, hence the casts.
+    s = text.s;
+    while (isspace((unsigned char) *s)) s++;
+    kputsn(s, len = strcspn(s, " \t"), id);
+
+    s += len;
+    while (isspace((unsigned char) *s)) s++;
+    kputsn(s, strcspn(s, " \t"), secret);
+
+    free(text.s);
+}
+
+// Configure fp's easy handle for an S3 request described by s3url.
+//
+// Adds a Date header, rewrites the s3 URL into an s3.amazonaws.com URL and
+// sets it on the handle, looks up credentials (URL, then environment, then
+// credential files), and when both an id and a secret are available adds a
+// signed Authorization header.
+//
+// message must already hold the leading part of the string to sign; the
+// remaining parts are appended here.  message->s is freed before returning
+// on every path, so the caller must not use or free it afterwards.
+//
+// Returns 0 on success, or -1 with errno set on failure.
+static int
+add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message)
+{
+ int ret, save;
+ const char *bucket, *path;
+ char date_hdr[40];
+ CURLcode err;
+
+ kstring_t url = { 0, 0, NULL };
+ kstring_t profile = { 0, 0, NULL };
+ kstring_t id = { 0, 0, NULL };
+ kstring_t secret = { 0, 0, NULL };
+ kstring_t token = { 0, 0, NULL };
+ kstring_t token_hdr = { 0, 0, NULL };
+ kstring_t auth_hdr = { 0, 0, NULL };
+
+ time_t now = time(NULL);
+#ifdef HAVE_GMTIME_R
+ struct tm tm_buffer;
+ struct tm *tm = gmtime_r(&now, &tm_buffer);
+#else
+ struct tm *tm = gmtime(&now);
+#endif
+
+ // The same date text goes into the Date header and into the signed
+ // message; &date_hdr[6] skips the "Date: " prefix.
+ strftime(date_hdr, sizeof date_hdr, "Date: %a, %d %b %Y %H:%M:%S GMT", tm);
+ if (add_header(fp, date_hdr) < 0) goto error;
+ kputs(&date_hdr[6], message);
+ kputc('\n', message);
+
+ // Our S3 URL format is s3[+SCHEME]://[ID[:SECRET[:TOKEN]]@]BUCKET/PATH
+
+ // Start the rewritten URL with the requested scheme, or https by default.
+ if (s3url[2] == '+') {
+ bucket = strchr(s3url, ':') + 1;
+ kputsn(&s3url[3], bucket - &s3url[3], &url);
+ }
+ else {
+ kputs("https:", &url);
+ bucket = &s3url[3];
+ }
+ while (*bucket == '/') kputc(*bucket++, &url);
+
+ path = bucket + strcspn(bucket, "/?#@");
+ if (*path == '@') {
+ // Text before the '@' is either a profile name (no ':') or
+ // ID:SECRET with an optional :TOKEN.
+ const char *colon = strpbrk(bucket, ":@");
+ if (*colon != ':') {
+ urldecode_kput(bucket, colon - bucket, fp, &profile);
+ }
+ else {
+ const char *colon2 = strpbrk(&colon[1], ":@");
+ urldecode_kput(bucket, colon - bucket, fp, &id);
+ urldecode_kput(&colon[1], colon2 - &colon[1], fp, &secret);
+ if (*colon2 == ':')
+ urldecode_kput(&colon2[1], path - &colon2[1], fp, &token);
+ }
+
+ bucket = &path[1];
+ path = bucket + strcspn(bucket, "/?#");
+ }
+ else {
+ // If the URL has no ID[:SECRET]@, consider environment variables.
+ const char *v;
+ if ((v = getenv("AWS_ACCESS_KEY_ID")) != NULL) kputs(v, &id);
+ if ((v = getenv("AWS_SECRET_ACCESS_KEY")) != NULL) kputs(v, &secret);
+ if ((v = getenv("AWS_SESSION_TOKEN")) != NULL) kputs(v, &token);
+
+ if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile);
+ else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile);
+ else kputs("default", &profile);
+ }
+
+ // Use virtual hosted-style access if possible, otherwise path-style.
+ if (is_dns_compliant(bucket, path)) {
+ kputsn(bucket, path - bucket, &url);
+ kputs(".s3.amazonaws.com", &url);
+ }
+ else {
+ kputs("s3.amazonaws.com/", &url);
+ kputsn(bucket, path - bucket, &url);
+ }
+ kputs(path, &url);
+
+ // Still no access id: try the credential files in turn, stopping at the
+ // first one that provides an id.
+ if (id.l == 0) {
+ const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE");
+ parse_ini(v? v : "~/.aws/credentials", profile.s,
+ "aws_access_key_id", &id, "aws_secret_access_key", &secret,
+ "aws_session_token", &token, NULL);
+ }
+ if (id.l == 0)
+ parse_ini("~/.s3cfg", profile.s, "access_key", &id,
+ "secret_key", &secret, "access_token", &token, NULL);
+ if (id.l == 0)
+ parse_simple("~/.awssecret", &id, &secret);
+
+ // A session token is both signed (as part of message) and sent as a header.
+ if (token.l > 0) {
+ kputs("x-amz-security-token:", message);
+ kputs(token.s, message);
+ kputc('\n', message);
+
+ kputs("X-Amz-Security-Token: ", &token_hdr);
+ kputs(token.s, &token_hdr);
+ if (add_header(fp, token_hdr.s) < 0) goto error;
+ }
+
+ kputc('/', message);
+ kputs(bucket, message); // CanonicalizedResource is '/' + bucket + path
+
+ err = curl_easy_setopt(fp->easy, CURLOPT_URL, url.s);
+ if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); goto error; }
+
+ // If we have no id/secret, we can't sign the request but will
+ // still be able to access public data sets.
+ if (id.l > 0 && secret.l > 0) {
+ unsigned char digest[DIGEST_BUFSIZ];
+ size_t digest_len = s3_sign(digest, &secret, message);
+
+ kputs("Authorization: AWS ", &auth_hdr);
+ kputs(id.s, &auth_hdr);
+ kputc(':', &auth_hdr);
+ base64_kput(digest, digest_len, &auth_hdr);
+
+ if (add_header(fp, auth_hdr.s) < 0) goto error;
+ }
+
+ ret = 0;
+ goto free_and_return;
+
+error:
+ ret = -1;
+
+free_and_return:
+ // Preserve errno across the cleanup so the caller sees the original cause.
+ save = errno;
+ free(url.s);
+ free(profile.s);
+ free(id.s);
+ free(secret.s);
+ free(token.s);
+ free(token_hdr.s);
+ free(auth_hdr.s);
+ free(message->s);
+ errno = save;
+ return ret;
+}
diff --git a/htslib/hfile_net.c b/htslib/hfile_net.c
new file mode 100644
index 0000000..5443b22
--- /dev/null
+++ b/htslib/hfile_net.c
@@ -0,0 +1,112 @@
+/* hfile_net.c -- network backend for low-level input/output streams.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <errno.h>
+
+#include "hfile_internal.h"
+
+#include "htslib/knetfile.h"
+
+// Per-stream state for the knetfile backend.
+typedef struct {
+ hFILE base;
+ knetFile *netfp; // handle returned by knet_open() in hopen_net()
+} hFILE_net;
+
+// Set to 1 by net_init() once first-use initialisation has succeeded.
+static int net_inited = 0;
+
+#ifdef _WIN32
+// Windows only: registered with atexit() by net_init() to undo
+// knet_win32_init().
+static void net_exit(void)
+{
+ knet_win32_destroy();
+}
+#endif
+
+// One-time networking initialisation, called from hopen_net() on first use.
+// On Windows this calls knet_win32_init() and schedules net_exit() to run
+// at program exit; elsewhere it only records that initialisation is done.
+// Returns 0 on success, or -1 if knet_win32_init() fails.
+static int net_init(void)
+{
+#ifdef _WIN32
+ if (knet_win32_init() != 0) return -1;
+
+ // In the unlikely event atexit() fails, it's better to succeed here and
+ // carry on and do the I/O; then eventually when the program exits, we'll
+ // merely have failed to clean up properly, as if we had aborted.
+ (void) atexit(net_exit);
+#endif
+
+ net_inited = 1;
+ return 0;
+}
+
+// Backend read method: forwards to knet_read() on the wrapped knetFile
+// and returns its result unchanged.
+static ssize_t net_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    knetFile *remote = ((hFILE_net *) fpv)->netfp;
+    return knet_read(remote, buffer, nbytes);
+}
+
+// Backend seek method: forwards to knet_seek() on the wrapped knetFile
+// and returns its result unchanged.
+static off_t net_seek(hFILE *fpv, off_t offset, int whence)
+{
+    knetFile *remote = ((hFILE_net *) fpv)->netfp;
+    return knet_seek(remote, offset, whence);
+}
+
+// Backend close method: forwards to knet_close() on the wrapped knetFile
+// and returns its result unchanged.
+static int net_close(hFILE *fpv)
+{
+    knetFile *remote = ((hFILE_net *) fpv)->netfp;
+    return knet_close(remote);
+}
+
+// Backend method table installed on streams returned by hopen_net().
+// The second and fourth slots are left NULL; no write method is provided.
+static const struct hFILE_backend net_backend =
+{
+ net_read, NULL, net_seek, NULL, net_close
+};
+
+/* Open filename through knetfile as an hFILE.
+   Networking is initialised on first use.  Returns the new stream, or
+   NULL if initialisation, stream allocation, or knet_open() fails. */
+hFILE *hopen_net(const char *filename, const char *mode)
+{
+    hFILE_net *stream;
+
+    // Do any networking initialisation if this is the first use.
+    if (net_inited == 0 && net_init() < 0) return NULL;
+
+    stream = (hFILE_net *) hfile_init(sizeof (hFILE_net), mode, 0);
+    if (stream == NULL) return NULL;
+
+    stream->netfp = knet_open(filename, mode);
+    if (stream->netfp != NULL) {
+        stream->base.backend = &net_backend;
+        return &stream->base;
+    }
+
+    hfile_destroy((hFILE *) stream);
+    return NULL;
+}
+
+/* Plugin entry point for the knetfile backend: names the plugin
+   "knetfile" and registers hopen_net() for the "http" and "ftp" schemes.
+   Always returns 0. */
+int hfile_plugin_init_net(struct hFILE_plugin *self)
+{
+    static const struct hFILE_scheme_handler net_handler =
+        { hopen_net, hfile_always_remote, "knetfile", 0 };
+
+    self->name = "knetfile";
+
+    hfile_add_scheme_handler("http", &net_handler);
+    hfile_add_scheme_handler("ftp", &net_handler);
+
+    return 0;
+}
diff --git a/htslib/hts.c b/htslib/hts.c
new file mode 100644
index 0000000..aaf92e2
--- /dev/null
+++ b/htslib/hts.c
@@ -0,0 +1,2066 @@
+/* hts.c -- format-neutral I/O, indexing, and iterator API functions.
+
+ Copyright (C) 2008, 2009, 2012-2016 Genome Research Ltd.
+ Copyright (C) 2012, 2013 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <zlib.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include "htslib/bgzf.h"
+#include "htslib/hts.h"
+#include "cram/cram.h"
+#include "htslib/hfile.h"
+#include "version.h"
+#include "hts_internal.h"
+
+#include "htslib/kseq.h"
+// KS_BGZF is hard-wired to 1, so the kstream reader below is always
+// instantiated over BGZF handles read with bgzf_read().
+#define KS_BGZF 1
+#if KS_BGZF
+ // bgzf now supports gzip-compressed files, the gzFile branch can be removed
+ KSTREAM_INIT2(, BGZF*, bgzf_read, 65536)
+#else
+ KSTREAM_INIT2(, gzFile, gzread, 16384)
+#endif
+
+#include "htslib/khash.h"
+// Instantiates a khash table type named s2i, keyed by C strings
+// (kh_cstr_t) and holding int64_t values.
+KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+// Global verbosity level; functions in this file print their error
+// messages to stderr only when it is at least 2.
+int hts_verbose = 3;
+
+// Returns the library version string, i.e. the HTS_VERSION macro.
+const char *hts_version()
+{
+ return HTS_VERSION;
+}
+
+// Maps a byte value to a 4-bit base code (an index into seq_nt16_str).
+// Upper- and lower-case letters map to the same code, '=' maps to 0, the
+// digits '0'..'3' map to 1, 2, 4 and 8, and every other byte maps to 15.
+const unsigned char seq_nt16_table[256] = {
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+// Inverse of seq_nt16_table: the character for each 4-bit code 0..15.
+const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
+
+// Maps a 4-bit code to 0..3 for the codes 1, 2, 4 and 8 (A, C, G and T in
+// seq_nt16_str) and to 4 for every other code.
+const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+/**********************
+ *** Basic file I/O ***
+ **********************/
+
+// Maps an exact file format onto its broad category. Formats that have no
+// specific category (including the generic binary/text placeholders) yield
+// unknown_category.
+static enum htsFormatCategory format_category(enum htsExactFormat fmt)
+{
+    enum htsFormatCategory category = unknown_category;
+
+    switch (fmt) {
+    case sam: case bam: case cram:
+        category = sequence_data;
+        break;
+
+    case vcf: case bcf:
+        category = variant_data;
+        break;
+
+    case bai: case crai: case csi: case gzi: case tbi:
+        category = index_file;
+        break;
+
+    case bed:
+        category = region_list;
+        break;
+
+    case unknown_format: case binary_format: case text_format:
+    case format_maximum:
+        break;
+    }
+
+    return category;
+}
+
+// Peeks at the start of fp, which must be positioned at the start of a GZIP
+// block, and inflates up to destsize bytes of it into dest.
+// Returns the number of bytes written to dest; 0 if peeking or setting up
+// the inflater fails.
+static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
+{
+    // A couple of hundred input bytes are typically enough for inflate() to
+    // yield a few output bytes, so this size should suffice in general.
+    unsigned char compressed[512];
+    z_stream strm;
+    size_t produced;
+    ssize_t navail = hpeek(fp, compressed, sizeof compressed);
+
+    if (navail < 0) return 0;
+
+    strm.zalloc = NULL;
+    strm.zfree = NULL;
+    strm.next_in = compressed;
+    strm.avail_in = navail;
+    strm.next_out = dest;
+    strm.avail_out = destsize;
+    if (inflateInit2(&strm, 31) != Z_OK) return 0;
+
+    // Keep inflating until dest is full or inflate() stops returning Z_OK.
+    for (;;) {
+        if (strm.total_out >= destsize) break;
+        if (inflate(&strm, Z_SYNC_FLUSH) != Z_OK) break;
+    }
+
+    produced = strm.total_out;
+    inflateEnd(&strm);
+
+    return produced;
+}
+
+// Parses "x.y" version text from [u, ulim), which is not NUL-terminated.
+// major/minor are stored only when their digits are followed by another
+// character inside the buffer, so that a buffer ending mid-number (e.g.
+// "1.1" cut from "1.10") leaves the affected field at -1. A major number
+// followed by something other than '.' gives minor = 0.
+static void
+parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
+{
+    const char *p = (const char *) u;
+    const char *end = (const char *) ulim;
+    short num = 0;
+
+    fmt->version.major = fmt->version.minor = -1;
+
+    while (p < end && isdigit_c(*p)) {
+        num = 10 * num + *p - '0';
+        p++;
+    }
+
+    // Ran off the end of the buffer: the major number may be incomplete.
+    if (p >= end) return;
+
+    fmt->version.major = num;
+    if (*p != '.') {
+        fmt->version.minor = 0;
+        return;
+    }
+
+    p++;
+    num = 0;
+    while (p < end && isdigit_c(*p)) {
+        num = 10 * num + *p - '0';
+        p++;
+    }
+
+    if (p < end)
+        fmt->version.minor = num;
+}
+
+// Works out the format of the stream behind hfile by inspecting the bytes
+// returned by hpeek() (inflating the first few if the stream starts with
+// the gzip magic bytes) and records the result in *fmt.
+//
+// Returns 0 once *fmt has been filled in, including when the format is not
+// recognised; returns -1 if hpeek() reports an error.
+int hts_detect_format(hFILE *hfile, htsFormat *fmt)
+{
+ unsigned char s[21];
+ ssize_t len = hpeek(hfile, s, 18);
+ if (len < 0) return -1;
+
+ if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
+ // The stream is either gzip-compressed or BGZF-compressed.
+ // Determine which, and decompress the first few bytes.
+ // It is treated as BGZF when bit 2 of the flags byte s[3] is set and
+ // bytes 12..15 are "BC\2\0".
+ fmt->compression = (len >= 18 && (s[3] & 4) &&
+ memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
+ len = decompress_peek(hfile, s, sizeof s);
+ }
+ else {
+ fmt->compression = no_compression;
+ len = hpeek(hfile, s, sizeof s);
+ }
+ if (len < 0) return -1;
+
+ fmt->compression_level = -1;
+ fmt->specific = NULL;
+
+ // From here on s[0..len) holds the leading (uncompressed) bytes.
+ if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
+ fmt->category = sequence_data;
+ fmt->format = cram;
+ fmt->version.major = s[4], fmt->version.minor = s[5];
+ fmt->compression = custom;
+ return 0;
+ }
+ else if (len >= 4 && s[3] <= '\4') {
+ // Four-byte binary magic numbers. If none of them matches, control
+ // falls out of this branch to the "unknown" code at the bottom.
+ if (memcmp(s, "BAM\1", 4) == 0) {
+ fmt->category = sequence_data;
+ fmt->format = bam;
+ // TODO Decompress enough to pick version from @HD-VN header
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "BAI\1", 4) == 0) {
+ fmt->category = index_file;
+ fmt->format = bai;
+ fmt->version.major = -1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "BCF\4", 4) == 0) {
+ fmt->category = variant_data;
+ fmt->format = bcf;
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "BCF\2", 4) == 0) {
+ fmt->category = variant_data;
+ fmt->format = bcf;
+ fmt->version.major = s[3];
+ fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
+ return 0;
+ }
+ else if (memcmp(s, "CSI\1", 4) == 0) {
+ fmt->category = index_file;
+ fmt->format = csi;
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else if (memcmp(s, "TBI\1", 4) == 0) {
+ fmt->category = index_file;
+ fmt->format = tbi;
+ fmt->version.major = -1, fmt->version.minor = -1;
+ return 0;
+ }
+ }
+ else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
+ fmt->category = variant_data;
+ fmt->format = vcf;
+ // The version is parsed only when the whole 21-byte buffer was filled.
+ if (len >= 21 && s[16] == 'v')
+ parse_version(fmt, &s[17], &s[len]);
+ else
+ fmt->version.major = fmt->version.minor = -1;
+ return 0;
+ }
+ else if (len >= 4 && s[0] == '@' &&
+ (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
+ memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
+ fmt->category = sequence_data;
+ fmt->format = sam;
+ // @HD-VN is not guaranteed to be the first tag, but then @HD is
+ // not guaranteed to be present at all...
+ if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
+ parse_version(fmt, &s[7], &s[len]);
+ else
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+ else {
+ // Various possibilities for tab-delimited text:
+ // .crai (gzipped tab-delimited six columns: seqid 5*number)
+ // .bed ([3..12] tab-delimited columns)
+ // .bedpe (>= 10 tab-delimited columns)
+ // .sam (tab-delimited >= 11 columns: seqid number seqid...)
+ // FIXME For now, assume it's SAM
+ fmt->category = sequence_data;
+ fmt->format = sam;
+ fmt->version.major = 1, fmt->version.minor = -1;
+ return 0;
+ }
+
+ // Reached only from the four-byte magic branch when nothing matched.
+ fmt->category = unknown_category;
+ fmt->format = unknown_format;
+ fmt->version.major = fmt->version.minor = -1;
+ fmt->compression = no_compression;
+ return 0;
+}
+
+// Builds a human-readable description of *format, e.g. a format name,
+// optional " version M.N", a compression phrase, a category phrase and a
+// trailing " text" or " data".
+// Returns the string released from a kstring_t via ks_release()
+// (presumably heap-allocated and owned by the caller -- confirm against
+// the kstring API).
+char *hts_format_description(const htsFormat *format)
+{
+ kstring_t str = { 0, 0, NULL };
+
+ // Format name
+ switch (format->format) {
+ case sam: kputs("SAM", &str); break;
+ case bam: kputs("BAM", &str); break;
+ case cram: kputs("CRAM", &str); break;
+ case vcf: kputs("VCF", &str); break;
+ case bcf:
+ if (format->version.major == 1) kputs("Legacy BCF", &str);
+ else kputs("BCF", &str);
+ break;
+ case bai: kputs("BAI", &str); break;
+ case crai: kputs("CRAI", &str); break;
+ case csi: kputs("CSI", &str); break;
+ case tbi: kputs("Tabix", &str); break;
+ default: kputs("unknown", &str); break;
+ }
+
+ // Version: negative major/minor values mean "not known" and are omitted
+ if (format->version.major >= 0) {
+ kputs(" version ", &str);
+ kputw(format->version.major, &str);
+ if (format->version.minor >= 0) {
+ kputc('.', &str);
+ kputw(format->version.minor, &str);
+ }
+ }
+
+ // Compression
+ switch (format->compression) {
+ case custom: kputs(" compressed", &str); break;
+ case gzip: kputs(" gzip-compressed", &str); break;
+ case bgzf:
+ switch (format->format) {
+ case bam:
+ case bcf:
+ case csi:
+ case tbi:
+ // These are by definition BGZF, so just use the generic term
+ kputs(" compressed", &str);
+ break;
+ default:
+ kputs(" BGZF-compressed", &str);
+ break;
+ }
+ break;
+ default: break;
+ }
+
+ // Category
+ switch (format->category) {
+ case sequence_data: kputs(" sequence", &str); break;
+ case variant_data: kputs(" variant calling", &str); break;
+ case index_file: kputs(" index", &str); break;
+ case region_list: kputs(" genomic region", &str); break;
+ default: break;
+ }
+
+ // Only uncompressed sam/crai/vcf/bed are described as " text";
+ // everything else is " data".
+ if (format->compression == no_compression)
+ switch (format->format) {
+ case sam:
+ case crai:
+ case vcf:
+ case bed:
+ kputs(" text", &str);
+ break;
+
+ default:
+ kputs(" data", &str);
+ break;
+ }
+ else
+ kputs(" data", &str);
+
+ return ks_release(&str);
+}
+
+/*
+ * Opens fn with the given mode and wraps the resulting hFILE in an htsFile.
+ *
+ * Only the part of mode before the first ',' is used. Any 'b' or 'c'
+ * format code in it is moved to the end of the mode string; when fmt is
+ * given and fmt->format is not unknown_format, that code is replaced by the
+ * one implied by fmt->format. If fmt->specific is set, its option list is
+ * applied to the opened file.
+ *
+ * Returns the opened htsFile, or NULL on failure (after printing a message
+ * to stderr when hts_verbose >= 2).
+ */
+htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
+{
+    // Up to 100 mode characters plus the three bytes appended below (the
+    // format code and two NULs): the last write can land on index 102.
+    char smode[104], *cp, *cp2, *mode_c;
+    htsFile *fp = NULL;
+    hFILE *hfile;
+    char fmt_code = '\0';
+
+    strncpy(smode, mode, 100);
+    smode[100]=0;
+    if ((cp = strchr(smode, ',')))
+        *cp = '\0';
+
+    // Migrate format code (b or c) to the end of the smode buffer.
+    for (cp2 = cp = smode; *cp; cp++) {
+        if (*cp == 'b')
+            fmt_code = 'b';
+        else if (*cp == 'c')
+            fmt_code = 'c';
+        else
+            *cp2++ = *cp;
+    }
+    mode_c = cp2;
+    *cp2++ = fmt_code;
+    *cp2++ = 0;
+    *cp2++ = 0;
+
+    // Set or reset the format code if opts->format is used
+    if (fmt && fmt->format != unknown_format)
+        *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format];
+
+    hfile = hopen(fn, smode);
+    if (hfile == NULL) goto error;
+
+    fp = hts_hopen(hfile, fn, smode);
+    if (fp == NULL) goto error;
+
+    if (fmt && fmt->specific)
+        if (hts_opt_apply(fp, fmt->specific) != 0) {
+            // fp has taken over hfile by now, so both must be released
+            // together through hts_close(); closing hfile alone would
+            // leak fp and everything hts_hopen() attached to it.
+            hts_close(fp);
+            hfile = NULL;
+            goto error;
+        }
+
+    return fp;
+
+error:
+    if (hts_verbose >= 2)
+        fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
+
+    if (hfile)
+        hclose_abruptly(hfile);
+
+    return NULL;
+}
+
+// Opens fn with the given mode; equivalent to hts_open_format() with no
+// htsFormat supplied.
+htsFile *hts_open(const char *fn, const char *mode) {
+ return hts_open_format(fn, mode, NULL);
+}
+
+/*
+ * Splits str at the first occurrence of delim (or at its terminating NUL).
+ * The part before the split is copied in lower case into buf, truncated to
+ * at most buflen-1 characters and always NUL-terminated; callers are
+ * expected to compare buf against known keywords no longer than buflen-2,
+ * so that a truncated prefix can never match one of them.
+ *
+ * Returns a pointer to the text following delim, or to the terminating
+ * '\0' of str if delim was not found.
+ */
+static const char *
+scan_keyword(const char *str, char delim, char *buf, size_t buflen)
+{
+    const char *p;
+    size_t used = 0;
+
+    for (p = str; *p != '\0' && *p != delim; p++)
+        if (used < buflen-1) buf[used++] = tolower_c(*p);
+
+    buf[used] = '\0';
+
+    if (*p == '\0') return p;
+    return p + 1;
+}
+
+/*
+ * Parses a single "key" or "key=value" argument and appends the resulting
+ * option to the end of the list *opts. A key without "=value" is treated
+ * as a boolean with the value "1". Keys are recognised in all-lower-case
+ * or all-upper-case spelling only.
+ *
+ * Returns 0 on success;
+ *        -1 on failure (NULL argument, out of memory, or unknown key, the
+ *           last of which is also reported on stderr).
+ */
+int hts_opt_add(hts_opt **opts, const char *c_arg) {
+    // How the text after '=' is converted for each option.
+    enum { VAL_ATOI, VAL_STRTOL, VAL_STRING };
+    static const struct {
+        const char *lower, *upper;
+        enum hts_fmt_option opt;
+        int conv;
+    } known[] = {
+        { "decode_md", "DECODE_MD", CRAM_OPT_DECODE_MD, VAL_ATOI },
+        { "verbosity", "VERBOSITY", CRAM_OPT_VERBOSITY, VAL_ATOI },
+        { "seqs_per_slice", "SEQS_PER_SLICE",
+          CRAM_OPT_SEQS_PER_SLICE, VAL_ATOI },
+        { "slices_per_container", "SLICES_PER_CONTAINER",
+          CRAM_OPT_SLICES_PER_CONTAINER, VAL_ATOI },
+        { "embed_ref", "EMBED_REF", CRAM_OPT_EMBED_REF, VAL_ATOI },
+        { "no_ref", "NO_REF", CRAM_OPT_NO_REF, VAL_ATOI },
+        { "ignore_md5", "IGNORE_MD5", CRAM_OPT_IGNORE_MD5, VAL_ATOI },
+        { "use_bzip2", "USE_BZIP2", CRAM_OPT_USE_BZIP2, VAL_ATOI },
+        { "use_rans", "USE_RANS", CRAM_OPT_USE_RANS, VAL_ATOI },
+        { "use_lzma", "USE_LZMA", CRAM_OPT_USE_LZMA, VAL_ATOI },
+        { "reference", "REFERENCE", CRAM_OPT_REFERENCE, VAL_STRING },
+        { "version", "VERSION", CRAM_OPT_VERSION, VAL_STRING },
+        { "multi_seq_per_slice", "MULTI_SEQ_PER_SLICE",
+          CRAM_OPT_MULTI_SEQ_PER_SLICE, VAL_ATOI },
+        { "nthreads", "NTHREADS", HTS_OPT_NTHREADS, VAL_ATOI },
+        { "required_fields", "REQUIRED_FIELDS",
+          CRAM_OPT_REQUIRED_FIELDS, VAL_STRTOL },
+    };
+    const size_t n_known = sizeof known / sizeof known[0];
+    hts_opt *o, *tail;
+    char *val;
+    size_t i;
+
+    if (!c_arg)
+        return -1;
+
+    o = malloc(sizeof(*o));
+    if (!o)
+        return -1;
+
+    o->arg = strdup(c_arg);
+    if (!o->arg) {
+        free(o);
+        return -1;
+    }
+
+    // Split "key=value" in place; string values point into o->arg.
+    val = strchr(o->arg, '=');
+    if (val)
+        *val++ = '\0';
+    else
+        val = "1"; // assume boolean
+
+    for (i = 0; i < n_known; i++)
+        if (strcmp(o->arg, known[i].lower) == 0 ||
+            strcmp(o->arg, known[i].upper) == 0)
+            break;
+
+    if (i == n_known) {
+        fprintf(stderr, "Unknown option '%s'\n", o->arg);
+        free(o->arg);
+        free(o);
+        return -1;
+    }
+
+    o->opt = known[i].opt;
+    switch (known[i].conv) {
+    case VAL_STRING: o->val.s = val; break;
+    case VAL_STRTOL: o->val.i = strtol(val, NULL, 0); break;
+    default:         o->val.i = atoi(val); break;
+    }
+
+    o->next = NULL;
+
+    // Append; assumes small list.
+    if (*opts == NULL) {
+        *opts = o;
+    } else {
+        for (tail = *opts; tail->next; tail = tail->next)
+            ;
+        tail->next = o;
+    }
+
+    return 0;
+}
+
+/*
+ * Walks the option list opts and applies each entry to fp with
+ * hts_set_opt(), stopping at the first one that fails.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int hts_opt_apply(htsFile *fp, hts_opt *opts) {
+    hts_opt *o;
+
+    for (o = opts; o != NULL; o = o->next)
+        if (hts_set_opt(fp, o->opt, o->val) != 0)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Frees every node of an hts_opt list together with its arg string.
+ * A NULL list is accepted.
+ */
+void hts_opt_free(hts_opt *opts) {
+    while (opts != NULL) {
+        hts_opt *next = opts->next;
+        free(opts->arg);
+        free(opts);
+        opts = next;
+    }
+}
+
+
+/*
+ * Tokenises str as (key(=value)?,)*(key(=value)?)? and adds each token to
+ * the option list held in fmt->specific via hts_opt_add().
+ * NB: No provision for ',' appearing in the value!
+ * Add backslashing rules?
+ * Tokens longer than 8000 characters are truncated to 8000.
+ *
+ * This could be used as part of a general command line option parser or
+ * as a string concatenated onto the file open mode.
+ *
+ * Returns 0 on success
+ *        -1 on failure.
+ */
+int hts_parse_opt_list(htsFormat *fmt, const char *str) {
+    while (str && *str) {
+        char arg[8001];
+        const char *tok;
+        int len, ncopy;
+
+        // Step over any commas in front of the token.
+        while (*str == ',')
+            str++;
+
+        // The token runs up to the next comma or the end of the string.
+        tok = str;
+        while (*str && *str != ',')
+            str++;
+        len = str - tok;
+
+        // Produce a nul terminated copy of the option
+        ncopy = len < 8000 ? len : 8000;
+        strncpy(arg, tok, ncopy);
+        arg[ncopy] = '\0';
+
+        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
+            return -1;
+
+        if (*str)
+            str++;
+    }
+
+    return 0;
+}
+
+/*
+ * Accepts a string file format (sam, bam, cram, vcf, bcf) optionally
+ * followed by a comma separated list of key=value options and splits
+ * these up into the fields of htsFormat struct.
+ *
+ * format is assumed to be already initialised, either to blank
+ * "unknown" values or via previous hts_opt_add calls.
+ *
+ * Returns 0 on success
+ *        -1 on failure (unrecognised format name or bad option).
+ */
+int hts_parse_format(htsFormat *format, const char *str) {
+    char fmt[8];
+    const char *opts = scan_keyword(str, ',', fmt, sizeof fmt);
+    enum htsExactFormat kind;
+
+    format->version.minor = 0; // unknown
+    format->version.major = 0; // unknown
+
+    if      (strcmp(fmt, "sam")  == 0) kind = sam;
+    else if (strcmp(fmt, "bam")  == 0) kind = bam;
+    else if (strcmp(fmt, "cram") == 0) kind = cram;
+    else if (strcmp(fmt, "vcf")  == 0) kind = vcf;
+    else if (strcmp(fmt, "bcf")  == 0) kind = bcf;
+    else return -1;
+
+    format->format = kind;
+    format->category = (kind == vcf || kind == bcf)? variant_data
+                                                   : sequence_data;
+
+    // Default compression for each format.
+    switch (kind) {
+    case sam:
+    case vcf:
+        format->compression = no_compression;
+        format->compression_level = 0;
+        break;
+    case cram:
+        format->compression = custom;
+        format->compression_level = -1;
+        break;
+    default: // bam, bcf
+        format->compression = bgzf;
+        format->compression_level = -1;
+        break;
+    }
+
+    return hts_parse_opt_list(format, opts);
+}
+
+
+/*
+ * Parses opts, a comma-separated list of key(=value)? options as accepted
+ * by hts_parse_opt_list(), and applies each option to fp.
+ *
+ * The temporary option list is freed on every path, including when parsing
+ * stops part-way through at a bad option.
+ *
+ * Returns 0 on success
+ *        -1 on failure.
+ */
+static int hts_process_opts(htsFile *fp, const char *opts) {
+    htsFormat fmt;
+    int ret = -1;
+
+    fmt.specific = NULL;
+
+    // A failed parse may already have added earlier options to
+    // fmt.specific, so it has to go through the common cleanup too.
+    if (hts_parse_opt_list(&fmt, opts) != 0)
+        goto done;
+
+    if (hts_opt_apply(fp, fmt.specific) != 0)
+        goto done;
+
+    ret = 0;
+
+done:
+    hts_opt_free(fmt.specific);
+    return ret;
+}
+
+
+/*
+ * Wraps an already-open hFILE in an htsFile.
+ *
+ * mode is "simple_mode[,opts]". simple_mode must contain 'r', or 'w'/'a':
+ *  - for 'r' the format is detected from the stream contents;
+ *  - for 'w'/'a' the format comes from 'b' (binary) or 'c' (CRAM),
+ *    defaulting to text, and the compression from 'z', 'g' or 'u',
+ *    defaulting to whatever suits the format.
+ * Any opts after the comma are applied once the file is open; a failure
+ * to apply them is not reported to the caller.
+ *
+ * Returns the new htsFile, or NULL on failure (after printing a message to
+ * stderr when hts_verbose >= 2). hfile is not closed on failure.
+ */
+htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode)
+{
+    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
+    char simple_mode[101], *cp, *opts;
+    simple_mode[100] = '\0';
+
+    if (fp == NULL) goto error;
+
+    fp->fn = strdup(fn);
+    if (fp->fn == NULL) goto error;
+    fp->is_be = ed_is_big();
+
+    // Split mode into simple_mode,opts strings
+    if ((cp = strchr(mode, ','))) {
+        // Clamp the prefix length so that an over-long mode can neither be
+        // copied nor terminated beyond the end of simple_mode.
+        size_t prefix_len = cp-mode <= 100 ? (size_t)(cp-mode) : 100;
+        strncpy(simple_mode, mode, prefix_len);
+        simple_mode[prefix_len] = '\0';
+        opts = cp+1;
+    } else {
+        strncpy(simple_mode, mode, 100);
+        opts = NULL;
+    }
+
+    if (strchr(simple_mode, 'r')) {
+        if (hts_detect_format(hfile, &fp->format) < 0) goto error;
+    }
+    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
+        htsFormat *fmt = &fp->format;
+        fp->is_write = 1;
+
+        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
+        else if (strchr(simple_mode, 'c')) fmt->format = cram;
+        else fmt->format = text_format;
+
+        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
+        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
+        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
+        else {
+            // No compression mode specified, set to the default for the format
+            switch (fmt->format) {
+            case binary_format: fmt->compression = bgzf; break;
+            case cram: fmt->compression = custom; break;
+            case text_format: fmt->compression = no_compression; break;
+            default: abort();
+            }
+        }
+
+        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
+        fmt->category = format_category(fmt->format);
+
+        fmt->version.major = fmt->version.minor = -1;
+        fmt->compression_level = -1;
+        fmt->specific = NULL;
+    }
+    else goto error;
+
+    // Attach the reader/writer that matches the format.
+    switch (fp->format.format) {
+    case binary_format:
+    case bam:
+    case bcf:
+        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
+        if (fp->fp.bgzf == NULL) goto error;
+        fp->is_bin = 1;
+        break;
+
+    case cram:
+        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
+        if (fp->fp.cram == NULL) goto error;
+        if (!fp->is_write)
+            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
+        fp->is_cram = 1;
+        break;
+
+    case text_format:
+    case sam:
+    case vcf:
+        if (!fp->is_write) {
+            // Text is read through a kstream_t layered on the stream.
+#if KS_BGZF
+            BGZF *gzfp = bgzf_hopen(hfile, simple_mode);
+#else
+            // TODO Implement gzip hFILE adaptor
+            hclose(hfile); // This won't work, especially for stdin
+            gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb");
+#endif
+            if (gzfp) fp->fp.voidp = ks_init(gzfp);
+            else goto error;
+        }
+        else if (fp->format.compression != no_compression) {
+            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
+            if (fp->fp.bgzf == NULL) goto error;
+        }
+        else
+            fp->fp.hfile = hfile;
+        break;
+
+    default:
+        goto error;
+    }
+
+    if (opts)
+        hts_process_opts(fp, opts);
+
+    return fp;
+
+error:
+    if (hts_verbose >= 2)
+        fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
+
+    if (fp) {
+        free(fp->fn);
+        free(fp->fn_aux);
+        free(fp);
+    }
+    return NULL;
+}
+
+// Closes the file behind fp using the close routine that matches how
+// hts_hopen() opened it, then frees fp and the strings it owns.
+// Returns the result of that close routine, or -1 for a format this
+// function does not handle. errno as left by the close is preserved across
+// the frees.
+int hts_close(htsFile *fp)
+{
+ int ret, save;
+
+ switch (fp->format.format) {
+ case binary_format:
+ case bam:
+ case bcf:
+ ret = bgzf_close(fp->fp.bgzf);
+ break;
+
+ case cram:
+ if (!fp->is_write) {
+ // Warn about a missing EOF marker when reading
+ switch (cram_eof(fp->fp.cram)) {
+ case 2:
+ fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
+ break;
+ case 0: /* not at EOF, but may not have wanted all seqs */
+ default: /* case 1, expected EOF */
+ break;
+ }
+ }
+ ret = cram_close(fp->fp.cram);
+ break;
+
+ case text_format:
+ case sam:
+ case vcf:
+ if (!fp->is_write) {
+ // Text opened for reading: fp->fp.voidp is a kstream_t wrapping the
+ // underlying handle; close the handle, then destroy the kstream.
+ #if KS_BGZF
+ BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f;
+ ret = bgzf_close(gzfp);
+ #else
+ gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f;
+ ret = gzclose(gzfp);
+ #endif
+ ks_destroy((kstream_t*)fp->fp.voidp);
+ }
+ else if (fp->format.compression != no_compression)
+ ret = bgzf_close(fp->fp.bgzf);
+ else
+ ret = hclose(fp->fp.hfile);
+ break;
+
+ default:
+ ret = -1;
+ break;
+ }
+
+ // Keep the errno from the close above intact across the free() calls
+ save = errno;
+ free(fp->fn);
+ free(fp->fn_aux);
+ free(fp->line.s);
+ free(fp);
+ errno = save;
+ return ret;
+}
+
+// Returns a pointer to the format record stored inside fp, or NULL when fp
+// itself is NULL.
+const htsFormat *hts_get_format(htsFile *fp)
+{
+    if (fp == NULL) return NULL;
+    return &fp->format;
+}
+
+// Returns the conventional file extension (without a leading dot) for
+// format->format, or "?" when format is NULL or has no known extension.
+const char *hts_format_file_extension(const htsFormat *format) {
+    const char *ext = "?";
+
+    if (format != NULL) {
+        switch (format->format) {
+        case sam:  ext = "sam";  break;
+        case bam:  ext = "bam";  break;
+        case bai:  ext = "bai";  break;
+        case cram: ext = "cram"; break;
+        case crai: ext = "crai"; break;
+        case vcf:  ext = "vcf";  break;
+        case bcf:  ext = "bcf";  break;
+        case csi:  ext = "csi";  break;
+        case gzi:  ext = "gzi";  break;
+        case tbi:  ext = "tbi";  break;
+        case bed:  ext = "bed";  break;
+        default:   break;
+        }
+    }
+
+    return ext;
+}
+
+// Sets option opt on fp; the option's value is the variadic argument.
+// HTS_OPT_NTHREADS (value read as an int) is routed to hts_set_threads().
+// Every other option is forwarded to cram_set_voption() for CRAM files and
+// silently accepted (returning 0) for all other formats.
+// Returns the result of the routine the option was routed to.
+int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
+ int r;
+ va_list args;
+
+ if (opt == HTS_OPT_NTHREADS) {
+ va_start(args, opt);
+ int nthreads = va_arg(args, int);
+ va_end(args);
+ return hts_set_threads(fp, nthreads);
+ }
+
+ // All remaining options only apply to CRAM
+ if (fp->format.format != cram)
+ return 0;
+
+ va_start(args, opt);
+ r = cram_set_voption(fp->fp.cram, opt, args);
+ va_end(args);
+
+ return r;
+}
+
+// Asks for n worker threads to be used for fp where that is supported:
+// BGZF-compressed files go through bgzf_mt(), CRAM files through the
+// CRAM_OPT_NTHREADS option, and anything else is left untouched.
+// Returns the result of bgzf_mt() or hts_set_opt(), or 0 when there is
+// nothing to do.
+int hts_set_threads(htsFile *fp, int n)
+{
+    if (fp->format.compression == bgzf) {
+        // Text files opened for reading keep a kstream_t in fp->fp (see
+        // hts_hopen()), so the BGZF handle has to be fetched from inside
+        // it; binary files and files opened for writing hold the BGZF
+        // handle in fp->fp.bgzf directly.
+        BGZF *bgzfp = (fp->is_bin || fp->is_write)
+            ? fp->fp.bgzf
+            : ((kstream_t*)fp->fp.voidp)->f;
+        return bgzf_mt(bgzfp, n, 256);
+    } else if (fp->format.format == cram) {
+        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
+    }
+    else return 0;
+}
+
+// Replaces fp->fn_aux with a copy of fn_aux (or NULL when fn_aux is NULL)
+// and, for CRAM files, passes the new value on as the CRAM_OPT_REFERENCE
+// option.
+// Returns 0 on success, -1 if the copy cannot be allocated or the CRAM
+// option cannot be set.
+int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
+{
+    free(fp->fn_aux);
+    fp->fn_aux = NULL;
+
+    if (fn_aux != NULL) {
+        fp->fn_aux = strdup(fn_aux);
+        if (fp->fn_aux == NULL) return -1;
+    }
+
+    if (fp->format.format != cram) return 0;
+
+    return cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux)? -1 : 0;
+}
+
+// For VCF/BCF backward sweeper. Not exposing these functions because their
+// future is uncertain. Things will probably have to change with hFILE...
+// Returns the BGZF handle behind fp: fp->fp.bgzf for files flagged is_bin,
+// otherwise the handle wrapped by the kstream_t stored in fp->fp.voidp.
+BGZF *hts_get_bgzfp(htsFile *fp)
+{
+    return fp->is_bin ? fp->fp.bgzf : ((kstream_t*)fp->fp.voidp)->f;
+}
+// Seeks fp to uoffset with bgzf_useek(). For files not flagged is_bin the
+// kstream_t is rewound first and its seek_pos set to uoffset.
+// Returns the result of bgzf_useek().
+int hts_useek(htsFile *fp, long uoffset, int where)
+{
+    kstream_t *ks;
+
+    if ( fp->is_bin )
+        return bgzf_useek(fp->fp.bgzf, uoffset, where);
+
+    ks = (kstream_t*)fp->fp.voidp;
+    ks_rewind(ks);
+    ks->seek_pos = uoffset;
+    return bgzf_useek(ks->f, uoffset, where);
+}
+// Reports the current position of fp: the kstream_t's seek_pos for files
+// not flagged is_bin, otherwise the result of bgzf_utell().
+long hts_utell(htsFile *fp)
+{
+    if ( !fp->is_bin )
+        return ((kstream_t*)fp->fp.voidp)->seek_pos;
+
+    return bgzf_utell(fp->fp.bgzf);
+}
+
+// Reads from fp's kstream_t up to the next delimiter into str using
+// ks_getuntil(), increments fp->lineno, and returns ks_getuntil()'s result.
+int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
+{
+    int dret;
+    kstream_t *ks = (kstream_t*)fp->fp.voidp;
+    int ret = ks_getuntil(ks, delimiter, str, &dret);
+
+    fp->lineno++;
+    return ret;
+}
+
+// Builds an array of separately allocated strings.
+//   is_file != 0: string names a file; each non-empty line becomes one entry.
+//   is_file == 0: string itself is split at commas; empty fields are kept
+//                 as empty strings.
+// *_n receives the number of entries. Returns the array, or NULL if the
+// file cannot be opened.
+// NOTE(review): the strdup/calloc/realloc results below are not checked.
+char **hts_readlist(const char *string, int is_file, int *_n)
+{
+ int m = 0, n = 0, dret;
+ char **s = 0;
+ if ( is_file )
+ {
+#if KS_BGZF
+ BGZF *fp = bgzf_open(string, "r");
+#else
+ gzFile fp = gzopen(string, "r");
+#endif
+ if ( !fp ) return NULL;
+
+ kstream_t *ks;
+ kstring_t str;
+ str.s = 0; str.l = str.m = 0;
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0)
+ {
+ if (str.l == 0) continue; // skip empty lines
+ n++;
+ hts_expand(char*,n,m,s);
+ s[n-1] = strdup(str.s);
+ }
+ ks_destroy(ks);
+#if KS_BGZF
+ bgzf_close(fp);
+#else
+ gzclose(fp);
+#endif
+ free(str.s);
+ }
+ else
+ {
+ // q marks the start of the current field, p scans for its end
+ const char *q = string, *p = string;
+ while ( 1 )
+ {
+ if (*p == ',' || *p == 0)
+ {
+ n++;
+ hts_expand(char*,n,m,s);
+ // calloc zero-fills, so the copy below ends up NUL-terminated
+ s[n-1] = (char*)calloc(p - q + 1, 1);
+ strncpy(s[n-1], q, p - q);
+ q = p + 1;
+ }
+ if ( !*p ) break;
+ p++;
+ }
+ }
+ // Resize the array to exactly n entries
+ s = (char**)realloc(s, n * sizeof(char*));
+ *_n = n;
+ return s;
+}
+
+// Builds an array of separately allocated strings from fn.
+//   If fn can be opened as a file, each non-empty line becomes one entry.
+//   Otherwise, if fn starts with ':', the text after the colon is split at
+//   commas (empty fields are kept as empty strings).
+//   Otherwise returns 0 and leaves *_n untouched.
+// On success *_n receives the number of entries.
+// NOTE(review): the strdup/calloc/realloc results below are not checked.
+char **hts_readlines(const char *fn, int *_n)
+{
+ int m = 0, n = 0, dret;
+ char **s = 0;
+#if KS_BGZF
+ BGZF *fp = bgzf_open(fn, "r");
+#else
+ gzFile fp = gzopen(fn, "r");
+#endif
+ if ( fp ) { // read from file
+ kstream_t *ks;
+ kstring_t str;
+ str.s = 0; str.l = str.m = 0;
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
+ if (str.l == 0) continue; // skip empty lines
+ if (m == n) {
+ // grow the array: 16 entries at first, then doubling
+ m = m? m<<1 : 16;
+ s = (char**)realloc(s, m * sizeof(char*));
+ }
+ s[n++] = strdup(str.s);
+ }
+ ks_destroy(ks);
+ #if KS_BGZF
+ bgzf_close(fp);
+ #else
+ gzclose(fp);
+ #endif
+ s = (char**)realloc(s, n * sizeof(char*));
+ free(str.s);
+ } else if (*fn == ':') { // read from string
+ // q marks the start of the current field, p scans for its end
+ const char *q, *p;
+ for (q = p = fn + 1;; ++p)
+ if (*p == ',' || *p == 0) {
+ if (m == n) {
+ m = m? m<<1 : 16;
+ s = (char**)realloc(s, m * sizeof(char*));
+ }
+ // calloc zero-fills, so the copy below ends up NUL-terminated
+ s[n] = (char*)calloc(p - q + 1, 1);
+ strncpy(s[n++], q, p - q);
+ q = p + 1;
+ if (*p == 0) break;
+ }
+ } else return 0;
+ // Resize the array to exactly n entries
+ s = (char**)realloc(s, n * sizeof(char*));
+ *_n = n;
+ return s;
+}
+
+// DEPRECATED: To be removed in a future HTSlib release
+//
+// Classifies fname as one of the FT_* file types. The file name suffix
+// (.vcf.gz, .vcf, .bcf, compared case-insensitively) and the special name
+// "-" are tried first; otherwise the file is opened and its contents are
+// examined with hts_detect_format().
+// Returns the FT_* code, or 0 if the file cannot be opened or is neither
+// VCF nor BCF.
+int hts_file_type(const char *fname)
+{
+    size_t len = strlen(fname);
+
+    // Only compare a suffix when the name is long enough to contain it;
+    // otherwise fname+len-N would point before the start of the string.
+    if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
+    if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
+    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
+    if ( !strcmp("-",fname) ) return FT_STDIN;
+
+    hFILE *f = hopen(fname, "r");
+    if (f == NULL) return 0;
+
+    htsFormat fmt;
+    if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
+    if (hclose(f) < 0) return 0;
+
+    switch (fmt.format) {
+    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
+    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
+    default: return 0;
+    }
+}
+
+/****************
+ *** Indexing ***
+ ****************/
+
+// Threshold used by compress_binning(): a bin whose chunks span less than
+// this (measured on the offsets shifted right by 16) is merged into its
+// parent bin.
+#define HTS_MIN_MARKER_DIST 0x10000
+
+// Finds the special meta bin
+// ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
+#define META_BIN(idx) ((idx)->n_bins + 1)
+
+// Orders hts_pair64_t chunks by their u member (the chunk's start offset).
+#define pair64_lt(a,b) ((a).u < (b).u)
+
+#include "htslib/ksort.h"
+// Instantiates the ksort routines for chunks under the name _off, as used
+// by ks_introsort(_off, ...) in compress_binning().
+KSORT_INIT(_off, hts_pair64_t, pair64_lt)
+
+// Chunk list of one bin.
+typedef struct {
+ int32_t m, n;        // m: allocated entries in list; n: entries in use
+ uint64_t loff;       // filled in by update_loff() from the linear index
+ hts_pair64_t *list;  // chunks as (u = begin offset, v = end offset) pairs
+} bins_t;
+
+#include "htslib/khash.h"
+// Hash table from bin number to its bins_t; one table per reference.
+KHASH_MAP_INIT_INT(bin, bins_t)
+typedef khash_t(bin) bidx_t;
+
+// Linear index of one reference.
+typedef struct {
+ int32_t n, m;        // n: entries in use; m: allocated entries in offset
+ uint64_t *offset;    // one offset per window; (uint64_t)-1 means "not set"
+} lidx_t;
+
+// In-memory index: a binning index and a linear index per reference, plus
+// the running state used while the index is being built.
+struct __hts_idx_t {
+ // fmt, min_shift and n_lvls are stored as passed to hts_idx_init();
+ // n_bins is derived there as ((1<<(3 * n_lvls + 3)) - 1) / 7.
+ int fmt, min_shift, n_lvls, n_bins;
+ uint32_t l_meta;     // presumably the length of meta -- confirm
+ int32_t n, m;        // both set to the reference count given to hts_idx_init()
+ uint64_t n_no_coor;
+ bidx_t **bidx;       // per-reference binning index (array of n pointers)
+ lidx_t *lidx;        // per-reference linear index (array of n entries)
+ uint8_t *meta;
+ struct {
+ uint32_t last_bin, save_bin;
+ int last_coor, last_tid, save_tid, finished;
+ uint64_t last_off, save_off;
+ uint64_t off_beg, off_end;
+ uint64_t n_mapped, n_unmapped;
+ } z; // keep internal states
+};
+
+// Appends the chunk (beg, end) to the list of bin `bin` in b, creating the
+// bin if it is not in the table yet.
+// NOTE(review): the kh_put/calloc/realloc results are not checked.
+static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
+{
+ khint_t k;
+ bins_t *l;
+ int absent;
+ k = kh_put(bin, b, bin, &absent);
+ l = &kh_value(b, k);
+ if (absent) {
+ // new bin: start with room for a single chunk
+ l->m = 1; l->n = 0;
+ l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
+ }
+ if (l->n == l->m) {
+ // list is full: double its capacity
+ l->m <<= 1;
+ l->list = (hts_pair64_t*)realloc(l->list, l->m * sizeof(hts_pair64_t));
+ }
+ l->list[l->n].u = beg;
+ l->list[l->n++].v = end;
+}
+
+// Records offset in the linear index l for every window touched by the
+// interval [_beg, _end), where a window covers 1<<min_shift positions.
+// Only windows that are still unset ((uint64_t)-1) are written, so each
+// window keeps the first offset recorded for it.
+// NOTE(review): the realloc result is not checked.
+static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
+{
+ int i, beg, end;
+ beg = _beg >> min_shift;
+ end = (_end - 1) >> min_shift;
+ if (l->m < end + 1) {
+ // grow the array to cover window `end`, rounding the size up with kroundup32
+ int old_m = l->m;
+ l->m = end + 1;
+ kroundup32(l->m);
+ l->offset = (uint64_t*)realloc(l->offset, l->m * sizeof(uint64_t));
+ memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1
+ }
+ if (beg == end) { // to save a loop in this case
+ if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset;
+ } else {
+ for (i = beg; i <= end; ++i)
+ if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
+ }
+ if (l->n < end + 1) l->n = end + 1;
+}
+
+// Allocates and initialises an empty index.
+//   n          number of references to pre-allocate index slots for
+//              (0 allocates none)
+//   fmt, min_shift, n_lvls
+//              stored in the index as given; n_lvls also determines n_bins
+//   offset0    initial value for all the saved/last/begin/end file offsets
+// Returns the new index, or NULL if memory cannot be allocated.
+hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
+{
+    hts_idx_t *idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
+    if (idx == NULL) return NULL;
+
+    idx->fmt = fmt;
+    idx->min_shift = min_shift;
+    idx->n_lvls = n_lvls;
+    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
+
+    // Mark the "last" and "saved" bin/tid/coordinate as not yet seen.
+    idx->z.last_bin = idx->z.save_bin = 0xffffffffu;
+    idx->z.last_tid = idx->z.save_tid = 0xffffffffu;
+    idx->z.last_coor = 0xffffffffu;
+
+    idx->z.off_end = offset0;
+    idx->z.off_beg = offset0;
+    idx->z.last_off = offset0;
+    idx->z.save_off = offset0;
+
+    if (n == 0) return idx;
+
+    idx->n = idx->m = n;
+    idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
+    if (idx->bidx == NULL) {
+        free(idx);
+        return NULL;
+    }
+    idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
+    if (idx->lidx == NULL) {
+        free(idx->bidx);
+        free(idx);
+        return NULL;
+    }
+
+    return idx;
+}
+
+// Finalises the linear index of reference i and copies it into the bins:
+//  1. fills unset ((uint64_t)-1) entries of the linear index;
+//  2. sets each bin's loff from the linear index entry of the bin's
+//     bottom-level position (hts_bin_bot), or to 0 when that is not
+//     available;
+//  3. if free_lidx is non-zero, frees the linear index afterwards.
+static void update_loff(hts_idx_t *idx, int i, int free_lidx)
+{
+ bidx_t *bidx = idx->bidx[i];
+ lidx_t *lidx = &idx->lidx[i];
+ khint_t k;
+ int l;
+ uint64_t offset0 = 0;
+ if (bidx) {
+ // Leading unset entries take the start offset of the meta bin's first
+ // chunk if there is a meta bin, else 0.
+ k = kh_get(bin, bidx, META_BIN(idx));
+ if (k != kh_end(bidx))
+ offset0 = kh_val(bidx, k).list[0].u;
+ for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
+ lidx->offset[l] = offset0;
+ } else l = 1; // start at 1 so that offset[l-1] below is a valid index
+ for (; l < lidx->n; ++l) // fill missing values
+ if (lidx->offset[l] == (uint64_t)-1)
+ lidx->offset[l] = lidx->offset[l-1];
+ if (bidx == 0) return;
+ for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
+ if (kh_exist(bidx, k))
+ {
+ if ( kh_key(bidx, k) < idx->n_bins )
+ {
+ int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
+ // disable linear index if bot_bin out of bounds
+ kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
+ }
+ else
+ // bins numbered n_bins or above (such as the meta bin) get loff 0
+ kh_val(bidx, k).loff = 0;
+ }
+ if (free_lidx) {
+ free(lidx->offset);
+ lidx->m = lidx->n = 0;
+ lidx->offset = 0;
+ }
+}
+
+// Compacts the binning index of reference i in two passes:
+//  1. working from the deepest level upwards, a bin whose chunks span less
+//     than HTS_MIN_MARKER_DIST (on offsets shifted right by 16) is appended
+//     to its parent bin and deleted, provided the parent already exists;
+//  2. within every remaining bin, chunks that overlap or touch (again
+//     compared on offsets shifted right by 16) are merged into one.
+// Bins numbered n_bins or above (such as the meta bin) are left alone.
+// NOTE(review): the realloc result is not checked.
+static void compress_binning(hts_idx_t *idx, int i)
+{
+ bidx_t *bidx = idx->bidx[i];
+ khint_t k;
+ int l, m;
+ if (bidx == 0) return;
+ // merge a bin to its parent if the bin is too small
+ for (l = idx->n_lvls; l > 0; --l) {
+ unsigned start = hts_bin_first(l);
+ for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
+ bins_t *p, *q;
+ if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
+ p = &kh_value(bidx, k);
+ // bins above the deepest level may have received children's chunks
+ // in an earlier iteration, so they are sorted before being measured
+ if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
+ if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
+ khint_t kp;
+ kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
+ if (kp == kh_end(bidx)) continue; // no parent bin: keep this bin
+ q = &kh_val(bidx, kp);
+ if (q->n + p->n > q->m) {
+ q->m = q->n + p->n;
+ kroundup32(q->m);
+ q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
+ }
+ memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
+ q->n += p->n;
+ free(p->list);
+ kh_del(bin, bidx, k);
+ }
+ }
+ }
+ // bin 0 is never visited by the loop above, so sort it here
+ k = kh_get(bin, bidx, 0);
+ if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
+ // merge adjacent chunks that start from the same BGZF block
+ for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
+ bins_t *p;
+ if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
+ p = &kh_value(bidx, k);
+ // m indexes the last chunk kept so far, l the chunk being examined
+ for (l = 1, m = 0; l < p->n; ++l) {
+ if (p->list[m].v>>16 >= p->list[l].u>>16) {
+ if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
+ } else p->list[++m] = p->list[l];
+ }
+ p->n = m + 1;
+ }
+}
+
+/* Complete an index that was built with hts_idx_push().
+ *
+ * Flushes the pending chunk and the per-reference meta records (file range
+ * and mapped/unmapped counts) for the last reference seen, then runs
+ * update_loff() and compress_binning() over every reference.  final_offset
+ * is used as the end offset of the last chunk.  Calling it on a NULL or an
+ * already finished index does nothing.
+ */
+void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
+{
+    int tid, drop_linear;
+
+    if (idx == NULL) return;
+    if (idx->z.finished) return;   // never run twice on the same index
+
+    if (idx->z.save_tid >= 0) {
+        bidx_t *last = idx->bidx[idx->z.save_tid];
+        insert_to_b(last, idx->z.save_bin, idx->z.save_off, final_offset);
+        insert_to_b(last, META_BIN(idx), idx->z.off_beg, final_offset);
+        insert_to_b(last, META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
+    }
+
+    // the linear index is released once loff has been derived for CSI
+    drop_linear = (idx->fmt == HTS_FMT_CSI);
+    for (tid = 0; tid < idx->n; ++tid) {
+        update_loff(idx, tid, drop_linear);
+        compress_binning(idx, tid);
+    }
+    idx->z.finished = 1;
+}
+
+/* Feed one record into an index under construction.
+ *
+ * tid/beg/end give the record's reference and interval (tid < 0 means the
+ * record has no coordinate), offset is the file offset just after the record
+ * and is_mapped selects which per-reference counter is incremented.
+ * Records must arrive sorted: each reference in one contiguous run with
+ * non-decreasing start positions, and coordinate-less records last.
+ *
+ * Returns 0 on success (also when the index is already finished, in which
+ * case nothing is recorded) and -1 on an ordering violation or when memory
+ * cannot be allocated.
+ */
+int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
+{
+    int bin;
+    if (tid<0) beg = -1, end = 0;
+    if (tid >= idx->m) { // enlarge the index
+        int32_t oldm = idx->m;
+        int32_t newm = oldm? oldm<<1 : 2;
+        bidx_t **new_bidx;
+        lidx_t *new_lidx;
+        // doubling once is not enough when tid jumps ahead; always cover tid
+        if (newm <= tid) newm = tid + 1;
+        new_bidx = (bidx_t**)realloc(idx->bidx, newm * sizeof(bidx_t*));
+        if (new_bidx == NULL) return -1;
+        idx->bidx = new_bidx;
+        new_lidx = (lidx_t*) realloc(idx->lidx, newm * sizeof(lidx_t));
+        if (new_lidx == NULL) return -1; // idx->m is unchanged, so the index stays consistent
+        idx->lidx = new_lidx;
+        memset(&idx->bidx[oldm], 0, (newm - oldm) * sizeof(bidx_t*));
+        memset(&idx->lidx[oldm], 0, (newm - oldm) * sizeof(lidx_t));
+        idx->m = newm;
+    }
+    if (idx->n < tid + 1) idx->n = tid + 1;
+    if (idx->z.finished) return 0;
+    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
+        if ( tid>=0 && idx->n_no_coor )
+        {
+            if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid);
+            return -1;
+        }
+        if (tid>=0 && idx->bidx[tid] != 0)
+        {
+            if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__);
+            return -1;
+        }
+        idx->z.last_tid = tid;
+        idx->z.last_bin = 0xffffffffu;
+    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
+        if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__);
+        return -1;
+    }
+    if ( tid>=0 )
+    {
+        if (idx->bidx[tid] == 0) {
+            idx->bidx[tid] = kh_init(bin);
+            if (idx->bidx[tid] == 0) return -1; // out of memory
+        }
+        if ( is_mapped)
+            insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record
+    }
+    else idx->n_no_coor++;
+    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
+    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
+        if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+            insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off);
+        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
+            idx->z.off_end = idx->z.last_off;
+            insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end);
+            insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
+            idx->z.n_mapped = idx->z.n_unmapped = 0;
+            idx->z.off_beg = idx->z.off_end;
+        }
+        idx->z.save_off = idx->z.last_off;
+        idx->z.save_bin = idx->z.last_bin = bin;
+        idx->z.save_tid = tid;
+    }
+    if (is_mapped) ++idx->z.n_mapped;
+    else ++idx->z.n_unmapped;
+    idx->z.last_off = offset;
+    idx->z.last_coor = beg;
+    return 0;
+}
+
+/* Release an index and everything it owns.  NULL is accepted and ignored.
+ *
+ * An index whose fmt is HTS_FMT_CRAI is really an hts_cram_idx_t (see sam.c):
+ * its CRAM index is released with cram_index_free() and only the small
+ * wrapper struct is freed here.
+ */
+void hts_idx_destroy(hts_idx_t *idx)
+{
+    int tid;
+
+    if (idx == NULL) return;
+
+    if (idx->fmt == HTS_FMT_CRAI) {
+        hts_cram_idx_t *wrapper = (hts_cram_idx_t *) idx;
+        cram_index_free(wrapper->cram);
+        free(wrapper);
+        return;
+    }
+
+    // walk all allocated slots (m), not just the ones in use (n)
+    for (tid = 0; tid < idx->m; ++tid) {
+        bidx_t *bins = idx->bidx[tid];
+        khint_t it;
+        free(idx->lidx[tid].offset);
+        if (bins != NULL) {
+            for (it = kh_begin(bins); it != kh_end(bins); ++it) {
+                if (kh_exist(bins, it)) free(kh_value(bins, it).list);
+            }
+            kh_destroy(bin, bins);
+        }
+    }
+    free(idx->bidx);
+    free(idx->lidx);
+    free(idx->meta);
+    free(idx);
+}
+
+// The optimizer eliminates these ed_is_big() calls; still it would be good to
+// TODO Determine endianness at configure- or compile-time
+
+// Write a signed 32-bit value, byte-swapped first on big-endian hosts.
+// Returns whatever bgzf_write() returns.
+static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
+{
+    int32_t out = x;
+    if (ed_is_big()) {
+        out = ed_swap_4(out);
+    }
+    return bgzf_write(fp, &out, sizeof out);
+}
+
+// Write an unsigned 32-bit value, byte-swapped first on big-endian hosts.
+// Returns whatever bgzf_write() returns.
+static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
+{
+    uint32_t out = x;
+    if (ed_is_big()) {
+        out = ed_swap_4(out);
+    }
+    return bgzf_write(fp, &out, sizeof out);
+}
+
+// Write an unsigned 64-bit value, byte-swapped first on big-endian hosts.
+// Returns whatever bgzf_write() returns.
+static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
+{
+    uint64_t out = x;
+    if (ed_is_big()) {
+        out = ed_swap_8(out);
+    }
+    return bgzf_write(fp, &out, sizeof out);
+}
+
+// Byte-swap, in place, both offsets of every chunk held by bin p.
+static inline void swap_bins(bins_t *p)
+{
+    hts_pair64_t *chunk = p->list;
+    int remaining;
+    for (remaining = p->n; remaining > 0; --remaining, ++chunk) {
+        ed_swap_8p(&chunk->u);
+        ed_swap_8p(&chunk->v);
+    }
+}
+
+/* Write the format-independent body of an index to fp.
+ *
+ * Layout: reference count; the metadata block (TBI only, when present); then
+ * per reference the bin count followed by each bin (key, loff for CSI only,
+ * chunk count, chunk begin/end offsets) and, for every format except CSI,
+ * the linear index; finally the number of coordinate-less records.
+ *
+ * Returns 0 on success, -1 as soon as any write reports an error.
+ */
+static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
+{
+    const int is_csi = (fmt == HTS_FMT_CSI);
+    int32_t tid, j;
+
+    if (idx_write_int32(fp, idx->n) < 0) return -1;
+    if (fmt == HTS_FMT_TBI && idx->l_meta) {
+        if (bgzf_write(fp, idx->meta, idx->l_meta) < 0) return -1;
+    }
+
+    for (tid = 0; tid < idx->n; ++tid) {
+        bidx_t *bins = idx->bidx[tid];
+        lidx_t *lin = &idx->lidx[tid];
+        khint_t it;
+
+        // binning index
+        if (idx_write_int32(fp, bins? kh_size(bins) : 0) < 0) return -1;
+        if (bins != NULL) {
+            for (it = kh_begin(bins); it != kh_end(bins); ++it) {
+                bins_t *b;
+                if (!kh_exist(bins, it)) continue;
+                b = &kh_value(bins, it);
+                if (idx_write_uint32(fp, kh_key(bins, it)) < 0) return -1;
+                if (is_csi) {
+                    if (idx_write_uint64(fp, b->loff) < 0) return -1;
+                }
+                if (idx_write_int32(fp, b->n) < 0) return -1;
+                for (j = 0; j < b->n; ++j) {
+                    if (idx_write_uint64(fp, b->list[j].u) < 0) return -1;
+                    if (idx_write_uint64(fp, b->list[j].v) < 0) return -1;
+                }
+            }
+        }
+
+        // linear index
+        if (!is_csi) {
+            if (idx_write_int32(fp, lin->n) < 0) return -1;
+            for (j = 0; j < lin->n; ++j) {
+                if (idx_write_uint64(fp, lin->offset[j]) < 0) return -1;
+            }
+        }
+    }
+
+    if (idx_write_uint64(fp, idx->n_no_coor) < 0) return -1;
+    return 0;
+}
+
+/* Save idx next to data file fn, using the conventional suffix for fmt
+ * (".bai", ".csi" or ".tbi").  Any other fmt aborts the program.
+ *
+ * Returns the result of hts_idx_save_as(), or -1 if the file name cannot be
+ * allocated.  errno as left by hts_idx_save_as() is preserved.
+ */
+int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
+{
+    const char *suffix = NULL;
+    size_t fn_len = strlen(fn);
+    int result, saved_errno;
+    char *fnidx = (char*)calloc(1, fn_len + 5);
+    if (fnidx == NULL) return -1;
+
+    switch (fmt) {
+    case HTS_FMT_BAI: suffix = ".bai"; break;
+    case HTS_FMT_CSI: suffix = ".csi"; break;
+    case HTS_FMT_TBI: suffix = ".tbi"; break;
+    default: abort();
+    }
+    memcpy(fnidx, fn, fn_len);
+    memcpy(fnidx + fn_len, suffix, 5);   // four characters plus the NUL
+
+    result = hts_idx_save_as(idx, fn, fnidx, fmt);
+    saved_errno = errno;                 // free() may disturb errno
+    free(fnidx);
+    errno = saved_errno;
+    return result;
+}
+
+/* Save idx to the explicit file name fnidx in format fmt.
+ *
+ * With fnidx == NULL the name is derived from fn by hts_idx_save().
+ * The file starts with a 4-byte magic; CSI adds min_shift, n_lvls and the
+ * metadata block before the common body written by hts_idx_save_core().
+ * BAI files are opened in mode "wu", the others in mode "w".
+ * An unknown fmt aborts the program (after the file has been opened).
+ *
+ * Returns the result of bgzf_close() on success, -1 on any failure.
+ */
+int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
+{
+    BGZF *fp;
+    const char *magic = NULL;
+
+    if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
+
+    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
+    if (fp == NULL) return -1;
+
+    switch (fmt) {
+    case HTS_FMT_CSI: magic = "CSI\1"; break;
+    case HTS_FMT_TBI: magic = "TBI\1"; break;
+    case HTS_FMT_BAI: magic = "BAI\1"; break;
+    default: abort();
+    }
+    if (bgzf_write(fp, magic, 4) < 0) goto fail;
+
+    if (fmt == HTS_FMT_CSI) {
+        if (idx_write_int32(fp, idx->min_shift) < 0) goto fail;
+        if (idx_write_int32(fp, idx->n_lvls) < 0) goto fail;
+        if (idx_write_uint32(fp, idx->l_meta) < 0) goto fail;
+        if (idx->l_meta) {
+            if (bgzf_write(fp, idx->meta, idx->l_meta) < 0) goto fail;
+        }
+    }
+
+    if (hts_idx_save_core(idx, fp, fmt) < 0) goto fail;
+
+    return bgzf_close(fp);
+
+fail:
+    bgzf_close(fp);
+    return -1;
+}
+
+/* Read the format-independent body of an index from fp into idx, which must
+ * already have been sized by hts_idx_init() for idx->n references.
+ *
+ * For each reference: the bin count, then every bin (key, loff for CSI only,
+ * chunk count and chunks) and, for every format except CSI, the linear index.
+ * The trailing count of coordinate-less records is optional; it is taken as 0
+ * when absent.
+ *
+ * Returns 0 on success, -1 on a short read, -2 when out of memory, -3 on
+ * malformed data (duplicate bin or negative count) and -4 if idx is NULL.
+ * On failure idx is left in a state that hts_idx_destroy() can release.
+ */
+static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt)
+{
+    int32_t i, n, is_be;
+    is_be = ed_is_big();
+    if (idx == NULL) return -4;
+    for (i = 0; i < idx->n; ++i) {
+        bidx_t *h;
+        lidx_t *l = &idx->lidx[i];
+        uint32_t key;
+        int j, absent;
+        bins_t *p;
+        h = idx->bidx[i] = kh_init(bin);
+        if (h == NULL) return -2;
+        if (bgzf_read(fp, &n, 4) != 4) return -1;
+        if (is_be) ed_swap_4p(&n);
+        for (j = 0; j < n; ++j) {
+            khint_t k;
+            size_t n_bytes;
+            if (bgzf_read(fp, &key, 4) != 4) return -1;
+            if (is_be) ed_swap_4p(&key);
+            k = kh_put(bin, h, key, &absent);
+            if (absent < 0) return -2;  // hash table could not grow
+            if (absent == 0) return -3; // Duplicate bin number
+            p = &kh_val(h, k);
+            // kh_put() leaves the value uninitialised; give it a state that
+            // is safe to destroy before anything below can bail out
+            p->list = NULL; p->n = p->m = 0; p->loff = 0;
+            if (fmt == HTS_FMT_CSI) {
+                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
+                if (is_be) ed_swap_8p(&p->loff);
+            }
+            if (bgzf_read(fp, &p->n, 4) != 4) { p->n = 0; return -1; }
+            if (is_be) ed_swap_4p(&p->n);
+            if (p->n < 0) { p->n = 0; return -3; }
+            p->m = p->n;
+            if (p->n == 0) continue;    // empty bin: no buffer needed
+            n_bytes = (size_t)p->n * sizeof(hts_pair64_t);
+            p->list = (hts_pair64_t*)malloc(n_bytes);
+            if (p->list == NULL) return -2;
+            if (bgzf_read(fp, p->list, n_bytes) != (ssize_t)n_bytes) return -1;
+            if (is_be) swap_bins(p);
+        }
+        if (fmt != HTS_FMT_CSI) { // load linear index
+            if (bgzf_read(fp, &l->n, 4) != 4) { l->n = 0; return -1; }
+            if (is_be) ed_swap_4p(&l->n);
+            if (l->n < 0) { l->n = 0; return -3; }
+            l->m = l->n;
+            if (l->n > 0) {
+                size_t n_bytes = (size_t)l->n * sizeof(uint64_t);
+                l->offset = (uint64_t*)malloc(n_bytes);
+                if (l->offset == NULL) return -2;
+                if (bgzf_read(fp, l->offset, n_bytes) != (ssize_t)n_bytes) return -1;
+                if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
+            }
+            for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
+                if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
+            update_loff(idx, i, 1);
+        }
+    }
+    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
+    if (is_be) ed_swap_8p(&idx->n_no_coor);
+    return 0;
+}
+
+/* Load an index from the local file fn.
+ *
+ * The format is chosen from the 4-byte magic ("CSI\1", "TBI\1" or "BAI\1");
+ * anything else fails with errno set to EINVAL.  For TBI the 28-byte fixed
+ * header is kept in front of the variable-length part inside idx->meta.
+ *
+ * Returns the new index, or NULL on failure (nothing is leaked).
+ */
+static hts_idx_t *hts_idx_load_local(const char *fn)
+{
+    uint8_t magic[4];
+    int i, is_be;
+    hts_idx_t *idx = NULL;
+    uint8_t *meta = NULL;
+    BGZF *fp = bgzf_open(fn, "r");
+    if (fp == NULL) return NULL;
+    is_be = ed_is_big();
+    if (bgzf_read(fp, magic, 4) != 4) goto fail;
+
+    if (memcmp(magic, "CSI\1", 4) == 0) {
+        uint32_t x[3], n;
+        if (bgzf_read(fp, x, 12) != 12) goto fail;
+        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
+        if (x[2]) {
+            if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto fail;
+            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
+        }
+        if (bgzf_read(fp, &n, 4) != 4) goto fail;
+        if (is_be) ed_swap_4p(&n);
+        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
+        idx->l_meta = x[2];
+        idx->meta = meta;
+        meta = NULL; // ownership has moved to idx
+        if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
+    }
+    else if (memcmp(magic, "TBI\1", 4) == 0) {
+        uint32_t x[8];
+        if (bgzf_read(fp, x, 32) != 32) goto fail;
+        if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]);
+        // 28 + x[7] is computed in 32 bits; a crafted length would wrap it,
+        // producing a buffer smaller than the 28 bytes copied below
+        if (x[7] > 0xffffffffu - 28) { errno = EINVAL; goto fail; }
+        if ((idx = hts_idx_init(x[0], HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
+        idx->l_meta = 28 + x[7];
+        if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto fail;
+        memcpy(idx->meta, &x[1], 28);
+        if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto fail;
+        if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
+    }
+    else if (memcmp(magic, "BAI\1", 4) == 0) {
+        uint32_t n;
+        if (bgzf_read(fp, &n, 4) != 4) goto fail;
+        if (is_be) ed_swap_4p(&n);
+        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
+        if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
+    }
+    else { errno = EINVAL; goto fail; }
+
+    bgzf_close(fp);
+    return idx;
+
+fail:
+    bgzf_close(fp);
+    hts_idx_destroy(idx);
+    free(meta);
+    return NULL;
+}
+
+/* Replace the metadata block attached to idx.
+ *
+ * With is_copy non-zero the l_meta bytes at meta are duplicated; otherwise
+ * idx takes ownership of meta itself and will free() it.  The previous block
+ * is released only after the new one is ready, so passing a pointer obtained
+ * from hts_idx_get_meta() together with is_copy is safe.
+ *
+ * The function cannot report errors: if a copy cannot be made (negative
+ * length or out of memory) the index keeps its existing metadata and a
+ * message is printed when hts_verbose >= 1.
+ */
+void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
+{
+    uint8_t *new_meta = meta;
+    if (is_copy) {
+        // allocate at least one byte so that a zero length still succeeds
+        new_meta = (l_meta >= 0)? (uint8_t*)malloc(l_meta > 0? l_meta : 1) : NULL;
+        if (new_meta == NULL) {
+            if (hts_verbose >= 1) fprintf(stderr, "[E::%s] could not copy index metadata\n", __func__);
+            return;
+        }
+        if (l_meta > 0) memcpy(new_meta, meta, l_meta);
+    }
+    if (idx->meta != new_meta) free(idx->meta);
+    idx->l_meta = l_meta;
+    idx->meta = new_meta;
+}
+
+// Return the metadata block stored in idx and write its length to *l_meta.
+// The returned pointer is the index's own buffer, not a copy.
+uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
+{
+    uint8_t *block;
+    *l_meta = idx->l_meta;
+    block = idx->meta;
+    return block;
+}
+
+/* List the names of the references that have a binning index in idx.
+ *
+ * getid(hdr, i) is called for each such reference to translate its numeric
+ * id into a name.  The number of names is stored in *n.
+ *
+ * Returns a calloc()ed array holding the pointers returned by getid (only
+ * the array is allocated here), or NULL with *n == 0 when the index has no
+ * references or memory is exhausted.
+ */
+const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
+{
+    if ( !idx->n )
+    {
+        *n = 0;
+        return NULL;
+    }
+
+    int tid = 0, i;
+    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
+    if ( !names )
+    {
+        *n = 0;
+        return NULL;
+    }
+    for (i=0; i<idx->n; i++)
+    {
+        bidx_t *bidx = idx->bidx[i];
+        if ( !bidx ) continue;
+        names[tid++] = getid(hdr,i);
+    }
+    *n = tid;
+    return names;
+}
+
+/* Fetch the mapped/unmapped record counts stored for reference tid.
+ *
+ * The counts are the second entry of the reference's meta bin.
+ * Returns 0 and fills *mapped / *unmapped on success.  Returns -1 with both
+ * set to 0 when the counts are unavailable: CRAI index, tid out of range,
+ * reference without a binning index, or missing/short meta bin.
+ */
+int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
+{
+    bidx_t *h;
+    khint_t k;
+
+    *mapped = 0; *unmapped = 0;
+    // fmt must be tested first: a CRAI "index" has no other hts_idx_t fields
+    if ( idx->fmt == HTS_FMT_CRAI ) return -1;
+    if ( tid < 0 || tid >= idx->n ) return -1;
+
+    h = idx->bidx[tid];
+    if ( h == NULL ) return -1;
+    k = kh_get(bin, h, META_BIN(idx));
+    if (k == kh_end(h)) return -1;
+    if (kh_val(h, k).n < 2) return -1; // counts live in list[1]
+
+    *mapped = kh_val(h, k).list[1].u;
+    *unmapped = kh_val(h, k).list[1].v;
+    return 0;
+}
+
+// Return the number of records without coordinates counted in idx.
+uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
+{
+    uint64_t count = idx->n_no_coor;
+    return count;
+}
+
+/****************
+ *** Iterator ***
+ ****************/
+
+/* Append to itr->bins every bin, on every level, that overlaps [beg,end).
+ *
+ * min_shift and n_lvls describe the binning scheme; end is clipped to the
+ * largest coordinate the scheme can represent.
+ *
+ * Returns the total number of bins now held in itr->bins, 0 for an empty
+ * interval, or -1 if the bin list could not be enlarged (bins gathered for
+ * the levels already processed are kept).
+ */
+static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
+{
+    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
+    if (beg >= end) return 0;
+    if (end >= 1LL<<s) end = 1LL<<s;
+    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
+        int b, e, n, i;
+        b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
+        if (itr->bins.n + n > itr->bins.m) {
+            // grow through a temporary so the old array survives a failure
+            int new_m = itr->bins.n + n;
+            int *new_a;
+            kroundup32(new_m);
+            new_a = (int*)realloc(itr->bins.a, sizeof(int) * new_m);
+            if (new_a == NULL) return -1;
+            itr->bins.a = new_a;
+            itr->bins.m = new_m;
+        }
+        for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
+    }
+    return itr->bins.n;
+}
+
+/* Create an iterator over the records of reference tid overlapping [beg,end).
+ *
+ * Negative tid values select special modes: HTS_IDX_START (from the first
+ * indexed record), HTS_IDX_NOCOOR (records without coordinates),
+ * HTS_IDX_REST (continue from the current position) and HTS_IDX_NONE (an
+ * iterator that is already finished).  readrec is stored in the iterator
+ * for use by hts_itr_next().
+ *
+ * For a region query the candidate chunks of all overlapping bins are
+ * gathered, chunks ending at or before the region's minimum offset are
+ * dropped, and the rest are sorted and merged.
+ *
+ * Returns the iterator (possibly with no chunks), or NULL when the request
+ * cannot be satisfied: unknown mode, no starting offset, end < beg, tid not
+ * in the index, or out of memory.
+ */
+hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
+{
+    int i, n_off, l, bin;
+    hts_pair64_t *off;
+    khint_t k;
+    bidx_t *bidx;
+    uint64_t min_off;
+    hts_itr_t *iter = 0;
+    if (tid < 0) {
+        int finished0 = 0;
+        uint64_t off0 = (uint64_t)-1;
+        switch (tid) {
+        case HTS_IDX_START:
+            // Find the smallest offset, note that sequence ids may not be ordered sequentially
+            for (i=0; i<idx->n; i++)
+            {
+                bidx = idx->bidx[i];
+                if (bidx == NULL) continue; // reference without any records
+                k = kh_get(bin, bidx, META_BIN(idx));
+                if (k == kh_end(bidx)) continue;
+                if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
+            }
+            if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
+            break;
+
+        case HTS_IDX_NOCOOR:
+            if ( idx->n>0 )
+            {
+                bidx = idx->bidx[idx->n - 1];
+                if (bidx != NULL) {
+                    k = kh_get(bin, bidx, META_BIN(idx));
+                    if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v;
+                }
+            }
+            if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
+            break;
+
+        case HTS_IDX_REST:
+            off0 = 0;
+            break;
+
+        case HTS_IDX_NONE:
+            finished0 = 1;
+            off0 = 0;
+            break;
+
+        default:
+            return 0;
+        }
+        if (off0 != (uint64_t)-1) {
+            iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
+            if (iter == NULL) return 0;
+            iter->read_rest = 1;
+            iter->finished = finished0;
+            iter->curr_off = off0;
+            iter->readrec = readrec;
+            return iter;
+        } else return 0;
+    }
+
+    if (beg < 0) beg = 0;
+    if (end < beg) return 0;
+    if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
+
+    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
+    if (iter == NULL) return 0;
+    iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
+    iter->readrec = readrec;
+
+    // compute min_off
+    bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
+    do {
+        int first;
+        k = kh_get(bin, bidx, bin);
+        if (k != kh_end(bidx)) break;
+        first = (hts_bin_parent(bin)<<3) + 1;
+        if (bin > first) --bin;
+        else bin = hts_bin_parent(bin);
+    } while (bin);
+    if (bin == 0) k = kh_get(bin, bidx, bin);
+    min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
+    // retrieve bins
+    if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls) < 0) goto nomem;
+    for (i = n_off = 0; i < iter->bins.n; ++i)
+        if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
+            n_off += kh_value(bidx, k).n;
+    if (n_off == 0) return iter;
+    off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
+    if (off == NULL) goto nomem;
+    for (i = n_off = 0; i < iter->bins.n; ++i) {
+        if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
+            int j;
+            bins_t *p = &kh_value(bidx, k);
+            for (j = 0; j < p->n; ++j)
+                if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+        }
+    }
+    if (n_off == 0) {
+        free(off); return iter;
+    }
+    ks_introsort(_off, n_off, off);
+    // resolve completely contained adjacent blocks
+    for (i = 1, l = 0; i < n_off; ++i)
+        if (off[l].v < off[i].v) off[++l] = off[i];
+    n_off = l + 1;
+    // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+    for (i = 1; i < n_off; ++i)
+        if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+    // merge adjacent blocks
+    for (i = 1, l = 0; i < n_off; ++i) {
+        if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+        else off[++l] = off[i];
+    }
+    n_off = l + 1;
+    iter->n_off = n_off; iter->off = off;
+    return iter;
+
+nomem:
+    // release the partly built iterator
+    free(iter->bins.a);
+    free(iter);
+    return 0;
+}
+
+// Free an iterator together with its chunk list and bin list; NULL is ignored.
+void hts_itr_destroy(hts_itr_t *iter)
+{
+    if (iter == NULL) return;
+    free(iter->off);
+    free(iter->bins.a);
+    free(iter);
+}
+
+// Append decimal digit character c to the number i, i.e. return i*10 + digit.
+static inline long long push_digit(long long i, char c)
+{
+    // convert the character first so that only a small value is added to
+    // 10*i, avoiding overflow for >= MAX-48 or so
+    const int value = c - '0';
+    const long long shifted = 10 * i;
+    return shifted + value;
+}
+
+/* Parse a decimal number that may carry a fraction and an exponent
+ * (e.g. "1.5e3") and return it as an integer.
+ *
+ * Leading white space and an optional sign are accepted.  Commas between
+ * digits are skipped when flags contains HTS_PARSE_THOUSANDS_SEP.  Any
+ * fractional part left after applying the exponent is discarded, with a
+ * warning at hts_verbose >= 3 if it was non-zero.
+ *
+ * If strend is non-NULL it receives a pointer to the first unparsed
+ * character; otherwise trailing text triggers a warning at hts_verbose >= 2.
+ * Overflow is not detected.
+ */
+long long hts_parse_decimal(const char *str, char **strend, int flags)
+{
+    long long n = 0;
+    int decimals = 0, e = 0, lost = 0;
+    char sign = '+', esign = '+';
+    const char *s;
+
+    // <ctype.h> functions need unsigned-char values; a plain char may be
+    // negative, which is undefined behaviour (CERT STR37-C)
+    while (isspace((unsigned char) *str)) str++;
+    s = str;
+
+    if (*s == '+' || *s == '-') sign = *s++;
+    while (*s)
+        if (isdigit((unsigned char) *s)) n = push_digit(n, *s++);
+        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
+        else break;
+
+    if (*s == '.') {
+        s++;
+        while (isdigit((unsigned char) *s)) decimals++, n = push_digit(n, *s++);
+    }
+
+    if (*s == 'E' || *s == 'e') {
+        s++;
+        if (*s == '+' || *s == '-') esign = *s++;
+        while (isdigit((unsigned char) *s)) e = push_digit(e, *s++);
+        if (esign == '-') e = -e;
+    }
+
+    // n holds all digits as one integer; scale it by the net exponent
+    e -= decimals;
+    while (e > 0) n *= 10, e--;
+    while (e < 0) lost += n % 10, n /= 10, e++;
+
+    if (lost > 0 && hts_verbose >= 3)
+        fprintf(stderr, "[W::%s] discarding fractional part of %.*s\n",
+                __func__, (int)(s - str), str);
+
+    if (strend) *strend = (char *) s;
+    else if (*s && hts_verbose >= 2)
+        fprintf(stderr, "[W::%s] ignoring unknown characters after %.*s[%s]\n",
+                __func__, (int)(s - str), str, s);
+
+    return (sign == '+')? n : -n;
+}
+
+/* Split a region string of the form "name", "name:beg" or "name:beg-end".
+ *
+ * The last ':' separates the name from the coordinates.  *beg receives the
+ * 0-based start (the 1-based input minus one, clamped at 0) and *end the end
+ * position, INT_MAX when none is given.  Thousands separators are accepted.
+ *
+ * Returns a pointer just past the name (the ':' or, without one, the
+ * terminating NUL), or NULL if the coordinates are malformed or describe an
+ * empty interval.
+ */
+const char *hts_parse_reg(const char *s, int *beg, int *end)
+{
+    char *rest;
+    const char *colon = strrchr(s, ':');
+
+    if (!colon) {
+        // whole string is the name: cover the entire sequence
+        *beg = 0;
+        *end = INT_MAX;
+        return s + strlen(s);
+    }
+
+    *beg = hts_parse_decimal(colon+1, &rest, HTS_PARSE_THOUSANDS_SEP) - 1;
+    if (*beg < 0) *beg = 0;
+
+    switch (*rest) {
+    case '\0':
+        *end = INT_MAX;
+        break;
+    case '-':
+        *end = hts_parse_decimal(rest+1, NULL, HTS_PARSE_THOUSANDS_SEP);
+        break;
+    default:
+        return NULL;
+    }
+
+    return (*beg < *end)? colon : NULL;
+}
+
+/* Create an iterator from a textual region.
+ *
+ * "." selects HTS_IDX_START and "*" selects HTS_IDX_NOCOOR.  Otherwise reg
+ * is parsed with hts_parse_reg() and the name part is translated to a
+ * numeric id with getid(hdr, name).  If reg does not parse as a region the
+ * whole string is tried as a sequence name covering its full length.
+ *
+ * Returns the result of itr_query(), or NULL when the name is unknown.
+ */
+hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
+{
+    int tid, beg, end;
+    const char *name_end;
+
+    if (strcmp(reg, ".") == 0)
+        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
+    if (strcmp(reg, "*") == 0)
+        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
+
+    name_end = hts_parse_reg(reg, &beg, &end);
+    if (name_end == NULL) {
+        // not parsable as a region, but possibly a sequence named "foo:a"
+        tid = getid(hdr, reg);
+        beg = 0; end = INT_MAX;
+    }
+    else {
+        // NUL-terminated copy of the name part for the lookup
+        size_t name_len = name_end - reg;
+        char *name = (char*)alloca(name_len + 1);
+        memcpy(name, reg, name_len);
+        name[name_len] = 0;
+        tid = getid(hdr, name);
+    }
+
+    return (tid < 0)? NULL : itr_query(idx, tid, beg, end, readrec);
+}
+
+/* Fetch the next record selected by iter into r.
+ *
+ * In read_rest mode the file is positioned once at iter->curr_off (when it
+ * is non-zero) and records are then returned one after another.  Otherwise
+ * the iterator walks its chunk list, seeking only between non-adjacent
+ * chunks, and returns the records that overlap [iter->beg, iter->end) on
+ * iter->tid.  The position of the record returned is stored in
+ * iter->curr_tid / curr_beg / curr_end.
+ *
+ * Returns the readrec result (>= 0) for a record, or a negative value at
+ * the end of the region or file, or on error; once readrec fails or the
+ * region is exhausted the iterator is marked finished.
+ */
+int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
+{
+    // readrec may leave these untouched when it fails; give them defined
+    // values because they are copied into the iterator unconditionally below
+    int ret, tid = -1, beg = -1, end = -1;
+    if (iter == NULL || iter->finished) return -1;
+    if (iter->read_rest) {
+        if (iter->curr_off) { // seek to the start
+            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) return -1;
+            iter->curr_off = 0; // only seek once
+        }
+        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
+        if (ret < 0) iter->finished = 1;
+        iter->curr_tid = tid;
+        iter->curr_beg = beg;
+        iter->curr_end = end;
+        return ret;
+    }
+    if (iter->off == 0) return -1;
+    for (;;) {
+        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
+            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
+            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
+                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) return -1;
+                iter->curr_off = bgzf_tell(fp);
+            }
+            ++iter->i;
+        }
+        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
+            iter->curr_off = bgzf_tell(fp);
+            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
+                ret = -1; break;
+            } else if (end > iter->beg && iter->end > beg) {
+                iter->curr_tid = tid;
+                iter->curr_beg = beg;
+                iter->curr_end = end;
+                return ret;
+            }
+        } else break; // end of file or error
+    }
+    iter->finished = 1;
+    return ret;
+}
+
+/**********************
+ *** Retrieve index ***
+ **********************/
+
+/* Check that file fn can be read, downloading it first when it is remote.
+ *
+ * A remote file is looked up under its base name in the working directory;
+ * an existing local copy is reused as is, otherwise the file is downloaded
+ * there.  A download that fails part-way is deleted again so that a
+ * truncated file is never mistaken for a valid copy by a later call.
+ *
+ * Returns a pointer into fn naming the local file (the base name for remote
+ * files, fn itself for local ones), or NULL when the file is unavailable.
+ */
+static char *test_and_fetch(const char *fn)
+{
+    FILE *fp;
+    if (hisremote(fn)) {
+        const int buf_size = 1 * 1024 * 1024;
+        hFILE *fp_remote;
+        uint8_t *buf;
+        int l, failed = 0;
+        const char *p = strrchr(fn, '/');
+        p = p? p + 1 : fn; // p now points to the local file name
+        // Attempt to open local file first
+        if ((fp = fopen((char*)p, "rb")) != 0)
+        {
+            fclose(fp);
+            return (char*)p;
+        }
+        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
+        if ((fp_remote = hopen(fn, "r")) == 0) return 0;
+        if ((fp = fopen(p, "wb")) == 0) { // index files are binary
+            if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
+            hclose_abruptly(fp_remote);
+            return 0;
+        }
+        if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn);
+        buf = (uint8_t*)calloc(buf_size, 1);
+        if (buf == 0) failed = 1;
+        else {
+            while ((l = hread(fp_remote, buf, buf_size)) > 0)
+                if (fwrite(buf, 1, l, fp) != (size_t)l) { failed = 1; break; }
+            if (l < 0) failed = 1; // read error on the remote side
+            free(buf);
+        }
+        if (fclose(fp) != 0) failed = 1;
+        if (failed) {
+            if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to download file '%s'\n", __func__, fn);
+            hclose_abruptly(fp_remote);
+            remove(p); // do not leave a partial file behind
+            return 0;
+        }
+        if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn);
+        return (char*)p;
+    } else {
+        if ((fp = fopen(fn, "rb")) == 0) return 0;
+        fclose(fp);
+        return (char*)fn;
+    }
+}
+
+/* Locate the index file that belongs to data file fn.
+ *
+ * Two candidates are tried with test_and_fetch(): fn with ext appended
+ * ("x.bam.bai") and, failing that, fn with its last extension replaced by
+ * ext ("x.bai").  The second form is only tried when the final path
+ * component actually has an extension.
+ *
+ * Returns a malloc()ed copy of the usable file name, which the caller must
+ * free(), or NULL if no index was found or memory is exhausted.
+ */
+char *hts_idx_getfn(const char *fn, const char *ext)
+{
+    int i, l_fn, l_ext;
+    char *fnidx, *ret;
+    l_fn = strlen(fn); l_ext = strlen(ext);
+    fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
+    if (fnidx == 0) return 0;
+    strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
+    if ((ret = test_and_fetch(fnidx)) == 0) {
+        // search backwards for the extension, staying inside the last path component
+        for (i = l_fn - 1; i > 0; --i)
+            if (fnidx[i] == '.' || fnidx[i] == '/') break;
+        if (i >= 0 && fnidx[i] == '.') {
+            strcpy(fnidx + i, ext);
+            ret = test_and_fetch(fnidx);
+        }
+    }
+    if (ret == 0) {
+        free(fnidx);
+        return 0;
+    }
+    // ret may point into fnidx itself, hence memmove
+    l_fn = strlen(ret);
+    memmove(fnidx, ret, l_fn + 1);
+    return fnidx;
+}
+
+/* Find and load the index of data file fn.
+ *
+ * A ".csi" index is preferred; if none is found, ".bai" is tried when fmt is
+ * HTS_FMT_BAI and ".tbi" otherwise.  Returns the loaded index, or NULL if no
+ * index file could be located or loading failed.
+ */
+hts_idx_t *hts_idx_load(const char *fn, int fmt)
+{
+    const char *fallback_ext = (fmt == HTS_FMT_BAI)? ".bai" : ".tbi";
+    hts_idx_t *loaded;
+    char *index_path = hts_idx_getfn(fn, ".csi");
+
+    if (index_path == NULL) index_path = hts_idx_getfn(fn, fallback_ext);
+    if (index_path == NULL) return NULL;
+
+    loaded = hts_idx_load2(fn, index_path);
+    free(index_path);
+    return loaded;
+}
+
+/* Load the index stored in fnidx for data file fn.
+ *
+ * When both files can be stat()ed and the index is older than the data
+ * file, a warning is printed; the index is loaded regardless.
+ */
+hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
+{
+    struct stat st_data, st_index;
+    int both_found = (stat(fn, &st_data) == 0) && (stat(fnidx, &st_index) == 0);
+
+    // the data file may have been rewritten after the index was built
+    if (both_found && st_index.st_mtime < st_data.st_mtime)
+        fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx);
+
+    return hts_idx_load_local(fnidx);
+}
diff --git a/htslib/hts_internal.h b/htslib/hts_internal.h
new file mode 100644
index 0000000..07d6a11
--- /dev/null
+++ b/htslib/hts_internal.h
@@ -0,0 +1,90 @@
+/* hts_internal.h -- internal functions; not part of the public API.
+
+ Copyright (C) 2015-2016 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_HTS_INTERNAL_H
+#define HTSLIB_HTS_INTERNAL_H
+
+#include <stddef.h>
+#include <ctype.h>
+
+#include "htslib/hts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The <ctype.h> functions operate on ints such as are returned by fgetc(),
+// i.e., characters represented as unsigned-char-valued ints, or EOF.
+// To operate on plain chars (and to avoid warnings on some platforms),
+// technically one must cast to unsigned char everywhere (see CERT STR37-C)
+// or less painfully use these *_c() functions that operate on plain chars
+// (but not EOF, which must be considered separately where it is applicable).
+// TODO We may eventually wish to implement these functions directly without
+// using their <ctype.h> equivalents, and thus make them immune to locales.
+static inline int isalnum_c(char c) { const unsigned char uc = (unsigned char) c; return isalnum(uc); }
+static inline int isalpha_c(char c) { const unsigned char uc = (unsigned char) c; return isalpha(uc); }
+static inline int isdigit_c(char c) { const unsigned char uc = (unsigned char) c; return isdigit(uc); }
+static inline int isgraph_c(char c) { const unsigned char uc = (unsigned char) c; return isgraph(uc); }
+static inline int islower_c(char c) { const unsigned char uc = (unsigned char) c; return islower(uc); }
+static inline int isprint_c(char c) { const unsigned char uc = (unsigned char) c; return isprint(uc); }
+static inline int isspace_c(char c) { const unsigned char uc = (unsigned char) c; return isspace(uc); }
+static inline int isupper_c(char c) { const unsigned char uc = (unsigned char) c; return isupper(uc); }
+static inline char tolower_c(char c) { const unsigned char uc = (unsigned char) c; return tolower(uc); }
+static inline char toupper_c(char c) { const unsigned char uc = (unsigned char) c; return toupper(uc); }
+
+
+struct cram_fd;
+
+char *hts_idx_getfn(const char *fn, const char *ext);
+
+// The CRAM implementation stores the loaded index within the cram_fd rather
+// than separately as is done elsewhere in htslib. So if p is a pointer to
+// an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an
+// hts_cram_idx_t and should be cast accordingly.
+typedef struct hts_cram_idx_t { // what an hts_idx_t pointer really refers to when fmt == HTS_FMT_CRAI
+ int fmt; // format tag; read through an hts_idx_t pointer to recognise HTS_FMT_CRAI
+ struct cram_fd *cram; // CRAM file descriptor in which the loaded index is stored
+} hts_cram_idx_t;
+
+
+struct hts_path_itr { // iteration state shared by hts_path_itr_setup() and hts_path_itr_next()
+ kstring_t path, entry; // NOTE(review): presumably the search path and the entry currently being built - confirm in the implementation
+ void *dirv; // DIR * privately
+ const char *pathdir, *prefix, *suffix; // prefix/suffix are supplied via hts_path_itr_setup(); pathdir presumably tracks the current directory within path - TODO confirm
+ size_t prefix_len, suffix_len, entry_dir_l; // lengths matching prefix/suffix; entry_dir_l presumably the length of the directory part of entry - TODO confirm
+};
+
+void hts_path_itr_setup(struct hts_path_itr *itr, const char *path,
+ const char *builtin_path, const char *prefix, size_t prefix_len,
+ const char *suffix, size_t suffix_len);
+
+const char *hts_path_itr_next(struct hts_path_itr *itr);
+
+void *load_plugin(void **pluginp, const char *filename, const char *symbol);
+void *plugin_sym(void *plugin, const char *name, const char **errmsg);
+void close_plugin(void *plugin);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htsfile.c b/htslib/htsfile.c
new file mode 100644
index 0000000..3fa0678
--- /dev/null
+++ b/htslib/htsfile.c
@@ -0,0 +1,234 @@
+/* htsfile.c -- file identifier and minimal viewer.
+
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <unistd.h>
+
+#include "htslib/hfile.h"
+#include "htslib/hts.h"
+#include "htslib/sam.h"
+#include "htslib/vcf.h"
+
+enum { identify, view_headers, view_all } mode = identify; // what to do with each FILE; changed by the -c and -h options
+int show_headers = 1; // whether views write the header; cleared by -H
+int status = EXIT_SUCCESS; /* Exit status from main */
+
+/* Open a new htsFile on a duplicate of standard output.
+ *
+ * Working on a duplicate lets the returned handle be closed with hts_close()
+ * without closing the process's own stdout.  mode is the open mode passed
+ * to hdopen() and hts_hopen().
+ *
+ * Returns the handle, or NULL on failure; nothing is leaked on failure.
+ */
+static htsFile *dup_stdout(const char *mode)
+{
+    htsFile *out;
+    hFILE *hfp;
+    int fd = dup(STDOUT_FILENO);
+    if (fd < 0) {
+        perror("htsfile: Couldn't duplicate stdout");
+        return NULL;
+    }
+    hfp = hdopen(fd, mode);
+    if (hfp == NULL) {
+        close(fd); // the descriptor is still ours
+        return NULL;
+    }
+    out = hts_hopen(hfp, "-", mode);
+    if (out == NULL) hclose_abruptly(hfp); // a failed hts_hopen() leaves hfp with the caller
+    return out;
+}
+
+/* Write the sequence file behind hfp to standard output in textual form.
+ *
+ * The header is written when show_headers is set and the records when mode
+ * is view_all.  Any failure is recorded in the global status.
+ *
+ * Returns 1 once hfp has been taken over (it is then closed here), or 0 if
+ * the file could not be opened, in which case hfp still belongs to the
+ * caller.
+ */
+static int view_sam(hFILE *hfp, const char *filename)
+{
+    samFile *in = hts_hopen(hfp, filename, "r");
+    bam_hdr_t *hdr = NULL;
+    samFile *out = NULL;
+    if (in == NULL) {
+        status = EXIT_FAILURE;
+        return 0;
+    }
+    hdr = sam_hdr_read(in);
+    if (hdr == NULL) {
+        status = EXIT_FAILURE;
+        goto clean;
+    }
+    out = dup_stdout("w");
+    if (out == NULL) {
+        status = EXIT_FAILURE;
+        goto clean;
+    }
+
+    if (show_headers) {
+        if (sam_hdr_write(out, hdr) != 0) {
+            status = EXIT_FAILURE;
+            goto clean;
+        }
+    }
+    if (mode == view_all) {
+        bam1_t *b = bam_init1();
+        int ret;
+        if (b == NULL) {
+            status = EXIT_FAILURE;
+            goto clean;
+        }
+        while ((ret = sam_read1(in, hdr, b)) >= 0) {
+            if (sam_write1(out, hdr, b) < 0) {
+                status = EXIT_FAILURE;
+                break; // leave the loop so that b is still released below
+            }
+        }
+        bam_destroy1(b);
+        if (ret != -1) // anything but eof
+            status = EXIT_FAILURE;
+    }
+
+ clean:
+    if (hdr != NULL) bam_hdr_destroy(hdr);
+    if (out != NULL && hts_close(out) != 0)
+        status = EXIT_FAILURE;
+    if (hts_close(in) != 0)
+        status = EXIT_FAILURE;
+    return 1;
+}
+
+/* Write the variant file behind hfp to standard output in textual form.
+ *
+ * The header is written when show_headers is set and the records when mode
+ * is view_all.  Any failure, including a failure to close either file, is
+ * recorded in the global status.
+ *
+ * Returns 1 once hfp has been taken over (it is then closed here), or 0 if
+ * the file could not be opened, in which case hfp still belongs to the
+ * caller.
+ */
+static int view_vcf(hFILE *hfp, const char *filename)
+{
+    vcfFile *in = hts_hopen(hfp, filename, "r");
+    bcf_hdr_t *hdr = NULL;
+    vcfFile *out = NULL;
+    if (in == NULL) {
+        status = EXIT_FAILURE;
+        return 0;
+    }
+    hdr = bcf_hdr_read(in);
+    if (hdr == NULL) {
+        status = EXIT_FAILURE;
+        goto clean;
+    }
+    out = dup_stdout("w");
+    if (out == NULL) {
+        status = EXIT_FAILURE;
+        goto clean;
+    }
+
+    if (show_headers) {
+        if (bcf_hdr_write(out, hdr) != 0) {
+            status = EXIT_FAILURE;
+            goto clean;
+        }
+    }
+    if (mode == view_all) {
+        bcf1_t *rec = bcf_init();
+        if (rec == NULL) {
+            status = EXIT_FAILURE;
+            goto clean;
+        }
+        while (bcf_read(in, hdr, rec) >= 0) {
+            if (bcf_write(out, hdr, rec) < 0) {
+                status = EXIT_FAILURE;
+                break; // leave the loop so that rec is still released below
+            }
+        }
+        bcf_destroy(rec);
+    }
+
+ clean:
+    if (hdr != NULL) bcf_hdr_destroy(hdr);
+    // report close failures the same way view_sam() does
+    if (out != NULL && hts_close(out) != 0)
+        status = EXIT_FAILURE;
+    if (hts_close(in) != 0)
+        status = EXIT_FAILURE;
+    return 1;
+}
+
+// Print the command-line synopsis to fp and terminate with the given status.
+static void usage(FILE *fp, int status)
+{
+    static const char help_text[] =
+"Usage: htsfile [-chHv] FILE...\n"
+"Options:\n"
+"  -c, --view         Write textual form of FILEs to standard output\n"
+"  -h, --header-only  Display only headers in view mode, not records\n"
+"  -H, --no-header    Suppress header display in view mode\n"
+"  -v, --verbose      Increase verbosity of warnings and diagnostics\n";
+
+    fputs(help_text, fp);
+    exit(status);
+}
+
+/* htsfile entry point.
+ *
+ * Parses the options, then handles each FILE argument in turn: by default
+ * its detected format is printed; in view mode (-c or -h) its contents are
+ * written to standard output as text.  Processing continues after a failing
+ * file; the exit status is EXIT_FAILURE if anything went wrong.
+ */
+int main(int argc, char **argv)
+{
+    static const struct option options[] = {
+        { "header-only", no_argument, NULL, 'h' },
+        { "no-header", no_argument, NULL, 'H' },
+        { "view", no_argument, NULL, 'c' },
+        { "verbose", no_argument, NULL, 'v' },
+        { "help", no_argument, NULL, '?' },
+        { "version", no_argument, NULL, 1 },
+        { NULL, 0, NULL, 0 }
+    };
+
+    int c, i;
+
+    status = EXIT_SUCCESS;
+    while ((c = getopt_long(argc, argv, "chHv?", options, NULL)) >= 0)
+        switch (c) {
+        case 'c': mode = view_all; break;
+        case 'h': mode = view_headers; show_headers = 1; break;
+        case 'H': show_headers = 0; break;
+        case 'v': hts_verbose++; break;
+        case 1:
+            printf(
+"htsfile (htslib) %s\n"
+"Copyright (C) 2016 Genome Research Ltd.\n",
+                   hts_version());
+            exit(EXIT_SUCCESS);
+            break;
+        case '?': usage(stdout, EXIT_SUCCESS); break;
+        default: usage(stderr, EXIT_FAILURE); break;
+        }
+
+    if (optind == argc) usage(stderr, EXIT_FAILURE);
+
+    for (i = optind; i < argc; i++) {
+        htsFormat fmt;
+        hFILE *fp = hopen(argv[i], "r");
+        if (fp == NULL) {
+            fprintf(stderr, "htsfile: can't open \"%s\": %s\n", argv[i], strerror(errno));
+            status = EXIT_FAILURE;
+            continue;
+        }
+
+        if (hts_detect_format(fp, &fmt) < 0) {
+            fprintf(stderr, "htsfile: detecting \"%s\" format failed: %s\n", argv[i], strerror(errno));
+            hclose_abruptly(fp);
+            status = EXIT_FAILURE;
+            continue;
+        }
+
+        if (mode == identify) {
+            char *description = hts_format_description(&fmt);
+            if (description == NULL) {
+                // passing NULL to %s is undefined behaviour; report it instead
+                fprintf(stderr, "htsfile: can't describe \"%s\" format\n", argv[i]);
+                status = EXIT_FAILURE;
+            }
+            else printf("%s:\t%s\n", argv[i], description);
+            free(description);
+        }
+        else
+            switch (fmt.category) {
+            case sequence_data:
+                // a non-zero result means the viewer has taken over and closed fp
+                if (view_sam(fp, argv[i])) fp = NULL;
+                break;
+            case variant_data:
+                if (view_vcf(fp, argv[i])) fp = NULL;
+                break;
+            default:
+                fprintf(stderr, "htsfile: can't view %s: unknown format\n", argv[i]);
+                status = EXIT_FAILURE;
+                break;
+            }
+
+        if (fp && hclose(fp) < 0) {
+            fprintf(stderr, "htsfile: closing %s failed\n", argv[i]);
+            status = EXIT_FAILURE;
+        }
+    }
+
+    return status;
+}
diff --git a/htslib/htslib.pc.in b/htslib/htslib.pc.in
new file mode 100644
index 0000000..465de17
--- /dev/null
+++ b/htslib/htslib.pc.in
@@ -0,0 +1,10 @@
+includedir=@includedir@
+libdir=@libdir@
+
+Name: htslib
+Description: C library for high-throughput sequencing data formats
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lhts
+Libs.private: -L${libdir} -lhts -lm -lpthread
+Requires.private: zlib
diff --git a/htslib/htslib/._bgzf.h b/htslib/htslib/._bgzf.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._bgzf.h differ
diff --git a/htslib/htslib/._cram.h b/htslib/htslib/._cram.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._cram.h differ
diff --git a/htslib/htslib/._faidx.h b/htslib/htslib/._faidx.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._faidx.h differ
diff --git a/htslib/htslib/._hfile.h b/htslib/htslib/._hfile.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._hfile.h differ
diff --git a/htslib/htslib/._hts.h b/htslib/htslib/._hts.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._hts.h differ
diff --git a/htslib/htslib/._hts_defs.h b/htslib/htslib/._hts_defs.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._hts_defs.h differ
diff --git a/htslib/htslib/._kbitset.h b/htslib/htslib/._kbitset.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._kbitset.h differ
diff --git a/htslib/htslib/._kfunc.h b/htslib/htslib/._kfunc.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._kfunc.h differ
diff --git a/htslib/htslib/._khash.h b/htslib/htslib/._khash.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._khash.h differ
diff --git a/htslib/htslib/._khash_str2int.h b/htslib/htslib/._khash_str2int.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._khash_str2int.h differ
diff --git a/htslib/htslib/._klist.h b/htslib/htslib/._klist.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._klist.h differ
diff --git a/htslib/htslib/._knetfile.h b/htslib/htslib/._knetfile.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._knetfile.h differ
diff --git a/htslib/htslib/._kseq.h b/htslib/htslib/._kseq.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._kseq.h differ
diff --git a/htslib/htslib/._ksort.h b/htslib/htslib/._ksort.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._ksort.h differ
diff --git a/htslib/htslib/._kstring.h b/htslib/htslib/._kstring.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._kstring.h differ
diff --git a/htslib/htslib/._regidx.h b/htslib/htslib/._regidx.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._regidx.h differ
diff --git a/htslib/htslib/._sam.h b/htslib/htslib/._sam.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._sam.h differ
diff --git a/htslib/htslib/._synced_bcf_reader.h b/htslib/htslib/._synced_bcf_reader.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._synced_bcf_reader.h differ
diff --git a/htslib/htslib/._tbx.h b/htslib/htslib/._tbx.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._tbx.h differ
diff --git a/htslib/htslib/._vcf.h b/htslib/htslib/._vcf.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._vcf.h differ
diff --git a/htslib/htslib/._vcf_sweep.h b/htslib/htslib/._vcf_sweep.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._vcf_sweep.h differ
diff --git a/htslib/htslib/._vcfutils.h b/htslib/htslib/._vcfutils.h
new file mode 100644
index 0000000..8252204
Binary files /dev/null and b/htslib/htslib/._vcfutils.h differ
diff --git a/htslib/htslib/bgzf.h b/htslib/htslib/bgzf.h
new file mode 100644
index 0000000..63e1c91
--- /dev/null
+++ b/htslib/htslib/bgzf.h
@@ -0,0 +1,335 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+ Copyright (C) 2009, 2013, 2014 Genome Research Ltd
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/* The BGZF library was originally written by Bob Handsaker from the Broad
+ * Institute. It was later improved by the SAMtools developers. */
+
+#ifndef HTSLIB_BGZF_H
+#define HTSLIB_BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <sys/types.h>
+
+#include "hts_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
+#define BGZF_MAX_BLOCK_SIZE 0x10000
+
+#define BGZF_ERR_ZLIB 1
+#define BGZF_ERR_HEADER 2
+#define BGZF_ERR_IO 4
+#define BGZF_ERR_MISUSE 8
+
+struct hFILE;
+struct bgzf_mtaux_t;
+typedef struct __bgzidx_t bgzidx_t;
+
+struct BGZF { // state behind a "BGZF file handler" as used by the bgzf_*() routines below
+ unsigned errcode:16, is_write:2, is_be:2; // errcode: presumably a bitmask of the BGZF_ERR_* values above -- TODO confirm
+ signed compress_level:9;
+ unsigned is_compressed:2, is_gzip:1;
+ int cache_size;
+ int block_length, block_offset; // block_offset supplies the low 16 bits of the bgzf_tell() value
+ int64_t block_address, uncompressed_address; // block_address is shifted left 16 bits in the bgzf_tell() value
+ void *uncompressed_block, *compressed_block;
+ void *cache; // a pointer to a hash table
+ struct hFILE *fp; // actual file handle
+ struct bgzf_mtaux_t *mt; // only used for multi-threading
+ bgzidx_t *idx; // BGZF index
+ int idx_build_otf; // build index on the fly, set by bgzf_index_build_init()
+ z_stream *gz_stream;// for gzip-compressed files
+};
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+ /******************
+ * Basic routines *
+ ******************/
+
+ /**
+ * Open an existing file descriptor for reading or writing.
+ *
+ * @param fd file descriptor
+ * @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+ * writing, 'a' for appending, 'g' for gzip rather than BGZF
+ * compression (with 'w' only), and digit specifies the zlib
+ * compression level.
+ * Note that there is a distinction between 'u' and '0': the
+ * first yields plain uncompressed output whereas the latter
+ * outputs uncompressed data wrapped in the zlib format.
+ * @return BGZF file handler; 0 on error
+ */
+ BGZF* bgzf_dopen(int fd, const char *mode);
+
+ #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
+
+ /**
+ * Open the specified file for reading or writing.
+ */
+ BGZF* bgzf_open(const char* path, const char *mode);
+
+ /**
+ * Open an existing hFILE stream for reading or writing.
+ */
+ BGZF* bgzf_hopen(struct hFILE *fp, const char *mode);
+
+ /**
+ * Close the BGZF and free all associated resources.
+ *
+ * @param fp BGZF file handler
+ * @return 0 on success and -1 on error
+ */
+ int bgzf_close(BGZF *fp);
+
+ /**
+ * Read up to _length_ bytes from the file storing into _data_.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to read into
+ * @param length size of data to read
+ * @return number of bytes actually read; 0 on end-of-file and -1 on error
+ */
+ ssize_t bgzf_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED;
+
+ /**
+ * Write _length_ bytes from _data_ to the file. If no I/O errors occur,
+ * the complete _length_ bytes will be written (or queued for writing).
+ *
+ * @param fp BGZF file handler
+ * @param data data array to write
+ * @param length size of data to write
+ * @return number of bytes written (i.e., _length_); negative on error
+ */
+ ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED;
+
+ /**
+ * Read up to _length_ bytes directly from the underlying stream without
+ * decompressing. Bypasses BGZF blocking, so must be used with care in
+ * specialised circumstances only.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to read into
+ * @param length number of raw bytes to read
+ * @return number of bytes actually read; 0 on end-of-file and -1 on error
+ */
+ ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED;
+
+ /**
+ * Write _length_ bytes directly to the underlying stream without
+ * compressing. Bypasses BGZF blocking, so must be used with care
+ * in specialised circumstances only.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to write
+ * @param length number of raw bytes to write
+ * @return number of bytes actually written; -1 on error
+ */
+ ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED;
+
+ /**
+ * Write the data in the buffer to the file.
+ *
+ * @param fp BGZF file handle
+ * @return 0 on success and -1 on error
+ */
+ int bgzf_flush(BGZF *fp) HTS_RESULT_USED;
+
+ /**
+ * Return a virtual file pointer to the current location in the file.
+ * No interpretation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ */
+ #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
+
+ /**
+ * Set the file to read from the location specified by _pos_.
+ *
+ * @param fp BGZF file handler
+ * @param pos virtual file offset returned by bgzf_tell()
+ * @param whence must be SEEK_SET
+ * @return 0 on success and -1 on error
+ */
+ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) HTS_RESULT_USED;
+
+ /**
+ * Check if the BGZF end-of-file (EOF) marker is present
+ *
+ * @param fp BGZF file handler opened for reading
+ * @return 1 if the EOF marker is present and correct;
+ * 2 if it can't be checked, e.g., because fp isn't seekable;
+ * 0 if the EOF marker is absent;
+ * -1 (with errno set) on error
+ */
+ int bgzf_check_EOF(BGZF *fp);
+
+ /**
+ * Check if a file is in the BGZF format
+ *
+ * @param fn file name
+ * @return 1 if _fn_ is BGZF; 0 if not or on I/O error
+ */
+ int bgzf_is_bgzf(const char *fn);
+
+ /*********************
+ * Advanced routines *
+ *********************/
+
+ /**
+ * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+ *
+ * @param fp BGZF file handler
+ * @param size size of cache in bytes; 0 to disable caching (default)
+ */
+ void bgzf_set_cache_size(BGZF *fp, int size);
+
+ /**
+ * Flush the file if the remaining buffer size is smaller than _size_
+ * @return 0 if flushing succeeded or was not needed; negative on error
+ */
+ int bgzf_flush_try(BGZF *fp, ssize_t size) HTS_RESULT_USED;
+
+ /**
+ * Read one byte from a BGZF file. It is faster than bgzf_read()
+ * @param fp BGZF file handler
+ * @return byte read; -1 on end-of-file or error
+ */
+ int bgzf_getc(BGZF *fp);
+
+ /**
+ * Read one line from a BGZF file. It is faster than bgzf_getc()
+ *
+ * @param fp BGZF file handler
+ * @param delim delimiter
+ * @param str string to write to; must be initialized
+ * @return length of the string; 0 on end-of-file; negative on error
+ */
+ int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
+
+ /**
+ * Read the next BGZF block.
+ */
+ int bgzf_read_block(BGZF *fp) HTS_RESULT_USED;
+
+ /**
+ * Enable multi-threading (only effective on writing and when the
+ * library was compiled with -DBGZF_MT)
+ *
+ * @param fp BGZF file handler; must be opened for writing
+ * @param n_threads #threads used for writing
+ * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
+ */
+ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
+
+ /**
+ * Compress a single BGZF block.
+ *
+ * @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
+ * @param dlen size of output buffer; updated on return to the number
+ * of bytes actually written to dst
+ * @param src buffer to be compressed
+ * @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
+ * @param level compression level
+ * @return 0 on success and negative on error
+ */
+ int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level);
+
+ /*******************
+ * bgzidx routines *
+ *******************/
+
+ /**
+ * Position BGZF at the uncompressed offset
+ *
+ * @param fp BGZF file handler; must be opened for reading
+ * @param uoffset file offset in the uncompressed data
+ * @param where only SEEK_SET is supported at the moment
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_useek(BGZF *fp, long uoffset, int where) HTS_RESULT_USED;
+
+ /**
+ * Position in uncompressed BGZF
+ *
+ * @param fp BGZF file handler; must be opened for reading
+ *
+ * Returns the current offset on success and -1 on error.
+ */
+ long bgzf_utell(BGZF *fp);
+
+ /**
+ * Tell BGZF to build index while compressing.
+ *
+ * @param fp BGZF file handler; can be opened for reading or writing.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_index_build_init(BGZF *fp);
+
+ /**
+ * Load BGZF index
+ *
+ * @param fp BGZF file handler
+ * @param bname base name
+ * @param suffix suffix to add to bname (can be NULL)
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix);
+
+ /**
+ * Save BGZF index
+ *
+ * @param fp BGZF file handler
+ * @param bname base name
+ * @param suffix suffix to add to bname (can be NULL)
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_index_dump(BGZF *fp,
+ const char *bname, const char *suffix) HTS_RESULT_USED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/cram.h b/htslib/htslib/cram.h
new file mode 100644
index 0000000..bd54f39
--- /dev/null
+++ b/htslib/htslib/cram.h
@@ -0,0 +1,492 @@
+/* cram.h -- public CRAM-specific API functions.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: James Bonfield <jkb at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+/*! \file
+ * CRAM interface.
+ *
+ * Consider using the higher level hts_*() API for programs that wish to
+ * be file format agnostic (see htslib/hts.h).
+ *
+ * This API should be used for CRAM specific code. The specifics of the
+ * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
+ * although these should not be included directly (use this file instead).
+ */
+
+#ifndef HTSLIB_CRAM_H
+#define HTSLIB_CRAM_H
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "hts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _CRAM_STRUCTS_H_
+enum cram_block_method {
+ ERROR = -1,
+ RAW = 0,
+ GZIP = 1,
+ BZIP2 = 2,
+ LZMA = 3,
+ RANS = 4, // Generic; either order
+ RANS0 = 4, // same numeric value as RANS
+ RANS1 = 10, // Not externalised; stored as RANS (generic)
+ GZIP_RLE = 11, // NB: not externalised in CRAM
+};
+
+enum cram_content_type {
+ CT_ERROR = -1,
+ FILE_HEADER = 0,
+ COMPRESSION_HEADER = 1,
+ MAPPED_SLICE = 2,
+ UNMAPPED_SLICE = 3, // CRAM V1.0 only
+ EXTERNAL = 4,
+ CORE = 5,
+};
+
+// Opaque data types, see cram_structs for the fully fledged versions.
+typedef struct SAM_hdr SAM_hdr;
+typedef struct cram_file_def cram_file_def;
+typedef struct cram_fd cram_fd;
+typedef struct cram_container cram_container;
+typedef struct cram_block cram_block;
+typedef struct cram_slice cram_slice;
+typedef struct cram_metrics cram_metrics;
+typedef struct cram_block_slice_hdr cram_block_slice_hdr;
+typedef struct cram_block_compression_hdr cram_block_compression_hdr;
+typedef struct refs_t refs_t; // need this?
+
+struct hFILE;
+#endif
+
+// Accessor functions
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_fd
+ */
+SAM_hdr *cram_fd_get_header(cram_fd *fd);
+void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr);
+
+int cram_fd_get_version(cram_fd *fd);
+void cram_fd_set_version(cram_fd *fd, int vers);
+
+int cram_major_vers(cram_fd *fd);
+int cram_minor_vers(cram_fd *fd);
+
+struct hFILE *cram_fd_get_fp(cram_fd *fd);
+void cram_fd_set_fp(cram_fd *fd, struct hFILE *fp);
+
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_container
+ */
+int32_t cram_container_get_length(cram_container *c);
+void cram_container_set_length(cram_container *c, int32_t length);
+int32_t cram_container_get_num_blocks(cram_container *c);
+void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks);
+int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks);
+void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
+ int32_t *landmarks);
+
+/* Returns true if the container is empty (EOF marker) */
+int cram_container_is_empty(cram_fd *fd);
+
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_block
+ */
+int32_t cram_block_get_content_id(cram_block *b);
+int32_t cram_block_get_comp_size(cram_block *b);
+int32_t cram_block_get_uncomp_size(cram_block *b);
+int32_t cram_block_get_crc32(cram_block *b);
+void * cram_block_get_data(cram_block *b);
+
+enum cram_content_type cram_block_get_content_type(cram_block *b);
+
+void cram_block_set_content_id(cram_block *b, int32_t id);
+void cram_block_set_comp_size(cram_block *b, int32_t size);
+void cram_block_set_uncomp_size(cram_block *b, int32_t size);
+void cram_block_set_crc32(cram_block *b, int32_t crc);
+void cram_block_set_data(cram_block *b, void *data);
+
+int cram_block_append(cram_block *b, void *data, int size);
+void cram_block_update_size(cram_block *b);
+
+// Offset is known as "size" internally, but it can be confusing.
+size_t cram_block_get_offset(cram_block *b);
+void cram_block_set_offset(cram_block *b, size_t offset);
+
+/*
+ * Computes the size of a cram block, including the block
+ * header itself.
+ */
+uint32_t cram_block_size(cram_block *b);
+
+/*
+ * Renumbers RG numbers in a cram compression header.
+ *
+ * CRAM stores RG as the Nth number in the header, rather than a
+ * string holding the ID: tag. This is smaller in space, but means
+ * "samtools cat" to join files together that contain single but
+ * different RG lines needs a way of renumbering them.
+ *
+ * The file descriptor is expected to be immediately after the
+ * cram_container structure (ie before the cram compression header).
+ * Due to the nature of the CRAM format, this needs to read and write
+ * the blocks itself. Note that there may be multiple slices within
+ * the container, meaning multiple compression headers to manipulate.
+ * Changing RG may change the size of the compression header and
+ * therefore the length field in the container. Hence we rewrite all
+ * blocks just in case and also emit the adjusted container.
+ *
+ * The current implementation can only cope with renumbering a single
+ * RG (and only then if it is using HUFFMAN or BETA codecs). In
+ * theory it *may* be possible to renumber multiple RGs if they use
+ * HUFFMAN to the CORE block or use an external block unshared by any
+ * other data series. So we have an API that can be upgraded to
+ * support this, but do not implement it for now. An example
+ * implementation of RG as an EXTERNAL block would be to find that
+ * block and rewrite it, returning the number of blocks consumed.
+ *
+ * Returns 0 on success;
+ * -1 if unable to edit;
+ * -2 on other errors (eg I/O).
+ */
+int cram_transcode_rg(cram_fd *in, cram_fd *out,
+ cram_container *c,
+ int nrg, int *in_rg, int *out_rg);
+
+/*
+ * Copies the blocks representing the next num_slice slices from a
+ * container from 'in' to 'out'. It is expected that the file pointer
+ * is just after the read of the cram_container and cram compression
+ * header.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice);
+
+/*
+ *-----------------------------------------------------------------------------
+ * SAM_hdr
+ */
+
+/*! Tokenises a SAM header into a hash table.
+ *
+ * Also extracts a few bits on specific data types, such as @RG lines.
+ *
+ * @return
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free());
+ * NULL on failure
+ */
+SAM_hdr *sam_hdr_parse_(const char *hdr, int len);
+
+
+/*
+ *-----------------------------------------------------------------------------
+ * cram_io basics
+ */
+
+/**@{ ----------------------------------------------------------------------
+ * CRAM blocks - the dynamically growable data block. We have code to
+ * create, update, (un)compress and read/write.
+ *
+ * These are derived from the deflate_interlaced.c blocks, but with the
+ * CRAM extension of content types and IDs.
+ */
+
+/*! Allocates a new cram_block structure with a specified content_type and
+ * id.
+ *
+ * @return
+ * Returns block pointer on success;
+ * NULL on failure
+ */
+cram_block *cram_new_block(enum cram_content_type content_type,
+ int content_id);
+
+/*! Reads a block from a cram file.
+ *
+ * @return
+ * Returns cram_block pointer on success;
+ * NULL on failure
+ */
+cram_block *cram_read_block(cram_fd *fd);
+
+/*! Writes a CRAM block.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_write_block(cram_fd *fd, cram_block *b);
+
+/*! Frees a CRAM block, deallocating internal data too.
+ */
+void cram_free_block(cram_block *b);
+
+/*! Uncompresses a CRAM block, if compressed.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_uncompress_block(cram_block *b);
+
+/*! Compresses a block.
+ *
+ * Compresses a block using one of two different zlib strategies. If we only
+ * want one choice set strat2 to be -1.
+ *
+ * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
+ * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
+ * significantly faster.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
+ int method, int level);
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Containers
+ */
+
+/*! Creates a new container, specifying the maximum number of slices
+ * and records permitted.
+ *
+ * @return
+ * Returns cram_container ptr on success;
+ * NULL on failure
+ */
+cram_container *cram_new_container(int nrec, int nslice);
+void cram_free_container(cram_container *c);
+
+/*! Reads a container header.
+ *
+ * @return
+ * Returns cram_container on success;
+ * NULL on failure or no container left (fd->err == 0).
+ */
+cram_container *cram_read_container(cram_fd *fd);
+
+/*! Writes a container structure.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_write_container(cram_fd *fd, cram_container *h);
+
+/*
+ * Stores the container structure in dat and returns *size as the
+ * number of bytes written to dat[]. The input size of dat is also
+ * held in *size and should be initialised to cram_container_size(c).
+ *
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size);
+
+int cram_container_size(cram_container *c);
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * The top-level cram opening, closing and option handling
+ */
+
+/*! Opens a CRAM file for read (mode "rb") or write ("wb").
+ *
+ * The filename may be "-" to indicate stdin or stdout.
+ *
+ * @return
+ * Returns file handle on success;
+ * NULL on failure.
+ */
+cram_fd *cram_open(const char *filename, const char *mode);
+
+/*! Opens an existing stream for reading or writing.
+ *
+ * @return
+ * Returns file handle on success;
+ * NULL on failure.
+ */
+cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
+
+/*! Closes a CRAM file.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_close(cram_fd *fd);
+
+/*
+ * Seek within a CRAM file.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int cram_seek(cram_fd *fd, off_t offset, int whence);
+
+/*
+ * Flushes a CRAM file.
+ * Useful for when writing to stdout without wishing to close the stream.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int cram_flush(cram_fd *fd);
+
+/*! Checks for end of file on a cram_fd stream.
+ *
+ * @return
+ * Returns 0 if not at end of file
+ * 1 if we hit an expected EOF (end of range or EOF block)
+ * 2 for other EOF (end of stream without EOF block)
+ */
+int cram_eof(cram_fd *fd);
+
+/*! Sets options on the cram_fd.
+ *
+ * See CRAM_OPT_* definitions in hts.h.
+ * Use this immediately after opening.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...);
+
+/*! Sets options on the cram_fd (va_list variant of cram_set_option()).
+ *
+ * See CRAM_OPT_* definitions in hts.h.
+ * Use this immediately after opening.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args);
+
+/*!
+ * Attaches a header to a cram_fd.
+ *
+ * This should be used when creating a new cram_fd for writing where
+ * we have an SAM_hdr already constructed (eg from a file we've read
+ * in).
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int cram_set_header(cram_fd *fd, SAM_hdr *hdr);
+
+
+/* As int32_decode/encode, but from/to blocks instead of cram_fd */
+int int32_put_blk(cram_block *b, int32_t val);
+
+/**@}*/
+/**@{ -------------------------------------------------------------------*/
+/*! Deallocates all storage used by a SAM_hdr struct.
+ *
+ * This also decrements the header reference count. If after decrementing
+ * it is still non-zero then the header is assumed to be in use by another
+ * caller and the free is not done.
+ *
+ * This is a synonym for sam_hdr_dec_ref().
+ */
+void sam_hdr_free(SAM_hdr *hdr);
+
+/*! Returns the current length of the SAM_hdr in text form.
+ *
+ * Call sam_hdr_rebuild() first if editing has taken place.
+ */
+int sam_hdr_length(SAM_hdr *hdr);
+
+/*! Returns the string form of the SAM_hdr.
+ *
+ * Call sam_hdr_rebuild() first if editing has taken place.
+ */
+char *sam_hdr_str(SAM_hdr *hdr);
+
+/*! Appends a formatted line to an existing SAM header.
+ *
+ * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ * optional new-line. If it contains more than 1 line then multiple lines
+ * will be added in order.
+ *
+ * Len is the length of the text data, or 0 if unknown (in which case
+ * it should be null terminated).
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+
+/*! Add an @PG line.
+ *
+ * If we wish complete control over this use sam_hdr_add() directly. This
+ * function uses that, but attempts to do a lot of tedious house work for
+ * you too.
+ *
+ * - It will generate a suitable ID if the supplied one clashes.
+ * - It will generate multiple @PG records if we have multiple PG chains.
+ *
+ * Call it as per sam_hdr_add() with a series of key,value pairs ending
+ * in NULL.
+ *
+ * @return
+ * Returns 0 on success;
+ * -1 on failure
+ */
+int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...);
+
+/*!
+ * A function to help with construction of CL tags in @PG records.
+ * Takes an argc, argv pair and returns a single space-separated string.
+ * This string should be deallocated by the calling function.
+ *
+ * @return
+ * Returns malloced char * on success;
+ * NULL on failure
+ */
+char *stringify_argv(int argc, char *argv[]);
+/**@}*/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/faidx.h b/htslib/htslib/faidx.h
new file mode 100644
index 0000000..62478ef
--- /dev/null
+++ b/htslib/htslib/faidx.h
@@ -0,0 +1,137 @@
+/* faidx.h -- FASTA random access.
+
+ Copyright (C) 2008, 2009, 2013, 2014 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef HTSLIB_FAIDX_H
+#define HTSLIB_FAIDX_H
+
+#include "hts_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ @header
+
+ Index FASTA files and extract subsequence.
+
+ The fai file index columns are:
+ - chromosome name
+ - chromosome length: number of bases
+ - offset: number of bytes to skip to get to the first base
+ from the beginning of the file, including the length
+ of the sequence description string (">chr ..\n")
+ - line length: number of bases per line (excluding \n)
+ - binary line length: number of bytes, including \n
+
+ @copyright The Wellcome Trust Sanger Institute.
+ */
+
+struct __faidx_t;
+typedef struct __faidx_t faidx_t;
+
+ /*!
+ @abstract Build index for a FASTA or bgzip-compressed FASTA file.
+ @param fn FASTA file name
+ @return 0 on success; or -1 on failure
+ @discussion File "fn.fai" will be generated.
+ */
+ int fai_build(const char *fn) HTS_RESULT_USED;
+
+ /*!
+ @abstract Destroy a faidx_t struct.
+ @param fai Pointer to the struct to be destroyed
+ */
+ void fai_destroy(faidx_t *fai);
+
+ /*!
+ @abstract Load index from "fn.fai".
+ @param fn File name of the FASTA file
+ */
+ faidx_t *fai_load(const char *fn);
+
+ /*!
+ @abstract Fetch the sequence in a region.
+ @param fai Pointer to the faidx_t struct
+ @param reg Region in the format "chr2:20,000-30,000"
+ @param len Length of the region; -2 if seq not present, -1 general error
+ @return Pointer to the sequence; null on failure
+
+ @discussion The returned sequence is allocated by malloc family
+ and should be destroyed by end users by calling free() on it.
+ */
+ char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
+
+ /*!
+ @abstract Fetch the number of sequences.
+ @param fai Pointer to the faidx_t struct
+ @return The number of sequences
+ */
+ int faidx_fetch_nseq(const faidx_t *fai) HTS_DEPRECATED("Please use faidx_nseq instead");
+
+ /*!
+ @abstract Fetch the sequence in a region.
+ @param fai Pointer to the faidx_t struct
+ @param c_name Region name
+ @param p_beg_i Beginning position number (zero-based)
+ @param p_end_i End position number (zero-based)
+ @param len Length of the region; -2 if c_name not present, -1 general error
+ @return Pointer to the sequence; null on failure
+
+ @discussion The returned sequence is allocated by malloc family
+ and should be destroyed by end users by calling free() on it.
+ */
+ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len);
+
+ /*!
+ @abstract Query if sequence is present
+ @param fai Pointer to the faidx_t struct
+ @param seq Sequence name
+ @return 1 if present or 0 if absent
+ */
+ int faidx_has_seq(const faidx_t *fai, const char *seq);
+
+ /*!
+ @abstract Return number of sequences in fai index
+ */
+ int faidx_nseq(const faidx_t *fai);
+
+ /*!
+ @abstract Return name of i-th sequence
+ */
+ const char *faidx_iseq(const faidx_t *fai, int i);
+
+ /*!
+ @abstract Return sequence length, -1 if not present
+ */
+ int faidx_seq_len(const faidx_t *fai, const char *seq);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/hfile.h b/htslib/htslib/hfile.h
new file mode 100644
index 0000000..cd772dd
--- /dev/null
+++ b/htslib/htslib/hfile.h
@@ -0,0 +1,215 @@
+/* hfile.h -- buffered low-level input/output streams.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_HFILE_H
+#define HTSLIB_HFILE_H
+
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "hts_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* These fields are declared here solely for the benefit of the inline functions
+ below. They may change in future releases. User code should not use them
+ directly; you should imagine that hFILE is an opaque incomplete type. */
+struct hFILE_backend;
+typedef struct hFILE {
+ char *buffer, *begin, *end, *limit;
+ const struct hFILE_backend *backend;
+ off_t offset;
+ unsigned at_eof:1;
+ int has_errno;
+} hFILE;
+
+/*!
+ @abstract Open the named file or URL as a stream
+ @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ @notes The usual @c fopen(3) @a mode letters are supported: one of
+ @e r (read), @e w (write), @e a (append), optionally followed by any of
+ @e + (update), @e e (close on @c exec(2)), @e x (create exclusively).
+*/
+hFILE *hopen(const char *filename, const char *mode) HTS_RESULT_USED;
+
+/*!
+ @abstract Associate a stream with an existing open file descriptor
+ @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ @notes For socket descriptors (on Windows), mode should contain 's'.
+*/
+hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED;
+
+/*!
+ @abstract Report whether the file name or URL denotes remote storage
+ @return 0 if local, 1 if remote.
+ @notes "Remote" means involving e.g. explicit network access, with the
+ implication that callers may wish to cache such files' contents locally.
+*/
+int hisremote(const char *filename) HTS_RESULT_USED;
+
+/*!
+ @abstract Flush (for output streams) and close the stream
+ @return 0 if successful, or EOF (with errno set) if an error occurred.
+*/
+int hclose(hFILE *fp) HTS_RESULT_USED;
+
+/*!
+ @abstract Close the stream, without flushing or propagating errors
+ @notes For use while cleaning up after an error only. Preserves errno.
+*/
+void hclose_abruptly(hFILE *fp);
+
+/*!
+ @abstract Return the stream's error indicator
+ @return Non-zero (in fact, an errno value) if an error has occurred.
+ @notes This would be called herror() and return true/false to parallel
+ ferror(3), but a networking-related herror(3) function already exists. */
+static inline int herrno(hFILE *fp)
+{
+ return fp->has_errno;
+}
+
+/*!
+ @abstract Clear the stream's error indicator
+*/
+static inline void hclearerr(hFILE *fp)
+{
+ fp->has_errno = 0;
+}
+
+/*!
+ @abstract Reposition the read/write stream offset
+ @return The resulting offset within the stream (as per lseek(2)),
+ or negative if an error occurred.
+*/
+off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+/*!
+ @abstract Report the current stream offset
+ @return The offset within the stream, starting from zero.
+*/
+static inline off_t htell(hFILE *fp)
+{
+ return fp->offset + (fp->begin - fp->buffer);
+}
+
+/*!
+ @abstract Read one character from the stream
+ @return The character read, or EOF on end-of-file or error
+*/
+static inline int hgetc(hFILE *fp)
+{
+ extern int hgetc2(hFILE *);
+ return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp);
+}
+
+/*!
+ @abstract Peek at characters to be read without removing them from buffers
+ @param fp The file stream
+ @param buffer The buffer to which the peeked bytes will be written
+ @param nbytes The number of bytes to peek at; limited by the size of the
+ internal buffer, which could be as small as 4K.
+ @return The number of bytes peeked, which may be less than nbytes if EOF
+ is encountered; or negative, if there was an I/O error.
+ @notes The characters peeked at remain in the stream's internal buffer,
+ and will be returned by later hread() etc calls.
+*/
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+/*!
+ @abstract Read a block of characters from the file
+ @return The number of bytes read, or negative if an error occurred.
+ @notes The full nbytes requested will be returned, except as limited
+ by EOF or I/O errors.
+*/
+static inline ssize_t HTS_RESULT_USED
+hread(hFILE *fp, void *buffer, size_t nbytes)
+{
+ extern ssize_t hread2(hFILE *, void *, size_t, size_t);
+
+ size_t n = fp->end - fp->begin;
+ if (n > nbytes) n = nbytes;
+ memcpy(buffer, fp->begin, n);
+ fp->begin += n;
+ return (n == nbytes)? (ssize_t) n : hread2(fp, buffer, nbytes, n);
+}
+
+/*!
+ @abstract Write a character to the stream
+ @return The character written, or EOF if an error occurred.
+*/
+static inline int hputc(int c, hFILE *fp)
+{
+ extern int hputc2(int, hFILE *);
+ if (fp->begin < fp->limit) *(fp->begin++) = c;
+ else c = hputc2(c, fp);
+ return c;
+}
+
+/*!
+ @abstract Write a string to the stream
+ @return 0 if successful, or EOF if an error occurred.
+*/
+static inline int hputs(const char *text, hFILE *fp)
+{
+ extern int hputs2(const char *, size_t, size_t, hFILE *);
+
+ size_t nbytes = strlen(text), n = fp->limit - fp->begin;
+ if (n > nbytes) n = nbytes;
+ memcpy(fp->begin, text, n);
+ fp->begin += n;
+ return (n == nbytes)? 0 : hputs2(text, nbytes, n, fp);
+}
+
+/*!
+ @abstract Write a block of characters to the file
+ @return Either nbytes, or negative if an error occurred.
+ @notes In the absence of I/O errors, the full nbytes will be written.
+*/
+static inline ssize_t HTS_RESULT_USED
+hwrite(hFILE *fp, const void *buffer, size_t nbytes)
+{
+ extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t);
+
+ size_t n = fp->limit - fp->begin;
+ if (n > nbytes) n = nbytes;
+ memcpy(fp->begin, buffer, n);
+ fp->begin += n;
+ return (n==nbytes)? (ssize_t) n : hwrite2(fp, buffer, nbytes, n);
+}
+
+/*!
+ @abstract For writing streams, flush buffered output to the underlying stream
+ @return 0 if successful, or EOF if an error occurred.
+*/
+int hflush(hFILE *fp) HTS_RESULT_USED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/hts.h b/htslib/htslib/hts.h
new file mode 100644
index 0000000..1bac3a6
--- /dev/null
+++ b/htslib/htslib/hts.h
@@ -0,0 +1,639 @@
+/* hts.h -- format-neutral I/O, indexing, and iterator API functions.
+
+ Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_HTS_H
+#define HTSLIB_HTS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hts_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+struct cram_fd;
+struct hFILE;
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/**
+ * hts_expand() - expands memory block pointed to by $ptr;
+ * hts_expand0() the latter sets the newly allocated part to 0.
+ *
+ * @param n requested number of elements of type type_t
+ * @param m size of memory allocated
+ */
+#define hts_expand(type_t, n, m, ptr) if ((n) > (m)) { \
+ (m) = (n); kroundup32(m); \
+ (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+ }
+#define hts_expand0(type_t, n, m, ptr) if ((n) > (m)) { \
+ int t = (m); (m) = (n); kroundup32(m); \
+ (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+ memset(((type_t*)ptr)+t,0,sizeof(type_t)*((m)-t)); \
+ }
+
+/************
+ * File I/O *
+ ************/
+
+// Add new entries only at the end (but before the *_maximum entry)
+// of these enums, as their numbering is part of the htslib ABI.
+
+enum htsFormatCategory {
+ unknown_category,
+ sequence_data, // Sequence data -- SAM, BAM, CRAM, etc
+ variant_data, // Variant calling data -- VCF, BCF, etc
+ index_file, // Index file associated with some data file
+ region_list, // Coordinate intervals or regions -- BED, etc
+ category_maximum = 32767
+};
+
+enum htsExactFormat {
+ unknown_format,
+ binary_format, text_format,
+ sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed,
+ format_maximum = 32767
+};
+
+enum htsCompression {
+ no_compression, gzip, bgzf, custom,
+ compression_maximum = 32767
+};
+
+typedef struct htsFormat {
+ enum htsFormatCategory category;
+ enum htsExactFormat format;
+ struct { short major, minor; } version;
+ enum htsCompression compression;
+ short compression_level; // currently unused
+ void *specific; // format specific options; see struct hts_opt.
+} htsFormat;
+
+// Maintainers note htsFile cannot be an opaque structure because some of its
+// fields are part of libhts.so's ABI (hence these fields must not be moved):
+// - fp is used in the public sam_itr_next()/etc macros
+// - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
+// - is_write and is_cram are used directly in samtools <= 1.1
+// - fp is used directly in samtools (up to and including current develop)
+// - line is used directly in bcftools (up to and including current develop)
+typedef struct {
+ uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, dummy:28;
+ int64_t lineno;
+ kstring_t line;
+ char *fn, *fn_aux;
+ union {
+ BGZF *bgzf;
+ struct cram_fd *cram;
+ struct hFILE *hfile;
+ void *voidp;
+ } fp;
+ htsFormat format;
+} htsFile;
+
+// REQUIRED_FIELDS
+enum sam_fields {
+ SAM_QNAME = 0x00000001,
+ SAM_FLAG = 0x00000002,
+ SAM_RNAME = 0x00000004,
+ SAM_POS = 0x00000008,
+ SAM_MAPQ = 0x00000010,
+ SAM_CIGAR = 0x00000020,
+ SAM_RNEXT = 0x00000040,
+ SAM_PNEXT = 0x00000080,
+ SAM_TLEN = 0x00000100,
+ SAM_SEQ = 0x00000200,
+ SAM_QUAL = 0x00000400,
+ SAM_AUX = 0x00000800,
+ SAM_RGAUX = 0x00001000,
+};
+
+// Mostly CRAM only, but this could also include other format options
+enum hts_fmt_option {
+ // CRAM specific
+ CRAM_OPT_DECODE_MD,
+ CRAM_OPT_PREFIX,
+ CRAM_OPT_VERBOSITY, // make general
+ CRAM_OPT_SEQS_PER_SLICE,
+ CRAM_OPT_SLICES_PER_CONTAINER,
+ CRAM_OPT_RANGE,
+ CRAM_OPT_VERSION, // rename to cram_version?
+ CRAM_OPT_EMBED_REF,
+ CRAM_OPT_IGNORE_MD5,
+ CRAM_OPT_REFERENCE, // make general
+ CRAM_OPT_MULTI_SEQ_PER_SLICE,
+ CRAM_OPT_NO_REF,
+ CRAM_OPT_USE_BZIP2,
+ CRAM_OPT_SHARED_REF,
+ CRAM_OPT_NTHREADS, // deprecated, use HTS_OPT_NTHREADS
+ CRAM_OPT_THREAD_POOL,// make general
+ CRAM_OPT_USE_LZMA,
+ CRAM_OPT_USE_RANS,
+ CRAM_OPT_REQUIRED_FIELDS,
+
+ // General purpose
+ HTS_OPT_COMPRESSION_LEVEL = 100,
+ HTS_OPT_NTHREADS,
+};
+
+// For backwards compatibility
+#define cram_option hts_fmt_option
+
+typedef struct hts_opt {
+ char *arg; // string form, strdup()ed
+ enum hts_fmt_option opt; // tokenised key
+ union { // ... and value
+ int i;
+ char *s;
+ } val;
+ struct hts_opt *next;
+} hts_opt;
+
+#define HTS_FILE_OPTS_INIT {{0},0}
+
+/**********************
+ * Exported functions *
+ **********************/
+
+/*
+ * Parses arg and appends it to the option list.
+ *
+ * Returns 0 on success;
+ * -1 on failure.
+ */
+int hts_opt_add(hts_opt **opts, const char *c_arg);
+
+/*
+ * Applies an hts_opt option list to a given htsFile.
+ *
+ * Returns 0 on success
+ * -1 on failure
+ */
+int hts_opt_apply(htsFile *fp, hts_opt *opts);
+
+/*
+ * Frees an hts_opt list.
+ */
+void hts_opt_free(hts_opt *opts);
+
+/*
+ * Accepts a string file format (sam, bam, cram, vcf, bcf) optionally
+ * followed by a comma separated list of key=value options and splits
+ * these up into the fields of htsFormat struct.
+ *
+ * Returns 0 on success
+ * -1 on failure.
+ */
+int hts_parse_format(htsFormat *opt, const char *str);
+
+/*
+ * Tokenise options as (key(=value)?,)*(key(=value)?)?
+ * NB: No provision for ',' appearing in the value!
+ * Add backslashing rules?
+ *
+ * This could be used as part of a general command line option parser or
+ * as a string concatenated onto the file open mode.
+ *
+ * Returns 0 on success
+ * -1 on failure.
+ */
+int hts_parse_opt_list(htsFormat *opt, const char *str);
+
+extern int hts_verbose;
+
+/*! @abstract Table for converting a nucleotide character to 4-bit encoding.
+The input character may be either an IUPAC ambiguity code, '=' for 0, or
+'0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
+for A/C/G/T or combinations of these bits for ambiguous bases.
+*/
+extern const unsigned char seq_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ambiguity code letter (or '=' when given 0).
+*/
+extern const char seq_nt16_str[];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+*/
+extern const int seq_nt16_int[];
+
+/*!
+ @abstract Get the htslib version number
+ @return For released versions, a string like "N.N[.N]"; or git describe
+ output if using a library built within a Git repository.
+*/
+const char *hts_version(void);
+
+/*!
+ @abstract Determine format by peeking at the start of a file
+ @param fp File opened for reading, positioned at the beginning
+ @param fmt Format structure that will be filled out on return
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_detect_format(struct hFILE *fp, htsFormat *fmt);
+
+/*!
+ @abstract Get a human-readable description of the file format
+ @param format Format structure holding type, version, compression, etc.
+ @return Description string, to be freed by the caller after use.
+*/
+char *hts_format_description(const htsFormat *format);
+
+/*!
+ @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ @param fn The file name or "-" for stdin/stdout
+ @param mode Mode matching / [rwa][bceguxz0-9]* /
+ @discussion
+ With 'r' opens for reading; any further format mode letters are ignored
+ as the format is detected by checking the first few bytes or BGZF blocks
+ of the file. With 'w' or 'a' opens for writing or appending, with format
+ specifier letters:
+ b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
+ c CRAM format
+ g gzip compressed
+ u uncompressed
+ z bgzf compressed
+ [0-9] zlib compression level
+ and with non-format option letters (for any of 'r'/'w'/'a'):
+ e close the file on exec(2) (opens with O_CLOEXEC, where supported)
+ x create the file exclusively (opens with O_EXCL, where supported)
+ Note that there is a distinction between 'u' and '0': the first yields
+ plain uncompressed output whereas the latter outputs uncompressed data
+ wrapped in the zlib format.
+ @example
+ [rw]b .. compressed BCF, BAM, FAI
+ [rw]bu .. uncompressed BCF
+ [rw]z .. compressed VCF
+ [rw] .. uncompressed VCF
+*/
+htsFile *hts_open(const char *fn, const char *mode);
+
+/*!
+ @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ @param fn The file name or "-" for stdin/stdout
+ @param mode Open mode, as per hts_open()
+ @param fmt Optional format specific parameters
+ @discussion
+ See hts_open() for description of fn and mode.
+ // TODO Update documentation for s/opts/fmt/
+ Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
+ if defined, override mode. Opts also contains a linked list of hts_opt
+ structures to apply to the open file handle. These can contain things
+ like pointers to the reference or information on compression levels,
+ block sizes, etc.
+*/
+htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt);
+
+/*!
+ @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+ @param fp The already-open file handle
+ @param mode Open mode, as per hts_open()
+*/
+htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode);
+
+/*!
+ @abstract Close a file handle, flushing buffered data for output streams
+ @param fp The file handle to be closed
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_close(htsFile *fp);
+
+/*!
+ @abstract Returns the file's format information
+ @param fp The file handle
+ @return Read-only pointer to the file's htsFormat.
+*/
+const htsFormat *hts_get_format(htsFile *fp);
+
+/*!
+ @abstract Returns a string containing the file format extension.
+ @param format Format structure containing the file type.
+ @return A string ("sam", "bam", etc) or "?" for unknown formats.
+ */
+const char *hts_format_file_extension(const htsFormat *format);
+
+/*!
+ @abstract Sets a specified CRAM option on the open file handle.
+ @param fp The file handle of the open file.
+ @param opt The CRAM_OPT_* option.
+ @param ... Optional arguments, dependent on the option used.
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...);
+
+int hts_getline(htsFile *fp, int delimiter, kstring_t *str);
+char **hts_readlines(const char *fn, int *_n);
+/*!
+ @abstract Parse comma-separated list or read list from a file
+ @param fn File name or comma-separated list
+ @param is_file
+ @param _n Size of the output array (number of items read)
+ @return NULL on failure or pointer to newly allocated array of
+ strings
+*/
+char **hts_readlist(const char *fn, int is_file, int *_n);
+
+/*!
+ @abstract Create extra threads to aid compress/decompression for this file
+ @param fp The file handle
+ @param n The number of worker threads to create
+ @return 0 for success, or negative if an error occurred.
+ @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
+*/
+int hts_set_threads(htsFile *fp, int n);
+
+/*!
+ @abstract Set .fai filename for a file opened for reading
+ @return 0 for success, negative on failure
+ @discussion
+ Called before *_hdr_read(), this provides the name of a .fai file
+ used to provide a reference list if the htsFile contains no @SQ headers.
+*/
+int hts_set_fai_filename(htsFile *fp, const char *fn_aux);
+
+/************
+ * Indexing *
+ ************/
+
+/*!
+These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc,
+producing iterators operating as follows:
+ - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
+ - HTS_IDX_START iterates over the entire file
+ - HTS_IDX_REST iterates from the current position to the end of the file
+ - HTS_IDX_NONE always returns "no more alignment records"
+When one of these special tid values is used, beg and end are ignored.
+When REST or NONE is used, idx is also ignored and may be NULL.
+*/
+#define HTS_IDX_NOCOOR (-2)
+#define HTS_IDX_START (-3)
+#define HTS_IDX_REST (-4)
+#define HTS_IDX_NONE (-5)
+
+#define HTS_FMT_CSI 0
+#define HTS_FMT_BAI 1
+#define HTS_FMT_TBI 2
+#define HTS_FMT_CRAI 3
+
+struct __hts_idx_t;
+typedef struct __hts_idx_t hts_idx_t;
+
+typedef struct {
+ uint64_t u, v;
+} hts_pair64_t;
+
+typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end);
+
+typedef struct {
+ uint32_t read_rest:1, finished:1, dummy:29;
+ int tid, beg, end, n_off, i;
+ int curr_tid, curr_beg, curr_end;
+ uint64_t curr_off;
+ hts_pair64_t *off;
+ hts_readrec_func *readrec;
+ struct {
+ int n, m;
+ int *a;
+ } bins;
+} hts_itr_t;
+
+ #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7)
+ #define hts_bin_parent(l) (((l) - 1) >> 3)
+
+ hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls);
+ void hts_idx_destroy(hts_idx_t *idx);
+ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped);
+ void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset);
+
+/// Save an index to a file
+/** @param idx Index to be written
+ @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
+ @param fmt One of the HTS_FMT_* index formats
+ @return 0 if successful, or negative if an error occurred.
+*/
+int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) HTS_RESULT_USED;
+
+/// Save an index to a specific file
+/** @param idx Index to be written
+ @param fn Input BAM/BCF/etc filename
+ @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ @param fmt One of the HTS_FMT_* index formats
+ @return 0 if successful, or negative if an error occurred.
+*/
+int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) HTS_RESULT_USED;
+
+/// Load an index file
+/** @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
+ the extension substituted, to search for an existing index file
+ @param fmt One of the HTS_FMT_* index formats
+ @return The index, or NULL if an error occurred.
+*/
+hts_idx_t *hts_idx_load(const char *fn, int fmt);
+
+/// Load a specific index file
+/** @param fn Input BAM/BCF/etc filename
+ @param fnidx The input index filename
+ @return The index, or NULL if an error occurred.
+*/
+hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx);
+
+ uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta);
+ void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy);
+
+ int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped);
+ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx);
+
+
+#define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers
+
+/// Parse a numeric string
+/** The number may be expressed in scientific notation, and optionally may
+ contain commas in the integer part (before any decimal point or E notation).
+ @param str String to be parsed
+ @param strend If non-NULL, set on return to point to the first character
+ in @a str after those forming the parsed number
+ @param flags Or'ed-together combination of HTS_PARSE_* flags
+ @return Converted value of the parsed number.
+
+ When @a strend is NULL, a warning will be printed (if hts_verbose is 2
+ or more) if there are any trailing characters after the number.
+*/
+long long hts_parse_decimal(const char *str, char **strend, int flags);
+
+/// Parse a "CHR:START-END"-style region string
+/** @param str String to be parsed
+ @param beg Set on return to the 0-based start of the region
+ @param end Set on return to the 1-based end of the region
+ @return Pointer to the colon or '\0' after the reference sequence name,
+ or NULL if @a str could not be parsed.
+*/
+const char *hts_parse_reg(const char *str, int *beg, int *end);
+
+ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+ void hts_itr_destroy(hts_itr_t *iter);
+
+ typedef int (*hts_name2id_f)(void*, const char*);
+ typedef const char *(*hts_id2name_f)(void*, int);
+ typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+
+ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec);
+ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED;
+ const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values
+
+ /**
+ * hts_file_type() - Convenience function to determine file type
+ * DEPRECATED: This function has been replaced by hts_detect_format().
+ * It and these FT_* macros will be removed in a future HTSlib release.
+ */
+ #define FT_UNKN 0
+ #define FT_GZ 1
+ #define FT_VCF 2
+ #define FT_VCF_GZ (FT_GZ|FT_VCF)
+ #define FT_BCF (1<<2)
+ #define FT_BCF_GZ (FT_GZ|FT_BCF)
+ #define FT_STDIN (1<<3)
+ int hts_file_type(const char *fname);
+
+
+ /**********************
+ * MD5 implementation *
+ **********************/
+
+ struct hts_md5_context;
+ typedef struct hts_md5_context hts_md5_context;
+
+ /*! @abstract Initialises an MD5 context.
+ * @discussion
+ * The expected use is to allocate an hts_md5_context using
+ * hts_md5_init(). This pointer is then passed into one or more calls
+ * of hts_md5_update() to compute successive internal portions of the
+ * MD5 sum, which can then be externalised as a full 16-byte MD5sum
+ * calculation by calling hts_md5_final(). This can then be turned
+ * into ASCII via hts_md5_hex().
+ *
+ * To deallocate any resources created by hts_md5_init() call the
+ * hts_md5_destroy() function.
+ *
+ * @return hts_md5_context pointer on success, NULL otherwise.
+ */
+ hts_md5_context *hts_md5_init(void);
+
+ /*! @abstract Updates the context with the MD5 of the data. */
+ void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size);
+
+ /*! @abstract Computes the final 128-bit MD5 hash from the given context */
+ void hts_md5_final(unsigned char *digest, hts_md5_context *ctx);
+
+ /*! @abstract Resets an md5_context to the initial state, as returned
+ * by hts_md5_init().
+ */
+ void hts_md5_reset(hts_md5_context *ctx);
+
+ /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-terminated
+ * hex string.
+ */
+ void hts_md5_hex(char *hex, const unsigned char *digest);
+
+ /*! @abstract Deallocates any memory allocated by hts_md5_init. */
+ void hts_md5_destroy(hts_md5_context *ctx);
+
+
+static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
+{
+ int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7;
+ for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l))
+ if (beg>>s == end>>s) return t + (beg>>s);
+ return 0;
+}
+
+static inline int hts_bin_bot(int bin, int n_lvls)
+{
+ int l, b;
+ for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); // compute the level of bin
+ return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
+}
+
+/**************
+ * Endianness *
+ **************/
+
+static inline int ed_is_big(void)
+{
+ long one= 1;
+ return !(*((char *)(&one)));
+}
+static inline uint16_t ed_swap_2(uint16_t v)
+{
+ return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
+}
+static inline void *ed_swap_2p(void *x)
+{
+ *(uint16_t*)x = ed_swap_2(*(uint16_t*)x);
+ return x;
+}
+static inline uint32_t ed_swap_4(uint32_t v)
+{
+ v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
+ return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
+}
+static inline void *ed_swap_4p(void *x)
+{
+ *(uint32_t*)x = ed_swap_4(*(uint32_t*)x);
+ return x;
+}
+static inline uint64_t ed_swap_8(uint64_t v)
+{
+ v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
+ v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
+ return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
+}
+static inline void *ed_swap_8p(void *x)
+{
+ *(uint64_t*)x = ed_swap_8(*(uint64_t*)x);
+ return x;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/hts_defs.h b/htslib/htslib/hts_defs.h
new file mode 100644
index 0000000..0a672c6
--- /dev/null
+++ b/htslib/htslib/hts_defs.h
@@ -0,0 +1,72 @@
+/* hts_defs.h -- Miscellaneous definitions.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_HTS_DEFS_H
+#define HTSLIB_HTS_DEFS_H
+
+#ifdef __clang__
+#ifdef __has_attribute
+#define HTS_COMPILER_HAS(attribute) __has_attribute(attribute)
+#endif
+
+#elif defined __GNUC__
+#define HTS_GCC_AT_LEAST(major, minor) \
+ (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#endif
+
+#ifndef HTS_COMPILER_HAS
+#define HTS_COMPILER_HAS(attribute) 0
+#endif
+#ifndef HTS_GCC_AT_LEAST
+#define HTS_GCC_AT_LEAST(major, minor) 0
+#endif
+
+#if HTS_COMPILER_HAS(__noreturn__) || HTS_GCC_AT_LEAST(3,0)
+#define HTS_NORETURN __attribute__ ((__noreturn__))
+#else
+#define HTS_NORETURN
+#endif
+
+// GCC introduced warn_unused_result in 3.4 but added -Wno-unused-result later
+#if HTS_COMPILER_HAS(__warn_unused_result__) || HTS_GCC_AT_LEAST(4,5)
+#define HTS_RESULT_USED __attribute__ ((__warn_unused_result__))
+#else
+#define HTS_RESULT_USED
+#endif
+
+#if HTS_COMPILER_HAS(__unused__) || HTS_GCC_AT_LEAST(3,0)
+#define HTS_UNUSED __attribute__ ((__unused__))
+#else
+#define HTS_UNUSED
+#endif
+
+#if HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(4,5)
+#define HTS_DEPRECATED(message) __attribute__ ((__deprecated__ (message)))
+#elif HTS_GCC_AT_LEAST(3,1)
+#define HTS_DEPRECATED(message) __attribute__ ((__deprecated__))
+#else
+#define HTS_DEPRECATED(message)
+#endif
+
+#endif
diff --git a/htslib/htslib/kbitset.h b/htslib/htslib/kbitset.h
new file mode 100644
index 0000000..22fb34d
--- /dev/null
+++ b/htslib/htslib/kbitset.h
@@ -0,0 +1,160 @@
+/* The MIT License
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KBITSET_H
+#define KBITSET_H
+
+/* Example of using kbitset_t, which represents a subset of {0,..., N-1},
+ where N is the size specified in kbs_init().
+
+ kbitset_t *bset = kbs_init(100);
+ kbs_insert(bset, 5);
+ kbs_insert(bset, 68);
+ kbs_delete(bset, 37);
+ // ...
+
+ if (kbs_exists(bset, 68)) printf("68 present\n");
+
+ kbitset_iter_t itr;
+ int i;
+ kbs_start(&itr);
+ while ((i = kbs_next(bset, &itr)) >= 0)
+ printf("%d present\n", i);
+
+ kbs_destroy(bset);
+
+ Example of declaring a kbitset_t-using function in a header file, so that
+ only source files that actually use process() need to include <kbitset.h>:
+
+ struct kbitset_t;
+ void process(struct kbitset_t *bset);
+*/
+
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define KBS_ELTBITS (CHAR_BIT * sizeof (unsigned long))
+#define KBS_ELT(i) ((i) / KBS_ELTBITS)
+#define KBS_MASK(i) (1UL << ((i) % KBS_ELTBITS))
+
+typedef struct kbitset_t {
+ size_t n;            // number of unsigned long words that hold set members
+ unsigned long b[1];  // kbs_init2() allocates n words here plus one extra word, b[n]
+} kbitset_t;
+
+// Initialise a bit set capable of holding ni integers, 0 <= i < ni.
+// The set returned is empty if fill == 0, or all of [0,ni) otherwise.
+static inline kbitset_t *kbs_init2(size_t ni, int fill)
+{
+    size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS;
+    kbitset_t *bs = (kbitset_t *) malloc(sizeof *bs + n * sizeof bs->b[0]);
+    if (bs == NULL) return NULL;
+    bs->n = n;
+    memset(bs->b, fill? ~0 : 0, n * sizeof (unsigned long));
+    if (fill && ni % KBS_ELTBITS) bs->b[n-1] &= KBS_MASK(ni) - 1;  // drop bits >= ni
+    bs->b[n] = ~0UL;  // non-zero sentinel word that ends the zero-word scan in kbs_next()
+    return bs;
+}
+
+// Initialise an empty bit set capable of holding ni integers, 0 <= i < ni.
+static inline kbitset_t *kbs_init(size_t ni)
+{
+ return kbs_init2(ni, 0);
+}
+
+// Destroy a bit set.
+static inline void kbs_destroy(kbitset_t *bs)
+{
+ free(bs);
+}
+
+// Reset the bit set to empty.
+static inline void kbs_clear(kbitset_t *bs)
+{
+ memset(bs->b, 0, bs->n * sizeof (unsigned long));
+}
+
+// Reset the bit set to all of [0,ni); any unused high bits of the last word are set too.
+static inline void kbs_insert_all(kbitset_t *bs)
+{
+ memset(bs->b, ~0, bs->n * sizeof (unsigned long));
+}
+
+// Insert an element into the bit set.
+static inline void kbs_insert(kbitset_t *bs, int i)
+{
+ bs->b[KBS_ELT(i)] |= KBS_MASK(i);
+}
+
+// Remove an element from the bit set.
+static inline void kbs_delete(kbitset_t *bs, int i)
+{
+ bs->b[KBS_ELT(i)] &= ~KBS_MASK(i);
+}
+
+// Test whether the bit set contains the element.
+static inline int kbs_exists(const kbitset_t *bs, int i)
+{
+ return (bs->b[KBS_ELT(i)] & KBS_MASK(i)) != 0;
+}
+
+typedef struct kbitset_iter_t {
+ unsigned long mask;  // single-bit mask of the next bit to test; 0 once the current word is used up
+ size_t elt;          // index into b[] of the word currently being scanned
+ int i;               // element number corresponding to the (elt, mask) position
+} kbitset_iter_t;
+
+// Initialise or reset a bit set iterator.
+static inline void kbs_start(kbitset_iter_t *itr)
+{
+ itr->mask = 1;
+ itr->elt = 0;
+ itr->i = 0;
+}
+
+// Return the next element contained in the bit set, or -1 if there are no more.
+static inline int kbs_next(const kbitset_t *bs, kbitset_iter_t *itr)
+{
+ unsigned long b = bs->b[itr->elt];
+
+ for (;;) {
+ if (itr->mask == 0) {  // mask was shifted past the top bit: the current word is finished
+ while ((b = bs->b[++itr->elt]) == 0) itr->i += KBS_ELTBITS;  // skip all-zero words, a whole word of i at a time
+ if (itr->elt == bs->n) return -1;  // stopped on b[n], which kbs_init2() sets to ~0UL so the scan above terminates
+ itr->mask = 1;
+ }
+
+ if (b & itr->mask) break;  // bit is set: i is a member
+
+ itr->i++;
+ itr->mask <<= 1;
+ }
+
+ itr->mask <<= 1;  // step past the returned bit so the next call resumes after it
+ return itr->i++;
+}
+
+#endif
diff --git a/htslib/htslib/kfunc.h b/htslib/htslib/kfunc.h
new file mode 100644
index 0000000..162c90d
--- /dev/null
+++ b/htslib/htslib/kfunc.h
@@ -0,0 +1,83 @@
+/* The MIT License
+
+ Copyright (C) 2010, 2013 Genome Research Ltd.
+ Copyright (C) 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef HTSLIB_KFUNC_H
+#define HTSLIB_KFUNC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Log gamma function
+ * \log{\Gamma(z)}
+ * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+ */
+double kf_lgamma(double z);
+
+/* complementary error function
+ * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
+ * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
+ */
+double kf_erfc(double x);
+
+/* The following computes regularized incomplete gamma functions.
+ * Formulas are taken from Wiki, with additional input from Numerical
+ * Recipes in C (for modified Lentz's algorithm) and AS245
+ * (http://lib.stat.cmu.edu/apstat/245).
+ *
+ * A good online calculator is available at:
+ *
+ * http://www.danielsoper.com/statcalc/calc23.aspx
+ *
+ * It calculates upper incomplete gamma function, which equals
+ * kf_gammaq(s,z)*tgamma(s).
+ */
+
+double kf_gammap(double s, double z);
+double kf_gammaq(double s, double z);
+
+/* Regularized incomplete beta function. The method is taken from
+ * Numerical Recipes in C, 2nd edition, section 6.4. The following web
+ * page calculates the incomplete beta function, which equals
+ * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
+ *
+ * http://www.danielsoper.com/statcalc/calc36.aspx
+ */
+double kf_betai(double a, double b, double x);
+
+/*
+ * n11 n12 | n1_
+ * n21 n22 | n2_
+ * -----------+----
+ * n_1 n_2 | n
+ */
+double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/khash.h b/htslib/htslib/khash.h
new file mode 100644
index 0000000..06fc7a3
--- /dev/null
+++ b/htslib/htslib/khash.h
@@ -0,0 +1,627 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2013-05-02 (0.2.8):
+
+ * Use quadratic probing. When the capacity is power of 2, stepping function
+ i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+ hashing on cache performance and is more robust than linear probing.
+
+ In theory, double hashing should be more robust than quadratic probing.
+ However, my implementation is probably not for large hash tables, because
+ the second hash function is closely tied to the first hash function,
+ which reduces the effectiveness of double hashing.
+
+ Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+ 2011-12-29 (0.2.7):
+
+ * Minor code clean up; no actual effect.
+
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+ - http://code.google.com/p/ulib/
+ - http://nothings.org/computer/judy/
+
+ * Allow to optionally use linear probing which usually has better
+ performance for random input. Double hashing is still the default as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow to declare global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+ typedef struct kh_##name##_s { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
+ extern kh_##name##_t *kh_init_##name(void); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ kfree((void *)h->keys); kfree(h->flags); \
+ kfree((void *)h->vals); \
+ kfree(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t k, i, last, mask, step = 0; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + (++step)) & mask; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (!new_flags) return -1; \
+ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (!new_keys) { kfree(new_flags); return -1; } \
+ h->keys = new_keys; \
+ if (kh_is_map) { \
+ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ if (!new_vals) { kfree(new_flags); return -1; } \
+ h->vals = new_vals; \
+ } \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khint_t k, i, step = 0; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ kfree(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ return 0; \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) { \
+ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+ { \
+ khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+ else { \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ i = (i + (++step)) & mask; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+ khint_t h = (khint_t)*s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+ return h;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* Wang's hash as an alternative to kh_int_hash_func */
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: -1 if the operation failed;
+ 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Iterate over the entries in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param kvar Variable to which key will be assigned
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (kvar) = kh_key(h,__i); \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/*! @function
+ @abstract Iterate over the values in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash set containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash set containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/htslib/htslib/khash_str2int.h b/htslib/htslib/khash_str2int.h
new file mode 100644
index 0000000..4bbc100
--- /dev/null
+++ b/htslib/htslib/khash_str2int.h
@@ -0,0 +1,133 @@
+/* khash_str2int.h -- C-string to integer hash table.
+
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_KHASH_STR2INT_H
+#define HTSLIB_KHASH_STR2INT_H
+
+#include <htslib/khash.h>
+
+KHASH_MAP_INIT_STR(str2int, int)
+
+/*
+ * Wrappers for khash dictionaries used by mpileup.
+ */
+
+static inline void *khash_str2int_init(void)
+{
+ return kh_init(str2int);
+}
+
+/*
+ * Destroy the hash structure, but not the keys
+ */
+static inline void khash_str2int_destroy(void *_hash)
+{
+ khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+ if (hash) kh_destroy(str2int, hash); // Note that strings are not freed.
+}
+
+/*
+ * Destroys both the hash structure and the keys
+ */
+static inline void khash_str2int_destroy_free(void *_hash)
+{
+ khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+ khint_t k;
+ if (hash == 0) return;
+ for (k = 0; k < kh_end(hash); ++k)
+ if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
+ kh_destroy(str2int, hash);
+}
+
+/*
+ *  Returns 1 if key exists or 0 if not. A NULL _hash holds no keys,
+ *  so 0 is returned for it.
+ */
+static inline int khash_str2int_has_key(void *_hash, const char *str)
+{
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    if ( !hash ) return 0;
+    return kh_get(str2int, hash, str) != kh_end(hash);
+}
+
+/*
+ * Returns 0 on success and -1 when the key is not present. On success,
+ * *value is set, unless NULL is passed.
+ */
+static inline int khash_str2int_get(void *_hash, const char *str, int *value)
+{
+ khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+ khint_t k;
+ if ( !hash ) return -1;
+ k = kh_get(str2int, hash, str);
+ if ( k == kh_end(hash) ) return -1;
+ if ( !value ) return 0;
+ *value = kh_val(hash, k);
+ return 0;
+}
+
+/*
+ *  Add a new string to the dictionary, auto-incrementing the value.
+ *  Returns the string's integer id, or -1 on error (NULL _hash or failed
+ *  insertion). The key must continue to exist for the whole life of _hash.
+ */
+static inline int khash_str2int_inc(void *_hash, const char *str)
+{
+    khint_t k;
+    int ret;
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    if ( !hash ) return -1;
+    k = kh_put(str2int, hash, str, &ret);
+    if (ret < 0) return -1;     // insertion failed: k is kh_end(hash), not a bucket
+    if (ret == 0) return kh_val(hash, k);   // already present: keep its id
+    kh_val(hash, k) = kh_size(hash) - 1;
+    return kh_val(hash, k);
+}
+
+/*
+ *  Set a new key,value pair. Returns the bin index on success, or -1 on
+ *  error. The key must continue to exist for the whole life of _hash.
+ */
+static inline int khash_str2int_set(void *_hash, const char *str, int value)
+{
+    khint_t k;
+    int ret;
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    if ( !hash ) return -1;
+    k = kh_put(str2int, hash, str, &ret);
+    if (ret < 0) return -1;     // insertion failed: k is kh_end(hash), not a bucket
+    kh_val(hash,k) = value;
+    return k;
+}
+
+/*
+ *  Return the number of keys in the hash table (0 if _hash is NULL).
+ */
+static inline int khash_str2int_size(void *_hash)
+{
+    khash_t(str2int) *hash = (khash_t(str2int)*)_hash;
+    return hash ? (int) kh_size(hash) : 0;
+}
+
+#endif
diff --git a/htslib/htslib/klist.h b/htslib/htslib/klist.h
new file mode 100644
index 0000000..adc3db1
--- /dev/null
+++ b/htslib/htslib/klist.h
@@ -0,0 +1,135 @@
+/* The MIT License
+
+ Copyright (c) 2008-2009, by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef _AC_KLIST_H
+#define _AC_KLIST_H
+
+#include <stdlib.h>
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+/*
+ * KMEMPOOL_INIT2(SCOPE, name, kmptype_t, kmpfree_f) expands to a recycling
+ * pool type kmp_<name>_t for objects of type kmptype_t and four functions:
+ *   kmp_init_<name>()       calloc()s an empty pool;
+ *   kmp_destroy_<name>(mp)  calls kmpfree_f() on, then free()s, every object
+ *                           currently held in the pool, then frees the pool
+ *                           (objects still handed out are not touched);
+ *   kmp_alloc_<name>(mp)    returns a held object if there is one, otherwise
+ *                           a freshly calloc()ed one; recycled objects are
+ *                           returned as they were handed back;
+ *   kmp_free_<name>(mp, p)  stores p in the pool for later reuse.
+ * Fields: cnt = objects currently handed out, n = objects held in buf,
+ * max = capacity of buf.  Allocation results are not checked.
+ */
+#define KMEMPOOL_INIT2(SCOPE, name, kmptype_t, kmpfree_f) \
+ typedef struct { \
+ size_t cnt, n, max; \
+ kmptype_t **buf; \
+ } kmp_##name##_t; \
+ SCOPE kmp_##name##_t *kmp_init_##name(void) { \
+ return calloc(1, sizeof(kmp_##name##_t)); \
+ } \
+ SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \
+ size_t k; \
+ for (k = 0; k < mp->n; ++k) { \
+ kmpfree_f(mp->buf[k]); free(mp->buf[k]); \
+ } \
+ free(mp->buf); free(mp); \
+ } \
+ SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
+ ++mp->cnt; \
+ if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \
+ return mp->buf[--mp->n]; \
+ } \
+ SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
+ --mp->cnt; \
+ if (mp->n == mp->max) { \
+ mp->max = mp->max? mp->max<<1 : 16; \
+ mp->buf = realloc(mp->buf, sizeof(kmptype_t *) * mp->max); \
+ } \
+ mp->buf[mp->n++] = p; \
+ }
+
+/* KMEMPOOL_INIT2 with every generated function declared static inline
+ * and tagged klib_unused. */
+#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \
+ KMEMPOOL_INIT2(static inline klib_unused, name, kmptype_t, kmpfree_f)
+
+/* Name-dispatching shorthands: each forwards to the kmp_*_<name>
+ * identifier generated by KMEMPOOL_INIT2 for the same name. */
+#define kmempool_t(name) kmp_##name##_t
+#define kmp_init(name) kmp_init_##name()
+#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
+#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
+#define kmp_free(name, mp, p) kmp_free_##name(mp, p)
+
+/*
+ * KLIST_INIT2(SCOPE, name, kltype_t, kmpfree_t) expands to a singly linked
+ * FIFO list of kltype_t whose nodes come from a KMEMPOOL_INIT2 pool.
+ * The list always ends in one spare node: tail is the slot that the next
+ * push will fill, so the list is empty when head->next == 0.
+ *   kl_init_<name>()        creates the pool and the initial spare node;
+ *   kl_destroy_<name>(kl)   hands every node back to the pool, destroys the
+ *                           pool and frees the list;
+ *   kl_pushp_<name>(kl)     appends a new spare node and returns a pointer
+ *                           to the data slot of the previous spare node,
+ *                           which the caller then fills in;
+ *   kl_shift_<name>(kl, d)  removes the head element; returns -1 if the
+ *                           list is empty, otherwise 0 after copying the
+ *                           element to *d when d is non-NULL.
+ * size counts stored elements.  Allocation results are not checked.
+ */
+#define KLIST_INIT2(SCOPE, name, kltype_t, kmpfree_t) \
+ struct __kl1_##name { \
+ kltype_t data; \
+ struct __kl1_##name *next; \
+ }; \
+ typedef struct __kl1_##name kl1_##name; \
+ KMEMPOOL_INIT2(SCOPE, name, kl1_##name, kmpfree_t) \
+ typedef struct { \
+ kl1_##name *head, *tail; \
+ kmp_##name##_t *mp; \
+ size_t size; \
+ } kl_##name##_t; \
+ SCOPE kl_##name##_t *kl_init_##name(void) { \
+ kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \
+ kl->mp = kmp_init(name); \
+ kl->head = kl->tail = kmp_alloc(name, kl->mp); \
+ kl->head->next = 0; \
+ return kl; \
+ } \
+ SCOPE void kl_destroy_##name(kl_##name##_t *kl) { \
+ kl1_##name *p; \
+ for (p = kl->head; p != kl->tail; p = p->next) \
+ kmp_free(name, kl->mp, p); \
+ kmp_free(name, kl->mp, p); \
+ kmp_destroy(name, kl->mp); \
+ free(kl); \
+ } \
+ SCOPE kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \
+ kl1_##name *q, *p = kmp_alloc(name, kl->mp); \
+ q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \
+ ++kl->size; \
+ return &q->data; \
+ } \
+ SCOPE int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \
+ kl1_##name *p; \
+ if (kl->head->next == 0) return -1; \
+ --kl->size; \
+ p = kl->head; kl->head = kl->head->next; \
+ if (d) *d = p->data; \
+ kmp_free(name, kl->mp, p); \
+ return 0; \
+ }
+
+/* KLIST_INIT2 with every generated function declared static inline
+ * and tagged klib_unused. */
+#define KLIST_INIT(name, kltype_t, kmpfree_t) \
+ KLIST_INIT2(static inline klib_unused, name, kltype_t, kmpfree_t)
+
+/* Iteration helpers: walk from kl_begin(kl) with kl_next() until the
+ * iterator equals kl_end(kl); kl_end is the spare tail node and holds
+ * no element. */
+#define kliter_t(name) kl1_##name
+#define klist_t(name) kl_##name##_t
+#define kl_val(iter) ((iter)->data)
+#define kl_next(iter) ((iter)->next)
+#define kl_begin(kl) ((kl)->head)
+#define kl_end(kl) ((kl)->tail)
+
+/* Name-dispatching shorthands for the functions generated by KLIST_INIT2. */
+#define kl_init(name) kl_init_##name()
+#define kl_destroy(name, kl) kl_destroy_##name(kl)
+#define kl_pushp(name, kl) kl_pushp_##name(kl)
+#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
+
+#endif
diff --git a/htslib/htslib/knetfile.h b/htslib/htslib/knetfile.h
new file mode 100644
index 0000000..b200a51
--- /dev/null
+++ b/htslib/htslib/knetfile.h
@@ -0,0 +1,101 @@
+/* The MIT License
+
+ Copyright (c) 2008 by Genome Research Ltd (GRL).
+ 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KNETFILE_H
+#define KNETFILE_H
+
+#include <stdint.h>
+#include <fcntl.h>
+
+/* Platform shims for descriptor I/O: read/write/close everywhere except
+ * Windows, where the Winsock recv/send/closesocket calls are used. */
+#ifndef _WIN32
+#define netread(fd, ptr, len) read(fd, ptr, len)
+#define netwrite(fd, ptr, len) write(fd, ptr, len)
+#define netclose(fd) close(fd)
+#else
+#include <winsock2.h>
+#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
+#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
+#define netclose(fd) closesocket(fd)
+#endif
+
+// FIXME: currently I/O is unbuffered
+
+#define KNF_TYPE_LOCAL 1
+#define KNF_TYPE_FTP 2
+#define KNF_TYPE_HTTP 3
+
+// Handle for a file opened through knet_open()/knet_dopen().
+typedef struct knetFile_s {
+ // type: presumably one of the KNF_TYPE_* constants -- confirm in knetfile.c
+ // fd: descriptor exposed by knet_fileno()
+ int type, fd;
+ // current position, exposed by knet_tell()
+ int64_t offset;
+ char *host, *port;
+
+ // the following are for FTP only
+ int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
+ char *response, *retr, *size_cmd;
+ int64_t seek_offset; // for lazy seek
+ int64_t file_size;
+
+ // the following are for HTTP only
+ char *path, *http_host;
+} knetFile;
+
+#define knet_tell(fp) ((fp)->offset)
+#define knet_fileno(fp) ((fp)->fd)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+ /* Windows-only setup/teardown hooks (presumably Winsock startup and
+    cleanup -- confirm in knetfile.c).  Declared with (void) so that C
+    callers get real prototypes rather than unchecked old-style ones. */
+ int knet_win32_init(void);
+ void knet_win32_destroy(void);
+#endif
+
+ knetFile *knet_open(const char *fn, const char *mode);
+
+ /*
+ This only works with local files.
+ */
+ knetFile *knet_dopen(int fd, const char *mode);
+
+ /*
+ If ->is_ready==0, this routine updates ->fd; otherwise, it simply
+ reads from ->fd.
+ */
+ ssize_t knet_read(knetFile *fp, void *buf, size_t len);
+
+ /*
+ This routine only sets ->offset and ->is_ready=0. It does not
+ communicate with the FTP server.
+ */
+ off_t knet_seek(knetFile *fp, off_t off, int whence);
+ int knet_close(knetFile *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/kseq.h b/htslib/htslib/kseq.h
new file mode 100644
index 0000000..e1a3eaa
--- /dev/null
+++ b/htslib/htslib/kseq.h
@@ -0,0 +1,253 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
+
+/*
+ * __KS_TYPE(type_t) declares kstream_t, a buffered reader over a handle
+ * of type type_t.
+ *   begin, end  unread bytes are buf[begin..end)
+ *   is_eof      set once the read callback has reported end of input
+ *   bufsize     capacity of buf in bytes
+ *   seek_pos    running count of bytes consumed from the stream
+ *   f           handle passed to the read callback
+ * NOTE(review): uint64_t needs <stdint.h>, which this header does not
+ * include itself -- the including file must provide it.
+ */
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ int begin, end; \
+ int is_eof:2, bufsize:30; \
+ uint64_t seek_pos; \
+ type_t f; \
+ unsigned char *buf; \
+ } kstream_t;
+
+/* True once input has ended and the buffer has been fully consumed. */
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+/* Clears the EOF flag and the buffer window; the underlying handle is
+ * not repositioned here. */
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+/*
+ * __KS_BASIC generates the stream constructor and destructor.
+ *   ks_init(f)      allocates a kstream_t reading from f with a buffer of
+ *                   __bufsize bytes; returns NULL if either allocation
+ *                   fails (nothing is leaked in that case).
+ *   ks_destroy(ks)  frees the buffer and the stream; NULL is accepted.
+ * The handle f itself is never closed here.
+ */
+#define __KS_BASIC(SCOPE, type_t, __bufsize) \
+ SCOPE kstream_t *ks_init(type_t f) \
+ { \
+  kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+  if (!ks) return NULL; \
+  ks->f = f; ks->bufsize = __bufsize; \
+  ks->buf = (unsigned char*)malloc(__bufsize); \
+  if (!ks->buf) { free(ks); return NULL; } \
+  return ks; \
+ } \
+ SCOPE void ks_destroy(kstream_t *ks) \
+ { \
+  if (!ks) return; \
+  free(ks->buf); \
+  free(ks); \
+ }
+
+/*
+ * __KS_INLINED generates the two inline readers.
+ *   ks_getc(ks)  returns the next byte as a non-negative int, or -1 when
+ *                no more input is available.  A non-positive result from
+ *                __read (end of input, or a read error reported as a
+ *                negative count) ends the stream; without that, a negative
+ *                count would leave end < begin and hand out stale buffer
+ *                contents on every call.
+ *   ks_getuntil(ks, delimiter, str, dret)
+ *                ks_getuntil2() with append == 0.
+ */
+#define __KS_INLINED(__read) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+  if (ks->is_eof && ks->begin >= ks->end) return -1; \
+  if (ks->begin >= ks->end) { \
+   ks->begin = 0; \
+   ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+   if (ks->end <= 0) { ks->is_eof = 1; return -1; } \
+  } \
+  ks->seek_pos++; \
+  return (int)ks->buf[ks->begin++]; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+/* Growable string: l = bytes in use, m = bytes allocated, s = buffer.
+ * Guarded by KSTRING_T so that it can coexist with the identical
+ * definition in kstring.h. */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+/* Rounds x up, in place, to the next power of two by smearing the top
+ * set bit over the lower 32 bits; x must be a modifiable lvalue and is
+ * evaluated several times. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * ks_getuntil2(ks, delimiter, str, dret, append): copy bytes from ks into
+ * str up to, but not including, the next delimiter.
+ *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal
+ *              character code greater than KS_SEP_MAX
+ *   dret       if non-NULL, receives the delimiter byte that ended the
+ *              field, or 0 when the input ended first
+ *   append     non-zero to append to the current contents of str
+ * Returns the resulting str->l, or -1 when nothing was read because the
+ * stream is exhausted.  With KS_SEP_LINE a trailing '\r' is dropped when
+ * the line is longer than one byte.  str is always NUL-terminated on a
+ * non-negative return.
+ * A non-positive result from __read (end of input, or a read error
+ * reported as a negative count) ends the stream; previously a negative
+ * count was not recognised and the refill loop never terminated.
+ */
+#define __KS_GETUNTIL(SCOPE, __read) \
+ SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+  int gotany = 0; \
+  if (dret) *dret = 0; \
+  str->l = append? str->l : 0; \
+  uint64_t seek_pos = str->l; \
+  for (;;) { \
+   int i; \
+   if (ks->begin >= ks->end) { /* buffer drained: refill or stop */ \
+    if (!ks->is_eof) { \
+     ks->begin = 0; \
+     ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+     if (ks->end <= 0) { ks->is_eof = 1; break; } \
+    } else break; \
+   } \
+   /* find the first delimiter in buf[begin..end); i == end if none */ \
+   if (delimiter == KS_SEP_LINE) { \
+    for (i = ks->begin; i < ks->end; ++i) \
+     if (ks->buf[i] == '\n') break; \
+   } else if (delimiter > KS_SEP_MAX) { \
+    for (i = ks->begin; i < ks->end; ++i) \
+     if (ks->buf[i] == delimiter) break; \
+   } else if (delimiter == KS_SEP_SPACE) { \
+    for (i = ks->begin; i < ks->end; ++i) \
+     if (isspace(ks->buf[i])) break; \
+   } else if (delimiter == KS_SEP_TAB) { \
+    for (i = ks->begin; i < ks->end; ++i) \
+     if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+   } else i = 0; /* never come to here! */ \
+   /* grow str so the chunk plus a terminating NUL fits */ \
+   if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+    str->m = str->l + (i - ks->begin) + 1; \
+    kroundup32(str->m); \
+    str->s = (char*)realloc(str->s, str->m); \
+   } \
+   seek_pos += i - ks->begin; if ( i < ks->end ) seek_pos++; \
+   gotany = 1; \
+   memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+   str->l = str->l + (i - ks->begin); \
+   ks->begin = i + 1; /* step over the delimiter */ \
+   if (i < ks->end) { \
+    if (dret) *dret = ks->buf[i]; \
+    break; \
+   } \
+  } \
+  if (!gotany && ks_eof(ks)) return -1; \
+  ks->seek_pos += seek_pos; \
+  if (str->s == 0) { \
+   str->m = 1; \
+   str->s = (char*)calloc(1, 1); \
+  } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+  str->s[str->l] = '\0'; \
+  return str->l; \
+ }
+
+/* Emits the kstream_t type and all stream functions for handle type
+ * type_t, read callback __read and buffer size __bufsize; SCOPE is the
+ * storage class applied to ks_init, ks_destroy and ks_getuntil2. */
+#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(SCOPE, type_t, __bufsize) \
+ __KS_GETUNTIL(SCOPE, __read) \
+ __KS_INLINED(__read)
+
+/* KSTREAM_INIT2 with static linkage. */
+#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
+
+/* Declaration-only counterpart: emits the type, extern prototypes for the
+ * non-inline functions, and the inline readers. */
+#define KSTREAM_DECLARE(type_t, __read) \
+ __KS_TYPE(type_t) \
+ extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
+ extern kstream_t *ks_init(type_t f); \
+ extern void ks_destroy(kstream_t *ks); \
+ __KS_INLINED(__read)
+
+/******************
+ * FASTA/Q parser *
+ ******************/
+
+/* Resets the parser's look-ahead character and the stream's EOF flag and
+ * buffer window; the underlying handle is not repositioned here. */
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+/*
+ * __KSEQ_BASIC generates the parser constructor and destructor.
+ *   kseq_init(fd)     allocates a zeroed kseq_t plus its stream over fd;
+ *                     returns NULL if an allocation fails (nothing is
+ *                     leaked in that case).
+ *   kseq_destroy(ks)  frees the four string buffers, the stream and the
+ *                     parser; NULL is accepted.  fd is not closed.
+ */
+#define __KSEQ_BASIC(SCOPE, type_t) \
+ SCOPE kseq_t *kseq_init(type_t fd) \
+ { \
+  kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+  if (!s) return NULL; \
+  s->f = ks_init(fd); \
+  if (!s->f) { free(s); return NULL; } \
+  return s; \
+ } \
+ SCOPE void kseq_destroy(kseq_t *ks) \
+ { \
+  if (!ks) return; \
+  free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+  ks_destroy(ks->f); \
+  free(ks); \
+ }
+
+/* kseq_read(seq): parse the next FASTA/FASTQ record from seq->f into
+ seq->name, seq->comment, seq->seq and (FASTQ only) seq->qual.
+ A record starts at a '>' or '@' header character; a '+' line after the
+ sequence switches to reading the quality string.
+ Return value:
+ >=0 length of the sequence (normal)
+ -1 end-of-file
+ -2 truncated quality string, or quality and sequence lengths differ
+ Allocation results are not checked.
+ */
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
+
+/* kseq_t: one parsed record (name, comment, seq, qual), the header
+ * character already consumed for the next record (last_char, 0 if none)
+ * and the underlying stream f. */
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+/* Emits the stream layer (always with static linkage and a 16384-byte
+ * buffer, via KSTREAM_INIT) plus the kseq type and functions; SCOPE applies
+ * to kseq_init, kseq_destroy and kseq_read only. */
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 16384) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+/* KSEQ_INIT2 with static linkage. */
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+/* Declaration-only counterpart: emits the types and prototypes of the
+ * three kseq functions without defining them. */
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/htslib/htslib/ksort.h b/htslib/htslib/ksort.h
new file mode 100644
index 0000000..aa0bb93
--- /dev/null
+++ b/htslib/htslib/ksort.h
@@ -0,0 +1,285 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+/*
+ 2012-12-11 (0.1.4):
+
+ * Defined __ks_insertsort_##name as static to compile with C99.
+
+ 2008-11-16 (0.1.4):
+
+ * Fixed a bug in introsort() that happens in rare cases.
+
+ 2008-11-05 (0.1.3):
+
+ * Fixed a bug in introsort() for complex comparisons.
+
+ * Fixed a bug in mergesort(). The previous version is not stable.
+
+ 2008-09-15 (0.1.2):
+
+ * Accelerated introsort. On my Mac (not on another Linux machine),
+ my implementation is as fast as std::sort on random input.
+
+ * Added combsort and in introsort, switch to combsort if the
+ recursion is too deep.
+
+ 2008-09-13 (0.1.1):
+
+ * Added k-small algorithm
+
+ 2008-09-05 (0.1.0):
+
+ * Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+/* One pending sub-range on the explicit stack used by ks_introsort:
+ * left/right point at its first and last element, depth is the remaining
+ * recursion budget saved for it. */
+typedef struct {
+ void *left, *right;
+ int depth;
+} ks_isort_stack_t;
+
+/* Swaps two lvalues of type type_t.  Expands to a bare { } block, so a and
+ * b are each evaluated twice. */
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+/*
+ * KSORT_INIT(name, type_t, __sort_lt) instantiates a family of sorting
+ * helpers for arrays of type_t, each suffixed with _<name>.
+ * __sort_lt(a, b) must be non-zero when a orders before b.
+ *
+ * ks_mergesort_<name>(n, array, temp): bottom-up merge sort of array[0..n).
+ * temp is scratch space for n elements; if it is NULL a buffer is
+ * malloc()ed and freed internally (the malloc result is not checked).
+ * On ties the element from the left run is taken first.
+ */
+#define KSORT_INIT(name, type_t, __sort_lt) \
+ void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
+ { \
+ type_t *a2[2], *a, *b; \
+ int curr, shift; \
+ \
+ a2[0] = array; \
+ a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
+ for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
+ a = a2[curr]; b = a2[1-curr]; \
+ /* first pass: order adjacent pairs while copying from a into b */ \
+ if (shift == 0) { \
+ type_t *p = b, *i, *eb = a + n; \
+ for (i = a; i < eb; i += 2) { \
+ if (i == eb - 1) *p++ = *i; \
+ else { \
+ if (__sort_lt(*(i+1), *i)) { \
+ *p++ = *(i+1); *p++ = *i; \
+ } else { \
+ *p++ = *i; *p++ = *(i+1); \
+ } \
+ } \
+ } \
+ } else { \
+ /* later passes: merge neighbouring sorted runs of length step */ \
+ size_t i, step = 1ul<<shift; \
+ for (i = 0; i < n; i += step<<1) { \
+ type_t *p, *j, *k, *ea, *eb; \
+ if (n < i + step) { \
+ ea = a + n; eb = a; \
+ } else { \
+ ea = a + i + step; \
+ eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+ } \
+ j = a + i; k = a + i + step; p = b + i; \
+ while (j < ea && k < eb) { \
+ if (__sort_lt(*k, *j)) *p++ = *k++; \
+ else *p++ = *j++; \
+ } \
+ while (j < ea) *p++ = *j++; \
+ while (k < eb) *p++ = *k++; \
+ } \
+ } \
+ curr = 1 - curr; \
+ } \
+ /* result ended up in the scratch buffer: copy it back into array */ \
+ if (curr == 1) { \
+ type_t *p = a2[0], *i = a2[1], *eb = array + n; \
+ for (; p < eb; ++i) *p++ = *i; \
+ } \
+ if (temp == 0) free(a2[1]); \
+ } \
+ /* Sift l[i] down within the binary heap l[0..n) until neither child */ \
+ /* orders after it; the larger child (per __sort_lt) is promoted. */ \
+ void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
+ { \
+ size_t k = i; \
+ type_t tmp = l[i]; \
+ while ((k = (k << 1) + 1) < n) { \
+ if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
+ if (__sort_lt(l[k], tmp)) break; \
+ l[i] = l[k]; i = k; \
+ } \
+ l[i] = tmp; \
+ } \
+ /* Turn l[0..lsize) into a heap by sifting down every parent node, last */ \
+ /* parent first; the loop ends when i wraps past zero. */ \
+ void ks_heapmake_##name(size_t lsize, type_t l[]) \
+ { \
+ size_t i; \
+ for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
+ ks_heapadjust_##name(i, lsize, l); \
+ } \
+ /* Sort l[0..lsize) in place; l is expected to already satisfy the heap */ \
+ /* property (this function does not call ks_heapmake itself).  Each step */ \
+ /* moves the heap root to the end of the shrinking prefix and re-sifts. */ \
+ void ks_heapsort_##name(size_t lsize, type_t l[]) \
+ { \
+  size_t i; \
+  /* nothing to do for 0 or 1 elements; this also keeps lsize - 1 from */ \
+  /* wrapping around to SIZE_MAX when lsize == 0 */ \
+  if (lsize < 2) return; \
+  for (i = lsize - 1; i > 0; --i) { \
+   type_t tmp; \
+   tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
+  } \
+ } \
+ /* Insertion sort of the half-open range [s, t). */ \
+ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
+ { \
+ type_t *i, *j, swap_tmp; \
+ for (i = s + 1; i < t; ++i) \
+ for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
+ swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
+ } \
+ } \
+ /* Comb sort of a[0..n): repeated compare-and-swap passes at a gap that */ \
+ /* shrinks by shrink_factor (gaps 9 and 10 are replaced by 11) until the */ \
+ /* gap is at most 2 and a pass makes no swap; an insertion sort finishes */ \
+ /* the job unless the final gap was 1. */ \
+ void ks_combsort_##name(size_t n, type_t a[]) \
+ { \
+ const double shrink_factor = 1.2473309501039786540366528676643; \
+ int do_swap; \
+ size_t gap = n; \
+ type_t tmp, *i, *j; \
+ do { \
+ if (gap > 2) { \
+ gap = (size_t)(gap / shrink_factor); \
+ if (gap == 9 || gap == 10) gap = 11; \
+ } \
+ do_swap = 0; \
+ for (i = a; i < a + n - gap; ++i) { \
+ j = i + gap; \
+ if (__sort_lt(*j, *i)) { \
+ tmp = *i; *i = *j; *j = tmp; \
+ do_swap = 1; \
+ } \
+ } \
+ } while (do_swap || gap > 2); \
+ if (gap != 1) __ks_insertsort_##name(a, a + n); \
+ } \
+ /* Sort a[0..n) in place.  Quicksort driven by an explicit malloc()ed */ \
+ /* stack (the malloc result is not checked): sub-ranges of 16 elements */ \
+ /* or fewer are left unsorted and fixed by one final insertion sort over */ \
+ /* the whole array; when the depth budget d reaches zero the current */ \
+ /* range is handed to ks_combsort instead of being partitioned further. */ \
+ void ks_introsort_##name(size_t n, type_t a[]) \
+ { \
+ int d; \
+ ks_isort_stack_t *top, *stack; \
+ type_t rp, swap_tmp; \
+ type_t *s, *t, *i, *j, *k; \
+ \
+ if (n < 1) return; \
+ else if (n == 2) { \
+ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+ return; \
+ } \
+ for (d = 2; 1ul<<d < n; ++d); \
+ stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+ top = stack; s = a; t = a + (n-1); d <<= 1; \
+ while (1) { \
+ if (s < t) { \
+ if (--d == 0) { \
+ ks_combsort_##name(t - s + 1, s); \
+ t = s; \
+ continue; \
+ } \
+ /* choose the pivot among first, middle and last, then park it at t */ \
+ i = s; j = t; k = i + ((j-i)>>1) + 1; \
+ if (__sort_lt(*k, *i)) { \
+ if (__sort_lt(*k, *j)) k = j; \
+ } else k = __sort_lt(*j, *i)? i : j; \
+ rp = *k; \
+ if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
+ for (;;) { \
+ do ++i; while (__sort_lt(*i, rp)); \
+ do --j; while (i <= j && __sort_lt(rp, *j)); \
+ if (j <= i) break; \
+ swap_tmp = *i; *i = *j; *j = swap_tmp; \
+ } \
+ swap_tmp = *i; *i = *t; *t = swap_tmp; \
+ /* push the larger side (if it has more than 16 elements), continue with the smaller */ \
+ if (i-s > t-i) { \
+ if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+ s = t-i > 16? i+1 : t; \
+ } else { \
+ if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+ t = i-s > 16? i-1 : s; \
+ } \
+ } else { \
+ if (top == stack) { \
+ free(stack); \
+ __ks_insertsort_##name(a, a+n); \
+ return; \
+ } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+ } \
+ } \
+ } \
+ /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+ /* 0 <= kk < n */ \
+ /* Returns the element that belongs at index kk of arr[0..n) in sorted */ \
+ /* order; arr is partially reordered in place while searching. */ \
+ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
+ { \
+ type_t *low, *high, *k, *ll, *hh, *mid; \
+ low = arr; high = arr + n - 1; k = arr + kk; \
+ for (;;) { \
+ if (high <= low) return *k; \
+ if (high == low + 1) { \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ return *k; \
+ } \
+ mid = low + (high - low) / 2; \
+ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
+ KSORT_SWAP(type_t, *mid, *(low+1)); \
+ ll = low + 1; hh = high; \
+ for (;;) { \
+ do ++ll; while (__sort_lt(*ll, *low)); \
+ do --hh; while (__sort_lt(*low, *hh)); \
+ if (hh < ll) break; \
+ KSORT_SWAP(type_t, *ll, *hh); \
+ } \
+ KSORT_SWAP(type_t, *low, *hh); \
+ /* keep only the side of the partition that contains k */ \
+ if (hh <= k) low = ll; \
+ if (hh >= k) high = hh - 1; \
+ } \
+ } \
+ /* Shuffle a[0..n) in place: for i = n down to 2, swap a[i-1] with a */ \
+ /* randomly chosen a[j], 0 <= j < i, drawn with drand48().  Indices are */ \
+ /* size_t so that n is not truncated through int. */ \
+ void ks_shuffle_##name(size_t n, type_t a[]) \
+ { \
+  size_t i, j; \
+  for (i = n; i > 1; --i) { \
+   type_t tmp; \
+   j = (size_t)(drand48() * i); \
+   tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
+  } \
+ }
+
+/* Name-dispatching shorthands for the functions generated by KSORT_INIT. */
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
+
+/* Ready-made orderings: operator< for arithmetic types, strcmp() for
+ * C strings. */
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+/* KSORT_INIT_GENERIC(type_t) uses the type name itself as the suffix;
+ * KSORT_INIT_STR instantiates the family for ksstr_t under the suffix str. */
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
diff --git a/htslib/htslib/kstring.h b/htslib/htslib/kstring.h
new file mode 100644
index 0000000..fd91bbe
--- /dev/null
+++ b/htslib/htslib/kstring.h
@@ -0,0 +1,277 @@
+/* The MIT License
+
+ Copyright (C) 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#if defined __GNUC__ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4))
+#define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg)))
+#else
+#define KS_ATTR_PRINTF(fmt, arg)
+#endif
+
+
+/* kstring_t is a simple non-opaque type whose fields are likely to be
+ * used directly by user code (but see also ks_str() and ks_len() below).
+ * A kstring_t object is initialised by either of
+ * kstring_t str = { 0, 0, NULL };
+ * kstring_t str; ...; str.l = str.m = 0; str.s = NULL;
+ * and either ownership of the underlying buffer should be given away before
+ * the object disappears (see ks_release() below) or the kstring_t should be
+ * destroyed with free(str.s); */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+/* Caller-owned state passed to kstrtok() on every call. */
+typedef struct {
+ uint64_t tab[4]; // 256 bits; presumably a bitmap of separator characters -- confirm in kstring.c
+ int sep, finished;
+ const char *p; // end of the current token
+} ks_tokaux_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0);
+ int ksprintf(kstring_t *s, const char *fmt, ...) KS_ATTR_PRINTF(2,3);
+ int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+ char *kstrstr(const char *str, const char *pat, int **_prep);
+ char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
+ void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
+
+ /* kstrtok() is similar to strtok_r() except that str is not
+ * modified and both str and sep can be NULL. For efficiency, it is
+ * actually recommended to set both to NULL in the subsequent calls
+ * if sep is not changed. */
+ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
+
+ /* kgetline() uses the supplied fgets()-like function to read a "\n"-
+ * or "\r\n"-terminated line from fp. The line read is appended to the
+ * kstring without its terminator and 0 is returned; EOF is returned at
+ * EOF or on error (determined by querying fp, as per fgets()). */
+ typedef char *kgets_func(char *, int, void *);
+ int kgetline(kstring_t *s, kgets_func *fgets, void *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ * Make sure s can hold at least size bytes.
+ * Returns 0 on success and -1 if the buffer could not be grown.  On
+ * failure s is left unchanged: s->m is committed only after realloc()
+ * succeeds, so it never claims more capacity than s->s really has.
+ */
+static inline int ks_resize(kstring_t *s, size_t size)
+{
+    if (s->m < size) {
+        char *tmp;
+        kroundup32(size);
+        tmp = (char*)realloc(s->s, size);
+        if (!tmp) return -1;
+        s->s = tmp;
+        s->m = size;
+    }
+    return 0;
+}
+
+/* Accessor for the underlying character buffer (NULL while the string has
+ * never been allocated). */
+static inline char *ks_str(kstring_t *s)
+{
+    char *buffer = s->s;
+    return buffer;
+}
+
+/* Accessor for the number of bytes currently stored in the string. */
+static inline size_t ks_len(kstring_t *s)
+{
+    size_t used = s->l;
+    return used;
+}
+
+// Detach and return the underlying buffer.  The caller becomes responsible
+// for freeing it; the kstring_t is reset to the empty state (l = m = 0,
+// s = NULL), so it can be reused or simply dropped without free(str.s).
+static inline char *ks_release(kstring_t *s)
+{
+    char *detached = s->s;
+    s->s = NULL;
+    s->m = 0;
+    s->l = 0;
+    return detached;
+}
+
+/*
+ * Append the l bytes at p to s and keep s NUL-terminated.
+ * Returns l on success, EOF if the buffer could not be grown.  On failure
+ * s is left unchanged: the new capacity is committed only after realloc()
+ * succeeds, so s->m never overstates the size of s->s.
+ */
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{
+    if (s->l + l + 1 >= s->m) {
+        size_t cap = s->l + l + 2;
+        char *tmp;
+        kroundup32(cap);
+        tmp = (char*)realloc(s->s, cap);
+        if (!tmp) return EOF;
+        s->s = tmp;
+        s->m = cap;
+    }
+    memcpy(s->s + s->l, p, l);
+    s->l += l;
+    s->s[s->l] = 0;
+    return l;
+}
+
+/* Append the NUL-terminated string p to s; the length is taken with
+ * strlen() and the work is done by kputsn(), whose result is returned. */
+static inline int kputs(const char *p, kstring_t *s)
+{
+    int len = strlen(p);
+    return kputsn(p, len, s);
+}
+
+/*
+ * Append the character c to s and keep s NUL-terminated.
+ * Returns c on success, EOF if the buffer could not be grown.  On failure
+ * s is left unchanged: the new capacity is committed only after realloc()
+ * succeeds.
+ */
+static inline int kputc(int c, kstring_t *s)
+{
+    if (s->l + 1 >= s->m) {
+        size_t cap = s->l + 2;
+        char *tmp;
+        kroundup32(cap);
+        tmp = (char*)realloc(s->s, cap);
+        if (!tmp) return EOF;
+        s->s = tmp;
+        s->m = cap;
+    }
+    s->s[s->l++] = c;
+    s->s[s->l] = 0;
+    return c;
+}
+
+/*
+ * Append the character c to s WITHOUT writing a terminating NUL.
+ * Returns 1 on success, EOF if the buffer could not be grown.  On failure
+ * s is left unchanged: the new capacity is committed only after realloc()
+ * succeeds.
+ */
+static inline int kputc_(int c, kstring_t *s)
+{
+    if (s->l + 1 > s->m) {
+        size_t cap = s->l + 1;
+        char *tmp;
+        kroundup32(cap);
+        tmp = (char*)realloc(s->s, cap);
+        if (!tmp) return EOF;
+        s->s = tmp;
+        s->m = cap;
+    }
+    s->s[s->l++] = c;
+    return 1;
+}
+
+/*
+ * Append the l bytes at p to s WITHOUT writing a terminating NUL.
+ * Returns l on success, EOF if the buffer could not be grown.  On failure
+ * s is left unchanged: the new capacity is committed only after realloc()
+ * succeeds.
+ */
+static inline int kputsn_(const void *p, int l, kstring_t *s)
+{
+    if (s->l + l > s->m) {
+        size_t cap = s->l + l;
+        char *tmp;
+        kroundup32(cap);
+        tmp = (char*)realloc(s->s, cap);
+        if (!tmp) return EOF;
+        s->s = tmp;
+        s->m = cap;
+    }
+    memcpy(s->s + s->l, p, l);
+    s->l += l;
+    return l;
+}
+
+/*
+ * Append the decimal representation of the int c to s and keep s
+ * NUL-terminated.  Returns 0 on success, EOF if the buffer could not be
+ * grown.  On failure s is left unchanged: the new capacity is committed
+ * only after realloc() succeeds.
+ */
+static inline int kputw(int c, kstring_t *s)
+{
+    char buf[16];
+    int i, l = 0;
+    unsigned int x = c;
+    if (c < 0) x = -x;   /* magnitude in unsigned arithmetic, safe for INT_MIN */
+    /* digits are produced least-significant first and reversed on output */
+    do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
+    if (c < 0) buf[l++] = '-';
+    if (s->l + l + 1 >= s->m) {
+        size_t cap = s->l + l + 2;
+        char *tmp;
+        kroundup32(cap);
+        tmp = (char*)realloc(s->s, cap);
+        if (!tmp) return EOF;
+        s->s = tmp;
+        s->m = cap;
+    }
+    for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+    s->s[s->l] = 0;
+    return 0;
+}
+
+/*
+ * Append the decimal representation of the unsigned c to s and keep s
+ * NUL-terminated.  Returns 0 on success, EOF if the buffer could not be
+ * grown.  On failure s is left unchanged: the new capacity is committed
+ * only after realloc() succeeds.
+ */
+static inline int kputuw(unsigned c, kstring_t *s)
+{
+    char buf[16];
+    int l, i;
+    unsigned x;
+    /* kputc() returns the character written; map that to this function's
+     * own 0-on-success convention instead of leaking '0' (48) to callers. */
+    if (c == 0) return kputc('0', s) == EOF ? EOF : 0;
+    /* digits are produced least-significant first and reversed on output */
+    for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+    if (s->l + l + 1 >= s->m) {
+        size_t cap = s->l + l + 2;
+        char *tmp;
+        kroundup32(cap);
+        tmp = (char*)realloc(s->s, cap);
+        if (!tmp) return EOF;
+        s->s = tmp;
+        s->m = cap;
+    }
+    for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+    s->s[s->l] = 0;
+    return 0;
+}
+
+/*
+ * Append the decimal representation of the long c to s and keep s
+ * NUL-terminated.  Returns 0 on success, EOF if the buffer could not be
+ * grown.  On failure s is left unchanged: the new capacity is committed
+ * only after realloc() succeeds.
+ */
+static inline int kputl(long c, kstring_t *s)
+{
+    char buf[32];
+    int i, l = 0;
+    unsigned long x = c;
+    if (c < 0) x = -x;   /* magnitude in unsigned arithmetic, safe for LONG_MIN */
+    /* digits are produced least-significant first and reversed on output */
+    do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
+    if (c < 0) buf[l++] = '-';
+    if (s->l + l + 1 >= s->m) {
+        size_t cap = s->l + l + 2;
+        char *tmp;
+        kroundup32(cap);
+        tmp = (char*)realloc(s->s, cap);
+        if (!tmp) return EOF;
+        s->s = tmp;
+        s->m = cap;
+    }
+    for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+    s->s[s->l] = 0;
+    return 0;
+}
+
+/*
+ * Split s->s on delimiter by delegating to ksplit_core().
+ * *n receives the number of components; the return value is the offsets
+ * array produced by ksplit_core(), or NULL on failure.
+ */
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+    int *fields = 0;
+    int capacity = 0;
+    *n = ksplit_core(s->s, delimiter, &capacity, &fields);
+    return fields;
+}
+
+#endif
diff --git a/htslib/htslib/regidx.h b/htslib/htslib/regidx.h
new file mode 100644
index 0000000..79e82b7
--- /dev/null
+++ b/htslib/htslib/regidx.h
@@ -0,0 +1,154 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/*
+ Regions indexing with an optional payload. Inspired by samtools/bedidx.c.
+ This code is intended as future replacement of bcf_sr_regions_t.
+
+ Example of usage:
+
+ // Init the parser and print regions. In this example the payload is a
+ // pointer to a string. For the description of parse_custom and
+ // free_custom functions, see regidx_parse_f and regidx_free_f below,
+ // and for working example see test/test-regidx.c.
+ regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
+
+ // Query overlap with chr:from-to
+ regitr_t itr;
+ if ( regidx_overlap(idx, chr,from,to, &itr) ) printf("There is an overlap!\n");
+
+ while ( REGITR_OVERLAP(itr,from,to) )
+ {
+ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to,
+ REGITR_START(itr), REGITR_END(itr), REGITR_PAYLOAD(itr,char*));
+ itr.i++;
+ }
+
+ regidx_destroy(idx);
+*/
+
+#ifndef HTSLIB_REGIDX_H
+#define HTSLIB_REGIDX_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _regidx_t regidx_t; // opaque index handle; struct _regidx_t is not defined in this header
+typedef struct
+{
+ uint32_t start, end; // region coordinates; 0-based, inclusive (see the regidx_parse_f notes below)
+}
+reg_t;
+typedef struct
+{
+ int i, n; // i: current position, used as index into reg[] and the payload array; n: bound on i checked by REGITR_OVERLAP
+ reg_t *reg; // region array, read as reg[i] by REGITR_START/REGITR_END
+ void *payload; // payload array; REGITR_PAYLOAD casts it to type_t* and indexes it with i
+}
+regitr_t;
+
+#define REGITR_START(itr) ((itr).reg[(itr).i].start) /* start of the iterator's current region */
+#define REGITR_END(itr) ((itr).reg[(itr).i].end) /* end of the iterator's current region */
+#define REGITR_PAYLOAD(itr,type_t) (((type_t*)(itr).payload)[(itr).i]) /* current payload, viewed as type_t */
+#define REGITR_OVERLAP(itr,from,to) ((itr).i < (itr).n && REGITR_START(itr)<=(to) && REGITR_END(itr)>=(from)) /* true while the current region exists and overlaps [from,to]; arguments parenthesized so expressions expand safely */
+
+/*
+ * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
+ * or regidx_parse_tab below. The function is expected to set `chr_beg` and
+ * `chr_end` to point to first and last character of chromosome name and set
+ * coordinates `reg->start` and `reg->end` (0-based, inclusive). If
+ * regidx_init() was called with non-zero payload_size, the `payload` points
+ * to a memory location of the payload_size and `usr` is data passed to
+ * regidx_init(). Any memory allocated by the function will be freed by
+ * regidx_free_f on regidx_destroy().
+ *
+ * Return value: 0 on success, -1 to skip a record, -2 on fatal error.
+ */
+typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr);
+typedef void (*regidx_free_f)(void *payload);
+
+int regidx_parse_bed(const char*,char**,char**,reg_t*,void*,void*); // CHROM,FROM,TO (0-based,right-open)
+int regidx_parse_tab(const char*,char**,char**,reg_t*,void*,void*); // CHROM,POS (1-based, inclusive)
+
+/*
+ * regidx_init() - creates new index
+ * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert()
+ * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
+ * the format will be autodetected, currently either regidx_parse_tab (the default) or
+ * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
+ * the exact autodetection algorithm will change.
+ * @param freef: NULL or see description of regidx_parse_f
+ * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
+ * @param usr: optional user data passed to regidx_parse_f
+ *
+ * Returns index on success or NULL on error.
+ */
+regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
+
+/*
+ * regidx_destroy() - free memory allocated by regidx_init
+ */
+void regidx_destroy(regidx_t *idx);
+
+/*
+ * regidx_overlap() - check overlap of the location chr:from-to with regions
+ * @param start,end: 0-based start, end coordinate (inclusive)
+ * @param itr: pointer to iterator, can be NULL if not needed
+ *
+ * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
+ * regions can be iterated as shown in the example above.
+ */
+int regidx_overlap(regidx_t *idx, const char *chr, uint32_t start, uint32_t end, regitr_t *itr);
+
+/*
+ * regidx_insert() - add a new region.
+ *
+ * After last region has been added, call regidx_insert(idx,NULL) to
+ * build the index.
+ *
+ * Returns 0 on success or -1 on error.
+ */
+int regidx_insert(regidx_t *idx, char *line);
+
+/*
+ * regidx_seq_names() - return list of all sequence names
+ */
+char **regidx_seq_names(regidx_t *idx, int *n);
+
+/*
+ * regidx_seq_nregs() - number of regions on the sequence `seq`
+ * regidx_nregs() - total number of regions
+ */
+int regidx_seq_nregs(regidx_t *idx, const char *seq);
+int regidx_nregs(regidx_t *idx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/sam.h b/htslib/htslib/sam.h
new file mode 100644
index 0000000..7de8cd3
--- /dev/null
+++ b/htslib/htslib/sam.h
@@ -0,0 +1,454 @@
+/* sam.h -- SAM and BAM file I/O and manipulation.
+
+ Copyright (C) 2008, 2009, 2013-2014 Genome Research Ltd.
+ Copyright (C) 2010, 2012, 2013 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_SAM_H
+#define HTSLIB_SAM_H
+
+#include <stdint.h>
+#include "hts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**********************
+ *** SAM/BAM header ***
+ **********************/
+
+/*! @typedef
+ @abstract Structure for the alignment header.
+ @field n_targets number of reference sequences
+ @field l_text length of the plain text in the header
+ @field target_len lengths of the reference sequences
+ @field target_name names of the reference sequences
+ @field text plain text
+ @field sdict header dictionary
+ */
+
+typedef struct {
+ int32_t n_targets, ignore_sam_err; // n_targets: number of reference sequences; ignore_sam_err: NOTE(review) not described above, presumably relaxes SAM parse errors - confirm in sam.c
+ uint32_t l_text; // length of the plain text in `text`
+ uint32_t *target_len; // lengths of the reference sequences (presumably n_targets entries - confirm)
+ int8_t *cigar_tab; // NOTE(review): not described above; presumably a CIGAR character lookup table - confirm in sam.c
+ char **target_name; // names of the reference sequences (presumably n_targets entries - confirm)
+ char *text; // plain header text
+ void *sdict; // header dictionary; opaque in this header
+} bam_hdr_t;
+
+/****************************
+ *** CIGAR related macros ***
+ ****************************/
+
+#define BAM_CMATCH 0 // 'M' in BAM_CIGAR_STR
+#define BAM_CINS 1 // 'I'
+#define BAM_CDEL 2 // 'D'
+#define BAM_CREF_SKIP 3 // 'N'
+#define BAM_CSOFT_CLIP 4 // 'S'
+#define BAM_CHARD_CLIP 5 // 'H'
+#define BAM_CPAD 6 // 'P'
+#define BAM_CEQUAL 7 // '='
+#define BAM_CDIFF 8 // 'X'
+#define BAM_CBACK 9 // 'B'
+
+#define BAM_CIGAR_STR "MIDNSHP=XB" // op code -> character, indexed by the BAM_C* values above
+#define BAM_CIGAR_SHIFT 4 // the op length sits above the low 4 bits of a packed element
+#define BAM_CIGAR_MASK 0xf // the low 4 bits of a packed element hold the op code
+#define BAM_CIGAR_TYPE 0x3C1A7 // two bits per op code, op 0 in the lowest bits; decoded by bam_cigar_type()
+
+#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) // op code of packed element c
+#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) // op length of packed element c
+#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) // op character of packed element c
+#define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o)) // pack length l and op code o into one element
+
+/* bam_cigar_type returns a bit flag with:
+ * bit 1 set if the cigar operation consumes the query
+ * bit 2 set if the cigar operation consumes the reference
+ *
+ * For reference, the unobfuscated truth table for this function is:
+ * BAM_CIGAR_TYPE QUERY REFERENCE
+ * --------------------------------
+ * BAM_CMATCH 1 1
+ * BAM_CINS 1 0
+ * BAM_CDEL 0 1
+ * BAM_CREF_SKIP 0 1
+ * BAM_CSOFT_CLIP 1 0
+ * BAM_CHARD_CLIP 0 0
+ * BAM_CPAD 0 0
+ * BAM_CEQUAL 1 1
+ * BAM_CDIFF 1 1
+ * BAM_CBACK 0 0
+ * --------------------------------
+ */
+#define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
+
+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
+#define BAM_FPAIRED 1
+/*! @abstract the read is mapped in a proper pair */
+#define BAM_FPROPER_PAIR 2
+/*! @abstract the read itself is unmapped; conflicts with BAM_FPROPER_PAIR */
+#define BAM_FUNMAP 4
+/*! @abstract the mate is unmapped */
+#define BAM_FMUNMAP 8
+/*! @abstract the read is mapped to the reverse strand */
+#define BAM_FREVERSE 16
+/*! @abstract the mate is mapped to the reverse strand */
+#define BAM_FMREVERSE 32
+/*! @abstract this is read1 */
+#define BAM_FREAD1 64
+/*! @abstract this is read2 */
+#define BAM_FREAD2 128
+/*! @abstract not primary alignment */
+#define BAM_FSECONDARY 256
+/*! @abstract QC failure */
+#define BAM_FQCFAIL 512
+/*! @abstract optical or PCR duplicate */
+#define BAM_FDUP 1024
+/*! @abstract supplementary alignment */
+#define BAM_FSUPPLEMENTARY 2048
+
+/*************************
+ *** Alignment records ***
+ *************************/
+
+/*! @typedef
+ @abstract Structure for core alignment information.
+ @field tid chromosome ID, defined by bam_hdr_t
+ @field pos 0-based leftmost coordinate
+ @field bin bin calculated by bam_reg2bin()
+ @field qual mapping quality
+ @field l_qname length of the query name
+ @field flag bitwise flag
+ @field n_cigar number of CIGAR operations
+ @field l_qseq length of the query sequence (read)
+ @field mtid chromosome ID of next read in template, defined by bam_hdr_t
+ @field mpos 0-based leftmost coordinate of next read in template
+ */
+typedef struct {
+ int32_t tid; // chromosome ID, defined by bam_hdr_t
+ int32_t pos; // 0-based leftmost coordinate
+ uint32_t bin:16, qual:8, l_qname:8; // bit-fields: index bin, mapping quality, query name length
+ uint32_t flag:16, n_cigar:16; // bit-fields: bitwise flag (BAM_F* values), number of CIGAR operations
+ int32_t l_qseq; // length of the query sequence (read)
+ int32_t mtid; // chromosome ID of next read in template
+ int32_t mpos; // 0-based leftmost coordinate of next read in template
+ int32_t isize; // NOTE(review): missing from the @field list above; presumably the insert size / template length (SAM TLEN) - confirm
+} bam1_core_t;
+
+/*! @typedef
+ @abstract Structure for one alignment.
+ @field core core information about the alignment
+ @field l_data current length of bam1_t::data
+ @field m_data maximum length of bam1_t::data
+ @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+
+ @discussion Notes:
+
+ 1. qname is NUL-terminated and core.l_qname includes the trailing '\0'.
+ 2. l_qseq is calculated from the total length of an alignment block
+ on reading or from CIGAR.
+ 3. cigar data is encoded 4 bytes per CIGAR operation.
+ 4. seq is nybble-encoded according to bam_nt16_table.
+ */
+typedef struct {
+ bam1_core_t core; // fixed-size core information about the alignment
+ int l_data, m_data; // current and maximum length of `data`
+ uint8_t *data; // all variable-length data, concatenated: qname-cigar-seq-qual-aux
+#ifndef BAM_NO_ID
+ uint64_t id; // NOTE(review): not described in the comment above; only present when BAM_NO_ID is not defined
+#endif
+} bam1_t;
+
+/*! @function
+ @abstract Get whether the query is on the reverse strand
+ @param b pointer to an alignment
+ @return boolean true if query is on the reverse strand
+ */
+#define bam_is_rev(b) (((b)->core.flag&BAM_FREVERSE) != 0)
+/*! @function
+ @abstract Get whether the query's mate is on the reverse strand
+ @param b pointer to an alignment
+ @return boolean true if query's mate on the reverse strand
+ */
+#define bam_is_mrev(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
+/*! @function
+ @abstract Get the name of the query
+ @param b pointer to an alignment
+ @return pointer to the name string, null terminated
+ */
+#define bam_get_qname(b) ((char*)(b)->data)
+/*! @function
+ @abstract Get the CIGAR array
+ @param b pointer to an alignment
+ @return pointer to the CIGAR array
+
+ @discussion In the CIGAR array, each element is a 32-bit integer. The
+ lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
+ length of a CIGAR.
+ */
+#define bam_get_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
+/*! @function
+ @abstract Get query sequence
+ @param b pointer to an alignment
+ @return pointer to sequence
+
+ @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+ 8 for T and 15 for N. Two bases are packed in one byte with the base
+ at the higher 4 bits having smaller coordinate on the read. It is
+ recommended to use bam_seqi() macro to get the base.
+ */
+#define bam_get_seq(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname)
+/*! @function
+ @abstract Get query quality
+ @param b pointer to an alignment
+ @return pointer to quality string
+ */
+#define bam_get_qual(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
+/*! @function
+ @abstract Get auxiliary data
+ @param b pointer to an alignment
+ @return pointer to the concatenated auxiliary data
+ */
+#define bam_get_aux(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1) + (b)->core.l_qseq)
+/*! @function
+ @abstract Get length of auxiliary data
+ @param b pointer to an alignment
+ @return length of the concatenated auxiliary data
+ */
+#define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1))
+/*! @function
+ @abstract Get a base on read
+ @param s Query sequence returned by bam_get_seq()
+ @param i The i-th position, 0-based
+ @return 4-bit integer representing the base.
+ */
+#define bam_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
+
+/**************************
+ *** Exported functions ***
+ **************************/
+
+ /***************
+ *** BAM I/O ***
+ ***************/
+
+ bam_hdr_t *bam_hdr_init(void);
+ bam_hdr_t *bam_hdr_read(BGZF *fp);
+ int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED;
+ void bam_hdr_destroy(bam_hdr_t *h);
+ int bam_name2id(bam_hdr_t *h, const char *ref);
+ bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0);
+
+ bam1_t *bam_init1(void);
+ void bam_destroy1(bam1_t *b);
+ int bam_read1(BGZF *fp, bam1_t *b) HTS_RESULT_USED;
+ int bam_write1(BGZF *fp, const bam1_t *b) HTS_RESULT_USED;
+ bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc);
+ bam1_t *bam_dup1(const bam1_t *bsrc);
+
+ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar);
+ int bam_cigar2rlen(int n_cigar, const uint32_t *cigar);
+
+ /*!
+ @abstract Calculate the rightmost base position of an alignment on the
+ reference genome.
+
+ @param b pointer to an alignment
+ @return the coordinate of the first base after the alignment, 0-based
+
+ @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
+ For an unmapped read (either according to its flags or if it has no cigar
+ string), we return b->core.pos + 1 by convention.
+ */
+ int32_t bam_endpos(const bam1_t *b);
+
+ int bam_str2flag(const char *str); /** returns negative value on error */
+ char *bam_flag2str(int flag); /** The string must be freed by the user */
+
+ /*************************
+ *** BAM/CRAM indexing ***
+ *************************/
+
+ // These BAM iterator functions work only on BAM files. To work with either
+ // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
+ #define bam_itr_destroy(iter) hts_itr_destroy(iter)
+ #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end)
+ #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region)
+ #define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0)
+
+// Load/build .csi or .bai BAM index file. Does not work with CRAM.
+// It is recommended to use the sam_index_* functions below instead.
+#define bam_index_load(fn) hts_idx_load((fn), HTS_FMT_BAI)
+#define bam_index_build(fn, min_shift) (sam_index_build((fn), (min_shift)))
+
+/// Load a BAM (.csi or .bai) or CRAM (.crai) index file
+/** @param fp File handle of the data file whose index is being opened
+ @param fn BAM/CRAM/etc filename to search alongside for the index file
+ @return The index, or NULL if an error occurred.
+*/
+hts_idx_t *sam_index_load(htsFile *fp, const char *fn);
+
+/// Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
+/** @param fp File handle of the data file whose index is being opened
+ @param fn BAM/CRAM/etc data file filename
+ @param fnidx Index filename, or NULL to search alongside @a fn
+ @return The index, or NULL if an error occurred.
+*/
+hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx);
+
+/// Generate and save an index file
+/** @param fn Input BAM/etc filename, to which .csi/etc will be added
+ @param min_shift Positive to generate CSI, or 0 to generate BAI
+ @return 0 if successful, or negative if an error occurred (usually -1; or
+ -2: opening fn failed; -3: format not indexable)
+*/
+int sam_index_build(const char *fn, int min_shift) HTS_RESULT_USED;
+
+/// Generate and save an index to a specific file
+/** @param fn Input BAM/CRAM/etc filename
+ @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ @param min_shift Positive to generate CSI, or 0 to generate BAI
+ @return 0 if successful, or negative if an error occurred.
+*/
+int sam_index_build2(const char *fn, const char *fnidx, int min_shift) HTS_RESULT_USED;
+
+ #define sam_itr_destroy(iter) hts_itr_destroy(iter)
+ hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end);
+ hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region);
+ #define sam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), (htsfp))
+
+ /***************
+ *** SAM I/O ***
+ ***************/
+
+ #define sam_open(fn, mode) (hts_open((fn), (mode)))
+ #define sam_open_format(fn, mode, fmt) (hts_open_format((fn), (mode), (fmt)))
+ #define sam_close(fp) hts_close(fp)
+
+ int sam_open_mode(char *mode, const char *fn, const char *format);
+
+ // A version of sam_open_mode that can handle ,key=value options.
+ // The format string is allocated and returned, to be freed by the caller.
+ // Prefix should be "r" or "w",
+ char *sam_open_mode_opts(const char *fn,
+ const char *mode,
+ const char *format);
+
+ typedef htsFile samFile;
+ bam_hdr_t *sam_hdr_parse(int l_text, const char *text);
+ bam_hdr_t *sam_hdr_read(samFile *fp);
+ int sam_hdr_write(samFile *fp, const bam_hdr_t *h) HTS_RESULT_USED;
+
+ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) HTS_RESULT_USED;
+ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) HTS_RESULT_USED;
+ int sam_read1(samFile *fp, bam_hdr_t *h, bam1_t *b) HTS_RESULT_USED;
+ int sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED;
+
+ /*************************************
+ *** Manipulating auxiliary fields ***
+ *************************************/
+
+ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
+ int32_t bam_aux2i(const uint8_t *s);
+ double bam_aux2f(const uint8_t *s);
+ char bam_aux2A(const uint8_t *s);
+ char *bam_aux2Z(const uint8_t *s);
+
+ void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
+ int bam_aux_del(bam1_t *b, uint8_t *s);
+
+/**************************
+ *** Pileup and Mpileup ***
+ **************************/
+
+#if !defined(BAM_NO_PILEUP)
+
+/*! @typedef
+ @abstract Structure for one alignment covering the pileup position.
+ @field b pointer to the alignment
+ @field qpos position of the read base at the pileup site, 0-based
+ @field indel indel length; 0 for no indel, positive for ins and negative for del
+ @field level the level of the read in the "viewer" mode
+ @field is_del 1 iff the base on the padded read is a deletion
+ @field is_head ???
+ @field is_tail ???
+ @field is_refskip ???
+ @field aux ???
+
+ @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+ difference between the two functions is that the former does not
+ set bam_pileup1_t::level, while the latter does. Level helps the
+ implementation of alignment viewers, but calculating this has some
+ overhead.
+ */
+typedef struct {
+ bam1_t *b; // pointer to the alignment
+ int32_t qpos; // position of the read base at the pileup site, 0-based
+ int indel, level; // indel: 0 for none, positive for ins, negative for del; level: row of the read in "viewer" mode
+ uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; // four 1-bit flags plus a 28-bit aux value; only is_del is described above, the rest are still marked ??? there
+} bam_pileup1_t;
+
+typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
+
+struct __bam_plp_t;
+typedef struct __bam_plp_t *bam_plp_t;
+
+struct __bam_mplp_t;
+typedef struct __bam_mplp_t *bam_mplp_t;
+
+ /**
+ * bam_plp_init() - sets an iterator over multiple
+ * @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
+ * status: 0 on success, -1 on end, < -1 on non-recoverable errors
+ * @data: user data to pass to @func
+ */
+ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
+ void bam_plp_destroy(bam_plp_t iter);
+ int bam_plp_push(bam_plp_t iter, const bam1_t *b);
+ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
+ void bam_plp_reset(bam_plp_t iter);
+
+ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
+ /**
+ * bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
+ * read pairs and for each base pair set the base quality of the
+ * lower-quality base to zero, thus effectively discarding it from
+ * calling. If the two bases are identical, the quality of the other base
+ * is increased to the sum of their qualities (capped at 200), otherwise
+ * it is multiplied by 0.8.
+ */
+ void bam_mplp_init_overlaps(bam_mplp_t iter);
+ void bam_mplp_destroy(bam_mplp_t iter);
+ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
+ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
+
+#endif // ~!defined(BAM_NO_PILEUP)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/synced_bcf_reader.h b/htslib/htslib/synced_bcf_reader.h
new file mode 100644
index 0000000..b746bc9
--- /dev/null
+++ b/htslib/htslib/synced_bcf_reader.h
@@ -0,0 +1,302 @@
+/* synced_bcf_reader.h -- stream through multiple VCF files.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+/*
+ The synced_bcf_reader lets callers keep multiple VCFs open and stream them
+ using the next_line iterator in a seamless manner without worrying about
+ chromosomes and synchronizing the sites. This is used by vcfcheck to
+ compare multiple VCFs simultaneously and is used also for merging,
+ creating intersections, etc.
+
+ The synced_bcf_reader also provides API for reading indexed BCF/VCF,
+ hiding differences in BCF/VCF opening, indexing and reading.
+
+
+ Example of usage:
+
+ bcf_srs_t *sr = bcf_sr_init();
+ for (i=0; i<nfiles; i++)
+ bcf_sr_add_reader(sr,files[i]);
+ while ( bcf_sr_next_line(sr) )
+ {
+ for (i=0; i<nfiles; i++)
+ {
+ bcf1_t *line = bcf_sr_get_line(sr,i);
+ ...
+ }
+ }
+ bcf_sr_destroy(sr);
+*/
+
+#ifndef HTSLIB_SYNCED_BCF_READER_H
+#define HTSLIB_SYNCED_BCF_READER_H
+
+#include "hts.h"
+#include "vcf.h"
+#include "tbx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// How sites with the same position but different alleles should be treated
+#define COLLAPSE_NONE 0 // require the exact same set of alleles in all files
+#define COLLAPSE_SNPS 1 // allow different alleles, as long as they all are SNPs
+#define COLLAPSE_INDELS 2 // the same as above, but with indels
+#define COLLAPSE_ANY 4 // any combination of alleles can be returned by bcf_sr_next_line()
+#define COLLAPSE_SOME 8 // at least some of the ALTs must match
+#define COLLAPSE_BOTH (COLLAPSE_SNPS|COLLAPSE_INDELS)
+
+typedef struct _bcf_sr_regions_t
+{
+ // for reading from tabix-indexed file (big data)
+ tbx_t *tbx; // tabix index
+ hts_itr_t *itr; // tabix iterator
+ kstring_t line; // holder of the current line, set only when reading from tabix-indexed files
+ htsFile *file;
+ char *fname;
+ int is_bin; // is open in binary mode (tabix access)
+ char **als; // parsed alleles if targets_als set and _regions_match_alleles called
+ kstring_t als_str; // block of parsed alleles
+ int nals, mals; // number of set alleles and the size of allocated array
+ int als_type; // alleles type, currently VCF_SNP or VCF_INDEL
+
+ // user handler to deal with skipped regions without a counterpart in VCFs
+ void (*missed_reg_handler)(struct _bcf_sr_regions_t *, void *);
+ void *missed_reg_data;
+
+ // for in-memory regions (small data)
+ struct _region_t *regs; // the regions
+
+ // shared by both tabix-index and in-memory regions
+ void *seq_hash; // keys: sequence names, values: index to seqs
+ char **seq_names; // sequence names
+ int nseqs; // number of sequences (chromosomes) in the file
+ int iseq; // current position: chr name, index to snames
+ int start, end; // current position: start, end of the region (0-based)
+ int prev_seq, prev_start;
+}
+bcf_sr_regions_t;
+
+typedef struct
+{
+ htsFile *file;
+ tbx_t *tbx_idx;
+ hts_idx_t *bcf_idx;
+ bcf_hdr_t *header;
+ hts_itr_t *itr;
+ char *fname;
+ bcf1_t **buffer; // cached VCF records. First is the current record synced across the reader
+ int nbuffer, mbuffer; // number of cached records (including the current record); number of allocated records
+ int nfilter_ids, *filter_ids; // -1 for ".", otherwise filter id as returned by bcf_hdr_id2int
+ int *samples, n_smpl; // list of columns in the order consistent with bcf_srs_t.samples
+}
+bcf_sr_t;
+
+typedef enum
+{
+ open_failed, not_bgzf, idx_load_failed, file_type_error, api_usage_error,
+ header_error, no_eof
+}
+bcf_sr_error;
+
+typedef struct
+{
+ // Parameters controlling the logic
+ int collapse; // How should the duplicate sites be treated. One of the COLLAPSE_* types above.
+ char *apply_filters; // If set, sites where none of the FILTER strings is listed
+ // will be skipped. Active only at the time of
+ // initialization, that is during the add_reader()
+ // calls. Therefore, each reader can be initialized with different
+ // filters.
+ int require_index; // Some tools do not need random access
+ int max_unpack; // When reading VCFs and knowing some fields will not be needed, boost performance of vcf_parse1
+ int *has_line; // Corresponds to return value of bcf_sr_next_line but is not limited by sizeof(int). Use bcf_sr_has_line macro to query.
+ bcf_sr_error errnum; // a bcf_sr_error code; presumably describes the most recent failure - confirm
+
+ // Auxiliary data
+ bcf_sr_t *readers; // array of readers, accessed as readers[i] by the bcf_sr_get_* macros
+ int nreaders; // presumably the number of entries in readers - confirm
+ int streaming; // reading mode: index-jumping or streaming
+ int explicit_regs; // was the list of regions set by bcf_sr_set_regions or guessed from tabix index?
+ char **samples; // List of samples
+ bcf_sr_regions_t *regions, *targets; // see bcf_sr_set_[targets|regions] for description
+ int targets_als; // subset to targets not only by position but also by alleles?
+ int targets_exclude; // NOTE(review): presumably set for "^"-prefixed targets (logical complement, see bcf_sr_set_targets) - confirm
+ kstring_t tmps; // NOTE(review): purpose not documented in this header; presumably a scratch string - confirm
+ int n_smpl; // presumably the number of entries in samples - confirm
+}
+bcf_srs_t;
+
+/** Init bcf_srs_t struct */
+bcf_srs_t *bcf_sr_init(void);
+
+/** Destroy bcf_srs_t struct */
+void bcf_sr_destroy(bcf_srs_t *readers);
+
+char *bcf_sr_strerror(int errnum);
+
+
+/**
+ * bcf_sr_add_reader() - open new reader
+ * @readers: holder of the open readers
+ * @fname: the VCF file
+ *
+ * Returns 1 if the call succeeded, or 0 on error.
+ *
+ * See also the bcf_srs_t data structure for parameters controlling
+ * the reader's logic.
+ */
+int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname);
+void bcf_sr_remove_reader(bcf_srs_t *files, int i);
+
+/**
+ * bcf_sr_next_line() - the iterator
+ * @readers: holder of the open readers
+ *
+ * Returns the number of readers which have the current line
+ * (bcf_sr_t.buffer[0]) set at this position. Use the bcf_sr_has_line macro to
+ * determine which of the readers are set.
+ */
+int bcf_sr_next_line(bcf_srs_t *readers);
+#define bcf_sr_has_line(readers, i) ((readers)->has_line[i]) // non-zero if reader i has the current line
+#define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL) // reader i's current record, or NULL if it has none
+#define bcf_sr_swap_line(_readers, i, lieu) { bcf1_t *tmp = (lieu); (lieu) = (_readers)->readers[i].buffer[0]; (_readers)->readers[i].buffer[0] = tmp; } // exchange lieu with reader i's current record
+#define bcf_sr_region_done(_readers,i) (!(_readers)->has_line[i] && !(_readers)->readers[i].nbuffer ? 1 : 0) // 1 if reader i has no current line and nothing buffered
+#define bcf_sr_get_header(_readers, i) ((_readers)->readers[i].header) // header of reader i
+#define bcf_sr_get_reader(_readers, i) (&((_readers)->readers[i])) // pointer to reader i; fully parenthesized so that ->field can follow the macro
+
+/**
+ * bcf_sr_seek() - set all readers to selected position
+ * @seq: sequence name; NULL to seek to start
+ * @pos: 0-based coordinate
+ */
+int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos);
+
+/**
+ * bcf_sr_set_samples() - sets active samples
+ * @readers: holder of the open readers
+ * @samples: this can be one of: file name with one sample per line;
+ * or column-separated list of samples; or '-' for a list of
+ * samples shared by all files. If first character is the
+ * exclamation mark, all but the listed samples are included.
+ * @is_file: 0: list of samples; 1: file with sample names
+ *
+ * Returns 1 if the call succeeded, or 0 on error.
+ */
+int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file);
+
+/**
+ * bcf_sr_set_targets(), bcf_sr_set_regions() - init targets/regions
+ * @readers: holder of the open readers
+ * @targets: list of regions, one-based and inclusive.
+ * @is_file: 0: targets is a comma-separated list of regions (chr,chr:from-to)
+ * 1: targets is a tabix indexed file with a list of regions
+ * (<chr,pos> or <chr,from,to>)
+ *
+ * Returns 0 if the call succeeded, or -1 on error.
+ *
+ * Both functions behave the same way, unlisted positions will be skipped by
+ * bcf_sr_next_line(). However, there is an important difference: regions use
+ * index to jump to desired positions while targets streams the whole files
+ * and merely skip unlisted positions.
+ *
+ * Moreover, bcf_sr_set_targets() accepts an optional parameter $alleles which
+ * is interpreted as a 1-based column index in the tab-delimited file where
+ * alleles are listed. This in principle enables to perform the COLLAPSE_*
+ * logic also with tab-delimited files. However, the current implementation
+ * considers the alleles merely as a suggestion for prioritizing one of possibly
+ * duplicate VCF lines. It is up to the caller to examine targets->als if
+ * perfect match is sought after. Note that the duplicate positions in targets
+ * file are currently not supported.
+ * Targets (but not regions) can be prefixed with "^" to request logical complement,
+ * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped.
+ */
+int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles);
+int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file);
+
+
+
+/*
+ * bcf_sr_regions_init()
+ * @regions: regions can be either a comma-separated list of regions
+ * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or
+ * tab-delimited file (the default). Uncompressed files
+ * are stored in memory while bgzip-compressed and tabix-indexed
+ * region files are streamed.
+ * @is_file: 0: regions is a comma-separated list of regions
+ * (chr|chr:pos|chr:from-to|chr:from-)
+ * 1: VCF, BED or tab-delimited file
+ * @chr, from, to:
+ * Column indexes of chromosome, start position and end position
+ * in the tab-delimited file. The positions are 1-based and
+ * inclusive.
+ * These parameters are ignored when reading from VCF, BED or
+ * tabix-indexed files. When end position column is not present,
+ * supply 'from' in place of 'to'. When 'to' is negative, first
+ * abs(to) will be attempted and if that fails, 'from' will be used
+ * instead.
+ */
+bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to);
+void bcf_sr_regions_destroy(bcf_sr_regions_t *regions);
+
+/*
+ * bcf_sr_regions_seek() - seek to the chromosome block
+ *
+ * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and
+ * reg->start,reg->end to -1.
+ */
+int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr);
+
+/*
+ * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1
+ * when all regions have been read. The fields reg->seq, reg->start and
+ * reg->end are filled with the genomic coordinates on success or with
+ * NULL,-1,-1 when no region is available. The coordinates are 0-based,
+ * inclusive.
+ */
+int bcf_sr_regions_next(bcf_sr_regions_t *reg);
+
+/*
+ * bcf_sr_regions_overlap() - checks if the interval <start,end> overlaps any of
+ * the regions, the coordinates are 0-based, inclusive. The coordinate queries
+ * must come in ascending order.
+ *
+ * Returns 0 if the position is in regions; -1 if the position is not in the
+ * regions and more regions exist; -2 if not in the regions and there are no more
+ * regions left.
+ */
+int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end);
+
+/*
+ * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until
+ * all remaining records are processed.
+ */
+void bcf_sr_regions_flush(bcf_sr_regions_t *regs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/tbx.h b/htslib/htslib/tbx.h
new file mode 100644
index 0000000..65243b9
--- /dev/null
+++ b/htslib/htslib/tbx.h
@@ -0,0 +1,79 @@
+/* tbx.h -- tabix API functions.
+
+ Copyright (C) 2009, 2012-2015 Genome Research Ltd.
+ Copyright (C) 2010, 2012 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_TBX_H
+#define HTSLIB_TBX_H
+
+#include "hts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TBX_MAX_SHIFT 31
+
+#define TBX_GENERIC 0
+#define TBX_SAM 1
+#define TBX_VCF 2
+#define TBX_UCSC 0x10000
+
+typedef struct {
+ int32_t preset;
+ int32_t sc, bc, ec; // seq col., beg col. and end col.
+ int32_t meta_char, line_skip;
+} tbx_conf_t;
+
+typedef struct {
+ tbx_conf_t conf;
+ hts_idx_t *idx;
+ void *dict;
+} tbx_t;
+
+extern tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf;
+
+ #define tbx_itr_destroy(iter) hts_itr_destroy(iter)
+ #define tbx_itr_queryi(tbx, tid, beg, end) hts_itr_query((tbx)->idx, (tid), (beg), (end), tbx_readrec)
+ #define tbx_itr_querys(tbx, s) hts_itr_querys((tbx)->idx, (s), (hts_name2id_f)(tbx_name2id), (tbx), hts_itr_query, tbx_readrec)
+ #define tbx_itr_next(htsfp, tbx, itr, r) hts_itr_next(hts_get_bgzfp(htsfp), (itr), (r), (tbx))
+ #define tbx_bgzf_itr_next(bgzfp, tbx, itr, r) hts_itr_next((bgzfp), (itr), (r), (tbx))
+
+ int tbx_name2id(tbx_t *tbx, const char *ss);
+
+ /* Internal helper function used by tbx_itr_next() */
+ BGZF *hts_get_bgzfp(htsFile *fp);
+ int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end);
+
+ int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf);
+ int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf);
+ tbx_t *tbx_index_load(const char *fn);
+ tbx_t *tbx_index_load2(const char *fn, const char *fnidx);
+ const char **tbx_seqnames(tbx_t *tbx, int *n); // free the array but not the values
+ void tbx_destroy(tbx_t *tbx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/vcf.h b/htslib/htslib/vcf.h
new file mode 100644
index 0000000..1838c27
--- /dev/null
+++ b/htslib/htslib/vcf.h
@@ -0,0 +1,914 @@
+/* vcf.h -- VCF/BCF API functions.
+
+ Copyright (C) 2012, 2013 Broad Institute.
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+/*
+ todo:
+ - make the function names consistent
+ - provide calls to abstract away structs as much as possible
+ */
+
+#ifndef HTSLIB_VCF_H
+#define HTSLIB_VCF_H
+
+#include <stdint.h>
+#include <limits.h>
+#include <assert.h>
+#include "hts.h"
+#include "kstring.h"
+#include "hts_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*****************
+ * Header struct *
+ *****************/
+
+#define BCF_HL_FLT 0 // header line
+#define BCF_HL_INFO 1
+#define BCF_HL_FMT 2
+#define BCF_HL_CTG 3
+#define BCF_HL_STR 4 // structured header line TAG=<A=..,B=..>
+#define BCF_HL_GEN 5 // generic header line
+
+#define BCF_HT_FLAG 0 // header type
+#define BCF_HT_INT 1
+#define BCF_HT_REAL 2
+#define BCF_HT_STR 3
+
+#define BCF_VL_FIXED 0 // variable length
+#define BCF_VL_VAR 1
+#define BCF_VL_A 2
+#define BCF_VL_G 3
+#define BCF_VL_R 4
+
+/* === Dictionary ===
+
+ The header keeps three dictionaries. The first keeps IDs in the
+ "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
+ in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
+ is the actual hash table, which is opaque to the end users. In the hash
+ table, the key is the ID or sample name as a C string and the value is a
+ bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
+ table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
+ size of the hash table or, equivalently, the length of the id[] arrays.
+*/
+
+#define BCF_DT_ID 0 // dictionary type
+#define BCF_DT_CTG 1
+#define BCF_DT_SAMPLE 2
+
+// Complete textual representation of a header line
+typedef struct {
+ int type; // One of the BCF_HL_* types
+ char *key; // The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
+ char *value; // Set only for generic lines, NULL for FILTER/INFO, etc.
+ int nkeys; // Number of structured fields
+ char **keys, **vals; // The key=value pairs
+} bcf_hrec_t;
+
+typedef struct {
+ uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
+ // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
+ bcf_hrec_t *hrec[3];
+ int id;
+} bcf_idinfo_t;
+
+typedef struct {
+ const char *key;
+ const bcf_idinfo_t *val;
+} bcf_idpair_t;
+
+// Note that bcf_hdr_t structs must always be created via bcf_hdr_init()
+typedef struct {
+ int32_t n[3]; // n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
+ bcf_idpair_t *id[3];
+ void *dict[3]; // ID dictionary, contig dict and sample dict
+ char **samples;
+ bcf_hrec_t **hrec;
+ int nhrec, dirty;
+ int ntransl, *transl[2]; // for bcf_translate()
+ int nsamples_ori; // for bcf_hdr_set_samples()
+ uint8_t *keep_samples;
+ kstring_t mem;
+ int32_t m[3]; // m: allocated size of the dictionary block in use (see n above)
+} bcf_hdr_t;
+
+extern uint8_t bcf_type_shift[];
+
+/**************
+ * VCF record *
+ **************/
+
+#define BCF_BT_NULL 0
+#define BCF_BT_INT8 1
+#define BCF_BT_INT16 2
+#define BCF_BT_INT32 3
+#define BCF_BT_FLOAT 5
+#define BCF_BT_CHAR 7
+
+#define VCF_REF 0
+#define VCF_SNP 1
+#define VCF_MNP 2
+#define VCF_INDEL 4
+#define VCF_OTHER 8
+
+typedef struct {
+ int type, n; // variant type and the number of bases affected, negative for deletions
+} variant_t;
+
+typedef struct {
+ int id; // id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
+ int n, size, type; // n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
+ uint8_t *p; // same as vptr and vptr_* in bcf_info_t below
+ uint32_t p_len;
+ uint32_t p_off:31, p_free:1;
+} bcf_fmt_t;
+
+typedef struct {
+ int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
+ int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars
+ union {
+ int32_t i; // integer value
+ float f; // float value
+ } v1; // only set if $len==1; for easier access
+ uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
+ uint32_t vptr_len; // length of the vptr block or, when set, of the vptr_mod block, excluding offset
+ uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes
+ vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new
+ // data block is bigger than the original
+} bcf_info_t;
+
+
+#define BCF1_DIRTY_ID 1
+#define BCF1_DIRTY_ALS 2
+#define BCF1_DIRTY_FLT 4
+#define BCF1_DIRTY_INF 8
+
+typedef struct {
+ int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change
+ int n_flt; // Number of FILTER fields
+ int *flt; // FILTER keys in the dictionary
+ char *id, *als; // ID and REF+ALT block (\0-separated)
+ char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated
+ bcf_info_t *info; // INFO
+ bcf_fmt_t *fmt; // FORMAT and individual sample
+ variant_t *var; // $var and $var_type set only when set_variant_types called
+ int n_var, var_type;
+ int shared_dirty; // if set, shared.s must be recreated on BCF output
+ int indiv_dirty; // if set, indiv.s must be recreated on BCF output
+} bcf_dec_t;
+
+
+#define BCF_ERR_CTG_UNDEF 1
+#define BCF_ERR_TAG_UNDEF 2
+#define BCF_ERR_NCOLS 4
+#define BCF_ERR_LIMITS 8
+#define BCF_ERR_CHAR 16
+#define BCF_ERR_CTG_INVALID 32
+#define BCF_ERR_TAG_INVALID 64
+
+/*
+ The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
+ is slower because the string is first to be parsed, packed into BCF line
+ (done in vcf_parse), then unpacked into internal bcf1_t structure. If it
+ is known in advance that some of the fields will not be required (notably
+ the sample columns), parsing of these can be skipped by setting max_unpack
+ appropriately.
+ Similarly, it is fast to output a BCF line because the columns (kept in
+ shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
+ line must be formatted in vcf_format.
+ */
+typedef struct {
+ int32_t rid; // CHROM
+ int32_t pos; // POS
+ int32_t rlen; // length of REF
+ float qual; // QUAL
+ uint32_t n_info:16, n_allele:16;
+ uint32_t n_fmt:8, n_sample:24;
+ kstring_t shared, indiv;
+ bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
+ int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
+ int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
+ int unpack_size[3]; // the original block size of ID, REF+ALT and FILTER
+ int errcode; // one of BCF_ERR_* codes
+} bcf1_t;
+
+/*******
+ * API *
+ *******/
+
+ /***********************************************************************
+ * BCF and VCF I/O
+ *
+ * A note about naming conventions: htslib internally represents VCF
+ * records as bcf1_t data structures, therefore most functions are
+ * prefixed with bcf_. There are a few exceptions where the functions must
+ * be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
+ * these cases, functions prefixed with bcf_ are more general and work
+ * with both BCF and VCF.
+ *
+ ***********************************************************************/
+
+ /** These macros are defined only for consistency with other parts of htslib */
+ #define bcf_init1() bcf_init()
+ #define bcf_read1(fp,h,v) bcf_read((fp),(h),(v))
+ #define vcf_read1(fp,h,v) vcf_read((fp),(h),(v))
+ #define bcf_write1(fp,h,v) bcf_write((fp),(h),(v))
+ #define vcf_write1(fp,h,v) vcf_write((fp),(h),(v))
+ #define bcf_destroy1(v) bcf_destroy(v)
+ #define bcf_empty1(v) bcf_empty(v)
+ #define vcf_parse1(s,h,v) vcf_parse((s),(h),(v))
+ #define bcf_clear1(v) bcf_clear(v)
+ #define vcf_format1(h,v,s) vcf_format((h),(v),(s))
+
+ /**
+ * bcf_hdr_init() - create an empty BCF header.
+ * @param mode "r" or "w"
+ *
+ * When opened for writing, the mandatory fileformat and
+ * FILTER=PASS lines are added automatically.
+ */
+ bcf_hdr_t *bcf_hdr_init(const char *mode);
+
+ /** Destroy a BCF header struct */
+ void bcf_hdr_destroy(bcf_hdr_t *h);
+
+ /** Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) */
+ bcf1_t *bcf_init(void);
+
+ /** Deallocate a bcf1_t object */
+ void bcf_destroy(bcf1_t *v);
+
+ /**
+ * Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
+ * not the bcf1_t object itself.
+ */
+ void bcf_empty(bcf1_t *v);
+
+ /**
+ * Make the bcf1_t object ready for next read. Intended mostly for
+ * internal use, the user should rarely need to call this function
+ * directly.
+ */
+ void bcf_clear(bcf1_t *v);
+
+
+ /** bcf_open and vcf_open mode: please see hts_open() in hts.h */
+ typedef htsFile vcfFile;
+ #define bcf_open(fn, mode) hts_open((fn), (mode))
+ #define vcf_open(fn, mode) hts_open((fn), (mode))
+ #define bcf_close(fp) hts_close(fp)
+ #define vcf_close(fp) hts_close(fp)
+
+ /** Reads VCF or BCF header */
+ bcf_hdr_t *bcf_hdr_read(htsFile *fp);
+
+ /**
+ * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
+ * @samples: samples to include or exclude from file or as a comma-separated string.
+ * LIST|FILE .. select samples in list/file
+ * ^LIST|FILE .. exclude samples from list/file
+ * - .. include all samples
+ * NULL .. exclude all samples
+ * @is_file: @samples is a file (1) or a comma-separated list (0)
+ *
+ * The bottleneck of VCF reading is parsing of genotype fields. If the
+ * reader knows in advance that only subset of samples is needed (possibly
+ * no samples at all), the performance of bcf_read() can be significantly
+ * improved by calling bcf_hdr_set_samples after bcf_hdr_read().
+ * The function bcf_read() will subset the VCF/BCF records automatically
+ * with the notable exception when reading records via bcf_itr_next().
+ * In this case, bcf_subset_format() must be called explicitly, because
+ * bcf_readrec() does not see the header.
+ *
+ * Returns 0 on success, -1 on error or a positive integer if the list
+ * contains samples not present in the VCF header. In such a case, the
+ * return value is the index of the offending sample.
+ */
+ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file);
+ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec);
+
+
+ /** Writes VCF or BCF header */
+ int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h);
+
+ /**
+ * Parse VCF line contained in kstring and populate the bcf1_t struct
+ * The line must not end with \n or \r characters.
+ */
+ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v);
+
+ /** The opposite of vcf_parse. It should rarely be called directly, see vcf_write */
+ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s);
+
+ /**
+ * bcf_read() - read next VCF or BCF record
+ *
+ * Returns -1 on critical errors, 0 otherwise. On errors which are not
+ * critical for reading, such as missing header definitions, v->errcode is
+ * set to one of BCF_ERR* code and must be checked before calling
+ * vcf_write().
+ */
+ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
+
+ /**
+ * bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
+ *
+ * Note that bcf_unpack() must be called even when reading VCF. It is safe
+ * to call the function repeatedly, it will not unpack the same field
+ * twice.
+ */
+ #define BCF_UN_STR 1 // up to ALT inclusive
+ #define BCF_UN_FLT 2 // up to FILTER
+ #define BCF_UN_INFO 4 // up to INFO
+ #define BCF_UN_SHR (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) // all shared information
+ #define BCF_UN_FMT 8 // unpack format and each sample
+ #define BCF_UN_IND BCF_UN_FMT // a synonym of BCF_UN_FMT
+ #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything
+ int bcf_unpack(bcf1_t *b, int which);
+
+ /*
+ * bcf_dup() - create a copy of BCF record.
+ *
+ * Note that bcf_unpack() must be called on the returned copy as if it was
+ * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
+ * internally to reflect any changes made by bcf_update_* functions.
+ */
+ bcf1_t *bcf_dup(bcf1_t *src);
+ bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src);
+
+ /**
+ * bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
+ */
+ int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v);
+
+ /**
+ * The following functions work only with VCFs and should rarely be called
+ * directly. Usually one wants to use their bcf_* alternatives, which work
+ * transparently with both VCFs and BCFs.
+ */
+ bcf_hdr_t *vcf_hdr_read(htsFile *fp);
+ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h);
+ int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
+ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
+
+ /** Helper function for the bcf_itr_next() macro; internal use, ignore it */
+ int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end);
+
+
+
+ /**************************************************************************
+ * Header querying and manipulation routines
+ **************************************************************************/
+
+ /** Create a new header using the supplied template */
+ bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr);
+
+ /**
+ * Copy header lines from src to dst if not already present in dst. See also bcf_translate().
+ * Returns 0 on success or sets a bit on error:
+ * 1 .. conflicting definitions of tag length
+ * // todo
+ */
+ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) HTS_DEPRECATED("Please use bcf_hdr_merge instead");
+
+ /**
+ * bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
+ * @param dst: the destination header to be merged into, NULL on the first pass
+ * @param src: the source header
+ *
+ * Notes:
+ * - use as:
+ * bcf_hdr_t *dst = NULL;
+ * for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
+ *
+ * - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
+ * combining multiple BCF headers. The current bcf_hdr_combine()
+ * does not have this problem, but became slow when used for many files.
+ */
+ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src);
+
+ /**
+ * bcf_hdr_add_sample() - add a new sample.
+ * @param sample: sample name to be added
+ */
+ int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample);
+
+ /** Read VCF header from a file and update the header */
+ int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname);
+
+ /** Returns formatted header (newly allocated string) and its length,
+ * excluding the terminating \0. If is_bcf parameter is unset, IDX
+ * fields are discarded.
+ */
+ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len);
+
+ /** Append new VCF header line, returns 0 on success */
+ int bcf_hdr_append(bcf_hdr_t *h, const char *line);
+ int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
+
+ /** VCF version, e.g. VCFv4.2 */
+ const char *bcf_hdr_get_version(const bcf_hdr_t *hdr);
+ void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version);
+
+ /**
+ * bcf_hdr_remove() - remove VCF header tag
+ * @param type: one of BCF_HL_*
+ * @param key: tag name or NULL to remove all tags of the given type
+ */
+ void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key);
+
+ /**
+ * bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
+ * @param n: number of samples to keep
+ * @param samples: names of the samples to keep
+ * @param imap: mapping from index in @samples to the sample index in the original file
+ *
+ * Sample names not present in h0 are ignored. The number of unmatched samples can be checked
+ * by comparing n and bcf_hdr_nsamples(out_hdr).
+ * This function can be used to reorder samples.
+ * See also bcf_subset() which subsets individual records.
+ */
+ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap);
+
+ /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */
+ const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs);
+
+ /** Get number of samples */
+ #define bcf_hdr_nsamples(hdr) (hdr)->n[BCF_DT_SAMPLE]
+
+
+ /** The following functions are for internal use and should rarely be called directly */
+ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt);
+ int bcf_hdr_sync(bcf_hdr_t *h);
+ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len);
+ void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str);
+ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec);
+ /**
+ * bcf_hdr_get_hrec() - get header line info
+ * @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
+ * @param key: the header key for generic lines (e.g. "fileformat"), any field
+ * for structured lines, typically "ID".
+ * @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
+ * @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
+ */
+ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class);
+ bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec);
+ void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len);
+ void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted);
+ int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key);
+ void hrec_add_idx(bcf_hrec_t *hrec, int idx);
+ void bcf_hrec_destroy(bcf_hrec_t *hrec);
+
+
+
+ /**************************************************************************
+ * Individual record querying and manipulation routines
+ **************************************************************************/
+
+ /** See the description of bcf_hdr_subset() */
+ int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap);
+
+ /**
+ * bcf_translate() - translate tags ids to be consistent with different header. This function
+ * is useful when lines from multiple VCF need to be combined.
+ * @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
+ * @src_hdr: the source header, used in bcf_read()
+ * @src_line: line obtained by bcf_read()
+ */
+ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line);
+
+ /**
+ * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
+ */
+ int bcf_get_variant_types(bcf1_t *rec);
+ int bcf_get_variant_type(bcf1_t *rec, int ith_allele);
+ int bcf_is_snp(bcf1_t *v);
+
+ /**
+ * bcf_update_filter() - sets the FILTER column
+ * @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ * @n: Number of filters. If n==0, all filters are removed
+ */
+ int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n);
+ /**
+ * bcf_add_filter() - adds to the FILTER column
+ * @flt_id: filter ID to add, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ *
+ * If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
+ */
+ int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id);
+ /**
+ * bcf_remove_filter() - removes from the FILTER column
+ * @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ * @pass: when set to 1 and no filters are present, set to PASS
+ */
+ int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass);
+ /**
+ * Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
+ */
+ int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter);
+ /**
+ * bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT columns
+ * @alleles: Array of alleles
+ * @nals: Number of alleles
+ * @alleles_string: Comma-separated alleles, starting with the REF allele
+ */
+ int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals);
+ int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string);
+
+ /**
+ * bcf_update_id() - sets new ID string
+ * bcf_add_id() - adds to the ID string checking for duplicates
+ */
+ int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id);
+ int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id);
+
+ /*
+ * bcf_update_info_*() - functions for updating INFO fields
+ * @hdr: the BCF header
+ * @line: VCF line to be edited
+ * @key: the INFO tag to be updated
+ * @values: pointer to the array of values. Pass NULL to remove the tag.
+ * @n: number of values in the array. When set to 0, the INFO tag is removed
+ *
+ * The @string in bcf_update_info_flag() is optional, @n indicates whether
+ * the flag is set or removed.
+ *
+ * Returns 0 on success or negative value on error.
+ */
+ #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT)
+ #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL)
+ #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG)
+ #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR)
+ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
+
+ /*
+ * bcf_update_format_*() - functions for updating FORMAT fields
+ * @values: pointer to the array of values, the same number of elements
+ * is expected for each sample. Missing values must be padded
+ * with bcf_*_missing or bcf_*_vector_end values.
+ * @n: number of values in the array. If n==0, existing tag is removed.
+ *
+ * The function bcf_update_format_string() is a higher-level (slower) variant of
+ * bcf_update_format_char(). The former accepts array of \0-terminated strings
+ * whereas the latter requires that the strings are collapsed into a single array
+ * of fixed-length strings. In case of strings with variable length, shorter strings
+ * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
+ * are not \0-terminated.
+ *
+ * Returns 0 on success or negative value on error.
+ */
+ #define bcf_update_format_int32(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_INT)
+ #define bcf_update_format_float(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_REAL)
+ #define bcf_update_format_char(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_STR)
+ #define bcf_update_genotypes(hdr,line,gts,n) bcf_update_format((hdr),(line),"GT",(gts),(n),BCF_HT_INT) // See bcf_gt_ macros below
+ int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n);
+ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
+
+ // Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
+ // to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
+ // from bcf_get_genotypes() below.
+ #define bcf_gt_phased(idx) (((idx)+1)<<1|1)
+ #define bcf_gt_unphased(idx) (((idx)+1)<<1)
+ #define bcf_gt_missing 0
+ #define bcf_gt_is_missing(val) ((val)>>1 ? 0 : 1)
+ #define bcf_gt_is_phased(idx) ((idx)&1)
+ #define bcf_gt_allele(val) (((val)>>1)-1)
+
+ /** Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) */
+ #define bcf_alleles2gt(a,b) ((a)>(b)?((a)*((a)+1)/2+(b)):((b)*((b)+1)/2+(a)))
+ static inline void bcf_gt2alleles(int igt, int *a, int *b)
+ {
+ int k = 0, dk = 1; // k is the index of the homozygous genotype (dk-1)/(dk-1), i.e. bcf_alleles2gt(dk-1,dk-1)
+ while ( k<igt ) { dk++; k += dk; } // step through homozygous indexes 0,2,5,9,... until one is >= igt
+ *b = dk - 1; *a = igt - k + *b; // b: the larger allele index; a: offset of igt from k-b, the index of genotype 0/b
+ }
+
+ /**
+ * bcf_get_fmt() - returns pointer to FORMAT's field data
+ * @hdr: for access to BCF_DT_ID dictionary
+ * @line: VCF line obtained from vcf_parse1
+ * @key: one of GT,PL,...
+ *
+ * Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
+ * is not available.
+ */
+ bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
+ bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
+
+ /**
+ * bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
+ * @line: VCF line obtained from vcf_parse1
+ * @id: The header index for the tag, obtained from bcf_hdr_id2int()
+ *
+ * Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
+ * as their goal is to avoid the header lookup.
+ */
+ bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id);
+ bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id);
+
+ /**
+ * bcf_get_info_*() - get INFO values, integers or floats
+ * @hdr: BCF header
+ * @line: BCF record
+ * @tag: INFO tag to retrieve
+ * @dst: *dst is pointer to a memory location, can point to NULL
+ * @ndst: pointer to the size of allocated memory
+ *
+ * Returns negative value on error or the number of written values on
+ * success. bcf_get_info_string() returns on success the number of
+ * characters written excluding the null-terminating byte. bcf_get_info_flag()
+ * returns 1 when flag is set or 0 if not.
+ *
+ * List of return codes:
+ * -1 .. no such INFO tag defined in the header
+ * -2 .. clash between types defined in the header and encountered in the VCF record
+ * -3 .. tag is not present in the VCF record
+ */
+ #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
+ #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
+ #define bcf_get_info_string(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
+ #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG)
+ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
+
+ /**
+ * bcf_get_format_*() - same as bcf_get_info*() above
+ *
+ * The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
+ * see the description of bcf_update_format_string() and bcf_update_format_char() above.
+ * Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
+ * a single block of \0-terminated strings collapsed into a single array and an array of pointers
+ * to these strings. Both arrays must be cleaned by the user.
+ *
+ * Returns negative value on error or the number of written values on success.
+ *
+ * Example:
+ * int ndst = 0; char **dst = NULL;
+ * if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
+ * for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]);
+ * free(dst[0]); free(dst);
+ *
+ * Example:
+ * int ngt, *gt_arr = NULL, ngt_arr = 0;
+ * ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr);
+ */
+ #define bcf_get_format_int32(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
+ #define bcf_get_format_float(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
+ #define bcf_get_format_char(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
+ #define bcf_get_genotypes(hdr,line,dst,ndst) bcf_get_format_values(hdr,line,"GT",(void**)(dst),ndst,BCF_HT_INT)
+ int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst);
+ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
+
+
+
+ /**************************************************************************
+ * Helper functions
+ **************************************************************************/
+
+ /**
+ * bcf_hdr_id2int() - Translates string into numeric ID
+ * bcf_hdr_int2id() - Translates numeric ID into string
+ * @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
+ * @id: tag name, such as: PL, DP, GT, etc.
+ *
+ * Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
+ * fields in BCF records.
+ */
+ int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id);
+ #define bcf_hdr_int2id(hdr,type,int_id) ((hdr)->id[type][int_id].key)
+
+ /**
+ * bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
+ * bcf_hdr_id2name() - Translates numeric ID to sequence name
+ */
+ static inline int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) { return bcf_hdr_id2int(hdr, BCF_DT_CTG, id); }
+ static inline const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid) { return hdr->id[BCF_DT_CTG][rid].key; }
+ static inline const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec) { return hdr->id[BCF_DT_CTG][rec->rid].key; }
+
+ /**
+ * bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
+ * @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
+ * @int_id: return value of bcf_hdr_id2int, must be >=0
+ *
+ * The returned values are:
+ * bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
+ * bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
+ * bcf_hdr_id2type .. the field type, one of BCF_HT_*
+ * bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
+ *
+ * Notes: Prior to using the macros, the presence of the info should be
+ * tested with bcf_hdr_idinfo_exists().
+ */
+ #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf)
+ #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12)
+ #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf)
+ #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf)
+ #define bcf_hdr_idinfo_exists(hdr,type,int_id) (((int_id)<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 0 : 1)
+ #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)])
+
+ void bcf_fmt_array(kstring_t *s, int n, int type, void *data);
+ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr);
+
+ void bcf_enc_vchar(kstring_t *s, int l, const char *a);
+ void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize);
+ void bcf_enc_vfloat(kstring_t *s, int n, float *a);
+
+
+ /**************************************************************************
+ * BCF index
+ *
+ * Note that these functions work with BCFs only. See synced_bcf_reader.h
+ * which provides (amongst other things) an API to work transparently with
+ * both indexed BCFs and VCFs.
+ **************************************************************************/
+
+ #define bcf_itr_destroy(iter) hts_itr_destroy(iter)
+ #define bcf_itr_queryi(idx, tid, beg, end) hts_itr_query((idx), (tid), (beg), (end), bcf_readrec)
+ #define bcf_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bcf_hdr_name2id), (hdr), hts_itr_query, bcf_readrec)
+ #define bcf_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0)
+ #define bcf_index_load(fn) hts_idx_load(fn, HTS_FMT_CSI)
+ #define bcf_index_seqnames(idx, hdr, nptr) hts_idx_seqnames((idx),(nptr),(hts_id2name_f)(bcf_hdr_id2name),(hdr))
+
+ hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx);
+ int bcf_index_build(const char *fn, int min_shift);
+ int bcf_index_build2(const char *fn, const char *fnidx, int min_shift);
+
+/*******************
+ * Typed value I/O *
+ *******************/
+
+/*
+ Note that in contrast with BCFv2.1 specification, HTSlib implementation
+ allows missing values in vectors. For integer types, the values 0x80,
+ 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
+ 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
+ 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
+ end-of-vector indicator.
+ Note that the end-of-vector byte is not part of the vector.
+
+ This trial BCF version (v2.2) is compatible with the VCF specification and
+ makes it possible to correctly handle vectors with different ploidy in the
+ presence of missing values.
+ */
+#define bcf_int8_vector_end (INT8_MIN+1)
+#define bcf_int16_vector_end (INT16_MIN+1)
+#define bcf_int32_vector_end (INT32_MIN+1)
+#define bcf_str_vector_end 0
+#define bcf_int8_missing INT8_MIN
+#define bcf_int16_missing INT16_MIN
+#define bcf_int32_missing INT32_MIN
+#define bcf_str_missing 0x07
+extern uint32_t bcf_float_vector_end;
+extern uint32_t bcf_float_missing;
+static inline void bcf_float_set(float *ptr, uint32_t value)
+{
+ union { uint32_t i; float f; } u;
+ u.i = value;
+ *ptr = u.f;
+}
+#define bcf_float_set_vector_end(x) bcf_float_set(&(x),bcf_float_vector_end)
+#define bcf_float_set_missing(x) bcf_float_set(&(x),bcf_float_missing)
+static inline int bcf_float_is_missing(float f)
+{
+ union { uint32_t i; float f; } u;
+ u.f = f;
+ return u.i==bcf_float_missing ? 1 : 0;
+}
+static inline int bcf_float_is_vector_end(float f)
+{
+ union { uint32_t i; float f; } u;
+ u.f = f;
+ return u.i==bcf_float_vector_end ? 1 : 0;
+}
+
+static inline void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
+{
+ #define BRANCH(type_t, missing, vector_end) { \
+ type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \
+ int i; \
+ for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \
+ { \
+ if ( i ) kputc("/|"[ptr[i]&1], str); \
+ if ( !(ptr[i]>>1) ) kputc('.', str); \
+ else kputw((ptr[i]>>1) - 1, str); \
+ } \
+ if (i == 0) kputc('.', str); \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ case BCF_BT_NULL: kputc('.', str); break;
+ default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
+ }
+ #undef BRANCH
+}
+
+static inline void bcf_enc_size(kstring_t *s, int size, int type)
+{
+ if (size >= 15) {
+ kputc(15<<4|type, s);
+ if (size >= 128) {
+ if (size >= 32768) {
+ int32_t x = size;
+ kputc(1<<4|BCF_BT_INT32, s);
+ kputsn((char*)&x, 4, s);
+ } else {
+ int16_t x = size;
+ kputc(1<<4|BCF_BT_INT16, s);
+ kputsn((char*)&x, 2, s);
+ }
+ } else {
+ kputc(1<<4|BCF_BT_INT8, s);
+ kputc(size, s);
+ }
+ } else kputc(size<<4|type, s);
+}
+
+static inline int bcf_enc_inttype(long x)
+{
+ if (x <= INT8_MAX && x > bcf_int8_missing) return BCF_BT_INT8;
+ if (x <= INT16_MAX && x > bcf_int16_missing) return BCF_BT_INT16;
+ return BCF_BT_INT32;
+}
+
+static inline void bcf_enc_int1(kstring_t *s, int32_t x)
+{
+ if (x == bcf_int32_vector_end) {
+ bcf_enc_size(s, 1, BCF_BT_INT8);
+ kputc(bcf_int8_vector_end, s);
+ } else if (x == bcf_int32_missing) {
+ bcf_enc_size(s, 1, BCF_BT_INT8);
+ kputc(bcf_int8_missing, s);
+ } else if (x <= INT8_MAX && x > bcf_int8_missing) {
+ bcf_enc_size(s, 1, BCF_BT_INT8);
+ kputc(x, s);
+ } else if (x <= INT16_MAX && x > bcf_int16_missing) {
+ int16_t z = x;
+ bcf_enc_size(s, 1, BCF_BT_INT16);
+ kputsn((char*)&z, 2, s);
+ } else {
+ int32_t z = x;
+ bcf_enc_size(s, 1, BCF_BT_INT32);
+ kputsn((char*)&z, 4, s);
+ }
+}
+
+static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
+{
+ if (type == BCF_BT_INT8) {
+ *q = (uint8_t*)p + 1;
+ return *(int8_t*)p;
+ } else if (type == BCF_BT_INT16) {
+ *q = (uint8_t*)p + 2;
+ return *(int16_t*)p;
+ } else {
+ *q = (uint8_t*)p + 4;
+ return *(int32_t*)p;
+ }
+}
+
+static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
+{
+ return bcf_dec_int1(p + 1, *p&0xf, q);
+}
+
+static inline int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
+{
+ *type = *p & 0xf;
+ if (*p>>4 != 15) {
+ *q = (uint8_t*)p + 1;
+ return *p>>4;
+ } else return bcf_dec_typed_int1(p + 1, q);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/vcf_sweep.h b/htslib/htslib/vcf_sweep.h
new file mode 100644
index 0000000..82c9b03
--- /dev/null
+++ b/htslib/htslib/vcf_sweep.h
@@ -0,0 +1,47 @@
+/* vcf_sweep.h -- forward/reverse sweep API.
+
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_VCF_SWEEP_H
+#define HTSLIB_VCF_SWEEP_H
+
+#include "hts.h"
+#include "vcf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _bcf_sweep_t bcf_sweep_t;
+
+bcf_sweep_t *bcf_sweep_init(const char *fname);
+void bcf_sweep_destroy(bcf_sweep_t *sw);
+bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw);
+bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw);
+bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib/vcfutils.h b/htslib/htslib/vcfutils.h
new file mode 100644
index 0000000..82181b1
--- /dev/null
+++ b/htslib/htslib/vcfutils.h
@@ -0,0 +1,134 @@
+/* vcfutils.h -- allele-related utility functions.
+
+ Copyright (C) 2012, 2013, 2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_VCFUTILS_H
+#define HTSLIB_VCFUTILS_H
+
+#include "vcf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct kbitset_t;
+
+/**
+ * bcf_trim_alleles() - remove ALT alleles unused in genotype fields
+ * @header: for access to BCF_DT_ID dictionary
+ * @line: VCF line obtained from vcf_parse1
+ *
+ * Returns the number of removed alleles on success or negative
+ * on error:
+ * -1 .. some allele index is out of bounds
+ */
+int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line);
+
+/**
+ * bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
+ * @header: for access to BCF_DT_ID dictionary
+ * @line: VCF line obtained from vcf_parse1
+ * @mask: alleles to remove
+ *
+ * If you have more than 31 alleles, then the integer bit mask will
+ * overflow, so use bcf_remove_allele_set instead
+ */
+void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask);
+
+/**
+ * bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
+ * @header: for access to BCF_DT_ID dictionary
+ * @line: VCF line obtained from vcf_parse1
+ * @rm_set: pointer to kbitset_t object with bits set for allele
+ * indexes to remove
+ *
+ * Number=A,R,G INFO and FORMAT fields will be updated accordingly.
+ */
+void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set);
+
+/**
+ * bcf_calc_ac() - calculate the number of REF and ALT alleles
+ * @header: for access to BCF_DT_ID dictionary
+ * @line: VCF line obtained from vcf_parse1
+ * @ac: array of length line->n_allele
+ * @which: determines whether INFO/AN,AC and indv fields may be used
+ *
+ * Returns 1 if the call succeeded, or 0 if the value could not
+ * be determined.
+ *
+ * The value of @which determines if existing INFO/AC,AN can be
+ * used (BCF_UN_INFO) and if indv fields can be split
+ * (BCF_UN_FMT).
+ */
+int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which);
+
+
+/**
+ * bcf_gt_type() - determines type of the genotype
+ * @fmt_ptr: the GT format field as set for example by set_fmt_ptr
+ * @isample: sample index (starting from 0)
+ * @ial: index of the 1st non-reference allele (starting from 1)
+ * @jal: index of the 2nd non-reference allele (starting from 1)
+ *
+ * Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
+ * GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
+ * is not NULL and the genotype has one or more non-reference
+ * alleles, $ial will be set. In case of GT_HET_AA, $ial is the
+ * position of the allele which appeared first in ALT. If $jal is
+ * not null and the genotype is GT_HET_AA, $jal will be set and is
+ * the position of the second allele in ALT.
+ */
+#define GT_HOM_RR 0 // note: the actual value of GT_* matters, used in dosage r2 calculation
+#define GT_HOM_AA 1
+#define GT_HET_RA 2
+#define GT_HET_AA 3
+#define GT_HAPL_R 4
+#define GT_HAPL_A 5
+#define GT_UNKN 6
+int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal);
+
+static inline int bcf_acgt2int(char c)
+{
+ if ( (int)c>96 ) c -= 32;
+ if ( c=='A' ) return 0;
+ if ( c=='C' ) return 1;
+ if ( c=='G' ) return 2;
+ if ( c=='T' ) return 3;
+ return -1;
+}
+
+#define bcf_int2acgt(i) "ACGT"[i]
+
+/**
+ * bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
+ * @i,j: allele indexes, 0-based, i<=j
+ *
+ * Returns index to the Number=G diploid array
+ */
+#define bcf_ij2G(i, j) ((j)*((j)+1)/2+(i))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htslib/htslib_vars.mk b/htslib/htslib_vars.mk
new file mode 100644
index 0000000..05a4906
--- /dev/null
+++ b/htslib/htslib_vars.mk
@@ -0,0 +1,49 @@
+# Makefile variables useful for third-party code using htslib's public API.
+#
+# Copyright (C) 2013-2016 Genome Research Ltd.
+#
+# Author: John Marshall <jm18 at sanger.ac.uk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# These variables can be used to express dependencies on htslib headers.
+# See htslib.mk for details.
+
+htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h $(htslib_hts_defs_h)
+htslib_cram_h = $(HTSPREFIX)htslib/cram.h $(htslib_hts_h)
+htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h $(htslib_hts_defs_h)
+htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h)
+htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h)
+htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h
+htslib_kbitset_h = $(HTSPREFIX)htslib/kbitset.h
+htslib_kfunc_h = $(HTSPREFIX)htslib/kfunc.h
+htslib_khash_h = $(HTSPREFIX)htslib/khash.h
+htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h)
+htslib_klist_h = $(HTSPREFIX)htslib/klist.h
+htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h
+htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h
+htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h
+htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h
+htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h
+htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h)
+htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h)
+htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h)
+htslib_vcf_h = $(HTSPREFIX)htslib/vcf.h $(htslib_hts_h) $(htslib_kstring_h) $(htslib_hts_defs_h)
+htslib_vcf_sweep_h = $(HTSPREFIX)htslib/vcf_sweep.h $(htslib_hts_h) $(htslib_vcf_h)
+htslib_vcfutils_h = $(HTSPREFIX)htslib/vcfutils.h $(htslib_vcf_h)
diff --git a/htslib/kfunc.c b/htslib/kfunc.c
new file mode 100644
index 0000000..323e70f
--- /dev/null
+++ b/htslib/kfunc.c
@@ -0,0 +1,282 @@
+/* The MIT License
+
+ Copyright (C) 2010, 2013 Genome Research Ltd.
+ Copyright (C) 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <config.h>
+
+#include <math.h>
+#include <stdlib.h>
+#include "htslib/kfunc.h"
+
+/* Log gamma function
+ * \log{\Gamma(z)}
+ * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+ */
+double kf_lgamma(double z)
+{
+ double x = 0;
+ x += 0.1659470187408462e-06 / (z+7);
+ x += 0.9934937113930748e-05 / (z+6);
+ x -= 0.1385710331296526 / (z+5);
+ x += 12.50734324009056 / (z+4);
+ x -= 176.6150291498386 / (z+3);
+ x += 771.3234287757674 / (z+2);
+ x -= 1259.139216722289 / (z+1);
+ x += 676.5203681218835 / z;
+ x += 0.9999999999995183;
+ return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
+}
+
+/* complementary error function
+ * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
+ * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
+ */
+double kf_erfc(double x)
+{
+ const double p0 = 220.2068679123761;
+ const double p1 = 221.2135961699311;
+ const double p2 = 112.0792914978709;
+ const double p3 = 33.912866078383;
+ const double p4 = 6.37396220353165;
+ const double p5 = .7003830644436881;
+ const double p6 = .03526249659989109;
+ const double q0 = 440.4137358247522;
+ const double q1 = 793.8265125199484;
+ const double q2 = 637.3336333788311;
+ const double q3 = 296.5642487796737;
+ const double q4 = 86.78073220294608;
+ const double q5 = 16.06417757920695;
+ const double q6 = 1.755667163182642;
+ const double q7 = .08838834764831844;
+ double expntl, z, p;
+ z = fabs(x) * M_SQRT2;
+ if (z > 37.) return x > 0.? 0. : 2.;
+ expntl = exp(z * z * - .5);
+ if (z < 10. / M_SQRT2) // for small z
+ p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
+ / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
+ else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
+ return x > 0.? 2. * p : 2. * (1. - p);
+}
+
+/* The following computes regularized incomplete gamma functions.
+ * Formulas are taken from Wiki, with additional input from Numerical
+ * Recipes in C (for modified Lentz's algorithm) and AS245
+ * (http://lib.stat.cmu.edu/apstat/245).
+ *
+ * A good online calculator is available at:
+ *
+ * http://www.danielsoper.com/statcalc/calc23.aspx
+ *
+ * It calculates upper incomplete gamma function, which equals
+ * kf_gammaq(s,z)*tgamma(s).
+ */
+
+#define KF_GAMMA_EPS 1e-14
+#define KF_TINY 1e-290
+
+// regularized lower incomplete gamma function, by series expansion
+static double _kf_gammap(double s, double z)
+{
+ double sum, x;
+ int k;
+ for (k = 1, sum = x = 1.; k < 100; ++k) {
+ sum += (x *= z / (s + k));
+ if (x / sum < KF_GAMMA_EPS) break;
+ }
+ return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
+}
+// regularized upper incomplete gamma function, by continued fraction
+static double _kf_gammaq(double s, double z)
+{
+ int j;
+ double C, D, f;
+ f = 1. + z - s; C = f; D = 0.;
+ // Modified Lentz's algorithm for computing continued fraction
+ // See Numerical Recipes in C, 2nd edition, section 5.2
+ for (j = 1; j < 100; ++j) {
+ double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
+ D = b + a * D;
+ if (D < KF_TINY) D = KF_TINY;
+ C = b + a / C;
+ if (C < KF_TINY) C = KF_TINY;
+ D = 1. / D;
+ d = C * D;
+ f *= d;
+ if (fabs(d - 1.) < KF_GAMMA_EPS) break;
+ }
+ return exp(s * log(z) - z - kf_lgamma(s) - log(f));
+}
+
+double kf_gammap(double s, double z)
+{
+ return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
+}
+
+double kf_gammaq(double s, double z)
+{
+ return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
+}
+
+/* Regularized incomplete beta function. The method is taken from
+ * Numerical Recipes in C, 2nd edition, section 6.4. The following web
+ * page calculates the incomplete beta function, which equals
+ * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
+ *
+ * http://www.danielsoper.com/statcalc/calc36.aspx
+ */
+static double kf_betai_aux(double a, double b, double x)
+{
+ double C, D, f;
+ int j;
+ if (x == 0.) return 0.;
+ if (x == 1.) return 1.;
+ f = 1.; C = f; D = 0.;
+ // Modified Lentz's algorithm for computing continued fraction
+ for (j = 1; j < 200; ++j) {
+ double aa, d;
+ int m = j>>1;
+ aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
+ : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
+ D = 1. + aa * D;
+ if (D < KF_TINY) D = KF_TINY;
+ C = 1. + aa / C;
+ if (C < KF_TINY) C = KF_TINY;
+ D = 1. / D;
+ d = C * D;
+ f *= d;
+ if (fabs(d - 1.) < KF_GAMMA_EPS) break;
+ }
+ return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
+}
+double kf_betai(double a, double b, double x)
+{
+ return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
+}
+
+#ifdef KF_MAIN
+#include <stdio.h>
+int main(int argc, char *argv[])
+{
+ double x = 5.5, y = 3;
+ double a, b;
+ printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
+ printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
+ a = 2; b = 2; x = 0.5;
+ printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
+ return 0;
+}
+#endif
+
+
+// log\binom{n}{k}
+static double lbinom(int n, int k)
+{
+ if (k == 0 || n == k) return 0;
+ return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+}
+
+// n11 n12 | n1_
+// n21 n22 | n2_
+//-----------+----
+// n_1 n_2 | n
+
+// hypergeometric distribution
+static double hypergeo(int n11, int n1_, int n_1, int n)
+{
+ return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));
+}
+
+typedef struct {
+ int n11, n1_, n_1, n;
+ double p;
+} hgacc_t;
+
+// incremental version of hypergeometric distribution
+static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux)
+{
+ if (n1_ || n_1 || n) {
+ aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
+ } else { // then only n11 changed; the rest fixed
+ if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
+ if (n11 == aux->n11 + 1) { // incremental
+ aux->p *= (double)(aux->n1_ - aux->n11) / n11
+ * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
+ aux->n11 = n11;
+ return aux->p;
+ }
+ if (n11 == aux->n11 - 1) { // incremental
+ aux->p *= (double)aux->n11 / (aux->n1_ - n11)
+ * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
+ aux->n11 = n11;
+ return aux->p;
+ }
+ }
+ aux->n11 = n11;
+ }
+ aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
+ return aux->p;
+}
+
+double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two)
+{
+ int i, j, max, min;
+ double p, q, left, right;
+ hgacc_t aux;
+ int n1_, n_1, n;
+
+ n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
+ max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
+ min = n1_ + n_1 - n; // not sure why n11-n22 is used instead of min(n_1,n1_)
+ if (min < 0) min = 0; // min n11, for left tail
+ *two = *_left = *_right = 1.;
+ if (min == max) return 1.; // no need to do test
+ q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
+ // left tail
+ p = hypergeo_acc(min, 0, 0, 0, &aux);
+ for (left = 0., i = min + 1; p < 0.99999999 * q && i<=max; ++i) // loop until underflow
+ left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
+ --i;
+ if (p < 1.00000001 * q) left += p;
+ else --i;
+ // right tail
+ p = hypergeo_acc(max, 0, 0, 0, &aux);
+ for (right = 0., j = max - 1; p < 0.99999999 * q && j>=0; --j) // loop until underflow
+ right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
+ ++j;
+ if (p < 1.00000001 * q) right += p;
+ else ++j;
+ // two-tail
+ *two = left + right;
+ if (*two > 1.) *two = 1.;
+ // adjust left and right
+ if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
+ else left = 1.0 - right + q;
+ *_left = left; *_right = right;
+ return q;
+}
+
+
+
diff --git a/htslib/knetfile.c b/htslib/knetfile.c
new file mode 100644
index 0000000..aa28858
--- /dev/null
+++ b/htslib/knetfile.c
@@ -0,0 +1,634 @@
+/* The MIT License
+
+ Copyright (c) 2008 by Genome Research Ltd (GRL).
+ 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Probably I will not do socket programming in the next few years and
+ therefore I decide to heavily annotate this file, for Linux and
+ Windows as well. -ac */
+
+#include <config.h>
+
+#include <time.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#include "htslib/knetfile.h"
+
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* Wait, for at most five seconds, until the descriptor fd can be read
+ * (is_read != 0) or written (is_read == 0). The result of select() is
+ * handed back unchanged. */
+static int socket_wait(int fd, int is_read)
+{
+    struct timeval timeout;
+    fd_set watched;
+    int status;
+
+    FD_ZERO(&watched);
+    FD_SET(fd, &watched);
+    timeout.tv_sec = 5; // 5 seconds time out
+    timeout.tv_usec = 0;
+    if (is_read)
+        status = select(fd+1, &watched, 0, 0, &timeout);
+    else
+        status = select(fd+1, 0, &watched, 0, &timeout);
+#ifndef _WIN32
+    if (status == -1) perror("select");
+#else
+    if (status == SOCKET_ERROR)
+        fprintf(stderr, "select: %d\n", WSAGetLastError());
+    else if (status == 0)
+        fprintf(stderr, "select time-out\n");
+#endif
+    return status;
+}
+
+#ifndef _WIN32
+/* Resolve host:port with getaddrinfo() and return a connected TCP socket,
+ * or -1 on failure (a diagnostic is printed on stderr). Every address
+ * returned by the resolver is tried in turn, and a socket that could not
+ * be configured or connected is closed before moving on.
+ *
+ * This function does not work with Windows due to the lack of
+ * getaddrinfo() in winsock. It is adapted from an example in "Beej's
+ * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
+static int socket_connect(const char *host, const char *port)
+{
+    int ai_err, on = 1, fd = -1, saved_errno = 0;
+    const char *failed = "socket"; // name of the call that failed last, for perror()
+    struct linger lng = { 0, 0 };
+    struct addrinfo hints, *res = 0, *ai;
+
+    memset(&hints, 0, sizeof(struct addrinfo));
+    hints.ai_family = AF_UNSPEC;
+    hints.ai_socktype = SOCK_STREAM;
+    /* In Unix/Mac, getaddrinfo() is the most convenient way to get
+     * server information. */
+    if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) { fprintf(stderr, "can't resolve %s:%s: %s\n", host, port, gai_strerror(ai_err)); return -1; }
+    for (ai = res; ai; ai = ai->ai_next) {
+        fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+        if (fd == -1) { failed = "socket"; saved_errno = errno; continue; }
+        /* The following two setsockopt() are used by ftplib
+         * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
+         * are necessary. */
+        if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) failed = "setsockopt";
+        else if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) failed = "setsockopt";
+        else if (connect(fd, ai->ai_addr, ai->ai_addrlen) != 0) failed = "connect";
+        else break; // connected
+        saved_errno = errno; // close() below may overwrite errno
+        close(fd);
+        fd = -1;
+    }
+    freeaddrinfo(res);
+    if (fd == -1) {
+        errno = saved_errno;
+        perror(failed);
+    }
+    return fd;
+}
+#else
+/* MinGW's printf has problem with "%lld".
+ *
+ * Write the decimal representation of x into buf and return buf. Negative
+ * values get a leading '-'. buf must have room for 21 bytes (sign, up to
+ * 19 digits and the terminating NUL). */
+char *int64tostr(char *buf, int64_t x)
+{
+    int negative = x < 0;
+    int len = 0, lo, hi;
+    do {
+        /* x % 10 is <= 0 for negative x; taking the magnitude per digit
+         * avoids negating x, which would overflow for INT64_MIN */
+        int digit = (int)(x % 10);
+        buf[len++] = (char)('0' + (digit < 0? -digit : digit));
+        x /= 10;
+    } while (x);
+    if (negative) buf[len++] = '-';
+    buf[len] = 0;
+    // the characters were produced least-significant first: reverse in place
+    for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
+        char c = buf[lo]; buf[lo] = buf[hi]; buf[hi] = c;
+    }
+    return buf;
+}
+
+/* Parse the run of decimal digits at the start of buf and return its value.
+ * Parsing stops at the first character that is not a digit, so trailing
+ * text such as "\r\n" is ignored; an empty or non-numeric string gives 0.
+ * No sign is accepted and overflow is not detected. */
+int64_t strtoint64(const char *buf)
+{
+    int64_t x = 0;
+    for (; *buf >= '0' && *buf <= '9'; ++buf)
+        x = x * 10 + (*buf - '0');
+    return x;
+}
+/* Windows only: winsock has to be started before any socket call.
+ * Requests version 2.2 and returns whatever WSAStartup() returns. */
+int knet_win32_init()
+{
+    WSADATA winsock_info;
+    const WORD wanted_version = MAKEWORD(2, 2);
+    return WSAStartup(wanted_version, &winsock_info);
+}
+/* Windows only: counterpart of knet_win32_init(); shuts winsock down. */
+void knet_win32_destroy()
+{
+    WSACleanup(); // result deliberately ignored, as before
+}
+/* Windows version: open a TCP connection to host:port and return the
+ * socket, or -1 on failure (a diagnostic is printed on stderr and the
+ * socket, if one was created, is closed).
+ *
+ * host may be a dotted-quad address, which is used directly, or a name,
+ * which is resolved with gethostbyname(). port must be numeric.
+ *
+ * A slightly modified version of the following function also works on
+ * Mac (and presumably Linux). However, this function is not stable on
+ * my Mac. It sometimes works fine but sometimes does not. Therefore for
+ * non-Windows OS, I do not use this one. */
+static SOCKET socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) \
+    do { \
+        fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
+        if (fd != INVALID_SOCKET) closesocket(fd); \
+        return -1; \
+    } while (0)
+
+    int on = 1;
+    SOCKET fd = INVALID_SOCKET;
+    struct linger lng = { 0, 0 };
+    struct sockaddr_in server;
+    struct hostent *hp = 0;
+    // open socket
+    if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
+    if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+    // get host info
+    memset(&server, 0, sizeof(server));
+    server.sin_addr.s_addr = inet_addr(host);
+    if (server.sin_addr.s_addr == INADDR_NONE) {
+        // not a numeric address: resolve the name (it may start with a digit)
+        hp = gethostbyname(host);
+        if (hp == 0) __err_connect("gethost");
+        // hp points to storage owned by winsock; it must not be freed
+        memcpy(&server.sin_addr, hp->h_addr, sizeof(server.sin_addr));
+    }
+    // connect
+    server.sin_family= AF_INET;
+    server.sin_port = htons(atoi(port));
+    if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
+    return fd;
+}
+#endif
+
+/* Read up to len bytes from socket fd into buf, calling netread() as many
+ * times as needed. Returns the number of bytes actually stored, which is
+ * smaller than len when the peer closes the connection, the socket does
+ * not become readable within the socket_wait() time-out, or a read error
+ * occurs. */
+static off_t my_netread(int fd, void *buf, off_t len)
+{
+    off_t rest = len, curr, l = 0;
+    /* recv() and read() may not read the required length of data with
+     * one call. They have to be called repeatedly. */
+    while (rest > 0) {
+        if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
+        curr = netread(fd, (void*)((char*)buf + l), rest);
+        /* According to the glibc manual, section 13.2, a zero returned
+         * value indicates end-of-file (EOF), which should mean that
+         * read() will not return zero if EOF has not been met but data
+         * are not immediately available. A negative value is an error and
+         * must not be added to the running totals. */
+        if (curr <= 0) break;
+        l += curr; rest -= curr;
+    }
+    return l;
+}
+
+/*************************
+ * FTP specific routines *
+ *************************/
+
+/* Read one complete FTP reply from the control connection into
+ * ftp->response (grown as needed) and return its numeric status code.
+ * Continuation lines of a multi-line reply ("NNN-...") are discarded; only
+ * the final line is kept, with its trailing "\r\n" removed.
+ * Returns 0 if the control socket does not become readable in time and -1
+ * if no complete line could be read or memory ran out. */
+static int kftp_get_response(knetFile *ftp)
+{
+#ifndef _WIN32
+    unsigned char c;
+#else
+    char c;
+#endif
+    int n = 0;
+    char *p;
+    if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
+    // "> 0": stop on EOF (0) and on read errors (-1) alike
+    while (netread(ftp->ctrl_fd, &c, 1) > 0) { // FIXME: this is *VERY BAD* for unbuffered I/O
+        if (n >= ftp->max_response) {
+            int new_max = ftp->max_response? ftp->max_response<<1 : 256;
+            char *tmp = (char*)realloc(ftp->response, new_max);
+            if (tmp == 0) return -1; // the old buffer stays valid and owned by ftp
+            ftp->response = tmp;
+            ftp->max_response = new_max;
+        }
+        ftp->response[n++] = c;
+        if (c == '\n') {
+            // a final reply line is "NNN" followed by anything but '-'
+            if (n >= 4 && isdigit((unsigned char)ftp->response[0])
+                && isdigit((unsigned char)ftp->response[1])
+                && isdigit((unsigned char)ftp->response[2])
+                && ftp->response[3] != '-') break;
+            n = 0; // continuation line: start over with the next one
+        }
+    }
+    if (n < 2) return -1;
+    ftp->response[n-2] = 0; // cut the "\r\n"
+    return strtol(ftp->response, &p, 0);
+}
+
+/* Send the command string cmd over the FTP control connection. Returns -1
+ * if the socket is not writable or the write is short. Otherwise returns
+ * the reply code from kftp_get_response() when is_get is non-zero, and 0
+ * when no reply is requested. */
+static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
+{
+    int cmd_len;
+    if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+    cmd_len = strlen(cmd);
+    if (netwrite(ftp->ctrl_fd, cmd, cmd_len) != cmd_len) return -1;
+    if (!is_get) return 0;
+    return kftp_get_response(ftp);
+}
+
+/* Send PASV and parse the "(h1,h2,h3,h4,p1,p2)" part of the reply into
+ * ftp->pasv_ip and ftp->pasv_port. Returns 0 on success. On any failure
+ * returns -1 and leaves pasv_port at 0, so that kftp_pasv_connect()
+ * refuses to use a stale address. */
+static int kftp_pasv_prep(knetFile *ftp)
+{
+    char *p;
+    int v[6], k;
+    ftp->pasv_port = 0;
+    if (kftp_send_cmd(ftp, "PASV\r\n", 1) <= 0) return -1; // nothing sent, or no reply read
+    if (ftp->response == 0) return -1;
+    for (p = ftp->response; *p && *p != '('; ++p);
+    if (*p != '(') return -1;
+    ++p;
+    if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) return -1;
+    for (k = 0; k < 6; ++k)
+        if (v[k] < 0 || v[k] > 255) return -1; // each field is one byte
+    memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
+    ftp->pasv_port = (v[4]<<8) + v[5];
+    return 0;
+}
+
+
+/* Open the FTP data connection to the address obtained by
+ * kftp_pasv_prep() and store the socket in ftp->fd.
+ * Returns 0 on success, -1 if no passive address is known or the
+ * connection fails. */
+static int kftp_pasv_connect(knetFile *ftp)
+{
+    char host[80], port[16];
+    if (ftp->pasv_port == 0) {
+        fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
+        return -1;
+    }
+    // snprintf() keeps both strings inside their buffers whatever the stored values are
+    snprintf(host, sizeof(host), "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
+    snprintf(port, sizeof(port), "%d", ftp->pasv_port);
+    ftp->fd = socket_connect(host, port);
+    if (ftp->fd == -1) return -1;
+    return 0;
+}
+
+/* Open the FTP control connection to ftp->host:ftp->port, consume the
+ * greeting, log in anonymously and switch to binary transfers.
+ * Returns -1 only when the connection itself cannot be made; the replies
+ * to the login commands are not examined. */
+int kftp_connect(knetFile *ftp)
+{
+    static const char *const login_cmds[] = {
+        "USER anonymous\r\n",
+        "PASS kftp@\r\n",
+        "TYPE I\r\n"
+    };
+    size_t k;
+
+    ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
+    if (ftp->ctrl_fd == -1) return -1;
+    kftp_get_response(ftp); // server greeting
+    for (k = 0; k < sizeof(login_cmds) / sizeof(login_cmds[0]); ++k)
+        kftp_send_cmd(ftp, login_cmds[k], 1);
+    return 0;
+}
+
+/* Drop the control and data connections (whichever are open) and log in
+ * again. Returns the result of kftp_connect(). */
+int kftp_reconnect(knetFile *ftp)
+{
+    if (ftp->ctrl_fd != -1) {
+        netclose(ftp->ctrl_fd);
+        ftp->ctrl_fd = -1;
+    }
+    if (ftp->fd != -1) { // do not hand an invalid descriptor to close()
+        netclose(ftp->fd);
+        ftp->fd = -1;
+    }
+    return kftp_connect(ftp);
+}
+
+/* Build a knetFile for the URL "ftp://host/path". Fills in type, host,
+ * port, the RETR and SIZE command strings, and marks both sockets as
+ * closed; no connection is made. A 'c' in mode sets no_reconnect.
+ * Returns NULL if fn is not an ftp:// URL with a path, or on memory
+ * exhaustion. The caller releases the result with knet_close(). */
+knetFile *kftp_parse_url(const char *fn, const char *mode)
+{
+    knetFile *fp;
+    const char *p;
+    size_t l, cmd_size;
+    if (strncmp(fn, "ftp://", 6) != 0) return 0;
+    for (p = fn + 6; *p && *p != '/'; ++p);
+    if (*p != '/') return 0;
+    l = p - (fn + 6); // length of the host part
+    fp = (knetFile*)calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0;
+    fp->type = KNF_TYPE_FTP;
+    fp->fd = fp->ctrl_fd = -1; // 0 would be mistaken for an open descriptor by knet_close()
+    if (strchr(mode, 'c')) fp->no_reconnect = 1;
+    /* the Linux/Mac version of socket_connect() also recognizes a port
+     * like "ftp", but the Windows version does not. */
+    fp->port = strdup("21");
+    fp->host = (char*)calloc(l + 1, 1);
+    cmd_size = strlen(p) + 8; // "RETR " or "SIZE " + path + "\r\n" + NUL
+    fp->retr = (char*)calloc(cmd_size, 1);
+    fp->size_cmd = (char*)calloc(cmd_size, 1);
+    if (fp->port == 0 || fp->host == 0 || fp->retr == 0 || fp->size_cmd == 0) {
+        free(fp->port); free(fp->host); free(fp->retr); free(fp->size_cmd);
+        free(fp);
+        return 0;
+    }
+    memcpy(fp->host, fn + 6, l); // host is already NUL-terminated by calloc()
+    snprintf(fp->retr, cmd_size, "RETR %s\r\n", p);
+    snprintf(fp->size_cmd, cmd_size, "SIZE %s\r\n", p);
+    fp->seek_offset = 0;
+    return fp;
+}
+/* Start a transfer of the remote file at byte fp->offset: negotiate a
+ * passive data connection, query the file size (stored in
+ * fp->file_size), send REST and RETR, and connect fp->fd.
+ * Returns 0 and sets fp->is_ready on success. Returns -1 on failure; in
+ * that case fp->fd is -1 (a data socket that was open on entry has been
+ * closed). */
+int kftp_connect_file(knetFile *fp)
+{
+    int ret;
+    long long file_size;
+    if (fp->fd != -1) {
+        netclose(fp->fd);
+        fp->fd = -1; // never leave a closed descriptor behind on the error paths below
+        if (fp->no_reconnect) kftp_get_response(fp);
+    }
+    if (kftp_pasv_prep(fp) < 0) return -1;
+    kftp_send_cmd(fp, fp->size_cmd, 1);
+    if (fp->response == 0) return -1; // no reply was ever received
+#ifndef _WIN32
+    // If the file does not exist, the response will be "550 Could not get file
+    // size". Be silent on failure, hts_idx_load can be trying the existence of .csi or .tbi.
+    if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) return -1;
+#else
+    {
+        // skip the status code, then everything up to the first digit
+        const char *p = fp->response;
+        while (*p && *p != ' ') ++p;
+        while (*p && (*p < '0' || *p > '9')) ++p;
+        if (*p == 0) return -1; // no size in the reply
+        file_size = strtoint64(p);
+    }
+#endif
+    fp->file_size = file_size;
+    if (fp->offset>=0) {
+        char tmp[32];
+#ifndef _WIN32
+        snprintf(tmp, sizeof(tmp), "REST %lld\r\n", (long long)fp->offset);
+#else
+        strcpy(tmp, "REST ");
+        int64tostr(tmp + 5, fp->offset);
+        strcat(tmp, "\r\n");
+#endif
+        kftp_send_cmd(fp, tmp, 1);
+    }
+    kftp_send_cmd(fp, fp->retr, 0);
+    kftp_pasv_connect(fp);
+    ret = kftp_get_response(fp);
+    if (ret != 150) {
+        fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
+        if (fp->fd != -1) netclose(fp->fd);
+        fp->fd = -1;
+        return -1;
+    }
+    fp->is_ready = 1;
+    return 0;
+}
+
+
+/**************************
+ * HTTP specific routines *
+ **************************/
+
+/* Build a knetFile for the URL "http://host[:port][/path]"; no connection
+ * is made. Without the http_proxy environment variable, host/port/path
+ * describe the origin server. With it, host and port describe the proxy
+ * (an optional "http://" prefix and anything from the first '/' on are
+ * dropped) and path is the full URL. mode is not used.
+ * Returns NULL if fn is not an http:// URL or on memory exhaustion. The
+ * caller releases the result with knet_close(). */
+knetFile *khttp_parse_url(const char *fn, const char *mode)
+{
+    knetFile *fp;
+    const char *p, *proxy;
+    char *q;
+    size_t l;
+    if (strncmp(fn, "http://", 7) != 0) return 0;
+    // set ->http_host
+    for (p = fn + 7; *p && *p != '/'; ++p);
+    l = p - (fn + 7);
+    fp = (knetFile*)calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0;
+    fp->type = KNF_TYPE_HTTP;
+    fp->ctrl_fd = fp->fd = -1;
+    fp->seek_offset = 0;
+    fp->http_host = (char*)calloc(l + 1, 1);
+    if (fp->http_host == 0) goto fail;
+    memcpy(fp->http_host, fn + 7, l);
+    for (q = fp->http_host; *q && *q != ':'; ++q);
+    if (*q == ':') *q++ = 0; // q: port digits, or the empty string
+    // get http_proxy
+    proxy = getenv("http_proxy");
+    // set ->host, ->port and ->path
+    if (proxy == 0) {
+        fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
+        fp->port = strdup(*q? q : "80");
+        fp->path = strdup(*p? p : "/");
+    } else {
+        fp->host = strdup(strncmp(proxy, "http://", 7) == 0? proxy + 7 : proxy);
+        if (fp->host == 0) goto fail;
+        for (q = fp->host; *q && *q != ':' && *q != '/'; ++q);
+        if (*q == ':') {
+            char *slash;
+            *q++ = 0;
+            // "http://proxy:3128/" must give port "3128", not "3128/"
+            if ((slash = strchr(q, '/')) != 0) *slash = 0;
+        } else if (*q == '/') *q = 0; // no port given; q now points to an empty string
+        fp->port = strdup(*q? q : "80");
+        fp->path = strdup(fn);
+    }
+    if (fp->host == 0 || fp->port == 0 || fp->path == 0) goto fail;
+    return fp;
+
+fail:
+    free(fp->host); free(fp->port); free(fp->path); free(fp->http_host);
+    free(fp);
+    return 0;
+}
+
+/* (Re)connect to the HTTP server or proxy and request the file from byte
+ * fp->offset on with a Range header. When the server ignores the range
+ * and answers 200, the first fp->offset bytes of the body are read and
+ * discarded.
+ * Returns 0 and sets fp->is_ready on success. Returns -1 on failure with
+ * fp->fd closed and set to -1; for HTTP error statuses errno is set to a
+ * matching value. */
+int khttp_connect_file(knetFile *fp)
+{
+    enum { BUF_SIZE = 0x10000 }; // holds the request, then the reply header
+    int ret, l;
+    char *buf = 0, *p;
+    if (fp->fd != -1) netclose(fp->fd);
+    fp->fd = socket_connect(fp->host, fp->port);
+    if (fp->fd == -1) return -1;
+    buf = (char*)calloc(BUF_SIZE, 1);
+    if (buf == 0) goto fail;
+    l = snprintf(buf, BUF_SIZE, "GET %s HTTP/1.0\r\nHost: %s\r\nRange: bytes=%lld-\r\n\r\n",
+                 fp->path, fp->http_host, (long long)fp->offset);
+    if (l < 0 || l >= BUF_SIZE) goto fail; // request does not fit
+    if ( netwrite(fp->fd, buf, l) != l ) goto fail;
+    l = 0;
+    // read HTTP header; FIXME: bad efficiency
+    while (l < BUF_SIZE - 1 && netread(fp->fd, buf + l, 1) > 0) {
+        if (buf[l] == '\n' && l >= 3 && strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+        ++l;
+    }
+    buf[l] = 0;
+    if (l < 14) goto fail; // prematured header
+    ret = strtol(buf + 8, &p, 0); // HTTP return code
+    if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
+        off_t rest = fp->offset;
+        while (rest > 0) {
+            off_t want = rest < BUF_SIZE? rest : BUF_SIZE;
+            off_t got = my_netread(fp->fd, buf, want);
+            if (got <= 0) break; // EOF, time-out or error: do not spin forever
+            rest -= got;
+        }
+        if (rest > 0) { // could not reach the requested offset
+            free(buf);
+            netclose(fp->fd);
+            fp->fd = -1;
+            errno = EIO;
+            return -1;
+        }
+    } else if (ret != 206 && ret != 200) {
+        // failed to open file
+        free(buf);
+        netclose(fp->fd);
+        switch (ret) {
+        case 401: errno = EPERM; break;
+        case 403: errno = EACCES; break;
+        case 404: errno = ENOENT; break;
+        case 407: errno = EPERM; break;
+        case 408: errno = ETIMEDOUT; break;
+        case 410: errno = ENOENT; break;
+        case 503: errno = EAGAIN; break;
+        case 504: errno = ETIMEDOUT; break;
+        default: errno = (ret >= 400 && ret < 500)? EINVAL : EIO; break;
+        }
+        fp->fd = -1;
+        return -1;
+    }
+    free(buf);
+    fp->is_ready = 1;
+    return 0;
+
+fail:
+    free(buf);
+    netclose(fp->fd);
+    fp->fd = -1;
+    return -1;
+}
+
+/********************
+ * Generic routines *
+ ********************/
+
+/* Open fn for reading. fn may be an ftp:// URL, an http:// URL or a
+ * local path; mode must start with 'r'. Remote files are connected
+ * immediately.
+ * Returns NULL on failure; errno as set by the failing step is preserved
+ * across the internal clean-up. The caller releases the result with
+ * knet_close(). */
+knetFile *knet_open(const char *fn, const char *mode)
+{
+    knetFile *fp = 0;
+    if (mode[0] != 'r') {
+        fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
+        return 0;
+    }
+    if (strstr(fn, "ftp://") == fn) {
+        fp = kftp_parse_url(fn, mode);
+        if (fp == 0) return 0;
+        if (kftp_connect(fp) == -1) {
+            knet_close(fp);
+            return 0;
+        }
+        kftp_connect_file(fp);
+    } else if (strstr(fn, "http://") == fn) {
+        fp = khttp_parse_url(fn, mode);
+        if (fp == 0) return 0;
+        khttp_connect_file(fp);
+    } else { // local file
+#ifdef _WIN32
+        /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
+         * be undefined on some systems, although it is defined on my
+         * Mac and the Linux I have tested on. */
+        int fd = open(fn, O_RDONLY | O_BINARY);
+#else
+        int fd = open(fn, O_RDONLY);
+#endif
+        if (fd == -1) {
+            perror("open");
+            return 0;
+        }
+        fp = (knetFile*)calloc(1, sizeof(knetFile));
+        if (fp == 0) { // do not leak the descriptor
+            close(fd);
+            return 0;
+        }
+        fp->type = KNF_TYPE_LOCAL;
+        fp->fd = fd;
+        fp->ctrl_fd = -1;
+    }
+    if (fp && fp->fd == -1) { // the remote connect step failed
+        int saved_errno = errno; // knet_close() may overwrite it
+        knet_close(fp);
+        errno = saved_errno;
+        return 0;
+    }
+    return fp;
+}
+
+/* Wrap the already-open descriptor fd as a local knetFile. mode is not
+ * used. Returns NULL on memory exhaustion. knet_close() on the result
+ * closes fd. */
+knetFile *knet_dopen(int fd, const char *mode)
+{
+    knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0;
+    fp->type = KNF_TYPE_LOCAL;
+    fp->fd = fd;
+    fp->ctrl_fd = -1; // calloc() leaves 0 here, which knet_close() would close as if it were a socket
+    return fp;
+}
+
+/* Read up to len bytes into buf and advance fp->offset by the number of
+ * bytes read. A remote file whose position was changed by knet_seek() is
+ * reconnected first.
+ * Returns the byte count (short at end of file), 0 if the file has no
+ * open descriptor, or -1 on a local read error or when the remote
+ * connection cannot be re-established. */
+ssize_t knet_read(knetFile *fp, void *buf, size_t len)
+{
+    off_t l = 0;
+    if (fp->fd == -1) return 0;
+    if (fp->type == KNF_TYPE_FTP) {
+        if (fp->is_ready == 0) {
+            if (!fp->no_reconnect) kftp_reconnect(fp);
+            kftp_connect_file(fp);
+        }
+    } else if (fp->type == KNF_TYPE_HTTP) {
+        if (fp->is_ready == 0)
+            khttp_connect_file(fp);
+    }
+    // a failed (re)connect leaves is_ready at 0 and usually fd at -1: do not read from that
+    if (fp->type != KNF_TYPE_LOCAL && (fp->fd == -1 || fp->is_ready == 0)) return -1;
+    if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
+        size_t rest = len;
+        ssize_t curr;
+        while (rest) {
+            do {
+                curr = read(fp->fd, (void*)((char*)buf + l), rest);
+            } while (curr < 0 && EINTR == errno);
+            if (curr < 0) return -1;
+            if (curr == 0) break;
+            l += curr; rest -= curr;
+        }
+    } else l = my_netread(fp->fd, buf, len);
+    fp->offset += l;
+    return l;
+}
+
+/* Set the read position of fp, lseek()-style, and return the new absolute
+ * offset, or -1 with errno set on failure (the position is then
+ * unchanged). For remote files only the bookkeeping changes here; the
+ * connection is re-opened at the new offset by the next knet_read().
+ * SEEK_END is supported for local and FTP files, not for HTTP. */
+off_t knet_seek(knetFile *fp, off_t off, int whence)
+{
+    // already there: report the offset like every other successful path
+    if (whence == SEEK_SET && off == fp->offset) return fp->offset;
+    if (fp->type == KNF_TYPE_LOCAL) {
+        /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. */
+        off_t offset = lseek(fp->fd, off, whence);
+        if (offset == -1) return -1;
+        fp->offset = offset;
+        return fp->offset;
+    } else if (fp->type == KNF_TYPE_FTP || fp->type == KNF_TYPE_HTTP) {
+        off_t target;
+        if (whence == SEEK_CUR) target = fp->offset + off;
+        else if (whence == SEEK_SET) target = off;
+        else if (whence == SEEK_END && fp->type == KNF_TYPE_FTP) target = fp->file_size + off;
+        else if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
+            fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
+            errno = ESPIPE;
+            return -1;
+        } else {
+            errno = EINVAL; // unknown whence
+            return -1;
+        }
+        if (target < 0) { // a negative position cannot be requested from the server
+            errno = EINVAL;
+            return -1;
+        }
+        fp->offset = target;
+        fp->is_ready = 0; // forces a reconnect at the new offset
+        return fp->offset;
+    }
+    errno = EINVAL;
+    fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
+    return -1;
+}
+
+/* Close the descriptors held by fp and free fp together with every
+ * string it owns. A NULL fp is accepted. Always returns 0. */
+int knet_close(knetFile *fp)
+{
+    if (fp == 0) return 0;
+    if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
+    if (fp->fd != -1) {
+        /* On Linux/Mac, netclose() is an alias of close(), but on
+         * Windows, it is an alias of closesocket(). */
+        if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
+        else netclose(fp->fd);
+    }
+    free(fp->host); free(fp->port);
+    free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
+    free(fp->path); free(fp->http_host); // HTTP specific
+    free(fp);
+    return 0;
+}
+
+#ifdef KNETFILE_MAIN
+/* Manual test driver: opens one of five sample files (selected by the
+ * hard-coded `type`), seeks and reads a little, and prints what it read.
+ * Returns 1 if the buffer cannot be allocated or the file cannot be
+ * opened. */
+int main(void)
+{
+    enum { BUF_SIZE = 0x100000 };
+    static const char *const urls[] = {
+        "knetfile.c",
+        "ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", // NCBI FTP, large file
+        "ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml",
+        "http://www.sanger.ac.uk/Users/lh3/index.shtml",
+        "http://www.sanger.ac.uk/Users/lh3/ex1.bam"
+    };
+    char *buf;
+    knetFile *fp;
+    int type = 4, l = 0;
+#ifdef _WIN32
+    knet_win32_init();
+#endif
+    buf = calloc(BUF_SIZE, 1);
+    if (buf == 0) {
+        perror("calloc");
+        return 1;
+    }
+    fp = knet_open(urls[type], "r");
+    if (fp == 0) {
+        fprintf(stderr, "[main] failed to open %s\n", urls[type]);
+        free(buf);
+        return 1;
+    }
+    if (type == 1) {
+        knet_seek(fp, 2500000000ll, SEEK_SET);
+        l = knet_read(fp, buf, 255);
+    } else if (type == 4) {
+        knet_read(fp, buf, 10000);
+        knet_seek(fp, 20000, SEEK_SET);
+        knet_seek(fp, 10000, SEEK_SET);
+        // never ask for more than what is left of the buffer
+        l = knet_read(fp, buf+10000, BUF_SIZE - 10000);
+        l = (l < 0? 0 : l) + 10000;
+    } else knet_seek(fp, 1000, SEEK_SET);
+    if (type != 4 && type != 1) {
+        knet_read(fp, buf, 255);
+        buf[255] = 0;
+        printf("%s\n", buf);
+    } else if (l > 0) {
+        if (write(fileno(stdout), buf, l) < 0) perror("write");
+    }
+    knet_close(fp);
+    free(buf);
+    return 0;
+}
+#endif
diff --git a/htslib/kstring.c b/htslib/kstring.c
new file mode 100644
index 0000000..f43a982
--- /dev/null
+++ b/htslib/kstring.c
@@ -0,0 +1,276 @@
+/* The MIT License
+
+ Copyright (C) 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <config.h>
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include "htslib/kstring.h"
+
+/* Append printf-style formatted output to s, growing its buffer when the
+ * text does not fit. Returns the number of characters appended, or -1 if
+ * formatting fails or memory cannot be allocated; in that case s->l is
+ * unchanged. ap is not consumed (a copy is used for each pass). */
+int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
+{
+    va_list args;
+    int l;
+    va_copy(args, ap);
+    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
+    va_end(args);
+    if (l < 0) return -1; // encoding error: nothing usable was written
+    if ((size_t)l + 1 > s->m - s->l) {
+        // the output was truncated: enlarge the buffer and format again
+        size_t new_m = s->l + l + 2;
+        char *tmp;
+        kroundup32(new_m);
+        tmp = (char*)realloc(s->s, new_m);
+        if (tmp == 0) {
+            if (s->s && s->m > s->l) s->s[s->l] = 0; // restore the old terminator
+            return -1;
+        }
+        s->s = tmp;
+        s->m = new_m;
+        va_copy(args, ap);
+        l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
+        va_end(args);
+        if (l < 0) return -1;
+    }
+    s->l += l;
+    return l;
+}
+
+/* Variadic front end of kvsprintf(): formats the arguments according to
+ * fmt, appends the text to s and returns what kvsprintf() returns. */
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+    va_list vargs;
+    int appended;
+
+    va_start(vargs, fmt);
+    appended = kvsprintf(s, fmt, vargs);
+    va_end(vargs);
+    return appended;
+}
+
+/* Re-entrant tokenizer that does not modify its input.
+ * First call: pass the string in str and the separator set in sep.
+ * Following calls: pass str = NULL; sep may be NULL to keep the previous
+ * separators. Returns a pointer to the start of the next token, whose end
+ * is aux->p, or NULL when the string is exhausted. Empty tokens between
+ * adjacent separators are returned. */
+char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
+{
+    const char *p, *start;
+    if (sep) { // set up the table
+        if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
+        aux->finished = 0;
+        if (sep[0] && sep[1]) { // two or more separators: use a 256-bit membership table
+            aux->sep = -1;
+            aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+            for (p = sep; *p; ++p) {
+                unsigned char c = (unsigned char)*p; // plain char may be negative
+                aux->tab[c>>6] |= 1ull<<(c&0x3f);
+            }
+        } else aux->sep = sep[0];
+    }
+    if (aux->finished) return 0;
+    else if (str) aux->p = str - 1, aux->finished = 0;
+    if (aux->sep < 0) {
+        for (p = start = aux->p + 1; *p; ++p) {
+            unsigned char c = (unsigned char)*p;
+            if (aux->tab[c>>6]>>(c&0x3f)&1) break;
+        }
+    } else {
+        for (p = start = aux->p + 1; *p; ++p)
+            if (*p == aux->sep) break;
+    }
+    aux->p = p; // end of token
+    if (*p == 0) aux->finished = 1; // no more tokens
+    return (char*)start;
+}
+
+/* Split the NUL-terminated string s into fields.
+ * delimiter == 0 splits on runs of whitespace; any other value splits on
+ * that character, and empty fields are skipped.
+ * When _offsets is non-NULL, each field end in s is overwritten with NUL
+ * and the start offset of every field is stored in the array *_offsets,
+ * which is grown with realloc() as needed; *_max holds its capacity on
+ * entry and on return. If memory runs out the array is freed, *_offsets
+ * is set to NULL, *_max to 0, and 0 is returned.
+ * When _offsets is NULL the fields are only counted and s is left intact;
+ * _max may then be NULL as well.
+ * Returns the number of fields. */
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+    int i, n = 0, max = 0, last_char = 0, last_start = 0, *offsets = 0, l;
+    if (_offsets) {
+        offsets = *_offsets;
+        if (_max) max = *_max;
+    }
+    l = strlen(s);
+
+    for (i = 0; i <= l; ++i) {
+        int field_end = 0;
+        if (delimiter == 0) {
+            // <ctype.h> functions need values in the unsigned char range
+            if (isspace((unsigned char)s[i]) || s[i] == 0) {
+                if (isgraph((unsigned char)last_char)) field_end = 1;
+            } else {
+                if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+            }
+        } else {
+            if (s[i] == delimiter || s[i] == 0) {
+                if (last_char != 0 && last_char != delimiter) field_end = 1;
+            } else {
+                if (last_char == delimiter || last_char == 0) last_start = i;
+            }
+        }
+        if (field_end) {
+            if (_offsets) {
+                s[i] = 0; // terminate the field in place
+                if (n == max) {
+                    int *tmp;
+                    max = max? max<<1 : 2;
+                    if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {
+                        offsets = tmp;
+                    } else {
+                        free(offsets);
+                        *_offsets = NULL;
+                        if (_max) *_max = 0;
+                        return 0;
+                    }
+                }
+                offsets[n++] = last_start;
+            } else ++n;
+        }
+        last_char = s[i];
+    }
+    if (_max) *_max = max;
+    if (_offsets) *_offsets = offsets;
+    return n;
+}
+
+/* Append one line read through fgets_fn(buffer, size, fp) to s, without
+ * its trailing "\n" or "\r\n". Returns 0 when something was read and EOF
+ * when nothing could be read or the buffer could not be grown. */
+int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
+{
+    size_t l0 = s->l;
+
+    while (s->l == l0 || s->s[s->l-1] != '\n') {
+        if (s->m - s->l < 200) {
+            // without room, fgets_fn() would be handed a too-small or NULL buffer
+            if (ks_resize(s, s->m + 200) < 0) return EOF;
+        }
+        if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
+        s->l += strlen(s->s + s->l);
+    }
+
+    if (s->l == l0) return EOF;
+
+    if (s->l > l0 && s->s[s->l-1] == '\n') {
+        s->l--;
+        if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
+    }
+    s->s[s->l] = '\0';
+    return 0;
+}
+
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+typedef unsigned char ubyte_t;
+
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+static int *ksBM_prep(const ubyte_t *pat, int m) /*
+ * Build the Boyer-Moore shift tables for the m-byte pattern pat.
+ * Returns a single calloc()ed array of m + 256 ints, to be released with
+ * free(): the first m entries form the good-suffix table (bmGs), the
+ * following 256 the bad-character table (bmBc), indexed by byte value.
+ * NOTE(review): neither calloc() result is checked, and suff[m - 1] is
+ * written unconditionally, so m is assumed to be at least 1 -- confirm
+ * that callers never pass an empty pattern.
+ */
+{
+ int i, *suff, *prep, *bmGs, *bmBc;
+ prep = (int*)calloc(m + 256, sizeof(int));
+ bmGs = prep; bmBc = prep + m; // both tables live in the one allocation
+ { // preBmBc()
+ for (i = 0; i < 256; ++i) bmBc[i] = m; // default for every byte value
+ for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; // bytes in pat[0..m-2]: the last occurrence wins
+ }
+ suff = (int*)calloc(m, sizeof(int)); // scratch array, freed before returning
+ { // suffixes()
+ int f = 0, g;
+ suff[m - 1] = m;
+ g = m - 1;
+ for (i = m - 2; i >= 0; --i) {
+ if (i > g && suff[i + m - 1 - f] < i - g)
+ suff[i] = suff[i + m - 1 - f]; // reuse a value computed earlier
+ else {
+ if (i < g) g = i;
+ f = i;
+ while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; // extend the match leftwards byte by byte
+ suff[i] = f - g;
+ }
+ }
+ }
+ { // preBmGs()
+ int j = 0;
+ for (i = 0; i < m; ++i) bmGs[i] = m; // start with the maximal shift everywhere
+ for (i = m - 1; i >= 0; --i)
+ if (suff[i] == i + 1)
+ for (; j < m - 1 - i; ++j)
+ if (bmGs[j] == m)
+ bmGs[j] = m - 1 - i;
+ for (i = 0; i <= m - 2; ++i)
+ bmGs[m - 1 - suff[i]] = m - 1 - i;
+ }
+ free(suff);
+ return prep;
+}
+
+/* Find the first occurrence of the m-byte pattern _pat in the n-byte
+ * buffer _str with Boyer-Moore search. Returns a pointer to the match or
+ * NULL. An empty pattern (m == 0) matches at _str; a negative m never
+ * matches.
+ * _prep optionally caches the shift tables between calls with the same
+ * pattern: if _prep is non-NULL and *_prep is NULL, the tables are built
+ * and stored in *_prep, and the caller must free() them; if *_prep is
+ * non-NULL it is reused. If _prep is NULL the tables are built and freed
+ * inside this call. */
+void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
+{
+    int i, j, *prep = 0, *bmGs, *bmBc;
+    const ubyte_t *str, *pat;
+    void *hit = 0;
+    str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
+    // ksBM_prep() cannot handle an empty pattern
+    if (m <= 0) return m == 0? (void*)str : 0;
+    prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
+    if (prep == 0) return 0;
+    if (_prep && *_prep == 0) *_prep = prep;
+    bmGs = prep; bmBc = prep + m;
+    j = 0;
+    while (j <= n - m) {
+        for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
+        if (i < 0) { // the whole pattern matched at offset j
+            hit = (void*)(str + j);
+            break;
+        } else {
+            // shift by the larger of the bad-character and good-suffix rules
+            int max = bmBc[str[i+j]] - m + 1 + i;
+            if (max < bmGs[i]) max = bmGs[i];
+            j += max;
+        }
+    }
+    if (_prep == 0) free(prep); // private tables: release them on every path, match or not
+    return hit;
+}
+
+/* strstr()-like search built on kmemmem(): both str and pat are
+ * NUL-terminated strings; _prep is passed through unchanged. */
+char *kstrstr(const char *str, const char *pat, int **_prep)
+{
+    const int text_len = strlen(str);
+    const int pat_len = strlen(pat);
+    void *found = kmemmem(str, text_len, pat, pat_len, _prep);
+    return (char*)found;
+}
+
+/* Like kstrstr(), but only the first n bytes of str are searched; pat is a
+ * NUL-terminated string and _prep is passed through to kmemmem(). */
+char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
+{
+    const int pat_len = strlen(pat);
+    void *found = kmemmem(str, n, pat, pat_len, _prep);
+    return (char*)found;
+}
+
+/***********************
+ * The main() function *
+ ***********************/
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+/* Small self-test: exercises ksprintf(), ksplit(), kstrtok() and
+ * kstrstr() and prints the results on stdout. */
+int main()
+{
+    kstring_t *ks = (kstring_t*)calloc(1, sizeof(kstring_t));
+    ks_tokaux_t tok_state;
+    int *offsets, n_fields, k;
+    char *tok;
+
+    // test ksprintf()
+    ksprintf(ks, " abcdefg: %d ", 100);
+    printf("'%s'\n", ks->s);
+
+    // test ksplit()
+    offsets = ksplit(ks, 0, &n_fields);
+    k = 0;
+    while (k < n_fields) {
+        printf("field[%d] = '%s'\n", k, ks->s + offsets[k]);
+        ++k;
+    }
+
+    // test kstrtok()
+    ks->l = 0;
+    tok = kstrtok("ab:cde:fg/hij::k", ":/", &tok_state);
+    while (tok) {
+        kputsn(tok, tok_state.p - tok, ks); // one token per output line
+        kputc('\n', ks);
+        tok = kstrtok(0, 0, &tok_state);
+    }
+    printf("%s", ks->s);
+
+    // free
+    free(ks->s); free(ks); free(offsets);
+
+    {
+        // test kstrstr(): report every match of the pattern in the text
+        static char *text = "abcdefgcdgcagtcakcdcd";
+        static char *pattern = "cd";
+        char *cursor = text;
+        int *tables = 0;
+        for (;;) {
+            char *hit = kstrstr(cursor, pattern, &tables);
+            if (hit == 0) break;
+            printf("match: %s\n", hit);
+            cursor = hit + tables[0]; // resume after a shift taken from the cached tables
+        }
+        free(tables);
+    }
+    return 0;
+}
+#endif
diff --git a/htslib/md5.c b/htslib/md5.c
new file mode 100644
index 0000000..ddcfcdf
--- /dev/null
+++ b/htslib/md5.c
@@ -0,0 +1,386 @@
+/*
+ * Trivial amendments by James Bonfield <jkb at sanger.ac.uk> to provide an
+ * HTSlib interface. 2015.
+ *
+ * Externally our API uses an opaque hts_md5_context structure.
+ *
+ * Internally either this gets defined and used with the routines here
+ * or it remains incomplete and is cast to the OpenSSL MD5_CTX structure
+ * and used by routines from OpenSSL.
+ */
+
+/*
+ * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+ * MD5 Message-Digest Algorithm (RFC 1321).
+ *
+ * Homepage:
+ * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+ *
+ * Author:
+ * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
+ *
+ * This software was written by Alexander Peslyak in 2001. No copyright is
+ * claimed, and the software is hereby placed in the public domain.
+ * In case this attempt to disclaim copyright and place the software in the
+ * public domain is deemed null and void, then the software is
+ * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+ * general public under the following terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ *
+ * There's ABSOLUTELY NO WARRANTY, express or implied.
+ *
+ * (This is a heavily cut-down "BSD license".)
+ *
+ * This differs from Colin Plumb's older public domain implementation in that
+ * no exactly 32-bit integer data type is required (any 32-bit or wider
+ * unsigned integer data type will do), there's no compile-time endianness
+ * configuration, and the function prototypes match OpenSSL's. No code from
+ * Colin Plumb's implementation has been reused; this comment merely compares
+ * the properties of the two independent implementations.
+ *
+ * The primary goals of this implementation are portability and ease of use.
+ * It is meant to be fast, but not as fast as possible. Some known
+ * optimizations are not included to reduce source code size and avoid
+ * compile-time configuration.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include "htslib/hts.h"
+
+#ifndef HAVE_OPENSSL
+
+#include <string.h>
+
+/* Any 32-bit or wider unsigned integer data type will do */
+typedef unsigned int hts_md5_u32plus;
+
+/* Working state of one MD5 computation (used when OpenSSL is not available). */
+struct hts_md5_context {
+ hts_md5_u32plus lo, hi; /* message length in bytes: lo is kept to 29 bits by hts_md5_update(), hi takes the overflow */
+ hts_md5_u32plus a, b, c, d; /* chaining values, set by hts_md5_reset() and updated by body() */
+ unsigned char buffer[64]; /* holds a partial input block until 64 bytes are available */
+ hts_md5_u32plus block[16]; /* decoded input words written by SET() and read by GET() on the portable path */
+};
+
+/*
+ * The basic MD5 functions.
+ *
+ * F and G are optimized compared to their RFC 1321 definitions for
+ * architectures that lack an AND-NOT instruction, just like in Colin Plumb's
+ * implementation.
+ *
+ * H and H2 compute the same three-way XOR and differ only in how the
+ * operands are grouped; round 3 in body() alternates between the two.
+ */
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
+#define H(x, y, z) (((x) ^ (y)) ^ (z))
+#define H2(x, y, z) ((x) ^ ((y) ^ (z)))
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+
+/*
+ * The MD5 transformation for all four rounds.
+ *
+ * Adds f(b, c, d) + x + t to a, rotates a left by s bits, then adds b.
+ * The "& 0xffffffff" before the right shift discards any bits above bit 31,
+ * which matters when hts_md5_u32plus is wider than 32 bits.
+ *
+ * NOTE: this expands to three statements that are not wrapped in
+ * do { } while (0), and (a) and (s) are evaluated more than once, so it must
+ * only be used as a stand-alone statement with side-effect-free arguments.
+ */
+#define STEP(f, a, b, c, d, x, t, s) \
+ (a) += f((b), (c), (d)) + (x) + (t); \
+ (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
+ (a) += (b);
+
+/*
+ * SET reads 4 input bytes in little-endian byte order and stores them
+ * in a properly aligned word in host byte order.
+ *
+ * The check for little-endian architectures that tolerate unaligned
+ * memory accesses is just an optimization. Nothing will break if it
+ * doesn't work.
+ *
+ * Both macros rely on a variable named ptr (and, on the portable path, ctx)
+ * being in scope at the point of use. GET(n) returns the word previously
+ * produced by SET(n): on the fast path it simply re-reads the input, on the
+ * portable path it reads ctx->block[n].
+ *
+ * NOTE(review): the fast path reads unsigned char data through a
+ * hts_md5_u32plus pointer cast - confirm this is acceptable under the
+ * project's aliasing/alignment build settings.
+ */
+#if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
+#define SET(n) \
+ (*(hts_md5_u32plus *)&ptr[(n) * 4])
+#define GET(n) \
+ SET(n)
+#else
+#define SET(n) \
+ (ctx->block[(n)] = \
+ (hts_md5_u32plus)ptr[(n) * 4] | \
+ ((hts_md5_u32plus)ptr[(n) * 4 + 1] << 8) | \
+ ((hts_md5_u32plus)ptr[(n) * 4 + 2] << 16) | \
+ ((hts_md5_u32plus)ptr[(n) * 4 + 3] << 24))
+#define GET(n) \
+ (ctx->block[(n)])
+#endif
+
+/*
+ * This processes one or more 64-byte data blocks, but does NOT update
+ * the bit counters. There are no alignment requirements.
+ *
+ * ctx   context whose a/b/c/d chaining values are read on entry and written
+ *       back on exit
+ * data  start of the input blocks
+ * size  number of bytes to process; the do/while loop subtracts 64 until the
+ *       count reaches zero, so it must be a non-zero multiple of 64
+ *
+ * Returns a pointer just past the last byte consumed.
+ */
+static const void *body(hts_md5_context *ctx, const void *data, unsigned long size)
+{
+ const unsigned char *ptr;
+ hts_md5_u32plus a, b, c, d;
+ hts_md5_u32plus saved_a, saved_b, saved_c, saved_d;
+
+ ptr = (const unsigned char *)data;
+
+ a = ctx->a;
+ b = ctx->b;
+ c = ctx->c;
+ d = ctx->d;
+
+ do {
+ /* remember the state at the start of the block; it is added back after round 4 */
+ saved_a = a;
+ saved_b = b;
+ saved_c = c;
+ saved_d = d;
+
+/* Round 1 */
+ /* SET(n) decodes input word n; rounds 2-4 re-read the words through GET(n) */
+ STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
+ STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
+ STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
+ STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
+ STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
+ STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
+ STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
+ STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
+ STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
+ STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
+ STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
+ STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
+ STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
+ STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
+ STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
+ STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
+
+/* Round 2 */
+ STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
+ STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
+ STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
+ STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
+ STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
+ STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
+ STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
+ STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
+ STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
+ STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
+ STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
+ STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
+ STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
+ STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
+ STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
+ STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
+
+/* Round 3 */
+ STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
+ STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11)
+ STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
+ STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23)
+ STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
+ STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11)
+ STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
+ STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23)
+ STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
+ STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11)
+ STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
+ STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23)
+ STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
+ STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11)
+ STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
+ STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23)
+
+/* Round 4 */
+ STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
+ STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
+ STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
+ STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
+ STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
+ STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
+ STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
+ STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
+ STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
+ STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
+ STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
+ STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
+ STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
+ STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
+ STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
+ STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
+
+ a += saved_a;
+ b += saved_b;
+ c += saved_c;
+ d += saved_d;
+
+ ptr += 64;
+ } while (size -= 64);
+
+ /* publish the updated chaining values */
+ ctx->a = a;
+ ctx->b = b;
+ ctx->c = c;
+ ctx->d = d;
+
+ return ptr;
+}
+
+/* Return ctx to its starting state: zero length, initial chaining values. */
+void hts_md5_reset(hts_md5_context *ctx)
+{
+    ctx->lo = 0;
+    ctx->hi = 0;
+
+    ctx->a = 0x67452301;
+    ctx->b = 0xefcdab89;
+    ctx->c = 0x98badcfe;
+    ctx->d = 0x10325476;
+}
+
+/*
+ * Feed size bytes from data into the digest.
+ *
+ * Whole 64-byte blocks are hashed immediately; any remainder is kept in
+ * ctx->buffer until a later call (or hts_md5_final()) completes the block.
+ */
+void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size)
+{
+    const unsigned char *src = (const unsigned char *)data;
+    hts_md5_u32plus prev_lo = ctx->lo;
+    unsigned long filled;
+
+    /* lo keeps the low 29 bits of the byte count; hi takes carries and size's high bits */
+    ctx->lo = (prev_lo + size) & 0x1fffffff;
+    if (ctx->lo < prev_lo)
+        ctx->hi++;
+    ctx->hi += size >> 29;
+
+    /* bytes already waiting in the buffer from earlier calls */
+    filled = prev_lo & 0x3f;
+    if (filled != 0) {
+        unsigned long room = 64 - filled;
+
+        if (size < room) {
+            /* still not a full block: just append and wait for more */
+            memcpy(&ctx->buffer[filled], src, size);
+            return;
+        }
+
+        /* top the buffer up to one full block and hash it */
+        memcpy(&ctx->buffer[filled], src, room);
+        src += room;
+        size -= room;
+        body(ctx, ctx->buffer, 64);
+    }
+
+    /* hash as many whole blocks as possible straight from the input */
+    if (size >= 64) {
+        src = body(ctx, src, size & ~(unsigned long)0x3f);
+        size &= 0x3f;
+    }
+
+    /* stash the tail for next time */
+    memcpy(ctx->buffer, src, size);
+}
+
+/*
+ * Finish the digest: append the 0x80 marker, zero padding and the 64-bit
+ * little-endian bit length, hash the last block(s), write the 16-byte result
+ * (a, b, c, d, each little-endian) and wipe the context.
+ */
+void hts_md5_final(unsigned char *result, hts_md5_context *ctx)
+{
+    hts_md5_u32plus *length_words[2] = { &ctx->lo, &ctx->hi };
+    hts_md5_u32plus *state_words[4] = { &ctx->a, &ctx->b, &ctx->c, &ctx->d };
+    unsigned long fill, room;
+    int w, k;
+
+    fill = ctx->lo & 0x3f;
+    ctx->buffer[fill++] = 0x80;
+    room = 64 - fill;
+
+    if (room < 8) {
+        /* no space left for the length field: pad out this block and start another */
+        memset(&ctx->buffer[fill], 0, room);
+        body(ctx, ctx->buffer, 64);
+        fill = 0;
+        room = 64;
+    }
+
+    memset(&ctx->buffer[fill], 0, room - 8);
+
+    /* convert the byte count to a bit count, then store lo and hi in bytes 56..63 */
+    ctx->lo <<= 3;
+    for (w = 0; w < 2; w++)
+        for (k = 0; k < 4; k++)
+            ctx->buffer[56 + 4 * w + k] = *length_words[w] >> (8 * k);
+
+    body(ctx, ctx->buffer, 64);
+
+    for (w = 0; w < 4; w++)
+        for (k = 0; k < 4; k++)
+            result[4 * w + k] = *state_words[w] >> (8 * k);
+
+    memset(ctx, 0, sizeof(*ctx));
+}
+
+
+/* Allocate a context and put it in the initial state; returns NULL if malloc fails. */
+hts_md5_context *hts_md5_init(void)
+{
+    hts_md5_context *ctx = malloc(sizeof(*ctx));
+
+    if (ctx != NULL)
+        hts_md5_reset(ctx);
+    return ctx;
+}
+
+#else
+
+#include <openssl/md5.h>
+#include <assert.h>
+
+/*
+ * Wrappers around the OpenSSL libcrypto.so MD5 implementation.
+ *
+ * These are here to ensure they end up in the symbol table of the
+ * library regardless of the static inline in the headers.
+ */
+/* Allocate an OpenSSL MD5_CTX, initialise it, and hand it out as the opaque type. */
+hts_md5_context *hts_md5_init(void)
+{
+    MD5_CTX *md5 = malloc(sizeof(*md5));
+
+    if (md5 != NULL)
+        MD5_Init(md5);
+    return (hts_md5_context *)md5;
+}
+
+/* Re-initialise an existing context via OpenSSL. */
+void hts_md5_reset(hts_md5_context *ctx)
+{
+    MD5_CTX *md5 = (MD5_CTX *)ctx;
+    MD5_Init(md5);
+}
+
+/* Feed size bytes of data to OpenSSL's MD5_Update(). */
+void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size)
+{
+    MD5_CTX *md5 = (MD5_CTX *)ctx;
+    MD5_Update(md5, data, size);
+}
+
+/* Write the digest into result via OpenSSL's MD5_Final(). */
+void hts_md5_final(unsigned char *result, hts_md5_context *ctx)
+{
+    MD5_CTX *md5 = (MD5_CTX *)ctx;
+    MD5_Final(result, md5);
+}
+
+#endif
+
+/* Release a context obtained from hts_md5_init(); a NULL ctx is accepted. */
+void hts_md5_destroy(hts_md5_context *ctx)
+{
+    /* free(NULL) is a no-op, so no separate NULL test is needed */
+    free(ctx);
+}
+
+/*
+ * Render a 16-byte digest as 32 lowercase hex characters plus a terminating
+ * NUL; hex must have room for 33 bytes.
+ */
+void hts_md5_hex(char *hex, const unsigned char *digest)
+{
+    static const char digits[] = "0123456789abcdef";
+    const unsigned char *in = digest;
+    const unsigned char *end = digest + 16;
+    char *out = hex;
+
+    while (in != end) {
+        *out++ = digits[(*in >> 4) & 0xf];  /* high nibble first */
+        *out++ = digits[*in & 0xf];
+        in++;
+    }
+    *out = 0;
+}
diff --git a/htslib/plugin.c b/htslib/plugin.c
new file mode 100644
index 0000000..7525cc6
--- /dev/null
+++ b/htslib/plugin.c
@@ -0,0 +1,171 @@
+/* plugin.c -- low-level path parsing and plugin functions.
+
+ Copyright (C) 2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <dirent.h>
+#include <dlfcn.h>
+
+#include "hts_internal.h"
+#include "htslib/kstring.h"
+
+#ifndef PLUGINPATH
+#define PLUGINPATH ""
+#endif
+
+#ifdef __APPLE__
+#define PLUGIN_EXT ".bundle"
+#define PLUGIN_EXT_LEN 7
+#else
+#define PLUGIN_EXT ".so"
+#define PLUGIN_EXT_LEN 3
+#endif
+
+// Advance itr->pathdir through the ':'-terminated directory list until one
+// of the directories can be opened.  On success itr->entry holds that
+// directory name with a trailing '/', and itr->entry_dir_l its length.
+// Returns NULL once the list is exhausted.
+static DIR *open_nextdir(struct hts_path_itr *itr)
+{
+    DIR *dir = NULL;
+
+    do {
+        const char *term = strchr(itr->pathdir, ':');
+        if (term == NULL) return NULL;
+
+        // copy the current element into entry and step past its terminator
+        itr->entry.l = 0;
+        kputsn(itr->pathdir, term - itr->pathdir, &itr->entry);
+        itr->pathdir = term + 1;
+        if (itr->entry.l == 0) continue;  // empty element: nothing to open
+
+        dir = opendir(itr->entry.s);
+        if (dir == NULL && hts_verbose >= 4)
+            fprintf(stderr,
+                    "[W::hts_path_itr] can't scan directory \"%s\": %s\n",
+                    itr->entry.s, strerror(errno));
+    } while (dir == NULL);
+
+    if (itr->entry.s[itr->entry.l-1] != '/') kputc('/', &itr->entry);
+    itr->entry_dir_l = itr->entry.l;
+    return dir;
+}
+
+// Prepare itr to enumerate files named <prefix>*<suffix> along a search path.
+//
+// path          ':'-separated directory list; if NULL, $HTS_PATH is used (or
+//               the empty string when that is unset)
+// builtin_path  substituted for every empty element of path; defaults to
+//               PLUGINPATH when NULL
+// suffix        defaults to PLUGIN_EXT when NULL
+//
+// The first directory is opened before returning.
+void hts_path_itr_setup(struct hts_path_itr *itr, const char *path,
+    const char *builtin_path, const char *prefix, size_t prefix_len,
+    const char *suffix, size_t suffix_len)
+{
+    itr->prefix = prefix;
+    itr->prefix_len = prefix_len;
+
+    if (suffix == NULL) {
+        suffix = PLUGIN_EXT;
+        suffix_len = PLUGIN_EXT_LEN;
+    }
+    itr->suffix = suffix;
+    itr->suffix_len = suffix_len;
+
+    itr->path.s = NULL;
+    itr->path.l = itr->path.m = 0;
+    itr->entry.s = NULL;
+    itr->entry.l = itr->entry.m = 0;
+
+    if (builtin_path == NULL) builtin_path = PLUGINPATH;
+    if (path == NULL) {
+        path = getenv("HTS_PATH");
+        if (path == NULL) path = "";
+    }
+
+    // Rebuild the list in itr->path, expanding empty elements
+    for (;;) {
+        size_t len = strcspn(path, ":");
+        if (len != 0) kputsn(path, len, &itr->path);
+        else kputs(builtin_path, &itr->path);
+        kputc(':', &itr->path);
+
+        path += len;
+        if (*path != ':') break;
+        path++;
+    }
+
+    // Note that ':' now terminates entries rather than separates them
+    itr->pathdir = itr->path.s;
+    itr->dirv = open_nextdir(itr);
+}
+
+// Return the full path of the next directory entry whose name starts with
+// itr->prefix and ends with itr->suffix, moving on to the next directory when
+// one is exhausted.  Returns NULL, after freeing the iterator's buffers, when
+// no directories remain.  The returned string lives in itr->entry.
+const char *hts_path_itr_next(struct hts_path_itr *itr)
+{
+    while (itr->dirv) {
+        DIR *dir = (DIR *) itr->dirv;
+        struct dirent *ent;
+
+        while ((ent = readdir(dir)) != NULL) {
+            const char *name = ent->d_name;
+            size_t name_len = strlen(name);
+
+            if (strncmp(name, itr->prefix, itr->prefix_len) != 0) continue;
+            if (name_len < itr->suffix_len) continue;
+            if (strncmp(name + name_len - itr->suffix_len, itr->suffix,
+                        itr->suffix_len) != 0) continue;
+
+            // keep the "dir/" part of entry and append the file name
+            itr->entry.l = itr->entry_dir_l;
+            kputs(name, &itr->entry);
+            return itr->entry.s;
+        }
+
+        closedir(dir);
+        itr->dirv = open_nextdir(itr);
+    }
+
+    itr->pathdir = NULL;
+    free(itr->path.s); itr->path.s = NULL;
+    free(itr->entry.s); itr->entry.s = NULL;
+    return NULL;
+}
+
+// dlopen() filename and look up symbol in it.  On success the library handle
+// is stored in *pluginp and the symbol address returned.  On failure a warning
+// is printed (at hts_verbose >= 4), any opened library is closed, *pluginp is
+// left untouched and NULL is returned.
+void *load_plugin(void **pluginp, const char *filename, const char *symbol)
+{
+    void *lib = dlopen(filename, RTLD_NOW | RTLD_LOCAL);
+    void *sym = (lib != NULL)? dlsym(lib, symbol) : NULL;
+
+    if (sym != NULL) {
+        *pluginp = lib;
+        return sym;
+    }
+
+    if (hts_verbose >= 4)
+        fprintf(stderr, "[W::%s] can't load plugin \"%s\": %s\n",
+                __func__, filename, dlerror());
+    if (lib) dlclose(lib);
+    return NULL;
+}
+
+// Look up name in an already-loaded plugin.  When dlsym() returns NULL,
+// *errmsg is set to the dlerror() text; otherwise *errmsg is not written.
+void *plugin_sym(void *plugin, const char *name, const char **errmsg)
+{
+    void *addr = dlsym(plugin, name);
+
+    if (addr != NULL) return addr;
+    *errmsg = dlerror();
+    return NULL;
+}
+
+// dlclose() a plugin, printing a warning (at hts_verbose >= 4) if that fails.
+void close_plugin(void *plugin)
+{
+    if (dlclose(plugin) == 0) return;
+
+    if (hts_verbose >= 4)
+        fprintf(stderr, "[W::%s] dlclose() failed: %s\n",
+                __func__, dlerror());
+}
diff --git a/htslib/regidx.c b/htslib/regidx.c
new file mode 100644
index 0000000..84f18b4
--- /dev/null
+++ b/htslib/regidx.c
@@ -0,0 +1,340 @@
+/*
+ Copyright (C) 2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <config.h>
+
+#include "htslib/hts.h"
+#include "htslib/kstring.h"
+#include "htslib/kseq.h"
+#include "htslib/khash_str2int.h"
+#include "htslib/regidx.h"
+
+#define LIDX_SHIFT 13 // number of insignificant index bits
+
+// List of regions for one chromosome
+typedef struct
+{
+ int *idx, nidx; // idx[bin]: first region touching that 2^LIDX_SHIFT-wide bin, or -1; nidx: bins in use (see _regidx_build_index)
+ int nregs, mregs; // n:used, m:alloced
+ reg_t *regs; // the regions, in insertion order
+ void *payload; // nregs user records of payload_size bytes each, parallel to regs (unused when payload_size is 0)
+}
+reglist_t;
+
+// Container of all sequences
+struct _regidx_t
+{
+ int nseq, mseq; // n:used, m:alloced
+ reglist_t *seq; // regions for each sequence
+ void *seq2regs; // hash for fast lookup from chr name to regions
+ char **seq_names; // sequence names, indexed like seq[]; the strings are also the hash keys
+ regidx_free_f free; // function to free any data allocated by regidx_parse_f
+ regidx_parse_f parse; // parse one input line
+ void *usr; // user data to pass to regidx_parse_f
+
+ // temporary data for index initialization
+ kstring_t str; // scratch copy of the current line's sequence name
+ int rid_prev, start_prev, end_prev; // last inserted region, used by regidx_insert() to check sort order
+ int payload_size; // bytes of payload per region, 0 for none
+ void *payload; // scratch buffer the parser fills before it is copied into the list
+};
+
+// Number of regions stored for sequence seq, or 0 if the name is unknown.
+int regidx_seq_nregs(regidx_t *idx, const char *seq)
+{
+    int iseq;
+    int known = khash_str2int_get(idx->seq2regs, seq, &iseq) == 0;
+    return known ? idx->seq[iseq].nregs : 0;
+}
+
+// Total number of regions across all sequences.
+int regidx_nregs(regidx_t *idx)
+{
+    int total = 0;
+    int s = 0;
+    while ( s < idx->nseq )
+    {
+        total += idx->seq[s].nregs;
+        s++;
+    }
+    return total;
+}
+
+// Store the number of sequences in *n and return the index's own array of
+// their names (not a copy).
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+    n[0] = idx->nseq;
+    return idx->seq_names;
+}
+
+// Build the per-sequence bin index used by regidx_overlap().
+//
+// Positions are grouped into bins of 2^LIDX_SHIFT; for every bin touched by at
+// least one region, idx[bin] is set to the first region (in list order) that
+// touches it, and all other allocated bins hold -1.
+//
+// Returns 0 on success, or -1 if the index array could not be grown (the
+// previously allocated array is left intact in that case).
+int _regidx_build_index(regidx_t *idx)
+{
+    int iseq;
+    for (iseq=0; iseq<idx->nseq; iseq++)
+    {
+        reglist_t *list = &idx->seq[iseq];
+        int j,k, imax = 0;   // max index bin
+        for (j=0; j<list->nregs; j++)
+        {
+            int ibeg = list->regs[j].start >> LIDX_SHIFT;
+            int iend = list->regs[j].end >> LIDX_SHIFT;
+            if ( imax < iend + 1 )
+            {
+                int old_imax = imax;
+                int *tmp;
+                imax = iend + 1;
+                kroundup32(imax);
+                // keep the old pointer until realloc is known to have succeeded
+                tmp = (int*) realloc(list->idx, imax*sizeof(int));
+                if ( !tmp ) return -1;
+                list->idx = tmp;
+                for (k=old_imax; k<imax; k++) list->idx[k] = -1;
+            }
+            // record this region in every bin it touches that has no earlier region
+            for (k=ibeg; k<=iend; k++)
+                if ( list->idx[k]<0 ) list->idx[k] = j;
+            list->nidx = iend + 1;
+        }
+    }
+    return 0;
+}
+
+// Parse one line with idx->parse and append the resulting region.
+//
+// Passing line == NULL signals the end of input and builds the bin index.
+//
+// Returns 0 on success or when the parser asked to skip the line (parser
+// returned -1); returns -1 if the parser reported an error (-2), if memory
+// for the payload could not be obtained, or if the region sorts before the
+// previous region of the same sequence.  Note that in the unsorted case the
+// region has already been appended when the error is reported.
+int regidx_insert(regidx_t *idx, char *line)
+{
+    if ( !line )
+        return _regidx_build_index(idx);
+
+    char *chr_from, *chr_to;
+    reg_t reg;
+    int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr);
+    if ( ret==-2 ) return -1;   // error
+    if ( ret==-1 ) return 0;    // skip the line
+
+    // chr_to points at the last character of the name, hence the +1
+    int rid;
+    idx->str.l = 0;
+    kputsn(chr_from, chr_to-chr_from+1, &idx->str);
+    if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+    {
+        // first region of a new sequence: grow seq[] and seq_names[] in step
+        idx->nseq++;
+        int m_prev = idx->mseq;
+        hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+        hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+        idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+        rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+    }
+
+    reglist_t *list = &idx->seq[rid];
+    list->nregs++;
+    int m_prev = list->mregs;
+    hts_expand(reg_t,list->nregs,list->mregs,list->regs);
+    list->regs[list->nregs-1] = reg;
+    if ( idx->payload_size )
+    {
+        if ( m_prev < list->mregs )
+        {
+            // regs[] grew, so grow the parallel payload array to match
+            void *tmp = realloc(list->payload,idx->payload_size*list->mregs);
+            if ( !tmp )
+            {
+                list->nregs--;  // the new region has no payload slot; drop it
+                return -1;
+            }
+            list->payload = tmp;
+        }
+        memcpy((char*)list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size);
+    }
+
+    if ( idx->rid_prev==rid )
+    {
+        if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) )
+        {
+            fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n",
+                idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1);
+            return -1;
+        }
+    }
+    idx->rid_prev = rid;
+    idx->start_prev = reg.start;
+    idx->end_prev = reg.end;
+    return 0;
+}
+
+// Create a region index and, if fname is given, load it from that file.
+//
+// fname         file to read, or NULL to create an empty index that the
+//               caller fills through regidx_insert()
+// parser        line parser; when NULL, regidx_parse_bed is chosen for names
+//               ending in .bed, .bed.gz or .bed.bgz (case-insensitive) and
+//               regidx_parse_tab otherwise
+// free_f        optional callback applied to each stored payload on destroy
+// payload_size  bytes of payload per region, 0 for none
+// usr_dat       passed through to the parser
+//
+// Returns the new index, or NULL on allocation failure, if the file cannot be
+// opened, or if any line fails to insert.
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+    if ( !parser )
+    {
+        if ( !fname ) parser = regidx_parse_tab;
+        else
+        {
+            int len = strlen(fname);
+            if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
+                parser = regidx_parse_bed;
+            else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
+                parser = regidx_parse_bed;
+            else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
+                parser = regidx_parse_bed;
+            else
+                parser = regidx_parse_tab;
+        }
+    }
+
+    regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+    if ( !idx ) return NULL;
+    idx->free  = free_f;
+    idx->parse = parser;
+    idx->usr   = usr_dat;
+    idx->seq2regs = khash_str2int_init();
+    idx->rid_prev   = -1;
+    idx->start_prev = -1;
+    idx->end_prev   = -1;
+    idx->payload_size = payload_size;
+    if ( payload_size )
+    {
+        idx->payload = malloc(payload_size);
+        if ( !idx->payload )
+        {
+            regidx_destroy(idx);
+            return NULL;
+        }
+    }
+
+    if ( !fname ) return idx;
+
+    kstring_t str = {0,0,0};
+
+    htsFile *fp = hts_open(fname,"r");
+    if ( !fp ) goto error;
+
+    // NOTE(review): a negative return from hts_getline() ends the loop the
+    // same way as end-of-file - confirm whether read errors should be reported.
+    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+    {
+        if ( regidx_insert(idx, str.s) ) goto error;
+    }
+    // a NULL line finalizes the index; do not hand back a half-built one
+    if ( regidx_insert(idx, NULL) ) goto error;
+
+    free(str.s);
+    hts_close(fp);
+    return idx;
+
+error:
+    free(str.s);
+    if ( fp ) hts_close(fp);
+    regidx_destroy(idx);
+    return NULL;
+}
+
+// Free an index and everything it owns.  When a free callback was registered
+// it is applied to every stored payload record first.
+void regidx_destroy(regidx_t *idx)
+{
+    int s;
+    for (s = 0; s < idx->nseq; s++)
+    {
+        reglist_t *rl = idx->seq + s;
+        if ( idx->free )
+        {
+            int r = 0;
+            while ( r < rl->nregs )
+            {
+                idx->free(rl->payload + idx->payload_size*r);
+                r++;
+            }
+        }
+        free(rl->payload);
+        free(rl->regs);
+        free(rl->idx);
+    }
+    free(idx->seq_names);
+    free(idx->seq);
+    free(idx->str.s);
+    free(idx->payload);
+    khash_str2int_destroy_free(idx->seq2regs);
+    free(idx);
+}
+
+// Test whether any stored region on chr overlaps the interval [from,to]
+// (both ends are compared inclusively).
+//
+// Returns 1 if an overlapping region exists, 0 otherwise (including when chr
+// is unknown).  When itr is non-NULL it is reset on entry and, on a hit,
+// itr->reg points at the first overlapping region, itr->payload at its
+// payload (NULL when the index stores none), and itr->n is set to the number
+// of regions from that one to the end of the list - not all of which
+// necessarily overlap.
+int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr)
+{
+ if ( itr ) itr->i = itr->n = 0;
+
+ int iseq;
+ if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+ reglist_t *list = &idx->seq[iseq];
+ if ( !list->nregs ) return 0;
+
+ // Use the bin index to pick the region to start scanning from; queries beyond the last bin start from the last bin's entry
+ int i, ibeg = from>>LIDX_SHIFT;
+ int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
+ if ( ireg < 0 )
+ {
+ // The bin is empty: walk back to the nearest earlier non-empty bin, or start from region 0
+ // linear search; if slow, replace with binary search
+ if ( ibeg > list->nidx ) ibeg = list->nidx;
+ for (i=ibeg - 1; i>=0; i--)
+ if ( list->idx[i] >=0 ) break;
+ ireg = i>=0 ? list->idx[i] : 0;
+ }
+ // Scan forward from the starting region until a hit or until regions start past the query
+ for (i=ireg; i<list->nregs; i++)
+ {
+ if ( list->regs[i].start > to ) return 0; // no match
+ if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
+ }
+
+ if ( i>=list->nregs ) return 0; // no match
+
+ if ( !itr ) return 1;
+
+ itr->i = 0;
+ itr->n = list->nregs - i;
+ itr->reg = &idx->seq[iseq].regs[i];
+ if ( idx->payload_size )
+ itr->payload = idx->seq[iseq].payload + i*idx->payload_size;
+ else
+ itr->payload = NULL;
+
+ return 1;
+}
+
+// Parse one line as "chr<ws>start<ws>end" with the end column stored as end-1
+// and the start column stored unchanged.
+//
+// On success *chr_beg and *chr_end point at the first and last character of
+// the sequence name inside line, and reg->start / reg->end are filled in.
+// payload and usr are unused.
+//
+// Returns 0 on success, -1 for blank or '#' comment lines (skip), -2 on a
+// malformed line (a message is printed to stderr).
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+    char *ss = (char*) line;
+    // cast to unsigned char: passing a negative char to isspace() is undefined
+    while ( *ss && isspace((unsigned char)*ss) ) ss++;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    char *se = ss;
+    while ( *se && !isspace((unsigned char)*se) ) se++;
+    if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    *chr_beg = ss;
+    *chr_end = se-1;
+
+    ss = se+1;
+    reg->start = hts_parse_decimal(ss, &se, 0);
+    // Also reject a line that ends right after the start column: stepping to
+    // se+1 would otherwise read past the terminating NUL.
+    if ( ss==se || !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    ss = se+1;
+    reg->end = hts_parse_decimal(ss, &se, 0) - 1;
+    if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+    return 0;
+}
+
+// Parse one line as "chr<ws>start[<ws>end]"; both coordinates are stored
+// minus one.  When the end column is missing or not a number, reg->end is set
+// equal to reg->start.
+//
+// On success *chr_beg and *chr_end point at the first and last character of
+// the sequence name inside line.  payload and usr are unused.
+//
+// Returns 0 on success, -1 for blank or '#' comment lines (skip), -2 on a
+// malformed line (a message is printed to stderr).
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+{
+    char *ss = (char*) line;
+    // cast to unsigned char: passing a negative char to isspace() is undefined
+    while ( *ss && isspace((unsigned char)*ss) ) ss++;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    char *se = ss;
+    while ( *se && !isspace((unsigned char)*se) ) se++;
+    if ( !*se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
+
+    *chr_beg = ss;
+    *chr_end = se-1;
+
+    ss = se+1;
+    reg->start = hts_parse_decimal(ss, &se, 0) - 1;
+    if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
+
+    // nothing (or only one character) after the start column: single-position region
+    if ( !se[0] || !se[1] )
+        reg->end = reg->start;
+    else
+    {
+        ss = se+1;
+        reg->end = hts_parse_decimal(ss, &se, 0);
+        if ( ss==se ) reg->end = reg->start;
+        else reg->end--;
+    }
+
+    return 0;
+}
+
diff --git a/htslib/sam.c b/htslib/sam.c
new file mode 100644
index 0000000..66eeb2b
--- /dev/null
+++ b/htslib/sam.c
@@ -0,0 +1,2073 @@
+/* sam.c -- SAM and BAM file I/O and manipulation.
+
+ Copyright (C) 2008-2010, 2012-2016 Genome Research Ltd.
+ Copyright (C) 2010, 2012, 2013 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <ctype.h>
+#include <zlib.h>
+#include "htslib/sam.h"
+#include "htslib/bgzf.h"
+#include "cram/cram.h"
+#include "hts_internal.h"
+#include "htslib/hfile.h"
+
+#include "htslib/khash.h"
+KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
+
+typedef khash_t(s2i) sdict_t;
+
+/**********************
+ *** BAM header I/O ***
+ **********************/
+
+// Allocate a zero-filled header; returns NULL if calloc fails.
+bam_hdr_t *bam_hdr_init()
+{
+    bam_hdr_t *hdr = (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
+    return hdr;
+}
+
+// Free a header and everything it owns; a NULL h is accepted.
+void bam_hdr_destroy(bam_hdr_t *h)
+{
+    if (!h) return;
+
+    if (h->target_name != NULL) {
+        int32_t t = 0;
+        while (t < h->n_targets) {
+            free(h->target_name[t]);
+            ++t;
+        }
+        free(h->target_name);
+        // target_len is only released together with target_name
+        free(h->target_len);
+    }
+    free(h->text);
+    free(h->cigar_tab);
+    if (h->sdict != NULL) kh_destroy(s2i, (sdict_t*)h->sdict);
+    free(h);
+}
+
+// Deep-copy a header: the text and the target name/length arrays are
+// duplicated, while cigar_tab and sdict start out NULL in the copy.
+// Returns NULL if h0 is NULL or any allocation fails.
+bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
+{
+    if (h0 == NULL) return NULL;
+    bam_hdr_t *h;
+    if ((h = bam_hdr_init()) == NULL) return NULL;
+    // copy the simple data
+    // n_targets counts only the names duplicated so far, so that the failure
+    // path frees exactly what exists; it gets its final value before returning.
+    h->n_targets = 0;
+    h->ignore_sam_err = h0->ignore_sam_err;
+    h->l_text = h0->l_text;
+    // Then the pointery stuff
+    h->cigar_tab = NULL;
+    h->sdict = NULL;
+    h->text = (char*)calloc(h->l_text + 1, 1);
+    if (h->text == NULL) goto fail;
+    if (h0->text != NULL) memcpy(h->text, h0->text, h->l_text);
+    if (h0->n_targets > 0) {
+        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
+        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
+        if (h->target_len == NULL || h->target_name == NULL) goto fail;
+    }
+    int i;
+    for (i = 0; i < h0->n_targets; ++i) {
+        h->target_len[i] = h0->target_len[i];
+        h->target_name[i] = strdup(h0->target_name[i]);
+        if (h->target_name[i] == NULL) goto fail;
+        h->n_targets = i + 1;
+    }
+    h->n_targets = h0->n_targets;
+    return h;
+
+fail:
+    // bam_hdr_destroy() frees target_len only when target_name is set
+    if (h->target_name == NULL) { free(h->target_len); h->target_len = NULL; }
+    bam_hdr_destroy(h);
+    return NULL;
+}
+
+
+// Build a header from a name dictionary whose values pack the target index
+// in the upper 32 bits and the target length in the lower 32 bits.
+// The header takes d as its sdict and borrows the key strings as target
+// names; on return every dictionary value has been reduced to the index.
+static bam_hdr_t *hdr_from_dict(sdict_t *d)
+{
+    bam_hdr_t *hdr = bam_hdr_init();
+    khint_t it;
+
+    hdr->sdict = d;
+    hdr->n_targets = kh_size(d);
+    // TODO Check for memory allocation failures
+    hdr->target_len = (uint32_t*)malloc(sizeof(uint32_t) * hdr->n_targets);
+    hdr->target_name = (char**)malloc(sizeof(char*) * hdr->n_targets);
+    for (it = kh_begin(d); it != kh_end(d); ++it) {
+        if (kh_exist(d, it)) {
+            int64_t packed = kh_val(d, it);
+            int64_t tid = packed>>32;
+            hdr->target_name[tid] = (char*)kh_key(d, it);
+            hdr->target_len[tid] = packed<<32>>32;
+            kh_val(d, it) = tid;
+        }
+    }
+    return hdr;
+}
+
+// Read a BAM header from fp: the "BAM\1" magic, the header text, and the
+// list of target names and lengths.
+//
+// Returns a newly allocated header, or NULL on a bad magic number, a read
+// error or truncation, an invalid field, or memory exhaustion.  Messages are
+// printed to stderr subject to hts_verbose.
+bam_hdr_t *bam_hdr_read(BGZF *fp)
+{
+ bam_hdr_t *h;
+ char buf[4];
+ int magic_len, has_EOF;
+ int32_t i, name_len, num_names = 0;
+ size_t bufsize;
+ ssize_t bytes;
+ // check EOF
+ // A missing or uncheckable EOF marker only produces a warning; reading continues
+ has_EOF = bgzf_check_EOF(fp);
+ if (has_EOF < 0) {
+ perror("[W::bam_hdr_read] bgzf_check_EOF");
+ } else if (has_EOF == 0 && hts_verbose >= 2)
+ fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
+ // read "BAM1"
+ magic_len = bgzf_read(fp, buf, 4);
+ if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
+ if (hts_verbose >= 1) fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__);
+ return 0;
+ }
+ h = bam_hdr_init();
+ if (!h) goto nomem;
+
+ // read plain text and the number of reference sequences
+ bytes = bgzf_read(fp, &h->l_text, 4);
+ if (bytes != 4) goto read_err;
+ if (fp->is_be) ed_swap_4p(&h->l_text);
+
+ bufsize = ((size_t) h->l_text) + 1;
+ if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
+ h->text = (char*)malloc(bufsize);
+ if (!h->text) goto nomem;
+ h->text[h->l_text] = 0; // make sure it is NULL terminated
+ bytes = bgzf_read(fp, h->text, h->l_text);
+ if (bytes != h->l_text) goto read_err;
+
+ bytes = bgzf_read(fp, &h->n_targets, 4);
+ if (bytes != 4) goto read_err;
+ if (fp->is_be) ed_swap_4p(&h->n_targets);
+
+ if (h->n_targets < 0) goto invalid;
+
+ // read reference sequence names and lengths
+ if (h->n_targets > 0) {
+ h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
+ if (!h->target_name) goto nomem;
+ h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
+ if (!h->target_len) goto nomem;
+ }
+ else {
+ h->target_name = NULL;
+ h->target_len = NULL;
+ }
+
+ for (i = 0; i != h->n_targets; ++i) {
+ bytes = bgzf_read(fp, &name_len, 4);
+ if (bytes != 4) goto read_err;
+ if (fp->is_be) ed_swap_4p(&name_len);
+ if (name_len <= 0) goto invalid;
+
+ h->target_name[i] = (char*)malloc(name_len);
+ if (!h->target_name[i]) goto nomem;
+ // num_names tracks how many names exist so the cleanup path frees only those
+ num_names++;
+
+ bytes = bgzf_read(fp, h->target_name[i], name_len);
+ if (bytes != name_len) goto read_err;
+
+ if (h->target_name[i][name_len - 1] != '\0') {
+ /* Fix missing NUL-termination. Is this being too nice?
+ We could alternatively bail out with an error. */
+ char *new_name;
+ if (name_len == INT32_MAX) goto invalid;
+ new_name = realloc(h->target_name[i], name_len + 1);
+ if (new_name == NULL) goto nomem;
+ h->target_name[i] = new_name;
+ h->target_name[i][name_len] = '\0';
+ }
+
+ bytes = bgzf_read(fp, &h->target_len[i], 4);
+ if (bytes != 4) goto read_err;
+ if (fp->is_be) ed_swap_4p(&h->target_len[i]);
+ }
+ return h;
+
+ // Error exits: each label reports its own message, then all of them release h at clean
+ nomem:
+ if (hts_verbose >= 1) fprintf(stderr, "[E::%s] out of memory\n", __func__);
+ goto clean;
+
+ read_err:
+ if (hts_verbose >= 1) {
+ if (bytes < 0) {
+ fprintf(stderr, "[E::%s] error reading BGZF stream\n", __func__);
+ } else {
+ fprintf(stderr, "[E::%s] truncated bam header\n", __func__);
+ }
+ }
+ goto clean;
+
+ invalid:
+ if (hts_verbose >= 1) {
+ fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__);
+ }
+
+ clean:
+ if (h != NULL) {
+ h->n_targets = num_names; // ensure we free only allocated target_names
+ bam_hdr_destroy(h);
+ }
+ return NULL;
+}
+
+// Write h to fp as a BAM header: magic, text length, text, target count,
+// then for each target its name length, name (including the NUL) and length.
+// When fp->is_be is set, each 32-bit field is passed through ed_swap_4()
+// before being written.  The stream is flushed at the end.
+// Returns 0 on success, -1 as soon as any write or the flush fails.
+int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
+{
+    char magic[4];
+    int32_t t, name_len, swapped;
+
+    // write "BAM1"
+    strncpy(magic, "BAM\1", 4);
+    if (bgzf_write(fp, magic, 4) < 0) return -1;
+
+    // text length
+    if (fp->is_be) {
+        swapped = ed_swap_4(h->l_text);
+        if (bgzf_write(fp, &swapped, 4) < 0) return -1;
+    } else if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
+
+    // the text itself, skipped when empty
+    if (h->l_text && bgzf_write(fp, h->text, h->l_text) < 0) return -1;
+
+    // number of reference sequences
+    if (fp->is_be) {
+        swapped = ed_swap_4(h->n_targets);
+        if (bgzf_write(fp, &swapped, 4) < 0) return -1;
+    } else if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
+
+    // sequence names and lengths
+    for (t = 0; t != h->n_targets; ++t) {
+        char *name = h->target_name[t];
+        name_len = strlen(name) + 1;
+
+        if (fp->is_be) {
+            swapped = ed_swap_4(name_len);
+            if (bgzf_write(fp, &swapped, 4) < 0) return -1;
+        } else if (bgzf_write(fp, &name_len, 4) < 0) return -1;
+
+        if (bgzf_write(fp, name, name_len) < 0) return -1;
+
+        if (fp->is_be) {
+            swapped = ed_swap_4(h->target_len[t]);
+            if (bgzf_write(fp, &swapped, 4) < 0) return -1;
+        } else if (bgzf_write(fp, &h->target_len[t], 4) < 0) return -1;
+    }
+
+    return bgzf_flush(fp) < 0 ? -1 : 0;
+}
+
+ /*
+  * Look up the index of the reference sequence called ref.
+  * The name -> index hash is built from h->target_name on the first call
+  * and cached in h->sdict for later calls.
+  * Returns the index, or -1 if the name is not in the table.
+  */
+ int bam_name2id(bam_hdr_t *h, const char *ref)
+ {
+     sdict_t *dict = (sdict_t*)h->sdict;
+     khint_t slot;
+
+     if (dict == 0) {
+         // first use: populate the dictionary from the header's name list
+         int tid, is_new;
+         dict = kh_init(s2i);
+         for (tid = 0; tid < h->n_targets; ++tid) {
+             slot = kh_put(s2i, dict, h->target_name[tid], &is_new);
+             kh_val(dict, slot) = tid;
+         }
+         h->sdict = dict;
+     }
+
+     slot = kh_get(s2i, dict, ref);
+     if (slot == kh_end(dict)) return -1;
+     return kh_val(dict, slot);
+ }
+
+/*************************
+ *** BAM alignment I/O ***
+ *************************/
+
+ /* Allocate one zero-filled alignment record; returns NULL if calloc fails. */
+ bam1_t *bam_init1()
+ {
+     bam1_t *rec = (bam1_t*)calloc(1, sizeof(*rec));
+     return rec;
+ }
+
+ /* Free a record and its data buffer. Passing a null pointer is a no-op. */
+ void bam_destroy1(bam1_t *b)
+ {
+     if (b != 0) {
+         free(b->data);
+         free(b);
+     }
+ }
+
+ /*
+  * Copy the record bsrc into the existing record bdst, reusing (and growing
+  * when necessary) bdst's own data buffer.
+  *
+  * Returns bdst on success. Returns NULL if the buffer could not be grown;
+  * in that case bdst is left exactly as it was on entry.
+  */
+ bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+ {
+     uint8_t *data = bdst->data;
+     int m_data = bdst->m_data; // backup data and m_data
+     if (m_data < bsrc->l_data) { // grow to the next power of two
+         uint8_t *grown;
+         m_data = bsrc->l_data; kroundup32(m_data);
+         grown = (uint8_t*)realloc(data, m_data);
+         if (grown == NULL) return NULL; // old buffer is still owned by bdst
+         data = grown;
+     }
+     memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
+     *bdst = *bsrc; // copy the rest
+     // restore the backup so bdst keeps its own buffer, not bsrc's
+     bdst->m_data = m_data;
+     bdst->data = data;
+     return bdst;
+ }
+
+ /*
+  * Return a newly allocated copy of bsrc, to be released with
+  * bam_destroy1(). Returns NULL if bsrc is NULL or if allocation or
+  * copying fails.
+  */
+ bam1_t *bam_dup1(const bam1_t *bsrc)
+ {
+     bam1_t *bdst;
+     if (bsrc == NULL) return NULL;
+     bdst = bam_init1();
+     if (bdst == NULL) return NULL;
+     if (bam_copy1(bdst, bsrc) == NULL) {
+         // do not leak the fresh record when the copy reports failure
+         bam_destroy1(bdst);
+         return NULL;
+     }
+     return bdst;
+ }
+
+ /*
+  * Sum the lengths of the CIGAR operations whose bam_cigar_type() has
+  * bit 1 set.
+  */
+ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
+ {
+     int idx = 0, total = 0;
+     while (idx < n_cigar) {
+         const uint32_t op = cigar[idx++];
+         if (bam_cigar_type(bam_cigar_op(op)) & 1)
+             total += bam_cigar_oplen(op);
+     }
+     return total;
+ }
+
+ /*
+  * Sum the lengths of the CIGAR operations whose bam_cigar_type() has
+  * bit 2 set.
+  */
+ int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
+ {
+     int idx = 0, total = 0;
+     while (idx < n_cigar) {
+         const uint32_t op = cigar[idx++];
+         if (bam_cigar_type(bam_cigar_op(op)) & 2)
+             total += bam_cigar_oplen(op);
+     }
+     return total;
+ }
+
+ /*
+  * Return the end coordinate of an alignment: pos plus the CIGAR's
+  * reference length for a record that is not flagged unmapped and has a
+  * CIGAR, otherwise pos + 1.
+  */
+ int32_t bam_endpos(const bam1_t *b)
+ {
+     const int mapped = !(b->core.flag & BAM_FUNMAP);
+     int32_t span = 1;
+     if (mapped && b->core.n_cigar > 0)
+         span = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
+     return b->core.pos + span;
+ }
+
+ /*
+  * Map an aux type code to the size in bytes of a value of that type.
+  * The variable-length types 'Z', 'H' and 'B' have no fixed size and are
+  * reported as their own type code; an unknown code yields 0.
+  */
+ static inline int aux_type2size(uint8_t type)
+ {
+     if (type == 'A' || type == 'c' || type == 'C') return 1;
+     if (type == 's' || type == 'S') return 2;
+     if (type == 'i' || type == 'I' || type == 'f') return 4;
+     if (type == 'd') return 8;
+     if (type == 'Z' || type == 'H' || type == 'B') return type;
+     return 0;
+ }
+
+ /*
+  * Byte-swap, in place, the multi-byte values stored in a record's data
+  * block: every CIGAR word and every multi-byte aux value, including the
+  * element counts and elements of 'B' arrays.
+  *
+  * c        core fields whose sizes locate the CIGAR and aux regions
+  * l_data   number of bytes in data
+  * data     the record's variable-length data; modified in place
+  * is_host  non-zero when 'B' array counts are currently readable as-is,
+  *          so each count is read before it is swapped; zero when a count
+  *          must be swapped before it can be read
+  *
+  * NOTE(review): the aux walk trusts the data to be well formed; no check
+  * keeps s within data + l_data while a field is being skipped.
+  */
+ static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
+ {
+ uint8_t *s;
+ // the CIGAR words start right after the query name
+ uint32_t *cigar = (uint32_t*)(data + c->l_qname);
+ uint32_t i, n;
+ // aux data begins after qname, CIGAR, l_qseq bytes and (l_qseq+1)/2 bytes
+ s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+ for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
+ while (s < data + l_data) {
+ int size;
+ s += 2; // skip key
+ size = aux_type2size(*s); ++s; // skip type
+ switch (size) {
+ case 1: ++s; break;
+ case 2: ed_swap_2p(s); s += 2; break;
+ case 4: ed_swap_4p(s); s += 4; break;
+ case 8: ed_swap_8p(s); s += 8; break;
+ case 'Z':
+ case 'H':
+ // NUL-terminated string: nothing to swap, skip past the terminator
+ while (*s) ++s;
+ ++s;
+ break;
+ case 'B':
+ // array: element type byte, 4-byte element count, then the elements
+ size = aux_type2size(*s); ++s;
+ if (is_host) memcpy(&n, s, 4), ed_swap_4p(s);
+ else ed_swap_4p(s), memcpy(&n, s, 4);
+ s += 4;
+ switch (size) {
+ case 1: s += n; break;
+ case 2: for (i = 0; i < n; ++i, s += 2) ed_swap_2p(s); break;
+ case 4: for (i = 0; i < n; ++i, s += 4) ed_swap_4p(s); break;
+ case 8: for (i = 0; i < n; ++i, s += 8) ed_swap_8p(s); break;
+ }
+ break;
+ }
+ }
+ }
+
+ /*
+  * Read one alignment record from a BGZF stream into b.
+  *
+  * Returns 4 + block_len (the number of bytes consumed) on success,
+  *   -1 at a clean end of file (nothing left to read),
+  *   -2 if the 4-byte block length is truncated,
+  *   -3 if the 32-byte fixed part is truncated,
+  *   -4 if the record is inconsistent, memory cannot be obtained, or the
+  *      variable-length part is truncated.
+  *
+  * If the data buffer cannot be grown, b->data and b->m_data keep their
+  * previous values, so the record can still be reused or destroyed.
+  */
+ int bam_read1(BGZF *fp, bam1_t *b)
+ {
+     bam1_core_t *c = &b->core;
+     int32_t block_len, ret, i;
+     uint32_t x[8];
+     if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
+         if (ret == 0) return -1; // normal end-of-file
+         else return -2; // truncated
+     }
+     if (bgzf_read(fp, x, 32) != 32) return -3;
+     if (fp->is_be) {
+         ed_swap_4p(&block_len);
+         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
+     }
+     // unpack the eight fixed 32-bit words into the core fields
+     c->tid = x[0]; c->pos = x[1];
+     c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+     c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+     c->l_qseq = x[4];
+     c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+     b->l_data = block_len - 32;
+     if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4;
+     // the fixed-layout part must fit inside the advertised data length
+     if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data)
+         return -4;
+     if (b->m_data < b->l_data) {
+         int new_m = b->l_data;
+         uint8_t *new_data;
+         kroundup32(new_m);
+         new_data = (uint8_t*)realloc(b->data, new_m);
+         if (!new_data)
+             return -4; // old buffer and capacity are left untouched
+         b->data = new_data;
+         b->m_data = new_m;
+     }
+     if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4;
+     //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
+     return 4 + block_len;
+ }
+
+ /*
+  * Write one alignment record to a BGZF stream: the 4-byte block length,
+  * the eight packed 32-bit core words, then the variable-length data.
+  * When fp->is_be is set everything is byte-swapped for output and the
+  * record's data is swapped back afterwards.
+  *
+  * Returns 4 + block length on success, -1 if any step failed.
+  */
+ int bam_write1(BGZF *fp, const bam1_t *b)
+ {
+     const bam1_core_t *core = &b->core;
+     uint32_t block_len = b->l_data + 32;
+     uint32_t words[8], len_out;
+     int k, ok;
+
+     words[0] = core->tid;
+     words[1] = core->pos;
+     words[2] = (uint32_t)core->bin<<16 | core->qual<<8 | core->l_qname;
+     words[3] = (uint32_t)core->flag<<16 | core->n_cigar;
+     words[4] = core->l_qseq;
+     words[5] = core->mtid;
+     words[6] = core->mpos;
+     words[7] = core->isize;
+
+     ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
+     if (!fp->is_be) {
+         if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
+     } else {
+         for (k = 0; k < 8; ++k) ed_swap_4p(words + k);
+         len_out = block_len;
+         if (ok) ok = (bgzf_write(fp, ed_swap_4p(&len_out), 4) >= 0);
+         // put the data into output byte order for the write below
+         swap_data(core, b->l_data, b->data, 1);
+     }
+     if (ok) ok = (bgzf_write(fp, words, 32) >= 0);
+     if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0);
+     // undo the in-place swap so the caller's record is unchanged
+     if (fp->is_be) swap_data(core, b->l_data, b->data, 0);
+
+     if (!ok) return -1;
+     return 4 + block_len;
+ }
+
+/********************
+ *** BAM indexing ***
+ ********************/
+
+ /*
+  * Build an index for the BAM stream fp by reading its header and then
+  * every record.
+  *
+  * min_shift > 0 selects a CSI index with that minimum shift and a level
+  * count derived from the longest reference; otherwise a BAI index
+  * (min_shift 14, 5 levels) is built.
+  *
+  * Returns the finished index, or NULL if the header cannot be read, an
+  * allocation fails, hts_idx_push() rejects a record, or the file is
+  * corrupted.
+  */
+ static hts_idx_t *bam_index(BGZF *fp, int min_shift)
+ {
+     int n_lvls, i, fmt, ret;
+     bam1_t *b = NULL;
+     hts_idx_t *idx;
+     bam_hdr_t *h;
+     h = bam_hdr_read(fp);
+     if (h == NULL) return NULL;
+     if (min_shift > 0) {
+         int64_t max_len = 0, s;
+         for (i = 0; i < h->n_targets; ++i)
+             if (max_len < h->target_len[i]) max_len = h->target_len[i];
+         max_len += 256;
+         // shift in 64 bits so that a large min_shift cannot overflow an int
+         for (n_lvls = 0, s = (int64_t)1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
+         fmt = HTS_FMT_CSI;
+     } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
+     idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
+     bam_hdr_destroy(h);
+     if (idx == NULL) return NULL;
+     b = bam_init1();
+     if (b == NULL) goto err;
+     while ((ret = bam_read1(fp, b)) >= 0) {
+         ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
+         if (ret < 0) goto err; // unsorted
+     }
+     if (ret < -1) goto err; // corrupted BAM file
+
+     hts_idx_finish(idx, bgzf_tell(fp));
+     bam_destroy1(b);
+     return idx;
+
+ err:
+     bam_destroy1(b); // accepts NULL
+     hts_idx_destroy(idx);
+     return NULL;
+ }
+
+ /*
+  * Build and save an index for the alignment file fn.
+  *
+  * fnidx      index file name handed to the saving routines (may be NULL)
+  * min_shift  > 0 requests a CSI index for BAM input, otherwise BAI
+  *
+  * Returns -2 if fn cannot be opened, -3 if its format is neither CRAM
+  * nor BAM, -1 if the BAM index could not be built; otherwise the result
+  * of cram_index_build() or hts_idx_save_as().
+  */
+ int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
+ {
+     int status;
+     htsFile *in = hts_open(fn, "r");
+     if (in == 0) return -2;
+
+     if (in->format.format == cram) {
+         status = cram_index_build(in->fp.cram, fn, fnidx);
+     } else if (in->format.format == bam) {
+         hts_idx_t *index = bam_index(in->fp.bgzf, min_shift);
+         if (!index) {
+             status = -1;
+         } else {
+             status = hts_idx_save_as(index, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
+             hts_idx_destroy(index);
+         }
+     } else {
+         status = -3;
+     }
+
+     hts_close(in);
+     return status;
+ }
+
+ /* Same as sam_index_build2() with no explicit index file name. */
+ int sam_index_build(const char *fn, int min_shift)
+ {
+     const char *no_index_name = NULL;
+     return sam_index_build2(fn, no_index_name, min_shift);
+ }
+
+ // Provide the bam_index_build() symbol for binary compatibility with
+ // earlier HTSlib; it behaves like sam_index_build2() with a NULL fnidx.
+ #undef bam_index_build
+ int bam_index_build(const char *fn, int min_shift)
+ {
+     const char *no_index_name = NULL;
+     return sam_index_build2(fn, no_index_name, min_shift);
+ }
+
+ /*
+  * Iterator callback for BAM: read the next record into bv and, when the
+  * read succeeds, report its reference id, start and end through
+  * tid/beg/end. Returns whatever bam_read1() returned.
+  */
+ static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
+ {
+     bam1_t *rec = bv;
+     int status = bam_read1(fp, rec);
+     if (status < 0) return status;
+     *tid = rec->core.tid;
+     *beg = rec->core.pos;
+     *end = bam_endpos(rec);
+     return status;
+ }
+
+ // Iterator callback for CRAM: fetch the next record into bv.
+ // Only used with read_rest=1 iterators, so tid/beg/end are left unset.
+ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
+ {
+     bam1_t *rec = bv;
+     cram_fd *fd = ((htsFile *) fpv)->fp.cram;
+     return cram_get_bam_seq(fd, &rec);
+ }
+
+ // Iterator callback used when no index is available: read the next record
+ // from a BAM or CRAM file. Only used with read_rest=1 iterators, so
+ // tid/beg/end are left unset. Any other format aborts the program.
+ static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
+ {
+     htsFile *file = fpv;
+     bam1_t *rec = bv;
+
+     if (file->format.format == bam)
+         return bam_read1(bgzfp, rec);
+     if (file->format.format == cram)
+         return cram_get_bam_seq(file->fp.cram, &rec);
+
+     // TODO Need headers available to implement this for SAM files
+     fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n");
+     abort();
+ }
+
+ /*
+  * Load the index belonging to the alignment file fn.
+  *
+  * BAM:  loads fnidx when given, otherwise a BAI index found from fn.
+  * CRAM: loads the CRAM index into fp's cram_fd and returns a small
+  *       stand-in object (fmt HTS_FMT_CRAI) that points at that cram_fd.
+  * Any other format yields NULL, as do load or allocation failures.
+  */
+ hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
+ {
+     if (fp->format.format == bam) {
+         if (fnidx) return hts_idx_load2(fn, fnidx);
+         return hts_idx_load(fn, HTS_FMT_BAI);
+     }
+
+     if (fp->format.format == cram) {
+         hts_cram_idx_t *fake;
+         if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
+         // Cons up a fake "index" just pointing at the associated cram_fd:
+         fake = malloc(sizeof (hts_cram_idx_t));
+         if (fake == NULL) return NULL;
+         fake->fmt = HTS_FMT_CRAI;
+         fake->cram = fp->fp.cram;
+         return (hts_idx_t *) fake;
+     }
+
+     return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
+ }
+
+ /* Same as sam_index_load2() with no explicit index file name. */
+ hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
+ {
+     const char *no_index_name = NULL;
+     return sam_index_load2(fp, fn, no_index_name);
+ }
+
+ /*
+  * Create an iterator over a CRAM file. The iterator is a dummy whose
+  * read_rest flag makes hts_itr_next() simply call readrec; the actual
+  * region selection is done by setting CRAM_OPT_RANGE on the cram_fd.
+  *
+  * tid >= 0 or HTS_IDX_NOCOOR  set the range; a -2 result from
+  *                             cram_set_option() marks the iterator as
+  *                             finished, any other failure returns NULL
+  * HTS_IDX_REST                iterate from the current position
+  * HTS_IDX_NONE                iterator that is already finished
+  * any other tid               not implemented: aborts the program
+  *
+  * Returns the new iterator, or NULL on allocation or range failure.
+  */
+ static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
+ {
+     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
+     hts_itr_t *it = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
+     if (it == NULL) return NULL;
+
+     it->read_rest = 1;
+     it->off = NULL;
+     it->bins.a = NULL;
+     it->readrec = readrec;
+
+     if (tid >= 0 || tid == HTS_IDX_NOCOOR) {
+         cram_range r = { tid == HTS_IDX_NOCOOR ? -1 : tid, beg+1, end };
+         int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
+
+         if (ret != 0 && ret != -2) {
+             free(it);
+             return NULL;
+         }
+
+         it->curr_off = 0;
+         // Not needed by hts_itr_next(), but kept for user code to inspect.
+         it->tid = tid;
+         it->beg = beg;
+         it->end = end;
+
+         // -2: no data vs this ref, so the iterator is complete already
+         // (same as HTS_IDX_NONE).
+         if (ret == -2) it->finished = 1;
+     }
+     else if (tid == HTS_IDX_REST) {
+         it->curr_off = 0;
+     }
+     else if (tid == HTS_IDX_NONE) {
+         it->curr_off = 0;
+         it->finished = 1;
+     }
+     else {
+         fprintf(stderr, "[cram_itr_query] tid=%d not implemented for CRAM files -- Exiting\n", tid);
+         abort();
+     }
+
+     return it;
+ }
+
+ /*
+  * Create an iterator for reference tid over [beg, end).
+  * With no index the generic iterator is used with the BAM/CRAM reader;
+  * a CRAI stand-in index goes through cram_itr_query(); anything else is
+  * treated as a BAM index.
+  */
+ hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ {
+     if (idx == NULL)
+         return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
+
+     if (((const hts_cram_idx_t *) idx)->fmt == HTS_FMT_CRAI)
+         return cram_itr_query(idx, tid, beg, end, cram_readrec);
+
+     return hts_itr_query(idx, tid, beg, end, bam_readrec);
+ }
+
+ /* Name-to-id callback for CRAM: look ref up in the cram_fd's header. */
+ static int cram_name2id(void *fdv, const char *ref)
+ {
+     cram_fd *cram = (cram_fd *) fdv;
+     int id = sam_hdr_name2ref(cram->header, ref);
+     return id;
+ }
+
+ /*
+  * Create an iterator for a region given as text. A CRAI stand-in index
+  * resolves names through the cram_fd; any other index resolves them
+  * through hdr.
+  * NOTE(review): idx is dereferenced unconditionally, so unlike
+  * sam_itr_queryi() a NULL idx is not accepted here.
+  */
+ hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
+ {
+     const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
+     const int is_crai = (cidx->fmt == HTS_FMT_CRAI);
+
+     if (!is_crai)
+         return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
+
+     return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
+ }
+
+/**********************
+ *** SAM header I/O ***
+ **********************/
+
+#include "htslib/kseq.h"
+#include "htslib/kstring.h"
+
+ /*
+  * Build a header from SAM header text by collecting the SN (name) and
+  * LN (length) fields of every "@SQ" line into a dictionary, which is then
+  * handed to hdr_from_dict().
+  *
+  * l_text is not used; text must be NUL-terminated. A sequence name that
+  * was already seen is reported (at hts_verbose >= 2) and ignored. @SQ
+  * lines lacking SN or a non-negative LN are skipped.
+  *
+  * NOTE(review): failure of kh_init() or calloc() is not handled here.
+  */
+ bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
+ {
+     const char *q, *r, *p;
+     khash_t(s2i) *d;
+     d = kh_init(s2i);
+     for (p = text; *p; ++p) {
+         if (strncmp(p, "@SQ\t", 4) == 0) {
+             char *sn = 0;
+             int ln = -1;
+             for (q = p + 4;; ++q) {
+                 if (strncmp(q, "SN:", 3) == 0) {
+                     q += 3;
+                     for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
+                     free(sn); // a repeated SN: on the same line replaces the earlier one
+                     sn = (char*)calloc(r - q + 1, 1);
+                     strncpy(sn, q, r - q);
+                     q = r;
+                 } else if (strncmp(q, "LN:", 3) == 0)
+                     ln = strtol(q + 3, (char**)&q, 10);
+                 // advance to the end of the current field
+                 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
+                 if (*q == '\0' || *q == '\n') break;
+             }
+             p = q;
+             if (sn && ln >= 0) {
+                 khint_t k;
+                 int absent;
+                 k = kh_put(s2i, d, sn, &absent);
+                 if (!absent) {
+                     if (hts_verbose >= 2)
+                         fprintf(stderr, "[W::%s] duplicated sequence '%s'\n", __func__, sn);
+                     free(sn);
+                 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln; // index in the high word, length in the low word
+             } else {
+                 free(sn); // name was never stored in the dictionary
+             }
+         }
+         while (*p != '\0' && *p != '\n') ++p;
+         // Without this the loop's ++p would step past the terminating NUL
+         // when the text does not end with a newline.
+         if (*p == '\0') break;
+     }
+     return hdr_from_dict(d);
+ }
+
+ /*
+  * Read the header of an open alignment file.
+  *
+  * BAM and CRAM are delegated to bam_hdr_read() and cram_header_to_bam().
+  * For SAM, the leading '@' lines are collected; if none of them is an
+  * @SQ line and fp->fn_aux names a file, @SQ lines are synthesised from
+  * the first two tab-separated columns of that file (lines with fewer than
+  * two columns are skipped). The collected text becomes the header's text.
+  *
+  * Returns the header, or NULL if fn_aux cannot be opened or the text
+  * cannot be parsed. Any other format aborts the program.
+  */
+ bam_hdr_t *sam_hdr_read(htsFile *fp)
+ {
+     switch (fp->format.format) {
+     case bam:
+         return bam_hdr_read(fp->fp.bgzf);
+
+     case cram:
+         return cram_header_to_bam(fp->fp.cram->header);
+
+     case sam: {
+         kstring_t str;
+         bam_hdr_t *h;
+         int has_SQ = 0;
+         str.l = str.m = 0; str.s = 0;
+         while (hts_getline(fp, KS_SEP_LINE, &fp->line) >= 0) {
+             // the first non-header line stays in fp->line for the record reader
+             if (fp->line.s[0] != '@') break;
+             if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
+             kputsn(fp->line.s, fp->line.l, &str);
+             kputc('\n', &str);
+         }
+         if (! has_SQ && fp->fn_aux) {
+             char line[2048];
+             FILE *f = fopen(fp->fn_aux, "r");
+             if (f == NULL) {
+                 free(str.s); // do not leak the text gathered so far
+                 return NULL;
+             }
+             while (fgets(line, sizeof line, f)) {
+                 const char *name = strtok(line, "\t");
+                 const char *length = strtok(NULL, "\t");
+                 // passing a null pointer to %s is undefined; skip short lines
+                 if (name == NULL || length == NULL) continue;
+                 ksprintf(&str, "@SQ\tSN:%s\tLN:%s\n", name, length);
+             }
+             fclose(f);
+         }
+         if (str.l == 0) kputsn("", 0, &str); // make sure str.s is a valid empty string
+         h = sam_hdr_parse(str.l, str.s);
+         if (h == NULL) {
+             free(str.s);
+             return NULL;
+         }
+         h->l_text = str.l; h->text = str.s; // the header takes over the text buffer
+         return h;
+     }
+
+     default:
+         abort();
+     }
+ }
+
+ /*
+  * Write header h to an open output file.
+  *
+  * The generic binary_format / text_format requests are first resolved to
+  * bam / sam. For SAM the header text is written as-is, followed by one
+  * generated @SQ line per reference when the text contains no "@SQ\t".
+  *
+  * Returns 0 on success and -1 on failure. An unsupported format aborts
+  * the program.
+  */
+ int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
+ {
+     switch (fp->format.format) {
+     case binary_format:
+         fp->format.category = sequence_data;
+         fp->format.format = bam;
+         /* fall-through */
+     case bam:
+         if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
+         break;
+
+     case cram: {
+         cram_fd *fd = fp->fp.cram;
+         SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
+         if (! hdr) return -1;
+         if (cram_set_header(fd, hdr) < 0) return -1;
+         if (fp->fn_aux)
+             cram_load_reference(fd, fp->fn_aux);
+         if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
+     }
+         break;
+
+     case text_format:
+         fp->format.category = sequence_data;
+         fp->format.format = sam;
+         /* fall-through */
+     case sam: {
+         char *p;
+         // a failed write of the header text must not be reported as success
+         if (hputs(h->text, fp->fp.hfile) != 0) return -1;
+         p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
+         if (p == 0) {
+             int i;
+             for (i = 0; i < h->n_targets; ++i) {
+                 fp->line.l = 0;
+                 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
+                 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
+                 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
+             }
+         }
+         if ( hflush(fp->fp.hfile) != 0 ) return -1;
+     }
+         break;
+
+     default:
+         abort();
+     }
+     return 0;
+ }
+
+/**********************
+ *** SAM record I/O ***
+ **********************/
+
+ /*
+  * Parse one SAM text line into the record b.
+  *
+  * s  the line; it is modified in place (field separators are overwritten
+  *    with NUL while tokenising)
+  * h  header used to resolve reference names; its cigar_tab lookup table
+  *    is created here on first use
+  * b  destination record; its data buffer is reused and grown as needed
+  *
+  * Returns 0 on success and -2 on a parse error. In both cases b->data,
+  * b->l_data and b->m_data are updated to describe the (possibly
+  * reallocated) buffer. Parse errors are rejected regardless of the
+  * hts_verbose setting; the setting only controls whether a message is
+  * printed.
+  */
+ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
+ {
+ #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
+ #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
+ #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
+ // The jump to err_ret must not depend on the verbosity level: only the message does.
+ #define _parse_err(cond, msg) do { if (cond) { if (hts_verbose >= 1) fprintf(stderr, "[E::%s] " msg "\n", __func__); goto err_ret; } } while (0)
+ #define _parse_warn(cond, msg) if ((cond) && hts_verbose >= 2) fprintf(stderr, "[W::%s] " msg "\n", __func__)
+
+ uint8_t *t;
+ char *p = s->s, *q;
+ int i;
+ kstring_t str;
+ bam1_core_t *c = &b->core;
+
+ // build the record directly in b's existing buffer
+ str.l = b->l_data = 0;
+ str.s = (char*)b->data; str.m = b->m_data;
+ memset(c, 0, 32);
+ if (h->cigar_tab == 0) {
+ // lazily build the operator-character -> operator-code table
+ h->cigar_tab = (int8_t*) malloc(128);
+ if (h->cigar_tab == 0) goto err_ret;
+ for (i = 0; i < 128; ++i)
+ h->cigar_tab[i] = -1;
+ for (i = 0; BAM_CIGAR_STR[i]; ++i)
+ h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
+ }
+ // qname
+ q = _read_token(p);
+ _parse_warn(p - q <= 1, "empty query name");
+ _parse_err(p - q > 255, "query name too long");
+ kputsn_(q, p - q, &str);
+ c->l_qname = p - q;
+ // flag
+ c->flag = strtol(p, &p, 0);
+ if (*p++ != '\t') goto err_ret; // malformated flag
+ // chr
+ q = _read_token(p);
+ if (strcmp(q, "*")) {
+ _parse_err(h->n_targets == 0, "missing SAM header");
+ c->tid = bam_name2id(h, q);
+ _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
+ } else c->tid = -1;
+ // pos
+ c->pos = strtol(p, &p, 10) - 1;
+ if (*p++ != '\t') goto err_ret;
+ if (c->pos < 0 && c->tid >= 0) {
+ _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
+ c->tid = -1;
+ }
+ if (c->tid < 0) c->flag |= BAM_FUNMAP;
+ // mapq
+ c->qual = strtol(p, &p, 10);
+ if (*p++ != '\t') goto err_ret;
+ // cigar
+ if (*p != '*') {
+ uint32_t *cigar;
+ size_t n_cigar = 0;
+ // every non-digit character is one operator
+ for (q = p; *p && *p != '\t'; ++p)
+ if (!isdigit((unsigned char)*p)) ++n_cigar;
+ if (*p++ != '\t') goto err_ret;
+ _parse_err(n_cigar == 0, "no CIGAR operations");
+ _parse_err(n_cigar >= 65536, "too many CIGAR operations");
+ c->n_cigar = n_cigar;
+ _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
+ for (i = 0; i < c->n_cigar; ++i, ++q) {
+ int op;
+ cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
+ op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
+ _parse_err(op < 0, "unrecognized CIGAR operator");
+ cigar[i] |= op;
+ }
+ // can't use bam_endpos() directly as some fields not yet set up
+ i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
+ } else {
+ _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
+ c->flag |= BAM_FUNMAP;
+ q = _read_token(p);
+ i = 1;
+ }
+ c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
+ // mate chr
+ q = _read_token(p);
+ if (strcmp(q, "=") == 0) {
+ c->mtid = c->tid;
+ } else if (strcmp(q, "*") == 0) {
+ c->mtid = -1;
+ } else {
+ c->mtid = bam_name2id(h, q);
+ _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
+ }
+ // mpos
+ c->mpos = strtol(p, &p, 10) - 1;
+ if (*p++ != '\t') goto err_ret;
+ if (c->mpos < 0 && c->mtid >= 0) {
+ _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
+ c->mtid = -1;
+ }
+ // tlen
+ c->isize = strtol(p, &p, 10);
+ if (*p++ != '\t') goto err_ret;
+ // seq
+ q = _read_token(p);
+ if (strcmp(q, "*")) {
+ c->l_qseq = p - q - 1;
+ i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
+ _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
+ i = (c->l_qseq + 1) >> 1;
+ _get_mem(uint8_t, &t, &str, i);
+ memset(t, 0, i);
+ // pack two 4-bit base codes per byte, first base in the high nibble
+ for (i = 0; i < c->l_qseq; ++i)
+ t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
+ } else c->l_qseq = 0;
+ // qual
+ q = _read_token_aux(p);
+ _get_mem(uint8_t, &t, &str, c->l_qseq);
+ if (strcmp(q, "*")) {
+ _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
+ for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
+ } else memset(t, 0xff, c->l_qseq);
+ // aux
+ // Note that (like the bam1_core_t fields) this aux data in b->data is
+ // stored in host endianness; so there is no byte swapping needed here.
+ while (p < s->s + s->l) {
+ uint8_t type;
+ q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
+ _parse_err(p - q - 1 < 6, "incomplete aux field");
+ kputsn_(q, 2, &str);
+ q += 3; type = *q++; ++q; // q points to value
+ if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
+ kputc_('A', &str);
+ kputc_(*q, &str);
+ } else if (type == 'i' || type == 'I') {
+ // integers are stored in the smallest type that can hold the value
+ if (*q == '-') {
+ long x = strtol(q, &q, 10);
+ if (x >= INT8_MIN) {
+ kputc_('c', &str); kputc_(x, &str);
+ } else if (x >= INT16_MIN) {
+ int16_t y = x;
+ kputc_('s', &str); kputsn_((char*)&y, 2, &str);
+ } else {
+ int32_t y = x;
+ kputc_('i', &str); kputsn_(&y, 4, &str);
+ }
+ } else {
+ unsigned long x = strtoul(q, &q, 10);
+ if (x <= UINT8_MAX) {
+ kputc_('C', &str); kputc_(x, &str);
+ } else if (x <= UINT16_MAX) {
+ uint16_t y = x;
+ kputc_('S', &str); kputsn_(&y, 2, &str);
+ } else {
+ uint32_t y = x;
+ kputc_('I', &str); kputsn_(&y, 4, &str);
+ }
+ }
+ } else if (type == 'f') {
+ float x;
+ x = strtod(q, &q);
+ kputc_('f', &str); kputsn_(&x, 4, &str);
+ } else if (type == 'd') {
+ double x;
+ x = strtod(q, &q);
+ kputc_('d', &str); kputsn_(&x, 8, &str);
+ } else if (type == 'Z' || type == 'H') {
+ kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
+ } else if (type == 'B') {
+ int32_t n;
+ char *r;
+ _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
+ type = *q++; // q points to the first ',' following the typing byte
+ // the number of commas is the number of array elements
+ for (r = q, n = 0; *r; ++r)
+ if (*r == ',') ++n;
+ kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
+ // FIXME: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
+ if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
+ else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
+ else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
+ else if (type == 'S') while (q + 1 < p) { uint16_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 2, &str); }
+ else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
+ else if (type == 'I') while (q + 1 < p) { uint32_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 4, &str); }
+ else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
+ else _parse_err(1, "unrecognized type");
+ } else _parse_err(1, "unrecognized type");
+ }
+ b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
+ return 0;
+
+ #undef _parse_warn
+ #undef _parse_err
+ #undef _get_mem
+ #undef _read_token_aux
+ #undef _read_token
+ err_ret:
+ // hand the (possibly reallocated) buffer back to b even on failure
+ b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
+ return -2;
+ }
+
+ /*
+  * Read the next alignment record from a BAM, CRAM or SAM file into b.
+  *
+  * BAM:  returns bam_read1()'s result, or -3 when a record read
+  *       successfully refers to a reference id outside [-1, n_targets).
+  * CRAM: returns cram_get_bam_seq()'s non-negative result, else -1 at end
+  *       of file and -2 otherwise.
+  * SAM:  parses the line already held in fp->line, or the next line; -1
+  *       when no line can be read. A parse failure is reported (at
+  *       hts_verbose >= 1) and returned, unless h->ignore_sam_err is set,
+  *       in which case the next line is tried.
+  * Any other format aborts the program.
+  */
+ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
+ {
+     switch (fp->format.format) {
+     case bam: {
+         int status = bam_read1(fp->fp.bgzf, b);
+         if (status < 0) return status;
+         if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
+             b->core.mtid >= h->n_targets || b->core.mtid < -1)
+             return -3;
+         return status;
+     }
+
+     case cram: {
+         int status = cram_get_bam_seq(fp->fp.cram, &b);
+         if (status >= 0) return status;
+         return cram_eof(fp->fp.cram) ? -1 : -2;
+     }
+
+     case sam: {
+         int status;
+         do {
+             // a line may already be buffered (left there by the header reader)
+             if (fp->line.l == 0 &&
+                 hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
+                 return -1;
+             status = sam_parse1(&fp->line, h, b);
+             fp->line.l = 0;
+             if (status >= 0) break;
+             if (hts_verbose >= 1)
+                 fprintf(stderr, "[W::%s] parse error at line %lld\n", __func__, (long long)fp->lineno);
+         } while (h->ignore_sam_err);
+         return status;
+     }
+
+     default:
+         abort();
+     }
+ }
+
+ /*
+  * Format the record b as one line of SAM text (without a trailing
+  * newline) into str, replacing str's previous contents.
+  *
+  * h is used to turn reference ids into names.
+  *
+  * Returns the length of the formatted text, or -1 when an aux field runs
+  * past the end of the record's data or a 'B' array has an unknown
+  * element type.
+  */
+ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
+ {
+ int i;
+ uint8_t *s;
+ const bam1_core_t *c = &b->core;
+
+ str->l = 0;
+ kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name
+ kputw(c->flag, str); kputc('\t', str); // flag
+ if (c->tid >= 0) { // chr
+ kputs(h->target_name[c->tid] , str);
+ kputc('\t', str);
+ } else kputsn("*\t", 2, str);
+ kputw(c->pos + 1, str); kputc('\t', str); // pos
+ kputw(c->qual, str); kputc('\t', str); // qual
+ if (c->n_cigar) { // cigar
+ uint32_t *cigar = bam_get_cigar(b);
+ for (i = 0; i < c->n_cigar; ++i) {
+ kputw(bam_cigar_oplen(cigar[i]), str);
+ kputc(bam_cigar_opchr(cigar[i]), str);
+ }
+ } else kputc('*', str);
+ kputc('\t', str);
+ if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
+ else if (c->mtid == c->tid) kputsn("=\t", 2, str);
+ else {
+ kputs(h->target_name[c->mtid], str);
+ kputc('\t', str);
+ }
+ kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
+ kputw(c->isize, str); kputc('\t', str); // template len
+ if (c->l_qseq) { // seq and qual
+ uint8_t *s = bam_get_seq(b);
+ for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
+ kputc('\t', str);
+ s = bam_get_qual(b);
+ // a first quality byte of 0xff means qualities are absent
+ if (s[0] == 0xff) kputc('*', str);
+ else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
+ } else kputsn("*\t*", 3, str);
+
+ // FIXME change "s+N <= b->data+b->l_data" to "b->data+b->l_data - s >= N"
+ // (or equivalent) everywhere to avoid looking past the end of the array
+ s = bam_get_aux(b); // aux
+ // each field needs at least 4 bytes: 2-byte key, type byte, 1 value byte
+ while (s+4 <= b->data + b->l_data) {
+ uint8_t type, key[2];
+ key[0] = s[0]; key[1] = s[1];
+ s += 2; type = *s++;
+ kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
+ // every integer storage type is printed with the SAM type letter 'i'
+ if (type == 'A') {
+ kputsn("A:", 2, str);
+ kputc(*s, str);
+ ++s;
+ } else if (type == 'C') {
+ kputsn("i:", 2, str);
+ kputw(*s, str);
+ ++s;
+ } else if (type == 'c') {
+ kputsn("i:", 2, str);
+ kputw(*(int8_t*)s, str);
+ ++s;
+ } else if (type == 'S') {
+ if (s+2 <= b->data + b->l_data) {
+ kputsn("i:", 2, str);
+ kputw(*(uint16_t*)s, str);
+ s += 2;
+ } else return -1;
+ } else if (type == 's') {
+ if (s+2 <= b->data + b->l_data) {
+ kputsn("i:", 2, str);
+ kputw(*(int16_t*)s, str);
+ s += 2;
+ } else return -1;
+ } else if (type == 'I') {
+ if (s+4 <= b->data + b->l_data) {
+ kputsn("i:", 2, str);
+ kputuw(*(uint32_t*)s, str);
+ s += 4;
+ } else return -1;
+ } else if (type == 'i') {
+ if (s+4 <= b->data + b->l_data) {
+ kputsn("i:", 2, str);
+ kputw(*(int32_t*)s, str);
+ s += 4;
+ } else return -1;
+ } else if (type == 'f') {
+ if (s+4 <= b->data + b->l_data) {
+ ksprintf(str, "f:%g", *(float*)s);
+ s += 4;
+ } else return -1;
+
+ } else if (type == 'd') {
+ if (s+8 <= b->data + b->l_data) {
+ ksprintf(str, "d:%g", *(double*)s);
+ s += 8;
+ } else return -1;
+ } else if (type == 'Z' || type == 'H') {
+ kputc(type, str); kputc(':', str);
+ while (s < b->data + b->l_data && *s) kputc(*s++, str);
+ // reaching the end without finding the NUL means the field is truncated
+ if (s >= b->data + b->l_data)
+ return -1;
+ ++s;
+ } else if (type == 'B') {
+ uint8_t sub_type = *(s++);
+ int sub_type_size = aux_type2size(sub_type);
+ uint32_t n;
+ if (sub_type_size == 0 || b->data + b->l_data - s < 4)
+ return -1;
+ memcpy(&n, s, 4);
+ s += 4; // now points to the start of the array
+ // the remaining bytes must be able to hold n elements
+ if ((b->data + b->l_data - s) / sub_type_size < n)
+ return -1;
+ kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
+ for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
+ kputc(',', str);
+ if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
+ else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
+ else if ('s' == sub_type) { kputw(*(int16_t*)s, str); s += 2; }
+ else if ('S' == sub_type) { kputw(*(uint16_t*)s, str); s += 2; }
+ else if ('i' == sub_type) { kputw(*(int32_t*)s, str); s += 4; }
+ else if ('I' == sub_type) { kputuw(*(uint32_t*)s, str); s += 4; }
+ else if ('f' == sub_type) { ksprintf(str, "%g", *(float*)s); s += 4; }
+ else return -1;
+ }
+ }
+ }
+ return str->l;
+ }
+
+ /*
+  * Write the record b to an open output file.
+  *
+  * The generic binary_format / text_format requests are first resolved to
+  * bam / sam. BAM and CRAM return the result of bam_write1() and
+  * cram_put_bam_seq(). SAM returns the number of bytes written (line plus
+  * newline), or -1 if formatting or writing fails. Any other format
+  * aborts the program.
+  */
+ int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
+ {
+     // Pin a generic request down to a concrete format before dispatching.
+     if (fp->format.format == binary_format) {
+         fp->format.category = sequence_data;
+         fp->format.format = bam;
+     } else if (fp->format.format == text_format) {
+         fp->format.category = sequence_data;
+         fp->format.format = sam;
+     }
+
+     switch (fp->format.format) {
+     case bam:
+         return bam_write1(fp->fp.bgzf, b);
+
+     case cram:
+         return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
+
+     case sam:
+         if (sam_format1(h, b, &fp->line) < 0) return -1;
+         kputc('\n', &fp->line);
+         if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
+         return fp->line.l;
+
+     default:
+         abort();
+     }
+ }
+
+/************************
+ *** Auxiliary fields ***
+ ************************/
+
+ /*
+  * Append one aux field to the end of record b.
+  *
+  * tag   two-character field key
+  * type  aux type code stored after the key
+  * len   number of bytes in data
+  * data  the encoded value, copied verbatim
+  *
+  * The data buffer is grown when needed. If it cannot be grown the record
+  * is left unchanged (the function has no way to report the failure).
+  */
+ void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+ {
+     int ori_len = b->l_data;
+     int new_len = ori_len + 3 + len; // key (2) + type (1) + value
+     if (b->m_data < new_len) {
+         int new_m = new_len;
+         uint8_t *grown;
+         kroundup32(new_m);
+         grown = (uint8_t*)realloc(b->data, new_m);
+         if (grown == NULL) return; // keep the old buffer and length intact
+         b->data = grown;
+         b->m_data = new_m;
+     }
+     b->l_data = new_len;
+     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
+     b->data[ori_len + 2] = type;
+     memcpy(b->data + ori_len + 3, data, len);
+ }
+
+ /*
+  * Given s pointing at the type byte of an aux field, return a pointer to
+  * the first byte after that field's value. An unknown type code aborts
+  * the program.
+  */
+ static inline uint8_t *skip_aux(uint8_t *s)
+ {
+     int size = aux_type2size(*s);
+     uint32_t n;
+
+     ++s; // step over the type byte
+
+     if (size == 'Z' || size == 'H') {
+         // NUL-terminated string: stop just past the terminator
+         while (*s) ++s;
+         return s + 1;
+     }
+     if (size == 'B') {
+         // array: element type byte, 4-byte count, then the elements
+         size = aux_type2size(*s); ++s;
+         memcpy(&n, s, 4); s += 4;
+         return s + size * n;
+     }
+     if (size == 0)
+         abort();
+
+     return s + size;
+ }
+
+ /*
+  * Find the aux field with the given two-character tag.
+  * Returns a pointer to the field's type byte (the byte right after the
+  * key), or 0 if the record has no such field.
+  */
+ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+ {
+     const int wanted = tag[0]<<8 | tag[1];
+     uint8_t *cur;
+
+     for (cur = bam_get_aux(b); cur < b->data + b->l_data; cur = skip_aux(cur)) {
+         const int key = (int)cur[0]<<8 | cur[1];
+         cur += 2; // now at the type byte
+         if (key == wanted) return cur;
+     }
+     return 0;
+ }
+ // Remove one aux field from record b by moving the following aux data
+ // down over it. s MUST be a pointer returned by bam_aux_get().
+ // Always returns 0.
+ int bam_aux_del(bam1_t *b, uint8_t *s)
+ {
+     uint8_t *aux_start = bam_get_aux(b);
+     int aux_len = bam_get_l_aux(b);
+     uint8_t *field = s - 2;       // start of the field: its two-byte key
+     uint8_t *next = skip_aux(s);  // first byte after the field
+
+     memmove(field, next, aux_len - (next - aux_start));
+     b->l_data -= next - field;
+     return 0;
+ }
+
+ /*
+  * Decode an integer aux value. s points at the field's type byte, as
+  * returned by bam_aux_get(). Values are read in host byte order.
+  *
+  * Returns the value for types c, C, s, S, i and I (an 'I' value is
+  * returned reinterpreted as int32_t), and 0 for any other type.
+  */
+ int32_t bam_aux2i(const uint8_t *s)
+ {
+     int type;
+     type = *s++;
+     // The value sits at an arbitrary byte offset, so it is copied out
+     // with memcpy instead of being read through a casted pointer.
+     if (type == 'c') return (int32_t)(int8_t)*s;
+     else if (type == 'C') return (int32_t)*s;
+     else if (type == 's') { int16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+     else if (type == 'S') { uint16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+     else if (type == 'i' || type == 'I') { int32_t v; memcpy(&v, s, sizeof v); return v; }
+     else return 0;
+ }
+
+ /*
+  * Decode a floating-point aux value. s points at the field's type byte,
+  * as returned by bam_aux_get(). Values are read in host byte order.
+  *
+  * Returns the value for types d and f, and 0.0 for any other type.
+  */
+ double bam_aux2f(const uint8_t *s)
+ {
+     int type;
+     type = *s++;
+     // The value sits at an arbitrary byte offset, so it is copied out
+     // with memcpy instead of being read through a casted pointer.
+     if (type == 'd') { double v; memcpy(&v, s, sizeof v); return v; }
+     else if (type == 'f') { float v; memcpy(&v, s, sizeof v); return v; }
+     else return 0.0;
+ }
+
+ /*
+  * Decode a single-character aux value. s points at the field's type
+  * byte. Returns the character for type 'A' and 0 for any other type.
+  */
+ char bam_aux2A(const uint8_t *s)
+ {
+     const int type = s[0];
+     if (type != 'A') return 0;
+     return *(char*)(s + 1);
+ }
+
+ /*
+  * Decode a string aux value. s points at the field's type byte.
+  * Returns a pointer to the string stored after the type byte for types
+  * 'Z' and 'H', and 0 for any other type.
+  */
+ char *bam_aux2Z(const uint8_t *s)
+ {
+     const int type = s[0];
+     if (type != 'Z' && type != 'H') return 0;
+     return (char*)(s + 1);
+ }
+
+ /*
+  * Write into mode the open-mode suffix for a format name: "b" for "bam",
+  * "c" for "cram" and "" for "sam". When format is NULL the extension of
+  * fn (the text after its last '.', provided no '/' follows it) is used
+  * as the format name.
+  *
+  * Returns 0 on success, -1 if no format can be determined or the format
+  * is not recognised (mode is then left untouched).
+  */
+ int sam_open_mode(char *mode, const char *fn, const char *format)
+ {
+     // TODO Parse "bam5" etc for compression level
+     const char *suffix;
+
+     if (format == NULL) {
+         // Try to pick a format based on the filename extension
+         const char *ext = fn? strrchr(fn, '.') : NULL;
+         if (ext == NULL || strchr(ext, '/')) return -1;
+         return sam_open_mode(mode, fn, ext+1);
+     }
+
+     if (strcmp(format, "bam") == 0) suffix = "b";
+     else if (strcmp(format, "cram") == 0) suffix = "c";
+     else if (strcmp(format, "sam") == 0) suffix = "";
+     else return -1;
+
+     strcpy(mode, suffix);
+     return 0;
+ }
+
+ // A version of sam_open_mode that can handle ,key=value options.
+ // The returned mode string is allocated here and must be freed by the
+ // caller. mode is the prefix and should be "r" or "w"; "r" is used when
+ // it is NULL. Returns NULL on allocation failure or an unknown format.
+ // NOTE(review): the format name is compared with strncmp() over the
+ // length of the text before the first ',', so an abbreviation such as
+ // "b" (or an empty name) is accepted as "bam" -- confirm this is intended.
+ char *sam_open_mode_opts(const char *fn,
+                          const char *mode,
+                          const char *format)
+ {
+     // room for the prefix, one format letter, a 12-character version
+     // option, the caller's options and the terminator
+     char *result = malloc((format ? strlen(format) : 1) +
+                           (mode ? strlen(mode) : 1) + 12);
+     const char *opts, *version = "";
+     char *cp;
+     int format_len;
+
+     if (!result)
+         return NULL;
+
+     strcpy(result, mode ? mode : "r");
+     cp = result + strlen(result);
+
+     if (format == NULL) {
+         // Try to pick a format based on the filename extension
+         const char *ext = fn? strrchr(fn, '.') : NULL;
+         if (ext == NULL || strchr(ext, '/') || sam_open_mode(cp, fn, ext+1)) {
+             free(result);
+             return NULL;
+         }
+         return result;
+     }
+
+     // split "name,opt=val,..." into the name length and the option tail
+     opts = strchr(format, ',');
+     if (opts) {
+         format_len = opts-format;
+     } else {
+         opts = "";
+         format_len = strlen(format);
+     }
+
+     if (strncmp(format, "bam", format_len) == 0) {
+         *cp++ = 'b';
+     } else if (strncmp(format, "cram", format_len) == 0) {
+         *cp++ = 'c';
+     } else if (strncmp(format, "cram2", format_len) == 0) {
+         *cp++ = 'c';
+         version = ",VERSION=2.1";
+     } else if (strncmp(format, "cram3", format_len) == 0) {
+         *cp++ = 'c';
+         version = ",VERSION=3.0";
+     } else if (strncmp(format, "sam", format_len) == 0) {
+         ; // format mode=""
+     } else {
+         free(result);
+         return NULL;
+     }
+
+     strcpy(cp, version);
+     cp += strlen(version);
+     strcpy(cp, opts);
+
+     return result;
+ }
+
+#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
+// Converts a flag specification to its integer value.  str is either a
+// number in any base understood by strtol(..., 0) or a comma-separated
+// list of case-insensitive flag names (PAIRED, PROPER_PAIR, ...).
+// Returns the combined flag value, or -1 if a name is not recognised.
+int bam_str2flag(const char *str)
+{
+    static const struct { const char *name; int bit; } known[] = {
+        { "PAIRED",        BAM_FPAIRED },
+        { "PROPER_PAIR",   BAM_FPROPER_PAIR },
+        { "UNMAP",         BAM_FUNMAP },
+        { "MUNMAP",        BAM_FMUNMAP },
+        { "REVERSE",       BAM_FREVERSE },
+        { "MREVERSE",      BAM_FMREVERSE },
+        { "READ1",         BAM_FREAD1 },
+        { "READ2",         BAM_FREAD2 },
+        { "SECONDARY",     BAM_FSECONDARY },
+        { "QCFAIL",        BAM_FQCFAIL },
+        { "DUP",           BAM_FDUP },
+        { "SUPPLEMENTARY", BAM_FSUPPLEMENTARY },
+    };
+    const size_t nknown = sizeof(known) / sizeof(known[0]);
+    char *stop;
+    const char *tok = str;
+    long int value = strtol(str, &stop, 0);
+
+    // Any leading number wins, even if followed by other text.
+    if ( stop != str ) return value;
+
+    value = 0;
+    if ( !*str ) return value;
+
+    for (;;)
+    {
+        // The current token runs up to the next comma or the end of string.
+        const char *tok_end = tok;
+        size_t i;
+        while ( *tok_end && *tok_end != ',' ) tok_end++;
+
+        for (i = 0; i < nknown; i++)
+            if ( !STRNCMP(known[i].name, tok, tok_end - tok) ) break;
+        if ( i == nknown ) return -1;   // unknown (or empty) flag name
+        value |= known[i].bit;
+
+        if ( !*tok_end ) break;
+        tok = tok_end + 1;
+    }
+    return value;
+}
+
+// Builds a comma-separated list of the names of all flag bits set in flag.
+// Returns the kstring buffer (str.s); an empty, NUL-terminated string is
+// produced when no known bit is set.
+char *bam_flag2str(int flag)
+{
+    static const struct { int bit; const char *name; } known[] = {
+        { BAM_FPAIRED,        "PAIRED" },
+        { BAM_FPROPER_PAIR,   "PROPER_PAIR" },
+        { BAM_FUNMAP,         "UNMAP" },
+        { BAM_FMUNMAP,        "MUNMAP" },
+        { BAM_FREVERSE,       "REVERSE" },
+        { BAM_FMREVERSE,      "MREVERSE" },
+        { BAM_FREAD1,         "READ1" },
+        { BAM_FREAD2,         "READ2" },
+        { BAM_FSECONDARY,     "SECONDARY" },
+        { BAM_FQCFAIL,        "QCFAIL" },
+        { BAM_FDUP,           "DUP" },
+        { BAM_FSUPPLEMENTARY, "SUPPLEMENTARY" },
+    };
+    kstring_t out = {0,0,0};
+    size_t i;
+
+    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++)
+    {
+        if ( !(flag & known[i].bit) ) continue;
+        // A separator is needed once something has already been written.
+        ksprintf(&out, "%s%s", out.l ? "," : "", known[i].name);
+    }
+    // Nothing appended: still hand back an allocated empty string.
+    if ( out.l == 0 ) kputsn("", 0, &out);
+    return out.s;
+}
+
+
+/**************************
+ *** Pileup and Mpileup ***
+ **************************/
+
+#if !defined(BAM_NO_PILEUP)
+
+#include <assert.h>
+
+/*******************
+ *** Memory pool ***
+ *******************/
+
+// Cached CIGAR cursor of one buffered read (see resolve_cigar2()):
+//   k   index of the CIGAR operation last processed, -1 if none yet
+//   x   reference coordinate of the start of operation k
+//   y   query coordinate of the start of operation k
+//   end set by bam_plp_push() to the read's end position minus one
+typedef struct {
+ int k, x, y, end;
+} cstate_t;
+
+// Initial cursor value: k == -1 marks "never processed".
+static cstate_t g_cstate_null = { -1, 0, 0, 0 };
+
+// Node of the singly linked list of reads buffered by the pileup iterator.
+typedef struct __linkbuf_t {
+ bam1_t b; // the iterator's own copy of the record (filled by bam_copy1 in bam_plp_push)
+ int32_t beg, end; // set in bam_plp_push(): b.core.pos and bam_endpos(b)
+ cstate_t s; // CIGAR cursor used by resolve_cigar2()
+ struct __linkbuf_t *next;
+} lbnode_t;
+
+// Free-list of lbnode_t objects so that nodes (and their record buffers)
+// can be recycled instead of being reallocated for every read.
+//   cnt number of nodes currently handed out (++ in mp_alloc, -- in mp_free)
+//   n   number of recycled nodes stored in buf
+//   max allocated capacity of buf
+typedef struct {
+ int cnt, n, max;
+ lbnode_t **buf;
+} mempool_t;
+
+// Creates an empty, zero-initialised node pool (NULL if calloc fails).
+static mempool_t *mp_init(void)
+{
+    return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+// Releases every recycled node held by the pool (including each node's
+// record data) and then the pool itself.
+static void mp_destroy(mempool_t *mp)
+{
+    int i = 0;
+    while (i < mp->n) {
+        lbnode_t *node = mp->buf[i++];
+        free(node->b.data);
+        free(node);
+    }
+    free(mp->buf);
+    free(mp);
+}
+// Hands out a node: a recycled one when available, otherwise a freshly
+// zero-allocated one.  The outstanding-node counter is bumped either way.
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+    ++mp->cnt;
+    if (mp->n > 0) {
+        mp->n -= 1;
+        return mp->buf[mp->n];
+    }
+    return (lbnode_t*)calloc(1, sizeof(lbnode_t));
+}
+// Returns node p to the pool for later reuse.  The node's record buffer is
+// kept so that a later bam_copy1() into it can reuse the allocation.
+// If the free-list cannot be grown, the node is released outright instead
+// of being cached (previously a failed realloc() lost the old list and
+// dereferenced NULL).
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+    --mp->cnt; p->next = 0; // clear lbnode_t::next here
+    if (mp->n == mp->max) {
+        // Free-list is full: double it (256 entries to start with).
+        int new_max = mp->max? mp->max<<1 : 256;
+        lbnode_t **new_buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * new_max);
+        if (new_buf == NULL) {
+            // Out of memory: mp->buf is still valid, just don't cache p.
+            free(p->b.data);
+            free(p);
+            return;
+        }
+        mp->buf = new_buf;
+        mp->max = new_max;
+    }
+    mp->buf[mp->n++] = p;
+}
+
+/**********************
+ *** CIGAR resolver ***
+ **********************/
+
+/* s->k: the index of the CIGAR operator that has just been processed.
+ s->x: the reference coordinate of the start of s->k
+ s->y: the query coordinate of the start of s->k
+ */
+// Fills in the pileup entry p (qpos, indel, is_del, is_refskip, is_head,
+// is_tail) for reference position pos of the read p->b.  s caches how far
+// the read's CIGAR has been walked so that successive calls with
+// increasing pos do not rescan from the start; it is updated in place.
+// Always returns 1.
+static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
+{
+// operation code and length of a packed CIGAR element
+#define _cop(c) ((c)&BAM_CIGAR_MASK)
+#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
+
+ bam1_t *b = p->b;
+ bam1_core_t *c = &b->core;
+ uint32_t *cigar = bam_get_cigar(b);
+ int k;
+ // determine the current CIGAR operation
+// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
+ if (s->k == -1) { // never processed
+ if (c->n_cigar == 1) { // just one operation, save a loop
+ // NOTE(review): if the single operation is not M/=/X, s->k stays -1 and
+ // cigar[s->k] is read below; presumably such reads never get here - confirm.
+ if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
+ } else { // find the first match or deletion
+ for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
+ int op = _cop(cigar[k]);
+ int l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+ else if (op == BAM_CREF_SKIP) s->x += l;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+ }
+ assert(k < c->n_cigar);
+ s->k = k;
+ }
+ } else { // the read has been processed before
+ int op, l = _cln(cigar[s->k]);
+ if (pos - s->x >= l) { // jump to the next operation
+ // NOTE(review): the assert allows s->k == n_cigar-1, for which cigar[s->k+1]
+ // on the next line is one past the last operation - confirm this cannot occur.
+ assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
+ op = _cop(cigar[s->k+1]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
+ // only M/=/X consume query bases; x is advanced for every operation left behind
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
+ s->x += l;
+ ++s->k;
+ } else { // find the next M/D/N/=/X
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
+ s->x += l;
+ for (k = s->k + 1; k < c->n_cigar; ++k) {
+ op = _cop(cigar[k]), l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+ }
+ s->k = k;
+ }
+ assert(s->k < c->n_cigar); // otherwise a bug
+ } // else, do nothing
+ }
+ { // collect pileup information
+ int op, l;
+ op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
+ p->is_del = p->indel = p->is_refskip = 0;
+ // at the last reference base of the current operation, report an indel that follows it
+ if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
+ int op2 = _cop(cigar[s->k+1]);
+ int l2 = _cln(cigar[s->k+1]);
+ if (op2 == BAM_CDEL) p->indel = -(int)l2;
+ else if (op2 == BAM_CINS) p->indel = l2;
+ else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // not working for adjacent padding
+ // sum the insertions found before the next reference-consuming operation
+ int l3 = 0;
+ for (k = s->k + 2; k < c->n_cigar; ++k) {
+ op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
+ if (op2 == BAM_CINS) l3 += l2;
+ else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
+ }
+ if (l3 > 0) p->indel = l3;
+ }
+ }
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ p->qpos = s->y + (pos - s->x);
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+ p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
+ p->is_refskip = (op == BAM_CREF_SKIP);
+ } // cannot be other operations; otherwise a bug
+ p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
+ }
+ return 1;
+}
+
+/***********************
+ *** Pileup iterator ***
+ ***********************/
+
+// Dictionary of overlapping reads, keyed by read name (see overlap_push())
+KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
+typedef khash_t(olap_hash) olap_hash_t;
+
+// State of a single-sample pileup iterator.
+struct __bam_plp_t {
+ mempool_t *mp; // node pool
+ lbnode_t *head, *tail; // buffered reads; tail is an unused node that the next pushed read is copied into
+ int32_t tid, pos, max_tid, max_pos; // tid/pos: position to report next; max_*: start of the last read pushed
+ int is_eof, max_plp, error, maxcnt; // max_plp: capacity of plp; maxcnt: depth cap checked in bam_plp_push()
+ uint64_t id; // running counter stored in each pushed record's id (unless BAM_NO_ID)
+ bam_pileup1_t *plp; // pileup array returned by bam_plp_next()
+ // for the "auto" interface only
+ bam1_t *b;
+ bam_plp_auto_f func;
+ void *data;
+ olap_hash_t *overlaps; // NULL unless bam_plp_init_overlaps() was called
+};
+
+// Creates a pileup iterator.
+//   func  callback that reads the next alignment (used by bam_plp_auto()),
+//         or NULL when reads are supplied with bam_plp_push()
+//   data  opaque pointer passed to func
+// Returns the iterator, or NULL if memory could not be allocated
+// (allocation results were previously used unchecked).
+bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
+{
+    bam_plp_t iter;
+    iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
+    if (iter == NULL) return NULL;
+    iter->mp = mp_init();
+    if (iter->mp == NULL) goto fail;
+    // The list always ends in one unused node that receives the next read.
+    iter->head = iter->tail = mp_alloc(iter->mp);
+    if (iter->head == NULL) goto fail;
+    iter->max_tid = iter->max_pos = -1;
+    iter->maxcnt = 8000;
+    if (func) {
+        iter->func = func;
+        iter->data = data;
+        iter->b = bam_init1();
+        if (iter->b == NULL) goto fail;
+    }
+    return iter;
+
+fail:
+    if (iter->head) {
+        // Release the node directly; the pool's free-list is still empty.
+        free(iter->head->b.data);
+        free(iter->head);
+    }
+    if (iter->mp) mp_destroy(iter->mp);
+    free(iter);
+    return NULL;
+}
+
+// Enables overlap handling for this iterator by creating the read-name
+// hash; overlap_push()/overlap_remove() do nothing while it is NULL.
+void bam_plp_init_overlaps(bam_plp_t iter)
+{
+ iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
+}
+
+// Frees the iterator together with everything it owns: the overlap hash,
+// all buffered nodes, the node pool, the scratch record and the pileup array.
+void bam_plp_destroy(bam_plp_t iter)
+{
+    lbnode_t *node = NULL, *following = NULL;
+
+    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
+
+    // Hand every list node back to the pool, which is destroyed right after.
+    node = iter->head;
+    while (node != NULL) {
+        following = node->next;   // mp_free() clears node->next
+        mp_free(iter->mp, node);
+        node = following;
+    }
+    mp_destroy(iter->mp);
+
+    if (iter->b) bam_destroy1(iter->b);
+    free(iter->plp);
+    free(iter);
+}
+
+
+//---------------------------------
+//--- Tweak overlapping reads
+//---------------------------------
+
+/**
+ * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
+ * cigar_iref2iseq_next() - get the next CMATCH base
+ * @cigar: pointer to current cigar block (rw)
+ * @cigar_max: pointer just beyond the last cigar block
+ * @icig: position within the current cigar block (rw)
+ * @iseq: position in the sequence (rw)
+ * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
+ *
+ * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
+ */
+// Positions the CIGAR cursor on the requested reference offset.
+// On entry *iref is the wanted offset from the start of the alignment; a
+// negative offset returns -1 without touching the other outputs.  On
+// success *cigar points to the match block found, *icig is the offset
+// inside that block and *iseq / *iref are the matching sequence index and
+// reference offset.  When the CIGAR is exhausted *iseq is set to -1 and -1
+// is returned.
+static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
+{
+ int pos = *iref; // remaining reference distance to the target
+ if ( pos < 0 ) return -1;
+ *icig = 0;
+ *iseq = 0;
+ *iref = 0;
+ while ( *cigar<cigar_max )
+ {
+ int cig = (**cigar) & BAM_CIGAR_MASK;
+ int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
+
+ // soft clips and insertions consume sequence only; hard clips and pads consume nothing
+ if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
+ if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
+ if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
+ {
+ pos -= ncig;
+ // pos < 0: the target lies inside this block, ncig + pos bases from its start
+ if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
+ (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
+ continue;
+ }
+ if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
+ if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
+ {
+ pos -= ncig;
+ // target inside the deletion/skip: clamp so that the next match block is selected
+ if ( pos<0 ) pos = 0;
+ (*cigar)++; *icig = 0; *iref += ncig;
+ continue;
+ }
+ // any other operation code is not handled
+ fprintf(stderr,"todo: cigar %d\n", cig);
+ assert(0);
+ }
+ *iseq = -1;
+ return -1;
+}
+// Advances the CIGAR cursor to the next matched base, updating *cigar,
+// *icig, *iseq and *iref in place.  Returns BAM_CMATCH on success; when
+// the CIGAR is exhausted both *iseq and *iref are set to -1 and -1 is
+// returned.
+static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
+{
+ while ( *cigar < cigar_max )
+ {
+ int cig = (**cigar) & BAM_CIGAR_MASK;
+ int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
+
+ if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
+ {
+ // already on the last base of this block: continue with the next operation
+ // NOTE(review): a block entered with *icig == 0 and ncig == 1 is skipped by this
+ // test without advancing *iseq / *iref - confirm this is intended.
+ if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
+ (*iseq)++; (*icig)++; (*iref)++;
+ return BAM_CMATCH;
+ }
+ // deletions/skips consume reference only; insertions and soft clips consume sequence only
+ if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
+ if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
+ if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
+ if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
+ // any other operation code is not handled
+ fprintf(stderr,"todo: cigar %d\n", cig);
+ assert(0);
+ }
+ *iseq = -1;
+ *iref = -1;
+ return -1;
+}
+
+// Adjusts base qualities, in place, where reads a and b cover the same
+// reference position with a matched base.  Starting at b->core.pos:
+//  - identical bases: a receives the sum of both qualities (capped at 200)
+//    and b's quality is set to 0;
+//  - different bases: the base with the higher (or equal, favouring a)
+//    quality keeps 0.8 times its quality and the other is set to 0.
+// Positions where the two reads do not both have a matched base are skipped.
+static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
+{
+ uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
+ uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
+ int a_icig = 0, a_iseq = 0;
+ int b_icig = 0, b_iseq = 0;
+ uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
+ uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
+
+ // iref is an absolute reference position; a_iref/b_iref are offsets from each read's start
+ int iref = b->core.pos;
+ int a_iref = iref - a->core.pos;
+ int b_iref = iref - b->core.pos;
+ int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
+ if ( a_ret<0 ) return; // no overlap
+ int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
+ if ( b_ret<0 ) return; // no overlap
+
+ #if DBG
+ fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
+ a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
+ #endif
+
+ while ( 1 )
+ {
+ // Increment reference position
+ // bring a up to iref; if a lands beyond it, iref is moved forward to a's position
+ while ( a_iref>=0 && a_iref < iref - a->core.pos )
+ a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
+ if ( a_ret<0 ) break; // done
+ if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
+
+ // the same for b
+ while ( b_iref>=0 && b_iref < iref - b->core.pos )
+ b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
+ if ( b_ret<0 ) break; // done
+ if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
+
+ iref++;
+ if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
+
+ if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
+ {
+ #if DBG
+ fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
+ #endif
+ // we are very confident about this base
+ int qual = a_qual[a_iseq] + b_qual[b_iseq];
+ a_qual[a_iseq] = qual>200 ? 200 : qual;
+ b_qual[b_iseq] = 0;
+ }
+ else
+ {
+ if ( a_qual[a_iseq] >= b_qual[b_iseq] )
+ {
+ #if DBG
+ fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
+ #endif
+ a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
+ b_qual[b_iseq] = 0;
+ }
+ else
+ {
+ #if DBG
+ fprintf(stderr,"[%c/%c]",tolower(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
+ #endif
+ b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
+ a_qual[a_iseq] = 0;
+ }
+ }
+ }
+ #if DBG
+ fprintf(stderr,"\n");
+ #endif
+}
+
+// Fix overlapping reads. Simple soft-clipping did not give good results.
+// Lowering qualities of unwanted bases is more selective and works better.
+//
+// Registers a newly buffered read for overlap handling.  The first read
+// seen with a given name is stored in the hash; when a second read with
+// the same name arrives, the qualities of both are adjusted with
+// tweak_overlap_quality() and the hash entry is removed.
+// Does nothing unless bam_plp_init_overlaps() was called.
+static void overlap_push(bam_plp_t iter, lbnode_t *node)
+{
+ if ( !iter->overlaps ) return;
+
+ // mapped mates and paired reads only
+ if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
+
+ // no overlap possible, unless some wild cigar
+ if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
+
+ khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
+ if ( kitr==kh_end(iter->overlaps) )
+ {
+ // first read of the pair: remember it
+ // NOTE(review): the key is the name pointer inside node->b, so it is only valid
+ // while that record is unchanged; the kh_put() status in ret is not checked.
+ int ret;
+ kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
+ kh_value(iter->overlaps, kitr) = node;
+ }
+ else
+ {
+ // second read of the pair: a is the read stored earlier
+ lbnode_t *a = kh_value(iter->overlaps, kitr);
+ tweak_overlap_quality(&a->b, &node->b);
+ kh_del(olap_hash, iter->overlaps, kitr);
+ // recompute the stored read's end and keep its CIGAR cursor's end in step
+ assert(a->end-1 == a->s.end);
+ a->end = bam_endpos(&a->b);
+ a->s.end = a->end - 1;
+ }
+}
+
+// Drops entries from the overlap hash: the one keyed by b's read name, or
+// every entry when b is NULL.  No-op when overlap handling is disabled.
+static void overlap_remove(bam_plp_t iter, const bam1_t *b)
+{
+    olap_hash_t *hash = iter->overlaps;
+    khiter_t k;
+
+    if ( hash == NULL ) return;
+
+    if ( b == NULL )
+    {
+        // remove all
+        for (k = kh_begin(hash); k < kh_end(hash); k++)
+        {
+            if ( !kh_exist(hash, k) ) continue;
+            kh_del(olap_hash, hash, k);
+        }
+        return;
+    }
+
+    k = kh_get(olap_hash, hash, bam_get_qname(b));
+    if ( k != kh_end(hash) ) kh_del(olap_hash, hash, k);
+}
+
+
+
+// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
+// pointer to the piled records if next position is ready or NULL if there is not enough records in the
+// buffer yet (the current position is still the maximum position across all buffered reads).
+// Outputs: *_tid / *_pos receive the position of the returned column and
+// *_n_plp the number of entries in it; *_n_plp is -1 when the iterator is
+// in the error state and 0 when NULL is returned for lack of data.
+const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+ if (iter->error) { *_n_plp = -1; return 0; }
+ *_n_plp = 0;
+ if (iter->is_eof && iter->head == iter->tail) return 0;
+ // a column is complete once the input has moved past it (or has ended)
+ while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
+ int n_plp = 0;
+ // write iter->plp at iter->pos
+ lbnode_t **pptr = &iter->head;
+ while (*pptr != iter->tail) {
+ lbnode_t *p = *pptr;
+ if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
+ overlap_remove(iter, &p->b);
+ *pptr = p->next; mp_free(iter->mp, p);
+ }
+ else {
+ if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
+ if (n_plp == iter->max_plp) { // then double the capacity
+ // NOTE(review): the realloc() result is not checked
+ iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
+ iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
+ }
+ iter->plp[n_plp].b = &p->b;
+ if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
+ }
+ pptr = &(*pptr)->next;
+ }
+ }
+ *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
+ // update iter->tid and iter->pos
+ if (iter->head != iter->tail) {
+ if (iter->tid > iter->head->b.core.tid) {
+ fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
+ iter->error = 1;
+ *_n_plp = -1;
+ return 0;
+ }
+ }
+ // NOTE(review): when head == tail the comparisons below read the unused tail node - confirm this is harmless.
+ if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
+ iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
+ } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
+ iter->pos = iter->head->beg; // jump to the next position
+ } else ++iter->pos; // scan contiguously
+ // return
+ if (n_plp) return iter->plp;
+ if (iter->is_eof && iter->head == iter->tail) break;
+ }
+ return 0;
+}
+
+// Adds alignment b to the iterator's buffer; b == NULL signals the end of
+// the input.  Records without a reference (tid < 0), unmapped records, and
+// records starting at the current position once more than maxcnt nodes are
+// in use are silently dropped.  Returns 0 on success and -1 if the
+// iterator is (or has just been put) in the error state because the input
+// is not coordinate-sorted.
+int bam_plp_push(bam_plp_t iter, const bam1_t *b)
+{
+ if (iter->error) return -1;
+ if (b) {
+ if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
+ // Skip only unmapped reads here, any additional filtering must be done in iter->func
+ if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
+ // depth cap: too many buffered reads and this one starts at the current position
+ if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
+ {
+ overlap_remove(iter, b);
+ return 0;
+ }
+ // the record is copied into the unused tail node
+ // NOTE(review): overlap_push() runs before the sort checks below, and the tail node
+ // may be reused by a later push (see the end of this block) while its name is still
+ // a key in the overlap hash - confirm this cannot leave a stale key.
+ bam_copy1(&iter->tail->b, b);
+ overlap_push(iter, iter->tail);
+#ifndef BAM_NO_ID
+ iter->tail->b.id = iter->id++;
+#endif
+ iter->tail->beg = b->core.pos;
+ iter->tail->end = bam_endpos(b);
+ iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
+ if (b->core.tid < iter->max_tid) {
+ fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+ iter->error = 1;
+ return -1;
+ }
+ if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
+ fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+ iter->error = 1;
+ return -1;
+ }
+ iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
+ // keep the read only if it can still contribute; otherwise the tail node is reused
+ if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
+ iter->tail->next = mp_alloc(iter->mp);
+ iter->tail = iter->tail->next;
+ }
+ } else iter->is_eof = 1;
+ return 0;
+}
+
+// Returns the next pileup column, reading alignments through the callback
+// given to bam_plp_init() whenever the buffer cannot yet produce one.
+// *_tid, *_pos and *_n_plp are filled as for bam_plp_next().  NULL is
+// returned at the end of the data (*_n_plp == 0) or on error
+// (*_n_plp == -1: no callback, iterator in error state, push failure, or
+// the callback returned a value below -1, which is stored in iter->error).
+const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+    const bam_pileup1_t *col;
+    int rc;
+
+    if (iter->error || iter->func == 0) { *_n_plp = -1; return 0; }
+
+    // Maybe the buffered reads already yield a column.
+    col = bam_plp_next(iter, _tid, _pos, _n_plp);
+    if (col != 0) return col;
+
+    // no pileup line can be obtained; read alignments
+    *_n_plp = 0;
+    if (iter->is_eof) return 0;
+
+    for (;;) {
+        rc = iter->func(iter->data, iter->b);
+        if (rc < 0) break;
+        if (bam_plp_push(iter, iter->b) < 0) {
+            *_n_plp = -1;
+            return 0;
+        }
+        col = bam_plp_next(iter, _tid, _pos, _n_plp);
+        if (col != 0) return col;
+        // otherwise no pileup line can be returned; read the next alignment.
+    }
+
+    // -1 is the regular end of input; anything lower is a read error.
+    if ( rc < -1 ) { iter->error = rc; *_n_plp = -1; return 0; }
+
+    // Signal end of input and flush what is left in the buffer.
+    bam_plp_push(iter, 0);
+    return bam_plp_next(iter, _tid, _pos, _n_plp);
+}
+
+// Returns the iterator to its initial position: forgets all overlap
+// entries, clears the position and end-of-input state, and recycles every
+// buffered read (only the unused tail node remains in the list).
+void bam_plp_reset(bam_plp_t iter)
+{
+    lbnode_t *node;
+
+    overlap_remove(iter, NULL);
+    iter->max_tid = iter->max_pos = -1;
+    iter->tid = iter->pos = 0;
+    iter->is_eof = 0;
+
+    for (node = iter->head; node != iter->tail; node = iter->head) {
+        iter->head = node->next;
+        mp_free(iter->mp, node);
+    }
+}
+
+// Sets the depth cap consulted by bam_plp_push() (8000 after bam_plp_init()).
+void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
+{
+ iter->maxcnt = maxcnt;
+}
+
+/************************
+ *** Mpileup iterator ***
+ ************************/
+
+// State of a multi-sample pileup iterator: n single-sample iterators
+// advanced in step.  Positions are packed as (tid << 32 | pos).
+struct __bam_mplp_t {
+ int n; // number of single-sample iterators
+ uint64_t min, *pos; // min: smallest packed position last returned; pos[i]: packed position held by iterator i
+ bam_plp_t *iter; // the n single-sample iterators
+ int *n_plp; // n_plp[i]: entry count last reported by iterator i
+ const bam_pileup1_t **plp; // plp[i]: column last returned by iterator i, NULL if none
+};
+
+// Creates a multi-sample pileup iterator over n inputs.
+//   func  callback used to read alignments, shared by all inputs
+//   data  array of n opaque pointers; data[i] is passed to func for input i
+// Returns the iterator, or NULL if memory could not be allocated
+// (allocation results were previously used unchecked).
+bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+{
+    int i;
+    bam_mplp_t iter;
+    iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
+    if (iter == NULL) return NULL;
+    iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
+    iter->n_plp = (int*)calloc(n, sizeof(int));
+    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
+    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
+    // calloc(0, ...) may legitimately return NULL, so only treat NULL as a
+    // failure when something was actually requested.
+    if (n > 0 && (!iter->pos || !iter->n_plp || !iter->plp || !iter->iter)) goto fail;
+    iter->n = n;
+    // Every input starts at the "minimum" so that the first call to
+    // bam_mplp_auto() advances all of them.
+    iter->min = (uint64_t)-1;
+    for (i = 0; i < n; ++i) {
+        iter->iter[i] = bam_plp_init(func, data[i]);
+        if (iter->iter[i] == NULL) goto fail;
+        iter->pos[i] = iter->min;
+    }
+    return iter;
+
+fail:
+    // iter->iter was zero-filled, so untouched slots are NULL.
+    if (iter->iter) {
+        for (i = 0; i < n; ++i)
+            if (iter->iter[i]) bam_plp_destroy(iter->iter[i]);
+    }
+    free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
+    free(iter);
+    return NULL;
+}
+
+// Enables overlap handling (see bam_plp_init_overlaps()) on every
+// single-sample iterator.
+void bam_mplp_init_overlaps(bam_mplp_t iter)
+{
+ int i;
+ for (i = 0; i < iter->n; ++i)
+ bam_plp_init_overlaps(iter->iter[i]);
+}
+
+// Applies the same depth cap to every single-sample iterator.
+void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+{
+    int idx = 0;
+    while (idx < iter->n) {
+        iter->iter[idx]->maxcnt = maxcnt;
+        ++idx;
+    }
+}
+
+// Destroys all single-sample iterators, then the bookkeeping arrays and
+// the multi-iterator itself.
+void bam_mplp_destroy(bam_mplp_t iter)
+{
+    int idx = 0;
+    while (idx < iter->n) {
+        bam_plp_destroy(iter->iter[idx]);
+        ++idx;
+    }
+    free(iter->iter);
+    free(iter->pos);
+    free(iter->n_plp);
+    free(iter->plp);
+    free(iter);
+}
+
+// Advances to the next position covered by at least one input.
+//   _tid, _pos  receive the position
+//   n_plp, plp  arrays of iter->n elements; for inputs that have a column
+//               at this position they receive its size and entries, for
+//               all other inputs 0 and NULL
+// Returns the number of inputs with a column at the position, 0 when all
+// inputs are exhausted and -1 on error.
+int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+{
+    int i, ret = 0;
+    uint64_t new_min = (uint64_t)-1;
+    for (i = 0; i < iter->n; ++i) {
+        // Only inputs that supplied the previous column need to move on.
+        if (iter->pos[i] == iter->min) {
+            int tid = 0, pos = 0;
+            iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
+            if ( iter->iter[i]->error ) return -1;
+            // Pack as tid<<32 | pos; pos goes through uint32_t so that it
+            // can never spill into the tid half of the key.
+            iter->pos[i] = iter->plp[i] ? ((uint64_t)tid<<32 | (uint32_t)pos) : 0;
+        }
+        if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
+    }
+    iter->min = new_min;
+    if (new_min == (uint64_t)-1) return 0;
+    *_tid = new_min>>32; *_pos = (uint32_t)new_min;
+    for (i = 0; i < iter->n; ++i) {
+        // An exhausted input has plp[i] == NULL and pos[i] == 0, which is
+        // also the key of (tid 0, pos 0); require a column so that such an
+        // input is not counted as present.
+        if (iter->plp[i] && iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
+            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
+            ++ret;
+        } else n_plp[i] = 0, plp[i] = 0;
+    }
+    return ret;
+}
+
+#endif // ~!defined(BAM_NO_PILEUP)
diff --git a/htslib/synced_bcf_reader.c b/htslib/synced_bcf_reader.c
new file mode 100644
index 0000000..1e70fc6
--- /dev/null
+++ b/htslib/synced_bcf_reader.c
@@ -0,0 +1,1284 @@
+/* synced_bcf_reader.c -- stream through multiple VCF files.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#include "htslib/synced_bcf_reader.h"
+#include "htslib/kseq.h"
+#include "htslib/khash_str2int.h"
+#include "htslib/bgzf.h"
+
+#define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
+
+// One interval of a region list.
+// NOTE(review): whether the bounds are 0- or 1-based and whether end is
+// inclusive is not visible in this part of the file - confirm at the users.
+typedef struct
+{
+ uint32_t start, end;
+}
+region1_t;
+
+// A list of intervals.
+// NOTE(review): presumably nregs is the number of entries used, mregs the
+// allocated capacity and creg the index of the current entry; the code
+// using these fields is outside this part of the file - confirm.
+typedef struct _region_t
+{
+ region1_t *regs;
+ int nregs, mregs, creg;
+}
+region_t;
+
+static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end);
+static bcf_sr_regions_t *_regions_init_string(const char *str);
+static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
+
+char *bcf_sr_strerror(int errnum)
+{
+ switch (errnum)
+ {
+ case open_failed:
+ return strerror(errno); break;
+ case not_bgzf:
+ return "not compressed with bgzip"; break;
+ case idx_load_failed:
+ return "could not load index"; break;
+ case file_type_error:
+ return "unknown file type"; break;
+ case api_usage_error:
+ return "API usage error"; break;
+ case header_error:
+ return "could not parse header"; break;
+ case no_eof:
+ return "no BGZF EOF marker; file may be truncated"; break;
+ default: return "";
+ }
+}
+
+// Converts a comma-separated list of FILTER names into an array of header
+// ids.  The name "." is stored as -1; names for which bcf_hdr_id2int()
+// returns a negative value are left out.  The number of ids is written to
+// *nfilters and the allocated array is returned.
+static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
+{
+ kstring_t str = {0,0,0};
+ const char *tmp = filters, *prev = filters; // prev: start of the current name, tmp: scan position
+ int nout = 0, *out = NULL;
+ while ( 1 )
+ {
+ if ( *tmp==',' || !*tmp )
+ {
+ // make room for one more id before it is known whether the name is kept
+ // NOTE(review): the realloc() result is not checked
+ out = (int*) realloc(out, (nout+1)*sizeof(int));
+ if ( tmp-prev==1 && *prev=='.' )
+ {
+ out[nout] = -1;
+ nout++;
+ }
+ else
+ {
+ str.l = 0;
+ kputsn(prev, tmp-prev, &str);
+ out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s);
+ if ( out[nout]>=0 ) nout++;
+ }
+ if ( !*tmp ) break;
+ prev = tmp+1;
+ }
+ tmp++;
+ }
+ if ( str.m ) free(str.s);
+ *nfilters = nout;
+ return out;
+}
+
+// Restricts the readers to the given regions (a region string, or a file
+// name when is_file is set).  Must be called before any reader is added;
+// it also makes an index mandatory for all readers.
+// Returns 0 on success, -1 on failure.
+int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
+{
+    assert( !readers->regions );
+    if ( readers->nreaders != 0 )
+    {
+        fprintf(stderr,"[%s:%d %s] Error: bcf_sr_set_regions() must be called before bcf_sr_add_reader()\n", __FILE__,__LINE__,__FUNCTION__);
+        return -1;
+    }
+
+    bcf_sr_regions_t *parsed = bcf_sr_regions_init(regions,is_file,0,1,-2);
+    readers->regions = parsed;
+    if ( parsed == NULL ) return -1;
+
+    readers->require_index = 1;
+    readers->explicit_regs = 1;
+    return 0;
+}
+// Sets the target list (a region string, or a file name when is_file is
+// set).  A leading '^' marks the targets as an exclusion list and is not
+// part of the specification itself.  alleles is stored in targets_als.
+// Returns 0 on success, -1 if the targets could not be initialised.
+int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles)
+{
+    const char *spec = targets;
+
+    assert( !readers->targets );
+    if ( *spec == '^' )
+    {
+        readers->targets_exclude = 1;
+        spec += 1;
+    }
+
+    readers->targets = bcf_sr_regions_init(spec,is_file,0,1,-2);
+    if ( readers->targets == NULL ) return -1;
+
+    readers->targets_als = alleles;
+    return 0;
+}
+
+// Opens fname and appends it to the set of synchronised readers.
+// With require_index set, the file must be a bgzip-compressed VCF or BCF
+// and its index is loaded; otherwise the file is read in streaming mode,
+// which allows a single reader only and no regions.
+// Returns 1 on success; on failure returns 0 with the reason stored in
+// files->errnum (see bcf_sr_strerror()).
+// NOTE(review): every failure after the reader slot has been appended
+// returns with nreaders already incremented and the file still open, and
+// possibly with a NULL header; check that callers and bcf_sr_destroy()
+// cope with such a half-initialised reader.
+int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
+{
+ htsFile* file_ptr = hts_open(fname, "r");
+ if ( ! file_ptr ) {
+ files->errnum = open_failed;
+ return 0;
+ }
+
+ // append one zeroed reader slot (and its has_line flag)
+ // NOTE(review): the realloc() results are not checked
+ files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
+ files->has_line[files->nreaders] = 0;
+ files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
+ bcf_sr_t *reader = &files->readers[files->nreaders++];
+ memset(reader,0,sizeof(bcf_sr_t));
+
+ reader->file = file_ptr;
+
+ files->errnum = 0;
+
+ // a missing EOF block is only warned about; note that errnum stays set to no_eof
+ // even if the function goes on to return 1
+ if ( reader->file->format.compression==bgzf )
+ {
+ BGZF *bgzf = hts_get_bgzfp(reader->file);
+ if ( bgzf && bgzf_check_EOF(bgzf) == 0 ) {
+ files->errnum = no_eof;
+ fprintf(stderr,"[%s] Warning: no BGZF EOF marker; file may be truncated.\n", fname);
+ }
+ }
+
+ if ( files->require_index )
+ {
+ if ( reader->file->format.format==vcf )
+ {
+ if ( reader->file->format.compression!=bgzf )
+ {
+ files->errnum = not_bgzf;
+ return 0;
+ }
+
+ reader->tbx_idx = tbx_index_load(fname);
+ if ( !reader->tbx_idx )
+ {
+ files->errnum = idx_load_failed;
+ return 0;
+ }
+
+ reader->header = bcf_hdr_read(reader->file);
+ }
+ else if ( reader->file->format.format==bcf )
+ {
+ if ( reader->file->format.compression!=bgzf )
+ {
+ files->errnum = not_bgzf;
+ return 0;
+ }
+
+ reader->header = bcf_hdr_read(reader->file);
+
+ reader->bcf_idx = bcf_index_load(fname);
+ if ( !reader->bcf_idx )
+ {
+ files->errnum = idx_load_failed;
+ return 0;
+ }
+ }
+ else
+ {
+ files->errnum = file_type_error;
+ return 0;
+ }
+ }
+ else
+ {
+ // no index required: plain streaming of a single VCF/BCF
+ if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
+ {
+ reader->header = bcf_hdr_read(reader->file);
+ }
+ else
+ {
+ files->errnum = file_type_error;
+ return 0;
+ }
+ files->streaming = 1;
+ }
+ if ( files->streaming && files->nreaders>1 )
+ {
+ files->errnum = api_usage_error;
+ fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
+ return 0;
+ }
+ if ( files->streaming && files->regions )
+ {
+ files->errnum = api_usage_error;
+ fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
+ return 0;
+ }
+ if ( !reader->header )
+ {
+ files->errnum = header_error;
+ return 0;
+ }
+
+ reader->fname = strdup(fname);
+ if ( files->apply_filters )
+ reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);
+
+ // Update list of chromosomes
+ // (only when the caller gave no explicit regions: every sequence of the file becomes a region)
+ if ( !files->explicit_regs && !files->streaming )
+ {
+ int n,i;
+ const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
+ for (i=0; i<n; i++)
+ {
+ if ( !files->regions )
+ files->regions = _regions_init_string(names[i]);
+ else
+ _regions_add(files->regions, names[i], -1, -1);
+ }
+ free(names);
+ }
+
+ return 1;
+}
+
+// Allocates a zero-initialised synced-reader set (NULL if calloc fails).
+bcf_srs_t *bcf_sr_init(void)
+{
+    return (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t));
+}
+
+// Releases everything owned by one reader: file name, indexes, header,
+// file handle, iterator, buffered records, sample map and filter ids.
+// The bcf_sr_t structure itself is not freed.
+static void bcf_sr_destroy1(bcf_sr_t *reader)
+{
+    int slot;
+
+    free(reader->fname);
+    if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx);
+    if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx);
+    bcf_hdr_destroy(reader->header);
+    hts_close(reader->file);
+    if ( reader->itr ) tbx_itr_destroy(reader->itr);
+
+    // All mbuffer allocated slots are destroyed, not just the nbuffer in use.
+    slot = 0;
+    while ( slot < reader->mbuffer )
+    {
+        bcf_destroy1(reader->buffer[slot]);
+        slot++;
+    }
+    free(reader->buffer);
+    free(reader->samples);
+    free(reader->filter_ids);
+}
+// Destroys the reader set: every reader, the per-reader arrays, the sample
+// names, the target and region lists, the scratch string and the set itself.
+void bcf_sr_destroy(bcf_srs_t *files)
+{
+    int idx;
+
+    for (idx = 0; idx < files->nreaders; idx++)
+    {
+        bcf_sr_destroy1(&files->readers[idx]);
+    }
+    free(files->has_line);
+    free(files->readers);
+
+    for (idx = 0; idx < files->n_smpl; idx++)
+    {
+        free(files->samples[idx]);
+    }
+    free(files->samples);
+
+    if ( files->targets ) bcf_sr_regions_destroy(files->targets);
+    if ( files->regions ) bcf_sr_regions_destroy(files->regions);
+    if ( files->tmps.m ) free(files->tmps.s);
+    free(files);
+}
+
+// Removes reader i from the set: its resources are released and the
+// readers (and has_line flags) behind it are shifted down by one.
+// Not supported once a sample list has been set up.
+void bcf_sr_remove_reader(bcf_srs_t *files, int i)
+{
+    assert( !files->samples ); // not ready for this yet
+    bcf_sr_destroy1(&files->readers[i]);
+
+    // Number of readers stored after slot i.
+    const int ntrailing = files->nreaders - i - 1;
+    if ( ntrailing > 0 )
+    {
+        memmove(&files->readers[i], &files->readers[i+1], ntrailing*sizeof(bcf_sr_t));
+        memmove(&files->has_line[i], &files->has_line[i+1], ntrailing*sizeof(int));
+    }
+    files->nreaders--;
+}
+
+
+/*
+ Removes duplicate records from the buffer. The meaning of "duplicate" is
+ controlled by the $collapse variable, which can cause that from multiple
+ <indel|snp|any> lines only the first is considered and the rest is ignored.
+ The removal is done by setting the redundant lines' positions to -1 and
+ moving these lines at the end of the buffer.
+ */
+// Works on buffer[1..nbuffer]: the scan stops at the first record whose
+// position differs from buffer[1].  Afterwards nbuffer counts only the
+// records that were kept.
+// NOTE(review): buffer[0] is not touched here; presumably it holds the
+// current line (debug_buffer() marks it with "*") - confirm.
+static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader)
+{
+ int irec,jrec, has_snp=0, has_indel=0, has_any=0;
+ // pass 1: mark every redundant record by setting its position to -1
+ for (irec=1; irec<=reader->nbuffer; irec++)
+ {
+ bcf1_t *line = reader->buffer[irec];
+ if ( line->pos != reader->buffer[1]->pos ) break;
+ if ( files->collapse&COLLAPSE_ANY )
+ {
+ if ( !has_any ) has_any = 1;
+ else line->pos = -1;
+ }
+ int line_type = bcf_get_variant_types(line);
+ if ( files->collapse&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) )
+ {
+ if ( !has_snp ) has_snp = 1;
+ else line->pos = -1;
+ }
+ if ( files->collapse&COLLAPSE_INDELS && line_type&VCF_INDEL )
+ {
+ if ( !has_indel ) has_indel = 1;
+ else line->pos = -1;
+ }
+ }
+ // pass 2: compact - swap each marked record with the next unmarked one behind it
+ bcf1_t *tmp;
+ irec = jrec = 1;
+ while ( irec<=reader->nbuffer && jrec<=reader->nbuffer )
+ {
+ if ( reader->buffer[irec]->pos != -1 ) { irec++; continue; }
+ if ( jrec<=irec ) jrec = irec+1;
+ while ( jrec<=reader->nbuffer && reader->buffer[jrec]->pos==-1 ) jrec++;
+ if ( jrec<=reader->nbuffer )
+ {
+ tmp = reader->buffer[irec]; reader->buffer[irec] = reader->buffer[jrec]; reader->buffer[jrec] = tmp;
+ }
+ }
+ // irec stopped on the first marked record (or one past the end)
+ reader->nbuffer = irec - 1;
+}
+
+/*
+ *  debug_buffer() - prints one line per buffer slot 0..nbuffer of the reader
+ *  to $fp: the record's address, the file name (slot 0 is marked with "*"),
+ *  the contig name and 1-based position, and the alleles.
+ */
+void debug_buffer(FILE *fp, bcf_sr_t *reader)
+{
+    int j;
+    for (j=0; j<=reader->nbuffer; j++)
+    {
+        bcf1_t *line = reader->buffer[j];
+        // %p requires a void pointer argument, hence the explicit cast
+        fprintf(fp,"\t%p\t%s%s\t%s:%d\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:"");
+        int k;
+        for (k=1; k<line->n_allele; k++) fprintf(fp," %s", line->d.allele[k]);
+        fprintf(fp,"\n");
+    }
+}
+
+/*
+ *  debug_buffers() - for every reader prints its has_line flag and file name,
+ *  followed by a dump of its buffer; a blank line ends the report.
+ */
+void debug_buffers(FILE *fp, bcf_srs_t *files)
+{
+    int ireader = 0;
+    while ( ireader < files->nreaders )
+    {
+        bcf_sr_t *reader = &files->readers[ireader];
+        fprintf(fp, "has_line: %d\t%s\n", bcf_sr_has_line(files,ireader),reader->fname);
+        debug_buffer(fp, reader);
+        ireader++;
+    }
+    fprintf(fp,"\n");
+}
+
+/*
+ *  has_filter() - returns 1 if the record passes the reader's filter list,
+ *  0 otherwise. A record with no FILTER entries passes only when one of the
+ *  requested ids is negative (presumably the id used for the unset filter -
+ *  TODO confirm); otherwise any FILTER id shared with the list is enough.
+ */
+static inline int has_filter(bcf_sr_t *reader, bcf1_t *line)
+{
+    int nflt = line->d.n_flt;
+    int iwant, ihave;
+    for (iwant=0; iwant<reader->nfilter_ids; iwant++)
+    {
+        int want = reader->filter_ids[iwant];
+        if ( !nflt )
+        {
+            // Nothing set in the record: only a negative id can match
+            if ( want < 0 ) return 1;
+            continue;
+        }
+        for (ihave=0; ihave<nflt; ihave++)
+            if ( line->d.flt[ihave]==want ) return 1;
+    }
+    return 0;
+}
+
+/*
+ *  _reader_seek() - positions the reader on seq:start-end (0-based, inclusive)
+ *  using either its tabix or its BCF index. Any previous iterator is dropped
+ *  and the buffer is emptied. Exits when $end is at or beyond MAX_CSI_COOR.
+ *  Returns 0 on success or -1 when the sequence is not present in this file.
+ */
+static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end)
+{
+    if ( end >= MAX_CSI_COOR )
+    {
+        fprintf(stderr,"The coordinate is out of csi index limit: %d\n", end+1);
+        exit(1);
+    }
+
+    // Drop whatever the previous query left behind
+    if ( reader->itr )
+    {
+        hts_itr_destroy(reader->itr);
+        reader->itr = NULL;
+    }
+    reader->nbuffer = 0;
+
+    const int use_tbx = reader->tbx_idx ? 1 : 0;
+    int tid = use_tbx ? tbx_name2id(reader->tbx_idx, seq) : bcf_hdr_name2id(reader->header, seq);
+    if ( tid == -1 ) return -1;     // the sequence not present in this file
+
+    // The query interval is passed with an exclusive end, hence end+1
+    if ( use_tbx )
+        reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,start,end+1);
+    else
+        reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1);
+
+    if ( reader->itr == NULL )
+        fprintf(stderr,"Could not seek: %s:%d-%d\n",seq,start+1,end+1);
+    assert(reader->itr);
+    return 0;
+}
+
+/*
+ *  _readers_next_region() - jumps to the next region if necessary, that is
+ *  when no reader has an open iterator or buffered lines left.
+ *  Returns 0 on success or -1 when there are no more regions left.
+ */
+static int _readers_next_region(bcf_srs_t *files)
+{
+    int i;
+
+    // As long as any reader still has an iterator or buffered lines, stay put
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *rd = &files->readers[i];
+        if ( rd->itr || rd->nbuffer ) return 0;
+    }
+
+    // Everything has been consumed: open a new region or quit
+    bcf_sr_regions_t *reg = files->regions;
+    if ( bcf_sr_regions_next(reg) < 0 ) return -1;
+
+    for (i=0; i<files->nreaders; i++)
+        _reader_seek(&files->readers[i], reg->seq_names[reg->iseq], reg->start, reg->end);
+
+    return 0;
+}
+
+/*
+ *  _reader_fill_buffer() - buffers all records with the same coordinate
+ *
+ *  Records are stored from buffer[1] onwards (buffer[0] is left for the
+ *  active line). Reading continues until a record with a different position
+ *  than buffer[1] has been stored or the input is exhausted. When a filter
+ *  list is set, records failing has_filter() are skipped. Once reading or
+ *  parsing fails, the reader's iterator is destroyed. If collapsing is
+ *  enabled and the first two buffered records share a position, the buffer
+ *  is passed to collapse_buffer().
+ */
+static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
+{
+    // Return if the buffer is full: the coordinate of the last buffered record differs
+    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;
+
+    // No iterator (sequence not present in this file) and not streaming
+    if ( !reader->itr && !files->streaming ) return;
+
+    // Fill the buffer with records starting at the same position
+    int i, ret = 0;
+    while (1)
+    {
+        if ( reader->nbuffer+1 >= reader->mbuffer )
+        {
+            // Grow by eight slots; keep the old pointer until realloc is known to have succeeded
+            bcf1_t **grown = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*(reader->mbuffer+8));
+            if ( !grown )
+            {
+                fprintf(stderr,"[%s:%d %s] Could not allocate memory\n", __FILE__,__LINE__,__FUNCTION__);
+                exit(1);
+            }
+            reader->buffer   = grown;
+            reader->mbuffer += 8;
+            for (i=8; i>0; i--)     // initialize the new slots
+            {
+                reader->buffer[reader->mbuffer-i] = bcf_init1();
+                reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
+                reader->buffer[reader->mbuffer-i]->pos = -1;    // for rare cases when VCF starts from 1
+            }
+        }
+
+        // The slot the next record is read into; it is reused when the record is filtered out
+        bcf1_t *rec = reader->buffer[reader->nbuffer+1];
+
+        if ( files->streaming )
+        {
+            if ( reader->file->format.format==vcf )
+            {
+                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
+                // Store the parse status in the outer $ret so that a failure is handled like end of input
+                if ( (ret=vcf_parse1(&files->tmps, reader->header, rec)) < 0 ) break;
+            }
+            else if ( reader->file->format.format==bcf )
+            {
+                if ( (ret=bcf_read1(reader->file, reader->header, rec)) < 0 ) break;   // no more lines
+            }
+            else
+            {
+                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
+                exit(1);
+            }
+        }
+        else if ( reader->tbx_idx )
+        {
+            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
+            // Do not buffer a record which could not be parsed
+            if ( (ret=vcf_parse1(&files->tmps, reader->header, rec)) < 0 ) break;
+        }
+        else
+        {
+            if ( (ret=bcf_itr_next(reader->file, reader->itr, rec)) < 0 ) break;   // no more lines
+            bcf_subset_format(reader->header,rec);
+        }
+
+        // apply filter
+        if ( !reader->nfilter_ids )
+            bcf_unpack(rec, BCF_UN_STR);
+        else
+        {
+            bcf_unpack(rec, BCF_UN_STR|BCF_UN_FLT);
+            if ( !has_filter(reader, rec) ) continue;
+        }
+        reader->nbuffer++;
+
+        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
+    }
+    if ( ret<0 )
+    {
+        // done for this region
+        tbx_itr_destroy(reader->itr);
+        reader->itr = NULL;
+    }
+    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
+        collapse_buffer(files, reader);
+}
+
+/*
+ *  _reader_shift_buffer() - removes the first line and all subsequent lines
+ *  with the same position
+ */
+static void _reader_shift_buffer(bcf_sr_t *reader)
+{
+    // Find the first record whose position differs from that of buffer[1]
+    int next = 2;
+    while ( next<=reader->nbuffer && reader->buffer[next]->pos==reader->buffer[1]->pos ) next++;
+
+    if ( next > reader->nbuffer )
+    {
+        reader->nbuffer = 0;    // no other line
+        return;
+    }
+
+    // A record with a different position follows, swap it to the front. Because
+    // of the reader's logic, only one such line can be present.
+    bcf1_t *head = reader->buffer[1];
+    reader->buffer[1]    = reader->buffer[next];
+    reader->buffer[next] = head;
+    reader->nbuffer = 1;
+}
+
+/*
+ * _reader_match_alleles() - from multiple buffered lines selects the one which
+ * corresponds best to the template line. The logic is controlled by COLLAPSE_*
+ * Only lines sharing the position of buffer[1] are considered. When no template
+ * is given, buffer[1] is selected. The selected line is moved to buffer[0], the
+ * lines behind it are shifted one slot towards the front, the record previously
+ * held in buffer[0] is parked in the last used slot and nbuffer is decremented.
+ * Returns 0 on success or -1 when no good matching line is found.
+ */
+static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl)
+{
+ int i, irec = -1;
+
+ // if no template given, use the first available record
+ if ( !tmpl )
+ irec = 1;
+ else
+ {
+ int tmpl_type = bcf_get_variant_types(tmpl);
+ for (i=1; i<=reader->nbuffer; i++)
+ {
+ bcf1_t *line = reader->buffer[i];
+ if ( line->pos != reader->buffer[1]->pos ) break; // done with this reader
+
+ // Easiest case: matching by position only
+ if ( files->collapse&COLLAPSE_ANY ) { irec=i; break; }
+
+ int line_type = bcf_get_variant_types(line);
+
+ // No matter what the alleles are, as long as they are both SNPs
+ // (only VCF_SNP is tested here, VCF_MNP is not)
+ if ( files->collapse&COLLAPSE_SNPS && tmpl_type&VCF_SNP && line_type&VCF_SNP ) { irec=i; break; }
+ // ... or indels
+ if ( files->collapse&COLLAPSE_INDELS && tmpl_type&VCF_INDEL && line_type&VCF_INDEL ) { irec=i; break; }
+
+ // More thorough checking: REFs must match
+ if ( tmpl->rlen != line->rlen ) continue; // different length
+ if ( !tmpl->d.allele || !line->d.allele ) continue; // one of the lines is empty, has someone swapped the buffered lines?!
+ if ( strcmp(tmpl->d.allele[0], line->d.allele[0]) ) continue; // the strings do not match
+
+ int ial,jal;
+ if ( files->collapse==COLLAPSE_NONE )
+ {
+ // Exact match, all alleles must be identical
+ if ( tmpl->n_allele!=line->n_allele ) continue; // different number of alleles, skip
+
+ int nmatch = 1; // REF has been already checked
+ for (ial=1; ial<tmpl->n_allele; ial++)
+ {
+ for (jal=1; jal<line->n_allele; jal++)
+ if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
+ }
+ if ( nmatch==tmpl->n_allele ) { irec=i; break; } // found: exact match
+ continue;
+ }
+
+ if ( line->n_allele==1 && tmpl->n_allele==1 ) { irec=i; break; } // both sites are non-variant
+
+ // COLLAPSE_SOME: at least some ALTs must match
+ for (ial=1; ial<tmpl->n_allele; ial++)
+ {
+ for (jal=1; jal<line->n_allele; jal++)
+ if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { irec=i; break; }
+ if ( irec>=1 ) break; // leave the loop over the template ALTs as well
+ }
+ if ( irec>=1 ) break; // a shared ALT was found, stop scanning the buffer
+ }
+ if ( irec==-1 ) return -1; // no matching line was found
+ }
+
+ // Set the selected line (irec) as active: set it to buffer[0], move the remaining lines forward
+ // and put the old bcf1_t record at the end.
+ bcf1_t *tmp = reader->buffer[0];
+ reader->buffer[0] = reader->buffer[irec];
+ for (i=irec+1; i<=reader->nbuffer; i++) reader->buffer[i-1] = reader->buffer[i];
+ reader->buffer[ reader->nbuffer ] = tmp;
+ reader->nbuffer--;
+
+ return 0;
+}
+
+/*
+ *  _reader_next_line() - advances all readers to the next position to be
+ *  output. The smallest position found at buffer[1] across the readers is
+ *  chosen; positions rejected by the targets are dropped from the buffers and
+ *  the search is repeated. For each reader holding a matching record at the
+ *  chosen position, the record is moved to buffer[0] and has_line[] is set.
+ *  Returns the number of readers with has_line set.
+ */
+int _reader_next_line(bcf_srs_t *files)
+{
+    int ir, min_pos = INT_MAX;
+
+    // Loop until next suitable line is found or all readers have finished
+    for (;;)
+    {
+        // Get all readers ready for the next region.
+        if ( files->regions && _readers_next_region(files) < 0 ) break;
+
+        // Top up the buffers and keep track of the minimum coordinate
+        const char *chr = NULL;
+        for (ir=0; ir<files->nreaders; ir++)
+        {
+            bcf_sr_t *rd = &files->readers[ir];
+            _reader_fill_buffer(files, rd);
+
+            if ( rd->nbuffer && rd->buffer[1]->pos < min_pos )
+            {
+                min_pos = rd->buffer[1]->pos;
+                chr = bcf_seqname(rd->header, rd->buffer[1]);
+            }
+        }
+        if ( min_pos == INT_MAX )
+        {
+            // Nothing buffered anywhere: move on to the next region, if there are regions at all
+            if ( files->regions ) continue;
+            break;
+        }
+
+        if ( !files->targets ) break;   // done: min_pos is set
+
+        // Skip this position if the targets do not admit it
+        int ovl  = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos);
+        int skip = files->targets_exclude ? ovl==0 : ovl<0;
+        if ( !skip ) break;             // done: min_pos is set
+
+        // Remove all lines with this position from the buffers and try again
+        for (ir=0; ir<files->nreaders; ir++)
+        {
+            bcf_sr_t *rd = &files->readers[ir];
+            if ( rd->nbuffer && rd->buffer[1]->pos==min_pos ) _reader_shift_buffer(rd);
+        }
+        min_pos = INT_MAX;
+    }
+
+    // There can be records with duplicate positions. Set the active line intelligently so that
+    // the alleles match.
+    int nactive = 0;            // number of readers sharing the position
+    bcf1_t *tmpl = NULL;        // record which will be used for allele matching
+    for (ir=0; ir<files->nreaders; ir++)
+    {
+        bcf_sr_t *rd = &files->readers[ir];
+        files->has_line[ir] = 0;
+
+        // Skip readers with no records at this position
+        if ( !rd->nbuffer || rd->buffer[1]->pos != min_pos ) continue;
+
+        // Until now buffer[0] of all readers was empty and the lines started at buffer[1].
+        // Now lines which are ready to be output will be moved to buffer[0].
+        if ( _reader_match_alleles(files, rd, tmpl) < 0 ) continue;
+        if ( !tmpl ) tmpl = rd->buffer[0];
+
+        files->has_line[ir] = 1;
+        nactive++;
+    }
+    return nactive;
+}
+
+/*
+ *  bcf_sr_next_line() - returns the number of readers which have a line at
+ *  the next position, 0 when there is nothing left. When targets_als is set,
+ *  lines are additionally checked against the target alleles: a mismatching
+ *  line is passed over only while some reader still buffers another record at
+ *  the same position; otherwise it is returned regardless.
+ */
+int bcf_sr_next_line(bcf_srs_t *files)
+{
+    if ( !files->targets_als )
+        return _reader_next_line(files);
+
+    for (;;)
+    {
+        int nlines = _reader_next_line(files);
+        if ( nlines == 0 ) return nlines;
+
+        // The first reader holding a line supplies the record to test
+        int ifirst = 0;
+        while ( ifirst < files->nreaders && !files->has_line[ifirst] ) ifirst++;
+
+        if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[ifirst].buffer[0]) ) return nlines;
+
+        // Check if there are more duplicate lines in the buffers. If not, return this line as if it
+        // matched the targets, even if there is a type mismatch
+        int idup;
+        for (idup=0; idup<files->nreaders; idup++)
+        {
+            bcf_sr_t *rd = &files->readers[idup];
+            if ( files->has_line[idup] && rd->nbuffer!=0 && rd->buffer[1]->pos==rd->buffer[0]->pos ) break;
+        }
+        if ( idup == files->nreaders ) return nlines;  // no more lines left, output even if target alleles are not of the same type
+    }
+}
+
+/*
+ *  bcf_sr_seek_start() - rewinds the regions to the beginning: the current
+ *  region index of every sequence is reset to -1 and the first sequence is
+ *  made current.
+ *  NOTE(review): reg->regs is dereferenced unconditionally - confirm this is
+ *  never reached with tabix-backed regions, where regs appears to stay unset.
+ */
+static void bcf_sr_seek_start(bcf_srs_t *readers)
+{
+    bcf_sr_regions_t *reg = readers->regions;
+    int iseq = 0;
+    while ( iseq < reg->nseqs )
+    {
+        reg->regs[iseq].creg = -1;
+        iseq++;
+    }
+    reg->iseq = 0;
+}
+
+
+/*
+ *  bcf_sr_seek() - repositions the regions and all readers on seq:pos.
+ *  A NULL $seq together with a zero $pos rewinds to the start. Nothing is
+ *  done when no regions are set. Returns the sum of the _reader_seek()
+ *  return values, i.e. 0 when every reader could be positioned.
+ */
+int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos)
+{
+    if ( !readers->regions ) return 0;
+
+    if ( seq == NULL && pos == 0 )
+    {
+        // seek to start
+        bcf_sr_seek_start(readers);
+        return 0;
+    }
+
+    bcf_sr_regions_overlap(readers->regions, seq, pos, pos);
+
+    int status = 0, ir;
+    for (ir=0; ir<readers->nreaders; ir++)
+        status += _reader_seek(readers->readers + ir, seq, pos, MAX_CSI_COOR-1);
+    return status;
+}
+
+/*
+ *  bcf_sr_set_samples() - sets the list of samples shared by all readers
+ *  @fname:    "-" for all samples of the first reader, otherwise a list or
+ *             a file name handed to hts_readlist(); a leading "^" requests
+ *             exclusion
+ *  @is_file:  passed on to hts_readlist()
+ *
+ *  Samples missing from any of the readers are skipped with a warning. For
+ *  each reader the header indexes of the selected samples are stored in
+ *  reader->samples. Returns 1 on success, 0 when the list could not be read
+ *  or no sample is left.
+ *
+ *  NOTE(review): with "^" the list is read from $fname as given and the loop
+ *  below walks that same list, so every entry appears to be skipped as
+ *  excluded - confirm the intended semantics against the callers.
+ */
+int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
+{
+    int i, j, nsmpl, free_smpl = 0;
+    char **smpl = NULL;
+
+    void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
+    if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
+    {
+        smpl = hts_readlist(fname, is_file, &nsmpl);
+        if ( !smpl )
+        {
+            fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
+            if ( exclude ) khash_str2int_destroy(exclude);  // do not leak the hash on failure
+            return 0;
+        }
+        if ( exclude )
+        {
+            for (i=0; i<nsmpl; i++)
+                khash_str2int_inc(exclude, smpl[i]);
+        }
+        free_smpl = 1;
+    }
+    if ( !smpl )
+    {
+        smpl = files->readers[0].header->samples;   // intersection of all samples
+        nsmpl = bcf_hdr_nsamples(files->readers[0].header);
+    }
+
+    files->samples = NULL;
+    files->n_smpl = 0;
+    for (i=0; i<nsmpl; i++)
+    {
+        if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue;
+
+        // Count the leading readers which know this sample; stop at the first which does not
+        int n_isec = 0;
+        for (j=0; j<files->nreaders; j++)
+        {
+            if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
+            n_isec++;
+        }
+        if ( n_isec!=files->nreaders )
+        {
+            // n_isec is the index of the first reader lacking the sample
+            fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
+            continue;
+        }
+
+        files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
+        files->samples[files->n_smpl++] = strdup(smpl[i]);
+    }
+
+    if ( exclude ) khash_str2int_destroy(exclude);
+    if ( free_smpl )
+    {
+        for (i=0; i<nsmpl; i++) free(smpl[i]);
+        free(smpl);
+    }
+
+    if ( !files->n_smpl )
+    {
+        if ( files->nreaders>1 )
+            fprintf(stderr,"No samples in common.\n");
+        return 0;
+    }
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        reader->samples = (int*) malloc(sizeof(int)*files->n_smpl);
+        reader->n_smpl = files->n_smpl;
+        for (j=0; j<files->n_smpl; j++)
+            reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
+    }
+    return 1;
+}
+
+// Add a new region into a per-chromosome list kept sorted by start,end.
+// On input the coordinates are 1-based, they are stored 0-based, inclusive.
+// The pair start=-1,end=-1 stands for the whole chromosome. A region which
+// is already present is not added again.
+static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end)
+{
+    if ( start==-1 && end==-1 )
+    {
+        start = 0; end = MAX_CSI_COOR-1;
+    }
+    else
+    {
+        start--; end--; // store 0-based coordinates
+    }
+
+    if ( !reg->seq_hash )
+        reg->seq_hash = khash_str2int_init();
+
+    int iseq;
+    if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 )
+    {
+        // the chromosome block does not exist
+        iseq = reg->nseqs++;
+        reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs);
+        reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs);
+        memset(&reg->regs[reg->nseqs-1],0,sizeof(region_t));
+        reg->seq_names[iseq] = strdup(chr);
+        reg->regs[iseq].creg = -1;
+        khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq);
+    }
+
+    region_t *creg = &reg->regs[iseq];
+
+    // The regions may not be sorted on input: binary search on the (start,end)
+    // pair. Comparing both coordinates ensures that a region sharing its start
+    // with an existing one is still inserted at its sorted position.
+    int lo = 0, hi = creg->nregs - 1, found = 0;
+    while ( lo<=hi )
+    {
+        int mid = lo + (hi-lo)/2;
+        const region1_t *cur = &creg->regs[mid];
+        if ( start < cur->start || (start==cur->start && end < cur->end) ) hi = mid - 1;
+        else if ( start > cur->start || end > cur->end ) lo = mid + 1;
+        else { found = 1; break; }
+    }
+    if ( found ) return;
+
+    // no such region, insert a new one at index lo (that is just after hi)
+    hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs);
+    if ( lo < creg->nregs )
+        memmove(&creg->regs[lo+1],&creg->regs[lo],(creg->nregs - lo)*sizeof(region1_t));
+    creg->regs[lo].start = start;
+    creg->regs[lo].end   = end;
+    creg->nregs++;
+}
+
+// Creates in-memory regions from a comma-separated list of genomic locations
+// of the form chr, chr:pos, chr:from-to or chr:from-. Returns NULL when the
+// string cannot be parsed as such a list or when memory cannot be allocated.
+static bcf_sr_regions_t *_regions_init_string(const char *str)
+{
+    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
+    if ( !reg ) return NULL;
+    reg->start = reg->end = -1;
+    reg->prev_start = reg->prev_seq = -1;
+
+    kstring_t tmp = {0,0,0};
+    const char *sp = str, *ep = str;
+    int from, to;
+    while ( 1 )
+    {
+        // sp..ep delimits the sequence name
+        while ( *ep && *ep!=',' && *ep!=':' ) ep++;
+        tmp.l = 0;
+        kputsn(sp,ep-sp,&tmp);
+        if ( *ep==':' )
+        {
+            sp = ep+1;
+            from = hts_parse_decimal(sp,(char**)&ep,0);
+            if ( sp==ep )
+            {
+                fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
+                // release also the regions added so far, not only the top-level structure
+                bcf_sr_regions_destroy(reg); free(tmp.s); return NULL;
+            }
+            if ( !*ep || *ep==',' )
+            {
+                // chr:pos, a single position
+                _regions_add(reg, tmp.s, from, from);
+                sp = ep;
+                continue;
+            }
+            if ( *ep!='-' )
+            {
+                fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
+                bcf_sr_regions_destroy(reg); free(tmp.s); return NULL;
+            }
+            ep++;
+            sp = ep;
+            to = hts_parse_decimal(sp,(char**)&ep,0);
+            if ( *ep && *ep!=',' )
+            {
+                fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
+                bcf_sr_regions_destroy(reg); free(tmp.s); return NULL;
+            }
+            if ( sp==ep ) to = MAX_CSI_COOR-1;  // open-ended interval, chr:from-
+            _regions_add(reg, tmp.s, from, to);
+            if ( !*ep ) break;
+            sp = ep;
+        }
+        else
+        {
+            // a sequence name alone selects the whole sequence
+            if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
+            if ( !*ep ) break;
+            sp = ++ep;
+        }
+    }
+    free(tmp.s);
+    return reg;
+}
+
+// Extracts the chromosome and the from/to coordinates from a tab-delimited line.
+// ichr,ifrom,ito are 0-based column indexes;
+// on success *chr points to the start of the chromosome column and *chr_end just
+// behind it (a tab or the terminating zero), the line itself is not modified;
+// *chr_end is set to NULL on entry and stays NULL unless 1 is returned;
+// returns -1 on error (a column is missing or a coordinate cannot be parsed),
+// 0 if the line is a comment line, 1 on success
+static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to)
+{
+ *chr_end = NULL;
+
+ if ( line[0]=='#' ) return 0;
+
+ int k,l; // index of the start and end column of the tab-delimited file
+ if ( ifrom <= ito )
+ k = ifrom, l = ito;
+ else
+ l = ifrom, k = ito;
+
+ int i;
+ char *se = line, *ss = NULL; // start and end
+ char *tmp;
+ // walk to column k: ss is left at its first character, se at the delimiter behind it
+ for (i=0; i<=k && *se; i++)
+ {
+ ss = i==0 ? se++ : ++se;
+ while (*se && *se!='\t') se++;
+ }
+ if ( i<=k ) return -1; // the line ended before column k
+ if ( k==l )
+ {
+ // a single column serves as both from and to
+ *from = *to = hts_parse_decimal(ss, &tmp, 0);
+ if ( tmp==ss ) return -1;
+ }
+ else
+ {
+ // column k comes first in the line; it holds "from" or "to", depending on the column order
+ if ( k==ifrom )
+ *from = hts_parse_decimal(ss, &tmp, 0);
+ else
+ *to = hts_parse_decimal(ss, &tmp, 0);
+ if ( ss==tmp ) return -1;
+
+ // continue from column k to column l
+ for (i=k; i<l && *se; i++)
+ {
+ ss = ++se;
+ while (*se && *se!='\t') se++;
+ }
+ if ( i<l ) return -1; // the line ended before column l
+ if ( k==ifrom )
+ *to = hts_parse_decimal(ss, &tmp, 0);
+ else
+ *from = hts_parse_decimal(ss, &tmp, 0);
+ if ( ss==tmp ) return -1;
+ }
+
+ // restart from the beginning of the line to locate the chromosome column
+ ss = se = line;
+ for (i=0; i<=ichr && *se; i++)
+ {
+ if ( i>0 ) ss = ++se;
+ while (*se && *se!='\t') se++;
+ }
+ if ( i<=ichr ) return -1;
+ *chr_end = se;
+ *chr = ss;
+ return 1;
+}
+
+/*
+ *  bcf_sr_regions_init() - creates a regions structure
+ *  @regions:   list of regions (see _regions_init_string) or a file name
+ *  @is_file:   zero if $regions is a list, non-zero if it is a file name
+ *  @ichr,@ifrom,@ito:  0-based indexes of the chromosome, start and end
+ *              columns. With a negative $ito, abs(ito) is tried first and
+ *              the $ifrom column is used for both coordinates when that fails.
+ *
+ *  A file with a tabix index is set up for reading through the index; the
+ *  column arguments are then not used by this function. Without an index the
+ *  whole file is read into memory. File names ending with ".bed" or ".bed.gz"
+ *  have their start coordinate incremented by one before it is stored.
+ *  Returns NULL on error or when the file defines no regions.
+ */
+bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
+{
+    bcf_sr_regions_t *reg;
+    if ( !is_file ) return _regions_init_string(regions);
+
+    reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
+    if ( !reg ) return NULL;
+    reg->start = reg->end = -1;
+    reg->prev_start = reg->prev_seq = -1;
+
+    reg->file = hts_open(regions, "rb");
+    if ( !reg->file )
+    {
+        fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
+        free(reg);
+        return NULL;
+    }
+
+    reg->tbx = tbx_index_load(regions);
+    if ( !reg->tbx )
+    {
+        // Look at the suffix only if the name is long enough to have one
+        size_t len = strlen(regions);
+        int is_bed = 0;
+        if ( len>=4 && !strcasecmp(".bed",regions+len-4) ) is_bed = 1;
+        else if ( len>=7 && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;
+
+        if ( reg->file->format.format==vcf ) ito = 1;
+
+        // read the whole file, tabix index is not present
+        while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
+        {
+            char *chr, *chr_end;
+            int from, to, ret;
+            ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
+            if ( ret < 0 )
+            {
+                if ( ito<0 )
+                    ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
+                if ( ret<0 )
+                {
+                    fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
+                    // closes the file and releases the line buffer and the regions read so far
+                    bcf_sr_regions_destroy(reg);
+                    return NULL;
+                }
+            }
+            if ( !ret ) continue;
+            if ( is_bed ) from++;
+
+            // Terminate the chromosome name temporarily. The original character is
+            // restored afterwards; it can be the end of the line rather than a tab.
+            char delim = *chr_end;
+            *chr_end = 0;
+            _regions_add(reg, chr, from, to);
+            *chr_end = delim;
+        }
+        hts_close(reg->file); reg->file = NULL;
+        if ( !reg->nseqs ) { bcf_sr_regions_destroy(reg); return NULL; }
+        return reg;
+    }
+
+    reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
+    if ( !reg->seq_hash )
+        reg->seq_hash = khash_str2int_init();
+    int i;
+    for (i=0; i<reg->nseqs; i++)
+    {
+        khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
+    }
+    reg->fname  = strdup(regions);
+    reg->is_bin = 1;
+    return reg;
+}
+
+/*
+ *  bcf_sr_regions_destroy() - releases everything owned by $reg and $reg
+ *  itself: the iterator, the tabix index, the open file, the allele and line
+ *  buffers, the per-sequence region lists and the name dictionary.
+ */
+void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
+{
+    free(reg->fname);
+    if ( reg->itr ) tbx_itr_destroy(reg->itr);
+    if ( reg->tbx ) tbx_destroy(reg->tbx);
+    if ( reg->file ) hts_close(reg->file);
+
+    // free() accepts NULL, no need to test the buffers first
+    free(reg->als);
+    free(reg->als_str.s);
+    free(reg->line.s);
+
+    if ( reg->regs )
+    {
+        // free only in-memory names, tbx names are const
+        int iseq = 0;
+        while ( iseq < reg->nseqs )
+        {
+            free(reg->seq_names[iseq]);
+            free(reg->regs[iseq].regs);
+            iseq++;
+        }
+    }
+    free(reg->regs);
+    free(reg->seq_names);
+    khash_str2int_destroy(reg->seq_hash);
+    free(reg);
+}
+
+/*
+ *  bcf_sr_regions_seek() - positions the regions at the beginning of the
+ *  sequence $seq. The current region is invalidated (start and end are set
+ *  to -1). Returns 0 on success or -1 when the sequence is not among the
+ *  regions or the tabix query fails.
+ */
+int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
+{
+    reg->iseq = reg->start = reg->end = -1;
+    if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1;    // sequence seq not in regions
+
+    // using in-memory regions
+    if ( reg->regs )
+    {
+        reg->regs[reg->iseq].creg = -1;
+        return 0;
+    }
+
+    // reading regions from tabix
+    if ( reg->itr ) tbx_itr_destroy(reg->itr);
+    reg->itr = tbx_itr_querys(reg->tbx, seq);
+    if ( reg->itr ) return 0;
+
+    return -1;
+}
+
+/*
+ *  bcf_sr_regions_next() - advances to the next region and stores it in
+ *  reg->iseq, reg->start and reg->end (0-based, inclusive). In-memory regions
+ *  are walked sequence by sequence; otherwise the next non-comment line is
+ *  read from the regions file, through the tabix iterator if one is open.
+ *  Returns 0 on success or -1 when no regions are left or on error. The
+ *  structure stays owned by the caller in all cases.
+ */
+int bcf_sr_regions_next(bcf_sr_regions_t *reg)
+{
+    if ( reg->iseq<0 ) return -1;
+    reg->start = reg->end = -1;
+    reg->nals = 0;
+
+    // using in-memory regions
+    if ( reg->regs )
+    {
+        while ( reg->iseq < reg->nseqs )
+        {
+            reg->regs[reg->iseq].creg++;
+            if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
+            reg->iseq++;
+        }
+        if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left
+        region1_t *creg = &reg->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
+        reg->start = creg->start;
+        reg->end   = creg->end;
+        return 0;
+    }
+
+    // reading from tabix
+    char *chr, *chr_end;
+    int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
+    if ( reg->tbx )
+    {
+        // the column numbers in the index configuration are 1-based
+        ichr   = reg->tbx->conf.sc-1;
+        ifrom  = reg->tbx->conf.bc-1;
+        ito    = reg->tbx->conf.ec-1;
+        if ( ito<0 ) ito = ifrom;
+        is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0;
+    }
+
+    int ret = 0;
+    while ( !ret )  // _regions_parse_line returns 0 for comment lines, keep reading
+    {
+        if ( reg->itr )
+        {
+            // tabix index present, reading a chromosome block
+            ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, &reg->line);
+            if ( ret<0 ) { reg->iseq = -1; return -1; }
+        }
+        else
+        {
+            if ( reg->is_bin )
+            {
+                // Waited for seek which never came. Reopen in text mode and stream
+                // through the regions, otherwise hts_getline would fail
+                hts_close(reg->file);
+                reg->file = hts_open(reg->fname, "r");
+                if ( !reg->file )
+                {
+                    fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname);
+                    // The caller still owns $reg and will destroy it; only mark it as exhausted
+                    reg->iseq = -1;
+                    return -1;
+                }
+                reg->is_bin = 0;
+            }
+
+            // tabix index absent, reading the whole file
+            ret = hts_getline(reg->file, KS_SEP_LINE, &reg->line);
+            if ( ret<0 ) { reg->iseq = -1; return -1; }
+        }
+        ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to);
+        if ( ret<0 )
+        {
+            fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,ichr+1,ifrom+1,ito+1);
+            return -1;
+        }
+    }
+    if ( is_bed ) from++;
+
+    // Terminate the chromosome name temporarily. The original character is
+    // restored afterwards; it can be the end of the line rather than a tab.
+    char delim = *chr_end;
+    *chr_end = 0;
+    if ( khash_str2int_get(reg->seq_hash, chr, &reg->iseq)<0 )
+    {
+        fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s);
+        exit(1);
+    }
+    *chr_end = delim;
+
+    reg->start = from - 1;
+    reg->end   = to - 1;
+    return 0;
+}
+
+/*
+ *  _regions_match_alleles() - tests whether the type of $rec agrees with the
+ *  alleles listed in the current regions line
+ *  @als_idx:  0-based index of the tab-delimited column with the
+ *             comma-separated alleles
+ *
+ *  The alleles are parsed once per regions line and cached in reg->als; the
+ *  line is classified as an indel if any allele is longer than one character
+ *  and as a SNP otherwise. Returns 1 if the record is an indel when the line
+ *  is, or not an indel when the line is not; 0 otherwise. Exits when called
+ *  with in-memory regions.
+ */
+static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec)
+{
+    if ( reg->regs )
+    {
+        // payload is not supported for in-memory regions, switch to regidx instead in future
+        fprintf(stderr,"Error: Compressed and indexed targets file is required\n");
+        exit(1);
+    }
+
+    int i = 0, max_len = 0;
+    if ( !reg->nals )
+    {
+        // Find the beginning of the alleles column
+        char *ss = reg->line.s;
+        while ( i<als_idx && *ss )
+        {
+            if ( *ss=='\t' ) i++;
+            ss++;
+        }
+
+        // Count the alleles so that the buffers can be sized in one go; the
+        // pointers stored in reg->als must not be invalidated by a later resize
+        char *se = ss;
+        reg->nals = 1;
+        while ( *se && *se!='\t' )
+        {
+            if ( *se==',' ) reg->nals++;
+            se++;
+        }
+        ks_resize(&reg->als_str, se-ss+1+reg->nals);
+        reg->als_str.l = 0;
+        hts_expand(char*,reg->nals,reg->mals,reg->als);
+        reg->nals = 0;
+
+        // Copy the alleles one by one. Every character is tested before moving
+        // on, so an empty field cannot carry the scan past the end of the
+        // column or of the line.
+        int len;
+        for (se=ss; *se && *se!='\t'; se++)
+        {
+            if ( *se!=',' ) continue;
+            reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
+            kputsn(ss,se-ss,&reg->als_str);
+            len = (int)(se - ss);
+            if ( len > max_len ) max_len = len;
+            reg->als_str.l++;   // keep the terminating zero between the alleles
+            reg->nals++;
+            ss = se + 1;
+        }
+        // the last allele of the column
+        reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
+        kputsn(ss,se-ss,&reg->als_str);
+        len = (int)(se - ss);
+        if ( len > max_len ) max_len = len;
+        reg->nals++;
+        reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP;  // this is a simplified check, see vcf.c:bcf_set_variant_types
+    }
+    int type = bcf_get_variant_types(rec);
+    if ( reg->als_type & VCF_INDEL )
+        return type & VCF_INDEL ? 1 : 0;
+    return !(type & VCF_INDEL) ? 1 : 0;
+}
+
+/*
+ *  bcf_sr_regions_overlap() - tests seq:start-end against the regions
+ *
+ *  The regions are walked forward as the queries advance; a query on a new
+ *  sequence, or one that starts before the previous query, triggers a seek.
+ *  If set, missed_reg_handler is called for regions which end before $start.
+ *  Returns 0 if the interval overlaps the current region, -1 if it does not
+ *  (or the sequence is unknown), -2 if no regions are left.
+ */
+int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
+{
+    int seq_idx;
+    if ( khash_str2int_get(reg->seq_hash, seq, &seq_idx) < 0 ) return -1;   // no such sequence
+
+    // new chromosome or after a seek
+    int need_seek = reg->prev_seq==-1 || seq_idx!=reg->prev_seq || reg->prev_start > start;
+    if ( need_seek )
+    {
+        // flush regions left on previous chromosome
+        if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 )
+            bcf_sr_regions_flush(reg);
+
+        bcf_sr_regions_seek(reg, seq);
+        reg->start = reg->end = -1;
+    }
+    if ( reg->prev_seq==seq_idx && reg->iseq!=seq_idx ) return -2;  // no more regions on this chromosome
+
+    reg->prev_seq   = reg->iseq;
+    reg->prev_start = start;
+
+    // Move forward until the current region no longer ends before $start
+    while ( seq_idx==reg->iseq && reg->end < start )
+    {
+        if ( bcf_sr_regions_next(reg) < 0 ) return -2;  // no more regions left
+        if ( reg->iseq != seq_idx ) return -1;          // does not overlap any regions
+        if ( reg->missed_reg_handler && reg->end < start )
+            reg->missed_reg_handler(reg, reg->missed_reg_data);
+    }
+
+    // region overlap (0) or no overlap (-1)
+    return reg->start <= end ? 0 : -1;
+}
+
+/*
+ *  bcf_sr_regions_flush() - calls missed_reg_handler for every region which
+ *  bcf_sr_regions_next() still returns. Does nothing when no handler is set
+ *  or no query has been made yet (prev_seq is -1).
+ */
+void bcf_sr_regions_flush(bcf_sr_regions_t *reg)
+{
+    if ( reg->missed_reg_handler && reg->prev_seq != -1 )
+    {
+        for (;;)
+        {
+            if ( bcf_sr_regions_next(reg) != 0 ) break;
+            reg->missed_reg_handler(reg, reg->missed_reg_data);
+        }
+    }
+}
+
diff --git a/htslib/tabix.c b/htslib/tabix.c
new file mode 100644
index 0000000..8bd65e7
--- /dev/null
+++ b/htslib/tabix.c
@@ -0,0 +1,544 @@
+/* tabix.c -- Generic indexer for TAB-delimited genome position files.
+
+ Copyright (C) 2009-2011 Broad Institute.
+ Copyright (C) 2010-2012, 2014-2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include "htslib/tbx.h"
+#include "htslib/sam.h"
+#include "htslib/vcf.h"
+#include "htslib/kseq.h"
+#include "htslib/bgzf.h"
+#include "htslib/hts.h"
+#include "htslib/regidx.h"
+
+/* Query-related command line settings collected by main(). */
+typedef struct
+{
+    char *regions_fname;    // argument of -R/--regions, NULL when not given
+    char *targets_fname;    // argument of -T/--targets, NULL when not given
+    int print_header;       // set by -h and by -H
+    int header_only;        // set by -H
+}
+args_t;
+
+/*
+ * Write a printf-style formatted message to stderr and terminate the
+ * program with EXIT_FAILURE. Never returns.
+ */
+static void error(const char *format, ...)
+{
+    va_list args;
+
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+
+    exit(EXIT_FAILURE);
+}
+
+#define IS_GFF (1<<0)
+#define IS_BED (1<<1)
+#define IS_SAM (1<<2)
+#define IS_VCF (1<<3)
+#define IS_BCF (1<<4)
+#define IS_BAM (1<<5)
+#define IS_CRAM (1<<6)
+#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
+
+/*
+ * Determine the type of `fname`: first from its file name suffix (compared
+ * case-insensitively), and if no suffix matches, by opening the file and
+ * using the format detected by htslib.
+ *
+ * Returns one of the IS_* flags, or 0 when the type could not be
+ * established. Exits through error() when the file cannot be opened.
+ */
+int file_type(const char *fname)
+{
+    int l = strlen(fname);
+    int strcasecmp(const char *s1, const char *s2);
+    if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
+    else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
+    else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
+    else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
+    else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
+    else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
+    // ".cram" has five characters: require l>=5 so that fname+l-5 never
+    // points in front of the string
+    else if (l>=5 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
+
+    // No known suffix, fall back to content-based detection
+    htsFile *fp = hts_open(fname,"r");
+    if ( !fp ) error("Could not read %s\n", fname);
+    enum htsExactFormat format = fp->format.format;
+    hts_close(fp);
+    if ( format == bcf ) return IS_BCF;
+    if ( format == bam ) return IS_BAM;
+    if ( format == cram ) return IS_CRAM;
+    if ( format == vcf ) return IS_VCF;
+
+    return 0;
+}
+
+/*
+ * Build the list of region strings to query.
+ *
+ * The list consists of the regions read from `regions_fname` (when not
+ * NULL), formatted as "seq:beg-end", followed by copies of the `argc`
+ * command line regions in `argv`. When neither source yields a region the
+ * list holds the single string ".".
+ *
+ * The number of entries is stored in *nregs. The array and each string are
+ * heap-allocated; query_regions() releases them. Exits through error()
+ * when the regions file cannot be read.
+ */
+static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
+{
+    kstring_t str = {0,0,0};
+    int iseq = 0, ireg = 0;
+    char **regs = NULL;
+    *nregs = argc;
+
+    if ( regions_fname )
+    {
+        // improve me: this is too heavy a machinery for parsing regions...
+
+        regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
+        if ( !idx ) error("Could not read %s\n", regions_fname);
+
+        (*nregs) += regidx_nregs(idx);
+        regs = (char**) malloc(sizeof(char*)*(*nregs));
+
+        int nseq;
+        char **seqs = regidx_seq_names(idx, &nseq);
+        for (iseq=0; iseq<nseq; iseq++)
+        {
+            regitr_t itr;
+            regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr);
+            while ( itr.i < itr.n )
+            {
+                str.l = 0;
+                ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1);
+                regs[ireg++] = strdup(str.s);
+                itr.i++;
+            }
+        }
+        regidx_destroy(idx);
+    }
+    free(str.s);
+
+    if ( !ireg )
+    {
+        // The regions file (if any) contributed nothing. Release the array
+        // allocated for it before allocating again, otherwise it leaks.
+        free(regs);
+        if ( argc )
+            regs = (char**) malloc(sizeof(char*)*argc);
+        else
+        {
+            regs = (char**) malloc(sizeof(char*));
+            regs[0] = strdup(".");
+            *nregs = 1;
+        }
+    }
+
+    for (iseq=0; iseq<argc; iseq++) regs[ireg++] = strdup(argv[iseq]);
+    return regs;
+}
+/*
+ * Print the header and/or the records of `fname` that fall into the given
+ * regions to standard output.
+ *
+ *  args   print_header / header_only select what is printed; when
+ *         targets_fname is set, records are additionally filtered against
+ *         the regions in that file
+ *  regs   array of `nregs` region strings; the array and its strings are
+ *         freed here
+ *
+ * BCF input is queried through its index and written via htslib; VCF, SAM
+ * and undetected formats are queried through the tabix index and printed
+ * line by line. BAM is rejected with an error message. Returns 0; all
+ * failures exit through error().
+ */
+static int query_regions(args_t *args, char *fname, char **regs, int nregs)
+{
+    int i;
+    htsFile *fp = hts_open(fname,"r");
+    if ( !fp ) error("Could not read %s\n", fname);
+    enum htsExactFormat format = hts_get_format(fp)->format;
+
+    regidx_t *reg_idx = NULL;
+    if ( args->targets_fname )
+    {
+        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
+        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
+    }
+
+    if ( format == bcf )
+    {
+        htsFile *out = hts_open("-","w");
+        if ( !out ) error("Could not open stdout\n");
+        hts_idx_t *idx = bcf_index_load(fname);
+        if ( !idx ) error("Could not load .csi index of %s\n", fname);
+        bcf_hdr_t *hdr = bcf_hdr_read(fp);
+        if ( !hdr ) error("Could not read the header: %s\n", fname);
+        if ( args->print_header )
+            bcf_hdr_write(out,hdr);
+        if ( !args->header_only )
+        {
+            bcf1_t *rec = bcf_init();
+            for (i=0; i<nregs; i++)
+            {
+                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
+                if ( !itr ) continue;   // same policy as the tabix branch below
+                while ( bcf_itr_next(fp, itr, rec) >=0 )
+                {
+                    // drop records that do not overlap any target region
+                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
+                    bcf_write(out,hdr,rec);
+                }
+                bcf_itr_destroy(itr);
+            }
+            bcf_destroy(rec);
+        }
+        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
+        bcf_hdr_destroy(hdr);
+        hts_idx_destroy(idx);
+    }
+    else if ( format==vcf || format==sam || format==unknown_format )
+    {
+        tbx_t *tbx = tbx_index_load(fname);
+        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
+        kstring_t str = {0,0,0};
+        if ( args->print_header )
+        {
+            // header lines are the leading lines starting with the meta character
+            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
+            {
+                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
+                puts(str.s);
+            }
+        }
+        if ( !args->header_only )
+        {
+            int nseq;
+            const char **seq = NULL;
+            // sequence names are only needed for the target lookup
+            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
+            for (i=0; i<nregs; i++)
+            {
+                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
+                if ( !itr ) continue;
+                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
+                {
+                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
+                    puts(str.s);
+                }
+                tbx_itr_destroy(itr);
+            }
+            free(seq);
+        }
+        free(str.s);
+        tbx_destroy(tbx);
+    }
+    else if ( format==bam )
+        error("Please use \"samtools view\" for querying BAM files.\n");
+
+    if ( reg_idx ) regidx_destroy(reg_idx);
+    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
+
+    for (i=0; i<nregs; i++) free(regs[i]);
+    free(regs);
+    return 0;
+}
+/*
+ * Print the sequence names stored in the index of `fname`, one per line.
+ * Text formats and undetected types use the tabix index, BCF uses the
+ * header together with its index; BAM is not implemented and exits with
+ * an error. Returns 0.
+ */
+static int query_chroms(char *fname)
+{
+    const char **names;
+    int j, n_names;
+    const int ftype = file_type(fname);
+
+    if ( (ftype & IS_TXT) || ftype==0 )
+    {
+        tbx_t *tbx = tbx_index_load(fname);
+        if ( tbx==NULL ) error("Could not load .tbi index of %s\n", fname);
+
+        names = tbx_seqnames(tbx, &n_names);
+        for (j=0; j<n_names; j++) printf("%s\n", names[j]);
+
+        free(names);
+        tbx_destroy(tbx);
+        return 0;
+    }
+
+    if ( ftype==IS_BCF )
+    {
+        // the header is needed to translate the index contents into names
+        htsFile *in = hts_open(fname,"r");
+        if ( in==NULL ) error("Could not read %s\n", fname);
+        bcf_hdr_t *hdr = bcf_hdr_read(in);
+        if ( hdr==NULL ) error("Could not read the header: %s\n", fname);
+        hts_close(in);
+
+        hts_idx_t *idx = bcf_index_load(fname);
+        if ( idx==NULL ) error("Could not load .csi index of %s\n", fname);
+
+        names = bcf_index_seqnames(idx, hdr, &n_names);
+        for (j=0; j<n_names; j++) printf("%s\n", names[j]);
+
+        free(names);
+        bcf_hdr_destroy(hdr);
+        hts_idx_destroy(idx);
+        return 0;
+    }
+
+    if ( ftype==IS_BAM ) // todo: BAM
+        error("BAM: todo\n");
+
+    return 0;
+}
+
+/*
+ * Replace the header of the BGZF-compressed text file `fname` with the
+ * content of the plain text file `header` and write the result to
+ * standard output.
+ *
+ *  ftype  file type as returned by file_type(); only text types and 0
+ *         (undetected) are supported, anything else exits with an error
+ *  conf   supplies the meta character that marks header lines; may be
+ *         NULL, in which case the meta character of tbx_conf_gff is used
+ *         (the same default main() starts from)
+ *
+ * Returns 0 on success and -1 when the input cannot be opened or its
+ * first block cannot be read. Other failures exit through error().
+ */
+int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
+{
+    if ( ftype & IS_TXT || !ftype )
+    {
+        // main() passes conf==NULL when no preset was given and the file
+        // type could not be detected; do not dereference it blindly
+        const int meta_char = conf ? conf->meta_char : tbx_conf_gff.meta_char;
+
+        BGZF *fp = bgzf_open(fname,"r");
+        if ( !fp ) return -1;
+        if ( bgzf_read_block(fp) != 0 || !fp->block_length )
+        {
+            bgzf_close(fp);
+            return -1;
+        }
+
+        char *buffer = fp->uncompressed_block;
+        int skip_until = 0;
+
+        // Skip the header: find out the position of the data block
+        if ( buffer[0]==meta_char )
+        {
+            skip_until = 1;
+            while (1)
+            {
+                if ( buffer[skip_until]=='\n' )
+                {
+                    skip_until++;
+                    if ( skip_until>=fp->block_length )
+                    {
+                        if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
+                        skip_until = 0;
+                    }
+                    // The header has finished
+                    if ( buffer[skip_until]!=meta_char ) break;
+                }
+                skip_until++;
+                if ( skip_until>=fp->block_length )
+                {
+                    if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
+                    skip_until = 0;
+                }
+            }
+        }
+
+        // Output the new header
+        FILE *hdr = fopen(header,"r");
+        if ( !hdr ) error("%s: %s", header,strerror(errno));
+        const size_t page_size = 32768;
+        char *buf = malloc(page_size);
+        if ( !buf ) error("Could not allocate %d bytes\n", (int)page_size);
+        BGZF *bgzf_out = bgzf_open("-", "w");
+        if ( !bgzf_out ) error("Could not open stdout\n");
+        ssize_t nread;
+        // read one byte less than the buffer holds so that a missing final
+        // newline can be appended
+        while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
+        {
+            if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
+            if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
+        }
+        if ( fclose(hdr) ) error("close failed: %s\n", header);
+
+        // Output all remaining data read with the header block
+        if ( fp->block_length - skip_until > 0 )
+        {
+            // the failing stream is the output, report its error code
+            if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",bgzf_out->errcode);
+        }
+        if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
+
+        // Copy the remaining compressed blocks verbatim
+        while (1)
+        {
+            nread = bgzf_raw_read(fp, buf, page_size);
+            if ( nread<=0 ) break;
+
+            int count = bgzf_raw_write(bgzf_out, buf, nread);
+            if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
+        }
+        // A handle must not be dereferenced once bgzf_close() was called
+        // on it, so the messages below cannot include its errcode
+        if (bgzf_close(bgzf_out) < 0) error("Error: closing the output stream failed\n");
+        if (bgzf_close(fp) < 0) error("Error: closing %s failed\n", fname);
+        free(buf);
+    }
+    else
+        error("todo: reheader BCF, BAM\n"); // BCF is difficult, records contain pointers to the header.
+    return 0;
+}
+
+/*
+ * Print the version and the command line synopsis to stderr.
+ * Always returns 1 so that main() can use it as its exit status.
+ */
+static int usage(void)
+{
+    fprintf(stderr,
+        "\n"
+        "Version: %s\n"
+        "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n"
+        "\n"
+        "Indexing Options:\n"
+        " -0, --zero-based coordinates are zero-based\n"
+        " -b, --begin INT column number for region start [4]\n"
+        " -c, --comment CHAR skip comment lines starting with CHAR [null]\n"
+        " -C, --csi generate CSI index for VCF (default is TBI)\n"
+        " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n"
+        " -f, --force overwrite existing index without asking\n"
+        " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"
+        " -p, --preset STR gff, bed, sam, vcf\n"
+        " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n"
+        " -S, --skip-lines INT skip first INT lines [0]\n"
+        "\n"
+        "Querying and other options:\n"
+        " -h, --print-header print also the header lines\n"
+        " -H, --only-header print only the header lines\n"
+        " -l, --list-chroms list chromosome names\n"
+        " -r, --reheader FILE replace the header with the content of FILE\n"
+        " -R, --regions FILE restrict to regions listed in the file\n"
+        " -T, --targets FILE similar to -R but streams rather than index-jumps\n"
+        "\n",
+        hts_version());
+    return 1;
+}
+
+/*
+ * tabix entry point. Depending on the options this either
+ *  - lists the sequence names of an indexed file (-l),
+ *  - prints the header and/or the records of the requested regions,
+ *  - replaces the header of a file (-r), or
+ *  - builds a TBI/CSI (or, for CRAM, a CRAM) index for the given file.
+ *
+ * Returns 0 on success and 1 after printing the usage; most failures exit
+ * through error().
+ */
+int main(int argc, char *argv[])
+{
+    int c, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
+    tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+    char *reheader = NULL;
+    args_t args;
+    memset(&args,0,sizeof(args_t));
+
+    static const struct option loptions[] =
+    {
+        // --help gets its own code: 'h' is taken by --print-header
+        {"help", no_argument, NULL, 2},
+        {"regions", required_argument, NULL, 'R'},
+        {"targets", required_argument, NULL, 'T'},
+        {"csi", no_argument, NULL, 'C'},
+        {"zero-based", no_argument, NULL, '0'},
+        {"print-header", no_argument, NULL, 'h'},
+        {"only-header", no_argument, NULL, 'H'},
+        {"begin", required_argument, NULL, 'b'},
+        {"comment", required_argument, NULL, 'c'},
+        {"end", required_argument, NULL, 'e'},
+        {"force", no_argument, NULL, 'f'},
+        // advertised by usage() as the long form of -m
+        {"min-shift", required_argument, NULL, 'm'},
+        {"preset", required_argument, NULL, 'p'},
+        {"sequence", required_argument, NULL, 's'},
+        {"skip-lines", required_argument, NULL, 'S'},
+        {"list-chroms", no_argument, NULL, 'l'},
+        {"reheader", required_argument, NULL, 'r'},
+        {"version", no_argument, NULL, 1},
+        {NULL, 0, NULL, 0}
+    };
+
+    char *tmp;
+    while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
+    {
+        switch (c)
+        {
+            case 'R': args.regions_fname = optarg; break;
+            case 'T': args.targets_fname = optarg; break;
+            case 'C': do_csi = 1; break;
+            case 'r': reheader = optarg; break;
+            case 'h': args.print_header = 1; break;
+            case 'H': args.print_header = 1; args.header_only = 1; break;
+            case 'l': list_chroms = 1; break;
+            case '0': conf.preset |= TBX_UCSC; break;
+            case 'b':
+                conf.bc = strtol(optarg,&tmp,10);
+                if ( *tmp ) error("Could not parse argument: -b %s\n", optarg);
+                break;
+            case 'e':
+                conf.ec = strtol(optarg,&tmp,10);
+                if ( *tmp ) error("Could not parse argument: -e %s\n", optarg);
+                break;
+            case 'c': conf.meta_char = *optarg; break;
+            case 'f': is_force = 1; break;
+            case 'm':
+                min_shift = strtol(optarg,&tmp,10);
+                if ( *tmp ) error("Could not parse argument: -m %s\n", optarg);
+                break;
+            case 'p':
+                if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
+                else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
+                else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
+                else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+                else if (strcmp(optarg, "bcf") == 0) ; // bcf is autodetected, preset is not needed
+                else if (strcmp(optarg, "bam") == 0) ; // same as bcf
+                else error("The preset string not recognised: '%s'\n", optarg);
+                break;
+            case 's':
+                conf.sc = strtol(optarg,&tmp,10);
+                if ( *tmp ) error("Could not parse argument: -s %s\n", optarg);
+                break;
+            case 'S':
+                conf.line_skip = strtol(optarg,&tmp,10);
+                if ( *tmp ) error("Could not parse argument: -S %s\n", optarg);
+                break;
+            case 1:
+                printf(
+"tabix (htslib) %s\n"
+"Copyright (C) 2016 Genome Research Ltd.\n", hts_version());
+                return EXIT_SUCCESS;
+            case 2: return usage();     // --help
+            default: return usage();
+        }
+    }
+
+    if ( optind==argc ) return usage();
+
+    if ( list_chroms )
+        return query_chroms(argv[optind]);
+
+    // Any region argument or query option switches to query mode
+    if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
+    {
+        int nregs = 0;
+        char **regs = NULL;
+        if ( !args.header_only )
+            regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
+        return query_regions(&args, argv[optind], regs, nregs);
+    }
+
+    char *fname = argv[optind];
+    int ftype = file_type(fname);
+    if ( !conf_ptr ) // no preset given
+    {
+        if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff;
+        else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed;
+        else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam;
+        else if ( ftype==IS_VCF )
+        {
+            conf_ptr = &tbx_conf_vcf;
+            if ( !min_shift && do_csi ) min_shift = 14;
+        }
+        else if ( ftype==IS_BCF )
+        {
+            if ( !min_shift ) min_shift = 14;
+        }
+        else if ( ftype==IS_BAM )
+        {
+            if ( !min_shift ) min_shift = 14;
+        }
+    }
+    if ( do_csi )
+    {
+        if ( !min_shift ) min_shift = 14;
+        min_shift *= do_csi; // positive for CSIv2, negative for CSIv1
+    }
+    if ( min_shift!=0 && !do_csi ) do_csi = 1;
+
+    // conf_ptr is still NULL when no preset was given and the type was not
+    // recognised; fall back to the option-adjusted defaults in that case
+    if ( reheader )
+        return reheader_file(fname, reheader, ftype, conf_ptr ? conf_ptr : &conf);
+
+    if ( conf_ptr )
+        conf = *conf_ptr;
+
+    char *suffix = ".tbi";
+    if ( do_csi ) suffix = ".csi";
+    else if ( ftype==IS_BAM ) suffix = ".bai";
+    else if ( ftype==IS_CRAM ) suffix = ".crai";
+
+    // Size the buffer from the suffix actually used: ".crai" is one
+    // character longer than the other suffixes
+    char *idx_fname = calloc(strlen(fname) + strlen(suffix) + 1, 1);
+    if ( !idx_fname ) error("Could not allocate memory for the index file name\n");
+    strcat(strcpy(idx_fname, fname), suffix);
+
+    struct stat stat_tbi, stat_file;
+    if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
+    {
+        // Before complaining about existing index, check if the VCF file isn't
+        // newer. This is a common source of errors, people tend not to notice
+        // that tabix failed. If the data file cannot be examined, stay on
+        // the safe side and refuse to overwrite.
+        if ( stat(fname, &stat_file)!=0 || stat_file.st_mtime <= stat_tbi.st_mtime )
+            error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
+    }
+    free(idx_fname);
+
+    if ( ftype==IS_CRAM )
+    {
+        if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
+        return 0;
+    }
+    else if ( do_csi )
+    {
+        if ( ftype==IS_BCF )
+        {
+            if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
+            return 0;
+        }
+        if ( ftype==IS_BAM )
+        {
+            if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
+            return 0;
+        }
+        if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
+        return 0;
+    }
+    else // TBI index
+    {
+        if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
+        return 0;
+    }
+    return 0;
+}
diff --git a/htslib/tbx.c b/htslib/tbx.c
new file mode 100644
index 0000000..7b74ea4
--- /dev/null
+++ b/htslib/tbx.c
@@ -0,0 +1,333 @@
+/* tbx.c -- tabix API functions.
+
+ Copyright (C) 2009, 2010, 2012-2015 Genome Research Ltd.
+ Copyright (C) 2010-2012 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <assert.h>
+#include "htslib/tbx.h"
+#include "htslib/bgzf.h"
+
+#include "htslib/khash.h"
+KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
+
+tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
+tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
+tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
+tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
+tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
+
+/* One parsed line: the interval plus the location of the sequence name. */
+typedef struct {
+    int64_t beg;    // interval start, 0-based
+    int64_t end;    // interval end
+    char *ss;       // first character of the sequence name inside the line
+    char *se;       // the delimiter position that follows the sequence name
+    int tid;        // sequence id as assigned by get_tid()
+} tbx_intv_t;
+
+/*
+ * Translate the sequence name `ss` into its numeric id using the name
+ * dictionary of `tbx`, creating the dictionary on first use.
+ *
+ * With is_add set, an unknown name is inserted (as a private copy) and
+ * receives the next free id; otherwise an unknown name yields -1.
+ */
+static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
+{
+    khash_t(s2i) *dict;
+    khint_t it;
+
+    if (tbx->dict == 0) tbx->dict = kh_init(s2i);
+    dict = (khash_t(s2i)*)tbx->dict;
+
+    if (!is_add) {
+        it = kh_get(s2i, dict, ss);
+    } else {
+        int is_new;
+        it = kh_put(s2i, dict, ss, &is_new);
+        if (is_new) {
+            // the dictionary must own its key, the caller's buffer is reused
+            kh_key(dict, it) = strdup(ss);
+            kh_val(dict, it) = kh_size(dict) - 1;
+        }
+    }
+
+    if (it == kh_end(dict)) return -1;
+    return kh_val(dict, it);
+}
+
+/*
+ * Look up the numeric id of sequence name `ss` without adding it to the
+ * dictionary. Returns -1 when the name is unknown.
+ */
+int tbx_name2id(tbx_t *tbx, const char *ss)
+{
+    const int lookup_only = 0;
+    return get_tid(tbx, ss, lookup_only);
+}
+
+/*
+ * Parse one TAB-delimited line according to `conf` and fill `intv` with
+ * the location of the sequence name (ss/se) and the interval (beg/end).
+ *
+ *  conf  column numbers (1-based: sc, bc, ec) and the preset
+ *  len   length of `line`; the byte at line[len] is examined as well and
+ *        is expected to be the terminating NUL
+ *  line  the text; temporarily modified while the END= tag is searched
+ *        for, but restored before returning
+ *
+ * Returns 0 on success, -1 when a numeric column holds no number or when
+ * the sequence name or the coordinates could not be determined.
+ */
+int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
+{
+ // b: offset of the current field, id: its 1-based column number.
+ // ncols is counted but not otherwise used in this function.
+ int i, b = 0, id = 1, ncols = 0;
+ char *s;
+ intv->ss = intv->se = 0; intv->beg = intv->end = -1;
+ for (i = 0; i <= len; ++i) {
+ if (line[i] == '\t' || line[i] == 0) {
+ ++ncols;
+ if (id == conf->sc) {
+ // sequence name column: remember where it starts and ends
+ intv->ss = line + b; intv->se = line + i;
+ } else if (id == conf->bc) {
+ // here ->beg is 0-based.
+ intv->beg = intv->end = strtol(line + b, &s, 0);
+ if ( s==line+b ) return -1; // expected int
+ // without TBX_UCSC the start is decremented, with it the end is incremented
+ if (!(conf->preset&TBX_UCSC)) --intv->beg;
+ else ++intv->end;
+ if (intv->beg < 0) intv->beg = 0;
+ if (intv->end < 1) intv->end = 1;
+ } else {
+ if ((conf->preset&0xffff) == TBX_GENERIC) {
+ // generic layout: the end is read from its own column
+ if (id == conf->ec)
+ {
+ intv->end = strtol(line + b, &s, 0);
+ if ( s==line+b ) return -1; // expected int
+ }
+ } else if ((conf->preset&0xffff) == TBX_SAM) {
+ if (id == 6) { // CIGAR
+ // sum the lengths of the M, D and N operations
+ int l = 0, op;
+ char *t;
+ for (s = line + b; s < line + i;) {
+ long x = strtol(s, &t, 10);
+ op = toupper(*t);
+ if (op == 'M' || op == 'D' || op == 'N') l += x;
+ s = t + 1;
+ }
+ if (l == 0) l = 1;
+ intv->end = intv->beg + l;
+ }
+ } else if ((conf->preset&0xffff) == TBX_VCF) {
+ if (id == 4) {
+ // column 4 (REF in the VCF layout): the end follows from the field length
+ if (b < i) intv->end = intv->beg + (i - b);
+ } else if (id == 8) { // look for "END="
+ // terminate the field temporarily so that strstr() stays inside it
+ int c = line[i];
+ line[i] = 0;
+ s = strstr(line + b, "END=");
+ if (s == line + b) s += 4;
+ else if (s) {
+ // not at the field start: accept only a whole ";END=" key
+ s = strstr(line + b, ";END=");
+ if (s) s += 5;
+ }
+ if (s) intv->end = strtol(s, &s, 0);
+ line[i] = c;
+ }
+ }
+ }
+ b = i + 1;
+ ++id;
+ }
+ }
+ if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
+ return 0;
+}
+
+/*
+ * Parse the line in `str` and resolve its sequence name to an id.
+ *
+ * On a parse failure a diagnostic naming the active preset and the
+ * offending line is written to stderr and -1 is returned; intv->tid is
+ * not set in that case. Otherwise returns 0 when the id and both
+ * coordinates are non-negative and -1 if not. With is_add set, unknown
+ * sequence names are added to the dictionary.
+ */
+static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
+{
+    if (tbx_parse1(&tbx->conf, str->l, str->s, intv) != 0) {
+        const char *type;
+        switch (tbx->conf.preset&0xffff)
+        {
+            case TBX_SAM:  type = "TBX_SAM"; break;
+            case TBX_VCF:  type = "TBX_VCF"; break;
+            case TBX_UCSC: type = "TBX_UCSC"; break;
+            default:       type = "TBX_GENERIC"; break;
+        }
+        fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s);
+        return -1;
+    }
+
+    // Terminate the name in place for the lookup, then put the delimiter back
+    const char saved = *intv->se;
+    *intv->se = '\0';
+    intv->tid = get_tid(tbx, intv->ss, is_add);
+    *intv->se = saved;
+
+    if (intv->tid < 0 || intv->beg < 0 || intv->end < 0) return -1;
+    return 0;
+}
+
+/*
+ * Record reader callback: read the next line from `fp` into the kstring_t
+ * passed as `sv` and report its sequence id and interval.
+ *
+ *  tbxv  the tbx_t the file belongs to
+ *  tid, beg, end  receive the parsed values when a line was read and
+ *        parsed successfully; left untouched otherwise
+ *
+ * Returns the non-negative result of bgzf_getline() on success, its
+ * negative result when no line could be read, and -2 when the line could
+ * not be parsed.
+ */
+int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
+{
+    tbx_t *tbx = (tbx_t *) tbxv;
+    kstring_t *s = (kstring_t *) sv;
+    int ret;
+    if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
+        tbx_intv_t intv;
+        // get_intv() leaves intv.tid unset when parsing fails, so its
+        // content must not be copied out in that case
+        if (get_intv(tbx, s, &intv, 0) < 0)
+            return -2;
+        *tid = intv.tid; *beg = intv.beg; *end = intv.end;
+    }
+    return ret;
+}
+
+/*
+ * Serialise the configuration and the sequence names of `tbx` into the
+ * meta block of its index.
+ *
+ * Layout: seven 32-bit words (the first 24 bytes of tbx->conf followed by
+ * the total length of the names), byte-swapped on big-endian hosts, then
+ * the NUL-terminated names in id order. The block is handed to
+ * hts_idx_set_meta() with its last argument set to 0.
+ */
+void tbx_set_meta(tbx_t *tbx)
+{
+    khash_t(s2i) *dict = (khash_t(s2i)*)tbx->dict;
+    uint32_t words[7];
+    char **names;
+    uint8_t *meta;
+    khint_t it;
+    int i, off, names_len = 0;
+
+    memcpy(words, &tbx->conf, 24);
+
+    // Order the names by id and add up their lengths
+    names = (char**)malloc(sizeof(char*) * kh_size(dict));
+    for (it = kh_begin(dict); it != kh_end(dict); ++it) {
+        if (kh_exist(dict, it)) {
+            names[kh_val(dict, it)] = (char*)kh_key(dict, it);
+            names_len += strlen(kh_key(dict, it)) + 1; // +1 to include '\0'
+        }
+    }
+    words[6] = names_len;
+
+    meta = (uint8_t*)malloc(names_len + 28);
+    if (ed_is_big()) {
+        for (i = 0; i < 7; ++i)
+            words[i] = ed_swap_4(words[i]);
+    }
+    memcpy(meta, words, 28);
+
+    off = 28;
+    for (i = 0; i < (int)kh_size(dict); ++i) {
+        int n = strlen(names[i]) + 1;
+        memcpy(meta + off, names[i], n);
+        off += n;
+    }
+    free(names);
+    hts_idx_set_meta(tbx->idx, off, meta, 0);
+}
+
+/*
+ * Build an in-memory index for the BGZF stream `fp`.
+ *
+ *  min_shift  > 0 requests a CSI index with that minimal interval shift
+ *             and (TBX_MAX_SHIFT - min_shift + 2) / 3 levels; otherwise a
+ *             TBI index (shift 14, 5 levels) is built
+ *  conf       column layout; the first conf->line_skip lines and all lines
+ *             starting with conf->meta_char are skipped
+ *
+ * Returns the new tbx_t, to be released with tbx_destroy(), or NULL when
+ * memory could not be allocated, a line could not be parsed or a record
+ * could not be added to the index.
+ */
+tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
+{
+    tbx_t *tbx;
+    kstring_t str;
+    int ret, first = 0, n_lvls, fmt;
+    int64_t lineno = 0;
+    uint64_t last_off = 0;
+    tbx_intv_t intv;
+
+    str.s = 0; str.l = str.m = 0;
+    tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
+    if ( !tbx ) return NULL;
+    tbx->conf = *conf;
+    if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
+    else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
+    while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
+        ++lineno;
+        if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
+            // remember where the data starts
+            last_off = bgzf_tell(fp);
+            continue;
+        }
+        if (first == 0) {
+            tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
+            first = 1;
+        }
+        // On failure get_intv() has already printed a diagnostic and intv
+        // holds no usable interval; pushing it would corrupt the index
+        if (get_intv(tbx, &str, &intv, 1) < 0)
+        {
+            free(str.s);
+            tbx_destroy(tbx);
+            return NULL;
+        }
+        ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
+        if (ret < 0)
+        {
+            free(str.s);
+            tbx_destroy(tbx);
+            return NULL;
+        }
+    }
+    if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file
+    if ( !tbx->dict ) tbx->dict = kh_init(s2i);
+    hts_idx_finish(tbx->idx, bgzf_tell(fp));
+    tbx_set_meta(tbx);
+    free(str.s);
+    return tbx;
+}
+
+/*
+ * Release `tbx`: the sequence names owned by its dictionary, the
+ * dictionary itself, the index and the structure.
+ */
+void tbx_destroy(tbx_t *tbx)
+{
+    khash_t(s2i) *dict = (khash_t(s2i)*)tbx->dict;
+    if (dict != NULL)
+    {
+        // the keys were duplicated on insertion and belong to the dictionary
+        khint_t it = kh_begin(dict);
+        while (it != kh_end(dict))
+        {
+            if (kh_exist(dict, it)) free((char*)kh_key(dict, it));
+            ++it;
+        }
+    }
+    hts_idx_destroy(tbx->idx);
+    kh_destroy(s2i, dict);
+    free(tbx);
+}
+
+/*
+ * Index the BGZF-compressed file `fn` and save the index as `fnidx`
+ * (passed on to hts_idx_save_as() unchanged, NULL included).
+ *
+ * The index is written as CSI when min_shift > 0 and as TBI otherwise.
+ * Returns -1 when `fn` is not a BGZF file, cannot be opened or cannot be
+ * indexed; otherwise the result of hts_idx_save_as().
+ */
+int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
+{
+    if ( bgzf_is_bgzf(fn)!=1 )
+    {
+        fprintf(stderr,"Not a BGZF file: %s\n", fn);
+        return -1;
+    }
+
+    BGZF *in = bgzf_open(fn, "r");
+    if (in == 0) return -1;
+    if ( !in->is_compressed )
+    {
+        bgzf_close(in);
+        return -1;
+    }
+
+    tbx_t *tbx = tbx_index(in, min_shift, conf);
+    bgzf_close(in);
+    if ( tbx == NULL ) return -1;
+
+    const int fmt = min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI;
+    const int status = hts_idx_save_as(tbx->idx, fn, fnidx, fmt);
+    tbx_destroy(tbx);
+    return status;
+}
+
+/*
+ * Index `fn`; same as tbx_index_build2() with a NULL index file name.
+ */
+int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
+{
+    const char *default_idx_name = NULL;
+    return tbx_index_build2(fn, default_idx_name, min_shift, conf);
+}
+
+/*
+ * Load the index of `fn` and rebuild the configuration and the sequence
+ * name dictionary from its meta block.
+ *
+ *  fnidx  explicit index file name; when NULL the index is located with
+ *         hts_idx_load(fn, HTS_FMT_TBI)
+ *
+ * Returns the new tbx_t, to be released with tbx_destroy(), or NULL when
+ * memory could not be allocated, the index could not be loaded or its
+ * meta block is missing or malformed.
+ */
+tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
+{
+    tbx_t *tbx;
+    uint8_t *meta;
+    char *nm, *p;
+    uint32_t x[7];
+    int l_meta = 0, l_nm;
+    tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
+    if ( !tbx ) return NULL;
+    tbx->idx = fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_TBI);
+    if ( !tbx->idx )
+    {
+        free(tbx);
+        return NULL;
+    }
+    meta = hts_idx_get_meta(tbx->idx, &l_meta);
+    // the fixed part of the meta block is seven 32-bit words
+    if ( !meta || l_meta < 28 ) goto invalid;
+    memcpy(x, meta, 28);
+    memcpy(&tbx->conf, x, 24);
+    p = nm = (char*)meta + 28;
+    l_nm = x[6];
+    // the names must fit into the block and the last one must be
+    // terminated, otherwise strlen() below would run past the end
+    if ( l_nm < 0 || l_nm > l_meta - 28 ) goto invalid;
+    if ( l_nm > 0 && nm[l_nm-1] != '\0' ) goto invalid;
+    for (; p - nm < l_nm; p += strlen(p) + 1) get_tid(tbx, p, 1);
+    return tbx;
+
+invalid:
+    // releases the index and any names added so far as well
+    tbx_destroy(tbx);
+    return NULL;
+}
+
+/*
+ * Load the index of `fn`; same as tbx_index_load2() with a NULL index
+ * file name.
+ */
+tbx_t *tbx_index_load(const char *fn)
+{
+    const char *default_idx_name = NULL;
+    return tbx_index_load2(fn, default_idx_name);
+}
+
+/*
+ * Return the sequence names of `tbx` ordered by their numeric id.
+ *
+ * The number of names is stored in *n. The returned array is allocated
+ * here; the strings it points to are the dictionary's own keys, not
+ * copies. Returns NULL with *n set to 0 when there is no dictionary.
+ */
+const char **tbx_seqnames(tbx_t *tbx, int *n)
+{
+    khash_t(s2i) *dict = (khash_t(s2i)*)tbx->dict;
+    if (dict == NULL)
+    {
+        *n = 0;
+        return NULL;
+    }
+
+    int tid, m = kh_size(dict);
+    const char **names = (const char**) calloc(m,sizeof(const char*));
+
+    // place every name into the slot given by its id
+    khint_t it = kh_begin(dict);
+    while ( it<kh_end(dict) )
+    {
+        if ( kh_exist(dict,it) )
+        {
+            tid = kh_val(dict,it);
+            assert( tid<m );
+            names[tid] = kh_key(dict,it);
+        }
+        it++;
+    }
+
+    // sanity check: there should be no gaps
+    tid = 0;
+    while ( tid<m )
+    {
+        assert(names[tid]);
+        tid++;
+    }
+
+    *n = m;
+    return names;
+}
+
diff --git a/htslib/vcf.c b/htslib/vcf.c
new file mode 100644
index 0000000..3602e14
--- /dev/null
+++ b/htslib/vcf.c
@@ -0,0 +1,3483 @@
+/* vcf.c -- VCF/BCF API functions.
+
+ Copyright (C) 2012, 2013 Broad Institute.
+ Copyright (C) 2012-2016 Genome Research Ltd.
+ Portions copyright (C) 2014 Intel Corporation.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "htslib/kstring.h"
+#include "htslib/bgzf.h"
+#include "htslib/vcf.h"
+#include "htslib/tbx.h"
+#include "htslib/hfile.h"
+#include "htslib/khash_str2int.h"
+#include "hts_internal.h"
+
+#include "htslib/khash.h"
+KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+typedef khash_t(vdict) vdict_t;
+
+#include "htslib/kseq.h"
+KSTREAM_DECLARE(gzFile, gzread)
+
+// 32-bit patterns used for a "missing" float and for a float "vector end"
+// marker. Both have all exponent bits set and a non-zero mantissa, i.e. they
+// are NaN encodings when read as IEEE-754 single precision.
+uint32_t bcf_float_missing = 0x7F800001;
+uint32_t bcf_float_vector_end = 0x7F800002;
+// Lookup table with 16 entries. NOTE(review): presumably indexed by the BCF
+// type code and holding log2 of the element size in bytes - confirm against
+// the BCF_BT_* definitions.
+uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+// Default dictionary value: all three info slots set to 15, no header
+// records attached, and id -1 (meaning "no index assigned yet").
+static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
+
+// Writes a printable, escaped representation of c into buffer and returns
+// buffer. Newline, carriage return and tab become \n, \r and \t; quotes and
+// the backslash are prefixed with a backslash; any other character that
+// isprint_c() rejects is written as \xHH. The caller must supply a buffer
+// large enough for the longest form (four characters plus the NUL).
+static const char *dump_char(char *buffer, char c)
+{
+    if ( c == '\n' ) strcpy(buffer, "\\n");
+    else if ( c == '\r' ) strcpy(buffer, "\\r");
+    else if ( c == '\t' ) strcpy(buffer, "\\t");
+    else if ( c == '\'' || c == '\"' || c == '\\' ) sprintf(buffer, "\\%c", c);
+    else if ( isprint_c(c) ) sprintf(buffer, "%c", c);
+    else sprintf(buffer, "\\x%02X", (unsigned char) c);
+    return buffer;
+}
+
+/*************************
+ *** VCF header parser ***
+ *************************/
+
+// Adds the sample name s to the header's sample dictionary and to
+// h->samples, and marks the header dirty.
+// Returns 0 on success (and when s is NULL, which is a no-op) and -1 when the
+// name is already present or cannot be copied. An empty or whitespace-only
+// name aborts the program, as does a duplicate when hts_verbose >= 2.
+int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
+{
+    if ( !s ) return 0;
+
+    // Skip leading whitespace; a name with nothing else in it is invalid.
+    // (The cast keeps isspace() defined for negative char values.)
+    const char *ss = s;
+    while ( *ss && isspace((unsigned char)*ss) ) ss++;
+    if ( !*ss )
+    {
+        fprintf(stderr,"[E::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
+        abort();
+    }
+
+    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
+    int ret;
+    char *sdup = strdup(s);
+    if ( !sdup ) return -1;
+    int k = kh_put(vdict, d, sdup, &ret);
+    if (ret) { // absent
+        kh_val(d, k) = bcf_idinfo_def;
+        kh_val(d, k).id = kh_size(d) - 1;
+    } else {
+        if (hts_verbose >= 2)
+        {
+            fprintf(stderr, "[E::%s] Duplicated sample name '%s'\n", __func__, s);
+            abort();
+        }
+        free(sdup);
+        return -1;
+    }
+    // The dictionary now owns sdup as its key; h->samples keeps a second
+    // reference to the same string at position id
+    int n = kh_size(d);
+    h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
+    h->samples[n-1] = sdup;
+    h->dirty = 1;
+    return 0;
+}
+
+// Parses the column header line in str and registers every field after the
+// ninth tab-separated column as a sample name. Parsing stops at the first
+// newline or NUL.
+// Returns 0 on success, -1 when a sample could not be added or memory for a
+// temporary copy of a name could not be allocated.
+int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
+{
+    int ret = 0;
+    int i = 0;
+    const char *p, *q;
+    // p marks the start of the current field, q scans for its terminator
+    for (p = q = str;; ++q) {
+        if (*q != '\t' && *q != 0 && *q != '\n') continue;
+        if (++i > 9) {
+            // [p,q) holds no NUL, so a plain memcpy yields the same bytes
+            // that strncpy produced
+            size_t len = q - p;
+            char *s = (char*)malloc(len + 1);
+            if ( !s ) return -1;
+            memcpy(s, p, len);
+            s[len] = 0;
+            if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1;
+            free(s);
+        }
+        if (*q == 0 || *q == '\n') break;
+        p = q + 1;
+    }
+    bcf_hdr_add_sample(h,NULL);
+    return ret;
+}
+
+// Rebuilds the id -> (key, value) lookup arrays h->id[] from the three
+// header dictionaries and clears the dirty flag. Always returns 0.
+int bcf_hdr_sync(bcf_hdr_t *h)
+{
+    int type;
+    for (type = 0; type < 3; type++)
+    {
+        vdict_t *dict = (vdict_t*)h->dict[type];
+        if ( h->n[type] < kh_size(dict) )
+        {
+            // this should be true only for type=2, BCF_DT_SAMPLE
+            h->n[type] = kh_size(dict);
+            h->id[type] = (bcf_idpair_t*) realloc(h->id[type], kh_size(dict)*sizeof(bcf_idpair_t));
+        }
+
+        khint_t it;
+        for (it = kh_begin(dict); it < kh_end(dict); it++)
+        {
+            if ( kh_exist(dict,it) )
+            {
+                // point the slot for this id back at the dictionary entry
+                bcf_idpair_t *pair = &h->id[type][kh_val(dict,it).id];
+                pair->key = kh_key(dict,it);
+                pair->val = &kh_val(dict,it);
+            }
+        }
+    }
+    h->dirty = 0;
+    return 0;
+}
+
+// Frees a header record: its key, value, every key/value pair, the two
+// pair arrays, and the record itself.
+void bcf_hrec_destroy(bcf_hrec_t *hrec)
+{
+    int idx = 0;
+    while ( idx < hrec->nkeys )
+    {
+        free(hrec->keys[idx]);
+        free(hrec->vals[idx]);
+        idx++;
+    }
+    free(hrec->keys);
+    free(hrec->vals);
+    free(hrec->value);   // free(NULL) is a no-op, no guard needed
+    free(hrec->key);
+    free(hrec);
+}
+
+// Returns a deep copy of hrec with every field except the IDX key/value
+// pair, which is dropped. Returns NULL when memory cannot be allocated.
+bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
+{
+    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
+    if ( !out ) return NULL;
+    out->type = hrec->type;
+    if ( hrec->key ) out->key = strdup(hrec->key);
+    if ( hrec->value ) out->value = strdup(hrec->value);
+    // Zero-filled arrays: a slot whose source key or value is NULL must stay
+    // NULL, because bcf_hrec_destroy() later frees every slot
+    size_t nalloc = hrec->nkeys > 0 ? (size_t)hrec->nkeys : 1;
+    out->keys = (char**) calloc(nalloc, sizeof(char*));
+    out->vals = (char**) calloc(nalloc, sizeof(char*));
+    if ( !out->keys || !out->vals )
+    {
+        free(out->keys);
+        free(out->vals);
+        free(out->key);
+        free(out->value);
+        free(out);
+        return NULL;
+    }
+    out->nkeys = hrec->nkeys;
+    int i, j = 0;
+    for (i=0; i<hrec->nkeys; i++)
+    {
+        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
+        if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]);
+        if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
+        j++;
+    }
+    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
+    return out;
+}
+
+// Prints one header record to fp on a single line: the key and value (an
+// empty string when there is no value) followed by every key/value pair.
+void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
+{
+    const char *value = hrec->value ? hrec->value : "";
+    fprintf(fp, "key=[%s] value=[%s]", hrec->key, value);
+
+    int k = 0;
+    while ( k < hrec->nkeys )
+    {
+        fprintf(fp, "\t[%s]=[%s]", hrec->keys[k],hrec->vals[k]);
+        k++;
+    }
+    fprintf(fp, "\n");
+}
+
+// Dumps every header record to stderr in "##key=value" form, or in
+// "##key=<k=v,...>" form for records that have no plain value.
+void bcf_header_debug(bcf_hdr_t *hdr)
+{
+    int i;
+    for (i=0; i<hdr->nhrec; i++)
+    {
+        bcf_hrec_t *rec = hdr->hrec[i];
+        if ( rec->value )
+        {
+            fprintf(stderr,"##%s=%s\n", rec->key,rec->value);
+            continue;
+        }
+
+        // structured record: the first pair is printed without a comma
+        fprintf(stderr, "##%s=<", rec->key);
+        fprintf(stderr,"%s=%s", rec->keys[0], rec->vals[0]);
+        int j = 1;
+        while ( j < rec->nkeys )
+        {
+            fprintf(stderr,",%s=%s", rec->keys[j], rec->vals[j]);
+            j++;
+        }
+        fprintf(stderr,">\n");
+    }
+}
+
+// Appends a new key to hrec, copied from the first len characters of str.
+// The matching value slot is created and set to NULL. len must be non-zero.
+void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
+{
+    hrec->nkeys++;
+    int last = hrec->nkeys - 1;
+    hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*hrec->nkeys);
+    hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*hrec->nkeys);
+    assert( len );
+
+    char *copy = (char*) malloc((len+1)*sizeof(char));
+    hrec->keys[last] = copy;
+    memcpy(copy,str,len);
+    copy[len] = 0;
+    hrec->vals[last] = NULL;
+}
+
+// Replaces the value of the i-th key/value pair with a copy of the first len
+// characters of str, wrapped in double quotes when is_quoted is non-zero.
+// Any previous value is freed. Passing str == NULL clears the value.
+void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
+{
+    // Release the old value first so that clearing it does not leak
+    free(hrec->vals[i]);
+    hrec->vals[i] = NULL;
+    if ( !str ) return;
+
+    if ( is_quoted )
+    {
+        // room for the two quote characters and the terminating NUL
+        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
+        hrec->vals[i][0] = '"';
+        memcpy(&hrec->vals[i][1],str,len);
+        hrec->vals[i][len+1] = '"';
+        hrec->vals[i][len+2] = 0;
+    }
+    else
+    {
+        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
+        memcpy(hrec->vals[i],str,len);
+        hrec->vals[i][len] = 0;
+    }
+}
+
+// Appends an "IDX" key to hrec whose value is idx formatted as a decimal
+// string.
+void hrec_add_idx(bcf_hrec_t *hrec, int idx)
+{
+    kstring_t num = {0,0,0};
+    int slot = hrec->nkeys;
+
+    hrec->nkeys = slot + 1;
+    hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*hrec->nkeys);
+    hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*hrec->nkeys);
+    hrec->keys[slot] = strdup("IDX");
+
+    // the record takes ownership of the kstring buffer
+    kputw(idx, &num);
+    hrec->vals[slot] = num.s;
+}
+
+// Returns the index of the first key in hrec that matches key, compared
+// case-insensitively, or -1 when there is none.
+int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
+{
+    int pos = 0;
+    while ( pos < hrec->nkeys )
+    {
+        if ( strcasecmp(key,hrec->keys[pos]) == 0 ) return pos;
+        pos++;
+    }
+    return -1;
+}
+
+// Returns 1 when the character at str is escaped, i.e. preceded by an odd
+// number of consecutive backslashes, and 0 otherwise. min is the lowest
+// address that may be inspected; the scan never reads or points below it.
+static inline int is_escaped(const char *min, const char *str)
+{
+    int n = 0;
+    // Test str > min before stepping back, so that no pointer below min is
+    // ever formed
+    while ( str > min && str[-1]=='\\' ) { str--; n++; }
+    return n%2;
+}
+
+// Parses one "##" header line starting at line into a newly allocated
+// header record.
+//   h    - not referenced in this function body
+//   line - text to parse; parsing stops at a newline or NUL
+//   len  - output: 0 when line does not start with "##", otherwise the offset
+//          of the stopping position within line, plus one
+// Returns the new record, or NULL when the line does not start with "##",
+// has no "key=" part, or is a malformed structured line (the latter also
+// prints a message to stderr). Allocation results are not checked.
+bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
+{
+ const char *p = line;
+ if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
+ p += 2;
+
+ // The record key is everything between "##" and the first '='
+ const char *q = p;
+ while ( *q && *q!='=' ) q++;
+ int n = q-p;
+ if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format
+
+ bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
+ hrec->key = (char*) malloc(sizeof(char)*(n+1));
+ memcpy(hrec->key,p,n);
+ hrec->key[n] = 0;
+
+ p = ++q;
+ if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
+ {
+ // The value is the rest of the line, copied verbatim
+ while ( *q && *q!='\n' ) q++;
+ hrec->value = (char*) malloc((q-p+1)*sizeof(char));
+ memcpy(hrec->value, p, q-p);
+ hrec->value[q-p] = 0;
+ *len = q-line+1;
+ return hrec;
+ }
+
+ // structured line, e.g.
+ // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
+ // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
+ // nopen counts unmatched '<'; each loop iteration consumes one key=value pair
+ int nopen = 1;
+ while ( *q && *q!='\n' && nopen>0 )
+ {
+ // Step over the '<' or ',' that preceded this pair, then leading spaces
+ p = ++q;
+ while ( *q && *q==' ' ) { p++; q++; }
+ // ^[A-Za-z_][0-9A-Za-z_.]*$
+ if (p==q && *q && (isalpha(*q) || *q=='_'))
+ {
+ q++;
+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
+ }
+ n = q-p;
+ // m counts spaces between the key and '=' so they can be excluded below
+ int m = 0;
+ while ( *q && *q==' ' ) { q++; m++; }
+ if ( *q!='=' || !n )
+ {
+ // wrong format
+ while ( *q && *q!='\n' ) q++;
+ kstring_t tmp = {0,0,0};
+ kputsn(line,q-line,&tmp);
+ fprintf(stderr,"Could not parse the header line: \"%s\"\n", tmp.s);
+ free(tmp.s);
+ *len = q-line+1;
+ bcf_hrec_destroy(hrec);
+ return NULL;
+ }
+ bcf_hrec_add_key(hrec, p, q-p-m);
+ p = ++q;
+ while ( *q && *q==' ' ) { p++; q++; }
+ int quoted = *p=='"' ? 1 : 0;
+ if ( quoted ) p++, q++;
+ // Find the end of the value: an unescaped closing quote for quoted
+ // values, otherwise a top-level ',' or the '>' that closes the record
+ while ( *q && *q != '\n' )
+ {
+ if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
+ else
+ {
+ if ( *q=='<' ) nopen++;
+ if ( *q=='>' ) nopen--;
+ if ( !nopen ) break;
+ if ( *q==',' && nopen==1 ) break;
+ }
+ q++;
+ }
+ // Trim trailing spaces from the value before storing it
+ const char *r = q;
+ while ( r > p && r[-1] == ' ' ) r--;
+ bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted);
+ if ( quoted && *q=='"' ) q++;
+ if ( *q=='>' ) { nopen--; q++; }
+ }
+
+ // Skip trailing spaces
+ while ( *q && *q==' ' ) { q++; }
+
+ *len = q-line+1;
+ return hrec;
+}
+
+// Assigns a slot in hdr->id[dict_type] to tag. When idinfo->id is -1 the next
+// free index is taken; otherwise the preset index is kept and the program
+// exits if that slot is already occupied. The id array is grown as needed.
+// Always returns 0.
+static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
+{
+    if ( idinfo->id != -1 )
+    {
+        // An explicit IDX was supplied: it must not collide with a used slot
+        if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
+        {
+            fprintf(stderr,"[%s:%d %s] Conflicting IDX=%d lines in the header dictionary, the new tag is %s\n", __FILE__,__LINE__,__FUNCTION__, idinfo->id, tag);
+            exit(1);
+        }
+    }
+    else
+        idinfo->id = hdr->n[dict_type]++;
+
+    if ( hdr->n[dict_type] <= idinfo->id ) hdr->n[dict_type] = idinfo->id+1;
+    hts_expand0(bcf_idpair_t,hdr->n[dict_type],hdr->m[dict_type],hdr->id[dict_type]);
+
+    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
+    // the val member is left unassigned here. It must be set explicitly in
+    // bcf_hdr_sync.
+    hdr->id[dict_type][idinfo->id].key = tag;
+
+    return 0;
+}
+
+// Classifies hrec by its key (contig, INFO, FILTER, FORMAT, other structured
+// line) and, for the hashed kinds, registers its ID in the matching header
+// dictionary, appending an IDX key to hrec when it has none.
+// returns: 1 when hdr needs to be synced, 0 otherwise (record is generic,
+// lacks a usable ID/IDX/length, or is already present)
+int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
+{
+    // contig
+    int i,j, ret;
+    khint_t k;
+    char *str;
+    if ( !strcmp(hrec->key, "contig") )
+    {
+        hrec->type = BCF_HL_CTG;
+
+        // Get the contig length ($j) and locate the contig ID
+        i = bcf_hrec_find_key(hrec,"length");
+        if ( i<0 ) j = 0;
+        else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
+
+        i = bcf_hrec_find_key(hrec,"ID");
+        if ( i<0 ) return 0;
+
+        // Validate IDX before the dictionary is modified: bailing out after
+        // kh_put would leave an entry whose value was never initialised
+        int idx = bcf_hrec_find_key(hrec,"IDX");
+        if ( idx!=-1 )
+        {
+            char *tmp = hrec->vals[idx];
+            idx = strtol(hrec->vals[idx], &tmp, 10);
+            if ( *tmp || idx < 0 )
+            {
+                fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
+                return 0;
+            }
+        }
+
+        // Register in the dictionary; the dictionary owns $str as its key
+        str = strdup(hrec->vals[i]);
+        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
+        k = kh_get(vdict, d, str);
+        if ( k != kh_end(d) ) { free(str); return 0; }    // already present
+        k = kh_put(vdict, d, str, &ret);
+
+        kh_val(d, k) = bcf_idinfo_def;
+        kh_val(d, k).id = idx;
+        kh_val(d, k).info[0] = j;
+        kh_val(d, k).hrec[0] = hrec;
+        bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k));
+        if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d,k).id);
+
+        return 1;
+    }
+
+    if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
+    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
+    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
+    else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
+    else return 0;
+
+    // INFO/FILTER/FORMAT
+    char *id = NULL;
+    int type = -1, num = -1, var = -1, idx = -1;
+    for (i=0; i<hrec->nkeys; i++)
+    {
+        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
+        else if ( !strcmp(hrec->keys[i], "IDX") )
+        {
+            char *tmp = hrec->vals[i];
+            idx = strtol(hrec->vals[i], &tmp, 10);
+            if ( *tmp || idx < 0 )
+            {
+                fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
+                return 0;
+            }
+        }
+        else if ( !strcmp(hrec->keys[i], "Type") )
+        {
+            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
+            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
+            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
+            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
+            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
+            else
+            {
+                if (hts_verbose >= 2) fprintf(stderr, "[E::%s] The type \"%s\" is not supported, assuming \"String\"\n", __func__, hrec->vals[i]);
+                type = BCF_HT_STR;
+            }
+        }
+        else if ( !strcmp(hrec->keys[i], "Number") )
+        {
+            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
+            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
+            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
+            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
+            else
+            {
+                sscanf(hrec->vals[i],"%d",&num);
+                var = BCF_VL_FIXED;
+            }
+            if (var != BCF_VL_FIXED) num = 0xfffff;
+        }
+    }
+    // Pack number, length class, value type and line type into one word.
+    // The shifts are done on unsigned values so that fields still at -1
+    // produce the same all-ones bits without shifting a negative int.
+    uint32_t info = (uint32_t)num<<12 | (uint32_t)var<<8 | (uint32_t)type<<4 | hrec->type;
+
+    if ( !id ) return 0;
+    str = strdup(id);
+
+    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
+    k = kh_get(vdict, d, str);
+    if ( k != kh_end(d) )
+    {
+        // already present
+        free(str);
+        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
+        kh_val(d, k).info[info&0xf] = info;
+        kh_val(d, k).hrec[info&0xf] = hrec;
+        if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
+        return 1;
+    }
+    k = kh_put(vdict, d, str, &ret);
+    kh_val(d, k) = bcf_idinfo_def;
+    kh_val(d, k).info[info&0xf] = info;
+    kh_val(d, k).hrec[info&0xf] = hrec;
+    kh_val(d, k).id = idx;
+    bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k));
+    if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d,k).id);
+
+    return 1;
+}
+
+// Adds hrec to the header, taking ownership of it. A record that could not
+// be registered and is either a hashed kind or a duplicate generic line is
+// destroyed instead of being added.
+// Returns 1 when a non-generic record was added, 0 otherwise (including
+// when hrec is NULL).
+int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
+{
+    if ( hrec == NULL ) return 0;
+
+    hrec->type = BCF_HL_GEN;
+    int registered = bcf_hdr_register_hrec(hdr,hrec);
+    if ( !registered )
+    {
+        int duplicate = 0;
+        if ( hrec->type != BCF_HL_GEN )
+        {
+            // One of the hashed fields that was not registered: treated as
+            // already present
+            duplicate = 1;
+        }
+        else
+        {
+            // Generic field: look for an existing line with the same key and
+            // value; "fileformat" matches on the key alone
+            int i;
+            for (i=0; i<hdr->nhrec && !duplicate; i++)
+            {
+                bcf_hrec_t *old = hdr->hrec[i];
+                if ( old->type!=BCF_HL_GEN ) continue;
+                if ( strcmp(old->key,hrec->key) ) continue;
+                if ( !strcmp(hrec->key,"fileformat") || !strcmp(old->value,hrec->value) ) duplicate = 1;
+            }
+        }
+        if ( duplicate )
+        {
+            bcf_hrec_destroy(hrec);
+            return 0;
+        }
+    }
+
+    // New record, needs to be added
+    hdr->nhrec++;
+    hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, hdr->nhrec*sizeof(bcf_hrec_t*));
+    hdr->hrec[hdr->nhrec-1] = hrec;
+    hdr->dirty = 1;
+
+    if ( hrec->type==BCF_HL_GEN ) return 0;
+    return 1;
+}
+
+/*
+ * Looks up a header record.
+ *   GEN lines: matched on key and, when value is non-NULL, on value.
+ *   STR lines: matched on the record key str_class and on the key=value pair.
+ *   FLT,INFO,FMT,CTG lines: value is looked up in the hashed dictionary.
+ * Querying FLT,INFO,FMT,CTG is fast because the keys are hashed, whereas
+ * STR and GEN lines are found by a linear scan over the array of all header
+ * lines. This may become a problem for VCFs with huge headers; a dictionary
+ * for these lines might be needed as well.
+ * Returns NULL when no matching record exists.
+ */
+bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
+{
+    int i;
+    if ( type==BCF_HL_GEN )
+    {
+        for (i=0; i<hdr->nhrec; i++)
+        {
+            bcf_hrec_t *rec = hdr->hrec[i];
+            if ( rec->type==type && !strcmp(rec->key,key) && (!value || !strcmp(rec->value,value)) ) return rec;
+        }
+        return NULL;
+    }
+    if ( type==BCF_HL_STR )
+    {
+        for (i=0; i<hdr->nhrec; i++)
+        {
+            bcf_hrec_t *rec = hdr->hrec[i];
+            if ( rec->type!=type || strcmp(rec->key,str_class) ) continue;
+            int j = bcf_hrec_find_key(rec,key);
+            if ( j>=0 && !strcmp(rec->vals[j],value) ) return rec;
+        }
+        return NULL;
+    }
+
+    // Hashed kinds: contigs have their own dictionary and use slot 0
+    int is_ctg = type==BCF_HL_CTG;
+    vdict_t *d = is_ctg ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
+    khint_t k = kh_get(vdict, d, value);
+    if ( k == kh_end(d) ) return NULL;
+    return kh_val(d, k).hrec[is_ctg ? 0 : type];
+}
+
+// Checks that the FORMAT tags PL and GL, when defined, are declared with
+// Number=G. Each tag is checked until the first violation is found; the
+// warning itself is only printed when hts_verbose >= 2.
+void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
+{
+    static int PL_warned = 0, GL_warned = 0;
+    int id, bad;
+
+    if ( PL_warned == 0 )
+    {
+        id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
+        bad = bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G;
+        if ( bad )
+        {
+            if (hts_verbose >= 2) fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__);
+            PL_warned = 1;
+        }
+    }
+    if ( GL_warned == 0 )
+    {
+        id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
+        bad = bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G;
+        if ( bad )
+        {
+            if (hts_verbose >= 2) fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__);
+            GL_warned = 1;
+        }
+    }
+}
+
+// Parses the complete header text htxt into hdr: all "##" lines first, then
+// the sample column line, followed by a sync and a sanity check.
+// Returns the result of bcf_hdr_parse_sample_line().
+int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
+{
+ // needs_sync is accumulated below but not consulted: bcf_hdr_sync() is
+ // called unconditionally at the end
+ int len, needs_sync = 0;
+ char *p = htxt;
+
+ // Check sanity: "fileformat" string must come as first
+ // NB: p is not advanced here, so the main loop below parses the first
+ // line a second time
+ bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
+ if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
+ fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__);
+ needs_sync += bcf_hdr_add_hrec(hdr, hrec);
+
+ // The filter PASS must appear first in the dictionary
+ hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
+ needs_sync += bcf_hdr_add_hrec(hdr, hrec);
+
+ // Parse the whole header
+ // The loop ends at the first line for which bcf_hdr_parse_line() returns
+ // NULL, which includes malformed "##" lines as well as the column line
+ while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) )
+ {
+ needs_sync += bcf_hdr_add_hrec(hdr, hrec);
+ p += len;
+ }
+ int ret = bcf_hdr_parse_sample_line(hdr,p);
+ bcf_hdr_sync(hdr);
+ bcf_hdr_check_sanity(hdr);
+ return ret;
+}
+
+// Parses a single "##" header line and adds it to hdr.
+// Returns 0 when the line was parsed, -1 when it could not be parsed.
+int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
+{
+    int consumed;
+    bcf_hrec_t *rec = bcf_hdr_parse_line(hdr, line, &consumed);
+    if ( rec == NULL ) return -1;
+
+    bcf_hdr_add_hrec(hdr, rec);
+    return 0;
+}
+
+// Removes header records of the given type. With key == NULL every record of
+// that type is removed; otherwise every record whose ID (or, for generic
+// lines, whose key) equals key. For the hashed types (FLT, INFO, FMT, CTG)
+// the dictionary's pointer to the record is cleared before the record is
+// destroyed; the dictionary entry itself is kept. Marks the header dirty
+// whenever something was removed.
+void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
+{
+    int i = 0;
+    bcf_hrec_t *hrec;
+    if ( !key )
+    {
+        while ( i<hdr->nhrec )
+        {
+            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
+            hrec = hdr->hrec[i];
+
+            if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
+            {
+                // ID is usually the very first key, so index 0 is a valid hit
+                int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
+                if ( j>=0 )
+                {
+                    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
+                    khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]);
+                    // Clear the dictionary's pointer so that it does not
+                    // dangle once the record is destroyed below
+                    if ( k != kh_end(d) )
+                        kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
+                }
+            }
+
+            hdr->dirty = 1;
+            hdr->nhrec--;
+            if ( i < hdr->nhrec )
+                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
+            bcf_hrec_destroy(hrec);
+        }
+        return;
+    }
+    // Keyed removal: repeat until no matching record is left
+    while (1)
+    {
+        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
+        {
+            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
+            if ( !hrec ) return;
+
+            for (i=0; i<hdr->nhrec; i++)
+                if ( hdr->hrec[i]==hrec ) break;
+            assert( i<hdr->nhrec );
+
+            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
+            khint_t k = kh_get(vdict, d, key);
+            if ( k != kh_end(d) )
+                kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
+        }
+        else
+        {
+            for (i=0; i<hdr->nhrec; i++)
+            {
+                if ( hdr->hrec[i]->type!=type ) continue;
+                if ( type==BCF_HL_GEN )
+                {
+                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
+                }
+                else
+                {
+                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
+                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
+                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
+                }
+            }
+            if ( i==hdr->nhrec ) return;
+            hrec = hdr->hrec[i];
+        }
+
+        // Close the gap in the record array, then release the record
+        hdr->nhrec--;
+        if ( i < hdr->nhrec )
+            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
+        bcf_hrec_destroy(hrec);
+        hdr->dirty = 1;
+    }
+}
+
+// Formats a header line printf-style and appends it to hdr.
+// Returns the result of bcf_hdr_append() (0 on success, -1 when the line
+// cannot be parsed), or -1 when formatting or memory allocation fails.
+int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
+{
+    // First pass only measures the formatted length
+    va_list ap;
+    va_start(ap, fmt);
+    int n = vsnprintf(NULL, 0, fmt, ap);
+    va_end(ap);
+    if ( n < 0 ) return -1;
+    n += 2;
+
+    char *line = (char*)malloc(n);
+    if ( !line ) return -1;
+    va_start(ap, fmt);
+    vsnprintf(line, n, fmt, ap);
+    va_end(ap);
+
+    int ret = bcf_hdr_append(hdr, line);
+
+    free(line);
+    return ret;
+}
+
+
+/**********************
+ *** BCF header I/O ***
+ **********************/
+
+// Returns the value of the header's "fileformat" line. When there is none, a
+// notice is printed to stderr and the literal "VCFv4.2" is returned.
+const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
+{
+    bcf_hrec_t *rec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
+    if ( rec != NULL ) return rec->value;
+
+    fprintf(stderr,"No version string found, assuming VCFv4.2\n");
+    return "VCFv4.2";
+}
+
+// Sets the header's "fileformat" value to version. An existing fileformat
+// line has its value replaced; when there is none, a new line is parsed and
+// added to the header. Marks the header dirty.
+void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
+{
+    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
+    if ( !hrec )
+    {
+        int len;
+        kstring_t str = {0,0,0};
+        ksprintf(&str,"##fileformat=%s", version);
+        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
+        free(str.s);
+        // Hand the new record over to the header; without this it would be
+        // leaked and the requested version would never be stored.
+        // bcf_hdr_add_hrec() accepts NULL, so a failed parse is harmless.
+        bcf_hdr_add_hrec(hdr, hrec);
+    }
+    else
+    {
+        free(hrec->value);
+        hrec->value = strdup(version);
+    }
+    hdr->dirty = 1;
+}
+
+// Allocates an empty header with its three dictionaries. When mode contains
+// 'w', the "##fileformat=VCFv4.2" line and the PASS filter are added.
+// Returns NULL when the header or one of the dictionaries cannot be
+// allocated.
+bcf_hdr_t *bcf_hdr_init(const char *mode)
+{
+    bcf_hdr_t *hdr = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
+    if ( hdr == NULL ) return NULL;
+
+    int i;
+    for (i = 0; i < 3; ++i)
+    {
+        hdr->dict[i] = kh_init(vdict);
+        if ( hdr->dict[i] == NULL )
+        {
+            // undo everything: slots not yet created are still NULL
+            int j;
+            for (j = 0; j < 3; ++j)
+                kh_destroy(vdict, hdr->dict[j]);
+            free(hdr);
+            return NULL;
+        }
+    }
+
+    if ( strchr(mode,'w') )
+    {
+        bcf_hdr_append(hdr, "##fileformat=VCFv4.2");
+        // The filter PASS must appear first in the dictionary
+        bcf_hdr_append(hdr, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
+    }
+    return hdr;
+}
+
+// Frees a header and everything it owns: dictionary keys and tables, the id
+// arrays, all header records, the sample list, the sample-subset bitmap, the
+// translation tables and the scratch buffer.
+void bcf_hdr_destroy(bcf_hdr_t *h)
+{
+    int i;
+    khint_t k;
+    for (i = 0; i < 3; ++i) {
+        vdict_t *d = (vdict_t*)h->dict[i];
+        if (d == 0) continue;
+        for (k = kh_begin(d); k != kh_end(d); ++k)
+            if (kh_exist(d, k)) free((char*)kh_key(d, k));
+        kh_destroy(vdict, d);
+        free(h->id[i]);
+    }
+    for (i=0; i<h->nhrec; i++)
+        bcf_hrec_destroy(h->hrec[i]);
+    // Freed unconditionally: the array can outlive its contents when every
+    // record has been removed, and free(NULL) is a no-op
+    free(h->hrec);
+    free(h->samples);
+    free(h->keep_samples);
+    free(h->transl[0]); free(h->transl[1]);
+    free(h->mem.s);
+    free(h);
+}
+
+// Reads a header from hfp. VCF input is delegated to vcf_hdr_read(); for BCF
+// the magic string "BCF\2\2" is checked, then the header text length and the
+// text itself are read and parsed.
+// Returns a new header, or NULL on allocation failure, read error, bad magic
+// or an invalid header length.
+bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
+{
+    if (hfp->format.format == vcf)
+        return vcf_hdr_read(hfp);
+
+    BGZF *fp = hfp->fp.bgzf;
+    uint8_t magic[5];
+    bcf_hdr_t *h;
+    h = bcf_hdr_init("r");
+    if (!h) {
+        fprintf(stderr, "[E::%s] failed to allocate bcf header\n", __func__);
+        return NULL;
+    }
+    if (bgzf_read(fp, magic, 5) != 5)
+    {
+        fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__);
+        bcf_hdr_destroy(h);
+        return NULL;
+    }
+    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
+    {
+        if (!strncmp((char*)magic, "BCF", 3))
+            fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__);
+        else if (hts_verbose >= 2)
+            fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
+        bcf_hdr_destroy(h);
+        return NULL;
+    }
+    int hlen;
+    char *htxt = NULL;
+    if (bgzf_read(fp, &hlen, 4) != 4) goto fail;
+    // The length comes straight from the file: reject values that are
+    // negative or leave no room for the terminator added below
+    if (hlen < 0 || hlen == INT_MAX) goto fail;
+    htxt = (char*)malloc((size_t)hlen + 1);
+    if (!htxt) goto fail;
+    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
+    // The parser scans for NUL, so make sure the text is terminated even if
+    // the stored header is not
+    htxt[hlen] = 0;
+    bcf_hdr_parse(h, htxt); // FIXME: Does this return anything meaningful?
+    free(htxt);
+    return h;
+ fail:
+    if (hts_verbose >= 2) {
+        fprintf(stderr, "[E::%s] failed to read BCF header\n", __func__);
+    }
+    free(htxt);
+    bcf_hdr_destroy(h);
+    return NULL;
+}
+
+// Writes the header to hfp, syncing it first when it is dirty. VCF and
+// plain-text output is delegated to vcf_hdr_write(); BCF output consists of
+// the magic string "BCF\2\2", the text length (including the terminating
+// NUL) and the header text.
+// Returns 0 on success, -1 when the text cannot be produced or a write fails.
+int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
+{
+    if ( h->dirty ) bcf_hdr_sync(h);
+    if (hfp->format.format == vcf || hfp->format.format == text_format)
+        return vcf_hdr_write(hfp, h);
+
+    int hlen;
+    char *htxt = bcf_hdr_fmt_text(h, 1, &hlen);
+    if ( !htxt ) return -1;
+    hlen++; // include the \0 byte
+
+    BGZF *fp = hfp->fp.bgzf;
+    int ret = -1;
+    if ( bgzf_write(fp, "BCF\2\2", 5) == 5
+         && bgzf_write(fp, &hlen, 4) == 4
+         && bgzf_write(fp, htxt, hlen) == hlen ) ret = 0;
+
+    // single exit so the text buffer is released on failure as well
+    free(htxt);
+    return ret;
+}
+
+/********************
+ *** BCF site I/O ***
+ ********************/
+
+// Allocates a zero-initialised record. Returns NULL when the allocation
+// fails.
+bcf1_t *bcf_init()
+{
+    return (bcf1_t*)calloc(1, sizeof(bcf1_t));
+}
+
+// Resets a record for reuse. Separately allocated INFO and FORMAT blocks are
+// freed, counters and flags are zeroed, qual is set to missing, and the
+// shared/indiv buffers are emptied but kept allocated.
+void bcf_clear(bcf1_t *v)
+{
+    int i;
+    for (i=0; i<v->d.m_info; i++)
+    {
+        bcf_info_t *info = &v->d.info[i];
+        if ( !info->vptr_free ) continue;
+        // vptr points past the type descriptor; step back to the block start
+        free(info->vptr - info->vptr_off);
+        info->vptr_free = 0;
+    }
+    for (i=0; i<v->d.m_fmt; i++)
+    {
+        bcf_fmt_t *fmt = &v->d.fmt[i];
+        if ( !fmt->p_free ) continue;
+        free(fmt->p - fmt->p_off);
+        fmt->p_free = 0;
+    }
+
+    v->unpacked = 0;
+    v->rlen = 0;
+    v->pos = 0;
+    v->rid = 0;
+    bcf_float_set_missing(v->qual);
+    v->n_sample = 0;
+    v->n_fmt = 0;
+    v->n_allele = 0;
+    v->n_info = 0;
+    v->indiv.l = 0;
+    v->shared.l = 0;
+    v->d.var_type = -1;
+    v->d.shared_dirty = 0;
+    v->d.indiv_dirty = 0;
+    v->d.n_flt = 0;
+    v->errcode = 0;
+    if (v->d.m_als) v->d.als[0] = 0;
+    if (v->d.m_id) v->d.id[0] = 0;
+}
+
+// Clears the record and frees every buffer it owns. The record structure
+// itself is not freed.
+void bcf_empty(bcf1_t *v)
+{
+    bcf_clear1(v);
+
+    // decoded fields
+    free(v->d.id);
+    free(v->d.als);
+    free(v->d.allele);
+    free(v->d.flt);
+    free(v->d.info);
+    free(v->d.fmt);
+    free(v->d.var);
+
+    // encoded blocks
+    free(v->shared.s);
+    free(v->indiv.s);
+}
+
+// Frees a record together with all buffers it owns. Passing NULL is a
+// no-op, matching free().
+void bcf_destroy(bcf1_t *v)
+{
+    if ( !v ) return;
+    bcf_empty1(v);
+    free(v);
+}
+
+// Reads one BCF record from fp into v without decoding it: a 32-byte fixed
+// part followed by the shared and per-sample blocks.
+// Returns 0 on success, -1 when nothing could be read (end of file) or a
+// data block is truncated, and -2 when the fixed part is incomplete, the
+// stored shared length is too small, or a buffer cannot be resized.
+static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
+{
+    uint32_t x[8];
+    int ret;
+    if ((ret = bgzf_read(fp, x, 32)) != 32) {
+        if (ret == 0) return -1;
+        return -2;
+    }
+    bcf_clear1(v);
+    // The stored shared length includes the six fixed 32-bit integers; a
+    // smaller value would wrap around in the subtraction below
+    if ( x[0] < 24 ) return -2;
+    x[0] -= 24; // to exclude six 32-bit integers
+    if ( ks_resize(&v->shared, x[0]) < 0 ) return -2;
+    if ( ks_resize(&v->indiv, x[1]) < 0 ) return -2;
+    // copies the 16 bytes following the two lengths into the start of *v
+    memcpy(v, x + 2, 16);
+    v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
+    v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
+    v->shared.l = x[0], v->indiv.l = x[1];
+    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
+    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
+
+    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -1;
+    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -1;
+    return 0;
+}
+
+// Helpers for a bit set stored in a byte array: bit i lives in byte i/8 at
+// bit position i%8.
+// Number of bytes allocated for n bits (always at least one byte).
+#define bit_array_size(n) ((n)/8+1)
+// Set bit i of array a.
+#define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
+// Clear bit i of array a.
+#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
+// Non-zero when bit i of array a is set.
+#define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
+
+static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
+// Drops, in place, the per-sample FORMAT data of samples that are not
+// selected in hdr->keep_samples. Does nothing when no subset is defined; when
+// the header has no samples, the per-sample block is emptied.
+// On return the FORMAT fields are marked as unpacked and rec->n_sample is the
+// header's sample count. Always returns 0.
+int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
+{
+ if ( !hdr->keep_samples ) return 0;
+ if ( !bcf_hdr_nsamples(hdr) )
+ {
+ rec->indiv.l = rec->n_sample = 0;
+ return 0;
+ }
+
+ int i, j;
+ uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
+ bcf_dec_t *dec = &rec->d;
+ hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
+ for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
+
+ for (i=0; i<rec->n_fmt; i++)
+ {
+ // NOTE(review): bcf_unpack_fmt_core1 is defined elsewhere; it is assumed
+ // to fill fmt[i] (p, p_off, p_len, size) and return the start of the
+ // next field - confirm
+ ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
+ // src starts one element before the data because the sample loop below
+ // advances it before each use
+ src = dec->fmt[i].p - dec->fmt[i].size;
+ if ( dst )
+ {
+ // The previous field shrank: slide this field's leading p_off bytes down
+ // so that it directly follows the compacted previous field
+ memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
+ dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
+ }
+ dst = dec->fmt[i].p;
+ // Copy the kept samples' values towards the front, in original order
+ for (j=0; j<hdr->nsamples_ori; j++)
+ {
+ src += dec->fmt[i].size;
+ if ( !bit_array_test(hdr->keep_samples,j) ) continue;
+ memmove(dst, src, dec->fmt[i].size);
+ dst += dec->fmt[i].size;
+ }
+ // Shrink the block length by the number of bytes that were dropped
+ rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
+ dec->fmt[i].p_len = dst - dec->fmt[i].p;
+ }
+ rec->unpacked |= BCF_UN_FMT;
+
+ rec->n_sample = bcf_hdr_nsamples(hdr);
+ return 0;
+}
+
+// Reads the next record from fp into v. VCF input is delegated to
+// vcf_read(); BCF records are read with bcf_read1_core() and, when the header
+// defines a sample subset, reduced to that subset.
+// Returns the status of the underlying read or subsetting call.
+int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+{
+    if (fp->format.format == vcf) return vcf_read(fp,h,v);
+
+    int status = bcf_read1_core(fp->fp.bgzf, v);
+    if ( status == 0 && h->keep_samples ) return bcf_subset_format(h,v);
+    return status;
+}
+
+// Reads one BCF record into vv (a bcf1_t) and, on success, reports its
+// reference id and its [beg,end) interval through tid, beg and end. The
+// second argument is not used. Returns the status of bcf_read1_core().
+int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end)
+{
+    bcf1_t *rec = (bcf1_t *) vv;
+    int status = bcf_read1_core(fp, rec);
+    if ( status >= 0 )
+    {
+        *tid = rec->rid;
+        *beg = rec->pos;
+        *end = rec->pos + rec->rlen;
+    }
+    return status;
+}
+
+// Appends the record ID to str as a single typed string. A missing ID or the
+// placeholder "." is written as an empty character vector.
+static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str)
+{
+    int has_id = line->d.id && strcmp(line->d.id, ".");
+    if ( !has_id )
+    {
+        bcf_enc_size(str, 0, BCF_BT_CHAR);
+        return;
+    }
+    bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
+}
+// Appends every allele to str as a list of typed strings. When rlen is still
+// zero and there is at least one allele, rlen is set to the length of the
+// first allele.
+static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
+{
+    int a = 0;
+    while ( a < line->n_allele )
+    {
+        bcf_enc_vchar(str, strlen(line->d.allele[a]), line->d.allele[a]);
+        a++;
+    }
+    if ( line->n_allele && !line->rlen ) line->rlen = strlen(line->d.allele[0]);
+}
+// Appends the FILTER ids to str as a typed vector of integers; an empty
+// vector is written when the record has no filters.
+static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str)
+{
+    if ( !line->d.n_flt )
+    {
+        bcf_enc_vint(str, 0, 0, -1);
+        return;
+    }
+    bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
+}
+
+// Appends the encoded bytes of every INFO field that is still present to
+// str, and compacts line->d.info so that fields marked for removal (vptr ==
+// NULL) end up after the kept ones. n_info is reduced to the number of kept
+// fields.
+static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str)
+{
+ // pairs of typed vectors
+ // irm is the index of the first removed slot seen so far, or -1
+ int i, irm = -1;
+ for (i=0; i<line->n_info; i++)
+ {
+ bcf_info_t *info = &line->d.info[i];
+ if ( !info->vptr )
+ {
+ // marked for removal
+ if ( irm < 0 ) irm = i;
+ continue;
+ }
+ // copy the whole encoded field, starting vptr_off bytes before vptr
+ kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str);
+ if ( irm >=0 )
+ {
+ // swap the kept field into the earliest free slot, then move irm on
+ // to the next removed slot
+ bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
+ while ( irm<=i && line->d.info[irm].vptr ) irm++;
+ }
+ }
+ if ( irm>=0 ) line->n_info = irm;
+}
+
+// Brings the encoded BCF blocks of a record (line->shared and line->indiv)
+// in line with its decoded fields in line->d.
+//  - No shared block yet: it is built from ID, alleles, FILTER and INFO.
+//  - shared_dirty set: the block is rebuilt, re-encoding the parts flagged
+//    dirty and copying the rest from the old block.
+//  - The per-sample block is rebuilt when it is missing or indiv_dirty is
+//    set (and the record has samples and FORMAT fields).
+// Pointers in d.info[] / d.fmt[] are re-based onto the new buffers and the
+// dirty flags are cleared. Always returns 0.
+static int bcf1_sync(bcf1_t *line)
+{
+ char *shared_ori = line->shared.s;
+ size_t prev_len;
+
+ kstring_t tmp = {0,0,0};
+ if ( !line->shared.l )
+ {
+ // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
+ // tmp takes over the existing (empty) buffer so its allocation is reused
+ tmp = line->shared;
+ bcf1_sync_id(line, &tmp);
+ line->unpack_size[0] = tmp.l; prev_len = tmp.l;
+
+ bcf1_sync_alleles(line, &tmp);
+ line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
+
+ bcf1_sync_filter(line, &tmp);
+ line->unpack_size[2] = tmp.l - prev_len;
+
+ bcf1_sync_info(line, &tmp);
+ line->shared = tmp;
+ }
+ else if ( line->d.shared_dirty )
+ {
+ // The line was edited, update the BCF data block.
+
+ if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
+
+ // ptr_ori points to the original unchanged BCF data.
+ uint8_t *ptr_ori = (uint8_t *) line->shared.s;
+
+ // ID: single typed string
+ if ( line->d.shared_dirty & BCF1_DIRTY_ID )
+ bcf1_sync_id(line, &tmp);
+ else
+ kputsn_(ptr_ori, line->unpack_size[0], &tmp);
+ ptr_ori += line->unpack_size[0];
+ line->unpack_size[0] = tmp.l; prev_len = tmp.l;
+
+ // REF+ALT: list of typed strings
+ if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
+ bcf1_sync_alleles(line, &tmp);
+ else
+ {
+ kputsn_(ptr_ori, line->unpack_size[1], &tmp);
+ if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
+ }
+ ptr_ori += line->unpack_size[1];
+ line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
+
+ if ( line->unpacked & BCF_UN_FLT )
+ {
+ // FILTER: typed vector of integers
+ if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
+ bcf1_sync_filter(line, &tmp);
+ else if ( line->d.n_flt )
+ kputsn_(ptr_ori, line->unpack_size[2], &tmp);
+ else
+ bcf_enc_vint(&tmp, 0, 0, -1);
+ ptr_ori += line->unpack_size[2];
+ line->unpack_size[2] = tmp.l - prev_len;
+
+ if ( line->unpacked & BCF_UN_INFO )
+ {
+ // INFO: pairs of typed vectors
+ if ( line->d.shared_dirty & BCF1_DIRTY_INF )
+ {
+ bcf1_sync_info(line, &tmp);
+ // INFO was re-encoded in full, so nothing of the old block is left to copy
+ ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
+ }
+ }
+ }
+
+ // Copy whatever follows ptr_ori in the old block unchanged
+ int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
+ if ( size ) kputsn_(ptr_ori, size, &tmp);
+
+ free(line->shared.s);
+ line->shared = tmp;
+ }
+ if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
+ {
+ // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
+ // INFO data starts right after the ID, allele and FILTER parts
+ size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
+ int i;
+ for (i=0; i<line->n_info; i++)
+ {
+ // remember a separately allocated block so it can be freed once vptr
+ // has been redirected into the new shared buffer
+ uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
+ line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
+ off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
+ if ( vptr_free )
+ {
+ free(vptr_free);
+ line->d.info[i].vptr_free = 0;
+ }
+ }
+ }
+
+ if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
+ {
+ // The genotype fields changed or are not present
+ tmp.l = tmp.m = 0; tmp.s = NULL;
+ // irm is the index of the first removed FORMAT slot seen so far, or -1
+ int i, irm = -1;
+ for (i=0; i<line->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ if ( !fmt->p )
+ {
+ // marked for removal
+ if ( irm < 0 ) irm = i;
+ continue;
+ }
+ kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
+ if ( irm >=0 )
+ {
+ // swap the kept field into the earliest free slot
+ bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
+ while ( irm<=i && line->d.fmt[irm].p ) irm++;
+ }
+
+ }
+ if ( irm>=0 ) line->n_fmt = irm;
+ free(line->indiv.s);
+ line->indiv = tmp;
+
+ // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
+ size_t off_new = 0;
+ for (i=0; i<line->n_fmt; i++)
+ {
+ uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
+ line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
+ off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
+ if ( p_free )
+ {
+ free(p_free);
+ line->d.fmt[i].p_free = 0;
+ }
+ }
+ }
+ if ( !line->n_sample ) line->n_fmt = 0;
+ line->d.shared_dirty = line->d.indiv_dirty = 0;
+ return 0;
+}
+
+// Copy the packed content of record src into dst and return dst, or NULL if
+// a destination buffer could not be grown.  src is synced first so that any
+// pending edits are part of its packed blocks; only the fixed fields and the
+// two packed data blocks (shared, indiv) are copied.
+bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
+{
+    bcf1_sync(src);
+
+    bcf_clear(dst);
+    dst->rid = src->rid;
+    dst->pos = src->pos;
+    dst->rlen = src->rlen;
+    dst->qual = src->qual;
+    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
+    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
+
+    // Append into the buffers dst already holds instead of overwriting the
+    // pointers with fresh malloc() blocks: overwriting would drop any buffer
+    // dst still owns, and the malloc() result was used without a check.
+    // Empty blocks are skipped so a NULL source pointer is never copied from.
+    dst->shared.l = 0;
+    if ( src->shared.l && kputsn_(src->shared.s, src->shared.l, &dst->shared) < 0 ) return NULL;
+    dst->indiv.l = 0;
+    if ( src->indiv.l && kputsn_(src->indiv.s, src->indiv.l, &dst->indiv) < 0 ) return NULL;
+
+    return dst;
+}
+// Allocate a new record and fill it with a copy of src.  Returns the new
+// record, or NULL when it cannot be allocated or bcf_copy() reports failure.
+bcf1_t *bcf_dup(bcf1_t *src)
+{
+    bcf1_t *out = bcf_init1();
+    if ( !out ) return NULL;    // bcf_copy() would dereference the NULL record
+    if ( !bcf_copy(out, src) )
+    {
+        bcf_destroy1(out);
+        return NULL;
+    }
+    return out;
+}
+
+// Write one record to hfp.  Text outputs are handed to vcf_write(); otherwise
+// the record is synced and emitted as a 32-byte fixed header followed by the
+// shared and the per-sample block.  Returns 0 on success and -1 on a sample
+// count mismatch or a failed write.  The process is terminated when the
+// record still carries an unchecked parse error.
+int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
+{
+    if ( h->dirty ) bcf_hdr_sync(h);
+
+    if ( bcf_hdr_nsamples(h)!=v->n_sample )
+    {
+        fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
+            __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
+        return -1;
+    }
+
+    const int text_output = hfp->format.format == vcf || hfp->format.format == text_format;
+    if ( text_output ) return vcf_write(hfp,h,v);
+
+    if ( v->errcode )
+    {
+        // vcf_parse1() met a contig or tag that the header does not declare.
+        // The header has been printed by now, so carrying on would produce a
+        // broken BCF file.  The caller has to check and clear the error first.
+        fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode);
+        exit(1);
+    }
+    bcf1_sync(v);   // fold pending edits into the packed blocks
+
+    BGZF *out = hfp->fp.bgzf;
+    uint32_t fixed[8];
+    fixed[0] = v->shared.l + 24;    // shared block plus six 32-bit integers
+    fixed[1] = v->indiv.l;
+    memcpy(&fixed[2], v, 16);       // first 16 bytes of the record struct
+    fixed[6] = (uint32_t)v->n_allele<<16 | v->n_info;
+    fixed[7] = (uint32_t)v->n_fmt<<24 | v->n_sample;
+
+    if ( bgzf_write(out, fixed, 32) != 32 ) return -1;
+    if ( bgzf_write(out, v->shared.s, v->shared.l) != v->shared.l ) return -1;
+    if ( bgzf_write(out, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
+    return 0;
+}
+
+/**********************
+ *** VCF header I/O ***
+ **********************/
+
+// Read the text header of a VCF stream and return it as a parsed header, or
+// NULL on failure.  Lines are collected up to and including the first line
+// that starts with a single '#'.  When fp->fn_aux is set, one
+// "##contig=<ID=..,length=..>" line per entry of that file is inserted in
+// front of that line.  Finally, sequence names found in a tabix index of
+// fp->fn but absent from the header are added as contig records.
+bcf_hdr_t *vcf_hdr_read(htsFile *fp)
+{
+    kstring_t txt, *s = &fp->line;
+    bcf_hdr_t *h;
+    h = bcf_hdr_init("r");
+    if (!h) {
+        fprintf(stderr, "[E::%s] failed to allocate bcf header\n", __func__);
+        return NULL;
+    }
+    txt.l = txt.m = 0; txt.s = 0;
+    while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
+        if (s->l == 0) continue;
+        if (s->s[0] != '#') {
+            if (hts_verbose >= 2)
+                fprintf(stderr, "[E::%s] no sample line\n", __func__);
+            free(txt.s);
+            bcf_hdr_destroy(h);
+            return NULL;
+        }
+        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
+            int dret;
+            gzFile f;
+            kstream_t *ks;
+            kstring_t tmp;
+            tmp.l = tmp.m = 0; tmp.s = 0;
+            f = gzopen(fp->fn_aux, "r");
+            if ( !f )
+            {
+                // The stream reader must not be driven with a NULL handle
+                fprintf(stderr, "[E::%s] could not open the contig list '%s'\n", __func__, fp->fn_aux);
+                free(txt.s);
+                bcf_hdr_destroy(h);
+                return NULL;
+            }
+            ks = ks_init(f);
+            while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
+                int c;
+                kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
+                ks_getuntil(ks, 0, &tmp, &dret);
+                kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
+                kputsn(">\n", 2, &txt);
+                if (dret != '\n')
+                    while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
+            }
+            free(tmp.s);
+            ks_destroy(ks);
+            gzclose(f);
+        }
+        kputsn(s->s, s->l, &txt);
+        kputc('\n', &txt);
+        if (s->s[1] != '#') break;
+    }
+    if ( !txt.s )
+    {
+        fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__);
+        bcf_hdr_destroy(h);     // was leaked on this path
+        return NULL;
+    }
+    bcf_hdr_parse(h, txt.s);
+
+    // check tabix index, are all contigs listed in the header? add the missing ones
+    tbx_t *idx = tbx_index_load(fp->fn);
+    if ( idx )
+    {
+        int i, n, need_sync = 0;
+        const char **names = tbx_seqnames(idx, &n);
+        for (i=0; i<n; i++)
+        {
+            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
+            if ( hrec ) continue;
+            hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
+            if ( !hrec )
+            {
+                // Stop adding contigs; the records added so far are kept
+                fprintf(stderr, "[E::%s] out of memory while adding contig '%s'\n", __func__, names[i]);
+                break;
+            }
+            hrec->key = strdup("contig");
+            bcf_hrec_add_key(hrec, "ID", strlen("ID"));
+            bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
+            bcf_hdr_add_hrec(h, hrec);
+            need_sync = 1;
+        }
+        free(names);
+        tbx_destroy(idx);
+        if ( need_sync )
+            bcf_hdr_sync(h);
+    }
+    free(txt.s);
+    return h;
+}
+
+// Load header lines from the file fname into hdr.  Every line except the
+// last is parsed as a header record and added; the last line is parsed as
+// the sample line, after which the header is synced.  Returns 0 on success
+// and 1 when the file yields no lines.
+int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
+{
+    int i, n;
+    char **lines = hts_readlines(fname, &n);
+    if ( !lines ) return 1;
+    if ( n < 1 )
+    {
+        // Without this guard lines[n-1] below would index before the array
+        free(lines);
+        return 1;
+    }
+    for (i=0; i<n-1; i++)
+    {
+        int k;
+        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
+        if ( hrec ) bcf_hdr_add_hrec(hdr, hrec);
+        free(lines[i]);
+    }
+    bcf_hdr_parse_sample_line(hdr,lines[n-1]);
+    free(lines[n-1]);
+    free(lines);
+    bcf_hdr_sync(hdr);
+    return 0;
+}
+
+// Append the text form of one header record to str.  A record with a plain
+// value becomes "##key=value"; otherwise its key/value pairs are listed as
+// "##key=<k1=v1,k2=v2>".  The IDX pair is left out unless is_bcf is set.
+static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
+{
+    if ( hrec->value )
+    {
+        ksprintf(str,"##%s=%s\n", hrec->key,hrec->value);
+        return;
+    }
+
+    int idx, n_printed = 0;
+    ksprintf(str, "##%s=<", hrec->key);
+    for (idx = 0; idx < hrec->nkeys; idx++)
+    {
+        // IDX is only written when the output is BCF
+        if ( !is_bcf && strcmp("IDX",hrec->keys[idx]) == 0 ) continue;
+        if ( n_printed++ ) kputc(',',str);
+        ksprintf(str,"%s=%s", hrec->keys[idx], hrec->vals[idx]);
+    }
+    ksprintf(str,">\n");
+}
+
+// Append the VCF text form of a header record to str (the IDX key is omitted).
+void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
+{
+    const int is_bcf = 0;
+    _bcf_hrec_format(hrec, is_bcf, str);
+}
+// Render the header as text: every header record followed by the #CHROM
+// column line, which lists FORMAT and the sample names when the header has
+// samples.  Returns the text buffer; its length is stored in *len when len
+// is not NULL.  The caller frees the returned buffer (see vcf_hdr_write()).
+char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
+{
+    kstring_t txt;
+    int rec, smpl;
+    txt.l = txt.m = 0; txt.s = NULL;
+
+    for (rec = 0; rec < hdr->nhrec; rec++)
+        _bcf_hrec_format(hdr->hrec[rec], is_bcf, &txt);
+
+    ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
+    const int n_smpl = bcf_hdr_nsamples(hdr);
+    if ( n_smpl )
+    {
+        ksprintf(&txt,"\tFORMAT");
+        for (smpl = 0; smpl < n_smpl; smpl++)
+            ksprintf(&txt,"\t%s", hdr->samples[smpl]);
+    }
+    ksprintf(&txt,"\n");
+
+    if ( len ) *len = txt.l;
+    return txt.s;
+}
+
+// Build an array of the contig names of header h, indexed by contig id, and
+// store the number of names in *n.  The array is allocated here and is to be
+// released by the caller; the strings themselves are the dictionary keys and
+// are not copied.  Returns NULL with *n set to 0 when the array cannot be
+// allocated.
+const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
+{
+    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
+    int tid, m = kh_size(d);
+    const char **names = (const char**) calloc(m,sizeof(const char*));
+    if ( !names )
+    {
+        // Covers both allocation failure and an empty dictionary; in either
+        // case nothing may be written through the pointer.
+        *n = 0;
+        return NULL;
+    }
+    khint_t k;
+    for (k=kh_begin(d); k<kh_end(d); k++)
+    {
+        if ( !kh_exist(d,k) ) continue;
+        tid = kh_val(d,k).id;
+        assert( tid<m );
+        names[tid] = kh_key(d,k);
+    }
+    // sanity check: there should be no gaps
+    for (tid=0; tid<m; tid++)
+        assert(names[tid]);
+    *n = m;
+    return names;
+}
+
+// Format header h as VCF text (IDX keys omitted) and write it to fp, through
+// BGZF when the output is compressed and through the plain hFILE otherwise.
+// Returns 0 on success and -1 when the text could not be built or was not
+// written in full.
+int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
+{
+    int hlen = 0;
+    char *htxt = bcf_hdr_fmt_text(h, 0, &hlen);
+    if ( !htxt ) return -1;     // nothing was formatted; do not index a NULL buffer
+    while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros
+    int ret;
+    if ( fp->format.compression!=no_compression )
+        ret = bgzf_write(fp->fp.bgzf, htxt, hlen);
+    else
+        ret = hwrite(fp->fp.hfile, htxt, hlen);
+    free(htxt);
+    // Same success test as vcf_write(): anything but a complete write fails
+    return ret==hlen ? 0 : -1;
+}
+
+/***********************
+ *** Typed value I/O ***
+ ***********************/
+
+// Append the n integers in a to s as a typed BCF vector, using the narrowest
+// of the 8/16/32-bit encodings that holds every regular value.  Missing and
+// end-of-vector markers are ignored when choosing the width and translated
+// to the markers of the chosen width.  wsize is the vector length written to
+// the size descriptor; a value <= 0 means "use n".  n == 0 writes an empty
+// NULL-typed size and n == 1 writes a single typed integer.
+void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
+{
+    if (n == 0) { bcf_enc_size(s, 0, BCF_BT_NULL); return; }
+    if (n == 1) { bcf_enc_int1(s, a[0]); return; }
+
+    int32_t lo = INT32_MAX, hi = INT32_MIN + 1;
+    int k;
+    if (wsize <= 0) wsize = n;
+
+    // Range of the regular values only
+    for (k = 0; k < n; ++k) {
+        const int32_t val = a[k];
+        if (val == bcf_int32_missing || val == bcf_int32_vector_end) continue;
+        if (val > hi) hi = val;
+        if (val < lo) lo = val;
+    }
+
+    const int fits_int8 = hi <= INT8_MAX && lo > bcf_int8_vector_end;
+    const int fits_int16 = hi <= INT16_MAX && lo > bcf_int16_vector_end;
+
+    if (fits_int8) {
+        bcf_enc_size(s, wsize, BCF_BT_INT8);
+        for (k = 0; k < n; ++k) {
+            if ( a[k]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
+            else if ( a[k]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
+            else kputc(a[k], s);
+        }
+    } else if (fits_int16) {
+        bcf_enc_size(s, wsize, BCF_BT_INT16);
+        for (k = 0; k < n; ++k) {
+            int16_t narrow;
+            if ( a[k]==bcf_int32_vector_end ) narrow = bcf_int16_vector_end;
+            else if ( a[k]==bcf_int32_missing ) narrow = bcf_int16_missing;
+            else narrow = a[k];
+            kputsn((char*)&narrow, 2, s);
+        }
+    } else {
+        bcf_enc_size(s, wsize, BCF_BT_INT32);
+        for (k = 0; k < n; ++k) {
+            int32_t wide = a[k];
+            kputsn((char*)&wide, 4, s);
+        }
+    }
+}
+
+// Append n floats to s as a typed BCF vector: a FLOAT size descriptor
+// followed by the raw bytes of a (four per value).
+void bcf_enc_vfloat(kstring_t *s, int n, float *a)
+{
+    const int n_bytes = n * 4;
+    bcf_enc_size(s, n, BCF_BT_FLOAT);
+    kputsn((char*)a, n_bytes, s);
+}
+
+// Append the l characters at a to s as a typed BCF string: a CHAR size
+// descriptor followed by the bytes themselves.
+void bcf_enc_vchar(kstring_t *s, int l, const char *a)
+{
+    const int n_chars = l;
+    bcf_enc_size(s, n_chars, BCF_BT_CHAR);
+    kputsn(a, n_chars, s);
+}
+
+// Append the text form of a typed vector of n elements at data to s.
+// An empty vector prints as ".".  Character data is printed up to n bytes or
+// the first NUL, with the missing marker shown as ".".  Numeric data is
+// printed comma-separated, stopping at the first end-of-vector marker, with
+// missing values shown as ".".  An unsupported type terminates the process.
+void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
+{
+    int j = 0;
+    if (n == 0) {
+        kputc('.', s);
+        return;
+    }
+
+    if (type == BCF_BT_CHAR)
+    {
+        const char *txt = (const char*)data;
+        for (j = 0; j < n && txt[j]; ++j)
+        {
+            if ( txt[j]==bcf_str_missing ) kputc('.', s);
+            else kputc(txt[j], s);
+        }
+        return;
+    }
+
+    // The test and emit arguments are written in terms of the names p and j
+    #define FMT_VECTOR(elem_t, missing_test, end_test, emit) do { \
+        elem_t *p = (elem_t *) data; \
+        for (j = 0; j < n && !(end_test); j++) \
+        { \
+            if ( j ) kputc(',', s); \
+            if ( missing_test ) kputc('.', s); \
+            else emit; \
+        } \
+    } while (0)
+    switch (type) {
+        case BCF_BT_INT8:
+            FMT_VECTOR(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, kputw(p[j], s));
+            break;
+        case BCF_BT_INT16:
+            FMT_VECTOR(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, kputw(p[j], s));
+            break;
+        case BCF_BT_INT32:
+            FMT_VECTOR(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, kputw(p[j], s));
+            break;
+        case BCF_BT_FLOAT:
+            FMT_VECTOR(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), ksprintf(s, "%g", p[j]));
+            break;
+        default:
+            fprintf(stderr,"todo: type %d\n", type);
+            exit(1);
+            break;
+    }
+    #undef FMT_VECTOR
+}
+
+// Decode the size/type descriptor at ptr, append the text form of the vector
+// that follows it to s, and return the address just past that vector.
+uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
+{
+    int type;
+    const int n_elem = bcf_dec_size(ptr, &ptr, &type);
+    bcf_fmt_array(s, n_elem, type, ptr);
+    uint8_t *next = ptr + (n_elem << bcf_type_shift[type]);
+    return next;
+}
+
+/********************
+ *** VCF site I/O ***
+ ********************/
+
+typedef struct { // per-FORMAT-tag scratch state used by vcf_parse_format()
+ int key, max_m, size, offset; // key: dictionary id of the tag; max_m: largest vector size seen; size: bytes reserved per sample; offset: start of this tag's area in the scratch buffer
+ uint64_t is_gt:1, max_g:31, max_l:32; // is_gt: tag is "GT"; max_g: largest number of alleles seen (GT only); max_l: longest field length seen
+ uint32_t y; // header info word of the tag for FORMAT; bits 4-7 are compared against BCF_HT_* below
+ uint8_t *buf; // address of this tag's area inside the scratch buffer
+} fmt_aux_t;
+
+// Pad s with zero bytes so that its length becomes a multiple of eight.
+static inline void align_mem(kstring_t *s)
+{
+    const int rem = s->l & 7;
+    if ( !rem ) return;     // already aligned
+
+    uint64_t zeros = 0;
+    kputsn((char*)&zeros, 8 - rem, s);
+}
+
+// Parse the FORMAT column and the sample columns of one VCF line into v->indiv; p,q is the start and the end of the FORMAT field. Returns 0 on success (also when the header has no samples) and -1 on error; some malformed inputs call exit()/abort() instead.
+#define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */
+static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
+{
+ if ( !bcf_hdr_nsamples(h) ) return 0; // header declares no samples: the columns are ignored
+
+ char *r, *t;
+ int j, l, m, g;
+ khint_t k;
+ ks_tokaux_t aux1;
+ vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
+ kstring_t *mem = (kstring_t*)&h->mem; // const is cast away: the header's mem buffer is reused as scratch space
+ fmt_aux_t fmt[MAX_N_FMT]; // one entry per FORMAT tag, kept on the stack
+ mem->l = 0;
+
+ char *end = s->s + s->l;
+ if ( q>=end ) // the FORMAT field is the last thing on the line
+ {
+ fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1);
+ return -1;
+ }
+
+ // get format information from the dictionary
+ v->n_fmt = 0;
+ for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
+ if (j >= MAX_N_FMT) {
+ v->errcode |= BCF_ERR_LIMITS;
+ fprintf(stderr,"[E::%s] Error: FORMAT column at %s:%d lists more identifiers than htslib can handle.\n", __func__, bcf_seqname(h,v), v->pos+1);
+ return -1;
+ }
+
+ *(char*)aux1.p = 0; // terminate the current tag name in place
+ k = kh_get(vdict, d, t);
+ if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { // NOTE(review): 15 presumably means "no FORMAT definition for this id" - confirm
+ if (hts_verbose >= 2) fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t);
+ kstring_t tmp = {0,0,0};
+ int l;
+ ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
+ bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
+ free(tmp.s);
+ if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); // the const header is modified here
+ k = kh_get(vdict, d, t);
+ v->errcode = BCF_ERR_TAG_UNDEF; // NOTE(review): plain assignment, any error bits set earlier are overwritten
+ if (k == kh_end(d)) {
+ fprintf(stderr, "[E::%s] Could not add dummy header for FORMAT '%s'\n", __func__, t);
+ v->errcode |= BCF_ERR_TAG_INVALID;
+ return -1;
+ }
+ }
+ fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
+ fmt[j].key = kh_val(d, k).id;
+ fmt[j].is_gt = !strcmp(t, "GT");
+ fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
+ v->n_fmt++;
+ }
+ // compute max
+ int n_sample_ori = -1; // index of the current sample column in the input, counting skipped ones
+ r = q + 1; // r: position in the format string
+ l = 0, m = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
+ while ( r<end )
+ {
+ // can we skip some samples?
+ if ( h->keep_samples )
+ {
+ n_sample_ori++;
+ if ( !bit_array_test(h->keep_samples,n_sample_ori) )
+ {
+ while ( *r!='\t' && r<end ) r++; // NOTE(review): *r is read before the r<end test
+ if ( *r=='\t' ) { *r = 0; r++; }
+ continue;
+ }
+ }
+
+ // collect fmt stats: max vector size, length, number of alleles
+ j = 0; // j-th format field
+ for (;;)
+ {
+ if ( *r == '\t' ) *r = 0; // column separators are replaced by NUL so the fill pass can stop on them
+ if ( *r == ':' || !*r ) // end of field or end of sample
+ {
+ if (fmt[j].max_m < m) fmt[j].max_m = m;
+ if (fmt[j].max_l < l) fmt[j].max_l = l;
+ if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
+ l = 0, m = g = 1;
+ if ( *r==':' )
+ {
+ j++;
+ if ( j>=v->n_fmt ) // the sample has more fields than FORMAT lists
+ {
+ fprintf(stderr,"Incorrect number of FORMAT fields at %s:%d\n", h->id[BCF_DT_CTG][v->rid].key,v->pos+1);
+ exit(1);
+ }
+ }
+ else break;
+ }
+ else if ( *r== ',' ) m++;
+ else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++;
+ if ( r>=end ) break;
+ r++; l++;
+ }
+ v->n_sample++;
+ if ( v->n_sample == bcf_hdr_nsamples(h) ) break; // any further columns are not scanned
+ r++;
+ }
+
+ // allocate memory for arrays
+ for (j = 0; j < v->n_fmt; ++j) {
+ fmt_aux_t *f = &fmt[j];
+ if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
+ if ((f->y>>4&0xf) == BCF_HT_STR) {
+ f->size = f->is_gt? f->max_g << 2 : f->max_l; // GT: four bytes per allele; other strings: longest field
+ } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
+ f->size = f->max_m << 2; // four bytes per vector element
+ } else
+ {
+ fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf);
+ abort(); // I do not know how to do with Flag in the genotype fields
+ }
+ align_mem(mem);
+ f->offset = mem->l;
+ ks_resize(mem, mem->l + v->n_sample * f->size); // NOTE(review): the result of ks_resize() is not checked
+ mem->l += v->n_sample * f->size;
+ }
+ for (j = 0; j < v->n_fmt; ++j) // resolved only now, after every ks_resize() call above
+ fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
+ // fill the sample fields; at beginning of the loop, t points to the first char of a format
+ n_sample_ori = -1;
+ t = q + 1; m = 0; // m: sample id
+ while ( t<end )
+ {
+ // can we skip some samples?
+ if ( h->keep_samples )
+ {
+ n_sample_ori++;
+ if ( !bit_array_test(h->keep_samples,n_sample_ori) )
+ {
+ while ( *t && t<end ) t++; // stops on the NUL written by the first pass
+ t++;
+ continue;
+ }
+ }
+ if ( m == bcf_hdr_nsamples(h) ) break;
+
+ j = 0; // j-th format field, m-th sample
+ while ( t < end )
+ {
+ fmt_aux_t *z = &fmt[j++];
+ if ((z->y>>4&0xf) == BCF_HT_STR) {
+ if (z->is_gt) { // genotypes
+ int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
+ for (l = 0;; ++t) {
+ if (*t == '.') ++t, x[l++] = is_phased; // missing allele: only the phase bit is stored
+ else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased; // (allele index + 1) << 1, low bit is the phase flag
+#if THOROUGH_SANITY_CHECKS
+ assert( 0 ); // success of strtol,strtod not checked
+#endif
+ is_phased = (*t == '|');
+ if (*t != '|' && *t != '/') break;
+ }
+ if ( !l ) x[l++] = 0; // An empty field, insert missing value
+ for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
+ } else {
+ char *x = (char*)z->buf + z->size * m;
+ for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
+ for (; l < z->size; ++l) x[l] = 0; // pad with NUL up to the per-sample size
+ }
+ } else if ((z->y>>4&0xf) == BCF_HT_INT) {
+ int32_t *x = (int32_t*)(z->buf + z->size * m);
+ for (l = 0;; ++t) {
+ if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
+ else x[l++] = strtol(t, &t, 10);
+ if (*t != ',') break;
+ }
+ if ( !l ) x[l++] = bcf_int32_missing;
+ for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
+ } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
+ float *x = (float*)(z->buf + z->size * m);
+ for (l = 0;; ++t) {
+ if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
+ else x[l++] = strtod(t, &t);
+ if (*t != ',') break;
+ }
+ if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
+ for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
+ } else abort();
+
+ if (*t == '\0') { // end of this sample column
+ break;
+ }
+ else if (*t == ':') { // another field of the same sample follows
+ t++;
+ }
+ else {
+ char buffer[8];
+ fprintf(stderr,"[E::%s] Invalid character '%s' in '%s' FORMAT field at %s:%d\n", __FUNCTION__, dump_char(buffer, *t), h->id[BCF_DT_ID][z->key].key, bcf_seqname(h,v), v->pos+1);
+ v->errcode |= BCF_ERR_CHAR;
+ return -1;
+ }
+ }
+
+ for (; j < v->n_fmt; ++j) { // fill end-of-vector values
+ fmt_aux_t *z = &fmt[j];
+ if ((z->y>>4&0xf) == BCF_HT_STR) {
+ if (z->is_gt) {
+ int32_t *x = (int32_t*)(z->buf + z->size * m);
+ if (z->size) x[0] = bcf_int32_missing;
+ for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
+ } else {
+ char *x = (char*)z->buf + z->size * m;
+ if ( z->size ) x[0] = '.';
+ for (l = 1; l < z->size; ++l) x[l] = 0;
+ }
+ } else if ((z->y>>4&0xf) == BCF_HT_INT) {
+ int32_t *x = (int32_t*)(z->buf + z->size * m);
+ x[0] = bcf_int32_missing;
+ for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
+ } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
+ float *x = (float*)(z->buf + z->size * m);
+ bcf_float_set_missing(x[0]);
+ for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
+ }
+ }
+
+ m++; t++;
+ }
+
+ // write individual genotype information
+ kstring_t *str = &v->indiv;
+ int i;
+ if (v->n_sample > 0) {
+ for (i = 0; i < v->n_fmt; ++i) {
+ fmt_aux_t *z = &fmt[i];
+ bcf_enc_int1(str, z->key);
+ if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
+ bcf_enc_size(str, z->size, BCF_BT_CHAR);
+ kputsn((char*)z->buf, z->size * v->n_sample, str);
+ } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
+ bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
+ } else {
+ bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
+ kputsn((char*)z->buf, z->size * v->n_sample, str);
+ }
+ }
+ }
+
+ if ( v->n_sample!=bcf_hdr_nsamples(h) ) // checked only after v->indiv has been written
+ {
+ fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
+ __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
+ v->errcode |= BCF_ERR_NCOLS;
+ return -1;
+ }
+ if ( v->indiv.l > 0xffffffff ) // the block length must fit in 32 bits
+ {
+ fprintf(stderr,"[%s:%d %s] The FORMAT at %s:%d is too long...\n",
+ __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1);
+ v->errcode |= BCF_ERR_LIMITS;
+
+ // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
+ v->n_fmt = 0;
+ return -1;
+ }
+
+ return 0;
+}
+
+int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // Parse one tab-separated VCF text line in s into record v; s is modified in place. Returns 0 on success, -1 on error.
+{
+ int i = 0;
+ char *p, *q, *r, *t;
+ kstring_t *str;
+ khint_t k;
+ ks_tokaux_t aux;
+
+ bcf_clear1(v);
+ str = &v->shared; // ID, REF/ALT, FILTER and INFO are encoded into the shared block
+ memset(&aux, 0, sizeof(ks_tokaux_t));
+ for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) { // i: zero-based column index
+ q = (char*)aux.p;
+ *q = 0; // terminate the current column in place
+ if (i == 0) { // CHROM
+ vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
+ k = kh_get(vdict, d, p);
+ if (k == kh_end(d))
+ {
+ // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
+ // been already printed, but will enable tools like vcfcheck to proceed.
+ if (hts_verbose >= 2) fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
+ kstring_t tmp = {0,0,0};
+ int l;
+ ksprintf(&tmp, "##contig=<ID=%s>", p);
+ bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
+ free(tmp.s);
+ if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); // the const header is modified here
+ k = kh_get(vdict, d, p);
+ v->errcode = BCF_ERR_CTG_UNDEF;
+ if (k == kh_end(d)) {
+ fprintf(stderr, "[E::%s] Could not add dummy header for contig '%s'\n", __func__, p);
+ v->errcode |= BCF_ERR_CTG_INVALID;
+ return -1;
+ }
+ }
+ v->rid = kh_val(d, k).id;
+ } else if (i == 1) { // POS
+ v->pos = atoi(p) - 1; // stored zero-based; NOTE(review): atoi() gives no way to detect a malformed POS
+ } else if (i == 2) { // ID
+ if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
+ else bcf_enc_size(str, 0, BCF_BT_CHAR);
+ } else if (i == 3) { // REF
+ bcf_enc_vchar(str, q - p, p);
+ v->n_allele = 1, v->rlen = q - p; // rlen starts as the REF length; an INFO/END value may replace it below
+ } else if (i == 4) { // ALT
+ if (strcmp(p, ".")) {
+ for (r = t = p;; ++r) { // t: start of the current allele, r: scan position
+ if (*r == ',' || *r == 0) {
+ bcf_enc_vchar(str, r - t, t);
+ t = r + 1;
+ ++v->n_allele;
+ }
+ if (r == q) break;
+ }
+ }
+ } else if (i == 5) { // QUAL
+ if (strcmp(p, ".")) v->qual = atof(p);
+ else memcpy(&v->qual, &bcf_float_missing, 4);
+ if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR
+ } else if (i == 6) { // FILTER
+ if (strcmp(p, ".")) {
+ int32_t *a;
+ int n_flt = 1, i;
+ ks_tokaux_t aux1;
+ vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
+ // count the number of filters
+ if (*(q-1) == ';') *(q-1) = 0; // drop a trailing ';'
+ for (r = p; *r; ++r)
+ if (*r == ';') ++n_flt;
+ a = (int32_t*)alloca(n_flt * sizeof(int32_t)); // stack allocation sized by the input line
+ // add filters
+ for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
+ *(char*)aux1.p = 0;
+ k = kh_get(vdict, d, t);
+ if (k == kh_end(d))
+ {
+ // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
+ // been already printed, but will enable tools like vcfcheck to proceed.
+ if (hts_verbose >= 2) fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t);
+ kstring_t tmp = {0,0,0};
+ int l;
+ ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
+ bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
+ free(tmp.s);
+ if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
+ k = kh_get(vdict, d, t);
+ v->errcode = BCF_ERR_TAG_UNDEF; // NOTE(review): plain assignment, a CTG_UNDEF bit set for CHROM is overwritten
+ if (k == kh_end(d)) {
+ fprintf(stderr, "[E::%s] Could not add dummy header for FILTER '%s'\n", __func__, t);
+ v->errcode |= BCF_ERR_TAG_INVALID;
+ return -1;
+ }
+ }
+ a[i++] = kh_val(d, k).id;
+ }
+ n_flt = i;
+ bcf_enc_vint(str, n_flt, a, -1);
+ } else bcf_enc_vint(str, 0, 0, -1);
+ if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT
+ } else if (i == 7) { // INFO
+ char *key;
+ vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
+ v->n_info = 0;
+ if (strcmp(p, ".")) {
+ if (*(q-1) == ';') *(q-1) = 0; // drop a trailing ';'
+ for (r = key = p;; ++r) {
+ int c; // the delimiter that ended the current key or value
+ char *val, *end;
+ if (*r != ';' && *r != '=' && *r != 0) continue;
+ val = end = 0;
+ c = *r; *r = 0;
+ if (c == '=') {
+ val = r + 1;
+ for (end = val; *end != ';' && *end != 0; ++end);
+ c = *end; *end = 0;
+ } else end = r;
+ if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
+ k = kh_get(vdict, d, key);
+ if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) // NOTE(review): 15 presumably means "no INFO definition for this id" - confirm
+ {
+ if (hts_verbose >= 2) fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key);
+ kstring_t tmp = {0,0,0};
+ int l;
+ ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
+ bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
+ free(tmp.s);
+ if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
+ k = kh_get(vdict, d, key);
+ v->errcode = BCF_ERR_TAG_UNDEF;
+ if (k == kh_end(d)) {
+ fprintf(stderr, "[E::%s] Could not add dummy header for INFO '%s'\n", __func__, key);
+ v->errcode |= BCF_ERR_TAG_INVALID;
+ return -1;
+ }
+ }
+ uint32_t y = kh_val(d, k).info[BCF_HL_INFO]; // bits 4-7 are compared against BCF_HT_* below
+ ++v->n_info;
+ bcf_enc_int1(str, kh_val(d, k).id);
+ if (val == 0) { // key without "=value"
+ bcf_enc_size(str, 0, BCF_BT_NULL);
+ } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
+ bcf_enc_vchar(str, end - val, val);
+ } else { // int/float value/array
+ int i, n_val;
+ char *t, *te;
+ for (t = val, n_val = 1; *t; ++t) // count the number of values
+ if (*t == ',') ++n_val;
+ if ((y>>4&0xf) == BCF_HT_INT) {
+ int32_t *z;
+ z = (int32_t*)alloca(n_val * sizeof(int32_t)); // stack allocation sized by the input line
+ for (i = 0, t = val; i < n_val; ++i, ++t)
+ {
+ z[i] = strtol(t, &te, 10);
+ if ( te==t ) // conversion failed
+ {
+ z[i] = bcf_int32_missing;
+ while ( *te && *te!=',' ) te++;
+ }
+ t = te;
+ }
+ bcf_enc_vint(str, n_val, z, -1);
+ if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos; // END overrides the REF-derived length
+ } else if ((y>>4&0xf) == BCF_HT_REAL) {
+ float *z;
+ z = (float*)alloca(n_val * sizeof(float)); // stack allocation sized by the input line
+ for (i = 0, t = val; i < n_val; ++i, ++t)
+ {
+ z[i] = strtod(t, &te);
+ if ( te==t ) // conversion failed
+ {
+ bcf_float_set_missing(z[i]);
+ while ( *te && *te!=',' ) te++;
+ }
+ t = te;
+ }
+ bcf_enc_vfloat(str, n_val, z);
+ }
+ }
+ if (c == 0) break;
+ r = end;
+ key = r + 1;
+ }
+ }
+ if ( v->max_unpack && !(v->max_unpack>>3) ) return 0; // stop here when nothing beyond INFO was requested
+ } else if (i == 8) // FORMAT
+ return vcf_parse_format(s, h, v, p, q); // FORMAT and all sample columns are handled there
+ }
+ return 0;
+}
+
+// Read the next text line from fp into fp->line and parse it into v.
+// Returns -1 when no line could be read, otherwise the parser's result.
+int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+{
+    if ( hts_getline(fp, KS_SEP_LINE, &fp->line) < 0 ) return -1;
+    return vcf_parse1(&fp->line, h, v);
+}
+
+// Decode the descriptor of one FORMAT field starting at ptr into fmt: the
+// tag id, the per-sample vector length and type, and the location of the
+// values for all n_sample samples.  Returns the address just past the field.
+static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
+{
+    uint8_t *const field_start = ptr;
+
+    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
+    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
+    fmt->size = fmt->n << bcf_type_shift[fmt->type];    // bytes per sample
+
+    // ptr now addresses the first sample's values
+    fmt->p_free = 0;
+    fmt->p_off = ptr - field_start;
+    fmt->p = ptr;
+
+    uint8_t *field_end = ptr + n_sample * fmt->size;
+    fmt->p_len = field_end - fmt->p;
+    return field_end;
+}
+
+// Decode one INFO entry starting at ptr into info: the key, the vector
+// length and type, and the location of the values.  For a vector of length
+// one the value is also cached in info->v1.  Returns the address just past
+// the entry.
+static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
+{
+    uint8_t *const entry_start = ptr;
+
+    info->key = bcf_dec_typed_int1(ptr, &ptr);
+    info->len = bcf_dec_size(ptr, &ptr, &info->type);
+
+    // ptr now addresses the first value
+    info->vptr_free = 0;
+    info->vptr_off = ptr - entry_start;
+    info->vptr = ptr;
+
+    info->v1.i = 0;
+    if (info->len == 1) {
+        switch (info->type) {
+            case BCF_BT_INT8:
+            case BCF_BT_CHAR:  info->v1.i = *(int8_t*)ptr; break;
+            case BCF_BT_INT32: info->v1.i = *(int32_t*)ptr; break;
+            case BCF_BT_FLOAT: info->v1.f = *(float*)ptr; break;
+            case BCF_BT_INT16: info->v1.i = *(int16_t*)ptr; break;
+            default: break;
+        }
+    }
+
+    uint8_t *entry_end = ptr + (info->len << bcf_type_shift[info->type]);
+    info->vptr_len = entry_end - info->vptr;
+    return entry_end;
+}
+
+int bcf_unpack(bcf1_t *b, int which) // Decode the parts of record b selected by the BCF_UN_* bits in which into b->d; parts already decoded are skipped. Always returns 0.
+{
+ if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
+ uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
+ int *offset, i;
+ bcf_dec_t *d = &b->d;
+ if (which & BCF_UN_FLT) which |= BCF_UN_STR; // FILTER is located via the ID/allele sizes, so those are decoded too
+ if (which & BCF_UN_INFO) which |= BCF_UN_SHR; // INFO is located via all preceding sizes
+ if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
+ {
+ kstring_t tmp; // temporary view onto the d->id / d->als buffers so the kstring helpers can grow them
+
+ // ID
+ tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
+ ptr_ori = ptr;
+ ptr = bcf_fmt_sized_array(&tmp, ptr);
+ b->unpack_size[0] = ptr - ptr_ori; // packed size of the ID
+ kputc('\0', &tmp);
+ d->id = tmp.s; d->m_id = tmp.m; // store back, the buffer may have been reallocated
+
+ // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
+ tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
+ offset = (int*)alloca(b->n_allele * sizeof(int)); // offsets are recorded first because tmp.s may move while it grows
+ ptr_ori = ptr;
+ for (i = 0; i < b->n_allele; ++i) {
+ offset[i] = tmp.l;
+ ptr = bcf_fmt_sized_array(&tmp, ptr);
+ kputc('\0', &tmp);
+ }
+ b->unpack_size[1] = ptr - ptr_ori; // packed size of all alleles
+ d->als = tmp.s; d->m_als = tmp.m;
+
+ hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
+ for (i = 0; i < b->n_allele; ++i)
+ d->allele[i] = d->als + offset[i];
+ b->unpacked |= BCF_UN_STR;
+ }
+ if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
+ ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
+ ptr_ori = ptr;
+ if (*ptr>>4) { // a non-zero high nibble of the descriptor byte: filters are present
+ int type;
+ d->n_flt = bcf_dec_size(ptr, &ptr, &type);
+ hts_expand(int, d->n_flt, d->m_flt, d->flt);
+ for (i = 0; i < d->n_flt; ++i)
+ d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
+ } else ++ptr, d->n_flt = 0; // no filters: skip the single descriptor byte
+ b->unpack_size[2] = ptr - ptr_ori; // packed size of FILTER
+ b->unpacked |= BCF_UN_FLT;
+ }
+ if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
+ ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
+ hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
+ for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0; // cleared for every allocated slot, not only the used ones
+ for (i = 0; i < b->n_info; ++i)
+ ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
+ b->unpacked |= BCF_UN_INFO;
+ }
+ if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
+ ptr = (uint8_t*)b->indiv.s;
+ hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
+ for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0; // cleared for every allocated slot, not only the used ones
+ for (i = 0; i < b->n_fmt; ++i)
+ ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
+ b->unpacked |= BCF_UN_FMT;
+ }
+ return 0;
+}
+
+int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) // Append record v to s as one VCF text line terminated by a newline. Returns 0; an unsupported value type or a negative FORMAT id terminates the process.
+{
+ int i;
+ bcf_unpack((bcf1_t*)v, BCF_UN_ALL); // const is cast away: the record is decoded in place first
+ kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
+ kputc('\t', s); kputw(v->pos + 1, s); // POS
+ kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
+ kputc('\t', s); // REF
+ if (v->n_allele > 0) kputs(v->d.allele[0], s);
+ else kputc('.', s);
+ kputc('\t', s); // ALT
+ if (v->n_allele > 1) {
+ for (i = 1; i < v->n_allele; ++i) {
+ if (i > 1) kputc(',', s);
+ kputs(v->d.allele[i], s);
+ }
+ } else kputc('.', s);
+ kputc('\t', s); // QUAL
+ if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
+ else ksprintf(s, "%g", v->qual);
+ kputc('\t', s); // FILTER
+ if (v->d.n_flt) {
+ for (i = 0; i < v->d.n_flt; ++i) {
+ if (i) kputc(';', s);
+ kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s);
+ }
+ } else kputc('.', s);
+ kputc('\t', s); // INFO
+ if (v->n_info) {
+ int first = 1; // stays 1 until an entry has been printed
+ for (i = 0; i < v->n_info; ++i) {
+ bcf_info_t *z = &v->d.info[i];
+ if ( !z->vptr ) continue; // entries without a value pointer are not printed
+ if ( !first ) kputc(';', s);
+ first = 0;
+ kputs(h->id[BCF_DT_ID][z->key].key, s);
+ if (z->len <= 0) continue; // key only, no "=value"
+ kputc('=', s);
+ if (z->len == 1) // single value: use the copy cached in v1
+ {
+ switch (z->type)
+ {
+ case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
+ case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
+ case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else ksprintf(s, "%g", z->v1.f); break;
+ case BCF_BT_CHAR: kputc(z->v1.i, s); break;
+ default: fprintf(stderr,"todo: type %d\n", z->type); exit(1); break;
+ }
+ }
+ else bcf_fmt_array(s, z->len, z->type, z->vptr);
+ }
+ if ( first ) kputc('.', s); // every entry was skipped
+ } else kputc('.', s);
+ // FORMAT and individual information
+ if (v->n_sample)
+ {
+ int i,j;
+ if ( v->n_fmt)
+ {
+ int gt_i = -1; // index of the GT field, -1 when absent
+ bcf_fmt_t *fmt = v->d.fmt;
+ int first = 1;
+ for (i = 0; i < (int)v->n_fmt; ++i) {
+ if ( !fmt[i].p ) continue; // fields without a data pointer are not printed
+ kputc(!first ? ':' : '\t', s); first = 0;
+ if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
+ {
+ fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id);
+ abort();
+ }
+ kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
+ if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
+ }
+ if ( first ) kputs("\t.", s); // every field was skipped: empty FORMAT column
+ for (j = 0; j < v->n_sample; ++j) {
+ kputc('\t', s);
+ first = 1;
+ for (i = 0; i < (int)v->n_fmt; ++i) {
+ bcf_fmt_t *f = &fmt[i];
+ if ( !f->p ) continue;
+ if (!first) kputc(':', s);
+ first = 0;
+ if (gt_i == i)
+ bcf_format_gt(f,j,s);
+ else
+ bcf_fmt_array(s, f->n, f->type, f->p + j * f->size); // values of sample j
+ }
+ if ( first ) kputc('.', s);
+ }
+ }
+ else
+ for (j=0; j<=v->n_sample; j++) // "<=": one "." for the FORMAT column plus one per sample
+ kputs("\t.", s);
+ }
+ kputc('\n', s);
+ return 0;
+}
+
+ // Write an already formatted VCF text line to fp.
+ //
+ // A '\n' is appended to line when it does not already end with one, so the
+ // caller's kstring may grow by one byte. Compressed outputs are written with
+ // bgzf_write(), everything else with hwrite().
+ //
+ // Returns 0 when all line->l bytes were written, -1 otherwise.
+ int vcf_write_line(htsFile *fp, kstring_t *line)
+{
+     int ret;
+     // An empty line has no last character to inspect: line->s[line->l-1]
+     // would index before the buffer, so treat it as "needs a newline".
+     if ( line->l==0 || line->s[line->l-1]!='\n' ) kputc('\n',line);
+     if ( fp->format.compression!=no_compression )
+         ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
+     else
+         ret = hwrite(fp->fp.hfile, line->s, line->l);
+     return ret==line->l ? 0 : -1;
+}
+
+ // Format record v as VCF text into the file's scratch buffer (fp->line)
+ // and write that buffer out, through bgzf_write() for compressed files and
+ // hwrite() otherwise.
+ //
+ // Returns 0 when the whole formatted line was written, -1 otherwise.
+ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+{
+     kstring_t *buf = &fp->line;
+     int written;
+
+     buf->l = 0;                 // reuse the buffer from the previous record
+     vcf_format1(h, v, buf);
+
+     if ( fp->format.compression==no_compression )
+         written = hwrite(fp->fp.hfile, buf->s, buf->l);
+     else
+         written = bgzf_write(fp->fp.bgzf, buf->s, buf->l);
+
+     if ( written==buf->l ) return 0;
+     return -1;
+}
+
+/************************
+ * Data access routines *
+ ************************/
+
+ // Look up the string id in header dictionary number `which`.
+ // Returns the numeric id stored for it, or -1 when the key is absent.
+ int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
+{
+     vdict_t *dict = (vdict_t*)h->dict[which];
+     khint_t it = kh_get(vdict, dict, id);
+     if ( it == kh_end(dict) ) return -1;
+     return kh_val(dict, it).id;
+}
+
+
+/********************
+ *** BCF indexing ***
+ ********************/
+
+ // Build an in-memory CSI index for the BCF stream fp.
+ //
+ // The header is read from fp first; the number of index levels is derived
+ // from the longest contig length recorded in the header (2^31-1 is assumed
+ // when no length is available) and min_shift. Every record read afterwards
+ // is pushed into the index with the interval [pos, pos+rlen).
+ //
+ // Returns the finished index, or NULL when the header cannot be read, an
+ // allocation fails, or hts_idx_push() rejects a record. All temporaries,
+ // including the header, are released on every path.
+ hts_idx_t *bcf_index(htsFile *fp, int min_shift)
+{
+     int n_lvls, i;
+     bcf1_t *b = NULL;
+     hts_idx_t *idx = NULL;
+     bcf_hdr_t *h;
+     int64_t max_len = 0, s;
+     h = bcf_hdr_read(fp);
+     if ( !h ) return NULL;
+     int nids = 0;
+     for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
+     {
+         if ( !h->id[BCF_DT_CTG][i].val ) continue;
+         if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0];
+         nids++;
+     }
+     if ( !max_len ) max_len = ((int64_t)1<<31) - 1;  // In case contig line is broken.
+     max_len += 256;
+     for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
+     idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
+     if ( !idx ) goto fail;
+     b = bcf_init1();
+     if ( !b ) goto fail;
+     while (bcf_read1(fp,h, b) >= 0) {
+         int ret;
+         ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
+         if (ret < 0) goto fail;
+     }
+     hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
+     bcf_destroy1(b);
+     bcf_hdr_destroy(h);
+     return idx;
+
+ fail:
+     // Single cleanup path so the header is no longer leaked on errors.
+     if ( b ) bcf_destroy1(b);
+     if ( idx ) hts_idx_destroy(idx);
+     bcf_hdr_destroy(h);
+     return NULL;
+}
+
+ // Load the index for fn: from the explicit index file fnidx when one is
+ // given, otherwise through bcf_index_load(fn).
+ hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
+{
+     if ( fnidx )
+         return hts_idx_load2(fn, fnidx);
+     return bcf_index_load(fn);
+}
+
+ // Index the BGZF-compressed BCF file fn and save the CSI index via
+ // hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI).
+ //
+ // Returns -1 when the file cannot be opened, is not BGZF-compressed, or
+ // cannot be indexed; otherwise the return value of hts_idx_save_as().
+ int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
+{
+     htsFile *fp = hts_open(fn, "rb");
+     if ( fp == 0 ) return -1;
+
+     // Only BGZF input can be indexed; anything else leaves idx NULL.
+     hts_idx_t *idx = NULL;
+     if ( fp->format.compression==bgzf )
+         idx = bcf_index(fp, min_shift);
+     hts_close(fp);
+     if ( !idx ) return -1;
+
+     int ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
+     hts_idx_destroy(idx);
+     return ret;
+}
+
+ // Convenience wrapper around bcf_index_build2() that passes NULL as the
+ // index file name. Returns whatever bcf_index_build2() returns.
+ int bcf_index_build(const char *fn, int min_shift)
+{
+     const char *no_explicit_index_name = NULL;
+     return bcf_index_build2(fn, no_explicit_index_name, min_shift);
+}
+
+/*****************
+ *** Utilities ***
+ *****************/
+
+ // Add to dst every header record of src that dst does not have yet.
+ //
+ //  - Generic lines with a value are compared by key only against the
+ //    generic lines dst had on entry.
+ //  - Structured (BCF_HL_STR) lines are matched by key and ID; lines without
+ //    an ID are ignored.
+ //  - All other lines are matched by type and ID. For INFO/FORMAT tags that
+ //    exist in both headers the length and type codes are compared and a
+ //    warning is printed when they differ.
+ //
+ // dst is re-synced when bcf_hdr_add_hrec() reported a change.
+ // Returns 0, or 1 when at least one conflicting tag definition was seen.
+ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
+{
+     int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0;
+     for (i=0; i<src->nhrec; i++)
+     {
+         bcf_hrec_t *hrec = src->hrec[i];
+         if ( hrec->type==BCF_HL_GEN && hrec->value )
+         {
+             int j;
+             for (j=0; j<ndst_ori; j++)
+             {
+                 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
+
+                 // Checking only the key part of generic lines, otherwise
+                 // the VCFs are too verbose. Should we perhaps add a flag
+                 // to bcf_hdr_combine() and make this optional?
+                 if ( !strcmp(hrec->key,dst->hrec[j]->key) ) break;
+             }
+             if ( j>=ndst_ori )
+                 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(hrec));
+         }
+         else if ( hrec->type==BCF_HL_STR )
+         {
+             // NB: we are ignoring fields without ID
+             int j = bcf_hrec_find_key(hrec,"ID");
+             if ( j>=0 )
+             {
+                 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, hrec->type, "ID", hrec->vals[j], hrec->key);
+                 if ( !rec )
+                     need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(hrec));
+             }
+         }
+         else
+         {
+             int j = bcf_hrec_find_key(hrec,"ID");
+             assert( j>=0 ); // this should always be true for valid VCFs
+
+             // Use the value at the ID key's own position; ID is not
+             // guaranteed to be the first key of the record.
+             const char *id = hrec->vals[j];
+             bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, hrec->type, "ID", id, NULL);
+             if ( !rec )
+                 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(hrec));
+             else if ( hrec->type==BCF_HL_INFO || hrec->type==BCF_HL_FMT )
+             {
+                 // Check that both records are of the same type. The bcf_hdr_id2length
+                 // macro cannot be used here because dst header is not synced yet.
+                 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
+                 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
+                 khint_t k_src  = kh_get(vdict, d_src, id);
+                 khint_t k_dst  = kh_get(vdict, d_dst, id);
+                 // Nothing to compare when either dictionary lacks the key;
+                 // kh_val() must not be applied to kh_end().
+                 if ( k_src==kh_end(d_src) || k_dst==kh_end(d_dst) ) continue;
+                 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
+                 {
+                     fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", id);
+                     ret |= 1;
+                 }
+                 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
+                 {
+                     fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", id);
+                     ret |= 1;
+                 }
+             }
+         }
+     }
+     if ( need_sync ) bcf_hdr_sync(dst);
+     return ret;
+}
+
+ // Merge the header records of src into dst and return dst.
+ //
+ // When dst is NULL a new header is created by formatting src as text and
+ // parsing it again; NULL is returned if that header cannot be allocated.
+ //
+ // Otherwise every record of src that dst lacks is added:
+ //  - generic lines with a value are compared by key only against the
+ //    generic lines dst had on entry;
+ //  - structured (BCF_HL_STR) lines are matched by key and ID, lines without
+ //    an ID are ignored;
+ //  - all other lines are matched by type and ID, and for INFO/FORMAT tags
+ //    present in both headers a warning is printed when the length or type
+ //    codes differ.
+ // dst is re-synced when bcf_hdr_add_hrec() reported a change.
+ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
+{
+     if ( !dst )
+     {
+         // this will effectively strip existing IDX attributes from src to become dst
+         dst = bcf_hdr_init("r");
+         if ( !dst )
+         {
+             fprintf(stderr, "[E::%s] failed to allocate bcf header\n", __func__);
+             return NULL;
+         }
+         char *htxt = bcf_hdr_fmt_text(src, 0, NULL);
+         bcf_hdr_parse(dst, htxt);
+         free(htxt);
+         return dst;
+     }
+
+     int i, ndst_ori = dst->nhrec, need_sync = 0;
+     for (i=0; i<src->nhrec; i++)
+     {
+         bcf_hrec_t *hrec = src->hrec[i];
+         if ( hrec->type==BCF_HL_GEN && hrec->value )
+         {
+             int j;
+             for (j=0; j<ndst_ori; j++)
+             {
+                 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
+
+                 // Checking only the key part of generic lines, otherwise
+                 // the VCFs are too verbose. Should we perhaps add a flag
+                 // to bcf_hdr_combine() and make this optional?
+                 if ( !strcmp(hrec->key,dst->hrec[j]->key) ) break;
+             }
+             if ( j>=ndst_ori )
+                 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(hrec));
+         }
+         else if ( hrec->type==BCF_HL_STR )
+         {
+             // NB: we are ignoring fields without ID
+             int j = bcf_hrec_find_key(hrec,"ID");
+             if ( j>=0 )
+             {
+                 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, hrec->type, "ID", hrec->vals[j], hrec->key);
+                 if ( !rec )
+                     need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(hrec));
+             }
+         }
+         else
+         {
+             int j = bcf_hrec_find_key(hrec,"ID");
+             assert( j>=0 ); // this should always be true for valid VCFs
+
+             // Use the value at the ID key's own position; ID is not
+             // guaranteed to be the first key of the record.
+             const char *id = hrec->vals[j];
+             bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, hrec->type, "ID", id, NULL);
+             if ( !rec )
+                 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(hrec));
+             else if ( hrec->type==BCF_HL_INFO || hrec->type==BCF_HL_FMT )
+             {
+                 // Check that both records are of the same type. The bcf_hdr_id2length
+                 // macro cannot be used here because dst header is not synced yet.
+                 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
+                 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
+                 khint_t k_src  = kh_get(vdict, d_src, id);
+                 khint_t k_dst  = kh_get(vdict, d_dst, id);
+                 // Nothing to compare when either dictionary lacks the key;
+                 // kh_val() must not be applied to kh_end().
+                 if ( k_src==kh_end(d_src) || k_dst==kh_end(d_dst) ) continue;
+                 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
+                     fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", id);
+                 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
+                     fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", id);
+             }
+         }
+     }
+     if ( need_sync ) bcf_hdr_sync(dst);
+     return dst;
+}
+ // Rewrite the numeric ids stored in `line` (contig, FILTER, INFO and FORMAT
+ // keys) from the numbering of src_hdr to the numbering of dst_hdr.
+ //
+ // On the first call for a given src_hdr the translation tables
+ // src_hdr->transl[] are built by looking every src key up in dst_hdr;
+ // src_hdr->ntransl counts the ids that differ and is set to -1 when no
+ // translation is needed, in which case the function returns immediately on
+ // this and all later calls. Ids with no counterpart in dst_hdr (table entry
+ // -1) are left untouched.
+ //
+ // An INFO/FORMAT key is patched in place when the new id needs the same
+ // integer width as the old one; otherwise the field is re-encoded into a
+ // newly allocated buffer owned by the field (vptr_free / p_free set).
+ //
+ // Exits the process when line->errcode is set. Always returns 0 otherwise.
+ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
+{
+     int i;
+     if ( line->errcode )
+     {
+         fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode);
+         exit(1);
+     }
+     if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
+     if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
+     {
+         int dict;
+         for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
+         {
+             src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
+             for (i=0; i<src_hdr->n[dict]; i++)
+             {
+                 if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
+                 {
+                     src_hdr->transl[dict][i] = -1;
+                     continue;
+                 }
+                 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
+                 if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
+             }
+         }
+         if ( !src_hdr->ntransl )
+         {
+             free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
+             free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
+             src_hdr->ntransl = -1;
+         }
+         if ( src_hdr->ntransl==-1 ) return 0;
+     }
+     bcf_unpack(line,BCF_UN_ALL);
+
+     // CHROM
+     if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
+
+     // FILTER
+     for (i=0; i<line->d.n_flt; i++)
+     {
+         int src_id = line->d.flt[i];
+         if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
+             line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
+         line->d.shared_dirty |= BCF1_DIRTY_FLT;
+     }
+
+     // INFO
+     for (i=0; i<line->n_info; i++)
+     {
+         int src_id = line->d.info[i].key;
+         int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
+         if ( dst_id<0 ) continue;
+         int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
+         int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
+         if ( src_size==dst_size )   // can overwrite
+         {
+             line->d.info[i].key = dst_id;
+             // vptr points at the start of the encoded key; the id value
+             // itself starts one byte later. All widths must therefore be
+             // stored at offset 1, as the 8-bit case and the FORMAT branch
+             // below do; writing at offset 0 would clobber the first byte.
+             uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
+             if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
+             else if ( dst_size==BCF_BT_INT16 ) { uint16_t id16 = (uint16_t)dst_id; memcpy(vptr+1, &id16, sizeof(id16)); }
+             else { uint32_t id32 = (uint32_t)dst_id; memcpy(vptr+1, &id32, sizeof(id32)); }
+         }
+         else    // must realloc
+         {
+             bcf_info_t *info = &line->d.info[i];
+             assert( !info->vptr_free );
+             kstring_t str = {0,0,0};
+             bcf_enc_int1(&str, dst_id);
+             bcf_enc_size(&str, info->len,info->type);
+             info->vptr_off = str.l;
+             kputsn((char*)info->vptr, info->vptr_len, &str);
+             info->vptr = (uint8_t*)str.s + info->vptr_off;
+             info->vptr_free = 1;
+             info->key = dst_id;
+             line->d.shared_dirty |= BCF1_DIRTY_INF;
+         }
+     }
+
+     // FORMAT
+     for (i=0; i<line->n_fmt; i++)
+     {
+         int src_id = line->d.fmt[i].id;
+         int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
+         if ( dst_id<0 ) continue;
+         int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
+         int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
+         if ( src_size==dst_size )   // can overwrite
+         {
+             line->d.fmt[i].id = dst_id;
+             uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
+             if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
+             else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; }
+             else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
+         }
+         else    // must realloc
+         {
+             bcf_fmt_t *fmt = &line->d.fmt[i];
+             assert( !fmt->p_free );
+             kstring_t str = {0,0,0};
+             bcf_enc_int1(&str, dst_id);
+             bcf_enc_size(&str, fmt->n, fmt->type);
+             fmt->p_off = str.l;
+             kputsn((char*)fmt->p, fmt->p_len, &str);
+             fmt->p = (uint8_t*)str.s + fmt->p_off;
+             fmt->p_free = 1;
+             fmt->id = dst_id;
+             line->d.indiv_dirty = 1;
+         }
+     }
+     return 0;
+}
+
+ // Create a copy of hdr by formatting it as text and parsing that text into
+ // a freshly initialised header.
+ // Returns the new header, or NULL (after printing an error) when the
+ // header object cannot be allocated.
+ bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
+{
+     bcf_hdr_t *copy = bcf_hdr_init("r");
+     char *text = bcf_hdr_fmt_text(hdr, 1, NULL);
+
+     if ( copy )
+         bcf_hdr_parse(copy, text);
+     else
+         fprintf(stderr, "[E::%s] failed to allocate bcf header\n", __func__);
+
+     free(text);     // the text is only an intermediate in both cases
+     return copy;
+}
+
+ // Create a new header from h0 that keeps only the listed samples.
+ //
+ // h0       source header
+ // n        number of entries in samples[] and imap[]
+ // samples  requested sample names
+ // imap     output: imap[i] is the index of samples[i] in h0, or -1 when
+ //          h0 has no such sample (that name is then left out)
+ //
+ // When h0 has samples, the header text is cut after the fixed columns of
+ // the "#CHROM" line (including FORMAT when n is non-zero) and the requested
+ // sample names are appended; otherwise the header text is copied whole.
+ //
+ // Returns the new header, or NULL when the header cannot be allocated, the
+ // "#CHROM" line is missing or too short, or a sample name is listed twice.
+ // All temporaries are released on every path.
+ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
+{
+     int hlen;
+     void *names_hash = khash_str2int_init();
+     char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen);
+     kstring_t str = {0,0,0};
+     bcf_hdr_t *h = bcf_hdr_init("w");
+     if (!h) {
+         fprintf(stderr, "[E::%s] failed to allocate bcf header\n", __func__);
+         goto fail;
+     }
+     bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
+     int j;
+     for (j=0; j<n; j++) imap[j] = -1;
+     if ( bcf_hdr_nsamples(h0) > 0) {
+         int i = 0, end = n? 8 : 7;
+         // Find "#CHROM\t" at the beginning of a line. The search resumes
+         // after each rejected hit so that it always terminates.
+         char *p = htxt;
+         while ( p && (p = strstr(p, "#CHROM\t")) != 0 ) {
+             if ( p == htxt || *(p-1) == '\n' ) break;
+             p++;
+         }
+         // Skip `end` tab-separated columns; p is left on the tab that
+         // follows the last column to keep.
+         while ( p && (p = strchr(p, '\t')) != 0 && i < end ) ++i, ++p;
+         if ( !p || i != end ) goto fail;    // malformated header
+         kputsn(htxt, p - htxt, &str);
+         for (i = 0; i < n; ++i) {
+             if ( khash_str2int_has_key(names_hash,samples[i]) )
+             {
+                 fprintf(stderr,"[E::bcf_hdr_subset] Duplicate sample name \"%s\".\n", samples[i]);
+                 goto fail;
+             }
+             imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
+             if (imap[i] < 0) continue;
+             kputc('\t', &str);
+             kputs(samples[i], &str);
+             khash_str2int_inc(names_hash,samples[i]);
+         }
+     } else kputsn(htxt, hlen, &str);
+     while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
+     kputc('\n',&str);
+     bcf_hdr_parse(h, str.s);
+     free(str.s);
+     free(htxt);
+     khash_str2int_destroy(names_hash);
+     return h;
+
+ fail:
+     free(str.s);
+     free(htxt);
+     khash_str2int_destroy(names_hash);
+     if ( h ) bcf_hdr_destroy(h);
+     return NULL;
+}
+
+ // Restrict the header to a subset of its samples.
+ //
+ // samples  "-" keeps all samples; NULL excludes all samples; otherwise a
+ //          list understood by hts_readlist(), optionally prefixed with '^'
+ //          to mean "all samples except the listed ones"
+ // is_file  passed to hts_readlist()
+ //
+ // The selection is recorded in the hdr->keep_samples bit mask, the original
+ // sample count in hdr->nsamples_ori, and hdr->samples plus the sample
+ // dictionary are rebuilt to contain only the kept samples.
+ //
+ // Returns 0 on success, -1 when the list cannot be read, or the 1-based
+ // position in the list of the first name that is not a sample of hdr.
+ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
+{
+     if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
+
+     hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
+     if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; }    // exclude all samples
+
+     int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
+     hdr->keep_samples = (uint8_t*) calloc(narr,1);
+     if ( samples[0]=='^' )
+         for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
+
+     int idx, n, ret = 0;
+     char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
+     if ( !smpls )
+     {
+         // Nothing was selected: drop the mask again rather than leaving a
+         // half-initialised selection attached to the header.
+         free(hdr->keep_samples);
+         hdr->keep_samples = NULL;
+         return -1;
+     }
+     for (i=0; i<n; i++)
+     {
+         idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
+         if ( idx<0 )
+         {
+             if ( !ret ) ret = i+1;      // remember the first unknown name only
+             continue;
+         }
+         assert( idx<bcf_hdr_nsamples(hdr) );
+         if ( samples[0]=='^' )
+             bit_array_clear(hdr->keep_samples, idx);
+         else
+             bit_array_set(hdr->keep_samples, idx);
+     }
+     for (i=0; i<n; i++) free(smpls[i]);
+     free(smpls);
+
+     bcf_hdr_nsamples(hdr) = 0;
+     for (i=0; i<hdr->nsamples_ori; i++)
+         if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
+     if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
+     else
+     {
+         // Named `kept` so that it does not shadow the `samples` parameter.
+         char **kept = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
+         idx = 0;
+         for (i=0; i<hdr->nsamples_ori; i++)
+             if ( bit_array_test(hdr->keep_samples,i) ) kept[idx++] = strdup(hdr->samples[i]);
+         free(hdr->samples);
+         hdr->samples = kept;
+
+         // delete original samples from the dictionary
+         vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
+         int k;
+         for (k = kh_begin(d); k != kh_end(d); ++k)
+             if (kh_exist(d, k)) free((char*)kh_key(d, k));
+         kh_destroy(vdict, d);
+
+         // add the subset back
+         hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict);
+         for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+         {
+             int ignore, slot = kh_put(vdict, d, hdr->samples[i], &ignore);
+             kh_val(d, slot) = bcf_idinfo_def;
+             kh_val(d, slot).id = kh_size(d) - 1;
+         }
+         bcf_hdr_sync(hdr);
+     }
+
+     return ret;
+}
+
+ // Rewrite the packed per-sample (FORMAT) data of v so that it contains only
+ // the samples selected by imap[0..n-1]; entries with imap[j] < 0 are
+ // dropped. With n == 0, or when nothing is selected, the record ends up
+ // with no samples and no FORMAT fields.
+ //
+ // The old v->indiv buffer is freed and replaced, and the BCF_UN_FMT bit is
+ // cleared from v->unpacked. Always returns 0.
+ int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
+{
+     kstring_t packed = {0,0,0};
+     int n_kept = 0;
+
+     if ( n )
+     {
+         int i, j;
+         bcf_fmt_t f;
+         uint8_t *ptr = (uint8_t*)v->indiv.s;
+         // Decode one FORMAT field at a time and immediately re-encode it
+         // with only the selected samples.
+         for (i = 0; i < (int)v->n_fmt; ++i)
+         {
+             ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &f);
+             bcf_enc_int1(&packed, f.id);
+             bcf_enc_size(&packed, f.n, f.type);
+             for (j = 0; j < n; ++j)
+             {
+                 if ( imap[j] < 0 ) continue;
+                 kputsn((char*)(f.p + imap[j] * f.size), f.size, &packed);
+             }
+         }
+         for (j = 0; j < n; ++j)
+             if ( imap[j] >= 0 ) ++n_kept;
+     }
+
+     v->n_sample = n_kept;
+     if ( !v->n_sample ) v->n_fmt = 0;
+     free(v->indiv.s);
+     v->indiv = packed;
+     v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
+     return 0;
+}
+
+ // Returns 1 when every allele of v is either a single character or one of
+ // the symbolic alleles starting with "<X>" or "<*>", 0 otherwise.
+ // The record's string fields are unpacked first.
+ int bcf_is_snp(bcf1_t *v)
+{
+     int i;
+     bcf_unpack(v, BCF_UN_STR);
+     for (i = 0; i < v->n_allele; ++i)
+     {
+         const char *al = v->d.allele[i];
+         if ( al[1]==0 ) continue;       // one character long
+
+         // mpileup's <X> allele (and <*>). This is not completely satisfactory,
+         // a general library is here narrowly tailored to fit samtools.
+         if ( al[0]=='<' && al[2]=='>' && (al[1]=='X' || al[1]=='*') ) continue;
+
+         return 0;
+     }
+     return 1;
+}
+
+ /* Classify one ALT allele against REF and store the result in *var.
+  *
+  * var->type is set to one of VCF_REF, VCF_SNP, VCF_INDEL, VCF_MNP or
+  * VCF_OTHER; var->n receives a signed size for the event (0 for VCF_REF,
+  * 1 for VCF_SNP, ALT length minus REF length for the simple indel cases).
+  * Note that the generic symbolic-allele path ("<...>" other than <X>/<*>)
+  * sets only var->type and leaves var->n untouched.
+  * Bases are compared case-insensitively.
+  */ static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var)
+{
+ // The most frequent case
+ if ( !ref[1] && !alt[1] ) // both alleles are at most one character long
+ {
+ if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
+ if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
+ if ( *alt == '*' ) { var->n = 0; var->type = VCF_REF; return; }
+ var->n = 1; var->type = VCF_SNP; return;
+ }
+ if ( alt[0]=='<' ) // symbolic allele
+ {
+ if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
+ if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
+ var->type = VCF_OTHER;
+ return;
+ }
+
+ const char *r = ref, *a = alt;
+ while (*r && *a && toupper(*r)==toupper(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed
+
+ if ( *a && !*r ) // REF is a proper prefix of ALT
+ {
+ while ( *a ) a++;
+ var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; // n = strlen(alt) - strlen(ref), positive here
+ }
+ else if ( *r && !*a ) // ALT is a proper prefix of REF
+ {
+ while ( *r ) r++;
+ var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; // n = strlen(alt) - strlen(ref), negative here
+ }
+ else if ( !*r && !*a ) // alleles are identical apart from case
+ {
+ var->n = 0; var->type = VCF_REF; return;
+ }
+
+ const char *re = r, *ae = a; // r and a sit on the first mismatch; re/ae will mark the last mismatching character
+ while ( re[1] ) re++; // move to the last character of REF
+ while ( ae[1] ) ae++; // move to the last character of ALT
+ while ( re>r && ae>a && toupper(*re)==toupper(*ae) ) { re--; ae--; } // walk back over the shared suffix
+ if ( ae==a ) // the differing part of ALT is a single character
+ {
+ if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } // and so is REF's: one substituted base
+ var->n = -(re-r);
+ if ( toupper(*re)==toupper(*ae) ) { var->type = VCF_INDEL; return; }
+ var->type = VCF_OTHER; return;
+ }
+ else if ( re==r ) // the differing part of REF is a single character, ALT's is longer
+ {
+ var->n = ae-a;
+ if ( toupper(*re)==toupper(*ae) ) { var->type = VCF_INDEL; return; }
+ var->type = VCF_OTHER; return;
+ }
+
+ var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER; // equally long differing parts are reported as MNP
+ var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1; // length of the longer differing part, negative when REF's is longer
+
+ // should do also complex events, SVs, etc...
+}
+
+ // Compute the variant type of every allele of b and cache the results.
+ //
+ // b->d.var[i] receives the classification of allele i against REF
+ // (allele 0), and b->d.var_type the bitwise OR of all ALT types. The
+ // b->d.var array is grown to n_allele entries when needed and the record's
+ // string fields are unpacked first if that has not happened yet.
+ static void bcf_set_variant_types(bcf1_t *b)
+{
+     if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
+     bcf_dec_t *d = &b->d;
+     if ( d->n_var < b->n_allele )
+     {
+         d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele);
+         d->n_var = b->n_allele;
+     }
+     int i;
+     b->d.var_type = 0;
+     // The loop below starts at the first ALT, so give the REF slot a defined
+     // value as well; bcf_get_variant_type(rec, 0) reads it.
+     if ( b->n_allele > 0 )
+     {
+         d->var[0].type = VCF_REF;
+         d->var[0].n    = 0;
+     }
+     for (i=1; i<b->n_allele; i++)
+     {
+         bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
+         b->d.var_type |= d->var[i].type;
+     }
+}
+
+ // Return the combined variant type bit mask of rec, computing and caching
+ // it first when the cached value is still -1.
+ int bcf_get_variant_types(bcf1_t *rec)
+{
+     bcf_dec_t *dec = &rec->d;
+     if ( dec->var_type == -1 )
+         bcf_set_variant_types(rec);
+     return dec->var_type;
+}
+ // Return the variant type of allele number ith_allele of rec, computing
+ // the per-allele types first when the cached summary is still -1.
+ // ith_allele is not range-checked.
+ int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
+{
+     int need_compute = ( rec->d.var_type == -1 );
+     if ( need_compute )
+         bcf_set_variant_types(rec);
+     // Read d.var only now: computing the types may reallocate the array.
+     return rec->d.var[ith_allele].type;
+}
+
+ // Set, replace or remove the INFO tag `key` of `line`.
+ //
+ // hdr     header that must define `key` as an INFO field
+ // values  new values: int32_t[] for BCF_HT_INT, float[] for BCF_HT_REAL,
+ //         a NUL-terminated string for BCF_HT_STR, unused for BCF_HT_FLAG
+ // n       number of values; 0 (or a NULL string) removes the tag
+ // type    one of the BCF_HT_* constants
+ //
+ // Removal only marks the entry (vptr set to NULL). When the tag is "END",
+ // line->rlen is kept in step: END - pos when a single value is set, the
+ // REF allele length when the tag is removed with n == 0.
+ //
+ // Returns 0 on success and -1 when the header has no such INFO field.
+ // Aborts on an unsupported type.
+ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
+{
+     // Is the field already present?
+     int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
+     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
+     if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
+
+     for (i=0; i<line->n_info; i++)
+         if ( inf_id==line->d.info[i].key ) break;
+     bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
+
+     if ( !n || (type==BCF_HT_STR && !values) )
+     {
+         if ( n==0 && !strcmp("END",key) )
+             line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
+         if ( inf )
+         {
+             // Mark the tag for removal, free existing memory if necessary
+             if ( inf->vptr_free )
+             {
+                 free(inf->vptr - inf->vptr_off);
+                 inf->vptr_free = 0;
+             }
+             line->d.shared_dirty |= BCF1_DIRTY_INF;
+             inf->vptr = NULL;
+         }
+         return 0;
+     }
+
+     // Encode the values and determine the size required to accommodate the values
+     kstring_t str = {0,0,0};
+     bcf_enc_int1(&str, inf_id);
+     if ( type==BCF_HT_INT )
+         bcf_enc_vint(&str, n, (int32_t*)values, -1);
+     else if ( type==BCF_HT_REAL )
+         bcf_enc_vfloat(&str, n, (float*)values);
+     else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
+     {
+         if ( values==NULL )
+             bcf_enc_size(&str, 0, BCF_BT_NULL);
+         else
+             bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
+     }
+     else
+     {
+         fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
+         abort();
+     }
+
+     // Is the INFO tag already present
+     if ( inf )
+     {
+         // Is it big enough to accommodate new block? A tag that was marked
+         // for removal has vptr==NULL and therefore no block to reuse.
+         if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
+         {
+             if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
+             uint8_t *ptr = inf->vptr - inf->vptr_off;
+             memcpy(ptr, str.s, str.l);
+             free(str.s);
+             int vptr_free = inf->vptr_free;
+             bcf_unpack_info_core1(ptr, inf);
+             inf->vptr_free = vptr_free;
+         }
+         else
+         {
+             // The tag may already own a block from an earlier update;
+             // release it before taking ownership of the new one.
+             if ( inf->vptr && inf->vptr_free ) free(inf->vptr - inf->vptr_off);
+             bcf_unpack_info_core1((uint8_t*)str.s, inf);
+             inf->vptr_free = 1;
+             line->d.shared_dirty |= BCF1_DIRTY_INF;
+         }
+     }
+     else
+     {
+         // The tag is not present, create new one
+         line->n_info++;
+         hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
+         inf = &line->d.info[line->n_info-1];
+         bcf_unpack_info_core1((uint8_t*)str.s, inf);
+         inf->vptr_free = 1;
+         line->d.shared_dirty |= BCF1_DIRTY_INF;
+     }
+     line->unpacked |= BCF_UN_INFO;
+
+     if ( n==1 && !strcmp("END",key) ) line->rlen = ((int32_t*)values)[0] - line->pos;
+     return 0;
+}
+
+ // Set a string-valued FORMAT tag from one C string per entry of values[].
+ //
+ // The strings are packed into a single block of n fixed-width cells, each
+ // as wide as the longest string and padded with zero bytes, which is then
+ // handed to bcf_update_format() as BCF_HT_STR data. n == 0 removes the tag.
+ //
+ // Returns -2 when the temporary block cannot be allocated, otherwise the
+ // return value of bcf_update_format().
+ int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
+{
+     if ( !n )
+         return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
+
+     // Cell width = length of the longest input string
+     int i, width = 0;
+     for (i=0; i<n; i++)
+     {
+         int len = strlen(values[i]);
+         if ( width < len ) width = len;
+     }
+
+     char *packed = (char*) malloc(width*n);
+     if ( !packed ) return -2;
+     for (i=0; i<n; i++)
+     {
+         char *cell = packed + i*width;
+         size_t len = strlen(values[i]);
+         memcpy(cell, values[i], len);
+         memset(cell+len, 0, width-len);     // zero padding, no terminator when len==width
+     }
+
+     int ret = bcf_update_format(hdr,line,key,packed,width*n,BCF_HT_STR);
+     free(packed);
+     return ret;
+}
+
+ // Set, replace or remove the FORMAT tag `key` of `line`.
+ //
+ // hdr     header that must define `key` as a FORMAT field
+ // values  new values for all samples, laid out sample after sample:
+ //         int32_t[] for BCF_HT_INT, float[] for BCF_HT_REAL, char[] for
+ //         BCF_HT_STR
+ // n       total number of values; must be a non-zero multiple of the
+ //         header's sample count (asserted). 0 removes the tag.
+ // type    one of the BCF_HT_* constants
+ //
+ // Removal only marks the entry (p set to NULL). line->n_sample is set from
+ // the header. A newly added "GT" tag is placed first among the fields.
+ //
+ // Returns 0 on success (including removal of a tag the header does not
+ // define) and -1 when values are supplied for a tag that is not a FORMAT
+ // field of hdr. Aborts on an unsupported type.
+ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
+{
+     // Is the field already present?
+     int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
+     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
+     {
+         if ( !n ) return 0;
+         return -1;  // the key not present in the header
+     }
+
+     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+     for (i=0; i<line->n_fmt; i++)
+         if ( line->d.fmt[i].id==fmt_id ) break;
+     bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
+
+     if ( !n )
+     {
+         if ( fmt )
+         {
+             // Mark the tag for removal, free existing memory if necessary
+             if ( fmt->p_free )
+             {
+                 free(fmt->p - fmt->p_off);
+                 fmt->p_free = 0;
+             }
+             line->d.indiv_dirty = 1;
+             fmt->p = NULL;
+         }
+         return 0;
+     }
+
+     line->n_sample = bcf_hdr_nsamples(hdr);
+     int nps = n / line->n_sample;  // number of values per sample
+     assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
+
+     // Encode the values and determine the size required to accommodate the values
+     kstring_t str = {0,0,0};
+     bcf_enc_int1(&str, fmt_id);
+     if ( type==BCF_HT_INT )
+         bcf_enc_vint(&str, n, (int32_t*)values, nps);
+     else if ( type==BCF_HT_REAL )
+     {
+         bcf_enc_size(&str, nps, BCF_BT_FLOAT);
+         kputsn((char*)values, nps*line->n_sample*sizeof(float), &str);
+     }
+     else if ( type==BCF_HT_STR )
+     {
+         bcf_enc_size(&str, nps, BCF_BT_CHAR);
+         kputsn((char*)values, nps*line->n_sample, &str);
+     }
+     else
+     {
+         fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
+         abort();
+     }
+
+     if ( !fmt )
+     {
+         // Not present, new format field
+         line->n_fmt++;
+         hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
+
+         // Special case: VCF specification requires that GT is always first
+         if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
+         {
+             for (i=line->n_fmt-1; i>0; i--)
+                 line->d.fmt[i] = line->d.fmt[i-1];
+             fmt = &line->d.fmt[0];
+         }
+         else
+             fmt = &line->d.fmt[line->n_fmt-1];
+         bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
+         line->d.indiv_dirty = 1;
+         fmt->p_free = 1;
+     }
+     else
+     {
+         // The tag is already present, check if it is big enough to accommodate
+         // the new block. A tag that was marked for removal has p==NULL and
+         // therefore no block to reuse.
+         if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
+         {
+             // good, the block is big enough
+             if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
+             uint8_t *ptr = fmt->p - fmt->p_off;
+             memcpy(ptr, str.s, str.l);
+             free(str.s);
+             int p_free = fmt->p_free;
+             bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
+             fmt->p_free = p_free;
+         }
+         else
+         {
+             // The tag may already own a block from an earlier update;
+             // release it before taking ownership of the new one.
+             if ( fmt->p && fmt->p_free ) free(fmt->p - fmt->p_off);
+             bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
+             fmt->p_free = 1;
+             line->d.indiv_dirty = 1;
+         }
+     }
+     line->unpacked |= BCF_UN_FMT;
+     return 0;
+}
+
+
+ // Replace the FILTER column of line with the n filter ids in flt_ids.
+ // n == 0 leaves the record with no filters. The FILTER field is marked
+ // dirty in every case. Always returns 0.
+ int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
+{
+     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+
+     bcf_dec_t *dec = &line->d;
+     dec->shared_dirty |= BCF1_DIRTY_FLT;
+     dec->n_flt = n;
+     if ( n )
+     {
+         hts_expand(int, dec->n_flt, dec->m_flt, dec->flt);
+         int *out = dec->flt;
+         const int *in = flt_ids;
+         int left = n;
+         while ( left-- > 0 ) *out++ = *in++;
+     }
+     return 0;
+}
+
+ // Add filter flt_id to line.
+ //
+ // Id 0 (PASS) replaces all existing filters, and any other id replaces a
+ // lone PASS; otherwise the id is appended.
+ // Returns 0 when the filter was already set, 1 when the record changed.
+ int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
+{
+     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+
+     bcf_dec_t *dec = &line->d;
+     int i;
+     for (i=0; i<dec->n_flt; i++)
+         if ( dec->flt[i]==flt_id ) return 0;    // this filter is already set
+
+     dec->shared_dirty |= BCF1_DIRTY_FLT;
+
+     // Either the new id becomes the only filter, or it is appended.
+     int becomes_only = ( flt_id==0 )                            // set to PASS
+                     || ( dec->n_flt==1 && dec->flt[0]==0 );     // overwrite a lone PASS
+     if ( becomes_only )
+         dec->n_flt = 1;
+     else
+         dec->n_flt++;
+
+     hts_expand(int, dec->n_flt, dec->m_flt, dec->flt);
+     dec->flt[dec->n_flt-1] = flt_id;
+     return 1;
+}
+ // Remove filter flt_id from line, if present.
+ // When the last filter is removed and `pass` is non-zero, PASS (id 0) is
+ // set instead. Always returns 0.
+ int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
+{
+     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+
+     int *flt = line->d.flt;
+     int n = line->d.n_flt, i = 0;
+     while ( i<n && flt[i]!=flt_id ) i++;
+     if ( i==n ) return 0;   // the filter is not present
+
+     line->d.shared_dirty |= BCF1_DIRTY_FLT;
+
+     // Close the gap unless the removed entry was the last one
+     int ntail = n - i - 1;
+     if ( ntail ) memmove(flt+i, flt+i+1, ntail*sizeof(*line->d.flt));
+     line->d.n_flt = n - 1;
+
+     if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
+     return 0;
+}
+
+ // Test whether the named filter is set on line. "." is treated as "PASS".
+ //
+ // Returns -1 when the filter is not defined in the header, 1 when it is
+ // set (a record without any filter counts as having filter id 0), and 0
+ // otherwise.
+ int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
+{
+     const char *name = ( filter[0]=='.' && !filter[1] ) ? "PASS" : filter;
+     int id = bcf_hdr_id2int(hdr, BCF_DT_ID, name);
+     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
+
+     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
+     if ( id==0 && !line->d.n_flt) return 1;  // PASS
+
+     int i, found = 0;
+     for (i=0; i<line->d.n_flt && !found; i++)
+         found = ( line->d.flt[i]==id );
+     return found;
+}
+
+ // Rebuild the per-allele pointer array after line->d.als was rewritten.
+ //
+ // line->d.als must hold nals consecutive NUL-terminated allele strings.
+ // Sets n_allele, points d.allele[i] at the i-th string, marks the alleles
+ // dirty and refreshes line->rlen: END - pos when the record carries a live
+ // END INFO tag, otherwise the length of the REF allele (0 when there are
+ // no alleles). Always returns 0.
+ static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
+{
+     line->d.shared_dirty |= BCF1_DIRTY_ALS;
+
+     line->n_allele = nals;
+     hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
+
+     char *als = line->d.als;
+     int n = 0;
+     while (n<nals)
+     {
+         line->d.allele[n] = als;
+         while ( *als ) als++;
+         als++;
+         n++;
+     }
+
+     // Update REF length. rlen is a length, not a coordinate, so the END
+     // value has to be taken relative to pos, exactly as bcf_update_info()
+     // does. An END tag marked for removal (vptr==NULL) is ignored.
+     bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
+     if ( end_info && end_info->vptr )
+         line->rlen = end_info->v1.i - line->pos;
+     else
+         line->rlen = nals>0 ? strlen(line->d.allele[0]) : 0;
+
+     return 0;
+}
+ // Replace the alleles of line with the nals strings in alleles[].
+ //
+ // The existing allele block is reused as the destination unless one of the
+ // supplied pointers lies inside it; in that case a new block is built
+ // first and the old one freed afterwards.
+ // Returns the result of _bcf1_sync_alleles().
+ int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
+{
+     kstring_t buf = {0,0,0};
+     char *old_block = line->d.als;      // freed at the end unless it is reused
+
+     int i, aliased = 0;
+     for (i=0; i<nals && !aliased; i++)
+         aliased = ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als );
+
+     if ( !aliased )
+     {
+         // all alleles point elsewhere, reuse the existing block
+         buf.s = line->d.als;
+         buf.m = line->d.m_als;
+         old_block = NULL;
+     }
+
+     for (i=0; i<nals; i++)
+     {
+         kputs(alleles[i], &buf);
+         kputc(0, &buf);
+     }
+     line->d.als = buf.s;
+     line->d.m_als = buf.m;
+     free(old_block);
+     return _bcf1_sync_alleles(hdr,line,nals);
+}
+
+ // Replace the alleles of line from one comma-separated string, e.g.
+ // "REF,ALT1,ALT2". The string is copied into the record's allele block and
+ // every comma is turned into a NUL terminator.
+ // Returns the result of _bcf1_sync_alleles().
+ int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
+{
+     kstring_t buf;
+     buf.l = 0;
+     buf.s = line->d.als;
+     buf.m = line->d.m_als;
+     kputs(alleles_string, &buf);
+     line->d.als = buf.s;
+     line->d.m_als = buf.m;
+
+     // Split in place, counting the alleles as we go
+     int nals = 1;
+     char *c;
+     for (c = line->d.als; *c; c++)
+     {
+         if ( *c!=',' ) continue;
+         *c = 0;
+         nals++;
+     }
+     return _bcf1_sync_alleles(hdr, line, nals);
+}
+
+ // Set the ID column of line to id, or to "." when id is NULL. The record's
+ // existing id buffer is reused and the ID field is marked dirty.
+ // Always returns 0.
+ int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+{
+     kstring_t buf;
+     buf.l = 0;
+     buf.s = line->d.id;
+     buf.m = line->d.m_id;
+
+     kputs(id ? id : ".", &buf);
+
+     line->d.id = buf.s;
+     line->d.m_id = buf.m;
+     line->d.shared_dirty |= BCF1_DIRTY_ID;
+     return 0;
+}
+
+ // Append id to the semicolon-separated ID column of line unless it is
+ // already one of the listed ids. An ID column equal to "." is replaced
+ // rather than extended. A NULL id is a no-op.
+ // Always returns 0.
+ int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+{
+     if ( !id ) return 0;
+
+     kstring_t tmp;
+     tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
+
+     int len = strlen(id);
+     char *dst = line->d.id;
+     // line->d.id may be NULL (the code below already allows for that), so
+     // test the pointer before dereferencing it.
+     while ( dst && *dst && (dst=strstr(dst,id)) )
+     {
+         if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
+         else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
+         dst++;  // a suffix, not a match
+     }
+     if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
+     {
+         tmp.l = strlen(line->d.id);
+         kputc(';',&tmp);
+     }
+     kputs(id,&tmp);
+
+     line->d.id = tmp.s; line->d.m_id = tmp.m;
+     line->d.shared_dirty |= BCF1_DIRTY_ID;
+     return 0;
+
+}
+
+ // Find the FORMAT field named key in line.
+ // Returns NULL when the header does not define key as a FORMAT field or
+ // the record does not carry it.
+ bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+{
+     int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
+     if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) )
+         return bcf_get_fmt_id(line, id);
+     return NULL;    // no such FMT field in the header
+}
+
+ // Find the INFO field named key in line.
+ // Returns NULL when the header does not define key as an INFO field or
+ // the record does not carry it.
+ bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+{
+     int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
+     if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) )
+         return bcf_get_info_id(line, id);
+     return NULL;    // no such INFO field in the header
+}
+
+ // Find the FORMAT field of line whose numeric key equals id, unpacking the
+ // FORMAT fields first if necessary. Returns NULL when there is none.
+ bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
+{
+     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+     int i = 0;
+     while ( i < line->n_fmt )
+     {
+         bcf_fmt_t *f = &line->d.fmt[i++];
+         if ( f->id==id ) return f;
+     }
+     return NULL;
+}
+
+ // Find the INFO field of line whose numeric key equals id, unpacking the
+ // INFO fields first if necessary. Returns NULL when there is none.
+ bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
+{
+     if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
+
+     int i = 0;
+     while ( i < line->n_info )
+     {
+         bcf_info_t *inf = &line->d.info[i++];
+         if ( inf->key==id ) return inf;
+     }
+     return NULL;
+}
+
+
+int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+{ // Copy the values of INFO field `tag` into *dst, growing it as needed (*ndst tracks its capacity); returns the number of values or a negative error code.
+ int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
+ if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
+ if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type
+
+ if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
+
+ for (i=0; i<line->n_info; i++)
+ if ( line->d.info[i].key==tag_id ) break;
+ if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
+ if ( type==BCF_HT_FLAG ) return 1; // flags carry no value: 1 means set, *dst is not touched
+
+ bcf_info_t *info = &line->d.info[i];
+ if ( !info->vptr ) return -3; // the tag was marked for removal
+ if ( type==BCF_HT_STR )
+ {
+ if ( *ndst < info->len+1 )
+ {
+ void *buf = realloc(*dst, info->len+1); if ( !buf ) return -4; // could not alloc; *dst and *ndst are left untouched
+ *dst = buf; *ndst = info->len + 1;
+ }
+ memcpy(*dst,info->vptr,info->len);
+ ((uint8_t*)*dst)[info->len] = 0; // the copy is NUL-terminated, the returned length excludes the NUL
+ return info->len;
+ }
+
+ // Make sure the buffer is big enough
+ int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
+ if ( *ndst < info->len )
+ {
+ void *buf = realloc(*dst, info->len * size1); if ( !buf ) return -4; // could not alloc; *dst and *ndst are left untouched
+ *dst = buf; *ndst = info->len; // for numeric types *ndst counts elements, not bytes
+ }
+
+ if ( info->len == 1 ) // single value: read it from info->v1 rather than from vptr
+ {
+ if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f;
+ else
+ {
+ #define BRANCH(type_t, missing) { \
+ if ( info->v1.i==missing ) *((int32_t*)*dst) = bcf_int32_missing; \
+ else *((int32_t*)*dst) = info->v1.i; \
+ }
+ switch (info->type)
+ {
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing ); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing); break;
+ }
+ #undef BRANCH
+ }
+ return 1;
+ }
+
+ #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \
+ out_type_t *tmp = (out_type_t *) *dst; \
+ type_t *p = (type_t *) info->vptr; \
+ for (j=0; j<info->len; j++) \
+ { \
+ if ( is_vector_end ) return j; \
+ if ( is_missing ) set_missing; \
+ else *tmp = p[j]; \
+ tmp++; \
+ } \
+ return j; \
+ }
+ switch (info->type) { // widen the stored type to int32_t/float, translating the per-width missing marker
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, int32_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, int32_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, int32_t); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break;
+ default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ }
+ #undef BRANCH
+ return -4; // this can never happen
+}
+
+int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
+{ // Copy the per-sample values of string FORMAT field `tag` into one buffer, NUL-terminating each; returns the bytes used or a negative error code.
+ int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
+ if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
+ if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
+
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==tag_id ) break;
+ if ( i==line->n_fmt ) return -3; // the tag is not present in this record
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ if ( !fmt->p ) return -3; // the tag was marked for removal
+
+ int nsmpl = bcf_hdr_nsamples(hdr);
+ if ( !*dst ) // first call: allocate the table of per-sample pointers
+ {
+ *dst = (char**) malloc(sizeof(char*)*nsmpl); // NOTE(review): with nsmpl==0 the store to (*dst)[0] below has no room - confirm callers never hit this
+ if ( !*dst ) return -4; // could not alloc
+ (*dst)[0] = NULL;
+ }
+ int n = (fmt->n+1)*nsmpl; // fmt->n bytes plus one terminating NUL per sample
+ if ( *ndst < n )
+ {
+ (*dst)[0] = realloc((*dst)[0], n); // (*dst)[0] owns the single backing buffer; NOTE(review): the old buffer is lost if realloc fails
+ if ( !(*dst)[0] ) return -4; // could not alloc
+ *ndst = n;
+ }
+ for (i=0; i<nsmpl; i++)
+ {
+ uint8_t *src = fmt->p + i*fmt->n;
+ uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
+ memcpy(tmp,src,fmt->n);
+ tmp[fmt->n] = 0;
+ (*dst)[i] = (char*) tmp; // every entry points into the buffer held by (*dst)[0]
+ }
+ return n;
+}
+
+int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+{ // Copy the values of FORMAT field `tag` for all samples into *dst, growing it as needed; returns the number of values written or a negative error code.
+ int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
+ if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
+ if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
+ {
+ // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
+ if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
+ }
+ else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
+
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==tag_id ) break;
+ if ( i==line->n_fmt ) return -3; // the tag is not present in this record
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ if ( !fmt->p ) return -3; // the tag was marked for removal
+
+ if ( type==BCF_HT_STR ) // strings are copied verbatim: fmt->n bytes per sample, no terminators added
+ {
+ int n = fmt->n*bcf_hdr_nsamples(hdr);
+ if ( *ndst < n )
+ {
+ *dst = realloc(*dst, n);
+ if ( !*dst ) return -4; // could not alloc
+ *ndst = n;
+ }
+ memcpy(*dst,fmt->p,n);
+ return n;
+ }
+
+ // Make sure the buffer is big enough
+ int nsmpl = bcf_hdr_nsamples(hdr);
+ int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
+ if ( *ndst < fmt->n*nsmpl )
+ {
+ *ndst = fmt->n*nsmpl; // for numeric types *ndst counts elements, not bytes
+ *dst = realloc(*dst, *ndst*size1);
+ if ( !*dst ) return -4; // could not alloc (the check must look at the buffer, not at the never-NULL `dst`)
+ }
+
+ #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
+ out_type_t *tmp = (out_type_t *) *dst; \
+ type_t *p = (type_t*) fmt->p; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ for (j=0; j<fmt->n; j++) \
+ { \
+ if ( is_missing ) set_missing; \
+ else if ( is_vector_end ) { set_vector_end; break; } \
+ else *tmp = p[j]; \
+ tmp++; \
+ } \
+ for (; j<fmt->n; j++) { set_vector_end; tmp++; } /* pad the rest of a short vector with vector_end */ \
+ p = (type_t *)((char *)p + fmt->size); \
+ } \
+ }
+ switch (fmt->type) { // widen the stored type to int32_t/float, translating the per-width missing and vector_end markers
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
+ default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
+ }
+ #undef BRANCH
+ return nsmpl*fmt->n;
+}
+
diff --git a/htslib/vcf_sweep.c b/htslib/vcf_sweep.c
new file mode 100644
index 0000000..38e384f
--- /dev/null
+++ b/htslib/vcf_sweep.c
@@ -0,0 +1,183 @@
+/* vcf_sweep.c -- forward/reverse sweep API.
+
+ Copyright (C) 2013 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include "htslib/vcf_sweep.h"
+#include "htslib/bgzf.h"
+
+#define SW_FWD 0
+#define SW_BWD 1
+
+struct _bcf_sweep_t
+{
+ htsFile *file; // handle returned by hts_open() in bcf_sweep_init()
+ bcf_hdr_t *hdr; // header read from `file` by bcf_hdr_read()
+ BGZF *fp; // obtained from `file` via hts_get_bgzfp()
+
+ int direction; // to tell if the direction has changed
+ int block_size; // the size of uncompressed data to hold in memory
+ bcf1_t *rec; // bcf buffer
+ int nrec, mrec; // number of used records; total size of the buffer
+ int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record
+ char *lals; // copy of the allele bytes of the record last given to sw_rec_save(); mlals is its capacity
+
+ uint64_t *idx; // uncompressed offsets of VCF/BCF records
+ int iidx, nidx, midx; // i: current offset; n: used; m: allocated
+ int idx_done; // the index is built during the first pass
+};
+
+BGZF *hts_get_bgzfp(htsFile *fp);
+int hts_useek(htsFile *file, long uoffset, int where);
+long hts_utell(htsFile *file);
+
+static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec)
+{
+    // Cheap scalar comparisons first: contig id, position and number of alleles
+    if ( sw->lrid!=rec->rid || sw->lpos!=rec->pos || sw->lnals!=rec->n_allele ) return 0;
+
+    // Number of bytes from the start of the first allele through the terminating
+    // NUL of the last allele; this is the span that sw_rec_save() stored in sw->lals
+    char *last = rec->d.allele[sw->lnals-1];
+    int nbytes = (last - rec->d.allele[0]) + (int)strlen(last) + 1;
+
+    if ( nbytes!=sw->lals_len ) return 0;
+    return memcmp(sw->lals,rec->d.allele[0],nbytes)==0;
+}
+
+static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec)
+{
+    // Remember what identifies this record so that sw_rec_equal() can recognise it later
+    sw->lnals = rec->n_allele;
+    sw->lpos  = rec->pos;
+    sw->lrid  = rec->rid;
+
+    // Span of the allele bytes: start of the first allele through the NUL of the last one
+    char *last = rec->d.allele[sw->lnals-1];
+    sw->lals_len = (last - rec->d.allele[0]) + (int)strlen(last) + 1;
+    hts_expand(char, sw->lals_len, sw->mlals, sw->lals);
+    memcpy(sw->lals, rec->d.allele[0], sw->lals_len);
+}
+
+static void sw_fill_buffer(bcf_sweep_t *sw)
+{ // Load the indexed block that precedes the current index position into sw->rec, for the backward sweep.
+ if ( !sw->iidx ) return; // already at the first indexed block, nothing earlier to load
+ sw->iidx--;
+
+ int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
+ assert( ret==0 ); // NOTE(review): a failed seek is only caught when assertions are enabled
+
+ sw->nrec = 0;
+ bcf1_t *rec = &sw->rec[sw->nrec];
+ while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
+ {
+ bcf_unpack(rec, BCF_UN_STR);
+
+ // if not in the last block, stop at the saved record
+ if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;
+
+ sw->nrec++;
+ hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec); // keep one free slot for the next read (presumably reallocates sw->rec when mrec is too small)
+ rec = &sw->rec[sw->nrec]; // take the slot address again, sw->rec may have changed
+ }
+ sw_rec_save(sw, &sw->rec[0]); // the first record of this block marks where the preceding block must stop
+}
+
+bcf_sweep_t *bcf_sweep_init(const char *fname)
+{ // Open `fname`, read its header and return a sweep handle positioned for a forward pass; release it with bcf_sweep_destroy().
+ bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t)); // NOTE(review): result is dereferenced without a NULL check
+ sw->file = hts_open(fname, "r"); // NOTE(review): a failed open is not detected before the handle is used below
+ sw->fp = hts_get_bgzfp(sw->file);
+ bgzf_index_build_init(sw->fp);
+ sw->hdr = bcf_hdr_read(sw->file);
+ sw->mrec = 1; // start with a one-record buffer; sw_fill_buffer() grows it
+ sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
+ sw->block_size = 1024*1024*3; // bcf_sweep_fwd() adds an index entry once the offset has advanced by more than this
+ sw->direction = SW_FWD;
+ return sw;
+}
+
+void bcf_sweep_destroy(bcf_sweep_t *sw)
+{
+    int k = 0;   // tear down everything owned by the handle, then the handle itself
+    while ( k<sw->mrec ) bcf_empty1(&sw->rec[k++]);   // empty every allocated slot, used or not
+    free(sw->lals);
+    free(sw->rec);
+    free(sw->idx);
+    bcf_hdr_destroy(sw->hdr);
+    hts_close(sw->file);
+    free(sw);
+}
+
+static void sw_seek(bcf_sweep_t *sw, int direction)
+{
+    sw->direction = direction;
+    if ( direction!=SW_FWD )
+    {
+        sw->nrec = 0;          // backward: the record buffer starts out empty ...
+        sw->iidx = sw->nidx;   // ... and sw_fill_buffer() steps down from past the last index entry
+        return;
+    }
+    hts_useek(sw->file, sw->idx[0], 0);   // forward: go back to the first indexed offset
+}
+
+bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw)
+{ // Return the next record of the forward pass, or NULL once reading fails (end of data); the first pass also builds the offset index.
+ if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD);
+
+ long pos = hts_utell(sw->file); // offset of the record about to be read, taken before the read moves the file position
+
+ bcf1_t *rec = &sw->rec[0]; // forward reads always reuse the first buffer slot
+ int ret = bcf_read1(sw->file, sw->hdr, rec);
+
+ if ( ret!=0 ) // last record, get ready for sweeping backwards
+ {
+ sw->idx_done = 1;
+ sw->fp->idx_build_otf = 0;
+ sw_seek(sw, SW_BWD);
+ return NULL;
+ }
+
+ if ( !sw->idx_done )
+ {
+ if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size ) // first record, or more than block_size past the last indexed offset
+ {
+ sw->nidx++;
+ hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
+ sw->idx[sw->nidx-1] = pos;
+ }
+ }
+ return rec;
+}
+
+bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw)
+{
+    if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD);   // direction changed since the previous call
+    if ( sw->nrec==0 ) sw_fill_buffer(sw);              // buffer drained: load the preceding block
+    if ( sw->nrec!=0 ) { sw->nrec--; return &sw->rec[sw->nrec]; }   // hand out buffered records last-to-first
+    return NULL;                                        // nothing could be loaded: the sweep is finished
+}
+
+bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; } // accessor: the header stays owned by the sweep handle (bcf_sweep_destroy() frees it)
+
diff --git a/htslib/vcfutils.c b/htslib/vcfutils.c
new file mode 100644
index 0000000..141fe0e
--- /dev/null
+++ b/htslib/vcfutils.c
@@ -0,0 +1,691 @@
+/* vcfutils.c -- allele-related utility functions.
+
+ Copyright (C) 2012-2015 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include "htslib/vcfutils.h"
+#include "htslib/kbitset.h"
+
+int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
+{ // Fill ac[0..n_allele-1] with allele counts taken from INFO/AN,AC and/or FORMAT/GT as selected by `which`; returns 1 if counts were obtained, 0 otherwise.
+ int i;
+ for (i=0; i<line->n_allele; i++) ac[i]=0;
+
+ // Use INFO/AC,AN field only when asked
+ if ( which&BCF_UN_INFO )
+ {
+ bcf_unpack(line, BCF_UN_INFO);
+ int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
+ int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
+ int i, an=-1, ac_len=0, ac_type=0;
+ uint8_t *ac_ptr=NULL;
+ if ( an_id>=0 && ac_id>=0 )
+ {
+ for (i=0; i<line->n_info; i++)
+ {
+ bcf_info_t *z = &line->d.info[i];
+ if ( z->key == an_id ) an = z->v1.i;
+ else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
+ }
+ }
+ if ( an>=0 && ac_ptr && ac_len<line->n_allele ) // an AC with more values than ALT alleles would overflow ac[], so it is ignored
+ {
+ int nac = 0;
+ #define BRANCH_INT(type_t) { \
+ type_t *p = (type_t *) ac_ptr; \
+ for (i=0; i<ac_len; i++) \
+ { \
+ ac[i+1] = p[i]; /* AC lists ALT alleles only, hence the +1 */ \
+ nac += p[i]; \
+ } \
+ }
+ switch (ac_type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t); break;
+ default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
+ }
+ #undef BRANCH_INT
+ if ( an<nac )
+ {
+ fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+ exit(1);
+ }
+ ac[0] = an - nac; // the reference count is whatever AN leaves after the ALT counts
+ return 1;
+ }
+ }
+
+ // Split genotype fields only when asked
+ if ( which&BCF_UN_FMT )
+ {
+ int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
+ if ( gt_id<0 ) return 0;
+ bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<(int)line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt ) return 0;
+ #define BRANCH_INT(type_t,vector_end) { \
+ for (i=0; i<line->n_sample; i++) \
+ { \
+ type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
+ int ial; \
+ for (ial=0; ial<fmt_gt->n; ial++) \
+ { \
+ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
+ if ( p[ial]>>1 > line->n_allele ) \
+ { \
+ fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
+ exit(1); \
+ } \
+ ac[(p[ial]>>1)-1]++; \
+ } \
+ } \
+ }
+ switch (fmt_gt->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
+ }
+ #undef BRANCH_INT
+ return 1;
+ }
+ return 0;
+}
+
+int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
+{ // Classify the genotype of sample `isample` as one of the GT_* codes; optionally reports up to two ALT allele indexes via _ial/_jal.
+ int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0; // ial/jal hold allele index + 1 while scanning, 0 meaning "not seen"
+ #define BRANCH_INT(type_t,vector_end) { \
+ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
+ for (i=0; i<fmt_ptr->n; i++) \
+ { \
+ if ( p[i] == vector_end ) break; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(p[i]) ) return GT_UNKN; /* missing allele */ \
+ int tmp = p[i]>>1; \
+ if ( tmp>1 ) \
+ { \
+ if ( !ial ) { ial = tmp; has_alt = 1; } \
+ else if ( tmp!=ial ) \
+ { \
+ if ( tmp<ial ) \
+ { \
+ jal = ial; \
+ ial = tmp; \
+ } \
+ else \
+ { \
+ jal = tmp; \
+ } \
+ has_alt = 2; \
+ } \
+ } \
+ else has_ref = 1; \
+ nals++; \
+ } \
+ }
+ switch (fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
+ }
+ #undef BRANCH_INT
+
+ if ( _ial ) *_ial = ial>0 ? ial-1 : ial; // convert back to an allele index; stays 0 when no ALT was seen
+ if ( _jal ) *_jal = jal>0 ? jal-1 : jal;
+ if ( !nals ) return GT_UNKN; // no allele before the end-of-vector marker
+ if ( nals==1 )
+ return has_ref ? GT_HAPL_R : GT_HAPL_A;
+ if ( !has_ref )
+ return has_alt==1 ? GT_HOM_AA : GT_HET_AA; // has_alt==2 means two different ALT alleles were seen
+ if ( !has_alt )
+ return GT_HOM_RR;
+ return GT_HET_RA;
+}
+
+int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
+{ // Remove ALT alleles that no genotype refers to; returns the number removed, 0 if the record has no GT, or -1 on an out-of-range allele index.
+ int i;
+ bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT");
+ if ( !gt ) return 0;
+
+ int *ac = (int*) calloc(line->n_allele,sizeof(int)); // per-allele usage counts; NOTE(review): allocation result is not checked
+
+ // check if all alleles are populated
+ #define BRANCH(type_t,vector_end) { \
+ for (i=0; i<line->n_sample; i++) \
+ { \
+ type_t *p = (type_t*) (gt->p + i*gt->size); \
+ int ial; \
+ for (ial=0; ial<gt->n; ial++) \
+ { \
+ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
+ if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \
+ ac[(p[ial]>>1)-1]++; \
+ } \
+ } \
+ }
+ switch (gt->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
+ }
+ #undef BRANCH
+
+ int nrm = 0;
+ kbitset_t *rm_set = kbs_init(line->n_allele);
+ for (i=1; i<line->n_allele; i++) // starts at 1: the reference allele is never removed
+ {
+ if ( !ac[i] ) { kbs_insert(rm_set, i); nrm++; }
+ }
+ free(ac);
+
+ if ( nrm ) bcf_remove_allele_set(header, line, rm_set);
+ kbs_destroy(rm_set);
+ return nrm;
+}
+
+void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
+{ // Remove the ALT alleles whose index i has bit i set in rm_mask; the reference allele (bit 0) is never removed.
+ int i, nbits = (int)(sizeof(rm_mask)*8); // alleles beyond the width of the mask cannot be selected
+ kbitset_t *rm_set = kbs_init(line->n_allele);
+ for (i=1; i<line->n_allele && i<nbits; i++) // the bound keeps the shift below defined for any allele count
+ if ( ((unsigned int)rm_mask >> i) & 1u ) kbs_insert(rm_set, i); // unsigned shift: testing the top bit is well defined too
+
+ bcf_remove_allele_set(header, line, rm_set);
+ kbs_destroy(rm_set);
+}
+
+void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set)
+{ // Remove the alleles flagged in rm_set from the record: rewrites ALT, Number=A/R/G INFO and FORMAT values, and renumbers GT.
+ int *map = (int*) calloc(line->n_allele, sizeof(int)); // map[old allele index] = new index; calloc leaves REF and removed alleles at 0
+
+ // create map of indexes from old to new ALT numbering and modify ALT
+ kstring_t str = {0,0,0};
+ kputs(line->d.allele[0], &str);
+
+ int nrm = 0, i,j; // i: ori alleles, j: new alleles
+ for (i=1, j=1; i<line->n_allele; i++)
+ {
+ if ( kbs_exists(rm_set, i) )
+ {
+ // remove this allele
+ line->d.allele[i] = NULL;
+ nrm++;
+ continue;
+ }
+ kputc(',', &str);
+ kputs(line->d.allele[i], &str);
+ map[i] = j;
+ j++;
+ }
+ if ( !nrm ) { free(map); free(str.s); return; } // nothing flagged for removal: the record is left as it is
+
+ int nR_ori = line->n_allele; // nR: values per Number=R field (all alleles), before and after
+ int nR_new = line->n_allele-nrm;
+ assert(nR_new > 0); // should not be able to remove reference allele
+ int nA_ori = nR_ori-1; // nA: values per Number=A field (ALT alleles only)
+ int nA_new = nR_new-1;
+
+ int nG_ori = nR_ori*(nR_ori + 1)/2; // nG: values per diploid Number=G field
+ int nG_new = nR_new*(nR_new + 1)/2;
+
+ bcf_update_alleles_str(header, line, str.s);
+
+ // remove from Number=G, Number=R and Number=A INFO fields.
+ uint8_t *dat = NULL;
+ int mdat = 0, ndat = 0, mdat_bytes = 0, nret; // dat is reused for fields of different element size, so its capacity is tracked in bytes
+ for (i=0; i<line->n_info; i++)
+ {
+ bcf_info_t *info = &line->d.info[i];
+ int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
+
+ if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
+
+ int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
+ if ( type==BCF_HT_FLAG ) continue;
+ int size = 1;
+ if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
+
+ mdat = mdat_bytes / size; // express the byte capacity in elements of this field's type
+ nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
+ mdat_bytes = mdat * size;
+ if ( nret<0 )
+ {
+ fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
+ bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
+ exit(1);
+ }
+ if ( type==BCF_HT_STR ) // string fields: rebuild the comma-separated list in str, skipping removed entries
+ {
+ str.l = 0;
+ char *ss = (char*) dat, *se = (char*) dat;
+ if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
+ {
+ int nexp, inc = 0; // inc shifts a Number=A position to the allele index it describes
+ if ( vlen==BCF_VL_A )
+ {
+ nexp = nA_ori;
+ inc = 1;
+ }
+ else
+ nexp = nR_ori;
+ for (j=0; j<nexp; j++)
+ {
+ if ( !*se ) break;
+ while ( *se && *se!=',' ) se++;
+ if ( kbs_exists(rm_set, j+inc) )
+ {
+ if ( *se ) se++;
+ ss = se;
+ continue;
+ }
+ if ( str.l ) kputc(',',&str);
+ kputsn(ss,se-ss,&str);
+ if ( *se ) se++;
+ ss = se;
+ }
+ assert( j==nexp );
+ }
+ else // Number=G, assuming diploid genotype
+ {
+ int k = 0, n = 0;
+ for (j=0; j<nR_ori; j++)
+ {
+ for (k=0; k<=j; k++)
+ {
+ if ( !*se ) break;
+ while ( *se && *se!=',' ) se++;
+ n++;
+ if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) )
+ {
+ if ( *se ) se++;
+ ss = se;
+ continue;
+ }
+ if ( str.l ) kputc(',',&str);
+ kputsn(ss,se-ss,&str);
+ if ( *se ) se++;
+ ss = se;
+ }
+ if ( !*se ) break;
+ }
+ assert( n==nG_ori );
+ }
+
+ nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
+ if ( nret<0 )
+ {
+ fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
+ bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
+ exit(1);
+ }
+ continue;
+ }
+
+ if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) // numeric fields: compact the kept values in place inside dat
+ {
+ int inc = 0, ntop;
+ if ( vlen==BCF_VL_A )
+ {
+ assert( nret==nA_ori );
+ ntop = nA_ori;
+ ndat = nA_new;
+ inc = 1;
+ }
+ else
+ {
+ assert( nret==nR_ori );
+ ntop = nR_ori;
+ ndat = nR_new;
+ }
+ int k = 0;
+
+ #define BRANCH(type_t,is_vector_end) \
+ { \
+ type_t *ptr = (type_t*) dat; \
+ int size = sizeof(type_t); \
+ for (j=0; j<ntop; j++) /* j:ori, k:new */ \
+ { \
+ if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
+ if ( kbs_exists(rm_set, j+inc) ) continue; \
+ if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
+ k++; \
+ } \
+ }
+ switch (type)
+ {
+ case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
+ }
+ #undef BRANCH
+ }
+ else // Number=G
+ {
+ assert( nret==nG_ori );
+ int k, l_ori = -1, l_new = 0;
+ ndat = nG_new;
+
+ #define BRANCH(type_t,is_vector_end) \
+ { \
+ type_t *ptr = (type_t*) dat; \
+ int size = sizeof(type_t); \
+ for (j=0; j<nR_ori; j++) \
+ { \
+ for (k=0; k<=j; k++) \
+ { \
+ l_ori++; \
+ if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
+ if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) ) continue; \
+ if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
+ l_new++; \
+ } \
+ } \
+ }
+ switch (type)
+ {
+ case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
+ }
+ #undef BRANCH
+ }
+
+ nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
+ if ( nret<0 )
+ {
+ fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
+ bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
+ exit(1);
+ }
+ }
+
+ // Update GT fields, the allele indexes might have changed
+ for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
+ if ( i<line->n_allele )
+ {
+ mdat = mdat_bytes / 4; // sizeof(int32_t)
+ nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
+ mdat_bytes = mdat * 4;
+ if ( nret>0 )
+ {
+ nret /= line->n_sample; // from here on nret is the number of GT values per sample
+ int32_t *ptr = (int32_t*) dat;
+ for (i=0; i<line->n_sample; i++)
+ {
+ for (j=0; j<nret; j++)
+ {
+ if ( bcf_gt_is_missing(ptr[j]) ) continue;
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ int al = bcf_gt_allele(ptr[j]);
+ assert( al<nR_ori && map[al]>=0 ); // NOTE(review): map[] is zero-filled, so map[al]>=0 always holds; a GT naming a removed allele becomes allele 0
+ ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); // re-encode with the new index, keeping the lowest bit of the old value
+ }
+ ptr += nret;
+ }
+ bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
+ }
+ }
+
+ // Remove from Number=G, Number=R and Number=A FORMAT fields.
+ // Assuming haploid or diploid GTs
+ for (i=0; i<line->n_fmt; i++)
+ {
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);
+
+ if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
+
+ int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
+ if ( type==BCF_HT_FLAG ) continue;
+
+ int size = 1;
+ if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
+
+ mdat = mdat_bytes / size;
+ nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
+ mdat_bytes = mdat * size;
+ if ( nret<0 )
+ {
+ fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
+ bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
+ exit(1);
+ }
+
+ if ( type==BCF_HT_STR )
+ {
+ int size = nret/line->n_sample; // number of bytes per sample
+ str.l = 0;
+ if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
+ {
+ int nexp, inc = 0;
+ if ( vlen==BCF_VL_A )
+ {
+ nexp = nA_ori;
+ inc = 1;
+ }
+ else
+ nexp = nR_ori;
+ for (j=0; j<line->n_sample; j++)
+ {
+ char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
+ int k_src = 0, k_dst = 0, l = str.l;
+ for (k_src=0; k_src<nexp; k_src++)
+ {
+ if ( ptr>=se || !*ptr) break;
+ while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
+ if ( kbs_exists(rm_set, k_src+inc) )
+ {
+ ss = ++ptr;
+ continue;
+ }
+ if ( k_dst ) kputc(',',&str);
+ kputsn(ss,ptr-ss,&str);
+ ss = ++ptr;
+ k_dst++;
+ }
+ assert( k_src==nexp );
+ l = str.l - l; // bytes written for this sample
+ for (; l<size; l++) kputc(0, &str); // pad with NULs so that every sample keeps the original `size` bytes
+ }
+ }
+ else // Number=G, diploid or haploid
+ {
+ for (j=0; j<line->n_sample; j++)
+ {
+ char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
+ int k_src = 0, k_dst = 0, l = str.l;
+ int nexp = 0; // diploid or haploid?
+ while ( ptr<se )
+ {
+ if ( !*ptr ) break;
+ if ( *ptr==',' ) nexp++;
+ ptr++;
+ }
+ if ( ptr!=ss ) nexp++;
+ assert( nexp==nG_ori || nexp==nR_ori );
+ ptr = ss;
+ if ( nexp==nG_ori ) // diploid
+ {
+ int ia, ib; // NOTE(review): unlike the haploid branch below, this branch does not pad the sample to `size` bytes - confirm intended
+ for (ia=0; ia<nR_ori; ia++)
+ {
+ for (ib=0; ib<=ia; ib++)
+ {
+ if ( ptr>=se || !*ptr ) break;
+ while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
+ if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) )
+ {
+ ss = ++ptr;
+ continue;
+ }
+ if ( k_dst ) kputc(',',&str);
+ kputsn(ss,ptr-ss,&str);
+ ss = ++ptr;
+ k_dst++;
+ }
+ if ( ptr>=se || !*ptr ) break;
+ }
+ }
+ else // haploid
+ {
+ for (k_src=0; k_src<nR_ori; k_src++)
+ {
+ if ( ptr>=se || !*ptr ) break;
+ while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
+ if ( kbs_exists(rm_set, k_src) )
+ {
+ ss = ++ptr;
+ continue;
+ }
+ if ( k_dst ) kputc(',',&str);
+ kputsn(ss,ptr-ss,&str);
+ ss = ++ptr;
+ k_dst++;
+ }
+ assert( k_src==nR_ori );
+ l = str.l - l;
+ for (; l<size; l++) kputc(0, &str);
+ }
+ }
+ }
+ nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
+ if ( nret<0 )
+ {
+ fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
+ bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
+ exit(1);
+ }
+ continue;
+ }
+
+ int nori = nret / line->n_sample; // values per sample as returned by bcf_get_format_values()
+ if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
+ {
+ int inc = 0, nnew;
+ if ( vlen==BCF_VL_A )
+ {
+ assert( nori==nA_ori ); // todo: will fail if all values are missing
+ ndat = nA_new*line->n_sample;
+ nnew = nA_new;
+ inc = 1;
+ }
+ else
+ {
+ assert( nori==nR_ori ); // todo: will fail if all values are missing
+ ndat = nR_new*line->n_sample;
+ nnew = nR_new;
+ }
+
+ #define BRANCH(type_t,is_vector_end) \
+ { \
+ for (j=0; j<line->n_sample; j++) \
+ { \
+ type_t *ptr_src = ((type_t*)dat) + j*nori; \
+ type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
+ int size = sizeof(type_t); \
+ int k_src, k_dst = 0; \
+ for (k_src=0; k_src<nori; k_src++) \
+ { \
+ if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
+ if ( kbs_exists(rm_set, k_src+inc) ) continue; \
+ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
+ k_dst++; \
+ } \
+ } \
+ }
+ switch (type)
+ {
+ case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
+ }
+ #undef BRANCH
+ }
+ else // Number=G, diploid or mixture of haploid+diploid
+ {
+ assert( nori==nG_ori );
+ ndat = nG_new*line->n_sample;
+
+ #define BRANCH(type_t,is_vector_end) \
+ { \
+ for (j=0; j<line->n_sample; j++) \
+ { \
+ type_t *ptr_src = ((type_t*)dat) + j*nori; \
+ type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
+ int size = sizeof(type_t); \
+ int ia, ib, k_dst = 0, k_src; \
+ int nset = 0; /* haploid or diploid? */ \
+ for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
+ if ( nset==nR_ori ) /* haploid */ \
+ { \
+ for (k_src=0; k_src<nR_ori; k_src++) \
+ { \
+ if ( kbs_exists(rm_set, k_src) ) continue; \
+ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
+ k_dst++; \
+ } \
+ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
+ } \
+ else /* diploid */ \
+ { \
+ k_src = -1; \
+ for (ia=0; ia<nR_ori; ia++) \
+ { \
+ for (ib=0; ib<=ia; ib++) \
+ { \
+ k_src++; \
+ if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
+ if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) ) continue; \
+ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
+ k_dst++; \
+ } \
+ } \
+ } \
+ } \
+ }
+ switch (type)
+ {
+ case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
+ }
+ #undef BRANCH
+ }
+ nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
+ if ( nret<0 )
+ {
+ fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
+ bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
+ exit(1);
+ }
+ }
+ free(dat);
+ free(str.s);
+ free(map);
+}
+
diff --git a/scripts/ref_map.pl b/scripts/ref_map.pl
index 18ec64b..83dd103 100755
--- a/scripts/ref_map.pl
+++ b/scripts/ref_map.pl
@@ -872,7 +872,7 @@ sub usage {
version();
print STDERR <<EOQ;
-ref_map.pl -p path -r path [-s path] -o path [-t] [-m min_cov] [-M mismatches] [-n mismatches] [-T num_threads] [-A type] [-O popmap] [-B db -b batch_id -D "desc"] [-S -i num] [-e path] [-d] [-h]
+ref_map.pl -p path -r path [-s path] -o path [-t] [-m min_cov] [-M mismatches] [-T num_threads] [-A type] [-O popmap] [-B db -b batch_id -D "desc"] [-S -i num] [-e path] [-d] [-h]
b: batch ID representing this dataset (an integer, e.g. 1, 2, 3).
o: path to write pipeline output files.
O: if analyzing one or more populations, specify a pOpulation map.
diff --git a/src/BamI.h b/src/BamI.h
index a769089..5e2a9cf 100644
--- a/src/BamI.h
+++ b/src/BamI.h
@@ -29,11 +29,12 @@
#ifdef HAVE_BAM
#include "input.h"
-#include "bam.h"
+#include "sam.h"
class Bam: public Input {
- bamFile bam_fh;
- bam1_t *aln;
+ htsFile *bam_fh;
+ bam_hdr_t *bamh;
+ bam1_t *aln;
map<uint, string> chrs;
@@ -47,13 +48,14 @@ class Bam: public Input {
public:
Bam(const char *path) : Input() {
this->path = string(path);
- this->bam_fh = bam_open(path, "r");
+ this->bam_fh = hts_open(path, "r");
this->aln = bam_init1();
this->parse_header();
};
~Bam() {
- bam_close(this->bam_fh);
+ hts_close(this->bam_fh);
+ bam_hdr_destroy(this->bamh);
bam_destroy1(this->aln);
};
Seq *next_seq();
@@ -63,18 +65,16 @@ class Bam: public Input {
int
Bam::parse_header()
{
- bam_header_t *bamh = bam_header_init();
- bamh = bam_header_read(this->bam_fh);
+ this->bamh = bam_hdr_init();
+ this->bamh = sam_hdr_read(this->bam_fh);
- for (uint j = 0; j < (uint) bamh->n_targets; j++) {
+ for (uint j = 0; j < (uint) this->bamh->n_targets; j++) {
//
// Record the mapping from integer ID to chromosome name that we will see in BAM records.
//
- this->chrs[j] = string(bamh->target_name[j]);
+ this->chrs[j] = string(this->bamh->target_name[j]);
}
- bam_header_destroy(bamh);
-
return 0;
}
@@ -88,7 +88,7 @@ Bam::next_seq()
// Read a record from the file, skipping unmapped reads, and place it in a Seq object.
//
do {
- bytes_read = bam_read1(this->bam_fh, this->aln);
+ bytes_read = sam_read1(this->bam_fh, this->bamh, this->aln);
if (bytes_read <= 0)
return NULL;
@@ -122,8 +122,11 @@ Bam::next_seq()
//
string seq;
uint8_t j;
+
+ seq.reserve(this->aln->core.l_qseq);
+
for (int i = 0; i < this->aln->core.l_qseq; i++) {
- j = bam1_seqi(bam1_seq(this->aln), i);
+ j = bam_seqi(bam_get_seq(this->aln), i);
switch(j) {
case 1:
seq += 'A';
@@ -147,15 +150,15 @@ Bam::next_seq()
// Fetch the quality score.
//
string qual;
- uint8_t *q = bam1_qual(this->aln);
+ uint8_t *q = bam_get_qual(this->aln);
for (int i = 0; i < this->aln->core.l_qseq; i++) {
qual += char(int(q[i]) + 33);
}
string chr = this->chrs[this->aln->core.tid];
- Seq *s = new Seq((const char *) bam1_qname(this->aln), seq.c_str(), qual.c_str(),
- chr.c_str(), bp, flag ? minus : plus);
+ Seq *s = new Seq((const char *) bam_get_qname(this->aln), seq.c_str(), qual.c_str(),
+ chr.c_str(), bp, flag ? strand_minus : strand_plus);
if (cigar.size() > 0)
this->edit_gaps(cigar, s->seq);
@@ -168,7 +171,7 @@ Bam::parse_bam_cigar(vector<pair<char, uint> > &cigar, bool orientation)
{
int op, len;
char c;
- uint32_t *cgr = bam1_cigar(this->aln);
+ uint32_t *cgr = bam_get_cigar(this->aln);
for (int k = 0; k < this->aln->core.n_cigar; k++) {
op = cgr[k] & BAM_CIGAR_MASK;
@@ -202,7 +205,7 @@ Bam::parse_bam_cigar(vector<pair<char, uint> > &cigar, bool orientation)
// If aligned to the negative strand, sequence has been reverse complemented and
// CIGAR string should be interpreted in reverse.
//
- if (orientation == plus)
+ if (orientation == strand_plus)
cigar.push_back(make_pair(c, len));
else
cigar.insert(cigar.begin(), make_pair(c, len));
@@ -235,7 +238,7 @@ Bam::parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar, bool o
// If aligned to the negative strand, sequence has been reverse complemented and
// CIGAR string should be interpreted in reverse.
//
- if (orientation == plus)
+ if (orientation == strand_plus)
cigar.push_back(make_pair(*q, dist));
else
cigar.insert(cigar.begin(), make_pair(*q, dist));
diff --git a/src/BamUnalignedI.h b/src/BamUnalignedI.h
index b6224b8..73f8496 100644
--- a/src/BamUnalignedI.h
+++ b/src/BamUnalignedI.h
@@ -29,11 +29,12 @@
#ifdef HAVE_BAM
#include "input.h"
-#include "bam.h"
+#include "sam.h"
class BamUnAln: public Input {
- bamFile bam_fh;
- bam1_t *aln;
+ htsFile *bam_fh;
+ bam_hdr_t *bamh;
+ bam1_t *aln;
map<uint, string> chrs;
@@ -42,20 +43,21 @@ class BamUnAln: public Input {
public:
BamUnAln(const char *path) : Input() {
this->path = string(path);
- this->bam_fh = bam_open(path, "r");
+ this->bam_fh = hts_open(path, "r");
this->aln = bam_init1();
this->parse_header();
};
BamUnAln(string path) : Input() {
this->path = path;
- this->bam_fh = bam_open(path.c_str(), "r");
+ this->bam_fh = hts_open(path.c_str(), "r");
this->aln = bam_init1();
this->parse_header();
};
~BamUnAln() {
- bam_close(this->bam_fh);
+ hts_close(this->bam_fh);
+ bam_hdr_destroy(this->bamh);
bam_destroy1(this->aln);
};
Seq *next_seq();
@@ -65,18 +67,16 @@ class BamUnAln: public Input {
int
BamUnAln::parse_header()
{
- bam_header_t *bamh = bam_header_init();
- bamh = bam_header_read(this->bam_fh);
+ this->bamh = bam_hdr_init();
+ this->bamh = sam_hdr_read(this->bam_fh);
- for (uint j = 0; j < (uint) bamh->n_targets; j++) {
+ for (uint j = 0; j < (uint) this->bamh->n_targets; j++) {
//
// Record the mapping from integer ID to chromosome name that we will see in BAM records.
//
- this->chrs[j] = string(bamh->target_name[j]);
+ this->chrs[j] = string(this->bamh->target_name[j]);
}
- bam_header_destroy(bamh);
-
return 0;
}
@@ -88,7 +88,7 @@ BamUnAln::next_seq()
//
// Read a record from the file and place it in a Seq object.
//
- bytes_read = bam_read1(this->bam_fh, this->aln);
+ bytes_read = sam_read1(this->bam_fh, this->bamh, this->aln);
if (bytes_read <= 0)
return NULL;
@@ -98,8 +98,11 @@ BamUnAln::next_seq()
//
string seq;
uint8_t j;
+
+ seq.reserve(this->aln->core.l_qseq);
+
for (int i = 0; i < this->aln->core.l_qseq; i++) {
- j = bam1_seqi(bam1_seq(this->aln), i);
+ j = bam_seqi(bam_get_seq(this->aln), i);
switch(j) {
case 1:
seq += 'A';
@@ -123,7 +126,7 @@ BamUnAln::next_seq()
// Fetch the quality score.
//
string qual;
- uint8_t *q = bam1_qual(this->aln);
+ uint8_t *q = bam_get_qual(this->aln);
for (int i = 0; i < this->aln->core.l_qseq; i++) {
qual += char(int(q[i]) + 33);
}
@@ -134,7 +137,7 @@ BamUnAln::next_seq()
// Attempt to parse the query name for this read.
//
- Seq *s = new Seq((const char *) bam1_qname(this->aln), seq.c_str(), qual.c_str());
+ Seq *s = new Seq((const char *) bam_get_qname(this->aln), seq.c_str(), qual.c_str());
return s;
}
diff --git a/src/BowtieI.h b/src/BowtieI.h
index 5501846..2d30c18 100644
--- a/src/BowtieI.h
+++ b/src/BowtieI.h
@@ -55,14 +55,14 @@ Seq *Bowtie::next_seq() {
parse_tsv(this->line, parts);
- strand_type strand = parts[1] == "+" ? plus : minus;
+ strand_type strand = parts[1] == "+" ? strand_plus : strand_minus;
//
// If the read was aligned on the reverse strand (and is therefore reverse complemented)
// alter the start point of the alignment to reflect the right-side of the read, at the
// end of the RAD cut site.
//
- int bp = strand == plus ? atoi(parts[3].c_str()) : atoi(parts[3].c_str()) + parts[4].length();
+ int bp = strand == strand_plus ? atoi(parts[3].c_str()) : atoi(parts[3].c_str()) + parts[4].length();
Seq *s = new Seq(parts[0].c_str(), parts[4].c_str(), parts[5].c_str(),
parts[2].c_str(), bp, strand);
diff --git a/src/GappedAln.h b/src/GappedAln.h
index fc05ab0..af42ba4 100644
--- a/src/GappedAln.h
+++ b/src/GappedAln.h
@@ -125,7 +125,7 @@ GappedAln::GappedAln(int len_1, int len_2)
GappedAln::~GappedAln()
{
- for (int i = 0; i < this->_m; i++) {
+ for (uint i = 0; i < this->_m; i++) {
delete [] this->matrix[i];
delete [] this->path[i];
}
@@ -139,8 +139,8 @@ GappedAln::init(int size_1, int size_2)
//
// Resize the underlying matrix and path arrays, if necessary.
//
- if ((size_1 + 1) > this->_m_size || (size_2 + 1) > this->_n_size) {
- for (int i = 0; i < this->_m_size; i++) {
+ if ((size_1 + 1) > (int)_m_size || (size_2 + 1) > (int)_n_size) {
+ for (uint i = 0; i < this->_m_size; i++) {
delete [] this->matrix[i];
delete [] this->path[i];
}
diff --git a/src/MetaPopInfo.cc b/src/MetaPopInfo.cc
new file mode 100644
index 0000000..0ec5799
--- /dev/null
+++ b/src/MetaPopInfo.cc
@@ -0,0 +1,347 @@
+#include <fstream>
+#include <dirent.h>
+#include <iostream>
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "config.h"
+#include "constants.h"
+#include "input.h"
+
+#include "MetaPopInfo.h"
+
+using std::ifstream;
+using std::cerr;
+using std::exception;
+
+const string MetaPopInfo::Pop::default_name = "defaultpop";
+const string MetaPopInfo::Group::default_name = "defaultgrp";
+
+// Rebuilds [sample_indexes_] (sample name => position in [samples_]) from scratch.
+// When a name occurs more than once, the first occurrence keeps the entry.
+void MetaPopInfo::reset_sample_map() {
+    sample_indexes_.clear();
+
+    size_t index = 0;
+    for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample) {
+        sample_indexes_.insert(std::make_pair(sample->name, index));
+        ++index;
+    }
+}
+
+// Rebuilds [pop_indexes_] (population name => position in [pops_]) from scratch.
+// When a name occurs more than once, the first occurrence keeps the entry.
+void MetaPopInfo::reset_pop_map() {
+    pop_indexes_.clear();
+
+    size_t index = 0;
+    for (vector<Pop>::const_iterator pop = pops_.begin(); pop != pops_.end(); ++pop) {
+        pop_indexes_.insert(std::make_pair(pop->name, index));
+        ++index;
+    }
+}
+
+// Rebuilds [group_indexes_] (group name => position in [groups_]) from scratch.
+// When a name occurs more than once, the first occurrence keeps the entry.
+void MetaPopInfo::reset_group_map() {
+    group_indexes_.clear();
+
+    size_t index = 0;
+    for (vector<Group>::const_iterator group = groups_.begin(); group != groups_.end(); ++group) {
+        group_indexes_.insert(std::make_pair(group->name, index));
+        ++index;
+    }
+}
+
+// Initializes the metapopulation from a population map file.
+//
+// Each line that is neither empty nor a comment ('#') must read:
+//     <sample name> tab <population name> [tab <group name>]
+// Populations and groups are indexed in their order of first appearance.
+// When at least one group is given, populations without a group are put
+// in a default group (with a warning); when no group is given at all, every
+// population goes to the default group.
+//
+// Returns false if the file cannot be opened or does not define any sample.
+// Throws exception() (after printing a message on cerr) if a line is
+// malformed, if a population is assigned to two different groups, or if the
+// file cannot be read through to its end.
+bool MetaPopInfo::init_popmap(const string& pmap_path) {
+
+    ifstream fh(pmap_path.c_str(), ifstream::in);
+    if (fh.fail())
+        return false;
+
+    size_t p = 0; // pop index counter
+    size_t g = 0; // group index counter
+
+    char line[max_len];
+    memset(line, '\0', max_len);
+    vector<string> parts;
+    while (fh.getline(line, max_len)) {
+        size_t len = strlen(line);
+
+        // Strip a Windows line ending first, so that a blank "\r\n" line
+        // is skipped below instead of being reported as malformed.
+        if (len > 0 && line[len - 1] == '\r') {
+            line[len - 1] = '\0';
+            len -= 1;
+        }
+
+        // Skip empty lines and comments
+        if (len == 0 || line[0] == '#')
+            continue;
+
+        //
+        // Parse the contents, we expect:
+        // <file name> tab <population string> [tab <group string>]
+        //
+
+        parse_tsv(line, parts);
+
+        if (parts.size() < 2
+            || parts.size() > 3
+            || parts[0].empty()
+            || parts[1].empty()) {
+            cerr << "Error: Malformed population map -- expected 'SAMPLE\\tPOP[\\tGROUP]'. In file '" << pmap_path << "', line :\n" << line << "\n";
+            throw exception();
+        }
+
+        //
+        // Process the sample name.
+        //
+
+        samples_.push_back(Sample(parts[0]));
+
+        //
+        // Process the population field.
+        //
+
+        // insert() leaves an existing entry untouched, so [pop_index] is
+        // the index of the population whether it is new or already known.
+        pair<map<string,size_t>::iterator, bool> pop_ins = pop_indexes_.insert( {parts[1], p} );
+        size_t pop_index = pop_ins.first->second;
+
+        samples_.back().pop = pop_index; // Set the sample's population index.
+        if (pop_ins.second) {
+            // Unknown pop
+            pops_.push_back(Pop(parts[1]));
+            ++p;
+        }
+
+        //
+        // Process the group field, if any
+        //
+
+        if (parts.size() == 3 && ! parts[2].empty()) {
+            // Get the index of this group -- create it if necessary.
+            pair<map<string,size_t>::iterator, bool> grp_ins = group_indexes_.insert( {parts[2], g} );
+            if (grp_ins.second) {
+                groups_.push_back(Group(parts[2]));
+                ++g;
+            }
+            size_t grp_index = grp_ins.first->second;
+
+            if (pops_[pop_index].group != size_t(-1)) {
+                // The current pop already has a group, check that it is the same.
+                if (pops_[pop_index].group != grp_index) {
+                    cerr << "Error: In population map file '"
+                         << pmap_path << "': population '"
+                         << pops_[pop_index].name << "' belongs to two groups, '"
+                         << groups_[pops_[pop_index].group].name << "' and '"
+                         << groups_[grp_index].name << "'.\n";
+                    throw exception();
+                }
+            } else {
+                pops_[pop_index].group = grp_index;
+                groups_[grp_index].pops.push_back(pop_index);
+            }
+        }
+    }
+
+    if (!fh.eof()) {
+        // getline() failed before reaching the end of the file: either a line
+        // did not fit in the buffer or the stream reported an error. Going on
+        // would silently drop the rest of the map.
+        cerr << "Error: Unable to read population map file '" << pmap_path
+             << "' to its end (read error, or line longer than "
+             << (max_len - 1) << " characters).\n";
+        throw exception();
+    }
+
+    if (samples_.empty())
+        // Empty population map.
+        return false;
+
+    //
+    // Check that all the populations are in a group. Put
+    // populations that do not have a group in a default
+    // one.
+    //
+
+    bool missing_group = false;
+    if (!groups_.empty()) {
+        for (vector<Pop>::iterator pop = pops_.begin(); pop != pops_.end(); ++pop) {
+            if (pop->group == size_t(-1)) {
+                cerr << "Warning: Population '" << pop->name
+                     << "' did not have a group, adding it to '"
+                     << Group::default_name << "'.\n";
+                missing_group = true;
+            }
+        }
+    } else {
+        missing_group = true;
+    }
+    if (missing_group) {
+        groups_.push_back(Group(Group::default_name));
+        g = groups_.size()-1;
+        group_indexes_.insert( {Group::default_name, g} );
+        for (size_t i = 0; i < pops_.size(); ++i) {
+            if (pops_[i].group == size_t(-1)) {
+                pops_[i].group = g;
+                groups_[g].pops.push_back(i);
+            }
+        }
+    }
+
+    //
+    // Sort the samples. Determine the first/last indexes for each population.
+    //
+
+    sort(samples_.begin(), samples_.end());
+    reset_sample_map();
+
+    // Samples are now ordered by population index and every population owns
+    // at least one sample, so each change of [pop] along [samples_] marks
+    // the start of the next population.
+    size_t curr_pop = 0;
+    pops_[curr_pop].first_sample = 0;
+    for (size_t s = 1; s < samples_.size(); ++s) {
+        if (samples_[s].pop != curr_pop) {
+            pops_[curr_pop].last_sample = s-1;
+            ++curr_pop;
+            pops_[curr_pop].first_sample = s;
+        }
+    }
+    pops_[curr_pop].last_sample = samples_.size()-1;
+
+    return true;
+}
+
+// Initializes the metapopulation from a bare list of sample names: all the
+// samples are placed in one default population, itself placed in one default
+// group. Returns false (and does nothing) if [sample_names] is empty.
+bool MetaPopInfo::init_names(const vector<string>& sample_names) {
+    if (sample_names.empty())
+        return false;
+
+    // One sample per name, all attached to population 0; kept sorted.
+    for (size_t i = 0; i < sample_names.size(); ++i) {
+        Sample sample(sample_names[i]);
+        sample.pop = 0;
+        samples_.push_back(sample);
+    }
+    sort(samples_.begin(), samples_.end());
+
+    // The default population spans every sample and belongs to group 0.
+    pops_.push_back(Pop(Pop::default_name));
+    Pop& first_pop = pops_.front();
+    first_pop.first_sample = 0;
+    first_pop.last_sample = samples_.size() - 1;
+    first_pop.group = 0;
+
+    // The default group holds population 0.
+    groups_.push_back(Group(Group::default_name));
+    groups_.front().pops.push_back(0);
+
+    // Bring the name => index maps up to date.
+    reset_sample_map();
+    reset_pop_map();
+    reset_group_map();
+
+    return true;
+}
+
+// Initializes the metapopulation from the "*.tags.tsv" / "*.tags.tsv.gz" files
+// found in [dir_path]: the part of each file name before the suffix is taken
+// as a sample name, then init_names() is called. Entries starting with
+// "batch_" are ignored.
+//
+// Returns the result of init_names() (false if no sample file was found).
+// Throws exception() if the directory cannot be opened.
+bool MetaPopInfo::init_directory(const string& dir_path) {
+
+    const string tags_suffix    = ".tags.tsv";
+    const string tags_gz_suffix = ".tags.tsv.gz";
+
+    // True if [str] ends with [suffix] and has a non-empty prefix before it.
+    auto has_suffix = [] (const string& str, const string& suffix) {
+        return str.size() > suffix.size()
+            && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+    };
+
+    //
+    // Find all sample names.
+    //
+
+    vector<string> sample_names;
+    DIR* dir = opendir(dir_path.c_str());
+    if (dir == NULL) {
+        cerr << "Unable to open directory '" << dir_path << "' for reading.\n";
+        throw exception();
+    }
+    dirent *direntry;
+    while ((direntry = readdir(dir)) != NULL) {
+        string filename = direntry->d_name;
+
+        if (filename == "." || filename == ".." || filename.substr(0, 6) == "batch_")
+            continue;
+
+        // Only the end of the name is tested, so that files such as
+        // "sample.tags.tsv.bak" are not mistaken for sample files.
+        if (has_suffix(filename, tags_suffix))
+            sample_names.push_back(filename.substr(0, filename.size() - tags_suffix.size()));
+        else if (has_suffix(filename, tags_gz_suffix))
+            sample_names.push_back(filename.substr(0, filename.size() - tags_gz_suffix.size()));
+    }
+    closedir(dir);
+
+    // A sample present both as ".tags.tsv" and ".tags.tsv.gz" is listed once.
+    std::sort(sample_names.begin(), sample_names.end());
+    sample_names.erase(std::unique(sample_names.begin(), sample_names.end()), sample_names.end());
+
+    //
+    // Initialize the MetaPopInfo
+    //
+
+    return init_names(sample_names);
+}
+
+// Removes the samples whose indexes (in [samples_]) are listed in [rm_samples],
+// then removes the populations left without samples and the groups left
+// without populations. The relative order of the remaining samples,
+// populations and groups is preserved, and every stored index
+// (Pop::first_sample/last_sample, Sample::pop, Group::pops, Pop::group and the
+// lookup maps) is updated to the new numbering.
+//
+// [rm_samples] may be given in any order and may contain duplicates.
+// Throws std::out_of_range, before anything is modified, if an index is not
+// a valid position in [samples_].
+void MetaPopInfo::delete_samples(const vector<size_t>& rm_samples) {
+
+    const size_t none = size_t(-1);
+
+    // The index arithmetic below needs the removed indexes sorted and unique.
+    vector<size_t> rm(rm_samples);
+    std::sort(rm.begin(), rm.end());
+    rm.erase(std::unique(rm.begin(), rm.end()), rm.end());
+    if (!rm.empty())
+        samples_.at(rm.back()); // Largest index: validates them all.
+
+    // Remove these samples from [samples_].
+    for (vector<size_t>::const_iterator s = rm.begin(); s != rm.end(); ++s)
+        samples_[*s].name.clear(); // Mark the sample for removal.
+    samples_.erase(
+        std::remove_if(samples_.begin(), samples_.end(),
+                       [] (Sample& s) {return s.name.empty();} ),
+        samples_.end());
+
+    // Update the sample ranges of the populations.
+    for (vector<Pop>::iterator p = pops_.begin(); p != pops_.end(); ++p) {
+        // [first_sample] moves down by the number of removed samples strictly
+        // before it; [last_sample] by the number of removed samples up to and
+        // including it. Thus if the population becomes empty, [first_sample]
+        // will be past [last_sample].
+        // n.b. If the first pop is emptied, last_sample wraps to size_t(-1).
+        p->first_sample -= size_t(std::lower_bound(rm.begin(), rm.end(), p->first_sample) - rm.begin());
+        p->last_sample  -= size_t(std::upper_bound(rm.begin(), rm.end(), p->last_sample) - rm.begin());
+    }
+
+    auto pop_is_empty = [] (Pop& p) {return (p.first_sample > p.last_sample || p.last_sample == size_t(-1));};
+
+    // Compute the new index of each population that is kept.
+    vector<size_t> new_pop_index(pops_.size(), none);
+    size_t n_kept_pops = 0;
+    for (size_t p = 0; p < pops_.size(); ++p)
+        if (!pop_is_empty(pops_[p]))
+            new_pop_index[p] = n_kept_pops++;
+
+    // Renumber the population of each remaining sample.
+    for (vector<Sample>::iterator s = samples_.begin(); s != samples_.end(); ++s)
+        if (s->pop < new_pop_index.size() && new_pop_index[s->pop] != none)
+            s->pop = new_pop_index[s->pop];
+
+    // Remove the empty populations from [groups_], renumbering the others.
+    for (vector<Group>::iterator group = groups_.begin(); group != groups_.end(); ++group) {
+        vector<size_t> kept;
+        for (vector<size_t>::const_iterator p = group->pops.begin(); p != group->pops.end(); ++p)
+            if (*p < new_pop_index.size() && new_pop_index[*p] != none)
+                kept.push_back(new_pop_index[*p]);
+        group->pops.swap(kept);
+    }
+
+    // Compute the new index of each group that is kept, and renumber the
+    // group of each population accordingly.
+    vector<size_t> new_group_index(groups_.size(), none);
+    size_t n_kept_groups = 0;
+    for (size_t g = 0; g < groups_.size(); ++g)
+        if (!groups_[g].pops.empty())
+            new_group_index[g] = n_kept_groups++;
+    for (vector<Pop>::iterator p = pops_.begin(); p != pops_.end(); ++p)
+        if (p->group < new_group_index.size() && new_group_index[p->group] != none)
+            p->group = new_group_index[p->group];
+
+    // Remove the empty populations from [pops_].
+    pops_.erase(
+        std::remove_if(pops_.begin(), pops_.end(), pop_is_empty),
+        pops_.end());
+
+    // Remove empty groups from [groups_].
+    groups_.erase(
+        std::remove_if(groups_.begin(), groups_.end(),
+                       [] (Group& g) {return g.pops.empty();}),
+        groups_.end());
+
+    // Update the support members.
+    reset_sample_map();
+    reset_pop_map();
+    reset_group_map();
+    reset_sample_id_map();
+}
+
+// Rebuilds [sample_indexes_by_id_] (sample ID => position in [samples_]) from
+// scratch. When several samples share an ID, the last one keeps the entry.
+void MetaPopInfo::reset_sample_id_map() {
+    sample_indexes_by_id_.clear();
+
+    size_t index = 0;
+    for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample) {
+        sample_indexes_by_id_[sample->id] = index;
+        ++index;
+    }
+}
+
+// Replaces the contents of [files] with one (population index, sample name)
+// pair per sample, in the order of [samples_].
+void MetaPopInfo::fill_files(vector<pair<int, string> >& files) const {
+    files.clear();
+    for (size_t i = 0; i < samples_.size(); ++i) {
+        const Sample& sample = samples_[i];
+        files.push_back(pair<int, string>(static_cast<int>(sample.pop), sample.name));
+    }
+}
+
+// Replaces the contents of [sample_ids] with the ID of each sample, in the
+// order of [samples_].
+void MetaPopInfo::fill_sample_ids(vector<int>& sample_ids) const {
+    sample_ids.clear();
+    for (size_t i = 0; i < samples_.size(); ++i)
+        sample_ids.push_back(static_cast<int>(samples_[i].id));
+}
+
+// Replaces the contents of [samples] with a sample ID => sample name map.
+// When several samples share an ID, the first one (in [samples_] order) is kept.
+void MetaPopInfo::fill_samples(map<int, string>& samples) const {
+    samples.clear();
+    for (size_t i = 0; i < samples_.size(); ++i) {
+        const Sample& sample = samples_[i];
+        samples.insert(std::make_pair(static_cast<int>(sample.id), sample.name));
+    }
+}
+
+// Replaces the contents of [pop_key] with a population index => population
+// name map.
+void MetaPopInfo::fill_pop_key(map<int, string>& pop_key) const {
+    pop_key.clear();
+
+    int index = 0;
+    for (vector<Pop>::const_iterator pop = pops_.begin(); pop != pops_.end(); ++pop) {
+        pop_key.insert(std::make_pair(index, pop->name));
+        ++index;
+    }
+}
+
+// Replaces the contents of [pop_indexes] with a map from each population
+// index to its (first_sample, last_sample) pair.
+void MetaPopInfo::fill_pop_indexes(map<int, pair<int, int> >& pop_indexes) const {
+    pop_indexes.clear();
+
+    int index = 0;
+    for (vector<Pop>::const_iterator pop = pops_.begin(); pop != pops_.end(); ++pop) {
+        const pair<int, int> range(static_cast<int>(pop->first_sample),
+                                   static_cast<int>(pop->last_sample));
+        pop_indexes.insert(std::make_pair(index, range));
+        ++index;
+    }
+}
+
+// Replaces the contents of [grp_key] with a group index => group name map.
+void MetaPopInfo::fill_grp_key(map<int, string>& grp_key) const {
+    grp_key.clear();
+
+    int index = 0;
+    for (vector<Group>::const_iterator group = groups_.begin(); group != groups_.end(); ++group) {
+        grp_key.insert(std::make_pair(index, group->name));
+        ++index;
+    }
+}
+
+// Replaces the contents of [grp_members] with a map from each group index to
+// the indexes of the populations it contains (in the order of Group::pops).
+void MetaPopInfo::fill_grp_members(map<int, vector<int> >& grp_members) const {
+    grp_members.clear();
+
+    int index = 0;
+    for (vector<Group>::const_iterator group = groups_.begin(); group != groups_.end(); ++group) {
+        // The range constructor converts each size_t index to int.
+        const vector<int> pop_ids(group->pops.begin(), group->pops.end());
+        grp_members.insert(std::make_pair(index, pop_ids));
+        ++index;
+    }
+}
diff --git a/src/MetaPopInfo.h b/src/MetaPopInfo.h
new file mode 100644
index 0000000..57d6f6f
--- /dev/null
+++ b/src/MetaPopInfo.h
@@ -0,0 +1,111 @@
+#ifndef METAPOPINFO_H
+#define METAPOPINFO_H
+
+#include <string>
+#include <vector>
+#include <map>
+
+using std::size_t;
+using std::pair;
+using std::vector;
+using std::string;
+using std::map;
+
+/*
+ * MetaPopInfo
+ * Class for representing a metapopulation: its samples, populations,
+ * groups of populations, and associated information.
+ */
+class MetaPopInfo {
+public:
+ // A sample (individual) of the metapopulation.
+ struct Sample {
+ string name;
+ size_t pop; // Index of the sample's population in [pops_]; size_t(-1) until set.
+ size_t id; // optional, deprecated
+
+ Sample(const string& n) : name(n), pop(-1), id(-1) {}
+ // Orders samples by population index, then by name.
+ inline bool operator<(const Sample& other) const;
+ };
+ // A population: a contiguous range of [samples_].
+ struct Pop {
+ string name;
+ size_t first_sample; // Index in [samples_] of the first sample of the population.
+ size_t last_sample; // Index in [samples_] of the last sample (inclusive).
+ size_t group; // Index of the population's group in [groups_]; size_t(-1) when none.
+
+ Pop(const string& n) : name(n), first_sample(-1), last_sample(-1), group(-1) {}
+ static const string default_name;
+ };
+ // A group of populations.
+ struct Group {
+ string name;
+ vector<size_t> pops; // Indexes in [pops_] of the populations of the group.
+
+ Group(const string& n) : name(n), pops() {}
+ static const string default_name;
+ };
+
+private:
+ vector<Sample> samples_; //n.b. Samples are sorted primarily by population index, and secondarily by name.
+ vector<Pop> pops_;
+ vector<Group> groups_;
+
+ map<string,size_t> sample_indexes_; // Links a name with an index in [samples_].
+ map<string,size_t> pop_indexes_; // Links a name with an index in [pops_].
+ map<string,size_t> group_indexes_; // Links a name with an index in [groups_].
+ void reset_sample_map(); // Resets [sample_indexes_].
+ void reset_pop_map(); // Resets [pop_indexes_].
+ void reset_group_map(); // Resets [group_indexes_].
+
+ map<size_t,size_t> sample_indexes_by_id_; // Links a sample ID with an index in [samples_].
+ void reset_sample_id_map(); // Resets [sample_indexes_by_id_].
+
+public:
+ // Create the representation :
+ // -- from a population map file.
+ // -- from just a vector of sample names.
+ // -- or by looking for "*.tags.tsv(.gz)" files in a directory.
+ // Each returns false when no sample could be defined.
+ bool init_popmap(const string& popmap_path);
+ bool init_names(const vector<string>& sample_names);
+ bool init_directory(const string& dir_path);
+
+ // Delete samples from the metapopulation.
+ // (As samples, populations or groups may be deleted, the indexes of
+ // the remaining ones change, but the order in which they appear
+ // is preserved.)
+ void delete_samples(const vector<size_t>& rm_samples);
+
+ // Retrieve information.
+ const vector<Sample>& samples() const {return samples_;}
+ const vector<Pop>& pops() const {return pops_;}
+ const vector<Group>& groups() const {return groups_;}
+
+ // Look up an index by name. These use map::at(), which throws
+ // std::out_of_range when the name is unknown.
+ size_t get_sample_index(const string& name) const {return sample_indexes_.at(name);}
+ size_t get_pop_index(const string& name) const {return pop_indexes_.at(name);}
+ size_t get_group_index(const string& name) const {return group_indexes_.at(name);}
+
+ // Work with sample IDs. (Uniqueness of IDs is not enforced.)
+ // n.b. set_sample_id() does not remove the entry of the sample's previous
+ // ID from [sample_indexes_by_id_]. The lookup by ID uses map::at(), which
+ // throws std::out_of_range when the ID is unknown.
+ void set_sample_id(size_t index, size_t id) {samples_.at(index).id = id; sample_indexes_by_id_[id] = index;}
+ size_t get_sample_index(const size_t& id) const {return sample_indexes_by_id_.at(id);}
+
+ /*
+ * Methods for backwards compatibility
+ */
+
+ // Fill former globals. Each method clears its argument first.
+ void fill_files(vector<pair<int, string> >&) const;
+ void fill_sample_ids(vector<int>&) const;
+ void fill_samples(map<int, string>&) const;
+ void fill_pop_key(map<int, string>&) const;
+ void fill_pop_indexes(map<int, pair<int, int> >&) const;
+ void fill_grp_key(map<int, string>&) const;
+ void fill_grp_members(map<int, vector<int> >&) const;
+};
+
+// Strict ordering of samples: by population index first, by name within a
+// population.
+inline
+bool MetaPopInfo::Sample::operator<(const Sample& other) const {
+    return (pop != other.pop) ? (pop < other.pop) : (name < other.name);
+}
+
+#endif // METAPOPINFO_H
diff --git a/src/PopMap.h b/src/PopMap.h
index 495d42a..5172f2c 100644
--- a/src/PopMap.h
+++ b/src/PopMap.h
@@ -21,7 +21,9 @@
#ifndef __POPMAP_H__
#define __POPMAP_H__
-#include <string.h>
+#include <exception>
+using std::exception;
+#include <cstring>
#include <string>
using std::string;
#include <vector>
@@ -30,6 +32,8 @@ using std::vector;
using std::map;
#include <set>
using std::set;
+#include <numeric>
+using std::accumulate;
#include <algorithm>
#include <utility>
using std::pair;
@@ -38,6 +42,8 @@ using std::make_pair;
#include "stacks.h"
#include "locus.h"
#include "aln_utils.h"
+#include "MetaPopInfo.h"
+#include "Vcf.h"
class Datum {
public:
@@ -53,8 +59,9 @@ public:
char *cigar; // CIGAR string describing how the datum aligns to the catalog locus.
double lnl; // Log likelihood of this locus.
vector<char *> obshap; // Observed Haplotypes
- vector<SNP *> snps;
+ vector<SNP *> snps; // All calls for this sample for this locus. size() is [len].
Datum() {
+ this->id = -1;
this->corrected = false;
this->gtype = NULL;
this->trans_gtype = NULL;
@@ -97,57 +104,69 @@ public:
template<class LocusT=Locus>
class PopMap {
- set<pair<int, int> > blacklist;
+ const MetaPopInfo& metapopinfo;
int num_loci;
- int num_samples;
+ set<pair<int, int> > blacklist;
Datum ***data;
map<int, int> locus_order; // LocusID => ArrayIndex; map catalog IDs to their first dimension
// position in the Datum array.
map<int, int> rev_locus_order;
- map<int, int> sample_order; // SampleID => ArrayIndex; map defining at what position in
- // the second dimension of the datum array each sample is stored.
- map<int, int> rev_sample_order;
public:
map<string, vector<LocusT *> > ordered_loci; // Loci ordered by genomic position
- PopMap(int, int);
+ PopMap(const MetaPopInfo& mpopi, int n_loci);
~PopMap();
- int populate(vector<int> &, map<int, LocusT*> &, vector<vector<CatMatch *> > &);
- int order_loci(map<int, LocusT*> &);
- int prune(set<int> &);
+ // Populates the PopMap based on sstack matches files.
+ // The catalog is modified (LocusT must be CSLocus, and
+ // members [cnt, hcnt, confounded_cnt] are modified).
+ int populate(map<int, LocusT*>& catalog, const vector<vector<CatMatch *> >& matches);
+
+ // Populates the PopMap based on VCF (SNP) records.
+ // The catalog is modified (LocusT must be CSLocus, and
+ // members [cnt, hcnt] are modified).
+ // N.B. The IDs of the loci in the catalog MUST be the same
+ // as the indexes in the records vector.
+ int populate(map<int, LocusT*>& catalog, const vector<VcfRecord>& records, const VcfHeader& header);
+
+ int order_loci(const map<int, LocusT*>& catalog);
+ int prune(set<int>& loc_ids);
- int loci_cnt() { return this->num_loci; }
- int rev_locus_index(int index) { if (this->rev_locus_order.count(index) == 0) return -1; return this->rev_locus_order[index]; }
- int sample_cnt() { return this->num_samples; }
- int sample_index(int index) { if (this->sample_order.count(index) == 0) return -1; return this->sample_order[index]; }
- int rev_sample_index(int index) { if (this->rev_sample_order.count(index) == 0) return -1; return this->rev_sample_order[index]; }
+ int loci_cnt() const { return this->num_loci; }
+ int locus_index(int id) const {return locus_order.at(id);}
+ int rev_locus_index(int index) const {try {return rev_locus_order.at(index);} catch(exception&) {return -1;}}
- Datum **locus(int);
- Datum *datum(int, int);
- bool blacklisted(int, int);
+ int sample_cnt() const { return metapopinfo.samples().size(); }
+ int sample_index(int id) const {try {return metapopinfo.get_sample_index(id);} catch (exception&) {return -1;}}
+ int rev_sample_index(int index) const {return metapopinfo.samples().at(index).id;}
+
+ Datum **locus(int id);
+ Datum *datum(int loc_id, int sample_id);
+
+ bool blacklisted(int loc_id, int sample_id);
};
template<class LocusT>
-PopMap<LocusT>::PopMap(int num_samples, int num_loci) {
+PopMap<LocusT>::PopMap(const MetaPopInfo& mpopi, int num_loci)
+: metapopinfo(mpopi)
+{
this->data = new Datum **[num_loci];
for (int i = 0; i < num_loci; i++) {
- this->data[i] = new Datum *[num_samples];
+ this->data[i] = new Datum *[metapopinfo.samples().size()];
- for (int j = 0; j < num_samples; j++)
+ for (size_t j = 0; j < metapopinfo.samples().size(); j++)
this->data[i][j] = NULL;
}
- this->num_samples = num_samples;
this->num_loci = num_loci;
}
template<class LocusT>
PopMap<LocusT>::~PopMap() {
for (int i = 0; i < this->num_loci; i++) {
- for (int j = 0; j < this->num_samples; j++)
+ for (int j = 0; j < metapopinfo.samples().size(); j++)
delete this->data[i][j];
delete [] this->data[i];
}
@@ -155,22 +174,13 @@ PopMap<LocusT>::~PopMap() {
}
template<class LocusT>
-int PopMap<LocusT>::populate(vector<int> &sample_ids,
- map<int, LocusT*> &catalog,
- vector<vector<CatMatch *> > &matches) {
- //
- // Record the array position of each sample that we will load.
- //
- for (uint i = 0; i < sample_ids.size(); i++) {
- this->sample_order[sample_ids[i]] = i;
- this->rev_sample_order[i] = sample_ids[i];
- }
-
+int PopMap<LocusT>::populate(map<int, LocusT*> &catalog,
+ const vector<vector<CatMatch *> > &matches) {
//
// Create an index showing what position each catalog locus is stored at in the datum
// array. Create a second index allowing ordering of Loci by genomic position.
//
- typename std::map<int, LocusT*>::iterator it;
+ typename std::map<int, LocusT*>::const_iterator it;
uint i = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
this->locus_order[it->first] = i;
@@ -191,7 +201,7 @@ int PopMap<LocusT>::populate(vector<int> &sample_ids,
for (i = 0; i < matches.size(); i++) {
for (uint j = 0; j < matches[i].size(); j++) {
- sample = this->sample_order[matches[i][j]->sample_id];
+ sample = metapopinfo.get_sample_index(matches[i][j]->sample_id);
if (this->locus_order.count(matches[i][j]->cat_id) == 0)
continue;
@@ -260,11 +270,154 @@ int PopMap<LocusT>::populate(vector<int> &sample_ids,
}
template<class LocusT>
-int PopMap<LocusT>::order_loci(map<int, LocusT*> &catalog)
+int PopMap<LocusT>::populate(map<int, LocusT*>& catalog,
+                             const vector<VcfRecord>& records,
+                             const VcfHeader& header) {
+
+    // Populates the PopMap from VCF records: one Datum is created for each
+    // (locus, sample) pair that has a complete genotype, and the [cnt] and
+    // [hcnt] members of the catalog loci are incremented accordingly.
+    // Always returns 0.
+
+    // Initialize [locus_order], [rev_locus_order].
+    size_t loc_index = 0;
+    for (typename map<int, LocusT*>::iterator
+            l = catalog.begin();
+            l != catalog.end();
+            ++l) {
+        locus_order[l->first] = loc_index;
+        rev_locus_order[loc_index] = l->first;
+        ++loc_index;
+    }
+
+    // Initialize [ordered_loci].
+    order_loci(catalog);
+
+    /*
+     * Fill the PopMap.
+     *
+     * We observe the following rules to create the Datums :
+     * [id] is the locus id.
+     * [len] is the locus length (expected to be one, for a SNP)
+     * [model] "E" or "O" according to the SAMPLE/GT field (heterozygous
+     *     or homozygous genotype, respectively).
+     * [obshap] the nucleotide(s) observed for this SNP for this individual.
+     * If one of a sample's VCF alleles is missing ('.'), has an index that
+     * does not exist in the record, or has an index corresponding to the
+     * special '*' allele, the Datum* is left to NULL.
+     *
+     * [depth] and [tot_depth] are taken from the AD subfield when the record
+     * has one. When no depth information is available, or when the AD string
+     * is badly formatted, [tot_depth] and the [depths] of all alleles are
+     * set to 0.
+     *
+     * When no likelihood information is available, [lnl] is set to 0.
+     * (n.b. the parsing of likelihood information in VCF is not implemented
+     * as of Mar 21, 2016.)
+     *
+     * The following members are left at the values given by the Datum
+     * constructor, on the premise that "populations" does not use them :
+     * corrected, genotype, trans_genotype
+     *
+     * [merge_partner] is set later by [merge_datums()] (in populations.cc).
+     * [snps] is only used by [write_vcf()] and [write_vcf_strict()], which
+     * first call [populate_snp_calls()] then use the SNP data in the
+     * datum.
+     */
+
+    loc_index = 0;
+    for (typename map<int, LocusT*>::iterator
+            l = catalog.begin();
+            l != catalog.end();
+            ++l) {
+        LocusT* loc = l->second;
+
+        const VcfRecord& rec = records[loc->id]; // n.b. assumes locus ID == record index.
+        int ad_index;
+        try {
+            ad_index = rec.index_of_gt_subfield("AD");
+        } catch (exception& e) {
+            // No AD subfield in this record.
+            ad_index = -1;
+        }
+
+        for (size_t s = 0; s < metapopinfo.samples().size(); ++s) {
+            size_t vcf_index = header.sample_indexes().at(metapopinfo.samples()[s].name);
+            const string& sample = rec.samples.at(vcf_index);
+
+            pair<int, int> gt = rec.parse_genotype(sample);
+            if (gt.first < 0
+                    || gt.second < 0
+                    || size_t(gt.first) >= rec.alleles.size()
+                    || size_t(gt.second) >= rec.alleles.size()
+                    || rec.alleles[gt.first]=="*"
+                    || rec.alleles[gt.second]=="*")
+                // Missing, incomplete or out-of-range genotype.
+                continue;
+
+            // Allele depths: one non-negative integer per allele of the record.
+            vector<int> ad;
+            if (ad_index != -1) {
+                string ad_str = rec.parse_gt_subfield(sample, ad_index);
+                size_t start = 0;
+                size_t coma;
+                try {
+                    while ((coma = ad_str.find(',', start)) != string::npos) {
+                        // n.b. The second argument of substr() is a length.
+                        ad.push_back(std::stoi(ad_str.substr(start, coma - start)));
+                        if (ad.back() < 0)
+                            throw exception();
+                        start = coma + 1;
+                    }
+                    ad.push_back(std::stoi(ad_str.substr(start)));
+                    if (ad.back() < 0 || ad.size() != rec.alleles.size())
+                        throw exception();
+                } catch (exception& e) {
+                    // Also reached when std::stoi() throws.
+                    cerr << "Warning: Badly formatted AD string '" << ad_str
+                         << "' at VCF record '" << rec.chrom << ":" << rec.pos << "'.\n";
+                    ad = vector<int>(rec.alleles.size(), 0);
+                }
+            }
+
+            Datum* d = new Datum();
+            data[loc_index][s] = d;
+            ++loc->cnt;
+            ++loc->hcnt;
+
+            // id, len, lnl
+            d->id = loc->id;
+            d->len = loc->len;
+            d->lnl = 0;
+
+            // model, obshap, depth
+            d->model = new char[2];
+            if (gt.first == gt.second) {
+                strcpy(d->model, "O");
+                const string& allele = rec.alleles[gt.first];
+                d->obshap.push_back(new char[allele.size()+1]);
+                strcpy(d->obshap[0], allele.c_str());
+                if (ad_index != -1)
+                    d->depth = { ad[gt.first] };
+                else
+                    d->depth = {0};
+            } else {
+                strcpy(d->model, "E");
+                const string& allele1 = rec.alleles[gt.first];
+                const string& allele2 = rec.alleles[gt.second];
+                d->obshap.push_back(new char[allele1.size()+1]);
+                d->obshap.push_back(new char[allele2.size()+1]);
+                strcpy(d->obshap[0], allele1.c_str());
+                strcpy(d->obshap[1], allele2.c_str());
+                if (ad_index != -1)
+                    d->depth = {ad[gt.first], ad[gt.second]};
+                else
+                    d->depth = {0, 0};
+            }
+
+            // tot_depth
+            d->tot_depth = std::accumulate(d->depth.begin(), d->depth.end(), 0);
+        }
+        ++loc_index;
+    }
+
+    return 0;
+}
+
+template<class LocusT>
+int PopMap<LocusT>::order_loci(const map<int, LocusT*> &catalog)
{
this->ordered_loci.clear();
- typename std::map<int, LocusT*>::iterator it;
+ typename std::map<int, LocusT*>::const_iterator it;
for (it = catalog.begin(); it != catalog.end(); it++) {
if (strlen(it->second->loc.chr) > 0)
@@ -294,20 +447,16 @@ int PopMap<LocusT>::prune(set<int> &remove_ids) {
loc_id = this->rev_locus_order[i];
- //
- // Keep this locus.
- //
if (remove_ids.count(loc_id) == 0) {
+ // Keep this locus.
d[j] = this->data[i];
new_loc_order[loc_id] = j;
new_rev_loc_order[j] = loc_id;
j++;
} else {
- //
// Remove this locus.
- //
- for (int k = 0; k < this->num_samples; k++)
+ for (size_t k = 0; k < metapopinfo.samples().size(); k++)
delete this->data[i][k];
delete [] this->data[i];
}
@@ -345,13 +494,13 @@ int PopMap<LocusT>::prune(set<int> &remove_ids) {
}
template<class LocusT>
-Datum **PopMap<LocusT>::locus(int locus) {
- return this->data[this->locus_order[locus]];
+Datum **PopMap<LocusT>::locus(int id) {
+ return this->data[this->locus_order[id]];
}
template<class LocusT>
-Datum *PopMap<LocusT>::datum(int locus, int sample) {
- return this->data[this->locus_order[locus]][this->sample_order[sample]];
+Datum *PopMap<LocusT>::datum(int loc_id, int sample_id) {
+ return this->data[this->locus_order[loc_id]][metapopinfo.get_sample_index(sample_id)];
}
template<class LocusT>
diff --git a/src/PopSum.h b/src/PopSum.h
index 9b53c3e..d8d491f 100644
--- a/src/PopSum.h
+++ b/src/PopSum.h
@@ -38,10 +38,12 @@ using std::make_pair;
#include <math.h>
#include "stacks.h"
+#include "locus.h"
+#include "PopMap.h"
+#include "MetaPopInfo.h"
extern bool log_fst_comp;
extern double minor_allele_freq;
-extern map<int, string> pop_key;
const uint PopStatSize = 5;
class PopStat {
@@ -84,6 +86,7 @@ public:
HapStat(): PopStat() {
comp = NULL;
+ popcnt = uint(-1);
}
~HapStat() {
if (this->comp != NULL)
@@ -249,36 +252,29 @@ public:
//
template<class LocusT=Locus>
class PopSum {
- int num_loci;
- int num_pops;
+ const PopMap<LocusT>& popmap;
+ const MetaPopInfo& metapopinfo;
LocSum ***data;
LocTally **loc_tally;
- map<int, int> locus_order; // LocusID => ArrayIndex; map catalog IDs to their first dimension
- // position in the LocSum array.
- map<int, int> rev_locus_order;
- map<int, int> pop_order; // PopulationID => ArrayIndex; map defining at what position in
- // the second dimension of the LocSum array each population is stored.
- map<int, int> rev_pop_order;
- map<int, int> pop_sizes; // The maximum size of each separate population.
public:
- PopSum(int, int);
+ PopSum(const PopMap<LocusT>& pmap, const MetaPopInfo& mpopi);
~PopSum();
int initialize(PopMap<LocusT> *);
- int add_population(map<int, LocusT *> &, PopMap<LocusT> *, uint, uint, uint, bool, ofstream &);
+ int add_population(map<int, LocusT *> &, PopMap<LocusT> *, size_t, bool, ofstream &);
int tally(map<int, LocusT *> &);
- int loci_cnt() { return this->num_loci; }
- int rev_locus_index(int index) { return this->rev_locus_order[index]; }
- int pop_cnt() { return this->num_pops; }
- int pop_index(int index) { return this->pop_order[index]; }
- int rev_pop_index(int index) { return this->rev_pop_order[index]; }
- int pop_size(int pop_id) { return this->pop_sizes[pop_id]; }
+ int loci_cnt() const {return popmap.loci_cnt();}
+ int rev_locus_index(int index) const {return popmap.rev_locus_index(index);}
+ int pop_cnt() const {return metapopinfo.pops().size();}
+ int pop_size(size_t pop_index) const {
+ return metapopinfo.pops().at(pop_index).last_sample - metapopinfo.pops().at(pop_index).first_sample + 1;
+ }
- LocSum **locus(int);
- LocSum *pop(int, int);
- LocTally *locus_tally(int);
+ LocSum **locus(int locus_id);
+ LocSum *pop(int locus_id, size_t pop_index);
+ LocTally *locus_tally(int locus_id);
PopPair *Fst(int, int, int, int);
int fishers_exact_test(PopPair *, double, double, double, double);
@@ -292,25 +288,24 @@ private:
};
template<class LocusT>
-PopSum<LocusT>::PopSum(int num_loci, int num_populations) {
- this->loc_tally = new LocTally *[num_loci];
- this->data = new LocSum **[num_loci];
+PopSum<LocusT>::PopSum(const PopMap<LocusT>& pmap, const MetaPopInfo& mpopi)
+: popmap(pmap), metapopinfo(mpopi)
+{
+ this->loc_tally = new LocTally*[popmap.loci_cnt()];
+ this->data = new LocSum**[popmap.loci_cnt()];
- for (int i = 0; i < num_loci; i++) {
- this->data[i] = new LocSum *[num_populations];
+ for (int i = 0; i < popmap.loci_cnt(); i++) {
+ this->data[i] = new LocSum *[metapopinfo.pops().size()];
- for (int j = 0; j < num_populations; j++)
+ for (size_t j = 0; j < metapopinfo.pops().size(); j++)
this->data[i][j] = NULL;
}
-
- this->num_pops = num_populations;
- this->num_loci = num_loci;
}
template<class LocusT>
PopSum<LocusT>::~PopSum() {
- for (int i = 0; i < this->num_loci; i++) {
- for (int j = 0; j < this->num_pops; j++)
+ for (int i = 0; i < loci_cnt(); i++) {
+ for (int j = 0; j < pop_cnt(); j++)
delete this->data[i][j];
delete [] this->data[i];
delete this->loc_tally[i];
@@ -321,30 +316,21 @@ PopSum<LocusT>::~PopSum() {
template<class LocusT>
int PopSum<LocusT>::initialize(PopMap<LocusT> *pmap) {
- int locus_id;
-
- for (int i = 0; i < this->num_loci; i++) {
- locus_id = pmap->rev_locus_index(i);
- this->locus_order[locus_id] = i;
- this->rev_locus_order[i] = locus_id;
- }
-
return 0;
}
template<class LocusT>
int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
- PopMap<LocusT> *pmap,
- uint population_id,
- uint start_index, uint end_index,
- bool verbose, ofstream &log_fh) {
+ PopMap<LocusT> *pmap,
+ size_t pop_index,
+ bool verbose,
+ ofstream &log_fh) {
LocusT *loc;
Datum **d;
LocSum **s;
uint locus_id, len;
int res;
set<int> snp_cols;
-
int incompatible_loci = 0;
if (verbose)
@@ -352,20 +338,10 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
<< "#\n"
<< "# Level\tAction\tLocus ID\tChr\tBP\tColumn\tPopID\n#\n";
- //
- // Determine the index for this population
- //
- uint pop_index = this->pop_order.size() == 0 ? 0 : this->pop_order.size();
- this->pop_order[population_id] = pop_index;
- this->rev_pop_order[pop_index] = population_id;
-
- //
- // Record the maximal size of this population.
- //
- this->pop_sizes[population_id] = end_index - start_index + 1;
+ const MetaPopInfo::Pop& pop = metapopinfo.pops().at(pop_index);
- for (int i = 0; i < this->num_loci; i++) {
- locus_id = pmap->rev_locus_index(i);
+ for (int i = 0; i < loci_cnt(); i++) {
+ locus_id = rev_locus_index(i);
d = pmap->locus(locus_id);
s = this->locus(locus_id);
loc = catalog[locus_id];
@@ -379,8 +355,9 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
// Check if this locus has already been filtered and is NULL in all individuals.
//
bool filtered = true;
- for (uint k = start_index; k <= end_index; k++) {
- if (d[k] != NULL) filtered = false;
+ for (uint k = pop.first_sample; k <= pop.last_sample; k++) {
+ if (d[k] != NULL)
+ filtered = false;
}
if (filtered == true) {
for (uint k = 0; k < len; k++) {
@@ -394,8 +371,8 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
// calculate observed genotype frequencies, allele frequencies, and expected genotype frequencies.
//
for (uint k = 0; k < loc->snps.size(); k++) {
- res = this->tally_heterozygous_pos(loc, d, s[pop_index],
- loc->snps[k]->col, k, start_index, end_index);
+ res = this->tally_heterozygous_pos(loc, d, s[pop_index],
+ loc->snps[k]->col, k, pop.first_sample, pop.last_sample);
//
// If site is incompatible (too many alleles present), log it.
//
@@ -408,9 +385,9 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
<< "incompatible_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << loc->sort_bp(loc->snps[k]->col) << "\t"
+ << loc->sort_bp(loc->snps[k]->col) +1 << "\t"
<< loc->snps[k]->col << "\t"
- << pop_key[population_id] << "\n";
+ << pop.name << "\n";
}
snp_cols.insert(loc->snps[k]->col);
@@ -420,15 +397,15 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
//
for (uint k = 0; k < len; k++) {
if (snp_cols.count(k)) continue;
- this->tally_fixed_pos(loc, d, s[pop_index],
- k, start_index, end_index);
+ this->tally_fixed_pos(loc, d, s[pop_index],
+ k, pop.first_sample, pop.last_sample);
}
snp_cols.clear();
}
- cerr << "Population '" << pop_key[population_id] << "' contained " << incompatible_loci << " incompatible loci -- more than two alleles present.\n";
- log_fh << "Population " << population_id << " contained " << incompatible_loci << " incompatible loci -- more than two alleles present.\n";
+ cerr << "Population '" << pop.name << "' contained " << incompatible_loci << " incompatible loci -- more than two alleles present.\n";
+ log_fh << "Population " << pop.name << " contained " << incompatible_loci << " incompatible loci -- more than two alleles present.\n";
return 0;
}
@@ -442,7 +419,7 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
int locus_id, variable_pop;
uint16_t p_cnt, q_cnt, len, col;
- for (int n = 0; n < this->num_loci; n++) {
+ for (int n = 0; n < loci_cnt(); n++) {
locus_id = this->rev_locus_index(n);
loc = catalog[locus_id];
s = this->locus(locus_id);
@@ -471,7 +448,7 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
if (ltally->nucs[col].allele_cnt > 1)
ltally->nucs[col].fixed = false;
- for (int j = 0; j < this->num_pops; j++) {
+ for (int j = 0; j < pop_cnt(); j++) {
//
// Sum the number of individuals examined at this locus across populations.
//
@@ -479,7 +456,7 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
ltally->nucs[col].pop_cnt += s[j]->nucs[col].num_indv > 0 ? 1 : 0;
}
- for (int j = 0; j < this->num_pops; j++) {
+ for (int j = 0; j < pop_cnt(); j++) {
//
// Sum the most frequent allele across populations.
//
@@ -499,6 +476,7 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
//
// We want to report the most frequent allele as the P allele. Reorder the alleles
// if necessary.
+ // XXX Possibly unstable for p_freq ~ 0.5. @Nick (July 2016)
//
if (ltally->nucs[col].p_freq < 0.5) {
char a = ltally->nucs[col].p_allele;
@@ -518,12 +496,12 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
variable_pop = -1;
if (p_cnt == 1 && q_cnt > 1) {
- for (int j = 0; j < this->num_pops; j++)
+ for (int j = 0; j < pop_cnt(); j++)
if (s[j]->nucs[col].p_nuc == ltally->nucs[col].p_allele ||
s[j]->nucs[col].q_nuc == ltally->nucs[col].p_allele)
variable_pop = j;
} else if (p_cnt > 1 && q_cnt == 1) {
- for (int j = 0; j < this->num_pops; j++)
+ for (int j = 0; j < pop_cnt(); j++)
if (s[j]->nucs[col].p_nuc == ltally->nucs[col].q_allele ||
s[j]->nucs[col].q_nuc == ltally->nucs[col].q_allele)
variable_pop = j;
@@ -548,7 +526,7 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
q_allele = 0;
allele_cnt = 0;
- for (int j = 0; j < this->num_pops; j++) {
+ for (int j = 0; j < pop_cnt(); j++) {
nuc[0] = 0;
nuc[1] = 0;
nuc[0] = s[j]->nucs[snp_index].p_nuc;
@@ -636,7 +614,7 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
p_cnt = 0;
q_cnt = 0;
- for (int j = 0; j < this->num_pops; j++) {
+ for (int j = 0; j < pop_cnt(); j++) {
nuc[0] = 0;
nuc[1] = 0;
nuc[0] = s[j]->nucs[snp_index].p_nuc;
@@ -653,10 +631,10 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
}
template<class LocusT>
-PopPair *PopSum<LocusT>::Fst(int locus, int pop_1, int pop_2, int pos)
+PopPair *PopSum<LocusT>::Fst(int locus_id, int pop_1, int pop_2, int pos)
{
- LocSum *s_1 = this->pop(locus, pop_1); /////// SLOW!
- LocSum *s_2 = this->pop(locus, pop_2);
+ LocSum *s_1 = pop(locus_id, pop_1); /////// SLOW!
+ LocSum *s_2 = pop(locus_id, pop_2);
PopPair *pair = new PopPair();
//
@@ -1346,21 +1324,21 @@ double PopSum<LocusT>::binomial_coeff(double n, double k)
}
template<class LocusT>
-LocSum **PopSum<LocusT>::locus(int locus)
+LocSum **PopSum<LocusT>::locus(int locus_id)
{
- return this->data[this->locus_order[locus]];
+ return this->data[popmap.locus_index(locus_id)];
}
template<class LocusT>
-LocSum *PopSum<LocusT>::pop(int locus, int pop_id)
+LocSum *PopSum<LocusT>::pop(int locus_id, size_t pop_index)
{
- return this->data[this->locus_order[locus]][this->pop_order[pop_id]];
+ return this->data[popmap.locus_index(locus_id)][pop_index];
}
template<class LocusT>
-LocTally *PopSum<LocusT>::locus_tally(int locus)
+LocTally *PopSum<LocusT>::locus_tally(int locus_id)
{
- return this->loc_tally[this->locus_order[locus]];
+ return this->loc_tally[popmap.locus_index(locus_id)];
}
#endif // __POPSUM_H__
diff --git a/src/SamI.h b/src/SamI.h
index 8bab0d1..74ab51c 100644
--- a/src/SamI.h
+++ b/src/SamI.h
@@ -106,7 +106,7 @@ Sam::next_seq()
bp--;
Seq *s = new Seq(parts[0].c_str(), parts[9].c_str(), parts[10].c_str(), // Read ID, Sequence, Quality
- parts[2].c_str(), bp, flag ? minus : plus); // Chr, BasePair, Strand
+ parts[2].c_str(), bp, flag ? strand_minus : strand_plus); // Chr, BasePair, Strand
if (cigar.size() > 0)
this->edit_gaps(cigar, s->seq);
@@ -138,7 +138,7 @@ Sam::parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar, bool o
// If aligned to the negative strand, sequence has been reverse complemented and
// CIGAR string should be interpreted in reverse.
//
- if (orientation == plus)
+ if (orientation == strand_plus)
cigar.push_back(make_pair(*q, dist));
else
cigar.insert(cigar.begin(), make_pair(*q, dist));
diff --git a/src/Tsv.h b/src/Tsv.h
index 435d97b..f94d94d 100644
--- a/src/Tsv.h
+++ b/src/Tsv.h
@@ -56,7 +56,7 @@ Seq *Tsv::next_seq() {
string id = parts[0] + "_" + parts[1];
- Seq *s = new Seq(id.c_str(), parts[2].c_str(), parts[3].c_str(), parts[0].c_str(), atoi(parts[1].c_str()), plus);
+ Seq *s = new Seq(id.c_str(), parts[2].c_str(), parts[3].c_str(), parts[0].c_str(), atoi(parts[1].c_str()), strand_plus);
return s;
}
diff --git a/src/Vcf.cc b/src/Vcf.cc
new file mode 100644
index 0000000..32e63c3
--- /dev/null
+++ b/src/Vcf.cc
@@ -0,0 +1,506 @@
+#include <algorithm>
+#include <ctime>
+#include <iomanip>
+#include <sstream>
+
+#include "Vcf.h"
+
+using namespace std;
+
+// Ready-made meta header entries for common INFO and FORMAT fields, keyed by
+// "<section>/<ID>". Each VcfMeta is built from the section name and the
+// matching "<ID=...>" descriptor string.
+const map<string, VcfMeta> VcfMeta::predefined = {
+ {"INFO/NS", VcfMeta("INFO","<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">")},
+ {"INFO/AF", VcfMeta("INFO","<ID=AF,Number=.,Type=Float,Description=\"Allele Frequency\">")},
+ {"FORMAT/GT", VcfMeta("FORMAT","<ID=GT,Number=1,Type=String,Description=\"Genotype\">")},
+ {"FORMAT/DP", VcfMeta("FORMAT","<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">")},
+ // NOTE(review): AD is declared with Number=1, but the PopMap.h code appears to
+ // index the AD values per allele (ad[gt.first], ad[gt.second]) -- confirm
+ // that a single value is really what is meant here.
+ {"FORMAT/AD", VcfMeta("FORMAT","<ID=AD,Number=1,Type=Integer,Description=\"Allele Depth\">")},
+ {"FORMAT/GL", VcfMeta("FORMAT","<ID=GL,Number=.,Type=Float,Description=\"Genotype Likelihood\">")},
+ };
+
+// Seed the header with its three leading meta entries: the given file format,
+// the current local date (fileDate) and the producing software (source).
+void VcfHeader::init_meta(const string& fileformat) {
+ add_meta(VcfMeta("fileformat", fileformat));
+
+ // fileDate is formatted as YYYYMMDD: eight characters plus the terminator.
+ const time_t now = time(NULL);
+ char today[9];
+ strftime(today, sizeof(today), "%Y%m%d", localtime(&now));
+ add_meta(VcfMeta("fileDate", today));
+
+ // The source value is the quoted program name and version.
+ const string source = string("\"Stacks v") + VERSION + "\"";
+ add_meta(VcfMeta("source", source));
+}
+
+// Record the path and put the parser in its initial state: empty header, line
+// counter at zero, eol_ set, eof_ cleared, no tab/bound positions,
+// sample_index_ at -1 and a zero-filled line buffer. Nothing is opened here.
+VcfAbstractParser::VcfAbstractParser(const string& path)
+: path_(path),
+ header_(),
+ line_number_(0),
+ eol_(true),
+ eof_(false),
+ tabs_(),
+ bounds_(),
+ sample_index_(-1)
+{
+ std::fill_n(line_, Vcf::line_buf_size, '\0');
+}
+
+// Read the header of the file into header_.
+//
+// Consumes every "##key=value" meta line (keys are upper-cased before being
+// stored; meta lines without '=' or too long for the line buffer are skipped),
+// then the "#CHROM..." column line, from which the sample names are taken.
+// A column line longer than the buffer is read in several passes.
+//
+// Prints a message and throws std::exception if the end of file is reached,
+// if a line does not start with '#', if the column line does not start with
+// Vcf::base_fields, or if it has FORMAT but no sample column.
+void
+VcfAbstractParser::read_header()
+{
+ auto malformed = [this] () {
+ cerr << "Error: Malformed header."
+ << " At line " << line_number_ << " in file '" << path_ << "'.\n";
+ throw exception();
+ };
+
+ while(true) {
+ getline(line_, Vcf::line_buf_size);
+ ++line_number_;
+ if(eof_)
+ malformed();
+
+ if (line_[0] != '#')
+ malformed();
+
+ if(line_[1] == '#') {
+ // Meta header line.
+
+ char* equal = strchr(line_, '=');
+ if(equal == NULL) {
+ // (Notice: Skipping missing '=')
+ tabs_.clear();
+ tabs_.push_back(line_+strlen(line_));
+ check_eol();
+ read_to_eol();
+ continue;
+ }
+ char* end = equal + strlen(equal);
+
+ // Check the length of the line (discard lines that don't fit in the buffer).
+ tabs_.clear();
+ tabs_.push_back(end);
+ check_eol();
+ if(!eol_) {
+ // (Notice: Skipping long line)
+ read_to_eol();
+ continue;
+ }
+
+ // Key is the text between "##" and '=', upper-cased; value is the rest.
+ string key = string(line_+2, equal);
+ transform(key.begin(), key.end(), key.begin(), ::toupper);
+ header_.add_meta(VcfMeta(key, string(equal+1, end)));
+
+ continue;
+ }
+ break;
+ }
+
+ // Final header line.
+
+ if(strncmp(line_, Vcf::base_fields.c_str(), Vcf::base_fields.length()) != 0)
+ malformed();
+
+ // Get tabs.
+ tabs_.clear();
+ tabs_.push_back(strchr(line_, '\t')); // At least one tab exists: the line starts with base_fields.
+ while (tabs_.back())
+ tabs_.push_back(strchr(tabs_.back()+1, '\t'));
+ tabs_.pop_back();
+ tabs_.push_back(tabs_.back() + strlen(tabs_.back())); // final '\0'
+ check_eol();
+
+ // Check the number of tabs.
+ // (tabs_ holds every tab plus the final '\0', i.e. one entry per field.)
+ if(!(tabs_.size() == Vcf::base_fields_no or tabs_.size() >= Vcf::base_fields_no+2))
+ malformed();
+
+ // Check windows line endings.
+ if(*(tabs_.back()-1) == '\r') {
+ tabs_.back() = tabs_.back()-1;
+ *tabs_.back() = '\0';
+ }
+
+ // Parse sample names.
+ if(tabs_.size() >= Vcf::base_fields_no+2) {
+ // Every sample but the last one in the buffer, which may be truncated.
+ for(size_t i=Vcf::first_sample-1; i < tabs_.size()-2; ++i)
+ header_.add_sample(string(tabs_[i]+1, tabs_[i+1]));
+
+ while(!eol_){
+ // Buffer wasn't long enough : parse until eol is found.
+ // Copy the truncated name at the beginning of the buffer.
+ size_t last_field_len = tabs_.back() - *(tabs_.end()-2);
+ *line_ = '\t';
+ // Source and destination are in the same buffer and may overlap, so
+ // use memmove (strcpy on overlapping ranges is undefined behavior).
+ char* last_field = *(tabs_.end()-2)+1;
+ memmove(line_+1, last_field, strlen(last_field)+1);
+
+ getline(line_+last_field_len, Vcf::line_buf_size-last_field_len);
+ if(eof_)
+ malformed();
+
+ // Get tabs.
+ tabs_.clear();
+ tabs_.push_back(line_);
+ while (tabs_.back())
+ tabs_.push_back(strchr(tabs_.back()+1, '\t'));
+ tabs_.pop_back();
+ tabs_.push_back(tabs_.back() + strlen(tabs_.back()));
+ check_eol();
+
+ // Check windows line endings.
+ if(*(tabs_.back()-1) == '\r') {
+ tabs_.back() = tabs_.back()-1;
+ *tabs_.back() = '\0';
+ }
+
+ for(size_t i = 0; i<tabs_.size()-2;++i)
+ header_.add_sample(string(tabs_[i]+1, tabs_[i+1]));
+ }
+ // The last sample of the line, now known to be complete.
+ header_.add_sample(string(*(tabs_.end()-2)+1, tabs_.back()));
+ }
+
+ return;
+}
+
+#ifdef HAVE_LIBZ
+// Open the file at `path` for reading through zlib. With zlib 1.2.4 or later
+// the stream's internal buffer size is set to libz_buffer_size.
+VcfGzParser::VcfGzParser(const string& path)
+: VcfAbstractParser(path), file_(gzopen(path_.c_str(), "rb"))
+{
+#if ZLIB_VERNUM >= 0x1240
+ gzbuffer(file_, libz_buffer_size);
+#endif
+}
+
+#endif // HAVE_LIBZ
+
+// Open a VCF file with the parser that matches its suffix: VcfParser for
+// ".vcf" and, when built with zlib (HAVE_LIBZ), VcfGzParser for ".vcf.gz".
+//
+// Returns a parser allocated with new, which the caller must delete, or NULL
+// if the parser reports fail() after construction. Prints a message and
+// throws std::exception when the suffix is not one of the supported ones.
+VcfAbstractParser*
+Vcf::adaptive_open(const string& path)
+{
+ VcfAbstractParser* parser = NULL;
+ if (path.length() >= 4 && path.substr(path.length()-4) == ".vcf") {
+ parser = new VcfParser(path);
+#ifdef HAVE_LIBZ
+ } else if (path.length() >= 7 && path.substr(path.length()-7) == ".vcf.gz") {
+ parser = new VcfGzParser(path);
+#endif
+ } else {
+ cerr << "Error: File '" << path << "' : expected '.vcf(.gz)' suffix.\n";
+ throw exception();
+ }
+
+ // Opening failed: release the parser and report the failure as NULL.
+ if (parser->fail()) {
+ delete parser;
+ parser = NULL;
+ }
+
+ return parser;
+}
+
+// Read the next record line of the file into `record`.
+//
+// Returns false when the end of file is reached (including in the middle of
+// an over-long record), true otherwise. A true return does not guarantee a
+// usable record: lines that are skipped (fixed fields too long for the buffer,
+// missing CHROM, POS, REF or ALT value, malformed ALT) return true right after
+// record.clear(), and the inline comments state that record.type is then still
+// null. Symbolic and breakend records are returned with their type set but
+// without QUAL, FILTER, INFO, FORMAT or sample data.
+//
+// Prints a message and throws std::exception if a line that fits in the
+// buffer has too few fields, or if the number of sample fields differs from
+// the number of samples in the header. The POS conversion uses stoi, which
+// throws on text that is not a number.
+bool
+VcfAbstractParser::next_record(VcfRecord& record)
+{
+ getline(line_, Vcf::line_buf_size);
+ if(eof_)
+ return false;
+ ++line_number_;
+
+ record.clear();
+
+ // Get tabs.
+ // (tabs_ receives every tab of the line followed by the final '\0', so that
+ // field i spans tabs_[i-1]+1 .. tabs_[i], field 0 starting at line_.)
+ tabs_.clear();
+ tabs_.push_back(strchr(line_, '\t'));
+ while(tabs_.back())
+ tabs_.push_back(strchr(tabs_.back()+1, '\t'));
+ tabs_.pop_back();
+ if(tabs_.size() > 0)
+ tabs_.push_back(tabs_.back() + strlen(tabs_.back()));
+ else
+ tabs_.push_back(line_ + strlen(line_));
+ check_eol();
+
+ /* Check the number of fields (should be ==8 or >=10 depending on the
+ * presence of samples) :
+ * If the line fits in the buffer but the number of fields is wrong,
+ * raise an error.
+ * If the fixed fields don't fit in the buffer, return a null record. */
+ if(eol_) {
+ if(tabs_.size() < Vcf::base_fields_no + (header_.samples().empty() ? 0 : 2)) {
+ cerr << "Error: malformed VCF record line (at least "
+ << Vcf::base_fields_no + (header_.samples().empty() ? 0 : 2)
+ << " fields required).\nLine " << line_number_ << " in file '" << path_ << "'.\n";
+ throw exception ();
+ }
+ } else {
+ if(header_.samples().empty() || tabs_.size() < Vcf::base_fields_no + 2) {
+ cerr << "Warning: In file '" << path_ << "': skipping the very long record at line "
+ << line_number_ << ".\n";
+ read_to_eol();
+ return true;
+ }
+ }
+
+ // Check windows line endings.
+ if(*(tabs_.back()-1) == '\r') {
+ tabs_.back() = tabs_.back()-1;
+ *tabs_.back() = '\0';
+ }
+
+ // Separate all substrings (replace \t's with \0's).
+ for(size_t i = 0; i<tabs_.size()-1;++i)
+ *tabs_[i] = '\0';
+
+ // CHROM
+ if(line_ == tabs_[Vcf::chrom]) {
+ cerr << "Warning: Skipping malformed VCF record (missing CHROM value)."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ // Skip this record.
+ // ([record.type] is still [null].)
+ return true;
+ }
+ record.chrom.assign(line_, tabs_[Vcf::chrom]);
+
+ // POS
+ if(tabs_[Vcf::pos-1]+1 == tabs_[Vcf::pos]
+ || *(tabs_[Vcf::pos-1]+1) == '.' ){
+ cerr << "Warning: Skipping malformed VCF record (missing POS value)."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ // Skip this record.
+ // ([record.type] is still [null].)
+ return true;
+ }
+ record.pos = stoi(tabs_[Vcf::pos-1]+1) -1 ; // VCF is 1-based
+
+ // ID
+ if(tabs_[Vcf::id-1]+1 == tabs_[Vcf::id])
+ cerr << "Notice: Empty ID field should be marked by a dot."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ if(*(tabs_[Vcf::id-1]+1) != '.')
+ record.id.assign(tabs_[Vcf::id-1]+1, tabs_[Vcf::id]);
+
+ // REF
+ if(tabs_[Vcf::ref-1]+1 == tabs_[Vcf::ref]
+ || *(tabs_[Vcf::ref-1]+1) == '.' ) {
+ cerr << "Warning: malformed VCF record (missing REF value)."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ // Skip this record.
+ // ([record.type] is still [null].)
+ return true;
+ }
+ record.alleles.push_back(string(tabs_[Vcf::ref-1]+1, tabs_[Vcf::ref]));
+
+ // ALT & determine the type of the record
+ if(tabs_[Vcf::alt-1]+1 == tabs_[Vcf::alt]) {
+ cerr << "Warning: Skipping malformed VCF record (expected ALT field to be marked by a dot)."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ // Skip this record.
+ // ([record.type] is still [null].)
+ return true;
+ } else if(*(tabs_[Vcf::alt-1]+1) == '.') {
+ record.type = Vcf::RType::invariant;
+ } else {
+ get_bounds(bounds_, tabs_[Vcf::alt-1], tabs_[Vcf::alt], ',');
+ for (size_t i = 0; i < bounds_.size()-1; ++i ) {
+ record.alleles.push_back(string(bounds_.at(i)+1, bounds_.at(i+1)));
+ if (record.alleles.back().empty()) {
+ record.alleles.pop_back();
+ cerr << "Warning: Skipping malformed VCF record (malformed ALT field)."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ // Skip this record.
+ // ([record.type] is still [null].)
+ return true;
+ }
+ }
+
+ if (strchr(tabs_[Vcf::alt-1]+1, '[') || strchr(tabs_[Vcf::alt-1]+1, ']'))
+ record.type = Vcf::RType::breakend;
+ else if (strchr(tabs_[Vcf::alt-1]+1, '<'))
+ record.type = Vcf::RType::symbolic;
+ else
+ record.type = Vcf::RType::expl;
+ }
+
+ // Do not parse symbolic & breakend records.
+ if (record.type == Vcf::RType::symbolic || record.type == Vcf::RType::breakend) {
+ read_to_eol();
+ return true;
+ }
+
+ // QUAL
+ if(tabs_[Vcf::qual-1]+1 == tabs_[Vcf::qual])
+ cerr << "Notice: Empty QUAL field should be marked by a dot."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ if(*(tabs_[Vcf::qual-1]+1) != '.')
+ record.qual.assign(tabs_[Vcf::qual-1]+1, tabs_[Vcf::qual]);
+
+ // FILTER
+ if(tabs_[Vcf::filter-1]+1 == tabs_[Vcf::filter])
+ cerr << "Notice: Empty FILTER field should be marked by a dot."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ if(*(tabs_[Vcf::filter-1]+1) != '.') {
+ get_bounds(bounds_, tabs_[Vcf::filter-1], tabs_[Vcf::filter],';');
+ for(size_t i = 0; i < bounds_.size()-1; ++i )
+ record.filter.push_back(string(bounds_.at(i)+1, bounds_.at(i+1)));
+ }
+
+ // INFO
+ if(tabs_[Vcf::info-1]+1 == tabs_[Vcf::info])
+ cerr << "Notice: Empty INFO field should be marked by a dot."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+ if(*(tabs_[Vcf::info-1]+1) != '.') {
+ get_bounds(bounds_, tabs_[Vcf::info-1], tabs_[Vcf::info], ';');
+ for(size_t i = 0; i < bounds_.size()-1; ++i ) {
+ // The search for '=' may run into a later entry; such a hit is ignored
+ // and the entry is stored as a key with an empty value.
+ char* equal = strchr(bounds_[i]+1, '=');
+ if(equal == NULL or bounds_[i+1] - equal < 0)
+ record.info.push_back(pair<string,string>(string(bounds_[i]+1, bounds_[i+1]),string()));
+ else
+ record.info.push_back(pair<string,string>(string(bounds_[i]+1, equal),string(equal+1,bounds_[i+1])));
+ }
+ }
+
+ // FORMAT
+ if (!header_.samples().empty() && *(tabs_[Vcf::format-1]+1) != '.') {
+ if(tabs_[Vcf::format-1]+1 == tabs_[Vcf::format])
+ cerr << "Notice: Empty FORMAT field should be marked by a dot."
+ << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+
+ get_bounds(bounds_, tabs_[Vcf::format-1], tabs_[Vcf::format],':');
+ for(size_t i = 0; i < bounds_.size()-1; ++i )
+ record.format.push_back(string(bounds_.at(i)+1, bounds_.at(i+1)));
+
+ // SAMPLES
+ sample_index_ = 0;
+ for (size_t s = Vcf::base_fields_no; s < tabs_.size()-2; ++s) //n.b. -2
+ add_sample(record, tabs_[s], tabs_[s+1]);
+
+ // If the record line was not read entirely, copy the truncated sample field
+ // to the beginning of the buffer and read some more.
+ while (!eol_) {
+ size_t lastfieldlen = tabs_.back() - *(tabs_.end()-2);
+ *line_ = '\t';
+ // Source and destination are in the same buffer and may overlap, so
+ // use memmove (strcpy on overlapping ranges is undefined behavior).
+ char* lastfield = *(tabs_.end()-2)+1;
+ memmove(line_+1, lastfield, strlen(lastfield)+1);
+
+ getline(line_+lastfieldlen, Vcf::line_buf_size-lastfieldlen);
+ if(eof_) {
+ cerr << "Warning: Encountered end of file while reading a record.\n";
+ record.type = Vcf::RType::null;
+ return false;
+ }
+
+ // Get tabs.
+ tabs_.clear();
+ tabs_.push_back(line_);
+ while (tabs_.back())
+ tabs_.push_back(strchr(tabs_.back()+1, '\t'));
+ tabs_.pop_back();
+ tabs_.push_back(tabs_.back() + strlen(tabs_.back()));
+ check_eol();
+
+ // Windows line endings.
+ if(*(tabs_.back()-1) == '\r') {
+ tabs_.back() = tabs_.back()-1;
+ *tabs_.back() = '\0';
+ }
+ for(size_t i = 0; i<tabs_.size()-2;++i) {
+ *tabs_[i+1] = '\0';
+ add_sample(record, tabs_[i], tabs_[i+1]);
+ }
+ }
+ //the actual last sample
+ add_sample(record, *(tabs_.end()-2), tabs_.back());
+
+ if(sample_index_ != header_.samples().size()) {
+ cerr << "Error: malformed VCF record ("
+ << header_.samples().size() << " SAMPLE fields expected, "
+ << sample_index_ << " found). File '" << path_ << "', line "
+ << line_number_ << ".\n";
+ throw exception();
+ }
+ }
+
+ return true;
+}
+
+// Write the header block: one "##key=value" line per meta entry, then the
+// column-title line. The FORMAT column and one column per sample name are
+// appended to the title line only when the header lists samples.
+void VcfWriter::write_header(const VcfHeader& header) {
+ // Meta lines.
+ for(const VcfMeta& entry : header.meta())
+ file_ << "##" << entry.key() << "=" << entry.value() << "\n";
+
+ // Column titles. Iterating an empty sample list writes nothing, so the
+ // sample columns can share the emptiness test with FORMAT.
+ file_ << Vcf::base_header;
+ const auto& sample_names = header.samples();
+ if(!sample_names.empty()) {
+ file_ << "\tFORMAT";
+ for(const string& name : sample_names)
+ file_ << "\t" << name;
+ }
+ file_ << "\n";
+}
+
+// Write one record as a VCF data line: tab-separated fields terminated by a
+// newline. Empty ID, QUAL, FILTER, INFO and FORMAT fields are written as ".".
+// FORMAT and the sample columns are written only when header `h` lists
+// samples.
+//
+// Throws std::exception if the record type is not RType::expl (other types
+// are not implemented), or if the number of sample fields in `r` differs from
+// the number of samples in `h`; in the latter case the fields up to FORMAT
+// have already been sent to the stream.
+void VcfWriter::write_record(const VcfRecord& r, const VcfHeader& h) {
+
+ if (r.type != Vcf::RType::expl)
+ // This is not implemented.
+ throw exception();
+
+ // NOTE(review): r.pos is written unchanged, whereas
+ // VcfAbstractParser::next_record() stores POS minus one; presumably callers
+ // supply a 1-based value here -- confirm.
+ file_ << r.chrom
+ << "\t" << r.pos
+ << "\t" << (r.id.empty() ? "." : r.id)
+ << "\t" << r.alleles.at(0);
+
+ //ALT
+ file_ << "\t";
+ if (r.alleles.size() == 1) {
+ file_ << ".";
+ } else {
+ auto allele = r.alleles.begin()+1;
+ file_ << *allele;
+ ++allele;
+ while(allele != r.alleles.end()) {
+ file_ << "," << *allele;
+ ++allele;
+ }
+ }
+
+ file_ << "\t" << (r.qual.empty() ? "." : r.qual);
+
+ //FILTER
+ file_ << "\t";
+ if (r.filter.empty()) {
+ // The separating tab was written just above; a second one here would
+ // insert an empty column and shift INFO and everything after it.
+ file_ << ".";
+ } else {
+ auto filter = r.filter.begin();
+ file_ << *filter;
+ ++filter;
+ while(filter != r.filter.end()) {
+ file_ << ";" << *filter;
+ ++filter;
+ }
+ }
+
+ //INFO
+ // NOTE(review): an entry whose value is empty is written as "KEY=";
+ // next_record() stores entries without '=' that way -- confirm whether
+ // such entries should be written without the '='.
+ file_ << "\t";
+ if (r.info.empty()) {
+ file_ << ".";
+ } else {
+ auto i = r.info.begin();
+ file_ << i->first << "=" << i->second;
+ ++i;
+ while(i != r.info.end()) {
+ file_ << ";" << i->first << "=" << i->second;
+ ++i;
+ }
+ }
+
+ if (not h.samples().empty()) {
+ //FORMAT
+ file_ << "\t";
+ if (r.format.empty()) {
+ file_ << ".";
+ } else {
+ auto f = r.format.begin();
+ file_ << *f;
+ ++f;
+ while(f != r.format.end()) {
+ file_ << ":" << *f;
+ ++f;
+ }
+ }
+
+ //SAMPLES
+ if (r.samples.size() != h.samples().size())
+ throw exception();
+
+ for (const string& s : r.samples)
+ file_ << "\t" << s;
+ }
+
+ file_ << "\n";
+}
diff --git a/src/Vcf.h b/src/Vcf.h
new file mode 100644
index 0000000..5c2eb35
--- /dev/null
+++ b/src/Vcf.h
@@ -0,0 +1,467 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2010-2015, Julian Catchen <jcatchen at uoregon.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef __VCF_H__
+#define __VCF_H__
+
+#include "config.h"
+
+#include <fstream>
+#include <iostream>
+#include <exception>
+#include <stdexcept>
+#include <utility>
+#include <cstring>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+
+#ifdef HAVE_LIBZ
+#include <zlib.h>
+#endif
+
+#include "constants.h"
+
+using std::size_t;
+using std::pair;
+using std::vector;
+using std::string;
+using std::set;
+using std::map;
+using std::ifstream;
+using std::ofstream;
+using std::cerr;
+
+using std::exception;
+using std::out_of_range;
+
+class VcfAbstractParser;
+class VcfHeader;
+
+namespace Vcf {
+
+const size_t base_fields_no = 8; // CHROM POS ID REF ALT QUAL FILTER INFO [FORMAT [SAMPLE ...]]
+const string base_fields = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
+const size_t chrom = 0; // Field indexes, numbered in the order of the fields listed above.
+const size_t pos = 1;
+const size_t id = 2;
+const size_t ref = 3;
+const size_t alt = 4;
+const size_t qual = 5;
+const size_t filter = 6;
+const size_t info = 7;
+const size_t format = 8;
+const size_t first_sample = 9;
+
+const string base_header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; // Same text as base_fields.
+
+// enum for record types
+enum class RType {
+ null,
+ expl, // Record with explicitly written alleles, e.g. SNPs, small indels or entirely defined haplotypes
+ invariant, // ALT is empty
+ symbolic, // ALT is e.g. "<DUP:TANDEM>"
+ breakend // ALT is e.g. "G]17:198982]"
+};
+
+// Constants for the parser.
+const size_t line_buf_size = 4096; // Size of the VcfAbstractParser::line_ buffer.
+
+// Open the given VCF file using VcfParser or VcfGzParser, depending on the
+// suffix of the file. Return NULL if the opening failed.
+// (The pointee is dynamically allocated and should be deleted.)
+VcfAbstractParser* adaptive_open(const string& path);
+
+} //namespace Vcf
+
+/*
+ * VcfRecord
+ * ==========
+ * Data structure to store VCF records.
+ *
+ * One object holds the fields of a single record. [pos] is a size_t that is
+ * set to -1 (i.e. the largest size_t value) by the constructor and by
+ * clear(); [type] is Vcf::RType::null in the same situations.
+ */
+struct VcfRecord {
+ string chrom; // required
+ size_t pos; // required
+ string id;
+ vector<string> alleles; // allele 0 is REF and is required ; case insensitive
+ string qual;
+ vector<string> filter;
+ vector<pair<string, string> > info;
+ vector<string> format;
+ vector<string> samples;
+
+ Vcf::RType type;
+
+ //map<string, size_t> allele_indexes;
+ //void refresh_allele_indexes();
+
+ VcfRecord()
+ : chrom(), pos(-1), id(), alleles(), qual(), filter(), info(), format(),
+ samples(), type(Vcf::RType::null)
+ {}
+
+ // Clears all the members.
+ inline void clear();
+
+ inline size_t index_of_gt_subfield(const string& key) const; // Throws out_of_range if [key] is not in [format].
+ inline string parse_gt_subfield(const string& sample, size_t index) const; // Returns "" if [sample] has fewer subfields.
+
+ // Returns (first allele, second allele), '-1' meaning no data.
+ inline pair<int, int> parse_genotype(const string& sample) const;
+
+ inline bool is_snp() const; // True for 'expl' records in which no allele is longer than one character.
+};
+
+/*
+ * VcfMeta
+ * ==========
+ * Represents one line of VCF metainformation.
+ *
+ * Holds a (key, value) pair of strings that is fixed at construction and
+ * exposed through read-only accessors.
+ */
+class VcfMeta {
+ string key_;
+ string value_;
+public:
+ VcfMeta(const string& k, const string& p) : key_(k), value_(p) {}
+
+ const string& key() const {return key_;}
+ const string& value() const {return value_;}
+
+ static const map<string, VcfMeta> predefined; // Only declared here; the definition is not in this header.
+};
+//class Info : public Meta {...};
+
+/*
+ * VcfHeader
+ * ==========
+ * Stores the contents of a VCF header.
+ *
+ * Meta lines and sample names are kept in the order in which they were
+ * added. [sample_indexes_] maps each sample name to its position in
+ * [samples_].
+ */
+class VcfHeader {
+ vector<string> samples_;
+ vector<VcfMeta> meta_;
+
+ map<string, size_t> sample_indexes_;
+ // map<string, vector<size_t> > meta_indexes;
+public:
+ VcfHeader() : samples_(), meta_() {}
+
+ const vector<VcfMeta>& meta() const {return meta_;}
+ const vector<string>& samples() const {return samples_;}
+ const map<string, size_t>& sample_indexes() const {return sample_indexes_;}
+
+ // Adds the meta lines VERSION, FILEDATE and SOURCE
+ void init_meta(const string& fileformat = "VCFv4.2");
+ void add_meta(const VcfMeta& m) {meta_.push_back(m);}
+ void add_sample(const string& s) {samples_.push_back(s); sample_indexes_.insert({s, samples_.size()-1});} // n.b. map::insert keeps the existing index if [s] was already present.
+};
+
+/*
+ * VcfAbstractParser
+ * ==========
+ *
+ * Main class for parsing VCF. The derived non-abstract classes
+ * VcfParser and VcfGzParser only add the file_ attribute and
+ * implement the getline(), eof() and check_eol() methods.
+ *
+ * At present, the parser does not handle :
+ * -- records for which the fixed fields (CHROM to FORMAT) do not
+ * fit in the buffer. Such records usually correspond to large
+ * indels with several alleles. In these cases, the record
+ * passed to next_record() will be empty and have the default
+ * type 'null'.
+ * -- records describing symbolic or breakends alleles. In these
+ * cases the record passed to next_record() will be of type
+ * 'symbolic' or 'breakend', with only the fields CHROM, POS,
+ * ID and REF filled in.
+ */
+class VcfAbstractParser {
+protected:
+ const string path_;
+ VcfHeader header_;
+
+ size_t line_number_;
+ char line_[Vcf::line_buf_size];
+ bool eol_; // Set by check_eol(). True if the buffer reaches the end of the currently parsed line.
+ bool eof_; // Set by getline(). True if EOF was reached.
+ vector<char*> tabs_; // Vector of pointers to {first_tab, second_tab, ..., \0 } in [line_].
+ vector<char*> bounds_; // Vector of pointers to {leading_tab, first_sep, second_sep, ..., (trailing \t or \0) } in [line_].
+ size_t sample_index_; // Index of the current (next) sample.
+
+ virtual void getline(char* ptr, size_t n) =0;
+ virtual void check_eol() =0; // rem. The implementation in gzparser relies on [tabs_] to access the end of the string in [line_].
+ inline void read_to_eol(); // Reads while [eol_] is false.
+
+ // Append the text lying between [tab1] and [tab2] to [record.samples] and increment [sample_index_].
+ inline void add_sample(VcfRecord& record, char* tab1, char* tab2);
+
+public:
+ VcfAbstractParser(const string& path);
+ virtual ~VcfAbstractParser() {}
+
+ // Assess the state of the underlying file.
+ virtual bool fail() =0;
+
+ // Getters.
+ const string& path() const {return path_;};
+ const VcfHeader& header() const {return header_;};
+ size_t line_number() const {return line_number_;}
+
+ // Parse the header.
+ void read_header();
+
+ // Read a record. Returns false on EOF, true otherwise.
+ bool next_record(VcfRecord& record);
+};
+
+/*
+ * VcfParser
+ * =========
+ * Implements VcfAbstractParser for plain text files.
+ *
+ * The file is opened by the constructor; call fail() to find out whether
+ * that succeeded. check_eol() sets [eol_] to the negation of the stream's
+ * fail state and then clears that state (istream::getline() sets failbit
+ * when the buffer fills up before a newline is found).
+ */
+class VcfParser : public VcfAbstractParser {
+ ifstream file_;
+ inline void getline(char* ptr, size_t n);
+ void check_eol() {eol_ = ! file_.fail(); file_.clear();}
+public:
+ VcfParser(const string& path) : VcfAbstractParser(path), file_(path) {}
+ bool fail() {return file_.fail();}
+};
+
+#ifdef HAVE_LIBZ
+/*
+ * VcfGzParser
+ * ==========
+ * Implements VcfAbstractParser for gzipped files.
+ *
+ * The object closes its gzFile handle in the destructor (when the handle
+ * is not NULL); copy construction and copy assignment are deleted.
+ * fail() reports a NULL handle.
+ */
+class VcfGzParser : public VcfAbstractParser {
+ gzFile file_;
+ inline void getline(char* ptr, size_t n);
+ inline void check_eol();
+
+ VcfGzParser(VcfGzParser& p) = delete; // No copy constructor.
+ VcfGzParser& operator=(VcfGzParser& p) = delete;
+public:
+ VcfGzParser(const string& path);
+ ~VcfGzParser() {if(file_) gzclose(file_);}
+ bool fail() {return file_ == NULL;}
+};
+#endif // HAVE_LIBZ
+
+/*
+ * VcfWriter
+ * ==========
+ * Writes a VCF header and VCF records to a plain text file.
+ *
+ * The output file is opened by the constructor; call fail() to check the
+ * state of the underlying stream.
+ */
+class VcfWriter {
+private:
+ const string path_;
+ ofstream file_;
+
+public:
+ VcfWriter(const string& path) : path_(path), file_(path) {}
+ bool fail() {return file_.fail();}
+
+ void write_header(const VcfHeader& h);
+ void write_record(const VcfRecord& r, const VcfHeader& h);
+};
+
+/*
+ * Inline methods.
+ * ==========
+ */
+
+// Fills [bounds] with {tab1, every occurrence of [sep] found after tab1, tab2}.
+// The search runs to the end of the C string, not to [tab2].
+// NOTE(review): presumably the caller has placed a NUL at [tab2] — confirm.
+inline void
+get_bounds(vector<char*>& bounds, char* tab1, char* tab2, char sep)
+{
+    bounds.clear();
+    bounds.push_back(tab1);
+
+    // The first search starts at tab1+1, which is past tab2 when tab1==tab2.
+    for (char* hit = strchr(tab1+1, sep); hit != NULL; hit = strchr(hit+1, sep))
+        bounds.push_back(hit);
+
+    bounds.push_back(tab2);
+}
+
+// Resets the record to the state produced by the default constructor.
+inline
+void VcfRecord::clear() {
+    // Scalars.
+    type = Vcf::RType::null;
+    pos = -1;
+
+    // Strings.
+    chrom.clear();
+    id.clear();
+    qual.clear();
+
+    // Vectors.
+    alleles.clear();
+    filter.clear();
+    info.clear();
+    format.clear();
+    samples.clear();
+}
+
+// Returns the position of [key] in the FORMAT field of the record.
+// Throws out_of_range (carrying the key) when the key is absent.
+inline
+size_t VcfRecord::index_of_gt_subfield(const string& key) const {
+    for (size_t idx = 0; idx < format.size(); ++idx)
+        if (format[idx] == key)
+            return idx;
+
+    throw out_of_range(key);
+}
+
+// Returns the [index]-th colon-separated subfield of [sample] (0-based), or
+// the empty string when the sample has fewer subfields than that.
+inline
+string VcfRecord::parse_gt_subfield(const string& sample, size_t index) const {
+    const char* cursor = sample.c_str();
+
+    // Step over the first [index] colons.
+    size_t skipped = 0;
+    while (skipped < index) {
+        const char* colon = strchr(cursor, ':');
+        if (colon == NULL)
+            // The requested field is not explicitly written.
+            return string();
+        cursor = colon + 1;
+        ++skipped;
+    }
+
+    // The subfield runs up to the next colon, or to the end of the string.
+    const char* stop = strchr(cursor, ':');
+    if (stop == NULL)
+        return string(cursor);
+    return string(cursor, stop);
+}
+
+// Parses the GT subfield of [sample] into a pair of allele indexes.
+//
+// Returns (-1,-1) when there is no usable genotype: FORMAT is empty or does
+// not start with "GT", the sample is empty or starts with '.', or the second
+// allele is missing (e.g. "1/.").
+//
+// Throws a std::exception (after printing an error on stderr) when the GT
+// subfield has no '/' or '|' separator, when an allele is not a number, or
+// when an allele index is negative or not smaller than alleles.size().
+inline
+pair<int, int> VcfRecord::parse_genotype(const string& sample) const {
+
+    pair<int, int> genotype = {-1,-1};
+
+    if (format.empty()
+            || format[0] != "GT"
+            || sample.empty()
+            || sample[0] == '.') {
+        return genotype;
+    }
+
+    // GT is the first subfield: it ends at the first colon, or at the end of
+    // the string when it is the only subfield.
+    size_t gt_end = sample.find(':');
+    if (gt_end == string::npos)
+        gt_end = sample.size();
+
+    // Look for the allele separator inside the GT subfield only, so that a
+    // '/' or '|' occurring in a later subfield is never mistaken for it.
+    const size_t sep = sample.find_first_of("/|");
+    if (sep == string::npos || sep >= gt_end) {
+        cerr << "Error: Malformed VCF genotype field '" << sample
+             << "', at marker '" << chrom << ":" << pos
+             << "'.\n";
+        throw exception();
+    }
+
+    // n.b. sample[sep+1] is '\0' when the separator is the last character.
+    if (sample[sep+1] == '.') {
+        static bool printed = false;
+        if (not printed) {
+            // Print the warning once.
+            cerr << "Notice: Treating incomplete genotypes (e.g. '1/.') as missing.\n";
+            printed = true;
+        }
+        return genotype;
+    }
+
+    try {
+        genotype.first = std::stoi(sample.substr(0, sep));
+        genotype.second = std::stoi(sample.substr(sep+1, gt_end-sep-1));
+        if (genotype.first < 0
+                || genotype.first >= int(alleles.size())
+                || genotype.second < 0
+                || genotype.second >= int(alleles.size()))
+            throw exception();
+    } catch (exception&) {
+        cerr << "Error: Malformed VCF genotype '" << sample
+             << "', at marker '" << chrom << ":" << pos
+             << "'.\n";
+        // Rethrow the original object; 'throw e' would copy it as a plain
+        // std::exception and lose the type thrown by std::stoi.
+        throw;
+    }
+
+    return genotype;
+}
+
+// True when the record is of type 'expl' and none of its alleles is longer
+// than one character.
+inline
+bool VcfRecord::is_snp() const {
+    if (type != Vcf::RType::expl)
+        return false;
+
+    for (size_t a = 0; a < alleles.size(); ++a) {
+        if (alleles[a].length() >= 2)
+            return false;
+    }
+
+    return true;
+}
+
+// Reads buffer-sized chunks into [line_] until [eol_] becomes true, i.e.
+// until check_eol() reports the end of the line or getline() reports EOF.
+inline
+void VcfAbstractParser::read_to_eol()
+{
+    while (!eol_) {
+        getline(line_, Vcf::line_buf_size);
+
+        if (eof_) {
+            eol_ = true;
+            break;
+        }
+
+        // check_eol() may need the end of the string: make [tabs_] hold
+        // a single pointer to the terminating NUL of the chunk just read.
+        tabs_.clear();
+        tabs_.push_back(line_ + strlen(line_));
+        check_eol();
+    }
+}
+
+// Appends the sample field lying strictly between [tab1] and [tab2] to
+// [record.samples] and increments [sample_index_]. An empty field is still
+// appended (as an empty string), after a warning has been printed on stderr.
+inline
+void VcfAbstractParser::add_sample(VcfRecord& record, char* tab1, char* tab2)
+{
+    if(tab2 == tab1+1)
+        // The file name is now quoted on both sides (the opening quote was missing).
+        cerr << "Warning: malformed VCF record line (empty SAMPLE field should be marked by a dot)."
+             << " Line " << line_number_ << " in file '" << path_ << "'.\n";
+
+    record.samples.push_back(string(tab1+1, tab2));
+
+    ++sample_index_;
+}
+
+// Reads up to n-1 characters of the current line into [ptr]. [eof_] is
+// only touched when the stream reports EOF: it then takes the stream's
+// fail state, and a notice is printed when that state is not set.
+inline
+void VcfParser::getline(char* ptr, size_t n) {
+    file_.getline(ptr, n);
+    if (!file_.eof())
+        return;
+
+    eof_ = file_.fail();
+    if (eof_)
+        return;
+
+    cerr << "Notice: File '" << path_ << "' does not end with a newline.\n";
+}
+
+#ifdef HAVE_LIBZ
+// Reads the next chunk of the gzipped file into [buf] with gzgets(); sets
+// [eof_] when gzgets() returns NULL.
+inline
+void VcfGzParser::getline(char* buf, size_t n) {
+    const bool got_data = gzgets(file_, buf, n) != NULL;
+    if (!got_data)
+        eof_ = true;
+}
+
+// Sets [eol_] according to whether the character just before tabs_.back()
+// is a newline. When it is, the newline is overwritten with a NUL and
+// tabs_.back() is moved back onto it.
+inline
+void VcfGzParser::check_eol() {
+    char* const last_char = tabs_.back() - 1; // rem. safe, gzgets returns NULL on EOF
+
+    if (*last_char != '\n') {
+        eol_ = false;
+        return;
+    }
+
+    eol_ = true;
+    tabs_.back() = last_char;
+    *last_char = '\0';
+}
+#endif // HAVE_LIBZ
+
+#endif // __VCF_H__
diff --git a/src/aln_utils.cc b/src/aln_utils.cc
index baf00a4..24107a5 100644
--- a/src/aln_utils.cc
+++ b/src/aln_utils.cc
@@ -182,9 +182,8 @@ apply_cigar_to_seq(char *seq, uint seq_len, const char *old_seq, vector<pair<cha
{
uint size = cigar.size();
char op;
- uint dist, bp, seq_bp, oldseq_len, stop;
+ uint dist, bp, seq_bp, stop;
- oldseq_len = strlen(old_seq);
bp = 0;
seq_bp = 0;
@@ -235,9 +234,8 @@ apply_cigar_to_model_seq(char *seq, uint seq_len, const char *model, vector<pair
{
uint size = cigar.size();
char op;
- uint dist, model_bp, seq_bp, model_len, stop;
+ uint dist, model_bp, seq_bp, stop;
- model_len = strlen(model);
model_bp = 0;
seq_bp = 0;
@@ -429,7 +427,6 @@ remove_snps_from_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
uint size = cigar.size();
char op;
uint dist, bp, new_bp, stop, snp_cnt;
- SNP *s;
bp = 0;
new_bp = 0;
diff --git a/src/catalog_utils.cc b/src/catalog_utils.cc
index 61d666d..09374cc 100644
--- a/src/catalog_utils.cc
+++ b/src/catalog_utils.cc
@@ -28,7 +28,7 @@
#include "catalog_utils.h"
int
-reduce_catalog(map<int, CSLocus *> &catalog, set<int> &whitelist, set<int> &blacklist)
+reduce_catalog(map<int, CSLocus *> &catalog, set<int> &whitelist, set<int> &blacklist)
{
map<int, CSLocus *> list;
map<int, CSLocus *>::iterator it;
@@ -204,7 +204,7 @@ check_whitelist_integrity(map<int, CSLocus *> &catalog, map<int, set<int> > &whi
}
int
-reduce_catalog(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist, set<int> &blacklist)
+reduce_catalog(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist, set<int> &blacklist)
{
map<int, CSLocus *> list;
map<int, CSLocus *>::iterator it;
@@ -339,3 +339,57 @@ reduce_catalog_snps(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist
return 0;
}
+
+// Reports on stderr that [rec] cannot be converted to a catalog SNP locus.
+static void warn_malformed_snp_record(const VcfRecord& rec) {
+    cerr << "Warning: Skipping malformed VCF SNP record '"
+         << rec.chrom << ":" << rec.pos << "'."
+         << " Alleles were:";
+    for (const string& a : rec.alleles)
+        cerr << " '" << a << "';";
+    cerr << ".\n";
+}
+
+// Creates a catalog with one locus per VCF record; each locus is keyed by
+// the index of its record in [records].
+//
+// Records that have no allele at all, an empty allele string, or more than
+// four alleles other than '*' are reported on stderr and left out of the
+// catalog. The loci are allocated with new and are not freed here.
+map<int, CSLocus*> create_catalog(const vector<VcfRecord>& records) {
+    map<int, CSLocus*> catalog;
+
+    for (size_t i = 0; i < records.size(); ++i) {
+        const VcfRecord& rec = records[i];
+
+        if (rec.alleles.empty()) {
+            // Without a REF allele, rec.alleles[0] below would be out of bounds.
+            warn_malformed_snp_record(rec);
+            continue;
+        }
+        const string& ref = rec.alleles[0];
+
+        CSLocus* loc = catalog.insert(make_pair(i, new CSLocus())).first->second;
+        loc->sample_id = 0;
+        loc->id = i;
+        loc->len = 1;
+        // Size the buffer on the REF string itself: a fixed two-byte buffer
+        // is overrun by strcpy() as soon as REF is longer than one base.
+        loc->con = new char[ref.length() + 1];
+        strcpy(loc->con, ref.c_str());
+        loc->loc.set(rec.chrom.c_str(), (uint)rec.pos, strand_plus);
+        for (const string& a : rec.alleles) {
+            if (a=="*")
+                continue;
+            loc->alleles.insert({a, 0});
+        }
+        loc->depth = 0;
+        loc->lnl = 0;
+
+        loc->snps.push_back(new SNP());
+        SNP& snp = *loc->snps.back();
+        snp.col = 0;
+        vector<char*> snp_alleles = {&snp.rank_1, &snp.rank_2, &snp.rank_3, &snp.rank_4};
+        try {
+            // at() throws out_of_range on a fifth allele or on an empty allele string.
+            size_t s=0;
+            for (const string& a : rec.alleles) {
+                if (a=="*")
+                    continue;
+                *snp_alleles.at(s) = a.at(0);
+                ++s;
+            }
+        } catch (out_of_range&) {
+            warn_malformed_snp_record(rec);
+            // NOTE(review): if ~CSLocus() also releases [snps] and [con], the two
+            // explicit deletes below free them twice — confirm against locus.h.
+            delete loc->snps[0];
+            delete[] loc->con; // allocated with new[]
+            delete loc;
+            catalog.erase(i);
+            continue;
+        }
+        // NOTE(review): this tests rank_3 (the second ALT allele), not rank_2 —
+        // confirm that this is the intended hom/het criterion.
+        snp.type = *snp_alleles[2] == 0 ? snp_type_hom : snp_type_het;
+
+        loc->populate_alleles();
+    }
+
+    return catalog;
+}
diff --git a/src/catalog_utils.h b/src/catalog_utils.h
index e0520f1..a68f0ee 100644
--- a/src/catalog_utils.h
+++ b/src/catalog_utils.h
@@ -31,6 +31,7 @@ using std::set;
#include "locus.h"
#include "PopMap.h"
#include "PopSum.h"
+#include "Vcf.h"
int check_whitelist_integrity(map<int, CSLocus *> &, map<int, set<int> > &);
int reduce_catalog(map<int, CSLocus *> &, set<int> &, set<int> &);
@@ -39,4 +40,47 @@ int reduce_catalog_snps(map<int, CSLocus *> &, map<int, set<int> > &, PopMap<CSL
int implement_single_snp_whitelist(map<int, CSLocus *> &, PopSum<CSLocus> *, map<int, set<int> > &);
int implement_random_snp_whitelist(map<int, CSLocus *> &, PopSum<CSLocus> *, map<int, set<int> > &);
+/*
+ * create_catalog(vector<VcfRecord>&):
+ * Creates a catalog based on VCF SNP records.
+ *
+ * We observe the following rules to create the catalog loci :
+ * [sample_id] (batch number) Always set to 0.
+ * [id] VCF records do not intrinsically have locus ids; we use the SNP records indexes.
+ * [len] Always set to 1.
+ * [con] We use the reference nucleotide as the consensus.
+ * [loc] Use the chromosome and position given by each record (n.b. the
+ * VCF format requires these fields), and strand "strand_plus".
+ * [snps] Use the ref+alt alleles.
+ * [col] Always set to 0 (first nucleotide in the consensus).
+ * [type] "snp_type_het" if the alt field is not empty, otherwise "snp_type_hom".
+ * [lratio] Always set to 0.
+ * [rank_1] The ref allele.
+ * [rank_2], [rank_3], [rank_4] The alt allele(s).
+ * [alleles] Use the ref+alt alleles in the order they appear, skipping
+ * the special '*' allele ('variant is irrelevant in certain context
+ * because of a neighboring structural polymorphism') if present.
+ * [strings] We fill this by calling Locus::populate_alleles().
+ * [cnt] Set to the appropriate value when filling the PopMap.
+ * [hcnt] (Same as above.)
+ * [confounded_cnt] (Same as above.)
+ * [gmap] This is filled by tabulate_haplotypes() (in "populations.cc").
+ * [gcnt] (Same as above.)
+ * [marker] (Same as above.)
+ *
+ * When no depth information is available, the [depth] and the depths
+ * of the [alleles] are set to 0. (n.b. the parsing of depth information in
+ * VCF is not implemented as of Mar 21, 2016.)
+ *
+ * When no likelihood information is available, [lnl] is set to 0. (n.b.
+ * the parsing of likelihood information in VCF is not implemented as of
+ * Mar 21, 2016.)
+ *
+ * The following members are left unset, on the premise that
+ * "populations" does not use them :
+ * model, blacklisted, deleveraged, lumberjack, components, reads, comp_cnt,
+ * comp_type, annotation, uncor_marker, hap_cnts, f, trans_gcnt, chisq.
+ */
+map<int, CSLocus*> create_catalog(const vector<VcfRecord>& vcf_records);
+
#endif // __CATALOG_UTILS_H__
diff --git a/src/clone_filter.cc b/src/clone_filter.cc
index 9a3b2fc..fd3de72 100644
--- a/src/clone_filter.cc
+++ b/src/clone_filter.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -73,47 +73,47 @@ int main (int argc, char* argv[]) {
// If input files are gzipped, output gzipped files, unless the user chooses an output type.
//
if (out_file_type == FileT::unknown) {
- if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
- out_file_type = FileT::gzfastq;
- else
- out_file_type = FileT::fastq;
+ if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
+ out_file_type = FileT::gzfastq;
+ else
+ out_file_type = FileT::fastq;
}
if (paired)
- cerr << "Processing paired-end data.\n";
+ cerr << "Processing paired-end data.\n";
else
- cerr << "Processing single-end data.\n";
+ cerr << "Processing single-end data.\n";
switch(barcode_type) {
case null_null:
- cerr << "No oligo sequence specified, will use single and paired-end reads to determine clones.\n";
- break;
+ cerr << "No oligo sequence specified, will use single and paired-end reads to determine clones.\n";
+ break;
case null_index:
- cerr << "Searching for index oligo (i7 Illumina read).\n";
- break;
+ cerr << "Searching for index oligo (i7 Illumina read).\n";
+ break;
case index_null:
- cerr << "Searching for index oligo (i5 Illumina read).\n";
- break;
+ cerr << "Searching for index oligo (i5 Illumina read).\n";
+ break;
case inline_null:
- cerr << "Searching for inline oligo on single-end read.\n";
- break;
+ cerr << "Searching for inline oligo on single-end read.\n";
+ break;
case index_index:
- cerr << "Searching for index oligos (i5 and i7 Illumina reads).\n";
- break;
+ cerr << "Searching for index oligos (i5 and i7 Illumina reads).\n";
+ break;
case inline_inline:
- cerr << "Searching for inline oligos on single and paired-end read.\n";
- break;
+ cerr << "Searching for inline oligos on single and paired-end read.\n";
+ break;
case inline_index:
- cerr << "Searching for inline oligo on single-end read and index oligo (i5 or i7 Illumina read).\n";
- break;
+ cerr << "Searching for inline oligo on single-end read and index oligo (i5 or i7 Illumina read).\n";
+ break;
case index_inline:
- if (paired)
- cerr << "Searching for inline oligo on paired-end read and index oligo (i5 or i7 Illumina read).\n";
- else
- cerr << "Searching for inline oligo on single-end read and index oligo (i5 or i7 Illumina read).\n";
- break;
+ if (paired)
+ cerr << "Searching for inline oligo on paired-end read and index oligo (i5 or i7 Illumina read).\n";
+ else
+ cerr << "Searching for inline oligo on single-end read and index oligo (i5 or i7 Illumina read).\n";
+ break;
}
-
+
map<string, long> counters;
counters["total"] = 0;
counters["red_reads"] = 0;
@@ -130,38 +130,38 @@ int main (int argc, char* argv[]) {
int result = 1;
for (uint i = 0; i < files.size(); i++) {
- cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
-
- result = 1;
- if (paired) {
- if (barcode_type == null_null)
- result = process_paired_reads_by_sequence(files[i].first, files[i].second, counters, clone_map, clone_map_keys);
- else
- result = process_paired_reads(files[i].first, files[i].second, counters, oligo_map);
-
- } else {
- result = process_reads(files[i].first, counters, oligo_map);
- }
-
- if (!result) {
- cerr << "Error processing reads.\n";
- break;
- }
+ cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
+
+ result = 1;
+ if (paired) {
+ if (barcode_type == null_null)
+ result = process_paired_reads_by_sequence(files[i].first, files[i].second, counters, clone_map, clone_map_keys);
+ else
+ result = process_paired_reads(files[i].first, files[i].second, counters, oligo_map);
+
+ } else {
+ result = process_reads(files[i].first, counters, oligo_map);
+ }
+
+ if (!result) {
+ cerr << "Error processing reads.\n";
+ break;
+ }
}
if (barcode_type == null_null && result) {
- write_clonereduced_sequence(files[0].first, files[0].second, clone_map, clone_dist, counters);
+ write_clonereduced_sequence(files[0].first, files[0].second, clone_map, clone_dist, counters);
} else {
- for (OligoHash::iterator i = oligo_map.begin(); i != oligo_map.end(); i++)
- for (map<string, uint16_t>::iterator j = i->second.begin(); j != i->second.end(); j++)
- clone_dist[j->second]++;
+ for (OligoHash::iterator i = oligo_map.begin(); i != oligo_map.end(); i++)
+ for (map<string, uint16_t>::iterator j = i->second.begin(); j != i->second.end(); j++)
+ clone_dist[j->second]++;
}
if (clone_map_keys.size() > 0) {
- cerr << "Freeing hash key memory...";
- free_hash(clone_map_keys);
- cerr << "done.\n";
+ cerr << "Freeing hash key memory...";
+ free_hash(clone_map_keys);
+ cerr << "done.\n";
}
//
@@ -172,24 +172,24 @@ int main (int argc, char* argv[]) {
vector<int> bins;
map<int, int>::iterator it;
for (it = clone_dist.begin(); it != clone_dist.end(); it++)
- bins.push_back(it->first);
+ bins.push_back(it->first);
sort(bins.begin(), bins.end());
cout << "Num Clones\tCount\n";
for (uint i = 0; i < bins.size(); i++)
- cout << bins[i] << "\t" << clone_dist[bins[i]] << "\n";
+ cout << bins[i] << "\t" << clone_dist[bins[i]] << "\n";
char buf[32];
sprintf(buf, "%0.2f%%", ((double) (counters["total"] - counters["red_reads"]) / (double) counters["total"]) * 100);
cerr << counters["total"] << " pairs of reads input. "
- << counters["red_reads"] << " pairs of reads output, discarded "
- << counters["dis_reads"] << " pairs of reads, " << buf << " clone reads.\n";
+ << counters["red_reads"] << " pairs of reads output, discarded "
+ << counters["dis_reads"] << " pairs of reads, " << buf << " clone reads.\n";
return 0;
}
int
process_paired_reads_by_sequence(string prefix_1, string prefix_2, map<string, long> &counters,
- CloneHash &clone_map, vector<char *> &clone_map_keys)
+ CloneHash &clone_map, vector<char *> &clone_map_keys)
{
Input *fh_1, *fh_2;
@@ -199,37 +199,37 @@ process_paired_reads_by_sequence(string prefix_1, string prefix_2, map<string, l
string path_2 = in_path_2 + prefix_2;
cerr << "Reading data from:\n "
- << path_1 << " and\n "
- << path_2 << "\n";
+ << path_1 << " and\n "
+ << path_2 << "\n";
switch (in_file_type) {
case FileT::fastq:
fh_1 = new Fastq(path_1);
- fh_2 = interleaved ? fh_1 : new Fastq(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new Fastq(path_2);
+ break;
case FileT::gzfastq:
fh_1 = new GzFastq(path_1);
- fh_2 = interleaved ? fh_1 : new GzFastq(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new GzFastq(path_2);
+ break;
case FileT::fasta:
fh_1 = new Fasta(path_1);
- fh_2 = interleaved ? fh_1 : new Fasta(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new Fasta(path_2);
+ break;
case FileT::gzfasta:
fh_1 = new GzFasta(path_1);
- fh_2 = interleaved ? fh_1 : new GzFasta(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new GzFasta(path_2);
+ break;
case FileT::bam:
fh_1 = new BamUnAln(path_1);
- fh_2 = fh_1;
- break;
+ fh_2 = fh_1;
+ break;
case FileT::bustard:
fh_1 = new Bustard(path_1);
fh_2 = interleaved ? fh_1 : new Bustard(path_2);
default:
fh_1 = NULL;
fh_2 = NULL;
- break;
+ break;
}
//
@@ -238,8 +238,8 @@ process_paired_reads_by_sequence(string prefix_1, string prefix_2, map<string, l
Seq *s_1 = fh_1->next_seq();
Seq *s_2 = fh_2->next_seq();
if (s_1 == NULL || s_2 == NULL) {
- cerr << "Unable to allocate Seq object.\n";
- return 0;
+ cerr << "Unable to allocate Seq object.\n";
+ return 0;
}
long i = 1;
@@ -250,31 +250,31 @@ process_paired_reads_by_sequence(string prefix_1, string prefix_2, map<string, l
do {
if (i % 10000 == 0) cerr << "Processing short read " << i << " \r";
- counters["total"]++;
+ counters["total"]++;
- exists = clone_map.count(s_1->seq) == 0 ? false : true;
+ exists = clone_map.count(s_1->seq) == 0 ? false : true;
- if (exists) {
- hash_key = s_1->seq;
- } else {
- hash_key = new char [seq_len + 1];
- strcpy(hash_key, s_1->seq);
- clone_map_keys.push_back(hash_key);
- }
+ if (exists) {
+ hash_key = s_1->seq;
+ } else {
+ hash_key = new char [seq_len + 1];
+ strcpy(hash_key, s_1->seq);
+ clone_map_keys.push_back(hash_key);
+ }
- if (out_file_type == FileT::fastq ||
+ if (out_file_type == FileT::fastq ||
out_file_type == FileT::gzfastq)
- clone_map[hash_key][s_2->seq].push_back(Pair(s_1->id, s_2->id, s_1->qual, s_2->qual));
- else if (out_file_type == FileT::fasta ||
+ clone_map[hash_key][s_2->seq].push_back(Pair(s_1->id, s_2->id, s_1->qual, s_2->qual));
+ else if (out_file_type == FileT::fasta ||
out_file_type == FileT::gzfasta)
- clone_map[hash_key][s_2->seq].push_back(Pair(s_1->id, s_2->id));
+ clone_map[hash_key][s_2->seq].push_back(Pair(s_1->id, s_2->id));
- delete s_1;
- delete s_2;
+ delete s_1;
+ delete s_2;
- i++;
+ i++;
} while ((s_1 = fh_1->next_seq()) != NULL &&
- (s_2 = fh_2->next_seq()) != NULL);
+ (s_2 = fh_2->next_seq()) != NULL);
cerr << "\n";
@@ -286,8 +286,8 @@ process_paired_reads_by_sequence(string prefix_1, string prefix_2, map<string, l
int
write_clonereduced_sequence(string prefix_1, string prefix_2,
- CloneHash &clone_map, map<int, int> &clone_dist,
- map<string, long> &counters)
+ CloneHash &clone_map, map<int, int> &clone_dist,
+ map<string, long> &counters)
{
ofstream out_fh_1, out_fh_2, discard_fh_1, discard_fh_2;
gzFile out_gzfh_1, out_gzfh_2, discard_gzfh_1, discard_gzfh_2;
@@ -306,56 +306,56 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
string suffix_1, suffix_2;
if (out_file_type == FileT::gzfastq) {
- suffix_1 = ".1.fq.gz";
- suffix_2 = ".2.fq.gz";
+ suffix_1 = ".1.fq.gz";
+ suffix_2 = ".2.fq.gz";
} else if (out_file_type == FileT::fastq) {
- suffix_1 = ".1.fq";
- suffix_2 = ".2.fq";
+ suffix_1 = ".1.fq";
+ suffix_2 = ".2.fq";
} else if (out_file_type == FileT::gzfasta) {
- suffix_1 = ".1.fa.gz";
- suffix_2 = ".2.fa.gz";
+ suffix_1 = ".1.fa.gz";
+ suffix_2 = ".2.fa.gz";
} else if (out_file_type == FileT::fasta) {
- suffix_1 = ".1.fa";
- suffix_2 = ".2.fa";
+ suffix_1 = ".1.fa";
+ suffix_2 = ".2.fa";
}
string file_1 = remove_suffix(in_file_type, prefix_1);
path_1 = out_path + file_1 + suffix_1;
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
- out_gzfh_1 = gzopen(path_1.c_str(), "wb");
- if (!(out_gzfh_1)) {
- cerr << "Error opening output file '" << path_1 << "'\n";
- return -1;
- }
- } else {
- out_fh_1.open(path_1.c_str(), ifstream::out);
- if (out_fh_1.fail()) {
- cerr << "Error opening output file '" << path_1 << "'\n";
- return -1;
- }
+ out_gzfh_1 = gzopen(path_1.c_str(), "wb");
+ if (!(out_gzfh_1)) {
+ cerr << "Error opening output file '" << path_1 << "'\n";
+ return -1;
+ }
+ } else {
+ out_fh_1.open(path_1.c_str(), ifstream::out);
+ if (out_fh_1.fail()) {
+ cerr << "Error opening output file '" << path_1 << "'\n";
+ return -1;
+ }
}
string file_2 = remove_suffix(in_file_type, prefix_2);
path_2 = out_path + file_2 + suffix_2;
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
- out_gzfh_2 = gzopen(path_2.c_str(), "wb");
- if (!(out_gzfh_2)) {
- cerr << "Error opening output file '" << path_2 << "'\n";
- return -1;
- }
- } else {
- out_fh_2.open(path_2.c_str(), ifstream::out);
- if (out_fh_2.fail()) {
- cerr << "Error opening output file '" << path_2 << "'\n";
- return -1;
- }
+ out_gzfh_2 = gzopen(path_2.c_str(), "wb");
+ if (!(out_gzfh_2)) {
+ cerr << "Error opening output file '" << path_2 << "'\n";
+ return -1;
+ }
+ } else {
+ out_fh_2.open(path_2.c_str(), ifstream::out);
+ if (out_fh_2.fail()) {
+ cerr << "Error opening output file '" << path_2 << "'\n";
+ return -1;
+ }
}
//
// Open files for recording discarded reads.
//
if (discards) {
- path_1 = out_path + file_1 + ".discards" + suffix_1;
+ path_1 = out_path + file_1 + ".discards" + suffix_1;
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
discard_gzfh_1 = gzopen(path_1.c_str(), "wb");
if (!(discard_gzfh_1)) {
@@ -370,7 +370,7 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
}
}
- path_2 = out_path + file_2 + ".discards" + suffix_2;
+ path_2 = out_path + file_2 + ".discards" + suffix_2;
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
discard_gzfh_2 = gzopen(path_2.c_str(), "wb");
if (!(discard_gzfh_2)) {
@@ -394,7 +394,7 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
for (hash_it = clone_map.begin(); hash_it != clone_map.end(); hash_it++) {
- for (map_it = hash_it->second.begin(); map_it != hash_it->second.end(); map_it++) {
+ for (map_it = hash_it->second.begin(); map_it != hash_it->second.end(); map_it++) {
sstr_1.str("");
sstr_2.str("");
@@ -408,33 +408,33 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
<< "+\n"
<< map_it->second[0].p2_qual << "\n";
} else {
- sstr_1 << ">" << map_it->second[0].p1_id << "\n"
+ sstr_1 << ">" << map_it->second[0].p1_id << "\n"
<< hash_it->first << "\n";
- sstr_2 << ">" << map_it->second[0].p2_id << "\n"
+ sstr_2 << ">" << map_it->second[0].p2_id << "\n"
<< map_it->first << "\n";
}
switch(out_file_type) {
case FileT::gzfastq:
case FileT::gzfasta:
- gzputs(out_gzfh_1, sstr_1.str().c_str());
- gzputs(out_gzfh_2, sstr_2.str().c_str());
+ gzputs(out_gzfh_1, sstr_1.str().c_str());
+ gzputs(out_gzfh_2, sstr_2.str().c_str());
break;
case FileT::fastq:
case FileT::fasta:
default:
- out_fh_1 << sstr_2.str();
- out_fh_2 << sstr_2.str();
+ out_fh_1 << sstr_1.str();
+ out_fh_2 << sstr_2.str();
}
- counters["dis_reads"] += map_it->second.size() - 1;
- clone_dist[map_it->second.size()]++;
-
- //
- // Write cloned read pairs that we are discarding
- //
- if (discards) {
- for (uint i = 1; i < map_it->second.size(); i++) {
+ counters["dis_reads"] += map_it->second.size() - 1;
+ clone_dist[map_it->second.size()]++;
+
+ //
+ // Write cloned read pairs that we are discarding
+ //
+ if (discards) {
+ for (uint i = 1; i < map_it->second.size(); i++) {
sstr_1.str("");
sstr_2.str("");
@@ -463,27 +463,27 @@ write_clonereduced_sequence(string prefix_1, string prefix_2,
case FileT::fastq:
case FileT::fasta:
default:
- discard_fh_1 << sstr_2.str();
+ discard_fh_1 << sstr_1.str();
discard_fh_2 << sstr_2.str();
}
- }
+ }
}
- counters["red_reads"]++;
- }
+ counters["red_reads"]++;
+ }
}
cerr << "done.\n";
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
- gzclose(out_gzfh_1);
- gzclose(out_gzfh_2);
- if (discards) {
- gzclose(discard_gzfh_1);
- gzclose(discard_gzfh_2);
- }
+ gzclose(out_gzfh_1);
+ gzclose(out_gzfh_2);
+ if (discards) {
+ gzclose(discard_gzfh_1);
+ gzclose(discard_gzfh_2);
+ }
} else {
- out_fh_1.close();
- out_fh_2.close();
+ out_fh_1.close();
+ out_fh_2.close();
if (discards) {
discard_fh_1.close();
discard_fh_2.close();
@@ -509,38 +509,38 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
string path_2 = in_path_2 + prefix_2;
if (interleaved)
- cerr << " Reading data from:\n " << path_1 << "\n";
+ cerr << " Reading data from:\n " << path_1 << "\n";
else
- cerr << " Reading data from:\n " << path_1 << " and\n " << path_2 << "\n";
+ cerr << " Reading data from:\n " << path_1 << " and\n " << path_2 << "\n";
switch (in_file_type) {
case FileT::fastq:
fh_1 = new Fastq(path_1);
- fh_2 = interleaved ? fh_1 : new Fastq(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new Fastq(path_2);
+ break;
case FileT::gzfastq:
fh_1 = new GzFastq(path_1);
- fh_2 = interleaved ? fh_1 : new GzFastq(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new GzFastq(path_2);
+ break;
case FileT::fasta:
fh_1 = new Fasta(path_1);
- fh_2 = interleaved ? fh_1 : new Fasta(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new Fasta(path_2);
+ break;
case FileT::gzfasta:
fh_1 = new GzFasta(path_1);
- fh_2 = interleaved ? fh_1 : new GzFasta(path_2);
- break;
+ fh_2 = interleaved ? fh_1 : new GzFasta(path_2);
+ break;
case FileT::bam:
fh_1 = new BamUnAln(path_1);
- fh_2 = fh_1;
- break;
+ fh_2 = fh_1;
+ break;
case FileT::bustard:
fh_1 = new Bustard(path_1);
fh_2 = interleaved ? fh_1 : new Bustard(path_2);
default:
fh_1 = NULL;
fh_2 = NULL;
- break;
+ break;
}
//
@@ -549,86 +549,86 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
string suffix_1, suffix_2;
if (out_file_type == FileT::gzfastq) {
- suffix_1 = ".1.fq.gz";
- suffix_2 = ".2.fq.gz";
+ suffix_1 = ".1.fq.gz";
+ suffix_2 = ".2.fq.gz";
} else if (out_file_type == FileT::fastq) {
- suffix_1 = ".1.fq";
- suffix_2 = ".2.fq";
+ suffix_1 = ".1.fq";
+ suffix_2 = ".2.fq";
} else if (out_file_type == FileT::gzfasta) {
- suffix_1 = ".1.fa.gz";
- suffix_2 = ".2.fa.gz";
+ suffix_1 = ".1.fa.gz";
+ suffix_2 = ".2.fa.gz";
} else if (out_file_type == FileT::fasta) {
- suffix_1 = ".1.fa";
- suffix_2 = ".2.fa";
+ suffix_1 = ".1.fa";
+ suffix_2 = ".2.fa";
}
string file_1 = remove_suffix(in_file_type, prefix_1);
path_1 = out_path + file_1 + suffix_1;
if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
- out_gzfh_1 = gzopen(path_1.c_str(), "wb");
- if (!(out_gzfh_1)) {
- cerr << "Error opening output file '" << path_1 << "'\n";
- return -1;
- }
- } else {
- out_fh_1.open(path_1.c_str(), ifstream::out);
- if (out_fh_1.fail()) {
- cerr << "Error opening output file '" << path_1 << "'\n";
- return -1;
- }
+ out_gzfh_1 = gzopen(path_1.c_str(), "wb");
+ if (!(out_gzfh_1)) {
+ cerr << "Error opening output file '" << path_1 << "'\n";
+ return -1;
+ }
+ } else {
+ out_fh_1.open(path_1.c_str(), ifstream::out);
+ if (out_fh_1.fail()) {
+ cerr << "Error opening output file '" << path_1 << "'\n";
+ return -1;
+ }
}
string file_2 = remove_suffix(in_file_type, prefix_2);
path_2 = out_path + file_2 + suffix_2;
if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
- out_gzfh_2 = gzopen(path_2.c_str(), "wb");
- if (!(out_gzfh_2)) {
- cerr << "Error opening output file '" << path_2 << "'\n";
- return -1;
- }
- } else {
- out_fh_2.open(path_2.c_str(), ifstream::out);
- if (out_fh_2.fail()) {
- cerr << "Error opening output file '" << path_2 << "'\n";
- return -1;
- }
+ out_gzfh_2 = gzopen(path_2.c_str(), "wb");
+ if (!(out_gzfh_2)) {
+ cerr << "Error opening output file '" << path_2 << "'\n";
+ return -1;
+ }
+ } else {
+ out_fh_2.open(path_2.c_str(), ifstream::out);
+ if (out_fh_2.fail()) {
+ cerr << "Error opening output file '" << path_2 << "'\n";
+ return -1;
+ }
}
//
// Open files for recording discarded reads.
//
if (discards) {
- path_1 = out_path + file_1 + ".discards" + suffix_1;
-
- if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
- discard_gzfh_1 = gzopen(path_1.c_str(), "wb");
- if (!(discard_gzfh_1)) {
- cerr << "Error opening discard file '" << path_1 << "'\n";
- return -1;
- }
- } else {
- discard_fh_1.open(path_1.c_str(), ifstream::out);
- if (discard_fh_1.fail()) {
- cerr << "Error opening discard file '" << path_1 << "'\n";
- return -1;
- }
- }
-
- path_2 = out_path + file_2 + ".discards" + suffix_2;
-
- if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
- discard_gzfh_2 = gzopen(path_2.c_str(), "wb");
- if (!(discard_gzfh_2)) {
- cerr << "Error opening discard file '" << path_2 << "'\n";
- return -1;
- }
- } else {
- discard_fh_2.open(path_2.c_str(), ifstream::out);
- if (discard_fh_2.fail()) {
- cerr << "Error opening discard file '" << path_2 << "'\n";
- return -1;
- }
- }
+ path_1 = out_path + file_1 + ".discards" + suffix_1;
+
+ if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
+ discard_gzfh_1 = gzopen(path_1.c_str(), "wb");
+ if (!(discard_gzfh_1)) {
+ cerr << "Error opening discard file '" << path_1 << "'\n";
+ return -1;
+ }
+ } else {
+ discard_fh_1.open(path_1.c_str(), ifstream::out);
+ if (discard_fh_1.fail()) {
+ cerr << "Error opening discard file '" << path_1 << "'\n";
+ return -1;
+ }
+ }
+
+ path_2 = out_path + file_2 + ".discards" + suffix_2;
+
+ if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
+ discard_gzfh_2 = gzopen(path_2.c_str(), "wb");
+ if (!(discard_gzfh_2)) {
+ cerr << "Error opening discard file '" << path_2 << "'\n";
+ return -1;
+ }
+ } else {
+ discard_fh_2.open(path_2.c_str(), ifstream::out);
+ if (discard_fh_2.fail()) {
+ cerr << "Error opening discard file '" << path_2 << "'\n";
+ return -1;
+ }
+ }
}
//
@@ -638,27 +638,27 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
switch (barcode_type) {
case inline_null:
case inline_index:
- offset_1 = oligo_len_1;
- offset_2 = 0;
- break;
+ offset_1 = oligo_len_1;
+ offset_2 = 0;
+ break;
case index_null:
case null_index:
case index_index:
- offset_1 = 0;
- offset_2 = 0;
- break;
+ offset_1 = 0;
+ offset_2 = 0;
+ break;
case inline_inline:
- offset_1 = oligo_len_1;
- offset_2 = oligo_len_2;
- break;
+ offset_1 = oligo_len_1;
+ offset_2 = oligo_len_2;
+ break;
case index_inline:
- offset_1 = 0;
- offset_2 = oligo_len_2;
+ offset_1 = 0;
+ offset_2 = oligo_len_2;
default:
- break;
+ break;
}
-
+
//
// Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
@@ -666,9 +666,9 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
Seq *s_1 = fh_1->next_seq();
Seq *s_2 = fh_2->next_seq();
if (s_1 == NULL || s_2 == NULL) {
- cerr << "Attempting to read first pair of input records, unable to allocate "
- << "Seq object (Was the correct input type specified?).\n";
- exit(1);
+ cerr << "Attempting to read first pair of input records, unable to allocate "
+ << "Seq object (Was the correct input type specified?).\n";
+ exit(1);
}
r_1 = new Read(strlen(s_1->seq), 1, min_bc_size_1, win_size);
@@ -683,146 +683,146 @@ process_paired_reads(string prefix_1, string prefix_2, map<string, long> &counte
do {
if (i % 10000 == 0) cerr << " Processing RAD-Tag " << i << " \r";
- parse_input_record(s_1, r_1);
- parse_input_record(s_2, r_2);
- counters["total"]++;
-
- result_1 = 1;
- result_2 = 1;
- clone = false;
-
- //
- // Fetch the randomized oligo sequence from the proper position in the reads.
- //
- switch (barcode_type) {
- case inline_null:
- oligo_1 = r_1->inline_bc;
- break;
- case index_null:
- oligo_1 = r_1->index_bc;
- break;
- case null_index:
- oligo_1 = r_2->index_bc;
- break;
+ parse_input_record(s_1, r_1);
+ parse_input_record(s_2, r_2);
+ counters["total"]++;
+
+ result_1 = 1;
+ result_2 = 1;
+ clone = false;
+
+ //
+ // Fetch the randomized oligo sequence from the proper position in the reads.
+ //
+ switch (barcode_type) {
+ case inline_null:
+ oligo_1 = r_1->inline_bc;
+ break;
+ case index_null:
+ oligo_1 = r_1->index_bc;
+ break;
+ case null_index:
+ oligo_1 = r_2->index_bc;
+ break;
case inline_inline:
- oligo_1 = r_1->inline_bc;
- oligo_2 = r_2->inline_bc;
- break;
- case index_index:
- oligo_1 = r_1->index_bc;
- oligo_2 = r_2->index_bc;
- break;
- case inline_index:
- oligo_1 = r_1->inline_bc;
- oligo_2 = r_2->index_bc;
- break;
- case index_inline:
- oligo_1 = r_1->index_bc;
- oligo_2 = r_2->inline_bc;
- default:
- break;
- }
-
- //
- // Have we seen this combination of oligos before for this read?
- //
- oligo = oligo_1 + oligo_2;
- key = string(s_1->seq + offset_1) + string(s_2->seq + offset_2);
-
- // cerr << "Oligo: '" << oligo << "'\n"
- // << "Seq: '" << s_1->seq << "'\n"
- // << "Key: '" << key << "'\n";
-
- if (oligo_map.count(key) == 0)
- oligo_map[key] = map<string, uint16_t>();
-
- if (oligo_map[key].count(oligo) == 0) {
- oligo_map[key][oligo] = 1;
- clone = false;
- } else {
- oligo_map[key][oligo]++;
- clone = true;
- }
-
- if (clone == false) {
- counters["red_reads"]++;
-
- switch (out_file_type) {
- case FileT::fastq:
- result_1 = write_fastq(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
- result_2 = write_fastq(&out_fh_2, s_2, retain_oligo ? 0 : offset_2);
- break;
- case FileT::gzfastq:
- result_1 = write_fastq(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
- result_2 = write_fastq(&out_gzfh_2, s_2, retain_oligo ? 0 : offset_2);
- break;
- case FileT::fasta:
- result_1 = write_fasta(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
- result_2 = write_fasta(&out_fh_2, s_2, retain_oligo ? 0 : offset_2);
- break;
- case FileT::gzfasta:
- result_1 = write_fasta(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
- result_2 = write_fasta(&out_gzfh_2, s_2, retain_oligo ? 0 : offset_2);
- default:
- break;
- }
-
- if (!result_1 || !result_2) {
- cerr << "Error writing to output file for '" << file_1 << " / " << file_2 << "'\n";
- return_val = -1;
- break;
- }
- } else if (clone == true && discards) {
- counters["dis_reads"]++;
-
- switch (out_file_type) {
- case FileT::fastq:
- result_1 = write_fastq(&discard_fh_1, s_1);
- result_2 = write_fastq(&discard_fh_2, s_2);
- break;
- case FileT::gzfastq:
- result_1 = write_fastq(&discard_gzfh_1, s_1);
- result_2 = write_fastq(&discard_gzfh_2, s_2);
- break;
- case FileT::fasta:
- result_1 = write_fasta(&discard_fh_1, s_1);
- result_2 = write_fasta(&discard_fh_2, s_2);
- break;
- case FileT::gzfasta:
- result_1 = write_fasta(&discard_gzfh_1, s_1);
- result_2 = write_fasta(&discard_gzfh_2, s_2);
- default:
- break;
- }
-
- if (!result_1 || !result_2) {
- cerr << "Error writing to discard file for '" << file_1 << " / " << file_2 << "'\n";
- return_val = -1;
- break;
- }
- }
-
- delete s_1;
- delete s_2;
-
- i++;
+ oligo_1 = r_1->inline_bc;
+ oligo_2 = r_2->inline_bc;
+ break;
+ case index_index:
+ oligo_1 = r_1->index_bc;
+ oligo_2 = r_2->index_bc;
+ break;
+ case inline_index:
+ oligo_1 = r_1->inline_bc;
+ oligo_2 = r_2->index_bc;
+ break;
+ case index_inline:
+ oligo_1 = r_1->index_bc;
+ oligo_2 = r_2->inline_bc;
+ default:
+ break;
+ }
+
+ //
+ // Have we seen this combination of oligos before for this read?
+ //
+ oligo = oligo_1 + oligo_2;
+ key = string(s_1->seq + offset_1) + string(s_2->seq + offset_2);
+
+ // cerr << "Oligo: '" << oligo << "'\n"
+ // << "Seq: '" << s_1->seq << "'\n"
+ // << "Key: '" << key << "'\n";
+
+ if (oligo_map.count(key) == 0)
+ oligo_map[key] = map<string, uint16_t>();
+
+ if (oligo_map[key].count(oligo) == 0) {
+ oligo_map[key][oligo] = 1;
+ clone = false;
+ } else {
+ oligo_map[key][oligo]++;
+ clone = true;
+ }
+
+ if (clone == false) {
+ counters["red_reads"]++;
+
+ switch (out_file_type) {
+ case FileT::fastq:
+ result_1 = write_fastq(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
+ result_2 = write_fastq(&out_fh_2, s_2, retain_oligo ? 0 : offset_2);
+ break;
+ case FileT::gzfastq:
+ result_1 = write_fastq(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
+ result_2 = write_fastq(&out_gzfh_2, s_2, retain_oligo ? 0 : offset_2);
+ break;
+ case FileT::fasta:
+ result_1 = write_fasta(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
+ result_2 = write_fasta(&out_fh_2, s_2, retain_oligo ? 0 : offset_2);
+ break;
+ case FileT::gzfasta:
+ result_1 = write_fasta(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
+ result_2 = write_fasta(&out_gzfh_2, s_2, retain_oligo ? 0 : offset_2);
+ default:
+ break;
+ }
+
+ if (!result_1 || !result_2) {
+ cerr << "Error writing to output file for '" << file_1 << " / " << file_2 << "'\n";
+ return_val = -1;
+ break;
+ }
+ } else if (clone == true && discards) {
+ counters["dis_reads"]++;
+
+ switch (out_file_type) {
+ case FileT::fastq:
+ result_1 = write_fastq(&discard_fh_1, s_1);
+ result_2 = write_fastq(&discard_fh_2, s_2);
+ break;
+ case FileT::gzfastq:
+ result_1 = write_fastq(&discard_gzfh_1, s_1);
+ result_2 = write_fastq(&discard_gzfh_2, s_2);
+ break;
+ case FileT::fasta:
+ result_1 = write_fasta(&discard_fh_1, s_1);
+ result_2 = write_fasta(&discard_fh_2, s_2);
+ break;
+ case FileT::gzfasta:
+ result_1 = write_fasta(&discard_gzfh_1, s_1);
+ result_2 = write_fasta(&discard_gzfh_2, s_2);
+ default:
+ break;
+ }
+
+ if (!result_1 || !result_2) {
+ cerr << "Error writing to discard file for '" << file_1 << " / " << file_2 << "'\n";
+ return_val = -1;
+ break;
+ }
+ }
+
+ delete s_1;
+ delete s_2;
+
+ i++;
} while ((s_1 = fh_1->next_seq()) != NULL &&
- (s_2 = fh_2->next_seq()) != NULL);
+ (s_2 = fh_2->next_seq()) != NULL);
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
- gzclose(out_gzfh_1);
- gzclose(out_gzfh_2);
- if (discards) {
- gzclose(discard_gzfh_1);
- gzclose(discard_gzfh_2);
- }
+ gzclose(out_gzfh_1);
+ gzclose(out_gzfh_2);
+ if (discards) {
+ gzclose(discard_gzfh_1);
+ gzclose(discard_gzfh_2);
+ }
} else {
- out_fh_1.close();
- out_fh_2.close();
- if (discards) {
- discard_fh_1.close();
- discard_fh_2.close();
- }
+ out_fh_1.close();
+ out_fh_2.close();
+ if (discards) {
+ discard_fh_1.close();
+ discard_fh_2.close();
+ }
}
delete fh_1;
@@ -853,24 +853,24 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
switch(in_file_type) {
case FileT::fastq:
- fh_1 = new Fastq(path_1);
- break;
+ fh_1 = new Fastq(path_1);
+ break;
case FileT::gzfastq:
fh_1 = new GzFastq(path_1.c_str());
- break;
+ break;
case FileT::fasta:
- fh_1 = new Fasta(path_1);
- break;
+ fh_1 = new Fasta(path_1);
+ break;
case FileT::gzfasta:
fh_1 = new GzFasta(path_1);
- break;
+ break;
case FileT::bam:
fh_1 = new BamUnAln(path_1);
- break;
+ break;
case FileT::bustard:
fh_1 = new Bustard(path_1);
default:
- break;
+ break;
}
//
@@ -879,55 +879,55 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
string suffix_1;
if (out_file_type == FileT::gzfastq)
- suffix_1 = ".fq.gz";
+ suffix_1 = ".fq.gz";
else if (out_file_type == FileT::fastq)
- suffix_1 = ".fq";
+ suffix_1 = ".fq";
else if (out_file_type == FileT::gzfasta)
- suffix_1 = ".fa.gz";
+ suffix_1 = ".fa.gz";
else if (out_file_type == FileT::fasta)
- suffix_1 = ".fa";
+ suffix_1 = ".fa";
string file_1 = prefix_1;
int pos = file_1.find_last_of(".");
if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
- file_1.substr(pos) == ".gz") {
- file_1 = file_1.substr(0, pos);
- pos = file_1.find_last_of(".");
+ file_1.substr(pos) == ".gz") {
+ file_1 = file_1.substr(0, pos);
+ pos = file_1.find_last_of(".");
}
path_1 = out_path + file_1.substr(0, pos) + suffix_1;
if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
- out_gzfh_1 = gzopen(path_1.c_str(), "wb");
- if (!(out_gzfh_1)) {
- cerr << "Error opening output file '" << path_1 << "'\n";
- return -1;
- }
- } else {
- out_fh_1.open(path_1.c_str(), ifstream::out);
- if (out_fh_1.fail()) {
- cerr << "Error opening output file '" << path_1 << "'\n";
- return -1;
- }
+ out_gzfh_1 = gzopen(path_1.c_str(), "wb");
+ if (!(out_gzfh_1)) {
+ cerr << "Error opening output file '" << path_1 << "'\n";
+ return -1;
+ }
+ } else {
+ out_fh_1.open(path_1.c_str(), ifstream::out);
+ if (out_fh_1.fail()) {
+ cerr << "Error opening output file '" << path_1 << "'\n";
+ return -1;
+ }
}
//
// Open files for recording discarded reads.
//
if (discards) {
- path_1 = out_path + file_1 + ".discards" + suffix_1;
-
- if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
- discard_gzfh_1 = gzopen(path_1.c_str(), "wb");
- if (!(discard_gzfh_1)) {
- cerr << "Error opening discard file '" << path_1 << "'\n";
- return -1;
- }
- } else {
- discard_fh_1.open(path_1.c_str(), ifstream::out);
- if (discard_fh_1.fail()) {
- cerr << "Error opening discard file '" << path_1 << "'\n";
- return -1;
- }
- }
+ path_1 = out_path + file_1 + ".discards" + suffix_1;
+
+ if (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) {
+ discard_gzfh_1 = gzopen(path_1.c_str(), "wb");
+ if (!(discard_gzfh_1)) {
+ cerr << "Error opening discard file '" << path_1 << "'\n";
+ return -1;
+ }
+ } else {
+ discard_fh_1.open(path_1.c_str(), ifstream::out);
+ if (discard_fh_1.fail()) {
+ cerr << "Error opening discard file '" << path_1 << "'\n";
+ return -1;
+ }
+ }
}
//
@@ -938,22 +938,22 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
case inline_null:
case inline_index:
case index_inline:
- offset_1 = oligo_len_1;
- break;
+ offset_1 = oligo_len_1;
+ break;
default:
- offset_1 = 0;
- break;
+ offset_1 = 0;
+ break;
}
-
+
//
// Read in the first record, initializing the Seq object s. Then
// initialize the Read object r, then loop, using the same objects.
//
Seq *s_1 = fh_1->next_seq();
if (s_1 == NULL) {
- cerr << "Attempting to read first pair of input records, unable to allocate "
- << "Seq object (Was the correct input type specified?).\n";
- exit(1);
+ cerr << "Attempting to read first pair of input records, unable to allocate "
+ << "Seq object (Was the correct input type specified?).\n";
+ exit(1);
}
r_1 = new Read(strlen(s_1->seq), 1, min_bc_size_1, win_size);
@@ -966,97 +966,97 @@ process_reads(string prefix_1, map<string, long> &counters, OligoHash &oligo_map
do {
if (i % 10000 == 0) cerr << " Processing RAD-Tag " << i << " \r";
- parse_input_record(s_1, r_1);
- counters["total"]++;
-
- result_1 = 1;
- clone = false;
-
- //
- // Fetch the randomized oligo sequence from the proper position in the reads.
- //
- if (barcode_type == inline_null)
- oligo_1 = r_1->inline_bc;
- else if (barcode_type == index_null)
- oligo_1 = r_1->index_bc;
-
- //
- // Have we seen this combination of oligos before for this read?
- //
- key = string(s_1->seq + offset_1);
-
- if (oligo_map.count(key) == 0)
- oligo_map[key] = map<string, uint16_t>();
-
- if (oligo_map[key].count(oligo_1) == 0) {
- oligo_map[key][oligo_1] = 1;
- clone = false;
- } else {
- oligo_map[key][oligo_1]++;
- clone = true;
- }
-
- if (clone == false) {
- counters["red_reads"]++;
-
- switch (out_file_type) {
- case FileT::fastq:
- result_1 = write_fastq(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
- break;
- case FileT::gzfastq:
- result_1 = write_fastq(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
- break;
- case FileT::fasta:
- result_1 = write_fasta(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
- break;
- case FileT::gzfasta:
- result_1 = write_fasta(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
- default:
- break;
- }
-
- if (!result_1) {
- cerr << "Error writing to output file for '" << file_1 << "'\n";
- return_val = -1;
- break;
- }
- } else if (clone == true && discards) {
- counters["dis_reads"]++;
-
- switch (out_file_type) {
- case FileT::fastq:
- result_1 = write_fastq(&discard_fh_1, s_1);
- break;
- case FileT::gzfastq:
- result_1 = write_fastq(&discard_gzfh_1, s_1);
- break;
- case FileT::fasta:
- result_1 = write_fasta(&discard_fh_1, s_1);
- break;
- case FileT::gzfasta:
- result_1 = write_fasta(&discard_gzfh_1, s_1);
- default:
- break;
- }
-
- if (!result_1) {
- cerr << "Error writing to discard file for '" << file_1 << "'\n";
- return_val = -1;
- break;
- }
- }
-
- delete s_1;
-
- i++;
+ parse_input_record(s_1, r_1);
+ counters["total"]++;
+
+ result_1 = 1;
+ clone = false;
+
+ //
+ // Fetch the randomized oligo sequence from the proper position in the reads.
+ //
+ if (barcode_type == inline_null)
+ oligo_1 = r_1->inline_bc;
+ else if (barcode_type == index_null)
+ oligo_1 = r_1->index_bc;
+
+ //
+ // Have we seen this combination of oligos before for this read?
+ //
+ key = string(s_1->seq + offset_1);
+
+ if (oligo_map.count(key) == 0)
+ oligo_map[key] = map<string, uint16_t>();
+
+ if (oligo_map[key].count(oligo_1) == 0) {
+ oligo_map[key][oligo_1] = 1;
+ clone = false;
+ } else {
+ oligo_map[key][oligo_1]++;
+ clone = true;
+ }
+
+ if (clone == false) {
+ counters["red_reads"]++;
+
+ switch (out_file_type) {
+ case FileT::fastq:
+ result_1 = write_fastq(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
+ break;
+ case FileT::gzfastq:
+ result_1 = write_fastq(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
+ break;
+ case FileT::fasta:
+ result_1 = write_fasta(&out_fh_1, s_1, retain_oligo ? 0 : offset_1);
+ break;
+ case FileT::gzfasta:
+ result_1 = write_fasta(&out_gzfh_1, s_1, retain_oligo ? 0 : offset_1);
+ default:
+ break;
+ }
+
+ if (!result_1) {
+ cerr << "Error writing to output file for '" << file_1 << "'\n";
+ return_val = -1;
+ break;
+ }
+ } else if (clone == true && discards) {
+ counters["dis_reads"]++;
+
+ switch (out_file_type) {
+ case FileT::fastq:
+ result_1 = write_fastq(&discard_fh_1, s_1);
+ break;
+ case FileT::gzfastq:
+ result_1 = write_fastq(&discard_gzfh_1, s_1);
+ break;
+ case FileT::fasta:
+ result_1 = write_fasta(&discard_fh_1, s_1);
+ break;
+ case FileT::gzfasta:
+ result_1 = write_fasta(&discard_gzfh_1, s_1);
+ default:
+ break;
+ }
+
+ if (!result_1) {
+ cerr << "Error writing to discard file for '" << file_1 << "'\n";
+ return_val = -1;
+ break;
+ }
+ }
+
+ delete s_1;
+
+ i++;
} while ((s_1 = fh_1->next_seq()) != NULL);
if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
- gzclose(out_gzfh_1);
- if (discards) gzclose(discard_gzfh_1);
+ gzclose(out_gzfh_1);
+ if (discards) gzclose(discard_gzfh_1);
} else {
- out_fh_1.close();
- if (discards) discard_fh_1.close();
+ out_fh_1.close();
+ if (discards) discard_fh_1.close();
}
delete fh_1;
@@ -1070,7 +1070,7 @@ int
free_hash(vector<char *> &keys)
{
for (uint i = 0; i < keys.size(); i++) {
- delete [] keys[i];
+ delete [] keys[i];
}
keys.clear();
@@ -1082,182 +1082,182 @@ int parse_command_line(int argc, char* argv[]) {
int c;
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"discards", no_argument, NULL, 'D'},
- {"paired", no_argument, NULL, 'P'},
- {"null_index", no_argument, NULL, 'U'},
- {"index_null", no_argument, NULL, 'u'},
- {"inline_null", no_argument, NULL, 'V'},
- {"index_index", no_argument, NULL, 'W'},
- {"inline_inline", no_argument, NULL, 'x'},
- {"index_inline", no_argument, NULL, 'Y'},
- {"inline_index", no_argument, NULL, 'Z'},
- {"infile_type", required_argument, NULL, 'i'},
- {"outfile_type", required_argument, NULL, 'y'},
- {"file", required_argument, NULL, 'f'},
- {"path", required_argument, NULL, 'p'},
- {"file_p1", required_argument, NULL, '1'},
- {"file_p2", required_argument, NULL, '2'},
- {"outpath", required_argument, NULL, 'o'},
- {"oligo_len_1", required_argument, NULL, 'O'},
- {"oligo_len_2", required_argument, NULL, 'L'},
- {"retain_oligo", required_argument, NULL, 'R'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvDPuUVWxYZi:y:f:p:1:2:o:O:L:R:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 'i':
+ {"discards", no_argument, NULL, 'D'},
+ {"paired", no_argument, NULL, 'P'},
+ {"null_index", no_argument, NULL, 'U'},
+ {"index_null", no_argument, NULL, 'u'},
+ {"inline_null", no_argument, NULL, 'V'},
+ {"index_index", no_argument, NULL, 'W'},
+ {"inline_inline", no_argument, NULL, 'x'},
+ {"index_inline", no_argument, NULL, 'Y'},
+ {"inline_index", no_argument, NULL, 'Z'},
+ {"infile_type", required_argument, NULL, 'i'},
+ {"outfile_type", required_argument, NULL, 'y'},
+ {"file", required_argument, NULL, 'f'},
+ {"path", required_argument, NULL, 'p'},
+ {"file_p1", required_argument, NULL, '1'},
+ {"file_p2", required_argument, NULL, '2'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"oligo_len_1", required_argument, NULL, 'O'},
+ {"oligo_len_2", required_argument, NULL, 'L'},
+ {"retain_oligo", required_argument, NULL, 'R'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvDPuUVWxYZi:y:f:p:1:2:o:O:L:R:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'i':
if (strcasecmp(optarg, "bustard") == 0)
in_file_type = FileT::bustard;
- else if (strcasecmp(optarg, "fasta") == 0)
+ else if (strcasecmp(optarg, "fasta") == 0)
in_file_type = FileT::fasta;
- else if (strcasecmp(optarg, "gzfasta") == 0)
- in_file_type = FileT::gzfasta;
- else if (strcasecmp(optarg, "gzfastq") == 0)
- in_file_type = FileT::gzfastq;
- else
- in_file_type = FileT::fastq;
- break;
- case 'y':
- if (strcasecmp(optarg, "fastq") == 0)
- out_file_type = FileT::fastq;
+ else if (strcasecmp(optarg, "gzfasta") == 0)
+ in_file_type = FileT::gzfasta;
+ else if (strcasecmp(optarg, "gzfastq") == 0)
+ in_file_type = FileT::gzfastq;
+ else
+ in_file_type = FileT::fastq;
+ break;
+ case 'y':
+ if (strcasecmp(optarg, "fastq") == 0)
+ out_file_type = FileT::fastq;
else if (strcasecmp(optarg, "gzfastq") == 0)
- out_file_type = FileT::gzfastq;
+ out_file_type = FileT::gzfastq;
else if (strcasecmp(optarg, "fasta") == 0)
out_file_type = FileT::fasta;
else if (strcasecmp(optarg, "gzfasta") == 0)
- out_file_type = FileT::gzfasta;
- break;
- case 'D':
- discards = true;
- break;
- case 'f':
- in_file = optarg;
- ftype = FileT::fastq;
- break;
- case 'p':
- in_path_1 = optarg;
- in_path_2 = in_path_1;
- ftype = FileT::fastq;
- break;
- case '1':
- paired = true;
- in_file_p1 = optarg;
- ftype = FileT::fastq;
- break;
- case '2':
- paired = true;
- in_file_p2 = optarg;
- ftype = FileT::fastq;
- break;
- case 'P':
- paired = true;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'U':
- barcode_type = null_index;
- break;
- case 'u':
- barcode_type = index_null;
- break;
- case 'V':
- barcode_type = inline_null;
- break;
- case 'W':
- barcode_type = index_index;
- break;
- case 'x':
- barcode_type = inline_inline;
- break;
- case 'Y':
- barcode_type = index_inline;
- break;
- case 'Z':
- barcode_type = inline_index;
- break;
- case 'O':
- oligo_len_1 = is_integer(optarg);
- break;
- case 'L':
- oligo_len_2 = is_integer(optarg);
- break;
- case 'R':
- retain_oligo = true;
- break;
+ out_file_type = FileT::gzfasta;
+ break;
+ case 'D':
+ discards = true;
+ break;
+ case 'f':
+ in_file = optarg;
+ ftype = FileT::fastq;
+ break;
+ case 'p':
+ in_path_1 = optarg;
+ in_path_2 = in_path_1;
+ ftype = FileT::fastq;
+ break;
+ case '1':
+ paired = true;
+ in_file_p1 = optarg;
+ ftype = FileT::fastq;
+ break;
+ case '2':
+ paired = true;
+ in_file_p2 = optarg;
+ ftype = FileT::fastq;
+ break;
+ case 'P':
+ paired = true;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'U':
+ barcode_type = null_index;
+ break;
+ case 'u':
+ barcode_type = index_null;
+ break;
+ case 'V':
+ barcode_type = inline_null;
+ break;
+ case 'W':
+ barcode_type = index_index;
+ break;
+ case 'x':
+ barcode_type = inline_inline;
+ break;
+ case 'Y':
+ barcode_type = index_inline;
+ break;
+ case 'Z':
+ barcode_type = inline_index;
+ break;
+ case 'O':
+ oligo_len_1 = is_integer(optarg);
+ break;
+ case 'L':
+ oligo_len_2 = is_integer(optarg);
+ break;
+ case 'R':
+ retain_oligo = true;
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (in_file.length() == 0 && in_path_1.length() == 0 && in_file_p1.length() == 0) {
- cerr << "You must specify an input file of a directory path to a set of input files.\n";
- help();
+ cerr << "You must specify an input file of a directory path to a set of input files.\n";
+ help();
}
if (in_file.length() > 0 && in_path_1.length() > 0) {
- cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
- help();
+ cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
+ help();
}
if (in_file.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
- cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
- help();
+ cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
+ help();
}
if (in_path_1.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
- cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
- help();
+ cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
+ help();
}
if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
- in_path_1 += "/";
+ in_path_1 += "/";
if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
- in_path_2 += "/";
+ in_path_2 += "/";
if (out_path.length() == 0)
- out_path = ".";
+ out_path = ".";
if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ out_path += "/";
if (in_file_type == FileT::unknown)
- in_file_type = ftype;
+ in_file_type = ftype;
if (paired == false && barcode_type == null_null) {
- cerr << "You must specify paired-end data if you do not have oligo sequences to differentiate cloned reads.\n";
- help();
+ cerr << "You must specify paired-end data if you do not have oligo sequences to differentiate cloned reads.\n";
+ help();
}
if (barcode_type != null_null && oligo_len_1 == 0 && oligo_len_2 == 0) {
- cerr << "You must specify the length of the oligo sequences (--oligo_len_1 / --oligo_len_2).\n";
- help();
+ cerr << "You must specify the length of the oligo sequences (--oligo_len_1 / --oligo_len_2).\n";
+ help();
}
return 0;
@@ -1272,27 +1272,27 @@ void version() {
void help() {
std::cerr << "clone_filter " << VERSION << "\n"
<< "clone_filter [-f in_file | -p in_dir [-P] [-I] | -1 pair_1 -2 pair_2] -o out_dir [-i type] [-y type] [-D] [-h]\n"
- << " f: path to the input file if processing single-end sequences.\n"
- << " p: path to a directory of files.\n"
- << " P: files contained within directory specified by '-p' are paired.\n"
- << " 1: first input file in a set of paired-end sequences.\n"
- << " 2: second input file in a set of paired-end sequences.\n"
- << " i: input file type, either 'bustard', 'fastq', 'fasta', 'gzfasta', or 'gzfastq' (default 'fastq').\n"
- << " o: path to output the processed files.\n"
- << " y: output type, either 'fastq', 'fasta', 'gzfasta', or 'gzfastq' (default same as input type).\n"
- << " D: capture discarded reads to a file.\n"
- << " h: display this help messsage.\n"
- << " --oligo_len_1 len: length of the single-end oligo sequence in data set.\n"
- << " --oligo_len_2 len: length of the paired-end oligo sequence in data set.\n"
- << " --retain_oligo: do not trim off the random oligo sequence (if oligo is inline).\n\n"
- << " Oligo sequence options:\n"
- << " --inline_null: random oligo is inline with sequence, occurs only on single-end read (default).\n"
- << " --null_index: random oligo is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
- << " --index_null: random oligo is provded in FASTQ header (Illumina i5 or i7 read).\n"
- << " --inline_inline: random oligo is inline with sequence, occurs on single and paired-end read.\n"
- << " --index_index: random oligo is provded in FASTQ header (Illumina i5 and i7 read).\n"
- << " --inline_index: random oligo is inline with sequence on single-end read and second oligo occurs in FASTQ header.\n"
- << " --index_inline: random oligo occurs in FASTQ header (Illumina i5 or i7 read) and is inline with sequence on single-end read (if single read data) or paired-end read (if paired data).\n\n";
+ << " f: path to the input file if processing single-end sequences.\n"
+ << " p: path to a directory of files.\n"
+ << " P: files contained within directory specified by '-p' are paired.\n"
+ << " 1: first input file in a set of paired-end sequences.\n"
+ << " 2: second input file in a set of paired-end sequences.\n"
+ << " i: input file type, either 'bustard', 'fastq', 'fasta', 'gzfasta', or 'gzfastq' (default 'fastq').\n"
+ << " o: path to output the processed files.\n"
+ << " y: output type, either 'fastq', 'fasta', 'gzfasta', or 'gzfastq' (default same as input type).\n"
+ << " D: capture discarded reads to a file.\n"
+ << " h: display this help messsage.\n"
+ << " --oligo_len_1 len: length of the single-end oligo sequence in data set.\n"
+ << " --oligo_len_2 len: length of the paired-end oligo sequence in data set.\n"
+ << " --retain_oligo: do not trim off the random oligo sequence (if oligo is inline).\n\n"
+ << " Oligo sequence options:\n"
+ << " --inline_null: random oligo is inline with sequence, occurs only on single-end read (default).\n"
+ << " --null_index: random oligo is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
+ << " --index_null: random oligo is provded in FASTQ header (Illumina i5 or i7 read).\n"
+ << " --inline_inline: random oligo is inline with sequence, occurs on single and paired-end read.\n"
+ << " --index_index: random oligo is provded in FASTQ header (Illumina i5 and i7 read).\n"
+ << " --inline_index: random oligo is inline with sequence on single-end read and second oligo occurs in FASTQ header.\n"
+ << " --index_inline: random oligo occurs in FASTQ header (Illumina i5 or i7 read) and is inline with sequence on single-end read (if single read data) or paired-end read (if paired data).\n\n";
exit(0);
}
diff --git a/src/cstacks.cc b/src/cstacks.cc
index 7bf608c..010b7be 100644
--- a/src/cstacks.cc
+++ b/src/cstacks.cc
@@ -48,8 +48,11 @@ int main (int argc, char* argv[]) {
uint sample_cnt = samples.size();
- cerr << "Number of mismatches allowed between stacks: " << ctag_dist << "\n"
- << "Loci matched based on " << (search_type == sequence ? "sequence identity" : "genomic location") << ".\n"
+ cerr << "cstacks paramters selected:\n"
+ << " Loci matched based on " << (search_type == sequence ? "sequence identity" : "genomic location") << ".\n";
+ if (search_type == sequence)
+ cerr << " Number of mismatches allowed between stacks: " << ctag_dist << "\n";
+ cerr << " Gapped alignments: " << (gapped_alignments ? "enabled" : "disabled") << "\n"
<< "Constructing catalog from " << sample_cnt << " samples.\n";
//
@@ -173,7 +176,7 @@ int update_catalog_index(map<int, CLocus *> &catalog, map<string, int> &cat_inde
snprintf(id, id_len - 1, "%s|%d|%c",
j->second->loc.chr,
j->second->loc.bp,
- j->second->loc.strand == plus ? '+' : '-');
+ j->second->loc.strand == strand_plus ? '+' : '-');
if (cat_index.count(id) == 0) {
cat_index[id] = j->first;
@@ -249,7 +252,7 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
CLocus *ctag;
QLocus *qtag;
string cseq, qseq, cigar_str;
- int cseq_len, qseq_len, match_index;
+ int cseq_len, match_index;
vector<pair<char, uint> > cigar;
GappedAln *aln = new GappedAln();
@@ -324,7 +327,7 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
cigar_str = "";
for (uint k = 0; k < qtag->matches.size(); k++)
- if (qtag->matches[k]->cat_id == min_cat_id) {
+ if ((int)qtag->matches[k]->cat_id == min_cat_id) {
cigar_str = qtag->matches[k]->cigar;
match_index = k;
break;
@@ -335,7 +338,7 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
// sample, we must re-align the sequences in case changes have been made to the
// sequence by the previous matching sequence.
//
- if (ctag->match_cnt > 0) {
+ if (gapped_alignments && ctag->match_cnt > 0) {
string query_allele, query_seq, cat_allele, cat_seq;
// cerr << " Warning: Catalog locus " << ctag->id
// << ", Sample " << qtag->sample_id << ", locus " << qtag->id
@@ -367,7 +370,6 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
// Adjust the postition of any SNPs that were shifted down sequence due to a gap.
//
if (gapped_aln) {
- qseq_len = parse_cigar(cigar_str.c_str(), cigar);
qseq = apply_cigar_to_seq(qtag->con, cigar);
adjust_snps_for_gaps(cigar, qtag);
@@ -380,7 +382,7 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
// If the alignment modified the catalog locus, record it so we can re-align
// any other matching sequences from this sample.
//
- if (cseq_len > ctag->len)
+ if ((uint)cseq_len > ctag->len)
ctag->match_cnt++;
//
@@ -547,7 +549,7 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
// generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
//
uniq_kmers.clear();
- for (uint j = 0; j < num_kmers; j++)
+ for (int j = 0; j < num_kmers; j++)
uniq_kmers.insert(kmers[j]);
hits.clear();
@@ -586,7 +588,7 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
hit_cnt = 0;
allele_id = prev_id;
- while (hits[index] == prev_id) {
+ while ((uint)hits[index] == prev_id) {
hit_cnt++;
index++;
}
@@ -594,7 +596,7 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
if (index < hits_size)
prev_id = hits[index];
- if (hit_cnt >= min_hits)
+ if (hit_cnt >= (uint)min_hits)
ordered_hits.push_back(make_pair(allele_id, hit_cnt));
} while (index < hits_size);
@@ -725,7 +727,7 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
// generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
//
uniq_kmers.clear();
- for (uint j = 0; j < num_kmers; j++)
+ for (int j = 0; j < num_kmers; j++)
uniq_kmers.insert(kmers[j]);
hits.clear();
@@ -764,7 +766,7 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
hit_cnt = 0;
allele_id = prev_id;
- while (hits[index] == prev_id) {
+ while ((uint)hits[index] == prev_id) {
hit_cnt++;
index++;
}
@@ -772,7 +774,7 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
if (index < hits_size)
prev_id = hits[index];
- if (hit_cnt >= min_hits)
+ if (hit_cnt >= (uint)min_hits)
ordered_hits.push_back(make_pair(allele_id, hit_cnt));
} while (index < hits_size);
@@ -791,7 +793,7 @@ search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double
top_hit = ordered_hits[0].second;
stop = 1;
for (uint j = 1; j < ordered_hits.size(); j++)
- if (ordered_hits[j].second < top_hit) {
+ if ((uint)ordered_hits[j].second < top_hit) {
stop = j;
break;
}
@@ -932,7 +934,7 @@ int find_matches_by_genomic_loc(map<string, int> &cat_index, map<int, QLocus *>
snprintf(id, id_len - 1, "%s|%d|%c",
i->second->loc.chr,
i->second->loc.bp,
- i->second->loc.strand == plus ? '+' : '-');
+ i->second->loc.strand == strand_plus ? '+' : '-');
if (cat_index.count(id) > 0)
i->second->add_match(cat_index[id], "", "", 0);
@@ -1536,7 +1538,7 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
tag->id << "\t" <<
tag->loc.chr << "\t" <<
tag->loc.bp << "\t" <<
- (tag->loc.strand == plus ? "+" : "-") << "\t" <<
+ (tag->loc.strand == strand_plus ? "+" : "-") << "\t" <<
"consensus" << "\t" <<
"0" << "\t" <<
sources << "\t" <<
@@ -1613,7 +1615,7 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
tag->id << "\t" <<
tag->loc.chr << "\t" <<
tag->loc.bp << "\t" <<
- (tag->loc.strand == plus ? "+" : "-") << "\t" <<
+ (tag->loc.strand == strand_plus ? "+" : "-") << "\t" <<
"consensus" << "\t" <<
"0" << "\t" <<
sources << "\t" <<
diff --git a/src/export_formats.cc b/src/export_formats.cc
new file mode 100644
index 0000000..6397f57
--- /dev/null
+++ b/src/export_formats.cc
@@ -0,0 +1,3333 @@
+#include <algorithm>
+#include <vector>
+
+#include "ordered.h"
+#include "sql_utilities.h"
+#include "MetaPopInfo.h"
+
+#include "export_formats.h"
+
+using namespace std;
+
+extern InputMode input_mode;
+extern int batch_id;
+extern string in_path;
+extern string out_path;
+extern string out_prefix;
+extern bool phylip_var;
+extern bool loci_ordered;
+extern bool merge_sites;
+extern string enz;
+extern set<string> debug_flags;
+
+extern MetaPopInfo mpopi;
+extern map<string, int> renz_olap;
+
+int
+write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
+{
+    //
+    // Write one tab-separated row per catalog locus to
+    // <out_path><out_prefix>.markers.tsv. Loci with a non-empty marker also get
+    // their haplotype frequency tally and haplotype-to-genotype map; the other
+    // loci are written with zero/empty values in those columns.
+    // Returns 0; exits with status 1 if the file cannot be opened.
+    //
+    const string path = out_path + out_prefix + ".markers.tsv";
+
+    cerr << "Writing SQL markers file to '" << path << "'\n";
+
+    ofstream fh(path.c_str(), ofstream::out);
+    if (fh.fail()) {
+        cerr << "Error opening markers SQL file '" << path << "'\n";
+        exit(1);
+    }
+
+    // Floating point columns are written in fixed notation with fieldw digits.
+    fh.precision(fieldw);
+    fh.setf(std::ios::fixed);
+
+    // Column header; the fourth (marker) column is intentionally left unnamed.
+    fh << "# SQL ID\t"
+       << "Batch ID\t"
+       << "Catalog Locus ID\t"
+       << "\t"
+       << "Total Genotypes\t"
+       << "Max\t"
+       << "Genotype Freqs\t"
+       << "F\t"
+       << "Mean Log Likelihood\t"
+       << "Genotype Map\t"
+       << "\n";
+
+    for (auto &entry : catalog) {
+        CSLocus *locus = entry.second;
+
+        string       freq_str    = "";
+        double       max_freq    = 0.0;
+        int          n_genotypes = 0;
+        stringstream hap_to_gtype;
+
+        if (!locus->marker.empty()) {
+            tally_haplotype_freq(locus, pmap, n_genotypes, max_freq, freq_str);
+
+            //
+            // Record the haplotype to genotype map as "haplotype:genotype;" pairs.
+            //
+            for (const auto &hg : locus->gmap)
+                hap_to_gtype << hg.first << ":" << hg.second << ";";
+        }
+
+        fh << 0                  << '\t'
+           << batch_id           << '\t'
+           << locus->id          << '\t'
+           << '\t'               // Marker
+           << n_genotypes        << '\t'
+           << max_freq           << '\t'
+           << freq_str           << '\t'
+           << locus->f           << '\t'
+           << locus->lnl         << '\t'
+           << hap_to_gtype.str() << '\t'
+           << '\n';
+    }
+
+    fh.close();
+
+    return 0;
+}
+
+int
+write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
+{
+    //
+    // Write a FASTA file containing each allele from each locus from
+    // each sample in the population. One record is emitted per observed
+    // haplotype: the locus consensus with the haplotype's nucleotides
+    // substituted at the SNP columns.
+    //
+    // Output goes to <out_path><out_prefix>.fa. Returns 0; exits with status 1
+    // if the file cannot be opened. The catalog argument is not used here,
+    // loci are taken from pmap->ordered_loci.
+    //
+    string file = out_path + out_prefix + ".fa";
+
+    cerr << "Writing population alleles to FASTA file '" << file << "'\n";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening FASTA file '" << file << "'\n";
+        exit(1);
+    }
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus *loc;
+    Datum  **d;
+    char    *seq;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+            d   = pmap->locus(loc->id);
+
+            // Working copy of the consensus; every SNP column is overwritten
+            // for each haplotype, so the buffer can be reused across samples.
+            seq = new char[loc->len + 1];
+            strcpy(seq, loc->con);
+
+            for (int j = 0; j < pmap->sample_cnt(); j++) {
+                if (d[j] == NULL)
+                    continue;
+
+                for (uint k = 0; k < d[j]->obshap.size(); k++) {
+
+                    for (uint i = 0; i < loc->snps.size(); i++) {
+                        uint col = loc->snps[i]->col;
+                        // Only substitute columns that lie inside the sequence. The
+                        // previous form still assigned seq[col] (and read con[col])
+                        // when col was past the end of the len+1 buffer.
+                        if (col < loc->len)
+                            seq[col] = d[j]->obshap[k][i];
+                    }
+
+                    fh << ">CLocus_" << loc->id
+                       << "_Sample_" << pmap->rev_sample_index(j)
+                       << "_Locus_"  << d[j]->id
+                       << "_Allele_" << k
+                       << " ["       << mpopi.samples()[j].name;
+
+                    // Genomic coordinates are only reported for loci placed on a chromosome.
+                    if (strcmp(loc->loc.chr, "un") != 0)
+                        fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == strand_plus ? "+" : "-");
+                    fh << "]\n"
+                       << seq << "\n";
+                }
+            }
+            delete [] seq;
+        }
+    }
+
+    fh.close();
+
+    return 0;
+}
+
+int
+write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
+{
+    //
+    // Write a FASTA file containing each allele from each locus from
+    // each sample in the population. "Strict" means exactly two records per
+    // sample and locus: samples with more than two observed haplotypes are
+    // skipped, and a sample with a single haplotype has it written twice
+    // (as allele 0 and allele 1).
+    //
+    // Output goes to <out_path><out_prefix>.strict.fa. Returns 0; exits with
+    // status 1 if the file cannot be opened. The catalog argument is not used.
+    //
+    string file = out_path + out_prefix + ".strict.fa";
+
+    cerr << "Writing strict population alleles to FASTA file '" << file << "'\n";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening strict FASTA file '" << file << "'\n";
+        exit(1);
+    }
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus *loc;
+    Datum  **d;
+    char    *seq;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+            d   = pmap->locus(loc->id);
+
+            // Working copy of the consensus, reused for every sample of this locus.
+            seq = new char[loc->len + 1];
+            strcpy(seq, loc->con);
+
+            for (int j = 0; j < pmap->sample_cnt(); j++) {
+                if (d[j] == NULL)
+                    continue;
+                if (d[j]->obshap.size() > 2)
+                    continue;
+
+                if (d[j]->obshap.size() == 1) {
+
+                    for (uint i = 0; i < loc->snps.size(); i++) {
+                        uint col = loc->snps[i]->col;
+                        // Skip columns outside the sequence; the previous form still
+                        // assigned seq[col] (and read con[col]) past the len+1 buffer.
+                        if (col < loc->len)
+                            seq[col] = d[j]->obshap[0][i];
+                    }
+
+                    // Homozygote: the same sequence is written as allele 0 and allele 1.
+                    fh << ">CLocus_" << loc->id
+                       << "_Sample_" << pmap->rev_sample_index(j)
+                       << "_Locus_"  << d[j]->id
+                       << "_Allele_" << 0
+                       << " ["       << mpopi.samples()[j].name;
+                    if (strcmp(loc->loc.chr, "un") != 0)
+                        fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == strand_plus ? "+" : "-");
+                    fh << "]\n"
+                       << seq << "\n";
+
+                    fh << ">CLocus_" << loc->id
+                       << "_Sample_" << pmap->rev_sample_index(j)
+                       << "_Locus_"  << d[j]->id
+                       << "_Allele_" << 1
+                       << " ["       << mpopi.samples()[j].name;
+                    if (strcmp(loc->loc.chr, "un") != 0)
+                        fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == strand_plus ? "+" : "-");
+                    fh << "]\n"
+                       << seq << "\n";
+
+                } else {
+                    for (uint k = 0; k < d[j]->obshap.size(); k++) {
+                        for (uint i = 0; i < loc->snps.size(); i++) {
+                            uint col = loc->snps[i]->col;
+                            // Same bounds guard as in the homozygous branch.
+                            if (col < loc->len)
+                                seq[col] = d[j]->obshap[k][i];
+                        }
+
+                        // NOTE(review): this branch labels the sample with
+                        // mpopi.samples()[j].id while the homozygous branch uses
+                        // pmap->rev_sample_index(j) -- confirm both yield the same ID.
+                        fh << ">CLocus_" << loc->id
+                           << "_Sample_" << mpopi.samples()[j].id
+                           << "_Locus_"  << d[j]->id
+                           << "_Allele_" << k
+                           << " ["       << mpopi.samples()[j].name;
+                        if (strcmp(loc->loc.chr, "un") != 0)
+                            fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == strand_plus ? "+" : "-");
+                        fh << "]\n"
+                           << seq << "\n";
+                    }
+                }
+            }
+
+            delete [] seq;
+        }
+    }
+
+    fh.close();
+
+    return 0;
+}
+
+int
+write_vcf_ordered(map<int, CSLocus *> &catalog,
+                  PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+                  map<int, pair<merget, int> > &merge_map, ofstream &log_fh)
+{
+    //
+    // Write a VCF file as defined here: http://www.1000genomes.org/node/101
+    // Order SNPs by genomic position (and handle overlapping loci).
+    //
+    // One record is written per site returned by OLocTally::order(); every sample
+    // gets GT, DP and AD subfields, plus GL when SNP model calls were loaded.
+    // Output goes to <out_path><out_prefix>.vcf. Returns 0; exits if the file
+    // cannot be opened.
+    //
+
+    string file = out_path + out_prefix + ".vcf";
+    cerr << "Writing ordered population data to VCF file '" << file << "'\n";
+    log_fh << "\n#\n# Generating SNP-based VCF export.\n#\n";
+    VcfWriter writer (file);
+    if (writer.fail()) {
+        cerr << "Error opening VCF file '" << file << "'\n";
+        exit(-1);
+    }
+
+    bool gl = false; // Whether to include the GL genotype subfield.
+    if (input_mode == InputMode::stacks
+        && !debug_flags.count("VCFCOMP")) {
+        gl=true;
+        // Load SNP data so that model likelihoods can be output to VCF file.
+        cerr << "In preparation for VCF export, loading SNP data for " << mpopi.samples().size() << " samples.\n";
+        populate_snp_calls(catalog, pmap, merge_map);
+    }
+
+    VcfHeader header;
+    header.init_meta();
+    header.add_meta(VcfMeta::predefined.at("INFO/NS"));
+    header.add_meta(VcfMeta::predefined.at("INFO/AF"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/GT"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/DP"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/AD"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/GL"));
+    for(auto& s : mpopi.samples()) {
+        header.add_sample(s.name);
+    }
+    writer.write_header(header);
+
+    // We need to order the SNPs taking into account overlapping loci.
+    OLocTally<NucTally> *ord = new OLocTally<NucTally>(psum, log_fh);
+
+    for (auto it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        vector<NucTally *> sites;
+        ord->order(sites, it->second);
+
+        for (uint pos = 0; pos < sites.size(); pos++) {
+            if (catalog.count(sites[pos]->loc_id) == 0) {
+                cerr << "Unable to find locus id " << sites[pos]->loc_id << "\n";
+                continue;
+            }
+            CSLocus* loc = catalog[sites[pos]->loc_id];
+            uint16_t col = sites[pos]->col;
+            int snp_index = loc->snp_index(col);
+            if (snp_index < 0) {
+                cerr << "Warning: unable to locate SNP call in column " << col << " for catalog locus " << loc->id << "\n";
+                continue;
+            }
+            Datum** d = pmap->locus(loc->id);
+
+            const char ref = sites[pos]->p_allele;
+            const char alt = sites[pos]->q_allele;
+            char freq_alt[32];
+            // Bounded write; the alternate allele frequency is 1 - p_freq.
+            snprintf(freq_alt, sizeof(freq_alt), "%0.3f", 1 - sites[pos]->p_freq);
+
+            VcfRecord rec;
+            rec.type = Vcf::RType::expl;
+            rec.chrom = loc->loc.chr;
+            rec.pos = loc->sort_bp(col) + 1;
+            rec.id = to_string(loc->id) + "_" + to_string(col);
+            // Alleles of loci that are not on the plus strand are passed through reverse().
+            rec.alleles.push_back(string(1, loc->loc.strand == strand_plus ? ref : reverse(ref)));
+            rec.alleles.push_back(string(1, loc->loc.strand == strand_plus ? alt : reverse(alt)));
+            rec.qual = ".";
+            rec.filter.push_back("PASS");
+            rec.info.push_back({"NS",to_string(sites[pos]->num_indv)});
+            rec.info.push_back({"AF",freq_alt});
+            rec.format.push_back("GT");
+            rec.format.push_back("DP");
+            rec.format.push_back("AD");
+            if (gl)
+                rec.format.push_back("GL");
+
+            for (int j = 0; j < pmap->sample_cnt(); j++) {
+                stringstream sample;
+
+                if (d[j] == NULL || col >= d[j]->len) {
+                    // Data does not exist.
+                    sample << "./.:0:.,.";
+                    if (gl)
+                        sample << ":.,.,.";
+                } else if (d[j]->model[col] == 'U') {
+                    // Data exists, but the model call was uncertain.
+                    sample << "./.:" << d[j]->tot_depth << ":.,.";
+                    if (gl)
+                        sample << ":.,.,.";
+                } else {
+                    char allele1, allele2;
+                    tally_observed_haplotypes(d[j]->obshap, snp_index, allele1, allele2);
+
+                    if (allele1 == 0) {
+                        // More than two alleles in this sample
+                        sample << "./.:" << d[j]->tot_depth << ":.,.";
+                        if (gl)
+                            sample << ":.,.,.";
+                    } else {
+                        // Write the genotype.
+
+                        int dp1, dp2;
+                        find_datum_allele_depths(d[j], snp_index, allele1, allele2, dp1, dp2);
+
+                        // Likelihood value for the GL subfield. When no SNP call exists
+                        // for this column, warn and write the missing value "." rather
+                        // than indexing past the end of d[j]->snps.
+                        stringstream lr;
+                        if (gl) {
+                            if (col < d[j]->snps.size()) {
+                                lr << - d[j]->snps[col]->lratio;
+                            } else {
+                                cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
+                                lr << ".";
+                            }
+                        }
+
+                        if (allele2 == 0) {
+                            // homozygote
+                            if (allele1 == ref) {
+                                sample << "0/0:" << d[j]->tot_depth << ":" << dp1 << "," << dp2;
+                                if (gl)
+                                    sample << ":" << lr.str() << ",.,.";
+                            } else {
+                                sample << "1/1:" << d[j]->tot_depth << ":" << dp2 << "," << dp1;
+                                if (gl)
+                                    sample << ":.,.," << lr.str();
+                            }
+                        } else {
+                            // heterozygote; AD is always written reference depth first.
+                            sample << "0/1:" << d[j]->tot_depth
+                                   << ":" << (allele1 == ref ? dp1 : dp2) << "," << (allele1 == ref ? dp2 : dp1);
+                            if (gl)
+                                sample << ":.," << lr.str() << ",.";
+                        }
+                    }
+                }
+                rec.samples.push_back(sample.str());
+            }
+            writer.write_record(rec, header);
+        }
+    }
+
+    // The ordering helper was allocated with new above and is not used past this point.
+    delete ord;
+
+    return 0;
+}
+
+int
+write_vcf(map<int, CSLocus *> &catalog,
+          PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+          map<int, pair<merget, int> > &merge_map)
+{
+    //
+    // Write a VCF file as defined here: http://www.1000genomes.org/node/101
+    //
+    // One record is written for every SNP column whose population tally has
+    // exactly two alleles; every sample gets GT, DP and AD subfields, plus GL
+    // when SNP model calls were loaded. Output goes to
+    // <out_path><out_prefix>.vcf. Returns 0; exits if the file cannot be opened.
+    //
+
+    string file = out_path + out_prefix + ".vcf";
+    cerr << "Writing population data to VCF file '" << file << "'\n";
+    VcfWriter writer (file);
+    if (writer.fail()) {
+        cerr << "Error opening VCF file '" << file << "'\n";
+        exit(-1);
+    }
+
+    bool gl = false; // Whether to include the GL genotype subfield.
+    if (input_mode == InputMode::stacks
+        && !debug_flags.count("VCFCOMP")) {
+        gl=true;
+        // Load SNP data so that model likelihoods can be output to VCF file.
+        cerr << "In preparation for VCF export, loading SNP data for " << mpopi.samples().size() << " samples.\n";
+        populate_snp_calls(catalog, pmap, merge_map);
+    }
+
+    //
+    // Output the header.
+    //
+    VcfHeader header;
+    header.init_meta();
+    header.add_meta(VcfMeta::predefined.at("INFO/NS"));
+    header.add_meta(VcfMeta::predefined.at("INFO/AF"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/GT"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/DP"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/AD"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/GL"));
+    for(auto& s : mpopi.samples()) {
+        header.add_sample(s.name);
+    }
+    writer.write_header(header);
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus  *loc;
+    Datum   **d;
+    LocTally *t;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+
+        // We need to order the SNPs so negative and positive strand SNPs are properly ordered.
+        vector<GenPos> ordered_loci;
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+            t   = psum->locus_tally(loc->id);
+
+            for (uint i = 0; i < loc->snps.size(); i++) {
+                uint16_t col = loc->snps[i]->col;
+                if (t->nucs[col].allele_cnt == 2)
+                    ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
+            }
+        }
+        sort(ordered_loci.begin(), ordered_loci.end());
+
+        for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+            loc = catalog[ordered_loci[pos].id];
+            uint16_t col = loc->snps[ordered_loci[pos].snp_index]->col;
+            int snp_index = loc->snp_index(col);
+            if (snp_index < 0) {
+                cerr << "Warning: unable to locate SNP call in column " << col << " for locus #" << loc->id << "\n";
+                continue;
+            }
+            t = psum->locus_tally(loc->id);
+            d = pmap->locus(loc->id);
+
+            const char ref = t->nucs[col].p_allele;
+            const char alt = t->nucs[col].q_allele;
+            char freq_alt[32];
+            // Bounded write; the alternate allele frequency is 1 - p_freq.
+            snprintf(freq_alt, sizeof(freq_alt), "%0.3f", 1 - t->nucs[col].p_freq);
+
+            VcfRecord rec;
+            rec.type = Vcf::RType::expl;
+            rec.chrom = loc->loc.chr;
+            rec.pos = loc->sort_bp(col) + 1;
+            rec.id = to_string(loc->id) + "_" + to_string(col);
+            // Alleles of loci that are not on the plus strand are passed through reverse().
+            rec.alleles.push_back(string(1, loc->loc.strand == strand_plus ? ref : reverse(ref)));
+            rec.alleles.push_back(string(1, loc->loc.strand == strand_plus ? alt : reverse(alt)));
+            rec.qual = ".";
+            rec.filter.push_back("PASS");
+            rec.info.push_back({"NS",to_string(t->nucs[col].num_indv)});
+            rec.info.push_back({"AF",freq_alt});
+            rec.format.push_back("GT");
+            rec.format.push_back("DP");
+            rec.format.push_back("AD");
+            if (gl)
+                rec.format.push_back("GL");
+
+            for (int j = 0; j < pmap->sample_cnt(); j++) {
+                stringstream sample;
+
+                if (d[j] == NULL || col >= uint(d[j]->len)) {
+                    // Data does not exist.
+                    sample << "./.:0:.,.";
+                    if (gl)
+                        sample << ":.,.,.";
+                } else if (d[j]->model[col] == 'U') {
+                    // Data exists, but the model call was uncertain.
+                    sample << "./.:" << d[j]->tot_depth << ":.,.";
+                    if (gl)
+                        sample << ":.,.,.";
+                } else {
+                    char allele1, allele2;
+                    tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, allele1, allele2);
+
+                    if (allele1 == 0) {
+                        // More than two alleles in this sample
+                        sample << "./.:" << d[j]->tot_depth << ":.,.";
+                        if (gl)
+                            sample << ":.,.,.";
+                    } else {
+                        // Write the genotype.
+
+                        int dp1, dp2;
+                        find_datum_allele_depths(d[j], snp_index, allele1, allele2, dp1, dp2);
+
+                        // Likelihood value for the GL subfield. When no SNP call exists
+                        // for this column, warn and write the missing value "." rather
+                        // than indexing past the end of d[j]->snps.
+                        stringstream lr;
+                        if (gl) {
+                            if (col < d[j]->snps.size()) {
+                                lr << - d[j]->snps[col]->lratio;
+                            } else {
+                                cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
+                                lr << ".";
+                            }
+                        }
+
+                        if (allele2 == 0) {
+                            // homozygote
+                            if (allele1 == ref) {
+                                sample << "0/0:" << d[j]->tot_depth << ":" << dp1 << "," << dp2;
+                                if (gl)
+                                    sample << ":" << lr.str() << ",.,.";
+                            } else {
+                                sample << "1/1:" << d[j]->tot_depth << ":" << dp2 << "," << dp1;
+                                if (gl)
+                                    sample << ":.,.," << lr.str();
+                            }
+                        } else {
+                            // heterozygote; AD is always written reference depth first.
+                            sample << "0/1:" << d[j]->tot_depth
+                                   << ":" << (allele1 == ref ? dp1 : dp2) << "," << (allele1 == ref ? dp2 : dp1);
+                            if (gl)
+                                sample << ":.," << lr.str() << ",.";
+                        }
+                    }
+                }
+                rec.samples.push_back(sample.str());
+            }
+            writer.write_record(rec, header);
+        }
+    }
+
+    return 0;
+}
+
+int
+write_vcf_haplotypes(map<int, CSLocus *> &catalog,
+                     PopMap<CSLocus> *pmap,
+                     PopSum<CSLocus> *psum)
+{
+    //
+    // Write a VCF file as defined here: http://samtools.github.io/hts-specs/
+    //XXX Datum::obshap *is not* ordered, I think. See Bitbucket. @Nick (June 2016)
+    //
+    // One record is written per locus that has at least two distinct haplotypes.
+    // The haplotypes, ordered by compare_pair_haplotype, are the record's alleles,
+    // and each sample gets GT and DP subfields. Output goes to
+    // <out_path><out_prefix>.haplotypes.vcf. Returns 0; exits if the file cannot
+    // be opened. The catalog and psum arguments are not used here.
+    //
+
+    string file = out_path + out_prefix + ".haplotypes.vcf";
+    cerr << "Writing population data haplotypes to VCF file '" << file << "'\n";
+    VcfWriter writer (file);
+    if (writer.fail()) {
+        cerr << "Error opening VCF file '" << file << "'\n";
+        exit(-1);
+    }
+
+    VcfHeader header;
+    header.init_meta();
+    header.add_meta(VcfMeta::predefined.at("INFO/NS"));
+    header.add_meta(VcfMeta::predefined.at("INFO/AF"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/GT"));
+    header.add_meta(VcfMeta::predefined.at("FORMAT/DP"));
+    for(auto& s : mpopi.samples()) {
+        header.add_sample(s.name);
+    }
+    writer.write_header(header);
+
+    CSLocus *loc;
+    Datum  **d;
+    char     allele[id_len];
+
+    for (auto chr = pmap->ordered_loci.begin(); chr != pmap->ordered_loci.end(); chr++) {
+        for (uint pos = 0; pos < chr->second.size(); pos++) {
+            loc = chr->second[pos];
+            d   = pmap->locus(loc->id);
+
+            map<string, double> hap_freq;
+            const double n_alleles = count_haplotypes_at_locus(0, pmap->sample_cnt() - 1, d, hap_freq);
+            if (hap_freq.size() <= 1)
+                // Monomorphic locus.
+                // XXX What does [hap_freq.size()==1] mean ? @Nick (July 2016)
+                continue;
+            for(auto& h : hap_freq)
+                // Convert counts to freqs.
+                h.second /= n_alleles;
+
+            VcfRecord rec;
+            rec.type = Vcf::RType::expl;
+            rec.chrom = loc->loc.chr;
+            rec.pos = loc->sort_bp() + 1;
+            rec.id = to_string(loc->id);
+
+            //
+            // Order the haplotypes with compare_pair_haplotype and record the ordered
+            // position of each haplotype; that position is its allele index in the record.
+            //
+            vector<pair<string, double> > ordered_hap (hap_freq.begin(), hap_freq.end());
+            sort(ordered_hap.begin(), ordered_hap.end(), compare_pair_haplotype);
+            map<string, int> hap_index;
+            for (size_t i = 0; i < ordered_hap.size(); i++) {
+                string h = ordered_hap[i].first;
+                // NOTE(review): rev_comp() is handed a C string and its result is copied
+                // into a std::string -- confirm it does not return a buffer that must be freed.
+                rec.alleles.push_back(loc->loc.strand == strand_plus ? h : string(rev_comp(h.c_str())));
+                hap_index[h] = i;
+            }
+
+            rec.qual = ".";
+            rec.filter.push_back("PASS");
+
+            //info
+            stringstream ss;
+            ss << n_alleles/2;
+            rec.info.push_back({"NS",ss.str()});
+            rec.info.push_back({"AF",string()});
+            string& af=rec.info.back().second;
+            // AF lists the frequency of every allele except the first one.
+            snprintf(allele, id_len, "%0.3f", ordered_hap[1].second); //NB. hap_freq.size() >= 2
+            af += allele;
+            for (auto h=ordered_hap.begin()+2; h!=ordered_hap.end(); ++h) {
+                snprintf(allele, id_len, "%0.3f", h->second);
+                af += string(",") + allele;
+            }
+
+            //format
+            rec.format.push_back("GT");
+            rec.format.push_back("DP");
+
+            for (int j = 0; j < pmap->sample_cnt(); j++) {
+                stringstream sample;
+
+                if (d[j] == NULL) {
+                    // Data does not exist.
+                    sample << "./.:0";
+                } else if (d[j]->obshap.size() > 2) {
+                    // More than two alleles in this sample
+                    sample << "./.:" << d[j]->tot_depth;
+                } else if (d[j]->obshap.size() == 1) {
+                    // Homozygote.
+                    char* h = d[j]->obshap[0];
+                    int i = uncalled_haplotype(h) ? -1 : hap_index.at(h);
+                    if(i >= 0)
+                        sample << i << "/" << i << ":" << d[j]->tot_depth;
+                    else
+                        sample << "./.:" << d[j]->tot_depth;
+                } else {
+                    // Heterozygote.
+                    char* h1 = d[j]->obshap[0];
+                    char* h2 = d[j]->obshap[1];
+                    int i1 = uncalled_haplotype(h1) ? -1 : hap_index.at(h1);
+                    int i2 = uncalled_haplotype(h2) ? -1 : hap_index.at(h2);
+                    if(i1 >= 0 && i2 >= 0)
+                        sample << (i1 < i2 ? i1 : i2) << "/" << (i1 < i2 ? i2 : i1) << ":" << d[j]->tot_depth;
+                    else if (i1 >= 0)
+                        sample << i1 << "/.:" << d[j]->tot_depth;
+                    else if (i2 >= 0)
+                        sample << i2 << "/.:" << d[j]->tot_depth;
+                    else
+                        // Both haplotypes are uncalled: write a missing genotype so the
+                        // sample column is never left empty.
+                        sample << "./.:" << d[j]->tot_depth;
+                }
+                rec.samples.push_back(sample.str());
+            }
+            writer.write_record(rec, header);
+        }
+    }
+    return 0;
+}
+
+int
+write_genepop(map<int, CSLocus *> &catalog,
+              PopMap<CSLocus> *pmap,
+              PopSum<CSLocus> *psum)
+{
+    //
+    // Write a GenePop file as defined here: http://kimura.univ-montp2.fr/~rousset/Genepop.htm
+    //
+    // Layout: a title line, one comma-separated line naming every exported site
+    // as <locus id>_<column>, then for each population a "pop" line followed by
+    // one line per sample holding a four-digit genotype per site. Only sites
+    // whose population tally has exactly two alleles are exported.
+    // Returns 0; exits with status 1 if the file cannot be opened.
+    //
+    const string file = out_path + out_prefix + ".genepop";
+
+    cerr << "Writing population data to GenePop file '" << file << "'\n";
+
+    ofstream fh(file.c_str(), ofstream::out);
+    if (fh.fail()) {
+        cerr << "Error opening GenePop file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Title line: program version and the current date.
+    //
+    time_t now;
+    char   date[32];
+    time(&now);
+    strftime(date, 32, "%B %d, %Y", localtime(&now));
+
+    fh << "Stacks version " << VERSION << "; Genepop version 4.1.3; " << date << "\n";
+
+    //
+    // Collect the label of every site that will be exported, then write
+    // them all on the second line, comma-separated.
+    //
+    vector<string> site_labels;
+    for (auto &entry : catalog) {
+        CSLocus *locus = entry.second;
+        for (uint k = 0; k < locus->snps.size(); k++) {
+            int       site  = locus->snps[k]->col;
+            LocTally *tally = psum->locus_tally(locus->id);
+
+            // Sites that are fixed in all populations or have too many alleles are not output.
+            if (tally->nucs[site].allele_cnt != 2)
+                continue;
+
+            stringstream label;
+            label << locus->id << "_" << site;
+            site_labels.push_back(label.str());
+        }
+    }
+    for (size_t m = 0; m < site_labels.size(); m++) {
+        if (m > 0)
+            fh << ",";
+        fh << site_labels[m];
+    }
+    fh << "\n";
+
+    // Two-digit GenePop code of each nucleotide.
+    map<char, string> nuc_map;
+    nuc_map['A'] = "01";
+    nuc_map['C'] = "02";
+    nuc_map['G'] = "03";
+    nuc_map['T'] = "04";
+
+    char p_allele, q_allele;
+
+    for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+        fh << "pop\n";
+
+        for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+
+            fh << mpopi.samples()[j].name << ",";
+
+            for (auto &entry : catalog) {
+                CSLocus  *locus = entry.second;
+                Datum   **d     = pmap->locus(locus->id);
+                LocSum  **s     = psum->locus(locus->id);
+                LocTally *t     = psum->locus_tally(locus->id);
+
+                for (uint snp = 0; snp < locus->snps.size(); snp++) {
+                    uint col = locus->snps[snp]->col;
+
+                    if (t->nucs[col].allele_cnt != 2)
+                        continue;
+
+                    //
+                    // "0000" is the missing genotype. It is written when the site has more
+                    // than two alleles in this population or was filtered, when the sample
+                    // has no data at this column, when the model call was uncertain, or
+                    // when no allele could be tallied from the observed haplotypes.
+                    //
+                    string genotype = "0000";
+
+                    const bool site_excluded = s[p]->nucs[col].incompatible_site ||
+                                               s[p]->nucs[col].filtered_site;
+
+                    if (!site_excluded &&
+                        d[j] != NULL && col < uint(d[j]->len) &&
+                        d[j]->model[col] != 'U') {
+                        //
+                        // Tally up the nucleotide calls.
+                        //
+                        tally_observed_haplotypes(d[j]->obshap, snp, p_allele, q_allele);
+
+                        if (p_allele != 0 || q_allele != 0) {
+                            // When only one allele was tallied it is written twice.
+                            const char first  = p_allele != 0 ? p_allele : q_allele;
+                            const char second = q_allele != 0 ? q_allele : p_allele;
+                            genotype = nuc_map[first] + nuc_map[second];
+                        }
+                    }
+
+                    fh << "\t" << genotype;
+                }
+            }
+            fh << "\n";
+        }
+    }
+
+    fh.close();
+
+    return 0;
+}
+
+//
+// write_genepop_ordered
+//   Write a GenePop file as defined here: http://kimura.univ-montp2.fr/~rousset/Genepop.htm
+//
+//   The file is written to <out_path><out_prefix>.genepop. Line one is a header,
+//   line two is a comma-separated list of "<locus ID>_<column>" site names, and
+//   each population is introduced by a "pop" line followed by one line per sample.
+//   Sites are taken from the OLocTally ordering so overlapping loci are accounted for.
+//
+//   catalog - catalog loci keyed by locus ID; every ordered site is looked up here.
+//   pmap    - supplies the per-chromosome locus lists and the per-sample data.
+//   psum    - supplies the per-population summary of each locus.
+//   log_fh  - log stream handed to the OLocTally orderer.
+//
+//   Returns 0. Calls exit(1) if the output file cannot be opened.
+//
+int
+write_genepop_ordered(map<int, CSLocus *> &catalog,
+                      PopMap<CSLocus> *pmap,
+                      PopSum<CSLocus> *psum,
+                      ofstream &log_fh)
+{
+    string file = out_path + out_prefix + ".genepop";
+
+    cerr << "Writing population data to GenePop file '" << file << "'\n";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening GenePop file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Obtain the current date.
+    //
+    time_t     rawtime;
+    struct tm *timeinfo;
+    char       date[32];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(date, 32, "%B %d, %Y", timeinfo);
+
+    //
+    // Output the header line.
+    //
+    fh << "Stacks version " << VERSION << "; Genepop version 4.1.3; " << date << "\n";
+
+    map<string, vector<NucTally *> > genome_sites;
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus *loc;
+    Datum  **d;
+    LocSum **s;
+    uint     col, snp_index;
+    char     p_allele, q_allele;
+
+    //
+    // We need to order the SNPs to take into account overlapping loci. The orderer
+    // lives on the stack so it is released when the function returns (it was
+    // previously allocated with new and never freed).
+    //
+    OLocTally<NucTally> ord(psum, log_fh);
+
+    //
+    // Output all the loci on the second line, comma-separated. The separator is
+    // written *before* every site except the first, so a chromosome that contributes
+    // no sites can never leave a dangling comma at the end of the line.
+    //
+    bool first_site = true;
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        vector<NucTally *> &sites = genome_sites[it->first];
+        ord.order(sites, it->second);
+
+        for (uint pos = 0; pos < sites.size(); pos++) {
+            if (!first_site)
+                fh << ",";
+            first_site = false;
+            fh << sites[pos]->loc_id << "_" << sites[pos]->col;
+        }
+    }
+    fh << "\n";
+
+    //
+    // GenePop encodes each allele as a two-digit number.
+    //
+    map<char, string> nuc_map;
+    nuc_map['A'] = "01";
+    nuc_map['C'] = "02";
+    nuc_map['G'] = "03";
+    nuc_map['T'] = "04";
+
+    for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+        fh << "pop\n";
+        for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+
+            fh << mpopi.samples()[j].name << ",";
+
+            for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+                vector<NucTally *> &sites = genome_sites[it->first];
+
+                for (uint pos = 0; pos < sites.size(); pos++) {
+                    loc = catalog[sites[pos]->loc_id];
+                    s   = psum->locus(loc->id);
+                    d   = pmap->locus(loc->id);
+                    col = sites[pos]->col;
+
+                    if (s[p]->nucs[col].incompatible_site ||
+                        s[p]->nucs[col].filtered_site) {
+                        //
+                        // This site contains more than two alleles in this population or was filtered
+                        // due to a minor allele frequency that is too low.
+                        //
+                        fh << "\t0000";
+                    } else if (d[j] == NULL || col >= uint(d[j]->len)) {
+                        //
+                        // Data does not exist.
+                        //
+                        fh << "\t0000";
+                    } else if (d[j]->model[col] == 'U') {
+                        //
+                        // Data exists, but the model call was uncertain.
+                        //
+                        fh << "\t0000";
+                    } else {
+                        snp_index = loc->snp_index(col);
+                        //
+                        // Tally up the nucleotide calls.
+                        //
+                        tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
+
+                        if (p_allele == 0 && q_allele == 0) {
+                            // More than two potential alleles.
+                            fh << "\t0000";
+                        } else if (p_allele == 0) {
+                            // Only the q allele was observed: homozygous q.
+                            fh << "\t" << nuc_map[q_allele] << nuc_map[q_allele];
+                        } else if (q_allele == 0) {
+                            // Only the p allele was observed: homozygous p.
+                            fh << "\t" << nuc_map[p_allele] << nuc_map[p_allele];
+                        } else {
+                            fh << "\t" << nuc_map[p_allele] << nuc_map[q_allele];
+                        }
+                    }
+                }
+            }
+            fh << "\n";
+        }
+    }
+
+    fh.close();
+
+    return 0;
+}
+
+//
+// write_structure
+//   Write a Structure file as defined here: http://pritch.bsd.uchicago.edu/structure.html
+//
+//   Output goes to <out_path><out_prefix>.structure.tsv. A comment line is followed
+//   by a tab-separated row of "<locus ID>_<column>" marker names; every sample then
+//   contributes two rows (first its p allele, then its q allele), each made of the
+//   sample name, the population name and one code per marker (A=1, C=2, G=3, T=4,
+//   0 when no call is available). Every SNP column whose locus tally reports exactly
+//   two alleles is written.
+//
+//   Returns 0. Calls exit(1) if the output file cannot be opened.
+//
+int
+write_structure(map<int, CSLocus *> &catalog,
+                PopMap<CSLocus> *pmap,
+                PopSum<CSLocus> *psum)
+{
+    string file = out_path + out_prefix + ".structure.tsv";
+
+    cerr << "Writing population data to Structure file '" << file << "'...";
+
+    ofstream fh(file.c_str(), ofstream::out);
+    if (fh.fail()) {
+        cerr << "Error opening Structure file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Format the current date for the header.
+    //
+    char   date[32];
+    time_t now;
+    time(&now);
+    strftime(date, 32, "%B %d, %Y", localtime(&now));
+
+    fh << "# Stacks v" << VERSION << "; " << " Structure v2.3; " << date << "\n";
+
+    //
+    // Marker name row.
+    //
+    for (auto &chr : pmap->ordered_loci) {
+        for (uint n = 0; n < chr.second.size(); n++) {
+            CSLocus  *cloc  = chr.second[n];
+            LocTally *tally = psum->locus_tally(cloc->id);
+
+            for (uint k = 0; k < cloc->snps.size(); k++) {
+                uint snp_col = cloc->snps[k]->col;
+                if (tally->nucs[snp_col].allele_cnt != 2)
+                    continue;
+                fh << "\t" << cloc->id << "_" << snp_col;
+            }
+        }
+    }
+    fh << "\n";
+
+    map<char, string> nuc_map;
+    nuc_map['A'] = "1";
+    nuc_map['C'] = "2";
+    nuc_map['G'] = "3";
+    nuc_map['T'] = "4";
+
+    char p_allele, q_allele;
+
+    for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+        for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+            //
+            // Two rows per sample: row 0 prints the p allele, row 1 the q allele.
+            // When only one of the two alleles was observed it is printed on both rows.
+            //
+            for (int row = 0; row < 2; row++) {
+                fh << mpopi.samples()[j].name << "\t" << pop.name;
+
+                for (auto &chr : pmap->ordered_loci) {
+                    for (uint n = 0; n < chr.second.size(); n++) {
+                        CSLocus  *cloc  = chr.second[n];
+                        LocSum  **sums  = psum->locus(cloc->id);
+                        Datum   **data  = pmap->locus(cloc->id);
+                        LocTally *tally = psum->locus_tally(cloc->id);
+
+                        for (uint k = 0; k < cloc->snps.size(); k++) {
+                            uint snp_col = cloc->snps[k]->col;
+
+                            //
+                            // If this site is fixed in all populations or has too many
+                            // alleles don't output it.
+                            //
+                            if (tally->nucs[snp_col].allele_cnt != 2)
+                                continue;
+
+                            //
+                            // No usable call: the site is incompatible or filtered in this
+                            // population, the sample has no data here, or the model call
+                            // was uncertain.
+                            //
+                            bool no_call = sums[p]->nucs[snp_col].incompatible_site ||
+                                           sums[p]->nucs[snp_col].filtered_site     ||
+                                           data[j] == NULL                          ||
+                                           snp_col >= uint(data[j]->len)            ||
+                                           data[j]->model[snp_col] == 'U';
+                            if (no_call) {
+                                fh << "\t" << "0";
+                                continue;
+                            }
+
+                            //
+                            // Tally up the nucleotide calls.
+                            //
+                            tally_observed_haplotypes(data[j]->obshap, k, p_allele, q_allele);
+
+                            char wanted   = (row == 0) ? p_allele : q_allele;
+                            char fallback = (row == 0) ? q_allele : p_allele;
+
+                            if (p_allele == 0 && q_allele == 0)
+                                fh << "\t" << "0";
+                            else if (wanted == 0)
+                                fh << "\t" << nuc_map[fallback];
+                            else
+                                fh << "\t" << nuc_map[wanted];
+                        }
+                    }
+                }
+                fh << "\n";
+            }
+        }
+    }
+
+    fh.close();
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_structure_ordered
+//   Write a Structure file as defined here: http://pritch.bsd.uchicago.edu/structure.html
+//
+//   Same layout as write_structure() (comment line, marker-name row, two rows per
+//   sample), but the markers of each chromosome come from the OLocTally ordering so
+//   that overlapping loci are taken into account. Every site returned by the
+//   orderer is written; nothing here restricts the output to one SNP per locus.
+//
+//   catalog - catalog loci keyed by locus ID; every ordered site is looked up here.
+//   pmap    - supplies the per-chromosome locus lists and the per-sample data.
+//   psum    - supplies the per-population summary of each locus.
+//   log_fh  - log stream handed to the OLocTally orderer.
+//
+//   Returns 0. Calls exit(1) if the output file cannot be opened.
+//
+int
+write_structure_ordered(map<int, CSLocus *> &catalog,
+                        PopMap<CSLocus> *pmap,
+                        PopSum<CSLocus> *psum,
+                        ofstream &log_fh)
+{
+    string file = out_path + out_prefix + ".structure.tsv";
+
+    cerr << "Writing population data to Structure file '" << file << "'...";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening Structure file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Obtain the current date.
+    //
+    time_t     rawtime;
+    struct tm *timeinfo;
+    char       date[32];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(date, 32, "%B %d, %Y", timeinfo);
+
+    //
+    // Output the header.
+    //
+    fh << "# Stacks v" << VERSION << "; " << " Structure v2.3; " << date << "\n";
+
+    map<string, vector<NucTally *> > genome_sites;
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus *loc;
+    Datum  **d;
+    LocSum **s;
+
+    //
+    // We need to order the SNPs to take into account overlapping loci. The orderer
+    // lives on the stack so it is released when the function returns (it was
+    // previously allocated with new and never freed).
+    //
+    OLocTally<NucTally> ord(psum, log_fh);
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        vector<NucTally *> &sites = genome_sites[it->first];
+        ord.order(sites, it->second);
+
+        for (uint pos = 0; pos < sites.size(); pos++)
+            fh << "\t" << sites[pos]->loc_id << "_" << sites[pos]->col;
+    }
+    fh << "\n";
+
+    map<char, string> nuc_map;
+    nuc_map['A'] = "1";
+    nuc_map['C'] = "2";
+    nuc_map['G'] = "3";
+    nuc_map['T'] = "4";
+
+    char p_allele, q_allele;
+    uint col, snp_index;
+
+    for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+        for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+            //
+            // Output all the sites for this sample twice: row 0 prints the p allele,
+            // row 1 the q allele. When only one allele was observed it is printed on
+            // both rows.
+            //
+            for (int row = 0; row < 2; row++) {
+                fh << mpopi.samples()[j].name << "\t" << pop.name;
+
+                for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+                    vector<NucTally *> &sites = genome_sites[it->first];
+
+                    for (uint pos = 0; pos < sites.size(); pos++) {
+                        loc = catalog[sites[pos]->loc_id];
+                        s   = psum->locus(loc->id);
+                        d   = pmap->locus(loc->id);
+                        col = sites[pos]->col;
+
+                        if (s[p]->nucs[col].incompatible_site ||
+                            s[p]->nucs[col].filtered_site) {
+                            //
+                            // This site contains more than two alleles in this population or was filtered
+                            // due to a minor allele frequency that is too low.
+                            //
+                            fh << "\t" << "0";
+                        } else if (d[j] == NULL || col >= uint(d[j]->len)) {
+                            //
+                            // Data does not exist.
+                            //
+                            fh << "\t" << "0";
+                        } else if (d[j]->model[col] == 'U') {
+                            //
+                            // Data exists, but the model call was uncertain.
+                            //
+                            fh << "\t" << "0";
+                        } else {
+                            snp_index = loc->snp_index(col);
+                            //
+                            // Tally up the nucleotide calls.
+                            //
+                            tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
+
+                            char wanted   = (row == 0) ? p_allele : q_allele;
+                            char fallback = (row == 0) ? q_allele : p_allele;
+
+                            if (p_allele == 0 && q_allele == 0)
+                                fh << "\t" << "0";
+                            else if (wanted == 0)
+                                fh << "\t" << nuc_map[fallback];
+                            else
+                                fh << "\t" << nuc_map[wanted];
+                        }
+                    }
+                }
+                fh << "\n";
+            }
+        }
+    }
+
+    fh.close();
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_hzar
+//   Write a Hybrid Zone Analysis using R (HZAR) file as defined here:
+//   http://cran.r-project.org/web/packages/hzar/hzar.pdf
+//
+//   Output goes to <out_path><out_prefix>.hzar.csv. The column header is
+//   "Population,Distance" followed by three columns (.A, .B, .N) for every SNP
+//   column whose locus tally reports exactly two alleles. Each population row
+//   starts with the population name and an empty Distance field, then gives, per
+//   site, the two allele frequencies (ordered by the tally's p allele) and twice
+//   the number of individuals; sites without usable data are written as 0,0,0.
+//
+//   Returns 0. Calls exit(1) if the output file cannot be opened.
+//
+int
+write_hzar(map<int, CSLocus *> &catalog,
+           PopMap<CSLocus> *pmap,
+           PopSum<CSLocus> *psum)
+{
+    string file = out_path + out_prefix + ".hzar.csv";
+
+    cerr << "Writing population data to HZAR file '" << file << "'...";
+
+    ofstream fh(file.c_str(), ofstream::out);
+    if (fh.fail()) {
+        cerr << "Error opening HZAR file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Format the current date for the header.
+    //
+    char   date[32];
+    time_t now;
+    time(&now);
+    strftime(date, 32, "%B %d, %Y", localtime(&now));
+
+    //
+    // Comment line, then the start of the column header.
+    //
+    fh << "# Stacks v" << VERSION << "; " << " HZAR v0.2-5; " << date << "\n"
+       << "Population" << "," << "Distance";
+
+    CSLocus  *cloc;
+    LocSum  **sums;
+    LocTally *tally;
+
+    //
+    // Three header columns per bi-allelic site.
+    //
+    for (auto &chr : pmap->ordered_loci) {
+        for (uint n = 0; n < chr.second.size(); n++) {
+            cloc  = chr.second[n];
+            sums  = psum->locus(cloc->id);
+            tally = psum->locus_tally(cloc->id);
+
+            for (uint k = 0; k < cloc->snps.size(); k++) {
+                uint snp_col = cloc->snps[k]->col;
+                if (tally->nucs[snp_col].allele_cnt != 2)
+                    continue;
+
+                fh << "," << cloc->id << "_" << snp_col << ".A"
+                   << "," << cloc->id << "_" << snp_col << ".B"
+                   << "," << cloc->id << "_" << snp_col << ".N";
+            }
+        }
+    }
+    fh << "\n";
+
+    //
+    // One row per population.
+    //
+    for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+        // The trailing comma leaves the Distance column empty.
+        fh << pop.name << ",";
+
+        for (auto &chr : pmap->ordered_loci) {
+            for (uint n = 0; n < chr.second.size(); n++) {
+                cloc  = chr.second[n];
+                sums  = psum->locus(cloc->id);
+                tally = psum->locus_tally(cloc->id);
+
+                for (uint k = 0; k < cloc->snps.size(); k++) {
+                    uint snp_col = cloc->snps[k]->col;
+
+                    //
+                    // If this site is fixed in all populations or has too many alleles
+                    // don't output it.
+                    //
+                    if (tally->nucs[snp_col].allele_cnt != 2)
+                        continue;
+
+                    bool unusable = sums[p]->nucs[snp_col].num_indv == 0     ||
+                                    sums[p]->nucs[snp_col].incompatible_site ||
+                                    sums[p]->nucs[snp_col].filtered_site;
+                    if (unusable) {
+                        fh << ",0,0,0";
+                        continue;
+                    }
+
+                    //
+                    // Column .A always refers to the tally's p allele; flip the
+                    // population frequency when this population's p nucleotide differs.
+                    //
+                    if (tally->nucs[snp_col].p_allele == sums[p]->nucs[snp_col].p_nuc)
+                        fh << "," << sums[p]->nucs[snp_col].p
+                           << "," << 1 - sums[p]->nucs[snp_col].p << ",";
+                    else
+                        fh << "," << 1 - sums[p]->nucs[snp_col].p
+                           << "," << sums[p]->nucs[snp_col].p << ",";
+
+                    fh << sums[p]->nucs[snp_col].num_indv * 2;
+                }
+            }
+        }
+        fh << "\n";
+    }
+
+    fh.close();
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_treemix
+//   Write a TreeMix file (Pickrell and Pritchard, 2012 PLoS Genetics)
+//   https://bitbucket.org/nygcresearch/treemix/wiki/Home
+//
+//   Two files are produced:
+//     <out_path><out_prefix>.treemix      a comment line, a space-separated line of
+//                                         population names, then one line per site
+//                                         holding a "p_count,q_count" pair for
+//                                         every population;
+//     <out_path><out_prefix>.treemix.log  a tab-separated table mapping each site
+//                                         line number to its locus ID, column,
+//                                         chromosome and basepair.
+//   Only SNP columns whose locus tally reports exactly two alleles are written.
+//
+//   Returns 0. Calls exit(1) if either output file cannot be opened.
+//
+int
+write_treemix(map<int, CSLocus *> &catalog,
+              PopMap<CSLocus> *pmap,
+              PopSum<CSLocus> *psum)
+{
+    string file = out_path + out_prefix + ".treemix";
+
+    cerr << "Writing population data to TreeMix file '" << file << "'; ";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening TreeMix file '" << file << "'\n";
+        exit(1);
+    }
+
+    file += ".log";
+
+    cerr << "logging nucleotide positions to '" << file << "'...";
+
+    ofstream log_fh(file.c_str(), ofstream::out);
+
+    if (log_fh.fail()) {
+        // This message used to say "Phylip Log file", a leftover from another exporter.
+        cerr << "Error opening TreeMix log file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Obtain the current date.
+    //
+    time_t     rawtime;
+    struct tm *timeinfo;
+    char       date[32];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(date, 32, "%B %d, %Y", timeinfo);
+
+    log_fh << "# Stacks v" << VERSION << "; " << " TreeMix v1.1; " << date << "\n"
+           << "# Line\tLocus ID\tColumn\tChr\tBasepair\n";
+
+    //
+    // Output the header.
+    //
+    fh << "# Stacks v" << VERSION << "; " << " TreeMix v1.1; " << date << "\n";
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus  *loc;
+    LocSum  **s;
+    LocTally *t;
+
+    //
+    // Output a space-separated list of the populations on the first line.
+    // Each name is followed by a space; substr() drops the final one.
+    //
+    stringstream sstr;
+    for (auto& pop : mpopi.pops())
+        sstr << pop.name << " ";
+
+    fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
+
+    double   p_freq, p_cnt, q_cnt, allele_cnt;
+    long int line = 1;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+
+            s = psum->locus(loc->id);
+            t = psum->locus_tally(loc->id);
+
+            for (uint i = 0; i < loc->snps.size(); i++) {
+                uint col = loc->snps[i]->col;
+
+                //
+                // If this site is fixed in all populations or has too many alleles don't output it.
+                //
+                if (t->nucs[col].allele_cnt != 2)
+                    continue;
+
+                // Start a fresh line buffer for this site.
+                sstr.str("");
+
+                for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+
+                    if (s[p]->nucs[col].num_indv == 0 ||
+                        s[p]->nucs[col].incompatible_site ||
+                        s[p]->nucs[col].filtered_site) {
+                        sstr << "0,0 ";
+                        continue;
+                    }
+
+                    //
+                    // Express the frequency relative to the tally's p allele, flipping it
+                    // when this population's p nucleotide is the other allele.
+                    //
+                    p_freq = (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc) ?
+                        s[p]->nucs[col].p :
+                        1 - s[p]->nucs[col].p;
+
+                    //
+                    // Convert the frequency back into whole allele counts
+                    // (two alleles per individual).
+                    //
+                    allele_cnt = s[p]->nucs[col].num_indv * 2;
+                    p_cnt      = round(allele_cnt * p_freq);
+                    q_cnt      = allele_cnt - p_cnt;
+                    sstr << (int) p_cnt << "," << (int) q_cnt << " ";
+                }
+
+                // Nothing was buffered (no populations): do not emit an empty line.
+                if (sstr.str().length() == 0)
+                    continue;
+
+                fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
+                log_fh << line << "\t" << loc->id << "\t" << col << "\t" << loc->loc.chr << "\t" << loc->sort_bp(col) + 1 << "\n";
+                line++;
+            }
+        }
+    }
+
+    fh.close();
+    log_fh.close();
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_fastphase
+//   Write a fastPHASE file as defined here: http://stephenslab.uchicago.edu/software.html
+//
+//   Data will be written as independent, bi-allelic SNPs. One file is written per
+//   chromosome, named <out_path><out_prefix>.<chromosome>.fastphase.inp. Each file
+//   holds the number of samples, the number of sites, a "P" line of 1-based
+//   basepair positions, a line of 'S' markers, and then for every sample its name
+//   followed by two genotype lines (p allele, then q allele; '?' where no call
+//   is available).
+//
+//   Returns 0. Calls exit(1) if an output file cannot be opened.
+//
+int
+write_fastphase(map<int, CSLocus *> &catalog,
+                PopMap<CSLocus> *pmap,
+                PopSum<CSLocus> *psum)
+{
+    cerr << "Writing population data to fastPHASE files...";
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus  *loc;
+    Datum   **d;
+    LocSum  **s;
+    LocTally *t;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+
+        string file = out_path + out_prefix + "." + it->first + ".fastphase.inp";
+
+        ofstream fh(file.c_str(), ofstream::out);
+
+        if (fh.fail()) {
+            cerr << "Error opening fastPHASE file '" << file << "'\n";
+            exit(1);
+        }
+
+        //
+        // Collect every bi-allelic site on this chromosome and determine an ordering
+        // that can take into account overlapping RAD sites. The size of this list is
+        // also the site count for the header, so no separate counting scan is needed.
+        //
+        uint col;
+        vector<GenPos> ordered_loci;
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+            t   = psum->locus_tally(loc->id);
+
+            for (uint i = 0; i < loc->snps.size(); i++) {
+                col = loc->snps[i]->col;
+                if (t->nucs[col].allele_cnt == 2)
+                    ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
+            }
+        }
+        sort(ordered_loci.begin(), ordered_loci.end());
+
+        //
+        // Output the number of individuals and the total number of SNP sites.
+        //
+        fh << mpopi.samples().size() << "\n"
+           << ordered_loci.size()    << "\n";
+
+        //
+        // Output the position of each site according to its basepair.
+        //
+        fh << "P";
+        for (uint pos = 0; pos < ordered_loci.size(); pos++)
+            fh << " " << ordered_loci[pos].bp +1;
+        fh << "\n";
+
+        //
+        // Output a line of 'S' characters, one per site, indicating that these are SNP markers.
+        //
+        fh << string(ordered_loci.size(), 'S') << '\n';
+
+        //
+        // Now output each sample name followed by a new line, then all of the genotypes for that sample
+        // on two lines.
+        //
+        char         p_allele, q_allele;
+        string       gtypes_str;
+        stringstream gtypes;
+
+        for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+            const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+            for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+
+                fh << mpopi.samples()[j].name << "\n";
+
+                //
+                // Row 0 prints the p allele of every site, row 1 the q allele. When only
+                // one allele was observed it is printed on both rows.
+                //
+                for (int row = 0; row < 2; row++) {
+                    gtypes.str("");
+
+                    for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+                        loc = catalog[ordered_loci[pos].id];
+                        col = loc->snps[ordered_loci[pos].snp_index]->col;
+
+                        s = psum->locus(loc->id);
+                        d = pmap->locus(loc->id);
+
+                        //
+                        // ordered_loci only holds sites whose tally reported exactly two
+                        // alleles, so that test is not repeated here.
+                        //
+                        if (s[p]->nucs[col].incompatible_site ||
+                            s[p]->nucs[col].filtered_site) {
+                            //
+                            // This site contains more than two alleles in this population or was filtered
+                            // due to a minor allele frequency that is too low.
+                            //
+                            gtypes << "? ";
+
+                        } else if (d[j] == NULL || col >= uint(d[j]->len)) {
+                            //
+                            // Data does not exist.
+                            //
+                            gtypes << "? ";
+
+                        } else if (d[j]->model[col] == 'U') {
+                            //
+                            // Data exists, but the model call was uncertain.
+                            //
+                            gtypes << "? ";
+
+                        } else {
+                            //
+                            // Tally up the nucleotide calls.
+                            //
+                            tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+                            char wanted   = (row == 0) ? p_allele : q_allele;
+                            char fallback = (row == 0) ? q_allele : p_allele;
+
+                            if (p_allele == 0 && q_allele == 0)
+                                gtypes << "? ";
+                            else if (wanted == 0)
+                                gtypes << fallback << " ";
+                            else
+                                gtypes << wanted << " ";
+                        }
+                    }
+
+                    // Every entry ends in a space; drop the final one.
+                    gtypes_str = gtypes.str();
+                    fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
+                }
+            }
+        }
+
+        fh.close();
+    }
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_phase
+//   Write a PHASE file as defined here: http://stephenslab.uchicago.edu/software.html
+//
+//   Data will be written as mixture of multiple allele, linked RAD sites
+//   (SNPs within a single RAD locus are already phased), and bi-allelic SNPs. We
+//   will write one file per chromosome, named
+//   <out_path><out_prefix>.<chromosome>.phase.inp.
+//
+//   Each file holds the number of samples, the number of markers, a "P" line of
+//   1-based basepair positions, a line of marker types ('S' for a SNP, 'M' for a
+//   multi-allelic haplotype), and then for every sample its name followed by two
+//   genotype lines. Missing calls are written as "?" for SNP markers and "-1" for
+//   haplotype markers.
+//
+//   Returns 0. Calls exit(1) if an output file cannot be opened.
+//
+int
+write_phase(map<int, CSLocus *> &catalog,
+            PopMap<CSLocus> *pmap,
+            PopSum<CSLocus> *psum)
+{
+    cerr << "Writing population data to PHASE files...";
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus  *loc;
+    Datum   **d;
+    LocSum  **s;
+    LocTally *t;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+
+        string file = out_path + out_prefix + "." + it->first + ".phase.inp";
+
+        ofstream fh(file.c_str(), ofstream::out);
+
+        if (fh.fail()) {
+            cerr << "Error opening PHASE file '" << file << "'\n";
+            exit(1);
+        }
+
+        //
+        // We need to determine an ordering for all legitimate loci/SNPs.
+        //
+        uint col;
+        vector<GenPos> ordered_loci;
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+            t   = psum->locus_tally(loc->id);
+
+            if (loc->snps.size() == 0) continue;
+
+            //
+            // Will we output this locus as a haplotype or as a SNP?
+            //
+            if (loc->snps.size() > 1) {
+                //
+                // Check that there aren't too many haplotypes. The original note cites
+                // a PHASE maximum of 50; the cutoff applied here is 40.
+                //
+                if (loc->alleles.size() > 40) continue;
+
+                //
+                // Iterate over the population to determine that this subset of the population
+                // has data at this locus.
+                //
+                d = pmap->locus(loc->id);
+                for (int j = 0; j < pmap->sample_cnt(); j++) {
+                    if (d[j] != NULL &&
+                        d[j]->obshap.size() > 0 &&
+                        d[j]->obshap.size() <= 2) {
+                        //
+                        // Data exists, and there are the correct number of haplotypes.
+                        //
+                        ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
+                        break;
+                    }
+                }
+            } else {
+                col = loc->snps[0]->col;
+
+                if (t->nucs[col].allele_cnt == 2)
+                    ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(col), snp));
+            }
+        }
+        sort(ordered_loci.begin(), ordered_loci.end());
+
+        //
+        // Output the number of individuals and the total number of markers.
+        //
+        fh << mpopi.samples().size() << "\n"
+           << ordered_loci.size()    << "\n";
+
+        //
+        // Output the position of each site according to its basepair.
+        //
+        fh << "P";
+        for (uint pos = 0; pos < ordered_loci.size(); pos++)
+            fh << " " << ordered_loci[pos].bp +1;
+        fh << "\n";
+
+        //
+        // Output a line of 'S' characters for SNP markers, 'M' characters for multiallelic haplotypes.
+        //
+        for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+            if (pos > 0) fh << " ";
+            fh << (ordered_loci[pos].type == snp ? "S" : "M");
+        }
+        fh << "\n";
+
+        //
+        // Now output each sample name followed by a new line, then all of the genotypes for that sample
+        // on two lines.
+        //
+        string       gtypes_str;
+        char         p_allele, q_allele;
+        stringstream gtypes;
+
+        for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+            const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+            for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+
+                fh << mpopi.samples()[j].name << "\n";
+
+                //
+                // Row 0 prints the first allele/haplotype of every marker, row 1 the
+                // second. A sample with a single observed allele or haplotype has it
+                // printed on both rows.
+                //
+                for (int row = 0; row < 2; row++) {
+                    gtypes.str("");
+
+                    for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+                        loc = catalog[ordered_loci[pos].id];
+                        d   = pmap->locus(loc->id);
+
+                        //
+                        // Will we output this locus as a haplotype or as a SNP?
+                        //
+                        if (ordered_loci[pos].type == haplotype) {
+                            if (d[j] == NULL ||
+                                d[j]->obshap.size() == 0 ||
+                                d[j]->obshap.size() > 2) {
+                                //
+                                // Data does not exist, or the sample has no usable haplotype pair
+                                // (none observed, or more than two observed).
+                                //
+                                gtypes << "-1 ";
+                                continue;
+                            }
+
+                            //
+                            // Data exists. We will assume the haplotypes are numbered by their
+                            // position in the loc->strings vector.
+                            //
+                            uint hap   = (row == 1 && d[j]->obshap.size() > 1) ? 1 : 0;
+                            bool found = false;
+                            for (uint k = 0; k < loc->strings.size(); k++) {
+                                if (d[j]->obshap[hap] == loc->strings[k].first) {
+                                    found = true;
+                                    gtypes << k + 1 << " ";
+                                    // Exactly one column per marker.
+                                    break;
+                                }
+                            }
+                            if (found == false) {
+                                cerr << "Unable to find haplotype " << d[j]->obshap[hap] << " from individual "
+                                     << mpopi.samples()[j].name << "; catalog locus: " << loc->id << "\n";
+                                //
+                                // Still emit a missing-data column, otherwise every following
+                                // marker of this sample would be shifted left by one.
+                                //
+                                gtypes << "-1 ";
+                            }
+
+                        } else {
+                            s   = psum->locus(loc->id);
+                            col = loc->snps[ordered_loci[pos].snp_index]->col;
+
+                            if (s[p]->nucs[col].incompatible_site ||
+                                s[p]->nucs[col].filtered_site) {
+                                //
+                                // This site contains more than two alleles in this population or was filtered
+                                // due to a minor allele frequency that is too low.
+                                //
+                                gtypes << "? ";
+
+                            } else if (d[j] == NULL || col >= uint(d[j]->len)) {
+                                //
+                                // Data does not exist.
+                                //
+                                gtypes << "? ";
+
+                            } else if (d[j]->model[col] == 'U') {
+                                //
+                                // Data exists, but the model call was uncertain.
+                                //
+                                gtypes << "? ";
+
+                            } else {
+                                //
+                                // Tally up the nucleotide calls.
+                                //
+                                tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+                                char wanted   = (row == 0) ? p_allele : q_allele;
+                                char fallback = (row == 0) ? q_allele : p_allele;
+
+                                if (p_allele == 0 && q_allele == 0)
+                                    gtypes << "? ";
+                                else if (wanted == 0)
+                                    gtypes << fallback << " ";
+                                else
+                                    gtypes << wanted << " ";
+                            }
+                        }
+                    }
+
+                    // Every entry ends in a space; drop the final one.
+                    gtypes_str = gtypes.str();
+                    fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
+                }
+            }
+        }
+
+        fh.close();
+    }
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_plink: export the population data as a pair of PLINK text files, a markers file
+// (<out_path><out_prefix>.plink.map) and a genotypes file (<out_path><out_prefix>.plink.ped).
+// Only SNP columns whose data-set-wide tally reports exactly two alleles are exported, and
+// both files list them in the same order (chromosome, locus, SNP). The program exits if
+// either file cannot be opened; otherwise the function returns 0.
+//
+int
+write_plink(map<int, CSLocus *> &catalog,
+            PopMap<CSLocus> *pmap,
+            PopSum<CSLocus> *psum)
+{
+    //
+    // Write a PLINK file as defined here: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
+    //
+    // A single markers file and a single genotypes file are written, covering all chromosomes.
+    //
+    cerr << "Writing population data to PLINK files...";
+
+    //
+    // Obtain the current date.
+    //
+    time_t     rawtime;
+    struct tm *timeinfo;
+    char       date[32];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(date, 32, "%B %d, %Y", timeinfo);
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus  *loc;
+    Datum   **d;
+    LocSum  **s;
+    LocTally *t;
+    string    chr;
+
+    //
+    // First, write a markers file containing each marker, the chromosome it falls on,
+    // an empty centiMorgan field, and finally its genomic position in basepairs.
+    //
+    string file = out_path + out_prefix + ".plink.map";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening PLINK markers file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Output the header.
+    //
+    fh << "# Stacks v" << VERSION << "; " << " PLINK v1.07; " << date << "\n";
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        chr = it->first;
+
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+            t   = psum->locus_tally(loc->id);
+
+            for (uint i = 0; i < loc->snps.size(); i++) {
+                uint col = loc->snps[i]->col;
+                if (t->nucs[col].allele_cnt == 2)
+                    fh << chr << "\t"
+                       << loc->id << "_" << col << "\t"
+                       << "0\t"
+                       << loc->sort_bp(col) +1 << "\n";
+            }
+        }
+    }
+    fh.close();
+
+    //
+    // Now output the genotypes in a separate file.
+    //
+    file = out_path + out_prefix + ".plink.ped";
+
+    fh.open(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        // This is the genotypes (.ped) file, not the markers file opened above.
+        cerr << "Error opening PLINK genotypes file '" << file << "'\n";
+        exit(1);
+    }
+
+    fh << "# Stacks v" << VERSION << "; " << " PLINK v1.07; " << date << "\n";
+
+    char p_allele, q_allele;
+
+    //
+    // One line per sample: six leading pedigree columns, then, for each marker,
+    // the genotype of the sample in two successive columns.
+    //
+    for (size_t p=0; p<mpopi.pops().size(); ++p) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[p];
+
+        for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+
+            fh << pop.name << "\t"
+               << mpopi.samples()[j].name << "\t"
+               << "0\t"  // Paternal ID
+               << "0\t"  // Maternal ID
+               << "0\t"  // Sex
+               << "0";   // Phenotype
+
+            for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+                for (uint pos = 0; pos < it->second.size(); pos++) {
+                    loc = it->second[pos];
+
+                    s = psum->locus(loc->id);
+                    d = pmap->locus(loc->id);
+                    t = psum->locus_tally(loc->id);
+
+                    for (uint i = 0; i < loc->snps.size(); i++) {
+                        uint col = loc->snps[i]->col;
+
+                        //
+                        // If this site is fixed in all populations or has too many alleles don't output it.
+                        //
+                        if (t->nucs[col].allele_cnt != 2)
+                            continue;
+                        //
+                        // Output the p and q alleles
+                        //
+                        if (s[p]->nucs[col].incompatible_site ||
+                            s[p]->nucs[col].filtered_site) {
+                            //
+                            // This site contains more than two alleles in this population or was filtered
+                            // due to a minor allele frequency that is too low.
+                            //
+                            fh << "\t" << "0" << "\t" << "0";
+                        } else if (d[j] == NULL || col >= uint(d[j]->len)) {
+                            //
+                            // Data does not exist.
+                            //
+                            fh << "\t" << "0" << "\t" << "0";
+                        } else if (d[j]->model[col] == 'U') {
+                            //
+                            // Data exists, but the model call was uncertain.
+                            //
+                            fh << "\t" << "0" << "\t" << "0";
+                        } else {
+                            //
+                            // Tally up the nucleotide calls.
+                            //
+                            tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
+
+                            if (p_allele == 0 && q_allele == 0)
+                                fh << "\t" << "0" << "\t" << "0";
+                            else if (p_allele == 0)
+                                fh << "\t" << q_allele << "\t" << q_allele;
+                            else if (q_allele == 0)
+                                fh << "\t" << p_allele << "\t" << p_allele;
+                            else
+                                fh << "\t" << p_allele << "\t" << q_allele;
+                        }
+                    }
+                }
+            }
+            fh << "\n";
+        }
+    }
+
+    fh.close();
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_beagle: export unphased genotypes in Beagle format, as defined here:
+//   http://faculty.washington.edu/browning/beagle/beagle.html
+//
+// For every chromosome and every population a markers file (*.unphased.bgl.markers) and a
+// genotypes file (*.unphased.bgl) are written. Exits if a file cannot be opened, otherwise
+// returns 0.
+//
+int
+write_beagle(map<int, CSLocus *> &catalog,
+             PopMap<CSLocus> *pmap,
+             PopSum<CSLocus> *psum)
+{
+    cerr << "Writing population data to unphased Beagle files...";
+
+    //
+    // Format today's date for the file headers.
+    //
+    char   date[32];
+    time_t now;
+    time(&now);
+    strftime(date, 32, "%B %d, %Y", localtime(&now));
+
+    map<string, vector<CSLocus *> >::iterator chr_it;
+
+    for (chr_it = pmap->ordered_loci.begin(); chr_it != pmap->ordered_loci.end(); chr_it++) {
+        const vector<CSLocus *> &chr_loci = chr_it->second;
+
+        //
+        // Collect every SNP with two alleles data-set-wide and sort by basepair, so that
+        // overlapping RAD sites end up in a consistent order.
+        //
+        vector<GenPos> markers;
+        for (uint k = 0; k < chr_loci.size(); k++) {
+            CSLocus  *cloc  = chr_loci[k];
+            LocTally *tally = psum->locus_tally(cloc->id);
+
+            for (uint i = 0; i < cloc->snps.size(); i++) {
+                uint snp_col = cloc->snps[i]->col;
+                if (tally->nucs[snp_col].allele_cnt == 2)
+                    markers.push_back(GenPos(cloc->id, i, cloc->sort_bp(snp_col)));
+            }
+        }
+        sort(markers.begin(), markers.end());
+
+        //
+        // One markers/genotypes file pair per population.
+        //
+        for (size_t p = 0; p < mpopi.pops().size(); ++p) {
+            const MetaPopInfo::Pop &pop = mpopi.pops()[p];
+
+            //
+            // Markers file: marker name, basepair position and the two alternative alleles.
+            //
+            string path = out_path + out_prefix + "." + pop.name + "-" + chr_it->first + ".unphased.bgl.markers";
+
+            ofstream mfh(path.c_str(), ofstream::out);
+            if (mfh.fail()) {
+                cerr << "Error opening Beagle markers file '" << path << "'\n";
+                exit(1);
+            }
+            mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+            //
+            // Genotypes file.
+            //
+            path = out_path + out_prefix + "." + pop.name + "-" + chr_it->first + ".unphased.bgl";
+
+            ofstream fh(path.c_str(), ofstream::out);
+            if (fh.fail()) {
+                cerr << "Error opening Beagle genotypes file '" << path << "'\n";
+                exit(1);
+            }
+            fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+            //
+            // Sample names, each repeated once per allele column.
+            //
+            fh << "I\tid";
+            for (size_t j = pop.first_sample; j <= pop.last_sample; j++)
+                fh << "\t" << mpopi.samples()[j].name << "\t" << mpopi.samples()[j].name;
+            fh << "\n";
+
+            //
+            // Population name of each sample, laid out the same way.
+            //
+            fh << "S\tid";
+            for (size_t j = pop.first_sample; j <= pop.last_sample; j++)
+                fh << "\t" << pop.name << "\t" << pop.name;
+            fh << "\n";
+
+            //
+            // One "M" line per marker with two allele columns per sample.
+            //
+            for (uint k = 0; k < markers.size(); k++) {
+                const uint snp_index = markers[k].snp_index;
+
+                CSLocus  *cloc    = catalog[markers[k].id];
+                LocSum  **sums    = psum->locus(cloc->id);
+                Datum   **data    = pmap->locus(cloc->id);
+                LocTally *tally   = psum->locus_tally(cloc->id);
+                uint      snp_col = cloc->snps[snp_index]->col;
+
+                // Skip sites that are fixed everywhere or carry too many alleles.
+                if (tally->nucs[snp_col].allele_cnt != 2)
+                    continue;
+
+                // Skip sites that are monomorphic in this population.
+                if (sums[p]->nucs[snp_col].pi == 0.0)
+                    continue;
+
+                mfh << cloc->id << "_" << snp_col << "\t"
+                    << cloc->sort_bp(snp_col) +1  << "\t"
+                    << tally->nucs[snp_col].p_allele << "\t"
+                    << tally->nucs[snp_col].q_allele << "\n";
+
+                fh << "M" << "\t" << cloc->id << "_" << snp_col;
+
+                for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+                    //
+                    // Both columns are '?' unless the site is usable in this population
+                    // (compatible and unfiltered), the sample has data covering the column,
+                    // and the model call there is not uncertain ('U').
+                    //
+                    char first  = '?';
+                    char second = '?';
+
+                    const bool usable =
+                        !(sums[p]->nucs[snp_col].incompatible_site ||
+                          sums[p]->nucs[snp_col].filtered_site)     &&
+                        data[j] != NULL                             &&
+                        snp_col < uint(data[j]->len)                &&
+                        data[j]->model[snp_col] != 'U';
+
+                    if (usable) {
+                        char p_allele, q_allele;
+                        tally_observed_haplotypes(data[j]->obshap, snp_index, p_allele, q_allele);
+
+                        // A homozygote has only one allele set; it is repeated in both columns.
+                        if (p_allele != 0 || q_allele != 0) {
+                            first  = (p_allele == 0) ? q_allele : p_allele;
+                            second = (q_allele == 0) ? p_allele : q_allele;
+                        }
+                    }
+
+                    fh << "\t" << first << "\t" << second;
+                }
+                fh << "\n";
+            }
+
+            fh.close();
+            mfh.close();
+        }
+    }
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_beagle_phased: export whole-locus haplotypes in Beagle format. For every chromosome
+// and every population a markers file (*.phased.bgl.markers) and a haplotypes file
+// (*.phased.bgl) are written. Exits if a file cannot be opened, otherwise returns 0.
+//
+int
+write_beagle_phased(map<int, CSLocus *> &catalog,
+                    PopMap<CSLocus> *pmap,
+                    PopSum<CSLocus> *psum)
+{
+    //
+    // Write a Beagle file as a set of haplotypes as defined here:
+    //   http://faculty.washington.edu/browning/beagle/beagle.html
+    //
+    // We will write one pair of files per chromosome, per population.
+    //
+    cerr << "Writing population data to phased Beagle files...";
+
+    //
+    // Obtain the current date.
+    //
+    time_t     rawtime;
+    struct tm *timeinfo;
+    char       date[32];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(date, 32, "%B %d, %Y", timeinfo);
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus *loc;
+    Datum  **d;
+    string   file;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+
+        //
+        // We need to determine an ordering for all legitimate loci/SNPs.
+        //
+        vector<GenPos> ordered_loci;
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+
+            if (loc->snps.size() == 0) continue;
+
+            //
+            // Check that there aren't too many haplotypes (PHASE has a max of 50).
+            //
+            if (loc->alleles.size() > 40) continue;
+
+            //
+            // Keep the locus if at least one sample has data for it with one or two
+            // observed haplotypes.
+            //
+            d = pmap->locus(loc->id);
+            for (int j = 0; j < pmap->sample_cnt(); j++) {
+                if (d[j] != NULL &&
+                    d[j]->obshap.size() > 0 &&
+                    d[j]->obshap.size() <= 2) {
+                    //
+                    // Data exists, and there are the correct number of haplotypes.
+                    //
+                    ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
+                    break;
+                }
+            }
+        }
+        sort(ordered_loci.begin(), ordered_loci.end());
+
+        //
+        // Now output the genotypes in a separate file for each population.
+        //
+        for (size_t i_pop=0; i_pop<mpopi.pops().size(); ++i_pop) {
+            const MetaPopInfo::Pop& pop = mpopi.pops()[i_pop];
+
+            //
+            // Open a file for writing the markers: their genomic position in basepairs
+            // and the alternative haplotypes at this locus.
+            //
+            file = out_path + out_prefix + "." + pop.name + "-" + it->first + ".phased.bgl.markers";
+
+            ofstream mfh(file.c_str(), ofstream::out);
+            if (mfh.fail()) {
+                cerr << "Error opening Beagle markers file '" << file << "'\n";
+                exit(1);
+            }
+            mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+            //
+            // Now output the haplotypes in a separate file.
+            //
+            file = out_path + out_prefix + "." + pop.name + "-" + it->first + ".phased.bgl";
+
+            ofstream fh(file.c_str(), ofstream::out);
+            if (fh.fail()) {
+                // This is the haplotypes file, not the markers file opened above.
+                cerr << "Error opening Beagle genotypes file '" << file << "'\n";
+                exit(1);
+            }
+            fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+            //
+            // Output a list of all the samples in this population.
+            //
+            fh << "I\tid";
+            for (size_t j = pop.first_sample; j <= pop.last_sample; j++)
+                fh << "\t" << mpopi.samples()[j].name << "\t" << mpopi.samples()[j].name;
+            fh << "\n";
+
+            //
+            // Output population IDs for each sample.
+            //
+            fh << "S\tid";
+            for (size_t j = pop.first_sample; j <= pop.last_sample; j++)
+                fh << "\t" << pop.name << "\t" << pop.name;
+            fh << "\n";
+
+            for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+                loc = catalog[ordered_loci[pos].id];
+                d   = pmap->locus(loc->id);
+
+                //
+                // If this locus is monomorphic in this population don't output it.
+                // Samples without any observed haplotype contribute nothing; reading
+                // obshap[0] from an empty vector would be out of bounds.
+                //
+                set<string> haplotypes;
+                for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+                    if (d[j] == NULL || d[j]->obshap.empty()) continue;
+
+                    if (d[j]->obshap.size() == 2) {
+                        haplotypes.insert(d[j]->obshap[0]);
+                        haplotypes.insert(d[j]->obshap[1]);
+                    } else {
+                        haplotypes.insert(d[j]->obshap[0]);
+                    }
+                }
+                if (haplotypes.size() == 1) continue;
+
+                //
+                // Output this locus to the markers file.
+                //
+                mfh << loc->id << "\t"
+                    << loc->sort_bp() +1;
+                for (uint j = 0; j < loc->strings.size(); j++)
+                    mfh << "\t" << loc->strings[j].first;
+                mfh << "\n";
+
+                //
+                // For each marker, output the genotypes for each sample in two successive columns.
+                //
+                fh << "M" << "\t" << loc->id;
+
+                for (size_t j = pop.first_sample; j <= pop.last_sample; j++) {
+                    //
+                    // Output the p and the q haplotype
+                    //
+                    if (d[j] == NULL || d[j]->obshap.empty()) {
+                        //
+                        // Data does not exist.
+                        //
+                        fh << "\t" << "?" << "\t" << "?";
+                    } else {
+                        //
+                        // Data exists: more than two haplotypes cannot be represented, two
+                        // are written as-is, and a single haplotype is written twice.
+                        //
+                        if (d[j]->obshap.size() > 2)
+                            fh << "\t" << "?" << "\t" << "?";
+                        else if (d[j]->obshap.size() == 2)
+                            fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[1];
+                        else
+                            fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[0];
+                    }
+                }
+                fh << "\n";
+            }
+            fh.close();
+            mfh.close();
+        }
+    }
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_phylip: write one concatenated nucleotide string per population to
+// <out_path><out_prefix>.phylip, and a log of the contributing locus/column of every
+// sequence position to <out_path><out_prefix>.phylip.log.
+//
+// With phylip_var == false only SNPs that are fixed within every population but differ
+// between populations are written; otherwise every SNP with two alleles data-set-wide is
+// written, encoded with IUPAC ambiguity codes. Exits if a file cannot be opened, otherwise
+// returns 0.
+//
+int
+write_phylip(map<int, CSLocus *> &catalog,
+             PopMap<CSLocus> *pmap,
+             PopSum<CSLocus> *psum)
+{
+    //
+    // Maps a population's (p, q) nucleotide pair to a single IUPAC code:
+    //   http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
+    // A missing p nucleotide, or any pair that is not a recognized combination, yields 'N'
+    // rather than a stale or uninitialized character.
+    //
+    struct Iupac {
+        static char encode(char p_nuc, char q_nuc) {
+            switch (p_nuc) {
+            case 'A':
+                switch (q_nuc) {
+                case 0:   return 'A';
+                case 'C': return 'M';
+                case 'G': return 'R';
+                case 'T': return 'W';
+                }
+                break;
+            case 'C':
+                switch (q_nuc) {
+                case 0:   return 'C';
+                case 'A': return 'M';
+                case 'G': return 'S';
+                case 'T': return 'Y';
+                }
+                break;
+            case 'G':
+                switch (q_nuc) {
+                case 0:   return 'G';
+                case 'A': return 'R';
+                case 'C': return 'S';
+                case 'T': return 'K';
+                }
+                break;
+            case 'T':
+                switch (q_nuc) {
+                case 0:   return 'T';
+                case 'A': return 'W';
+                case 'C': return 'Y';
+                case 'G': return 'K';
+                }
+                break;
+            }
+            return 'N';
+        }
+    };
+
+    //
+    // We want to find loci where each locus is fixed within a population but variable between populations.
+    //
+    // We will write those loci to a Phylip file as defined here:
+    //     http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
+    //
+    string file = out_path + out_prefix + ".phylip";
+
+    cerr << "Writing population data to Phylip file '" << file << "'; ";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening Phylip file '" << file << "'\n";
+        exit(1);
+    }
+
+    file += ".log";
+
+    cerr << "logging nucleotide positions to '" << file << "'...";
+
+    ofstream log_fh(file.c_str(), ofstream::out);
+
+    if (log_fh.fail()) {
+        cerr << "Error opening Phylip Log file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Obtain the current date.
+    //
+    time_t     rawtime;
+    struct tm *timeinfo;
+    char       date[32];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(date, 32, "%B %d, %Y", timeinfo);
+
+    log_fh << "# Stacks v" << VERSION << "; " << " Phylip sequential; " << date << "\n"
+           << "# Seq Pos\tLocus ID\tColumn\tPopulation\n";
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus  *loc;
+    LocSum  **s;
+    LocTally *t;
+
+    int  pop_cnt = psum->pop_cnt();
+    char nuc;
+
+    //
+    // A map storing, for each population, the concatenated list of interspecific nucleotides.
+    //
+    map<int, string> interspecific_nucs;
+
+    int index = 0;
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+
+            s = psum->locus(loc->id);
+            t = psum->locus_tally(loc->id);
+
+            for (uint i = 0; i < loc->snps.size(); i++) {
+                uint col = loc->snps[i]->col;
+
+                if (phylip_var == false) {
+                    //
+                    // We are looking for loci that are fixed within each population, but are
+                    // variable between one or more populations.
+                    //
+                    if (t->nucs[col].fixed == true || t->nucs[col].allele_cnt != 2 || t->nucs[col].pop_cnt < 2)
+                        continue;
+
+                    bool fixed_within = true;
+                    for (int j = 0; j < pop_cnt; j++) {
+                        if (s[j]->nucs[col].num_indv == 0)
+                            continue;
+                        if (s[j]->nucs[col].fixed == false) {
+                            fixed_within = false;
+                            break;
+                        }
+                    }
+                    if (fixed_within == false) continue;
+
+                    log_fh << index << "\t" << loc->id << "\t" << col << "\t";
+
+                    // Populations without individuals at this site are written as 'N'.
+                    for (int p=0; p<pop_cnt; p++) {
+                        if (s[p]->nucs[col].num_indv > 0) {
+                            interspecific_nucs[p] += s[p]->nucs[col].p_nuc;
+                            log_fh << mpopi.pops()[p].name << ":" << s[p]->nucs[col].p_nuc << ",";
+                        } else {
+                            interspecific_nucs[p] += 'N';
+                            log_fh << mpopi.pops()[p].name << ":N" << ",";
+                        }
+                    }
+                    log_fh << "\n";
+                    index++;
+
+                } else {
+                    //
+                    // Encode SNPs that are variable within a population as well, using IUPAC notation.
+                    //
+                    if (t->nucs[col].allele_cnt != 2)
+                        continue;
+
+                    log_fh << index << "\t" << loc->id << "\t" << col << "\t";
+
+                    for (int j=0; j<pop_cnt; j++) {
+                        nuc = Iupac::encode(s[j]->nucs[col].p_nuc, s[j]->nucs[col].q_nuc);
+
+                        interspecific_nucs[j] += nuc;
+                        log_fh << mpopi.pops()[j].name << ":" << nuc << ",";
+                    }
+                    log_fh << "\n";
+                    index++;
+                }
+            }
+        }
+    }
+
+    if (interspecific_nucs.size() == 0) {
+        cerr << " No data is available to write to the Phylip file.\n";
+        return 0;
+    }
+
+    fh << mpopi.pops().size() << " " << interspecific_nucs.begin()->second.length() << "\n";
+    for (size_t i_pop=0; i_pop<mpopi.pops().size(); ++i_pop) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[i_pop];
+
+        //
+        // The population name is truncated or space-padded to exactly nine characters.
+        // Building it as a string avoids an unbounded sprintf() into a fixed-size buffer.
+        //
+        string id_str = pop.name.substr(0, 9);
+        id_str.resize(9, ' ');
+
+        fh << id_str << " " << interspecific_nucs[i_pop] << "\n";
+    }
+
+    //
+    // Output the header. It goes last because the first line of a Phylip file has to
+    // be the line holding the population and sequence-length counts.
+    //
+    fh << "# Stacks v" << VERSION << "; " << " Phylip sequential; " << date << "\n";
+
+    fh.close();
+    log_fh.close();
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// write_fullseq_phylip: write the full consensus sequence of every locus whose SNPs all
+// have exactly two alleles data-set-wide, one sequence per population, with the SNP
+// columns encoded using IUPAC ambiguity codes. Three files are produced:
+//   <prefix>.fullseq.phylip             the sequences, one group of lines per locus
+//   <prefix>.fullseq.partitions.phylip  one partition line per locus
+//   <prefix>.fullseq.phylip.log         the output line number of every locus
+// Exits if a file cannot be opened, otherwise returns 0.
+//
+int
+write_fullseq_phylip(map<int, CSLocus *> &catalog,
+                     PopMap<CSLocus> *pmap,
+                     PopSum<CSLocus> *psum)
+{
+    //
+    // Maps a population's (p, q) nucleotide pair to a single IUPAC code:
+    //   http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
+    // A missing p nucleotide, or any pair that is not a recognized combination, yields 'N'
+    // rather than a stale character (which could be '\0' and cut the sequence short).
+    //
+    struct Iupac {
+        static char encode(char p_nuc, char q_nuc) {
+            switch (p_nuc) {
+            case 'A':
+                switch (q_nuc) {
+                case 0:   return 'A';
+                case 'C': return 'M';
+                case 'G': return 'R';
+                case 'T': return 'W';
+                }
+                break;
+            case 'C':
+                switch (q_nuc) {
+                case 0:   return 'C';
+                case 'A': return 'M';
+                case 'G': return 'S';
+                case 'T': return 'Y';
+                }
+                break;
+            case 'G':
+                switch (q_nuc) {
+                case 0:   return 'G';
+                case 'A': return 'R';
+                case 'C': return 'S';
+                case 'T': return 'K';
+                }
+                break;
+            case 'T':
+                switch (q_nuc) {
+                case 0:   return 'T';
+                case 'A': return 'W';
+                case 'C': return 'Y';
+                case 'G': return 'K';
+                }
+                break;
+            }
+            return 'N';
+        }
+    };
+
+    //
+    // We want to write all variable loci in Phylip interleaved format. Polymorphic positions
+    // will be encoded using IUPAC notation.
+    //
+    // We will write those loci to a Phylip file as defined here:
+    //     http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
+    //
+    string file = out_path + out_prefix + ".fullseq.phylip";
+
+    cerr << "Writing full sequence population data to Phylip file '" << file << "'; ";
+
+    ofstream fh(file.c_str(), ofstream::out);
+
+    if (fh.fail()) {
+        cerr << "Error opening Phylip file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // We will also write a file that allows us to specify each RAD locus as a separate partition
+    // for use in phylogenetics programs.
+    //
+    file = out_path + out_prefix + ".fullseq.partitions.phylip";
+
+    ofstream par_fh(file.c_str(), ofstream::out);
+
+    if (par_fh.fail()) {
+        cerr << "Error opening Phylip partitions file '" << file << "'\n";
+        exit(1);
+    }
+
+    file = out_path + out_prefix + ".fullseq.phylip.log";
+    cerr << "logging nucleotide positions to '" << file << "'...";
+
+    ofstream log_fh(file.c_str(), ofstream::out);
+
+    if (log_fh.fail()) {
+        cerr << "Error opening Phylip Log file '" << file << "'\n";
+        exit(1);
+    }
+
+    //
+    // Obtain the current date.
+    //
+    time_t     rawtime;
+    struct tm *timeinfo;
+    char       date[32];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(date, 32, "%B %d, %Y", timeinfo);
+
+    log_fh << "# Stacks v" << VERSION << "; " << " Phylip interleaved; " << date << "\n"
+           << "# Locus ID\tLine Number";
+    if (loci_ordered) log_fh << "\tChr\tBasepair";
+    log_fh << "\n";
+
+    map<string, vector<CSLocus *> >::iterator it;
+    CSLocus  *loc;
+    LocSum  **s;
+    LocTally *t;
+
+    int  pop_cnt = psum->pop_cnt();
+    bool include;
+    uint seq_len = 0;
+
+    //
+    // Determine the length of sequence we will output.
+    //
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+
+            t = psum->locus_tally(loc->id);
+
+            include = true;
+            for (uint i = 0; i < loc->snps.size(); i++) {
+                uint col = loc->snps[i]->col;
+
+                if (t->nucs[col].allele_cnt != 2) {
+                    include = false;
+                    break;
+                }
+            }
+
+            if (include)
+                seq_len += strlen(loc->con);
+        }
+    }
+
+    map<int, string> outstrs;
+    fh << mpopi.pops().size() << " " << seq_len << "\n";
+    for (size_t i_pop=0; i_pop<mpopi.pops().size(); ++i_pop) {
+        const MetaPopInfo::Pop& pop = mpopi.pops()[i_pop];
+
+        //
+        // The population name is truncated or space-padded to exactly nine characters.
+        // Building it as a string avoids an unbounded sprintf() into a fixed-size buffer.
+        //
+        string id_str = pop.name.substr(0, 9);
+        id_str.resize(9, ' ');
+
+        outstrs[i_pop] = id_str + " ";
+    }
+
+    int line  = 1;
+    int index = 1;
+    int cnt   = 1;
+
+    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+        for (uint pos = 0; pos < it->second.size(); pos++) {
+            loc = it->second[pos];
+
+            s = psum->locus(loc->id);
+            t = psum->locus_tally(loc->id);
+
+            include = true;
+            for (uint i = 0; i < loc->snps.size(); i++) {
+                uint col = loc->snps[i]->col;
+
+                if (t->nucs[col].allele_cnt != 2) {
+                    include = false;
+                    break;
+                }
+            }
+
+            if (!include)
+                continue;
+
+            //
+            // Start from the consensus and overwrite each SNP column with the code of
+            // the current population. Every SNP column is rewritten for every population,
+            // so nothing carries over from one population to the next.
+            //
+            string seq(loc->con);
+
+            for (int j = 0; j < pop_cnt; j++) {
+                for (uint i = 0; i < loc->snps.size(); i++) {
+                    uint col = loc->snps[i]->col;
+
+                    // Never write past the end of the consensus sequence.
+                    if (col < seq.length())
+                        seq[col] = Iupac::encode(s[j]->nucs[col].p_nuc, s[j]->nucs[col].q_nuc);
+                }
+
+                outstrs[j] += seq;
+            }
+
+            log_fh << line << "\t" << loc->id;
+            if (loci_ordered) log_fh << "\t" << loc->loc.chr << "\t" << loc->sort_bp() + 1;
+            log_fh << "\n";
+
+            // Only the first group of lines carries the population names set up above.
+            for (size_t i_pop=0; i_pop<mpopi.pops().size(); ++i_pop) {
+                fh << outstrs[i_pop] << "\n";
+                outstrs[i_pop] = "";
+                line++;
+            }
+            fh << "\n";
+            line++;
+
+            par_fh << "DNA, p" << cnt << "=" << index << "-" << index + loc->len - 1 << "\n";
+            index += loc->len;
+            cnt++;
+        }
+    }
+
+    //
+    // Output the header. It goes last because the first line of a Phylip file has to
+    // be the line holding the population and sequence-length counts.
+    //
+    fh << "# Stacks v" << VERSION << "; " << " Phylip interleaved; " << date << "\n";
+
+    fh.close();
+    par_fh.close();
+    log_fh.close();
+
+    cerr << "done.\n";
+
+    return 0;
+}
+
+//
+// populate_snp_calls: for every sample, load its SNP calls from disk and attach a copy of
+// them to the matching Datum of each catalog locus. Loci present in merge_map (when
+// merge_sites is set) are handed to datum_adjust_snp_positions() instead of being copied
+// directly. The loaded SNPRes objects are deleted before moving to the next sample.
+// Always returns 0.
+//
+int
+populate_snp_calls(map<int, CSLocus *> &catalog,
+                   PopMap<CSLocus> *pmap,
+                   map<int, pair<merget, int> > &merge_map)
+{
+    map<int, CSLocus *>::iterator cit;
+    map<int, SNPRes *>::iterator  sit;
+    CSLocus *loc;
+    Datum   *datum;
+    SNPRes  *snpr;
+    SNP     *snp;
+
+    for (uint i = 0; i < mpopi.samples().size(); i++) {
+        map<int, SNPRes *> snpres;
+        load_snp_calls(in_path + mpopi.samples()[i].name, snpres);
+
+        for (cit = catalog.begin(); cit != catalog.end(); cit++) {
+            loc   = cit->second;
+            datum = pmap->datum(loc->id, mpopi.samples()[i].id);
+
+            if (datum == NULL)
+                continue;
+
+            // Look the calls up once; this sample may have none for this locus.
+            sit = snpres.find(datum->id);
+            if (sit == snpres.end())
+                continue;
+
+            if (merge_sites && merge_map.count(loc->id)) {
+                datum_adjust_snp_positions(merge_map, loc, datum, snpres);
+            } else {
+                //
+                // Deep copy the SNP objects, including the call type, so the Datum
+                // does not depend on the SNPRes objects deleted below.
+                //
+                snpr = sit->second;
+                for (uint j = 0; j < snpr->snps.size(); j++) {
+                    snp         = new SNP;
+                    snp->type   = snpr->snps[j]->type;
+                    snp->col    = snpr->snps[j]->col;
+                    snp->lratio = snpr->snps[j]->lratio;
+                    snp->rank_1 = snpr->snps[j]->rank_1;
+                    snp->rank_2 = snpr->snps[j]->rank_2;
+                    snp->rank_3 = snpr->snps[j]->rank_3;
+                    snp->rank_4 = snpr->snps[j]->rank_4;
+
+                    datum->snps.push_back(snp);
+                }
+            }
+        }
+
+        for (sit = snpres.begin(); sit != snpres.end(); sit++)
+            delete sit->second;
+    }
+
+    return 0;
+}
+
+/*
+ * Sum the haplotype depths of a datum per SNP allele: every observed haplotype adds its
+ * depth to dp1 when it carries allele1 at snp_index, or to dp2 when it carries allele2.
+ * Throws std::exception when a haplotype carries neither allele. Returns 0.
+ */
+int
+find_datum_allele_depths(Datum *d, int snp_index, char allele1, char allele2, int &dp1, int &dp2)
+{
+    dp1 = dp2 = 0;
+
+    for (uint h = 0; h < d->obshap.size(); ++h) {
+        const char nt = d->obshap[h][snp_index];
+
+        if (nt != allele1 && nt != allele2)
+            throw std::exception();
+
+        // allele1 takes precedence should both alleles be the same character.
+        int &depth = (nt == allele1) ? dp1 : dp2;
+        depth += d->depth[h];
+    }
+
+    return 0;
+}
+
+//
+// tally_observed_haplotypes: determine which nucleotides occur at position snp_index
+// across a set of observed haplotypes (case-insensitively; characters other than
+// A, C, G and T are ignored).
+//
+// On return p_allele holds the alphabetically first nucleotide seen and q_allele the
+// second one, each as an upper-case letter, or 0 when there is no such nucleotide.
+// When more than two different nucleotides are seen both are set to 0 and -1 is
+// returned; otherwise the return value is 0.
+//
+int
+tally_observed_haplotypes(vector<char *> &obshap, int snp_index, char &p_allele, char &q_allele)
+{
+    static const char upper[] = "ACGT";
+    static const char lower[] = "acgt";
+
+    //
+    // Count the occurrences of each nucleotide at this SNP.
+    //
+    int counts[4] = {0, 0, 0, 0};
+
+    for (uint h = 0; h < obshap.size(); h++) {
+        const char nuc = obshap[h][snp_index];
+
+        for (int k = 0; k < 4; k++) {
+            if (nuc == upper[k] || nuc == lower[k]) {
+                counts[k]++;
+                break;
+            }
+        }
+    }
+
+    //
+    // List the nucleotides that were seen, in alphabetical order.
+    //
+    char seen[4];
+    int  seen_cnt = 0;
+
+    for (int k = 0; k < 4; k++)
+        if (counts[k] > 0)
+            seen[seen_cnt++] = upper[k];
+
+    p_allele = 0;
+    q_allele = 0;
+
+    //
+    // We cannot deal with more than two alternative alleles.
+    //
+    if (seen_cnt > 2)
+        return -1;
+
+    if (seen_cnt > 0) p_allele = seen[0];
+    if (seen_cnt > 1) q_allele = seen[1];
+
+    return 0;
+}
+
+//
+// tally_haplotype_freq: count how often each genotype string occurs at a locus across all
+// samples, skipping samples without data and genotypes starting with '-'.
+//
+// total receives the number of counted genotypes and max the largest percentage among
+// them. freq_str receives one "genotype:count(percent%);" entry per genotype; it is left
+// untouched when nothing was counted. Always returns 0.
+//
+int tally_haplotype_freq(CSLocus *locus, PopMap<CSLocus> *pmap,
+                         int &total, double &max, string &freq_str) {
+
+    Datum **data = pmap->locus(locus->id);
+    map<string, double> counts;
+
+    total = 0;
+    max   = 0;
+
+    for (int i = 0; i < pmap->sample_cnt(); i++) {
+        if (data[i] == NULL || data[i]->gtype[0] == '-')
+            continue;
+
+        counts[data[i]->gtype]++;
+        total++;
+    }
+
+    if (total == 0)
+        return 0;
+
+    stringstream out;
+    char pct[id_len];
+
+    for (map<string, double>::iterator c = counts.begin(); c != counts.end(); c++) {
+        const double frac = (double) c->second / (double) total * 100;
+
+        if (frac > max)
+            max = frac;
+
+        sprintf(pct, "(%0.1f%%);", frac);
+        out << c->first << ":" << c->second << pct;
+    }
+
+    freq_str = out.str();
+
+    return 0;
+}
+
+//
+// datum_adjust_snp_positions: rebuild datum->snps for a datum produced by merging two
+// loci. The SNP calls of the 'sink' locus (snpres[datum->id]) are appended in reverse
+// order with complemented nucleotides, followed by the calls of the 'src' locus
+// (snpres[datum->merge_partner]) in their original order. Columns are renumbered
+// consecutively. Always returns 0.
+//
+int
+datum_adjust_snp_positions(map<int, pair<merget, int> > &merge_map,
+                           CSLocus *loc, Datum *datum,
+                           map<int, SNPRes *> &snpres)
+{
+    //
+    // We will start with the 'sink' locus, which was originally on the negative strand:
+    //   1. If the locus was shorter than the catalog locus, pad the difference.
+    //   2. Convert to positive strand: Reverse the order, complement the alleles,
+    //      alter the internal column position.
+    //
+    SNP    *snp;
+    SNPRes *snpr     = snpres[datum->id];
+    int     index    = 0;
+    int     stop_pos = renz_olap[enz] - 1;
+
+    //
+    // We know the catalog was padded since we already padded the model call string
+    // if it was necessary when originally merging.
+    //
+    while (datum->model[index] == 'N') {
+        snp         = new SNP;
+        snp->col    = index;
+        snp->lratio = 0.0;
+        snp->rank_1 = 'N';
+        snp->type   = snp_type_unk;
+        datum->snps.push_back(snp);
+        index++;
+    }
+
+    // Positions 0..stop_pos of the sink are not copied.
+    for (int j = snpr->snps.size() - 1; j > stop_pos; j--) {
+        snp         = new SNP;
+        snp->col    = index;
+        snp->type   = snpr->snps[j]->type;
+        snp->lratio = snpr->snps[j]->lratio;
+        snp->rank_1 = reverse(snpr->snps[j]->rank_1);
+        snp->rank_2 = reverse(snpr->snps[j]->rank_2);
+        snp->rank_3 = reverse(snpr->snps[j]->rank_3);
+        snp->rank_4 = reverse(snpr->snps[j]->rank_4);
+        datum->snps.push_back(snp);
+        index++;
+    }
+
+    //
+    // Now we fetch the former locus, the 'src', which was originally on the positive strand.
+    // All we have to do is adjust the column position of each SNP.
+    //
+    // find() is used so that a missing merge partner is neither inserted into the map
+    // as a NULL entry nor dereferenced.
+    //
+    map<int, SNPRes *>::iterator partner = snpres.find(datum->merge_partner);
+    if (partner == snpres.end() || partner->second == NULL)
+        return 0;
+
+    snpr = partner->second;
+
+    // Bound the loop by the vector that is actually indexed (the src locus).
+    for (uint j = 0; j < snpr->snps.size(); j++) {
+        snp         = new SNP;
+        snp->col    = index;
+        snp->type   = snpr->snps[j]->type;
+        snp->lratio = snpr->snps[j]->lratio;
+        snp->rank_1 = snpr->snps[j]->rank_1;
+        snp->rank_2 = snpr->snps[j]->rank_2;
+        snp->rank_3 = snpr->snps[j]->rank_3;
+        snp->rank_4 = snpr->snps[j]->rank_4;
+        datum->snps.push_back(snp);
+        index++;
+    }
+
+    return 0;
+}
diff --git a/src/export_formats.h b/src/export_formats.h
new file mode 100644
index 0000000..c867c68
--- /dev/null
+++ b/src/export_formats.h
@@ -0,0 +1,70 @@
+#ifndef EXPORT_FORMATS_H
+#define EXPORT_FORMATS_H
+
+#include <iostream>
+#include <utility>
+#include <map>
+
+#include "locus.h"
+#include "PopMap.h"
+#include "PopSum.h"
+#include "ordered.h" // for "snp"
+#include "populations.h" // for "merget", "InputMode", "uncalled_haplotype()", "count_haplotypes_at_locus()"
+
+using std::ofstream;
+using std::pair;
+using std::map;
+
+class GenPos {
+public:
+    uint     id;
+    uint     bp;
+    uint     snp_index;
+    loc_type type;
+
+    // Marker for a single SNP; the locus type is fixed to 'snp'.
+    GenPos(int id, int snp_index, int bp)
+        : id(id), bp(bp), snp_index(snp_index), type(snp)
+    {}
+
+    // Marker with an explicitly supplied locus type.
+    GenPos(int id, int snp_index, int bp, loc_type type)
+        : id(id), bp(bp), snp_index(snp_index), type(type)
+    {}
+
+    // Markers are ordered by basepair position alone.
+    bool operator<(const GenPos &other) const {
+        return bp < other.bp;
+    }
+};
+
+int write_sql(map<int, CSLocus *> &, PopMap<CSLocus> *);
+int write_fst_stats(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, ofstream &);
+int write_generic(map<int, CSLocus *> &, PopMap<CSLocus> *, bool);
+int write_genomic(map<int, CSLocus *> &, PopMap<CSLocus> *);
+int write_fasta(map<int, CSLocus *> &, PopMap<CSLocus> *);
+int write_strict_fasta(map<int, CSLocus *> &, PopMap<CSLocus> *);
+int write_vcf(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<merget, int> > &);
+int write_vcf_ordered(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<merget, int> > &, ofstream &);
+int write_vcf_haplotypes(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_genepop(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_genepop_ordered(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, ofstream &);
+int write_structure(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_structure_ordered(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, ofstream &);
+int write_phase(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_fastphase(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_beagle(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_beagle_phased(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_plink(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_hzar(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_treemix(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_phylip(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int write_fullseq_phylip(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+
+int populate_snp_calls(map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, pair<merget, int> > &);
+int find_datum_allele_depths(Datum *, int, char, char, int &, int &);
+int tally_observed_haplotypes(vector<char *> &, int, char &, char &);
+int tally_haplotype_freq(CSLocus *, PopMap<CSLocus> *, int &, double &, string &);
+int datum_adjust_snp_positions(map<int, pair<merget, int> > &, CSLocus *, Datum *, map<int, SNPRes *> &);
+
+#endif // EXPORT_FORMATS_H
diff --git a/src/file_io.cc b/src/file_io.cc
index d297a92..3631769 100644
--- a/src/file_io.cc
+++ b/src/file_io.cc
@@ -772,13 +772,14 @@ load_barcodes(string barcode_file, vector<BarcodePair> &barcodes,
switch (*q) {
case '-':
case '_':
+ case '.':
break;
case '\r':
case '\t':
*q = '\0';
break;
default:
- cerr << "Invalid filename on line " << line_num << ": '" << s << "' (filenames can consist of letters, numbers, '-' and '_').\n";
+ cerr << "Invalid filename on line " << line_num << ": '" << s << "' (filenames can consist of letters, numbers, '.', '-' and '_').\n";
exit(1);
}
}
diff --git a/src/genotype_dictionaries.h b/src/genotype_dictionaries.cc
similarity index 75%
copy from src/genotype_dictionaries.h
copy to src/genotype_dictionaries.cc
index d90ece3..6c6b4ca 100644
--- a/src/genotype_dictionaries.h
+++ b/src/genotype_dictionaries.cc
@@ -1,31 +1,17 @@
-// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
-//
-// Copyright 2011-2014, Julian Catchen <jcatchen at uoregon.edu>
-//
-// This file is part of Stacks.
-//
-// Stacks is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// Stacks is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
-//
-
-#ifndef __GENOTYPE_DICTIONARIES_H__
-#define __GENOTYPE_DICTIONARIES_H__
-
-enum map_types {unk, none, gen, dh, cp, bc1, f2};
-enum out_types {rqtl, joinmap, onemap, genomic};
-
-void
-initialize_dictionaries(map<string, map<string, string> > &global_dictionary)
+#include "genotype_dictionaries.h"
+
+int
+encoded_gtypes[4][4] =
+{
+ // A C G T
+ {1, 2, 3, 4}, // A
+ {2, 5, 6, 7}, // C
+ {3, 6, 8, 9}, // G
+ {4, 7, 9, 10} // T
+};
+
+void
+initialize_dictionaries(map<string, map<string, string> > &global_dictionary)
{
global_dictionary["ab/--"]["a"] = "aa";
global_dictionary["ab/--"]["b"] = "bb";
@@ -82,9 +68,9 @@ initialize_dictionaries(map<string, map<string, string> > &global_dictionary)
global_dictionary["ab/ab"]["ab"] = "ab";
}
-void
-load_cp_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_cp_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["ab/--"] = "ab/--";
types["--/ab"] = "--/ab";
@@ -144,9 +130,9 @@ load_cp_dictionary(map<string, string> &types,
return;
}
-void
-load_joinmap_cp_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_joinmap_cp_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["ab/--"] = "lmx--";
types["--/ab"] = "--xnp";
@@ -198,8 +184,8 @@ load_joinmap_cp_dictionary(map<string, string> &types,
return;
}
-void
-load_onemap_cp_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary)
+void
+load_onemap_cp_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary)
{
types["ab/--"] = "abxoo";
types["--/ab"] = "ooxab";
@@ -252,9 +238,9 @@ load_onemap_cp_dictionary(map<string, string> &types, map<string, map<string, st
return;
}
-void
-load_bc_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_bc_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["aa/bb"] = "aa/bb";
types["bb/aa"] = "bb/aa";
@@ -286,9 +272,9 @@ load_bc_dictionary(map<string, string> &types,
dictionary["cc/ab"]["bb"] = "bb";
}
-void
-load_f2_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_f2_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["aa/bb"] = "aa/bb";
types["ab/cd"] = "ab/cd";
@@ -341,9 +327,9 @@ load_f2_dictionary(map<string, string> &types,
dictionary["cc/ab"]["--"] = "--";
}
-void
-load_mm_bc_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_mm_bc_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["aa/bb"] = "aaxbb";
types["bb/aa"] = "bbxaa";
@@ -375,9 +361,9 @@ load_mm_bc_dictionary(map<string, string> &types,
dictionary["ccxab"]["bb"] = "a";
}
-void
-load_mm_f2_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_mm_f2_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["aa/bb"] = "aaxbb";
types["ab/cd"] = "abxcd";
@@ -430,9 +416,9 @@ load_mm_f2_dictionary(map<string, string> &types,
dictionary["ccxab"]["--"] = "-";
}
-void
-load_dh_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_dh_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["ab/--"] = "ab/--";
types["--/ab"] = "--/ab";
@@ -446,9 +432,9 @@ load_dh_dictionary(map<string, string> &types,
dictionary["--/ab"]["--"] = "--";
}
-void
-load_mm_dh_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
+void
+load_mm_dh_dictionary(map<string, string> &types,
+ map<string, map<string, string> > &dictionary)
{
types["ab/--"] = "abx--";
types["--/ab"] = "--xab";
@@ -462,111 +448,81 @@ load_mm_dh_dictionary(map<string, string> &types,
dictionary["--xab"]["--"] = "-";
}
-void
-load_segregation_ratios(map_types type, map<string, map<string, double> > &segregation_ratios)
+void
+load_segregation_ratios(map_types type, map<string, map<string, double> > &segregation_ratios)
{
switch(type) {
case cp:
- segregation_ratios["ab/--"]["aa"] = 0.50;
- segregation_ratios["ab/--"]["ab"] = 0.50;
+ segregation_ratios["ab/--"]["aa"] = 0.50;
+ segregation_ratios["ab/--"]["ab"] = 0.50;
- segregation_ratios["--/ab"]["aa"] = 0.50;
- segregation_ratios["--/ab"]["ab"] = 0.50;
+ segregation_ratios["--/ab"]["aa"] = 0.50;
+ segregation_ratios["--/ab"]["ab"] = 0.50;
- segregation_ratios["ab/aa"]["aa"] = 0.50;
- segregation_ratios["ab/aa"]["ab"] = 0.50;
+ segregation_ratios["ab/aa"]["aa"] = 0.50;
+ segregation_ratios["ab/aa"]["ab"] = 0.50;
- segregation_ratios["aa/ab"]["aa"] = 0.50;
- segregation_ratios["aa/ab"]["ab"] = 0.50;
+ segregation_ratios["aa/ab"]["aa"] = 0.50;
+ segregation_ratios["aa/ab"]["ab"] = 0.50;
- segregation_ratios["ab/ab"]["ab"] = 0.50;
- segregation_ratios["ab/ab"]["aa"] = 0.25;
- segregation_ratios["ab/ab"]["bb"] = 0.25;
+ segregation_ratios["ab/ab"]["ab"] = 0.50;
+ segregation_ratios["ab/ab"]["aa"] = 0.25;
+ segregation_ratios["ab/ab"]["bb"] = 0.25;
- segregation_ratios["ab/ac"]["ab"] = 0.25;
- segregation_ratios["ab/ac"]["ac"] = 0.25;
- segregation_ratios["ab/ac"]["bc"] = 0.25;
- segregation_ratios["ab/ac"]["aa"] = 0.25;
+ segregation_ratios["ab/ac"]["ab"] = 0.25;
+ segregation_ratios["ab/ac"]["ac"] = 0.25;
+ segregation_ratios["ab/ac"]["bc"] = 0.25;
+ segregation_ratios["ab/ac"]["aa"] = 0.25;
- segregation_ratios["ab/cd"]["ac"] = 0.25;
- segregation_ratios["ab/cd"]["ad"] = 0.25;
- segregation_ratios["ab/cd"]["bc"] = 0.25;
- segregation_ratios["ab/cd"]["bd"] = 0.25;
- break;
+ segregation_ratios["ab/cd"]["ac"] = 0.25;
+ segregation_ratios["ab/cd"]["ad"] = 0.25;
+ segregation_ratios["ab/cd"]["bc"] = 0.25;
+ segregation_ratios["ab/cd"]["bd"] = 0.25;
+ break;
case f2:
- segregation_ratios["aaxbb"]["a"] = 0.25;
- segregation_ratios["aaxbb"]["b"] = 0.25;
- segregation_ratios["aaxbb"]["h"] = 0.50;
+ segregation_ratios["aaxbb"]["a"] = 0.25;
+ segregation_ratios["aaxbb"]["b"] = 0.25;
+ segregation_ratios["aaxbb"]["h"] = 0.50;
- segregation_ratios["abxcd"]["a"] = 0.25;
- segregation_ratios["abxcd"]["b"] = 0.25;
- segregation_ratios["abxcd"]["h"] = 0.50;
+ segregation_ratios["abxcd"]["a"] = 0.25;
+ segregation_ratios["abxcd"]["b"] = 0.25;
+ segregation_ratios["abxcd"]["h"] = 0.50;
- segregation_ratios["abxaa"]["a"] = 1.00;
+ segregation_ratios["abxaa"]["a"] = 1.00;
- segregation_ratios["aaxab"]["b"] = 1.00;
+ segregation_ratios["aaxab"]["b"] = 1.00;
- segregation_ratios["abxcc"]["a"] = 0.50;
- segregation_ratios["abxcc"]["b"] = 0.50;
+ segregation_ratios["abxcc"]["a"] = 0.50;
+ segregation_ratios["abxcc"]["b"] = 0.50;
- segregation_ratios["ccxab"]["b"] = 0.50;
- segregation_ratios["ccxab"]["a"] = 0.50;
- break;
+ segregation_ratios["ccxab"]["b"] = 0.50;
+ segregation_ratios["ccxab"]["a"] = 0.50;
+ break;
case bc1:
- segregation_ratios["aaxbb"]["h"] = 0.50;
- segregation_ratios["aaxbb"]["b"] = 0.50;
+ segregation_ratios["aaxbb"]["h"] = 0.50;
+ segregation_ratios["aaxbb"]["b"] = 0.50;
- segregation_ratios["bbxaa"]["h"] = 0.50;
- segregation_ratios["bbxaa"]["a"] = 0.50;
+ segregation_ratios["bbxaa"]["h"] = 0.50;
+ segregation_ratios["bbxaa"]["a"] = 0.50;
- segregation_ratios["abxcc"]["h"] = 0.50;
- segregation_ratios["abxcc"]["b"] = 0.50;
+ segregation_ratios["abxcc"]["h"] = 0.50;
+ segregation_ratios["abxcc"]["b"] = 0.50;
- segregation_ratios["ccxab"]["h"] = 0.50;
- segregation_ratios["ccxab"]["a"] = 0.50;
- break;
+ segregation_ratios["ccxab"]["h"] = 0.50;
+ segregation_ratios["ccxab"]["a"] = 0.50;
+ break;
case dh:
- segregation_ratios["ab/--"]["a"] = 0.50;
- segregation_ratios["ab/--"]["b"] = 0.50;
+ segregation_ratios["ab/--"]["a"] = 0.50;
+ segregation_ratios["ab/--"]["b"] = 0.50;
- segregation_ratios["--/ab"]["a"] = 0.50;
- segregation_ratios["--/ab"]["b"] = 0.50;
- break;
+ segregation_ratios["--/ab"]["a"] = 0.50;
+ segregation_ratios["--/ab"]["b"] = 0.50;
+ break;
case gen:
case none:
case unk:
- break;
+ break;
}
return;
}
-
-inline
-int
-encode_gtype(char a)
-{
- switch (a) {
- case 'A':
- return 0;
- case 'C':
- return 1;
- case 'G':
- return 2;
- case 'T':
- return 3;
- }
-
- return -1;
-}
-
-int
-encoded_gtypes[4][4] =
-{
- // A C G T
- {1, 2, 3, 4}, // A
- {2, 5, 6, 7}, // C
- {3, 6, 8, 9}, // G
- {4, 7, 9, 10} // T
-};
-
-#endif // __GENOTYPE_DICTIONARIES_H__
diff --git a/src/genotype_dictionaries.h b/src/genotype_dictionaries.h
index d90ece3..a174bb8 100644
--- a/src/genotype_dictionaries.h
+++ b/src/genotype_dictionaries.h
@@ -21,552 +21,42 @@
#ifndef __GENOTYPE_DICTIONARIES_H__
#define __GENOTYPE_DICTIONARIES_H__
-enum map_types {unk, none, gen, dh, cp, bc1, f2};
-enum out_types {rqtl, joinmap, onemap, genomic};
-
-void
-initialize_dictionaries(map<string, map<string, string> > &global_dictionary)
-{
- global_dictionary["ab/--"]["a"] = "aa";
- global_dictionary["ab/--"]["b"] = "bb";
-
- global_dictionary["--/ab"]["a"] = "aa";
- global_dictionary["--/ab"]["b"] = "bb";
-
- global_dictionary["aa/bb"]["a"] = "aa";
- global_dictionary["aa/bb"]["ab"] = "ab";
- global_dictionary["aa/bb"]["b"] = "bb";
-
- global_dictionary["ab/ac"]["a"] = "aa";
- global_dictionary["ab/ac"]["ab"] = "ab";
- global_dictionary["ab/ac"]["b"] = "bb";
- global_dictionary["ab/ac"]["ac"] = "ac";
- global_dictionary["ab/ac"]["c"] = "cc";
- global_dictionary["ab/ac"]["bc"] = "bc";
-
- global_dictionary["ab/cd"]["a"] = "aa";
- global_dictionary["ab/cd"]["ab"] = "ab";
- global_dictionary["ab/cd"]["b"] = "bb";
- global_dictionary["ab/cd"]["c"] = "cc";
- global_dictionary["ab/cd"]["cd"] = "cd";
- global_dictionary["ab/cd"]["d"] = "dd";
- global_dictionary["ab/cd"]["ac"] = "ac";
- global_dictionary["ab/cd"]["ad"] = "ad";
- global_dictionary["ab/cd"]["bc"] = "bc";
- global_dictionary["ab/cd"]["bd"] = "bd";
-
- global_dictionary["ab/aa"]["a"] = "aa";
- global_dictionary["ab/aa"]["ab"] = "ab";
- global_dictionary["ab/aa"]["b"] = "bb";
-
- global_dictionary["aa/ab"]["a"] = "aa";
- global_dictionary["aa/ab"]["ab"] = "ab";
- global_dictionary["aa/ab"]["b"] = "bb";
-
- global_dictionary["ab/cc"]["a"] = "aa";
- global_dictionary["ab/cc"]["ab"] = "ab";
- global_dictionary["ab/cc"]["b"] = "bb";
- global_dictionary["ab/cc"]["c"] = "cc";
- global_dictionary["ab/cc"]["ac"] = "ac";
- global_dictionary["ab/cc"]["bc"] = "bc";
-
- global_dictionary["cc/ab"]["a"] = "aa";
- global_dictionary["cc/ab"]["ab"] = "ab";
- global_dictionary["cc/ab"]["b"] = "bb";
- global_dictionary["cc/ab"]["c"] = "cc";
- global_dictionary["cc/ab"]["ac"] = "ac";
- global_dictionary["cc/ab"]["bc"] = "bc";
-
- global_dictionary["ab/ab"]["a"] = "aa";
- global_dictionary["ab/ab"]["b"] = "bb";
- global_dictionary["ab/ab"]["ab"] = "ab";
-}
-
-void
-load_cp_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["ab/--"] = "ab/--";
- types["--/ab"] = "--/ab";
- types["ab/aa"] = "ab/aa";
- types["aa/ab"] = "aa/ab";
- types["ab/a-"] = "ab/--";
- types["-a/ab"] = "--/ab";
- types["ab/c-"] = "ab/cd";
- types["-c/ab"] = "ab/cd";
- types["ab/cc"] = "ab/--";
- types["cc/ab"] = "--/ab";
- types["ab/ab"] = "ab/ab";
- types["ab/ac"] = "ab/ac";
- types["ab/cd"] = "ab/cd";
- types["-a/bb"] = "ab/--";
- types["aa/b-"] = "--/ab";
-
- dictionary["ab/--"]["--"] = "--";
- dictionary["ab/--"]["aa"] = "aa";
- dictionary["ab/--"]["ab"] = "ab";
- dictionary["ab/--"]["bb"] = "ab";
- dictionary["ab/--"]["ac"] = "aa";
- dictionary["ab/--"]["bc"] = "ab";
-
- dictionary["--/ab"]["--"] = "--";
- dictionary["--/ab"]["aa"] = "aa";
- dictionary["--/ab"]["ab"] = "ab";
- dictionary["--/ab"]["bb"] = "ab";
- dictionary["--/ab"]["ac"] = "aa";
- dictionary["--/ab"]["bc"] = "ab";
-
- dictionary["ab/aa"]["--"] = "--";
- dictionary["ab/aa"]["aa"] = "aa";
- dictionary["ab/aa"]["ab"] = "ab";
-
- dictionary["aa/ab"]["--"] = "--";
- dictionary["aa/ab"]["aa"] = "aa";
- dictionary["aa/ab"]["ab"] = "ab";
-
- dictionary["ab/ab"]["--"] = "--";
- dictionary["ab/ab"]["ab"] = "ab";
- dictionary["ab/ab"]["aa"] = "aa";
- dictionary["ab/ab"]["bb"] = "bb";
-
- dictionary["ab/ac"]["--"] = "--";
- dictionary["ab/ac"]["ab"] = "ab";
- dictionary["ab/ac"]["ac"] = "ac";
- dictionary["ab/ac"]["bc"] = "bc";
- dictionary["ab/ac"]["aa"] = "aa";
-
- dictionary["ab/cd"]["--"] = "--";
- dictionary["ab/cd"]["ac"] = "ac";
- dictionary["ab/cd"]["ad"] = "ad";
- dictionary["ab/cd"]["bc"] = "bc";
- dictionary["ab/cd"]["bd"] = "bd";
-
- return;
-}
-
-void
-load_joinmap_cp_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["ab/--"] = "lmx--";
- types["--/ab"] = "--xnp";
- types["ab/aa"] = "lmxll";
- types["aa/ab"] = "nnxnp";
- types["ab/ab"] = "hkxhk";
- types["ab/ac"] = "efxeg";
- types["ab/cd"] = "abxcd";
-
- dictionary["lmx--"]["--"] = "--";
- dictionary["lmx--"]["aa"] = "ll";
- dictionary["lmx--"]["ab"] = "lm";
- dictionary["lmx--"]["bb"] = "lm";
- dictionary["lmx--"]["ac"] = "ll";
- dictionary["lmx--"]["bc"] = "lm";
-
- dictionary["--xnp"]["--"] = "--";
- dictionary["--xnp"]["aa"] = "nn";
- dictionary["--xnp"]["ab"] = "np";
- dictionary["--xnp"]["bb"] = "np";
- dictionary["--xnp"]["ac"] = "nn";
- dictionary["--xnp"]["bc"] = "np";
-
- dictionary["lmxll"]["--"] = "--";
- dictionary["lmxll"]["aa"] = "ll";
- dictionary["lmxll"]["ab"] = "lm";
-
- dictionary["nnxnp"]["--"] = "--";
- dictionary["nnxnp"]["aa"] = "nn";
- dictionary["nnxnp"]["ab"] = "np";
-
- dictionary["hkxhk"]["--"] = "--";
- dictionary["hkxhk"]["ab"] = "hk";
- dictionary["hkxhk"]["aa"] = "hh";
- dictionary["hkxhk"]["bb"] = "kk";
-
- dictionary["efxeg"]["--"] = "--";
- dictionary["efxeg"]["ab"] = "ef";
- dictionary["efxeg"]["ac"] = "eg";
- dictionary["efxeg"]["bc"] = "fg";
- dictionary["efxeg"]["aa"] = "ee";
-
- dictionary["abxcd"]["--"] = "--";
- dictionary["abxcd"]["ac"] = "ac";
- dictionary["abxcd"]["ad"] = "ad";
- dictionary["abxcd"]["bc"] = "bc";
- dictionary["abxcd"]["bd"] = "bd";
-
- return;
-}
-
-void
-load_onemap_cp_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary)
-{
- types["ab/--"] = "abxoo";
- types["--/ab"] = "ooxab";
- types["ab/aa"] = "abxaa";
- types["aa/ab"] = "aaxab";
- types["ab/ab"] = "abxab";
- types["ab/ac"] = "abxac";
- types["ab/cd"] = "abxcd";
-
- // D1.11
- dictionary["abxoo"]["--"] = "-";
- dictionary["abxoo"]["aa"] = "a";
- dictionary["abxoo"]["bb"] = "b";
-
- // D2.16
- dictionary["ooxab"]["--"] = "-";
- dictionary["ooxab"]["aa"] = "a";
- dictionary["ooxab"]["bb"] = "b";
-
- // D1.10
- dictionary["abxaa"]["--"] = "-";
- dictionary["abxaa"]["aa"] = "a";
- dictionary["abxaa"]["ab"] = "ab";
-
- // D2.15
- dictionary["aaxab"]["--"] = "-";
- dictionary["aaxab"]["aa"] = "a";
- dictionary["aaxab"]["ab"] = "ab";
-
- // B3.7
- dictionary["abxab"]["--"] = "-";
- dictionary["abxab"]["ab"] = "ab";
- dictionary["abxab"]["aa"] = "a";
- dictionary["abxab"]["bb"] = "b";
-
- // A.2
- dictionary["abxac"]["--"] = "-";
- dictionary["abxac"]["ab"] = "ba";
- dictionary["abxac"]["ac"] = "ac";
- dictionary["abxac"]["bc"] = "bc";
- dictionary["abxac"]["aa"] = "a";
-
- // A.1
- dictionary["abxcd"]["--"] = "-";
- dictionary["abxcd"]["ac"] = "ac";
- dictionary["abxcd"]["ad"] = "ad";
- dictionary["abxcd"]["bc"] = "bc";
- dictionary["abxcd"]["bd"] = "bd";
-
- return;
-}
-
-void
-load_bc_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["aa/bb"] = "aa/bb";
- types["bb/aa"] = "bb/aa";
- types["ab/cc"] = "ab/cc";
- types["cc/ab"] = "cc/ab";
-
- dictionary["aa/bb"]["--"] = "--";
- dictionary["aa/bb"]["aa"] = "aa";
- dictionary["aa/bb"]["ab"] = "ab";
- dictionary["aa/bb"]["bb"] = "bb";
-
- dictionary["bb/aa"]["--"] = "--";
- dictionary["bb/aa"]["aa"] = "aa";
- dictionary["bb/aa"]["ab"] = "ab";
- dictionary["bb/aa"]["bb"] = "bb";
-
- dictionary["ab/cc"]["--"] = "--";
- dictionary["ab/cc"]["ac"] = "ac";
- dictionary["ab/cc"]["bc"] = "bc";
- dictionary["ab/cc"]["ab"] = "ab";
- dictionary["ab/cc"]["aa"] = "aa";
- dictionary["ab/cc"]["bb"] = "bb";
-
- dictionary["cc/ab"]["--"] = "--";
- dictionary["cc/ab"]["ac"] = "ac";
- dictionary["cc/ab"]["bc"] = "bc";
- dictionary["cc/ab"]["ab"] = "ab";
- dictionary["cc/ab"]["aa"] = "aa";
- dictionary["cc/ab"]["bb"] = "bb";
-}
-
-void
-load_f2_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["aa/bb"] = "aa/bb";
- types["ab/cd"] = "ab/cd";
- types["ab/aa"] = "ab/aa";
- types["aa/ab"] = "aa/ab";
- types["ab/cc"] = "ab/cc";
- types["cc/ab"] = "cc/ab";
+#include <string>
+#include <map>
- dictionary["aa/bb"]["aa"] = "aa";
- dictionary["aa/bb"]["ab"] = "ab";
- dictionary["aa/bb"]["bb"] = "bb";
- dictionary["aa/bb"]["--"] = "--";
+using std::string;
+using std::map;
- dictionary["ab/cd"]["aa"] = "aa";
- dictionary["ab/cd"]["ab"] = "ab";
- dictionary["ab/cd"]["bb"] = "bb";
- dictionary["ab/cd"]["cc"] = "cc";
- dictionary["ab/cd"]["cd"] = "cd";
- dictionary["ab/cd"]["dd"] = "dd";
- dictionary["ab/cd"]["ac"] = "ac";
- dictionary["ab/cd"]["ad"] = "ad";
- dictionary["ab/cd"]["bc"] = "bc";
- dictionary["ab/cd"]["bd"] = "bd";
- dictionary["ab/cd"]["--"] = "--";
-
- dictionary["ab/aa"]["aa"] = "--";
- dictionary["ab/aa"]["ab"] = "--";
- dictionary["ab/aa"]["bb"] = "bb";
- dictionary["ab/aa"]["--"] = "--";
-
- dictionary["aa/ab"]["aa"] = "--";
- dictionary["aa/ab"]["ab"] = "--";
- dictionary["aa/ab"]["bb"] = "bb";
- dictionary["aa/ab"]["--"] = "--";
-
- dictionary["ab/cc"]["aa"] = "aa";
- dictionary["ab/cc"]["ab"] = "ab";
- dictionary["ab/cc"]["bb"] = "bb";
- dictionary["ab/cc"]["cc"] = "cc";
- dictionary["ab/cc"]["ac"] = "--";
- dictionary["ab/cc"]["bc"] = "--";
- dictionary["ab/cc"]["--"] = "--";
-
- dictionary["cc/ab"]["aa"] = "aa";
- dictionary["cc/ab"]["ab"] = "ab";
- dictionary["cc/ab"]["bb"] = "bb";
- dictionary["cc/ab"]["cc"] = "cc";
- dictionary["cc/ab"]["ac"] = "--";
- dictionary["cc/ab"]["bc"] = "--";
- dictionary["cc/ab"]["--"] = "--";
-}
-
-void
-load_mm_bc_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["aa/bb"] = "aaxbb";
- types["bb/aa"] = "bbxaa";
- types["ab/cc"] = "abxcc";
- types["cc/ab"] = "ccxab";
-
- dictionary["aaxbb"]["--"] = "-";
- dictionary["aaxbb"]["aa"] = "b";
- dictionary["aaxbb"]["ab"] = "h";
- dictionary["aaxbb"]["bb"] = "h";
-
- dictionary["bbxaa"]["--"] = "-";
- dictionary["bbxaa"]["aa"] = "h";
- dictionary["bbxaa"]["ab"] = "h";
- dictionary["bbxaa"]["bb"] = "a";
-
- dictionary["abxcc"]["--"] = "-";
- dictionary["abxcc"]["ac"] = "h";
- dictionary["abxcc"]["bc"] = "h";
- dictionary["abxcc"]["ab"] = "b";
- dictionary["abxcc"]["aa"] = "b";
- dictionary["abxcc"]["bb"] = "b";
-
- dictionary["ccxab"]["--"] = "-";
- dictionary["ccxab"]["ac"] = "h";
- dictionary["ccxab"]["bc"] = "h";
- dictionary["ccxab"]["ab"] = "a";
- dictionary["ccxab"]["aa"] = "a";
- dictionary["ccxab"]["bb"] = "a";
-}
-
-void
-load_mm_f2_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["aa/bb"] = "aaxbb";
- types["ab/cd"] = "abxcd";
- types["ab/aa"] = "abxaa";
- types["aa/ab"] = "aaxab";
- types["ab/cc"] = "abxcc";
- types["cc/ab"] = "ccxab";
-
- dictionary["aaxbb"]["aa"] = "a";
- dictionary["aaxbb"]["ab"] = "h";
- dictionary["aaxbb"]["bb"] = "b";
- dictionary["aaxbb"]["--"] = "-";
-
- dictionary["abxcd"]["aa"] = "a";
- dictionary["abxcd"]["ab"] = "a";
- dictionary["abxcd"]["bb"] = "a";
- dictionary["abxcd"]["cc"] = "b";
- dictionary["abxcd"]["cd"] = "b";
- dictionary["abxcd"]["dd"] = "b";
- dictionary["abxcd"]["ac"] = "h";
- dictionary["abxcd"]["ad"] = "h";
- dictionary["abxcd"]["bc"] = "h";
- dictionary["abxcd"]["bd"] = "h";
- dictionary["abxcd"]["--"] = "-";
-
- dictionary["abxaa"]["aa"] = "-";
- dictionary["abxaa"]["ab"] = "-";
- dictionary["abxaa"]["bb"] = "a";
- dictionary["abxaa"]["--"] = "-";
-
- dictionary["aaxab"]["aa"] = "-";
- dictionary["aaxab"]["ab"] = "-";
- dictionary["aaxab"]["bb"] = "b";
- dictionary["aaxab"]["--"] = "-";
-
- dictionary["abxcc"]["aa"] = "a";
- dictionary["abxcc"]["ab"] = "a";
- dictionary["abxcc"]["bb"] = "a";
- dictionary["abxcc"]["cc"] = "b";
- dictionary["abxcc"]["ac"] = "-";
- dictionary["abxcc"]["bc"] = "-";
- dictionary["abxcc"]["--"] = "-";
-
- dictionary["ccxab"]["aa"] = "b";
- dictionary["ccxab"]["ab"] = "b";
- dictionary["ccxab"]["bb"] = "b";
- dictionary["ccxab"]["cc"] = "a";
- dictionary["ccxab"]["ac"] = "-";
- dictionary["ccxab"]["bc"] = "-";
- dictionary["ccxab"]["--"] = "-";
-}
-
-void
-load_dh_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["ab/--"] = "ab/--";
- types["--/ab"] = "--/ab";
-
- dictionary["ab/--"]["aa"] = "aa";
- dictionary["ab/--"]["bb"] = "bb";
- dictionary["ab/--"]["--"] = "--";
-
- dictionary["--/ab"]["aa"] = "aa";
- dictionary["--/ab"]["bb"] = "bb";
- dictionary["--/ab"]["--"] = "--";
-}
-
-void
-load_mm_dh_dictionary(map<string, string> &types,
- map<string, map<string, string> > &dictionary)
-{
- types["ab/--"] = "abx--";
- types["--/ab"] = "--xab";
-
- dictionary["abx--"]["aa"] = "a";
- dictionary["abx--"]["bb"] = "b";
- dictionary["abx--"]["--"] = "-";
-
- dictionary["--xab"]["aa"] = "a";
- dictionary["--xab"]["bb"] = "b";
- dictionary["--xab"]["--"] = "-";
-}
-
-void
-load_segregation_ratios(map_types type, map<string, map<string, double> > &segregation_ratios)
-{
- switch(type) {
- case cp:
- segregation_ratios["ab/--"]["aa"] = 0.50;
- segregation_ratios["ab/--"]["ab"] = 0.50;
-
- segregation_ratios["--/ab"]["aa"] = 0.50;
- segregation_ratios["--/ab"]["ab"] = 0.50;
-
- segregation_ratios["ab/aa"]["aa"] = 0.50;
- segregation_ratios["ab/aa"]["ab"] = 0.50;
-
- segregation_ratios["aa/ab"]["aa"] = 0.50;
- segregation_ratios["aa/ab"]["ab"] = 0.50;
-
- segregation_ratios["ab/ab"]["ab"] = 0.50;
- segregation_ratios["ab/ab"]["aa"] = 0.25;
- segregation_ratios["ab/ab"]["bb"] = 0.25;
-
- segregation_ratios["ab/ac"]["ab"] = 0.25;
- segregation_ratios["ab/ac"]["ac"] = 0.25;
- segregation_ratios["ab/ac"]["bc"] = 0.25;
- segregation_ratios["ab/ac"]["aa"] = 0.25;
-
- segregation_ratios["ab/cd"]["ac"] = 0.25;
- segregation_ratios["ab/cd"]["ad"] = 0.25;
- segregation_ratios["ab/cd"]["bc"] = 0.25;
- segregation_ratios["ab/cd"]["bd"] = 0.25;
- break;
- case f2:
- segregation_ratios["aaxbb"]["a"] = 0.25;
- segregation_ratios["aaxbb"]["b"] = 0.25;
- segregation_ratios["aaxbb"]["h"] = 0.50;
-
- segregation_ratios["abxcd"]["a"] = 0.25;
- segregation_ratios["abxcd"]["b"] = 0.25;
- segregation_ratios["abxcd"]["h"] = 0.50;
-
- segregation_ratios["abxaa"]["a"] = 1.00;
-
- segregation_ratios["aaxab"]["b"] = 1.00;
-
- segregation_ratios["abxcc"]["a"] = 0.50;
- segregation_ratios["abxcc"]["b"] = 0.50;
-
- segregation_ratios["ccxab"]["b"] = 0.50;
- segregation_ratios["ccxab"]["a"] = 0.50;
- break;
- case bc1:
- segregation_ratios["aaxbb"]["h"] = 0.50;
- segregation_ratios["aaxbb"]["b"] = 0.50;
-
- segregation_ratios["bbxaa"]["h"] = 0.50;
- segregation_ratios["bbxaa"]["a"] = 0.50;
-
- segregation_ratios["abxcc"]["h"] = 0.50;
- segregation_ratios["abxcc"]["b"] = 0.50;
-
- segregation_ratios["ccxab"]["h"] = 0.50;
- segregation_ratios["ccxab"]["a"] = 0.50;
- break;
- case dh:
- segregation_ratios["ab/--"]["a"] = 0.50;
- segregation_ratios["ab/--"]["b"] = 0.50;
-
- segregation_ratios["--/ab"]["a"] = 0.50;
- segregation_ratios["--/ab"]["b"] = 0.50;
- break;
- case gen:
- case none:
- case unk:
- break;
- }
+enum map_types {unk, none, gen, dh, cp, bc1, f2};
+enum out_types {rqtl, joinmap, onemap, genomic};
- return;
-}
+void initialize_dictionaries(map<string, map<string, string> > &global_dictionary);
+void load_cp_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_joinmap_cp_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_onemap_cp_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_bc_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_f2_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_mm_bc_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_mm_f2_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_dh_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_mm_dh_dictionary(map<string, string> &types, map<string, map<string, string> > &dictionary);
+void load_segregation_ratios(map_types type, map<string, map<string, double> > &segregation_ratios);
inline
-int
-encode_gtype(char a)
-{
+int
+encode_gtype(char a)
+{
switch (a) {
case 'A':
- return 0;
+ return 0;
case 'C':
- return 1;
+ return 1;
case 'G':
- return 2;
+ return 2;
case 'T':
- return 3;
+ return 3;
}
-
return -1;
}
-int
-encoded_gtypes[4][4] =
-{
- // A C G T
- {1, 2, 3, 4}, // A
- {2, 5, 6, 7}, // C
- {3, 6, 8, 9}, // G
- {4, 7, 9, 10} // T
-};
-
#endif // __GENOTYPE_DICTIONARIES_H__
diff --git a/src/genotypes.cc b/src/genotypes.cc
index 9d0180e..d160159 100644
--- a/src/genotypes.cc
+++ b/src/genotypes.cc
@@ -26,8 +26,12 @@
// University of Oregon
//
+#include "MetaPopInfo.h"
+
#include "genotypes.h"
+extern int **encoded_gtypes;
+
// Global variables to hold command-line options.
int num_threads = 1;
int batch_id = -1;
@@ -82,8 +86,12 @@ int main (int argc, char* argv[]) {
initialize_dictionaries(global_dictionary);
- vector<string> files;
- build_file_list(files);
+ MetaPopInfo mpopi;
+ mpopi.init_directory(in_path);
+ if (mpopi.samples().empty()) {
+ cerr << "Error: Failed to find sample files in directory '" << in_path << "'.\n";
+ return -1;
+ }
if (wl_file.length() > 0) {
load_marker_list(wl_file, whitelist);
@@ -116,33 +124,58 @@ int main (int argc, char* argv[]) {
// Load matches to the catalog
//
vector<vector<CatMatch *> > catalog_matches;
- map<int, string> samples;
- vector<int> sample_ids;
- for (uint i = 0; i < files.size(); i++) {
- vector<CatMatch *> m;
- load_catalog_matches(in_path + files[i], m);
-
- if (m.size() == 0) {
- cerr << "Warning: unable to find any matches in file '" << files[i] << "', excluding this sample from genotypes analysis.\n";
- continue;
- }
+ vector<size_t> samples_to_remove;
+ set<size_t> seen_samples;
+ for (size_t i=0; i<mpopi.samples().size(); ++i) {
+ catalog_matches.push_back(vector<CatMatch*>());
+ vector<CatMatch *>& m = catalog_matches.back();
+
+ const MetaPopInfo::Sample& sample = mpopi.samples()[i];
+ load_catalog_matches(in_path + sample.name, m);
+
+ if (m.size() == 0) {
+ cerr << "Warning: Absent or malformed matches file '"
+ << sample.name << ".matches.tsv(.gz)"
+ <<"', excluding this sample from population analysis.\n";
+ samples_to_remove.push_back(i);
+ catalog_matches.pop_back(); // n.b. Index shift will be resolved by the call to MetaPopInfo::delete_samples().
+ continue;
+ }
- catalog_matches.push_back(m);
- samples[m[0]->sample_id] = files[i];
- sample_ids.push_back(m[0]->sample_id);
+ size_t sample_id = m[0]->sample_id;
+ if (seen_samples.count(sample_id) > 0) {
+ cerr << "Error: sample ID " << sample_id << " occurs twice in this data set, likely the pipeline was run incorrectly.\n";
+ return -1;
+ }
+ seen_samples.insert(sample_id);
+ mpopi.set_sample_id(i, sample_id);
+ }
+ mpopi.delete_samples(samples_to_remove);
+ if (mpopi.samples().size() == 0) {
+ cerr << "Error: Couln't find any matches files.\n";
+ return -1;
}
+ // [mpopi] is definitive.
+ cerr << "Working on " << mpopi.samples().size() << " samples.\n";
- sort(sample_ids.begin(), sample_ids.end());
+ // Set the deprecated [samples] variable to the value it should have
+ map<int, string> samples; // map of {sample_id, sample_name}
+ mpopi.fill_samples(samples);
+ // Retrieve the IDs of the parents
+ vector<int> sample_ids;
+ for (auto& sample : mpopi.samples())
+ sample_ids.push_back(sample.id);
+ sort(sample_ids.begin(), sample_ids.end());
set<int> parent_ids;
identify_parental_ids(catalog, sample_ids, parent_ids);
//
// Create the population map
//
- cerr << "Populating observed haplotypes for " << sample_ids.size() << " samples, " << catalog.size() << " loci.\n";
- PopMap<CSLocus> *pmap = new PopMap<CSLocus>(sample_ids.size(), catalog.size());
- pmap->populate(sample_ids, catalog, catalog_matches);
+ cerr << "Populating observed haplotypes for " << mpopi.samples().size() << " samples, " << catalog.size() << " loci.\n";
+ PopMap<CSLocus> *pmap = new PopMap<CSLocus>(mpopi, catalog.size());
+ pmap->populate(catalog, catalog_matches);
apply_locus_constraints(catalog, pmap);
@@ -2677,45 +2710,6 @@ int load_marker_list(string path, set<int> &list) {
return 0;
}
-int build_file_list(vector<string> &files) {
- uint pos;
- string file;
- struct dirent *direntry;
-
- DIR *dir = opendir(in_path.c_str());
-
- if (dir == NULL) {
- cerr << "Unable to open directory '" << in_path << "' for reading.\n";
- exit(1);
- }
-
- while ((direntry = readdir(dir)) != NULL) {
- file = direntry->d_name;
-
- if (file == "." || file == "..")
- continue;
-
- if (file.substr(0, 6) == "batch_")
- continue;
-
- pos = file.rfind(".tags.tsv");
- if (pos < file.length())
- files.push_back(file.substr(0, pos));
- }
-
- closedir(dir);
-
- sort(files.begin(), files.end());
-
- if (files.size() == 0) {
- cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
- }
-
- cerr << "Found " << files.size() << " input file(s).\n";
-
- return 0;
-}
-
bool hap_compare(pair<string, int> a, pair<string, int> b) {
return (a.second > b.second);
}
diff --git a/src/input.cc b/src/input.cc
index 3834e06..d58b6a6 100644
--- a/src/input.cc
+++ b/src/input.cc
@@ -67,12 +67,12 @@ Seq::Seq(const char *id, const char *seq, const char *qual, const char *chr, uin
strcpy(this->qual, qual);
this->loc.set(chr, bp, strand);
- sprintf(this->loc_str, "%s|%d|%c", chr, bp, strand == plus ? '+' : '-');
+ sprintf(this->loc_str, "%s|%d|%c", chr, bp, strand == strand_plus ? '+' : '-');
//
// Reverse complement sequences from the negative strand
//
- if (strand == plus) {
+ if (strand == strand_plus) {
this->seq = new char[strlen(seq) + 1];
strcpy(this->seq, seq);
} else {
diff --git a/src/kmers.cc b/src/kmers.cc
index 60bf0c5..d1bff17 100644
--- a/src/kmers.cc
+++ b/src/kmers.cc
@@ -98,7 +98,7 @@ generate_kmers_lazily(const char *seq, uint kmer_len, uint num_kmers, vector<cha
if (num_kmers > kmers.size()) {
int new_kmers = num_kmers - kmers.size();
- for (uint i = 0; i < new_kmers; i++) {
+ for (int i = 0; i < new_kmers; i++) {
kmer = new char[kmer_len + 1];
kmers.push_back(kmer);
}
diff --git a/src/locus.cc b/src/locus.cc
index 67ffea9..7daee85 100644
--- a/src/locus.cc
+++ b/src/locus.cc
@@ -26,7 +26,7 @@
uint
Locus::sort_bp(uint k)
{
- if (this->loc.strand == plus)
+ if (this->loc.strand == strand_plus)
return this->loc.bp + k;
else
return (k == 0 ? this->loc.bp - this->len + 1 : this->loc.bp - k);
diff --git a/src/mstack.cc b/src/mstack.cc
index 32127bc..8d7054f 100644
--- a/src/mstack.cc
+++ b/src/mstack.cc
@@ -194,10 +194,10 @@ MergedStack::calc_likelihood()
//
// Don't invoke the model within gaps.
//
- if (cur_gap < this->gaps.size() && col == this->gaps[cur_gap].start) {
+ if (cur_gap < gaps.size() && col == (int)gaps[cur_gap].start) {
do {
col++;
- } while (col < this->gaps[cur_gap].end && col < length);
+ } while (col < (int)gaps[cur_gap].end && col < length);
col--;
continue;
}
diff --git a/src/ordered.h b/src/ordered.h
index 6d74a86..e3455f8 100644
--- a/src/ordered.h
+++ b/src/ordered.h
@@ -21,6 +21,7 @@
#ifndef __ORDERED_H__
#define __ORDERED_H__
+#include <iostream>
#include <fstream>
using std::ifstream;
using std::ofstream;
@@ -34,8 +35,11 @@ using std::map;
#include <set>
using std::set;
+#include "MetaPopInfo.h"
#include "PopSum.h"
+extern MetaPopInfo mpopi;
+
enum loc_type {haplotype, snp};
template<class StatT>
@@ -263,10 +267,10 @@ OPopPair<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vecto
<< "incompatible_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << loc->sort_bp(k) << "\t"
+ << loc->sort_bp(k) +1 << "\t"
<< k << "\t"
- << pop_1 << "\t"
- << pop_2 << "\n";
+ << mpopi.pops()[pop_1].name << "\t"
+ << mpopi.pops()[pop_2].name << "\n";
delete pair;
continue;
}
@@ -292,10 +296,10 @@ OPopPair<StatT>::order(vector<StatT *> &sites, map<uint, uint> &sites_key, vecto
<< "multiple_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << pair->bp << "\t"
+ << pair->bp +1 << "\t"
<< k << "\t"
- << pop_key[pop_1] << "\t"
- << pop_key[pop_2] << "\n";
+ << mpopi.pops()[pop_1].name << "\t"
+ << mpopi.pops()[pop_2].name << "\n";
delete pair;
continue;
}
@@ -346,7 +350,7 @@ OSumStat<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci, i
if (lsum->nucs[k].num_indv == 0) continue;
if (sites_key.count(lsum->nucs[k].bp) == 0) {
- cerr << "Error: locus " << lsum->nucs[k].loc_id << " at " << lsum->nucs[k].bp << "bp is not defined in the sites map.\n";
+ cerr << "Error: locus " << lsum->nucs[k].loc_id << " at " << lsum->nucs[k].bp +1 << "bp is not defined in the sites map.\n";
} else if (sites[sites_key[lsum->nucs[k].bp]] == NULL) {
sites[sites_key[lsum->nucs[k].bp]] = &(lsum->nucs[k]);
@@ -357,9 +361,9 @@ OSumStat<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci, i
<< "multiple_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << lsum->nucs[k].bp << "\t"
+ << lsum->nucs[k].bp +1 << "\t"
<< k << "\t"
- << pop_id << "\t"
+ << mpopi.pops()[pop_id].name << "\t"
<< "conflicts with locus " << sites[sites_key[lsum->nucs[k].bp]]->loc_id << "\n";
}
}
@@ -407,7 +411,7 @@ OLocTally<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci)
if (ltally->nucs[k].allele_cnt != 2) continue;
if (sites_key.count(ltally->nucs[k].bp) == 0) {
- cerr << "Error: locus " << ltally->nucs[k].loc_id << " at " << ltally->nucs[k].bp << "bp is not defined in the sites map.\n";
+ cerr << "Error: locus " << ltally->nucs[k].loc_id << " at " << ltally->nucs[k].bp +1 << "bp is not defined in the sites map.\n";
} else if (sites[sites_key[ltally->nucs[k].bp]] == NULL) {
sites[sites_key[ltally->nucs[k].bp]] = &(ltally->nucs[k]);
@@ -418,7 +422,7 @@ OLocTally<StatT>::order(vector<StatT *> &sites, vector<CSLocus *> &sorted_loci)
<< "multiple_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << ltally->nucs[k].bp << "\t"
+ << ltally->nucs[k].bp +1 << "\t"
<< k << "\t"
<< "conflicts with locus " << sites[sites_key[ltally->nucs[k].bp]]->loc_id << "\n";
}
diff --git a/src/populations.cc b/src/populations.cc
index 97bbfa5..072c494 100644
--- a/src/populations.cc
+++ b/src/populations.cc
@@ -23,14 +23,30 @@
// haplotypes in a population context.
//
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "MetaPopInfo.h"
+#include "export_formats.h"
+
#include "populations.h"
+using namespace std;
+
+typedef MetaPopInfo::Sample Sample;
+typedef MetaPopInfo::Pop Pop;
+typedef MetaPopInfo::Group Group;
+
+extern int **encoded_gtypes;
+
// Global variables to hold command-line options.
+InputMode input_mode = InputMode::stacks;
int num_threads = 1;
int batch_id = -1;
string in_path;
+string in_vcf_path;
string out_path;
-string out_file;
string pmap_path;
string bl_file;
string wl_file;
@@ -83,10 +99,11 @@ double minor_allele_freq = 0.0;
double max_obs_het = 1.0;
double p_value_cutoff = 0.05;
corr_type fst_correction = no_correction;
+set<string> debug_flags;
+
+string out_prefix;
-map<int, string> pop_key, grp_key;
-map<int, pair<int, int> > pop_indexes;
-map<int, vector<int> > grp_members;
+MetaPopInfo mpopi;
set<int> blacklist, bootstraplist;
map<int, set<int> > whitelist;
@@ -101,9 +118,20 @@ map<string, int> renz_olap;
int main (int argc, char* argv[]) {
+#ifndef HAVE_LIBZ
+ cerr << "Stacks was compiled without zlib, and will refuse to parse compressed files.\n";
+#endif
+
+ //
+ // Initialize the globals that need it.
+ //
initialize_renz(renz, renz_cnt, renz_len);
initialize_renz_olap(renz_olap);
+ srandom(time(NULL));
+ //
+ // Parse the command line.
+ //
parse_command_line(argc, argv);
cerr
@@ -137,128 +165,321 @@ int main (int argc, char* argv[]) {
}
//
+ // Open and initialize the log file.
+ //
+ struct stat path_stat;
+ if (stat(out_path.c_str(), &path_stat) == 0) {
+ // Path exists, check that it is a directory
+ DIR* d = opendir(out_path.c_str());
+ if (d == NULL) {
+ cerr << "Error: Failed to open '" << out_path << "' as a directory.\n";
+ return -1;
+ }
+ closedir(d);
+ } else if (mkdir(out_path.c_str(), ACCESSPERMS) != 0) {
+ // Failed to create the directory.
+ cerr << "Error: Failed to create directory '" << out_path << "'.\n";
+ return -1;
+ }
+ string log_path = out_path + out_prefix + ".populations.log";
+ ofstream log_fh(log_path.c_str(), ofstream::out);
+ if (log_fh.fail()) {
+ cerr << "Error opening log file '" << log_path << "'\n";
+ return -1;
+ }
+ init_log(log_fh, argc, argv);
+ log_fh << flush;
+
+ //
// Set the number of OpenMP parallel threads to execute.
//
+
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
//
- // Seed the random number generator
+ // Initialize the catalog and the MetaPopInfo
//
- srandom(time(NULL));
- vector<pair<int, string> > files;
- if (!build_file_list(files, pop_indexes, grp_members))
- exit(1);
+ map<int, CSLocus *> catalog;
- if (wl_file.length() > 0) {
- load_marker_column_list(wl_file, whitelist);
- cerr << "Loaded " << whitelist.size() << " whitelisted markers.\n";
+ // We need some objects in the main scope for each mode.
+ vector<vector<CatMatch *> > catalog_matches;
+ VcfHeader *vcf_header = NULL;
+ vector<VcfRecord> *vcf_records = NULL;
+
+ // Read the population map file, if any.
+ if (not pmap_path.empty()) {
+ cerr << "Parsing population map...\n";
+ mpopi.init_popmap(pmap_path);
+ if (mpopi.samples().empty()) {
+ cerr << "Error: Failed to open or parse population map file '" << pmap_path << "'.\n";
+ return -1;
+ }
+ cerr << "The population map contained " << mpopi.samples().size() << " samples, "
+ << mpopi.pops().size() << " population(s), " << mpopi.groups().size() << " group(s).\n";
+ }
+
+ if (input_mode == InputMode::stacks) {
+ //
+ // Stacks mode
+ //
+ if (pmap_path.empty()) {
+ cerr << "No population map specified, building file list...\n";
+ mpopi.init_directory(in_path);
+ if (mpopi.samples().empty()) {
+ cerr << "Error: Failed to find sample files in directory '" << in_path << "'.\n";
+ return -1;
+ }
+ }
+
+ // Check that at least one sample file exists in the directory.
+ bool dir_good = false;
+ for (vector<Sample>::const_iterator s=mpopi.samples().begin(); s!=mpopi.samples().end(); ++s) {
+ ifstream f;
+ string path = in_path + s->name + ".matches.tsv";
+ f.open(path);
+ if (f.is_open()) {
+ dir_good = true;
+ break;
+ }
+#if HAVE_LIBZ
+ path += ".gz";
+ gzFile g = gzopen(path.c_str(), "rb");
+ if (g != NULL) {
+ dir_good = true;
+ gzclose(g);
+ break;
+ }
+#endif
+ }
+ if (!dir_good) {
+ cerr << "Error: Unable to locate any file in input directory '" << in_path << "'.\n";
+ return -1;
+ }
+
+ //
+ // Load the catalog
+ //
+ cerr << "Reading the catalog...\n";
+ string catalog_prefix = in_path + "batch_" + to_string(batch_id) + ".catalog";
+ bool compressed = false;
+ int res = load_loci(catalog_prefix, catalog, false, false, compressed);
+ if (res == 0) {
+ cerr << "Unable to load the catalog '" << catalog_prefix << "'\n";
+ return -1;
+ }
+
+ //
+ // Load the matches
+ //
+ cerr << "Reading matches to the catalog...\n";
+ vector<size_t> samples_to_remove;
+ set<size_t> seen_samples;
+ for (size_t i = 0; i < mpopi.samples().size(); ++i) {
+ catalog_matches.push_back(vector<CatMatch*>());
+ vector<CatMatch *>& m = catalog_matches.back();
+ load_catalog_matches(in_path + mpopi.samples().at(i).name, m);
+
+ if (m.size() == 0) {
+ cerr << "Warning: Absent or malformed matches file '"
+ << mpopi.samples()[i].name << ".matches.tsv(.gz)"
+ <<"', excluding this sample from population analysis.\n";
+ samples_to_remove.push_back(i);
+ catalog_matches.pop_back(); // This introduces an index shift between catalog_matches and [i]/[mpopi],
+ // which will be resolved by a call to MetaPopInfo::delete_samples().
+ continue;
+ }
+
+ size_t sample_id = m[0]->sample_id;
+ if (seen_samples.count(sample_id) > 0) {
+ cerr << "Error: sample ID " << sample_id << " occurs twice in this data set, likely the pipeline was run incorrectly.\n";
+ exit(0);
+ }
+ seen_samples.insert(sample_id);
+ mpopi.set_sample_id(i, sample_id);
+ }
+
+ mpopi.delete_samples(samples_to_remove);
+ if (mpopi.samples().size() == 0) {
+ cerr << "Error: Couln't find any matches files.\n";
+ return -1;
+ }
+ // [mpopi] is definitive.
+
+ } else if (input_mode == InputMode::vcf) {
+
+ //
+ // VCF mode
+ //
+
+ // Open the VCF file
+ cerr << "Opening the VCF file...\n";
+ VcfAbstractParser* parser = Vcf::adaptive_open(in_vcf_path);
+ if (parser == NULL) {
+ cerr << "Error: Unable to open VCF file '" << in_vcf_path << "'.\n";
+ return -1;
+ }
+
+ parser->read_header();
+ if (parser->header().samples().empty()) {
+ cerr << "Error: No samples in VCF file '" << in_vcf_path << "'.\n";
+ return -1;
+ }
+
+ // Reconsider the MetaPopInfo in light of the VCF header.
+ if (pmap_path.empty()) {
+ cerr << "No population map specified, creating one from the VCF header...\n";
+ mpopi.init_names(parser->header().samples());
+ } else {
+ // Intersect the samples present in the population map and the VCF.
+ vector<size_t> samples_to_discard;
+ for (size_t i=0; i<mpopi.samples().size(); ++i)
+ if (not parser->header().sample_indexes().count(mpopi.samples()[i].name))
+ samples_to_discard.push_back(i);
+ if (not samples_to_discard.empty()) {
+ if (samples_to_discard.size() == mpopi.samples().size()) {
+ cerr << "Error: No common samples between the population map and VCF header.\n";
+ return -1;
+ }
+ cerr << "Warning: of the samples listed in the population map, "
+ << samples_to_discard.size() << " could not be found in the VCF :";
+ for (const size_t& s : samples_to_discard)
+ cerr << " " << mpopi.samples()[s].name;
+ cerr << "\n";
+ mpopi.delete_samples(samples_to_discard);
+ }
+
+ // Create arbitrary sample IDs.
+ for (size_t i = 0; i < mpopi.samples().size(); ++i)
+ mpopi.set_sample_id(i, i+1); //id=i+1
+
+ // [mpopi] is definitive.
+ }
+
+ // Read the SNP records
+ cerr << "Reading the VCF records...\n";
+ vcf_records = new vector<VcfRecord>();
+ vector<size_t> skipped_notsnp;
+ vector<size_t> skipped_filter;
+
+ vcf_records->push_back(VcfRecord());
+ VcfRecord* rec = & vcf_records->back();
+ while (parser->next_record(*rec)) {
+ // Check for a SNP.
+ if (not rec->is_snp()) {
+ skipped_notsnp.push_back(parser->line_number());
+ continue;
+ }
+
+ // Check for a filtered-out SNP
+ if (not rec->filter.empty() && rec->filter[0] != "PASS") {
+ skipped_filter.push_back(parser->line_number());
+ continue;
+ }
+
+ // Save the SNP.
+ vcf_records->push_back(VcfRecord());
+ rec = & vcf_records->back();
+ }
+ vcf_records->pop_back();
+
+ cerr << "Found " << vcf_records->size() << " SNP records in file '" << in_vcf_path
+ << "'. (Skipped " << skipped_filter.size() << " already filtered-out SNPs and "
+ << skipped_notsnp.size() << " non-SNP records ; more with --verbose.)\n";
+ if (verbose && not skipped_notsnp.empty()) {
+ log_fh << "The following VCF record lines were determined not to be SNPs and skipped :";
+ for (vector<size_t>::const_iterator l=skipped_notsnp.begin(); l!=skipped_notsnp.end(); ++l)
+ log_fh << " " << *l;
+ log_fh << "\n";
+ }
+ if (vcf_records->size() == 0) {
+ cerr << "Error: No records.\n";
+ return -1;
+ }
+
+ catalog = create_catalog(*vcf_records);
+ vcf_header = new VcfHeader(parser->header());
+ delete parser;
}
+
+ //
+ // Read the blacklist, the whitelist, and the bootstrap-whitelist.
+ //
if (bl_file.length() > 0) {
load_marker_list(bl_file, blacklist);
cerr << "Loaded " << blacklist.size() << " blacklisted markers.\n";
}
+ if (wl_file.length() > 0) {
+ load_marker_column_list(wl_file, whitelist);
+ cerr << "Loaded " << whitelist.size() << " whitelisted markers.\n";
+ check_whitelist_integrity(catalog, whitelist);
+ }
if (bs_wl_file.length() > 0) {
load_marker_list(bs_wl_file, bootstraplist);
cerr << "Loaded " << bootstraplist.size() << " markers to include when bootstrapping.\n";
}
//
- // Open the log file.
+ // Reduce the catalog accordingly, and retrieve the genomic order of loci.
//
- stringstream log;
- log << "batch_" << batch_id << ".populations.log";
- string log_path = in_path + log.str();
- ofstream log_fh(log_path.c_str(), ofstream::out);
- if (log_fh.fail()) {
- cerr << "Error opening log file '" << log_path << "'\n";
- exit(1);
- }
- init_log(log_fh, argc, argv);
+ reduce_catalog(catalog, whitelist, blacklist);
+ loci_ordered = order_unordered_loci(catalog);
- //
- // Load the catalog
- //
- stringstream catalog_file;
- map<int, CSLocus *> catalog;
- bool compressed = false;
- int res;
- catalog_file << in_path << "batch_" << batch_id << ".catalog";
- if ((res = load_loci(catalog_file.str(), catalog, false, false, compressed)) == 0) {
- cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
- return 0;
+ // Report information on the MetaPopInfo.
+ cerr << "Working on " << mpopi.samples().size() << " samples.\n";
+ cerr << "Working on " << mpopi.pops().size() << " population(s):\n";
+ for (vector<Pop>::const_iterator p = mpopi.pops().begin(); p != mpopi.pops().end(); p++) {
+ cerr << " " << p->name << ": ";
+ for (size_t s = p->first_sample; s < p->last_sample; ++s) {
+ cerr << mpopi.samples()[s].name << ", ";
+ }
+ cerr << mpopi.samples()[p->last_sample].name << "\n";
+ }
+ cerr << "Working on " << mpopi.groups().size() << " group(s) of populations:\n";
+ for (vector<Group>::const_iterator g = mpopi.groups().begin(); g != mpopi.groups().end(); g++) {
+ cerr << " " << g->name << ": ";
+ for (vector<size_t>::const_iterator p = g->pops.begin(); p != g->pops.end() -1; ++p) {
+ //rem. end()-1 and back() are safe, there's always at least one pop
+ cerr << mpopi.pops()[*p].name << ", ";
+ }
+ cerr << mpopi.pops()[g->pops.back()].name << "\n";
}
- //
- // Check the whitelist.
- //
- check_whitelist_integrity(catalog, whitelist);
-
- //
- // Implement the black/white list
- //
- reduce_catalog(catalog, whitelist, blacklist);
+ if (size_t(population_limit) > mpopi.pops().size()) {
+ cerr << "Notice: Population limit (" << population_limit << ")"
+ << " larger than number of popualtions present, adjusting parameter to "
+ << mpopi.pops().size() << "\n";
+ population_limit = mpopi.pops().size();
+ }
//
- // If the catalog is not reference aligned, assign an arbitrary ordering to catalog loci.
+ // Initialize the PopMap
//
- loci_ordered = order_unordered_loci(catalog);
- //
- // Load matches to the catalog
- //
- vector<vector<CatMatch *> > catalog_matches;
- map<int, string> samples;
- vector<int> sample_ids;
- uint removed_cnt = 0;
+ cerr << "Populating observed haplotypes for " << mpopi.samples().size() << " samples, " << catalog.size() << " loci.\n";
+ PopMap<CSLocus> *pmap = new PopMap<CSLocus>(mpopi, catalog.size());
- for (int i = 0; i < (int) files.size(); i++) {
- vector<CatMatch *> m;
- load_catalog_matches(in_path + files[i].second, m);
+ if (input_mode == InputMode::stacks) {
+ // Using SStacks matches files...
+ pmap->populate(catalog, catalog_matches);
- if (m.size() == 0) {
- cerr << " Warning: unable to find any matches in file '" << files[i].second << "', excluding this sample from population analysis.\n";
- //
- // This case is generated by an existing, but empty file.
- // Remove this sample from the population index which was built from
- // existing files, but we couldn't yet check for empty files.
- //
- uint index = i - removed_cnt;
- for (map<int, pair<int, int> >::iterator pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- if ((index >= pit->second.first && index <= pit->second.second)) {
- pit->second.second--;
- removed_cnt++;
- pit++;
- while (pit != pop_indexes.end()) {
- pit->second.first--;
- pit->second.second--;
- pit++;
- }
- break;
- }
- continue;
- }
+ for(vector<vector<CatMatch *> >::iterator sample = catalog_matches.begin(); sample != catalog_matches.end(); ++sample)
+ for(vector<CatMatch*>::iterator match = sample->begin(); match != sample->end(); ++match)
+ delete *match;
+ catalog_matches.clear();
- catalog_matches.push_back(m);
- if (samples.count(m[0]->sample_id) == 0) {
- samples[m[0]->sample_id] = files[i].second;
- sample_ids.push_back(m[0]->sample_id);
- } else {
- cerr << "Fatal error: sample ID " << m[0]->sample_id << " occurs twice in this data set, likely the pipeline was run incorrectly.\n";
- exit(0);
- }
+ } else if (input_mode == InputMode::vcf) {
+ // ...or using VCF records.
+ pmap->populate(catalog, *vcf_records, *vcf_header);
+ delete vcf_records;
+ delete vcf_header;
+ vcf_records = NULL;
+ vcf_header = NULL;
}
//
- // Create the population map
- //
- cerr << "Populating observed haplotypes for " << sample_ids.size() << " samples, " << catalog.size() << " loci.\n";
- PopMap<CSLocus> *pmap = new PopMap<CSLocus>(sample_ids.size(), catalog.size());
- pmap->populate(sample_ids, catalog, catalog_matches);
-
- //
// Tabulate haplotypes present and in what combinations.
//
tabulate_haplotypes(catalog, pmap);
@@ -272,64 +493,61 @@ int main (int argc, char* argv[]) {
log_fh << "# Distribution of population loci.\n";
log_haplotype_cnts(catalog, log_fh);
- apply_locus_constraints(catalog, pmap, pop_indexes, log_fh);
+ apply_locus_constraints(catalog, pmap, log_fh);
log_fh << "# Distribution of population loci after applying locus constraints.\n";
log_haplotype_cnts(catalog, log_fh);
- cerr << "Loading model outputs for " << sample_ids.size() << " samples, " << catalog.size() << " loci.\n";
- map<int, CSLocus *>::iterator it;
- map<int, ModRes *>::iterator mit;
- Datum *d;
- CSLocus *loc;
+ if (input_mode == InputMode::stacks) {
+ //
+ // Load the output from the SNP calling model (hOm/hEt/Unk) for
+ // each individual at each locus.
+ //
- //
- // Load the output from the SNP calling model for each individual at each locus. This
- // model output string looks like this:
- // OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOEOOOOOOEOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUOOOOUOOOOOO
- // and records model calls for each nucleotide: O (hOmozygous), E (hEterozygous), U (Unknown)
- //
- for (uint i = 0; i < sample_ids.size(); i++) {
- map<int, ModRes *> modres;
- load_model_results(in_path + samples[sample_ids[i]], modres);
+ cerr << "Loading model outputs for " << mpopi.samples().size() << " samples, " << catalog.size() << " loci.\n";
- if (modres.size() == 0) {
- cerr << " Warning: unable to find any model results in file '" << samples[sample_ids[i]] << "', excluding this sample from population analysis.\n";
- continue;
- }
+ map<int, CSLocus *>::iterator it;
+ map<int, ModRes *>::iterator mit;
+ Datum *d;
+ CSLocus *loc;
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->datum(loc->id, sample_ids[i]);
-
- if (d != NULL) {
- if (modres.count(d->id) == 0) {
- cerr << "Fatal error: Unable to find model data for catalog locus " << loc->id
- << ", sample ID " << sample_ids[i] << ", sample locus " << d->id
- << "; likely IDs were mismatched when running pipeline.\n";
- exit(0);
+ for (uint i = 0; i < mpopi.samples().size(); i++) {
+ map<int, ModRes *> modres;
+ load_model_results(in_path + mpopi.samples()[i].name, modres);
+
+ if (modres.size() == 0) {
+ cerr << " Warning: unable to find any model results in file '" << mpopi.samples()[i].name << ".models.tsv(.gz)', excluding this sample from population analysis.\n";
+ continue;
+ }
+
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ loc = it->second;
+ d = pmap->datum(loc->id, mpopi.samples()[i].id);
+
+ if (d != NULL) {
+ if (modres.count(d->id) == 0) {
+ cerr << "Fatal error: Unable to find model data for catalog locus " << loc->id
+ << ", sample ID " << mpopi.samples()[i].id << ", sample locus " << d->id
+ << "; likely IDs were mismatched when running pipeline.\n";
+ exit(0);
+ }
+ d->add_model(modres[d->id]->model);
}
- d->add_model(modres[d->id]->model);
}
+ for (mit = modres.begin(); mit != modres.end(); mit++)
+ delete mit->second;
+ modres.clear();
}
-
- for (mit = modres.begin(); mit != modres.end(); mit++)
- delete mit->second;
- modres.clear();
}
- uint pop_id, start_index, end_index;
- map<int, pair<int, int> >::iterator pit;
-
- PopSum<CSLocus> *psum = new PopSum<CSLocus>(pmap->loci_cnt(), pop_indexes.size());
- psum->initialize(pmap);
+ //
+ // Create the PopSum object and compute the summary statistics.
+ //
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start_index = pit->second.first;
- end_index = pit->second.second;
- pop_id = pit->first;
- cerr << "Generating nucleotide-level summary statistics for population '" << pop_key[pop_id] << "'\n";
- psum->add_population(catalog, pmap, pop_id, start_index, end_index, verbose, log_fh);
+ PopSum<CSLocus> *psum = new PopSum<CSLocus>(*pmap, mpopi);
+ for (size_t i=0; i<mpopi.pops().size(); ++i) {
+ cerr << "Generating nucleotide-level summary statistics for population '" << mpopi.pops()[i].name << "'\n";
+ psum->add_population(catalog, pmap, i, verbose, log_fh);
}
cerr << "Tallying loci across populations...";
@@ -342,12 +560,9 @@ int main (int argc, char* argv[]) {
// frequency threshold (-a). In these cases we will remove the SNP, but keep the locus.
//
blacklist.clear();
- int pruned_snps = prune_polymorphic_sites(catalog, pmap, psum, pop_indexes, whitelist, blacklist, log_fh);
- cerr << "Pruned " << pruned_snps << " variant sites due to filter constraints.\n";
+ int pruned_snps = prune_polymorphic_sites(catalog, pmap, psum, whitelist, blacklist, log_fh);
+ cerr << "Pruned " << pruned_snps << " variant sites due to filter constraints (more with --verbose).\n";
- if (!verbose)
- cerr << " (enable the --verbose flag to record the reason why each site was filtered in the batch_X.populations.log file.)\n";
-
//
// Create an artificial whitelist if the user requested only the first or a random SNP per locus.
//
@@ -376,105 +591,107 @@ int main (int argc, char* argv[]) {
//
// Regenerate summary statistics after pruning SNPs and merging loci.
//
+
+ if (debug_flags.count("VCFCOMP"))
+ vcfcomp_simplify_pmap(catalog, pmap);
+
delete psum;
- psum = new PopSum<CSLocus>(pmap->loci_cnt(), pop_indexes.size());
- psum->initialize(pmap);
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start_index = pit->second.first;
- end_index = pit->second.second;
- pop_id = pit->first;
- cerr << "Regenerating nucleotide-level summary statistics for population '" << pop_key[pop_id] << "'\n";
- psum->add_population(catalog, pmap, pop_id, start_index, end_index, verbose, log_fh);
+ psum = new PopSum<CSLocus>(*pmap, mpopi);
+ for (size_t i=0; i<mpopi.pops().size(); ++i) {
+ cerr << "Regenerating nucleotide-level summary statistics for population '" << mpopi.pops()[i].name << "'\n";
+ psum->add_population(catalog, pmap, i, verbose, log_fh);
}
cerr << "Re-tallying loci across populations...";
psum->tally(catalog);
cerr << "done.\n";
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = pit->first;
- if (kernel_smoothed && loci_ordered) {
- cerr << " Generating kernel-smoothed population statistics...\n";
- kernel_smoothed_popstats(catalog, pmap, psum, pop_id, log_fh);
+ if (kernel_smoothed) {
+ if (loci_ordered) {
+ for (size_t i=0; i<mpopi.pops().size(); ++i) {
+ cerr << " Generating kernel-smoothed population statistics for population '" << mpopi.pops()[i].name << "'...\n";
+ kernel_smoothed_popstats(catalog, pmap, psum, i, log_fh);
+ }
+ } else {
+ cerr << "Notice: Smoothing was requested (-k), but will not be performed as the loci are not ordered.\n";
}
}
- calculate_haplotype_stats(files, pop_indexes, catalog, pmap, psum);
+ calculate_haplotype_stats(catalog, pmap, psum);
if (calc_fstats) {
- calculate_haplotype_divergence(files, pop_indexes, grp_members, catalog, pmap, psum);
-
- calculate_haplotype_divergence_pairwise(files, pop_indexes, grp_members, catalog, pmap, psum);
+ calculate_haplotype_divergence(catalog, pmap, psum);
+ calculate_haplotype_divergence_pairwise(catalog, pmap, psum);
}
//
// Calculate and output the locus-level summary statistics.
//
- calculate_summary_stats(files, pop_indexes, catalog, pmap, psum);
+ calculate_summary_stats(catalog, pmap, psum);
//
// Output the observed haplotypes.
//
- write_generic(catalog, pmap, samples, false);
+ write_generic(catalog, pmap, false);
//
// Output data in requested formats
//
if (fasta_out)
- write_fasta(catalog, pmap, samples, sample_ids);
+ write_fasta(catalog, pmap);
if (fasta_strict_out)
- write_strict_fasta(catalog, pmap, samples, sample_ids);
+ write_strict_fasta(catalog, pmap);
if (genepop_out && ordered_export)
- write_genepop_ordered(catalog, pmap, psum, pop_indexes, samples, log_fh);
+ write_genepop_ordered(catalog, pmap, psum, log_fh);
else if (genepop_out)
- write_genepop(catalog, pmap, psum, pop_indexes, samples);
+ write_genepop(catalog, pmap, psum);
if (structure_out && ordered_export)
- write_structure_ordered(catalog, pmap, psum, pop_indexes, samples, log_fh);
+ write_structure_ordered(catalog, pmap, psum, log_fh);
else if (structure_out)
- write_structure(catalog, pmap, psum, pop_indexes, samples);
+ write_structure(catalog, pmap, psum);
if (fastphase_out)
- write_fastphase(catalog, pmap, psum, pop_indexes, samples);
+ write_fastphase(catalog, pmap, psum);
if (phase_out)
- write_phase(catalog, pmap, psum, pop_indexes, samples);
+ write_phase(catalog, pmap, psum);
if (beagle_out)
- write_beagle(catalog, pmap, psum, pop_indexes, samples);
+ write_beagle(catalog, pmap, psum);
if (beagle_phased_out)
- write_beagle_phased(catalog, pmap, psum, pop_indexes, samples);
+ write_beagle_phased(catalog, pmap, psum);
if (plink_out)
- write_plink(catalog, pmap, psum, pop_indexes, samples);
+ write_plink(catalog, pmap, psum);
if (hzar_out)
- write_hzar(catalog, pmap, psum, pop_indexes, samples);
+ write_hzar(catalog, pmap, psum);
if (treemix_out)
- write_treemix(catalog, pmap, psum, pop_indexes, samples);
+ write_treemix(catalog, pmap, psum);
if (phylip_out || phylip_var)
- write_phylip(catalog, pmap, psum, pop_indexes, samples);
+ write_phylip(catalog, pmap, psum);
if (phylip_var_all)
- write_fullseq_phylip(catalog, pmap, psum, pop_indexes, samples);
+ write_fullseq_phylip(catalog, pmap, psum);
if (vcf_haplo_out)
- write_vcf_haplotypes(catalog, pmap, psum, samples, sample_ids);
+ write_vcf_haplotypes(catalog, pmap, psum);
if (vcf_out && ordered_export)
- write_vcf_ordered(catalog, pmap, psum, samples, sample_ids, merge_map, log_fh);
+ write_vcf_ordered(catalog, pmap, psum, merge_map, log_fh);
else if (vcf_out)
- write_vcf(catalog, pmap, psum, samples, sample_ids, merge_map);
+ write_vcf(catalog, pmap, psum, merge_map);
//
// Calculate and write Fst.
//
if (calc_fstats)
- write_fst_stats(files, pop_indexes, catalog, pmap, psum, log_fh);
+ write_fst_stats(catalog, pmap, psum, log_fh);
//
// Output nucleotide-level genotype calls for each individual.
@@ -484,16 +701,105 @@ int main (int argc, char* argv[]) {
log_fh.close();
+ cerr << "Populations is done.\n";
return 0;
}
+void vcfcomp_simplify_pmap (map<int, CSLocus*>& catalog, PopMap<CSLocus>* pmap) {
+ // Debug-only pass: strips Datums and loci from pmap/catalog that fail the checks below, then prunes both in place.
+ cerr << "DEBUG Deleting information from the pmap & catalog so that they resemble what can be retrieved from a VCF.\n";
+ // n.b. In this configuration we only have one SNP per locus so we don't have
+ // to worry about what U's imply regarding haplotypes.
+ size_t n_deleted = 0; // number of Datum objects deleted by the first pass
+ size_t n_loci = pmap->loci_cnt();
+ size_t n_samples = pmap->sample_cnt();
+ for (size_t l=0; l<n_loci; ++l) {
+ CSLocus* loc = catalog.at(pmap->rev_locus_index(l)); // map::at throws std::out_of_range if the ID is not in the catalog
+ if (loc->snps.empty()) // no SNP at this locus: nothing to strip
+ continue;
+ if (loc->snps.size() > 1) { // more than one SNP per locus is not supported by this pass
+ cerr << "Error: This requires --write_single_snp.\n";
+ throw exception();
+ }
+ size_t col = loc->snps.at(0)->col; // column of the locus's only SNP
+ Datum** datums = pmap->locus(loc->id); // indexed by sample, 0 .. n_samples-1
+ for (size_t s=0; s<n_samples; ++s) {
+ if (datums[s] == NULL)
+ continue;
+ if (size_t(datums[s]->len) <= col // datum too short to reach the SNP column
+ || datums[s]->model[col] == 'U' // model call at the SNP column is 'U'
+ || datums[s]->obshap.empty() // no observed haplotype
+ || strcmp(datums[s]->obshap[0], "N") == 0 // first observed haplotype is exactly "N"
+ ) {
+ delete datums[s];
+ datums[s] = NULL; // slot is cleared so later passes see this sample as missing
+ --loc->cnt; // NOTE(review): cnt/hcnt look like per-locus tallies tied to populated Datums - confirm
+ --loc->hcnt;
+ ++n_deleted;
+ }
+ }
+ }
+ cerr << "? Deleted " << n_deleted << " 'Datums'.\n";
+
+ set<int> myblacklist; // IDs of loci to remove; filled by the three passes below
+ // All NULL: blacklist loci that have no non-NULL Datum in any sample.
+ for (auto& l : catalog) {
+ Datum** data = pmap->locus(l.second->id);
+ size_t non_null = 0;
+ for (size_t i=0; i < size_t(pmap->sample_cnt()); ++i)
+ if (data[i]!=NULL)
+ ++non_null;
+ if (non_null == 0)
+ myblacklist.insert(l.second->id);
+ }
+ // Not two alleles: blacklist loci failing the snps / rank_2 / rank_3 / alleles.size() checks, or the observed-base count below.
+ for (auto& l : catalog) {
+ if (l.second->snps.empty()
+ || l.second->snps[0]->rank_2 == 0
+ || l.second->snps[0]->rank_3 != 0
+ || l.second->alleles.size() != 2) {
+ myblacklist.insert(l.second->id);
+ } else {
+ // Check the actual number of alleles: collect the first character of every observed haplotype across samples.
+ Datum** data = pmap->locus(l.second->id);
+ set<char> seen_alleles;
+ for (size_t i=0; i<size_t(pmap->sample_cnt()); ++i) {
+ Datum* d = data[i];
+ if (d != NULL)
+ for(char* hapl : d->obshap)
+ seen_alleles.insert(hapl[0]);
+ }
+ if ((int((seen_alleles.count('A') || seen_alleles.count('a'))) // each OR term is 0 or 1, case-insensitive per base
+ + (seen_alleles.count('C') || seen_alleles.count('c'))
+ + (seen_alleles.count('T') || seen_alleles.count('t'))
+ + (seen_alleles.count('G') || seen_alleles.count('g')))
+ != 2) // the sum is the number of distinct A/C/G/T bases seen; other characters are ignored
+ myblacklist.insert(l.second->id);
+ }
+ }
+ // Same SNP in different loci: within each entry of ordered_loci, blacklist every locus sharing a position with another.
+ for (auto& chr : pmap->ordered_loci) {
+ map<size_t,vector<size_t> > seen_bp0; // (bp, [loc_id's])
+ for (CSLocus* loc : chr.second)
+ if (not loc->snps.empty())
+ seen_bp0[loc->sort_bp(loc->snps[0]->col)].push_back(loc->id); // key: sort_bp() of the first SNP's column
+ for (auto& bp : seen_bp0)
+ if (bp.second.size() > 1) // position claimed by more than one locus
+ for (size_t loc_id : bp.second)
+ myblacklist.insert(loc_id); // size_t ID is implicitly converted to the set's int key
+ }
+ set<int> empty; // NOTE(review): passed in the whitelist position; presumably means "no whitelist" - confirm against reduce_catalog
+ reduce_catalog(catalog, empty, myblacklist);
+ pmap->prune(myblacklist); // NOTE(review): presumably removes the same blacklisted loci from the PopMap - confirm
+ cerr << "? Now working on " << catalog.size() << " loci (deleted " << myblacklist.size() << " loci).\n";
+}
+
int
apply_locus_constraints(map<int, CSLocus *> &catalog,
PopMap<CSLocus> *pmap,
- map<int, pair<int, int> > &pop_indexes,
ofstream &log_fh)
{
- uint pop_id, start_index, end_index;
+ uint pop_sthg;
CSLocus *loc;
Datum **d;
@@ -504,9 +810,8 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
<< "# Action\tLocus ID\tChr\tBP\tColumn\tReason\n";
map<int, CSLocus *>::iterator it;
- map<int, pair<int, int> >::iterator pit;
- uint pop_cnt = pop_indexes.size();
+ uint pop_cnt = mpopi.pops().size();
int *pop_order = new int [pop_cnt];
// Which population each sample belongs to.
@@ -518,18 +823,17 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
// The total number of samples in each population.
int *pop_tot = new int [pop_cnt];
- pop_id = 0;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start_index = pit->second.first;
- end_index = pit->second.second;
- pop_tot[pop_id] = 0;
+ pop_sthg = 0;
+ for (size_t i_pop=0; i_pop<mpopi.pops().size(); ++i_pop) {
+ const Pop& pop = mpopi.pops()[i_pop];
+ pop_tot[pop_sthg] = 0;
- for (uint i = start_index; i <= end_index; i++) {
- samples[i] = pop_id;
- pop_tot[pop_id]++;
+ for (uint i = pop.first_sample; i <= pop.last_sample; i++) {
+ samples[i] = pop_sthg;
+ pop_tot[pop_sthg]++;
}
- pop_order[pop_id] = pit->first;
- pop_id++;
+ pop_order[pop_sthg] = i_pop;
+ pop_sthg++;
}
for (uint i = 0; i < pop_cnt; i++)
@@ -585,14 +889,13 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
// the members of that population.
//
for (uint i = 0; i < pop_cnt; i++) {
- pct = (double) pop_cnts[i] / (double) pop_tot[i];
+ const Pop& pop = mpopi.pops()[pop_order[i]];
+ pct = (double) pop_cnts[i] / (double) pop_tot[i];
+
if (pop_cnts[i] > 0 && pct < sample_limit) {
//cerr << "Removing population " << pop_order[i] << " at locus: " << loc->id << "; below sample limit: " << pct << "\n";
- start_index = pop_indexes[pop_order[i]].first;
- end_index = pop_indexes[pop_order[i]].second;
-
- for (uint j = start_index; j <= end_index; j++) {
+ for (uint j = pop.first_sample; j <= pop.last_sample; j++) {
if (d[j] != NULL) {
delete d[j];
d[j] = NULL;
@@ -620,7 +923,7 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
log_fh << "removed_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << loc->sort_bp() << "\t"
+ << loc->sort_bp() +1 << "\t"
<< 0 << "\tfailed_population_limit\n";
}
@@ -658,7 +961,6 @@ int
prune_polymorphic_sites(map<int, CSLocus *> &catalog,
PopMap<CSLocus> *pmap,
PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
map<int, set<int> > &whitelist, set<int> &blacklist,
ofstream &log_fh)
{
@@ -670,8 +972,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
Datum **d;
bool sample_prune, maf_prune, het_prune, inc_prune;
int size, pruned = 0;
- uint pop_id, start_index, end_index;
-
+
if (verbose)
log_fh << "\n#\n# List of pruned nucleotide sites\n#\n"
<< "# Action\tLocus ID\tChr\tBP\tColumn\tReason\n";
@@ -721,32 +1022,29 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
inc_prune = false;
pop_prune_list.clear();
- for (int j = 0; j < psum->pop_cnt(); j++) {
- pop_id = psum->rev_pop_index(j);
-
- if (s[j]->nucs[loc->snps[i]->col].incompatible_site)
+ for (size_t p=0; p<mpopi.pops().size(); ++p) {
+ if (s[p]->nucs[loc->snps[i]->col].incompatible_site)
inc_prune = true;
- else if (s[j]->nucs[loc->snps[i]->col].num_indv == 0 ||
- (double) s[j]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(pop_id) < sample_limit)
- pop_prune_list.push_back(pop_id);
+ else if (s[p]->nucs[loc->snps[i]->col].num_indv == 0 ||
+ (double) s[p]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(p) < sample_limit)
+ pop_prune_list.push_back(p);
}
//
// Check how many populations have to be pruned out due to sample limit. If less than
// population limit, prune them; if more than population limit, mark locus for deletion.
//
- if ((psum->pop_cnt() - pop_prune_list.size()) < (uint) population_limit) {
+ if ((mpopi.pops().size() - pop_prune_list.size()) < (uint) population_limit) {
sample_prune = true;
} else {
- for (uint j = 0; j < pop_prune_list.size(); j++) {
- if (s[psum->pop_index(pop_prune_list[j])]->nucs[loc->snps[i]->col].num_indv == 0) continue;
-
- start_index = pop_indexes[pop_prune_list[j]].first;
- end_index = pop_indexes[pop_prune_list[j]].second;
- d = pmap->locus(loc->id);
+ for (size_t p : pop_prune_list) {
+ if (s[p]->nucs[loc->snps[i]->col].num_indv == 0)
+ continue;
- for (uint k = start_index; k <= end_index; k++) {
+ d = pmap->locus(loc->id);
+ const Pop& pop = mpopi.pops()[p];
+ for (uint k = pop.first_sample; k <= pop.last_sample; k++) {
if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
continue;
if (d[k]->model != NULL) {
@@ -777,7 +1075,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
log_fh << "pruned_polymorphic_site\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << loc->sort_bp(loc->snps[i]->col) << "\t"
+ << loc->sort_bp(loc->snps[i]->col) +1 << "\t"
<< loc->snps[i]->col << "\t";
if (inc_prune)
log_fh << "incompatible_site\n";
@@ -801,7 +1099,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
log_fh << "removed_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << loc->sort_bp() << "\t"
+ << loc->sort_bp() +1 << "\t"
<< 0 << "\tno_snps_remaining\n";
blacklist.insert(loc->id);
}
@@ -840,14 +1138,12 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
inc_prune = false;
pop_prune_list.clear();
- for (int j = 0; j < psum->pop_cnt(); j++) {
- pop_id = psum->rev_pop_index(j);
-
- if (s[j]->nucs[loc->snps[i]->col].incompatible_site)
+ for (size_t p = 0; p < mpopi.pops().size(); p++) {
+ if (s[p]->nucs[loc->snps[i]->col].incompatible_site)
inc_prune = true;
- else if (s[j]->nucs[loc->snps[i]->col].num_indv == 0 ||
- (double) s[j]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(pop_id) < sample_limit)
- pop_prune_list.push_back(pop_id);
+ else if (s[p]->nucs[loc->snps[i]->col].num_indv == 0 ||
+ (double) s[p]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(p) < sample_limit)
+ pop_prune_list.push_back(p);
}
//
@@ -857,14 +1153,13 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
if ((psum->pop_cnt() - pop_prune_list.size()) < (uint) population_limit) {
sample_prune = true;
} else {
- for (uint j = 0; j < pop_prune_list.size(); j++) {
- if (s[psum->pop_index(pop_prune_list[j])]->nucs[loc->snps[i]->col].num_indv == 0) continue;
-
- start_index = pop_indexes[pop_prune_list[j]].first;
- end_index = pop_indexes[pop_prune_list[j]].second;
- d = pmap->locus(loc->id);
-
- for (uint k = start_index; k <= end_index; k++) {
+ for (size_t p : pop_prune_list) {
+ if (s[p]->nucs[loc->snps[i]->col].num_indv == 0)
+ continue;
+
+ d = pmap->locus(loc->id);
+ const Pop& pop = mpopi.pops()[p];
+ for (uint k = pop.first_sample; k <= pop.last_sample; k++) {
if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
continue;
if (d[k]->model != NULL) {
@@ -895,7 +1190,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
log_fh << "pruned_polymorphic_site\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << loc->sort_bp(loc->snps[i]->col) << "\t"
+ << loc->sort_bp(loc->snps[i]->col) +1 << "\t"
<< loc->snps[i]->col << "\t";
if (inc_prune)
log_fh << "incompatible_site\n";
@@ -919,7 +1214,7 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
log_fh << "removed_locus\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
- << loc->sort_bp() << "\t"
+ << loc->sort_bp() +1 << "\t"
<< 0 << "\tno_snps_remaining\n";
blacklist.insert(loc->id);
}
@@ -1104,9 +1399,9 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
// +Must occur on opposite strands
// +Must overlap according to the length of the cutsite.
//
- if (((cur->loc.strand == minus && next->loc.strand == plus) &&
+ if (((cur->loc.strand == strand_minus && next->loc.strand == strand_plus) &&
((int) (cur->loc.bp - next->loc.bp + 1) == renz_olap[enz])) ||
- ((cur->loc.strand == plus && next->loc.strand == minus) &&
+ ((cur->loc.strand == strand_plus && next->loc.strand == strand_minus) &&
((int) (next->loc.bp - cur->loc.bp + 1) == renz_olap[enz]))) {
overlap++;
@@ -1476,7 +1771,7 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
//
uint bp = sink->sort_bp();
sink->loc.bp = bp;
- sink->loc.strand = plus;
+ sink->loc.strand = strand_plus;
//
// 5. Adjust the length of the sequence.
@@ -1505,7 +1800,7 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
sink->alleles[*it] = 0;
// cerr << "CSLocus " << sink->id << ":\n"
- // << "Length: " << sink->len << "; Chr: " << sink->loc.chr << "; BP: " << sink->sort_bp() << "; strand: " << (sink->loc.strand == plus ? "+" : "-") << "\n"
+ // << "Length: " << sink->len << "; Chr: " << sink->loc.chr << "; BP: " << sink->sort_bp() << "; strand: " << (sink->loc.strand == strand_plus ? "+" : "-") << "\n"
// << " SNPs:\n";
// for (uint j = 0; j < sink->snps.size(); j++)
// cerr << " Col: " << sink->snps[j]->col
@@ -1727,69 +2022,6 @@ merge_datums(int sample_cnt,
return 1;
}
-int
-datum_adjust_snp_positions(map<int, pair<merget, int> > &merge_map,
- CSLocus *loc, Datum *datum,
- map<int, SNPRes *> &snpres)
-{
- //
- // We will start with the 'sink' locus, which was originally on the negative strand:
- // 1. If the locus was shorter than the catalog locus, pad the difference.
- // 2. Convert to positive strand: Reverse the order, complement the alleles,
- // alter the internal column position.
- //
- SNP *snp;
- SNPRes *snpr = snpres[datum->id];
- int index = 0;
- int stop_pos = renz_olap[enz] - 1;
-
- //
- // We know the catalog was padded since we already padded hte model call string
- // if it was necessary when originally merging.
- //
- while (datum->model[index] == 'N') {
- snp = new SNP;
- snp->col = index;
- snp->lratio = 0.0;
- snp->rank_1 = 'N';
- snp->type = snp_type_unk;
- datum->snps.push_back(snp);
- index++;
- }
-
- for (int j = snpr->snps.size() - 1; j > stop_pos; j--) {
- snp = new SNP;
- snp->col = index;
- snp->lratio = snpr->snps[j]->lratio;
- snp->rank_1 = reverse(snpr->snps[j]->rank_1);
- snp->rank_2 = reverse(snpr->snps[j]->rank_2);
- snp->rank_3 = reverse(snpr->snps[j]->rank_3);
- snp->rank_4 = reverse(snpr->snps[j]->rank_4);
- datum->snps.push_back(snp);
- index++;
- }
-
- //
- // Now we fetch the former locus, the 'src', which was originally on the positive strand.
- // All we have to do is adjust the column position of each SNP.
- //
- snpr = snpres[datum->merge_partner];
-
- for (uint j = 0; j < snpres[datum->id]->snps.size(); j++) {
- snp = new SNP;
- snp->col = index;
- snp->lratio = snpr->snps[j]->lratio;
- snp->rank_1 = snpr->snps[j]->rank_1;
- snp->rank_2 = snpr->snps[j]->rank_2;
- snp->rank_3 = snpr->snps[j]->rank_3;
- snp->rank_4 = snpr->snps[j]->rank_4;
- datum->snps.push_back(snp);
- index++;
- }
-
- return 0;
-}
-
int
create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
{
@@ -1892,51 +2124,8 @@ int call_population_genotypes(CSLocus *locus,
return 0;
}
-int tally_haplotype_freq(CSLocus *locus, PopMap<CSLocus> *pmap,
- int &total, double &max, string &freq_str) {
-
- map<string, double> freq;
- Datum **d = pmap->locus(locus->id);
-
- total = 0;
- max = 0;
-
- //cerr << "Examining marker: " << locus->id << "\n";
-
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
-
- //cerr << " Sample: " << i << "; Haplotype: " << d[i]->obshap[0] << "; Genotype: " << d[i]->gtype << "\n";
- if (d[i]->gtype[0] != '-') {
- freq[d[i]->gtype]++;
- total++;
- }
- }
-
- if (total == 0)
- return 0;
-
- double frac;
- stringstream s;
- char f[id_len];
- map<string, double>::iterator it;
- for (it = freq.begin(); it != freq.end(); it++) {
- frac = (double) it->second / (double) total * 100;
- if (frac > max) max = frac;
- sprintf(f, "(%0.1f%%);", frac);
- s << it->first << ":" << it->second << f;
- }
-
- freq_str = s.str();
-
- return 0;
-}
-
int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".genomic.tsv";
-
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + ".genomic.tsv";
ofstream fh(file.c_str(), ofstream::out);
@@ -2003,7 +2192,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
uint k = 0;
for (uint n = start; n < end; n++) {
- fh << loc->id << "\t" << loc->loc.chr << "\t" << loc->sort_bp(n);
+ fh << loc->id << "\t" << loc->loc.chr << "\t" << loc->sort_bp(n) +1;
if (snp_locs.count(n) == 0) {
for (int j = 0; j < pmap->sample_cnt(); j++) {
@@ -2045,8 +2234,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
}
int
-calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &pop_indexes,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+calculate_haplotype_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
CSLocus *loc;
@@ -2067,9 +2255,7 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
//
// Open output file and print header.
//
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".hapstats" << ".tsv";
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + ".hapstats.tsv";
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
@@ -2079,19 +2265,15 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
fh.precision(fieldw);
fh.setf(std::ios::fixed);
- map<int, pair<int, int> >::iterator pit;
- int start, end, pop_id;
-
//
// Write the population members.
//
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- fh << "# " << pop_key[pit->first] << "\t";
- for (int i = start; i <= end; i++) {
- fh << files[i].second;
- if (i < end) fh << ",";
+ for (auto& pop : mpopi.pops()) {
+ fh << "# " << pop.name << "\t";
+ for (size_t i = pop.first_sample; i <= pop.last_sample; i++) {
+ fh << mpopi.samples()[i].name;
+ if (i < pop.last_sample)
+ fh << ",";
}
fh << "\n";
}
@@ -2114,12 +2296,9 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
//
// Iterate over the members of each population.
//
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- pop_id = pit->first;
+ for (auto& pop : mpopi.pops()) {
- cerr << "Generating haplotype-level summary statistics for population '" << pop_key[pop_id] << "'\n";
+ cerr << "Generating haplotype-level summary statistics for population '" << pop.name << "'\n";
map<string, vector<LocStat *> > genome_locstats;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
@@ -2140,7 +2319,7 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
// cerr << "Looking at locus " << loc->id << "\n";
- l = haplotype_diversity(start, end, d);
+ l = haplotype_diversity(pop.first_sample, pop.last_sample, d);
if (l != NULL) {
l->loc_id = loc->id;
@@ -2175,7 +2354,7 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
<< l->loc_id << "\t"
<< it->first << "\t"
<< l->bp + 1 << "\t"
- << pop_key[pop_id] << "\t"
+ << pop.name << "\t"
<< (int) l->alleles << "\t"
<< l->hap_cnt << "\t"
<< l->stat[0] << "\t"
@@ -2303,10 +2482,7 @@ nuc_substitution_identity_max(map<string, int> &hap_index, double **hdists)
}
int
-calculate_haplotype_divergence(vector<pair<int, string> > &files,
- map<int, pair<int, int> > &pop_indexes,
- map<int, vector<int> > &master_grp_members,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+calculate_haplotype_divergence(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
@@ -2316,22 +2492,11 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
cerr << "Calculating haplotype F statistics across all populations/groups...\n";
//
- // Create a list of all the groups we have.
- //
- map<int, vector<int> >::iterator git;
- map<int, int> pop_grp_key;
- for (git = master_grp_members.begin(); git != master_grp_members.end(); git++)
- for (uint i = 0; i < git->second.size(); i++)
- pop_grp_key[git->second[i]] = git->first;
-
- //
// Create a list of all the populations we have.
//
vector<int> pop_ids;
-
- map<int, pair<int, int> >::iterator pit;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- pop_ids.push_back(pit->first);
+ for (size_t i=0; i<mpopi.pops().size(); ++i)
+ pop_ids.push_back(i);
//
// Instantiate the kernel smoothing object and associated ordering object if requested.
@@ -2379,16 +2544,16 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
// If this locus only appears in one population or there is only a single haplotype,
// do not calculate haplotype F stats.
//
- if (fixed_locus(pop_indexes, d, pop_ids))
+ if (fixed_locus(d, pop_ids))
continue;
cnt++;
// cerr << "Processing locus " << loc->id << "\n";
- h = haplotype_amova(pop_grp_key, pop_indexes, d, s, pop_ids);
+ h = haplotype_amova(d, s, pop_ids);
if (h != NULL) {
- h->stat[4] = haplotype_d_est(pop_indexes, d, s, pop_ids);
+ h->stat[4] = haplotype_d_est(d, s, pop_ids);
h->loc_id = loc->id;
h->bp = loc->sort_bp();
@@ -2429,10 +2594,7 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
cerr << "Writing haplotype F statistics... ";
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".phistats" << ".tsv";
-
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + ".phistats.tsv";
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
@@ -2445,14 +2607,12 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
//
// Write the population members.
//
- int start, end;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- fh << "# Population " << pop_key[pit->first] << "\t";
- for (int k = start; k <= end; k++) {
- fh << files[k].second;
- if (k < end) fh << ",";
+ for (auto& pop : mpopi.pops()) {
+ fh << "# Population " << pop.name << "\t";
+ for (size_t k = pop.first_sample; k <= pop.last_sample; k++) {
+ fh << mpopi.samples()[k].name;
+ if (k < pop.last_sample)
+ fh << ",";
}
fh << "\n";
}
@@ -2460,12 +2620,12 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
//
// Write the group members.
//
- for (git = grp_members.begin(); git != grp_members.end(); git++) {
- end = git->second.size();
- fh << "# Group " << grp_key[git->first] << "\t";
- for (int k = 0; k < end; k++) {
- fh << pop_key[git->second[k]];
- if (k < end - 1) fh << ",";
+ for (auto& group : mpopi.groups()) {
+ fh << "# Group " << group.name << "\t";
+ for (size_t i_pop : group.pops) {
+ fh << mpopi.pops()[i_pop].name;
+ if (i_pop != group.pops.back())
+ fh << ",";
}
fh << "\n";
}
@@ -2518,7 +2678,7 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
fh << batch_id << "\t"
<< hapstats[k]->loc_id << "\t"
<< chr << "\t"
- << hapstats[k]->bp << "\t"
+ << hapstats[k]->bp +1 << "\t"
<< hapstats[k]->popcnt << "\t";
if (log_fst_comp)
fh << hapstats[k]->comp[0] << "\t"
@@ -2564,10 +2724,7 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
}
int
-calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
- map<int, pair<int, int> > &pop_indexes,
- map<int, vector<int> > &master_grp_members,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+calculate_haplotype_divergence_pairwise(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
@@ -2579,16 +2736,9 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
//
// Assign all individuals to one group for the pairwise calculations.
//
- map<int, vector<int> >::iterator git;
- map<int, int> pop_grp_key;
- for (git = master_grp_members.begin(); git != master_grp_members.end(); git++)
- for (uint i = 0; i < git->second.size(); i++)
- pop_grp_key[git->second[i]] = 1;
-
- map<int, pair<int, int> >::iterator pit;
vector<int> pop_ids;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- pop_ids.push_back(pit->first);
+ for (size_t i=0; i<mpopi.pops().size(); ++i)
+ pop_ids.push_back(i);
//
// Instantiate the kernel smoothing object if requested.
@@ -2601,19 +2751,20 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
ord = new OHaplotypes<HapStat>();
}
- for (uint i = 0; i < pop_ids.size(); i++) {
- for (uint j = i + 1; j < pop_ids.size(); j++) {
+ for (uint i = 0; i < mpopi.pops().size(); ++i) {
+ const Pop& pop_i = mpopi.pops()[i];
+ for (uint j = i + 1; j < mpopi.pops().size(); ++j) {
+ const Pop& pop_j = mpopi.pops()[j];
+ vector<int> subpop_ids;
+ subpop_ids.push_back(i);
+ subpop_ids.push_back(j);
if (bootstrap_phist)
bs = new Bootstrap<HapStat>(5);
map<string, vector<HapStat *> > genome_hapstats;
- vector<int> subpop_ids;
-
- subpop_ids.push_back(pop_ids[i]);
- subpop_ids.push_back(pop_ids[j]);
- cerr << " Processing populations '" << pop_key[pop_ids[i]] << "' and '" << pop_key[pop_ids[j]] << "'\n";
+ cerr << " Processing populations '" << pop_i.name << "' and '" << pop_j.name << "'\n";
uint cnt = 0;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
@@ -2645,16 +2796,16 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
// If this locus only appears in one population or there is only a single haplotype,
// do not calculate haplotype F stats.
//
- if (fixed_locus(pop_indexes, d, subpop_ids))
+ if (fixed_locus(d, subpop_ids))
continue;
cnt++;
// cerr << "Processing locus " << loc->id << "\n";
- h = haplotype_amova(pop_grp_key, pop_indexes, d, s, subpop_ids);
+ h = haplotype_amova(d, s, subpop_ids);
if (h != NULL) {
- h->stat[4] = haplotype_d_est(pop_indexes, d, s, subpop_ids);
+ h->stat[4] = haplotype_d_est(d, s, subpop_ids);
h->loc_id = loc->id;
h->bp = loc->sort_bp();
@@ -2690,10 +2841,7 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
cerr << "Writing haplotype F statistics... ";
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".phistats_" << pop_key[pop_ids[i]] << "-" << pop_key[pop_ids[j]] << ".tsv";
-
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + ".phistats_" + pop_i.name + "-" + pop_j.name + ".tsv";
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
@@ -2706,14 +2854,13 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
//
// Write the population members.
//
- int start, end;
- for (uint k = 0; k < subpop_ids.size(); k++) {
- start = pop_indexes[subpop_ids[k]].first;
- end = pop_indexes[subpop_ids[k]].second;
- fh << "# Population " << pop_key[subpop_ids[k]] << "\t";
- for (int n = start; n <= end; n++) {
- fh << files[n].second;
- if (n < end) fh << ",";
+ for (int k : subpop_ids) {
+ const Pop& pop_k = mpopi.pops()[k]; // This is [pop_i], then [pop_j].
+ fh << "# Population " << pop_k.name << "\t";
+ for (size_t n = pop_k.first_sample; n <= pop_k.last_sample; n++) {
+ fh << mpopi.samples()[n].name;
+ if (n < pop_k.last_sample)
+ fh << ",";
}
fh << "\n";
}
@@ -2760,10 +2907,10 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
fh << batch_id << "\t"
<< hapstats[k]->loc_id << "\t"
- << pop_key[pop_ids[i]] << "\t"
- << pop_key[pop_ids[j]] << "\t"
+ << pop_i.name << "\t"
+ << pop_j.name << "\t"
<< chr << "\t"
- << hapstats[k]->bp << "\t";
+ << hapstats[k]->bp +1 << "\t";
if (log_fst_comp)
fh << hapstats[k]->comp[0] << "\t"
<< hapstats[k]->comp[1] << "\t"
@@ -2809,19 +2956,14 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
}
bool
-fixed_locus(map<int, pair<int, int> > &pop_indexes, Datum **d, vector<int> &pop_ids)
+fixed_locus(Datum **d, vector<int> &pop_ids)
{
set<string> loc_haplotypes;
map<int, vector<string> > pop_haplotypes;
- int start, end, pop_id;
- int pop_cnt = pop_ids.size();
-
- for (int p = 0; p < pop_cnt; p++) {
- start = pop_indexes[pop_ids[p]].first;
- end = pop_indexes[pop_ids[p]].second;
- pop_id = pop_ids[p];
- for (int i = start; i <= end; i++) {
+ for (int pop_id : pop_ids) {
+ const Pop& pop = mpopi.pops()[pop_id];
+ for (size_t i = pop.first_sample; i <= pop.last_sample; i++) {
if (d[i] == NULL) continue;
if (d[i]->obshap.size() > 2) {
@@ -2846,10 +2988,8 @@ fixed_locus(map<int, pair<int, int> > &pop_indexes, Datum **d, vector<int> &pop_
uint valid_pops = 0;
- for (int p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
-
- if (pop_haplotypes[pop_id].size() > 0)
+ for (int pop_id : pop_ids) {
+ if (pop_haplotypes[pop_id].size() > 0)
valid_pops++;
}
@@ -2868,55 +3008,17 @@ fixed_locus(map<int, pair<int, int> > &pop_indexes, Datum **d, vector<int> &pop_
return false;
}
-inline bool
-uncalled_haplotype(const char *haplotype)
-{
- for (const char *p = haplotype; *p != '\0'; p++)
- if (*p == 'N' || *p == 'n')
- return true;
- return false;
-}
-
-inline double
-count_haplotypes_at_locus(int start, int end, Datum **d, map<string, double> &hap_cnts)
+LocStat *
+haplotype_diversity(int start, int end, Datum **d)
{
- double n = 0.0;
-
- for (int i = start; i <= end; i++) {
- if (d[i] == NULL) continue;
-
- if (d[i]->obshap.size() > 2) {
- continue;
-
- } else if (d[i]->obshap.size() == 1) {
- if(!uncalled_haplotype(d[i]->obshap[0])) {
- n += 2;
- hap_cnts[d[i]->obshap[0]] += 2;
- }
- } else {
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- if(!uncalled_haplotype(d[i]->obshap[0])) {
- n++;
- hap_cnts[d[i]->obshap[j]]++;
- }
- }
- }
- }
-
- return n;
-}
-
-LocStat *
-haplotype_diversity(int start, int end, Datum **d)
-{
- map<string, double>::iterator hit;
- vector<string> haplotypes;
- map<string, double> hap_freq;
- map<string, int> hap_index;
- double n = 0.0;
- double gene_diversity = 0.0;
- double hapl_diversity = 0.0;
- LocStat *lstat;
+ map<string, double>::iterator hit;
+ vector<string> haplotypes;
+ map<string, double> hap_freq;
+ map<string, int> hap_index;
+ double n = 0.0;
+ double gene_diversity = 0.0;
+ double hapl_diversity = 0.0;
+ LocStat *lstat;
//
// Tabulate the haplotypes in this population.
@@ -3006,8 +3108,7 @@ haplotype_diversity(int start, int end, Datum **d)
}
HapStat *
-haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_indexes,
- Datum **d, LocSum **s, vector<int> &pop_ids)
+haplotype_amova(Datum **d, LocSum **s, vector<int> &pop_ids)
{
map<string, int> loc_hap_index;
vector<string> loc_haplotypes;
@@ -3016,21 +3117,15 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
vector<int> grps;
map<string, int>::iterator hit, hit_2;
- map<int, pair<int, int> >::iterator pit;
- int start, end, pop_id, pop_id_1;
HapStat *h;
- int pop_cnt = pop_ids.size();
//
// Tabulate the occurences of haplotypes at this locus.
//
- for (int p = 0; p < pop_cnt; p++) {
- start = pop_indexes[pop_ids[p]].first;
- end = pop_indexes[pop_ids[p]].second;
- pop_id = pop_ids[p];
-
- for (int i = start; i <= end; i++) {
+ for (int pop_id : pop_ids) {
+ const Pop& pop = mpopi.pops()[pop_id];
+ for (size_t i = pop.first_sample; i <= pop.last_sample; i++) {
if (d[i] == NULL) continue;
if (d[i]->obshap.size() > 2) {
@@ -3060,8 +3155,7 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
// What is the total number of populations that had valid haplotypes.
//
double valid_pop_cnt = 0.0;
- for (int p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
+ for (int pop_id : pop_ids) {
if (pop_haplotypes[pop_id].size() > 0)
valid_pop_cnt++;
}
@@ -3071,12 +3165,11 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
// representative present in each group.
//
set<int> uniq_grps;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = pit->first;
-
+ for (size_t pop_id=0; pop_id<mpopi.pops().size(); ++pop_id) {
+ const Pop& pop = mpopi.pops()[pop_id];
if (pop_haplotypes.count(pop_id) > 0) {
- uniq_grps.insert(pop_grp_key[pop_id]);
- grp_members[pop_grp_key[pop_id]].push_back(pop_id);
+ uniq_grps.insert(pop.group);
+ grp_members[pop.group].push_back(pop_id);
}
}
set<int>::iterator uit;
@@ -3147,20 +3240,20 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
for (uint g = 0; g < num_grps; g++) {
for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
+ int pop_id_1 = grp_members[grps[g]][r];
tot_cnt += (double) pop_haplotypes[pop_id_1].size();
}
}
for (uint g = 0; g < num_grps; g++) {
grp_cnt = 0.0;
for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
+ int pop_id_1 = grp_members[grps[g]][r];
grp_cnt += (double) pop_haplotypes[pop_id_1].size();
}
a = 0.0;
for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
+ int pop_id_1 = grp_members[grps[g]][r];
a += (double) (pop_haplotypes[pop_id_1].size() * pop_haplotypes[pop_id_1].size()) / grp_cnt;
}
s_g += a;
@@ -3176,7 +3269,7 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
a = 0.0;
for (uint g = 0; g < num_grps; g++) {
for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
+ int pop_id_1 = grp_members[grps[g]][r];
a += ((double) (pop_haplotypes[pop_id_1].size() * pop_haplotypes[pop_id_1].size()) / tot_cnt);
}
}
@@ -3190,7 +3283,7 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
for (uint g = 0; g < num_grps; g++) {
a = 0.0;
for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
+ int pop_id_1 = grp_members[grps[g]][r];
a += pop_haplotypes[pop_id_1].size();
}
b += ((a * a) / tot_cnt);
@@ -3492,7 +3585,7 @@ amova_ssd_ag(vector<int> &grps, map<int, vector<int> > &grp_members,
}
double
-haplotype_d_est(map<int, pair<int, int> > &pop_indexes, Datum **d, LocSum **s, vector<int> &pop_ids)
+haplotype_d_est(Datum **d, LocSum **s, vector<int> &pop_ids)
{
//
// Calculate D_est, fixation index, as described by
@@ -3507,24 +3600,19 @@ haplotype_d_est(map<int, pair<int, int> > &pop_indexes, Datum **d, LocSum **s, v
map<int, double> pop_totals;
map<string, double>::iterator it;
- int start, end, pop_id;
uint pop_cnt = pop_ids.size();
//
// Tabulate the occurences of haplotypes at this locus.
//
- for (uint p = 0; p < pop_cnt; p++) {
- start = pop_indexes[pop_ids[p]].first;
- end = pop_indexes[pop_ids[p]].second;
- pop_id = pop_ids[p];
-
- for (int i = start; i <= end; i++) {
- if (d[i] == NULL) continue;
-
- if (d[i]->obshap.size() > 2) {
+ for (int pop_id : pop_ids) {
+ const Pop& pop = mpopi.pops()[pop_id];
+ for (size_t i = pop.first_sample; i <= pop.last_sample; i++) {
+ if (d[i] == NULL) {
+ continue;
+ } else if (d[i]->obshap.size() > 2) {
continue;
-
} else if (d[i]->obshap.size() == 1) {
loc_haplotypes[d[i]->obshap[0]] += 2;
pop_haplotypes[pop_id][d[i]->obshap[0]] += 2;
@@ -3547,8 +3635,7 @@ haplotype_d_est(map<int, pair<int, int> > &pop_indexes, Datum **d, LocSum **s, v
double freq_sum_sq = 0.0;
double freq_sq_sum = 0.0;
- for (uint p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
+ for (int pop_id : pop_ids) {
freq_sum_sq += (pop_haplotypes[pop_id][it->first] / pop_totals[pop_id]);
freq_sq_sum += pow((pop_haplotypes[pop_id][it->first] / pop_totals[pop_id]), 2);
}
@@ -3560,9 +3647,7 @@ haplotype_d_est(map<int, pair<int, int> > &pop_indexes, Datum **d, LocSum **s, v
double y = 0.0;
for (it = loc_haplotypes.begin(); it != loc_haplotypes.end(); it++) {
- for (uint p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
-
+ for (int pop_id : pop_ids) {
y += (pop_haplotypes[pop_id][it->first] * (pop_haplotypes[pop_id][it->first] - 1)) /
(pop_totals[pop_id] * (pop_totals[pop_id] - 1));
}
@@ -3574,8 +3659,7 @@ haplotype_d_est(map<int, pair<int, int> > &pop_indexes, Datum **d, LocSum **s, v
}
int
-calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &pop_indexes,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+calculate_summary_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
CSLocus *loc;
@@ -3755,10 +3839,7 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
fis_mean_all[j] = fis_mean_all[j] / n_all[j];
}
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".sumstats" << ".tsv";
-
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + ".sumstats.tsv";
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
@@ -3769,18 +3850,15 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
fh.setf(std::ios::fixed);
double p_freq;
- int start, end;
//
// Write the population members.
//
- map<int, pair<int, int> >::iterator pit;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- fh << "# " << pit->first << "\t";
- for (int i = start; i <= end; i++) {
- fh << files[i].second;
- if (i < end) fh << ",";
+ for (auto& pop : mpopi.pops()) {
+ fh << "# " << pop.name << "\t";
+ for (size_t i = pop.first_sample; i <= pop.last_sample; i++) {
+ fh << mpopi.samples()[i].name;
+ if (i < pop.last_sample)
+ fh << ",";
}
fh << "\n";
}
@@ -3827,14 +3905,15 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
for (int j = 0; j < pop_cnt; j++) {
- if (s[j]->nucs[i].num_indv == 0) continue;
+ if (s[j]->nucs[i].num_indv == 0)
+ continue;
fh << batch_id << "\t"
<< loc->id << "\t"
<< loc->loc.chr << "\t"
<< loc->sort_bp(i) + 1 << "\t"
<< i << "\t"
- << pop_key[psum->rev_pop_index(j)] << "\t";
+ << mpopi.pops()[j].name << "\t";
//
// Output the p and q alleles in the same order in each population.
@@ -3932,10 +4011,7 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
fh.close();
- pop_name.str("");
- pop_name << "batch_" << batch_id << ".sumstats_summary" << ".tsv";
-
- file = in_path + pop_name.str();
+ file = out_path + out_prefix + ".sumstats_summary.tsv";
fh.open(file.c_str(), ofstream::out);
@@ -3984,7 +4060,7 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
}
for (int j = 0; j < pop_cnt; j++)
- fh << pop_key[psum->rev_pop_index(j)] << "\t"
+ fh << mpopi.pops()[j].name << "\t"
<< private_cnt[j] << "\t"
<< num_indv_mean[j] << "\t"
<< num_indv_var[j] << "\t"
@@ -4044,7 +4120,7 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
<< "StdErr\n";
for (int j = 0; j < pop_cnt; j++) {
- fh << pop_key[psum->rev_pop_index(j)] << "\t"
+ fh << mpopi.pops()[j].name << "\t"
<< private_cnt[j] << "\t"
<< n_all[j] << "\t"
<< n[j] << "\t"
@@ -4122,20 +4198,16 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
}
int
-write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &pop_indexes,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum, ofstream &log_fh)
+write_fst_stats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum, ofstream &log_fh)
{
//
// We want to iterate over each pair of populations and calculate Fst at each
// nucleotide of each locus.
//
- vector<double> means;
- vector<int> pops;
- map<int, pair<int, int> >::iterator pit;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- pops.push_back(pit->first);
+ if (mpopi.pops().size() == 1)
+ return 0;
- if (pops.size() == 1) return 0;
+ vector<double> means;
//
// Instantiate the kernel smoothing object if requested.
@@ -4143,22 +4215,20 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
OPopPair<PopPair> *ord = new OPopPair<PopPair>(psum, log_fh);
KSmooth<PopPair> *ks;
Bootstrap<PopPair> *bs;
- if (kernel_smoothed && loci_ordered)
+ if (kernel_smoothed && loci_ordered) {
+ cerr << "Instantiating the kernel smoothing window, using sigma = " << sigma << " with a sliding window size of " << 6 * sigma << "\n";
ks = new KSmooth<PopPair>(2);
+ }
- for (uint i = 0; i < pops.size(); i++) {
-
- for (uint j = i + 1; j < pops.size(); j++) {
- int pop_1 = pops[i];
- int pop_2 = pops[j];
+ for (uint pop_1 = 0; pop_1 < mpopi.pops().size(); pop_1++) {
+ const Pop& pop_1p = mpopi.pops()[pop_1];
+ for (uint pop_2 = pop_1 + 1; pop_2 < mpopi.pops().size(); pop_2++) {
+ const Pop& pop_2p = mpopi.pops()[pop_2];
double sum = 0.0;
double cnt = 0.0;
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".fst_" << pop_key[pop_1] << "-" << pop_key[pop_2] << ".tsv";
-
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + ".fst_" + pop_1p.name + "-" + pop_2p.name + ".tsv";
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
cerr << "Error opening Fst output file '" << file << "'\n";
@@ -4167,7 +4237,7 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
fh.precision(fieldw);
fh.setf(std::ios::fixed);
- cerr << "Calculating Fst for populations '" << pop_key[pop_1] << "' and '" << pop_key[pop_2] << "' and writing it to file, '" << file << "'\n";
+ cerr << "Calculating Fst for populations '" << pop_1p.name << "' and '" << pop_2p.name << "' and writing it to file, '" << file << "'\n";
fh << "# Batch ID" << "\t"
<< "Locus ID" << "\t"
@@ -4323,10 +4393,10 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
fh << batch_id << "\t"
<< pairs[i]->loc_id << "\t"
- << pop_key[pop_1] << "\t"
- << pop_key[pop_2] << "\t"
+ << pop_1p.name << "\t"
+ << pop_2p.name << "\t"
<< chr << "\t"
- << pairs[i]->bp << "\t"
+ << pairs[i]->bp +1 << "\t"
<< pairs[i]->col << "\t"
<< pairs[i]->pi << "\t"
<< pairs[i]->fst << "\t"
@@ -4372,10 +4442,10 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
delete pairs[i];
}
}
- cerr << "Pop 1: " << pop_key[pop_1] << "; Pop 2: " << pop_key[pop_2] << "; mean Fst: " << (sum / cnt) << "\n";
+ cerr << "Pop 1: " << pop_1p.name << "; Pop 2: " << pop_2p.name << "; mean Fst: " << (sum / cnt) << "\n";
means.push_back(sum / cnt);
- cerr << "Pooled populations '" << pop_key[pop_1] << "' and '" << pop_key[pop_2] << "' contained: " << ord->incompatible_loci << " incompatible loci; "
+ cerr << "Pooled populations '" << pop_1p.name << "' and '" << pop_2p.name << "' contained: " << ord->incompatible_loci << " incompatible loci; "
<< ord->multiple_loci << " nucleotides covered by more than one RAD locus.\n";
fh.close();
@@ -4387,10 +4457,7 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
//
// Write out the mean Fst measure of each pair of populations.
//
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".fst_summary.tsv";
-
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + ".fst_summary.tsv";
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
@@ -4401,18 +4468,18 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
//
// Write out X-axis header.
//
- for (uint i = 0; i < pops.size(); i++)
- fh << "\t" << pop_key[pops[i]];
+ for (auto& pop : mpopi.pops())
+ fh << "\t" << pop.name;
fh << "\n";
uint n = 0;
- for (uint i = 0; i < pops.size() - 1; i++) {
- fh << pop_key[pops[i]];
+ for (uint i = 0; i < mpopi.pops().size() - 1; i++) {
+ fh << mpopi.pops()[i].name;
for (uint k = 0; k <= i; k++)
fh << "\t";
- for (uint j = i + 1; j < pops.size(); j++) {
+ for (uint j = i + 1; j < mpopi.pops().size(); j++) {
fh << "\t" << means[n];
n++;
}
@@ -4506,7 +4573,7 @@ kernel_smoothed_popstats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, Po
if (bootstrap_pifis) bs->add_data(sites);
}
- cerr << " Population '" << pop_key[pop_id] << "' contained " << ord->multiple_loci << " nucleotides covered by more than one RAD locus.\n";
+ cerr << " Population '" << mpopi.pops()[pop_id].name << "' contained " << ord->multiple_loci << " nucleotides covered by more than one RAD locus.\n";
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
if (bootstrap_pifis)
@@ -4875,17 +4942,9 @@ bootstrap_approximate_pval(int snp_cnt, double stat, map<int, vector<double> > &
}
int
-write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
- map<int, string> &samples, bool write_gtypes)
+write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, bool write_gtypes)
{
- stringstream pop_name;
- pop_name << "batch_" << batch_id;
- if (write_gtypes)
- pop_name << ".genotypes.tsv";
- else
- pop_name << ".haplotypes.tsv";
-
- string file = in_path + pop_name.str();
+ string file = out_path + out_prefix + (write_gtypes ? ".genotypes.tsv" : ".haplotypes.tsv");
ofstream fh(file.c_str(), ofstream::out);
@@ -4914,7 +4973,7 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
fh << "Cnt\t";
for (int i = 0; i < pmap->sample_cnt(); i++) {
- fh << samples[pmap->rev_sample_index(i)];
+ fh << mpopi.samples()[i].name;
if (i < pmap->sample_cnt() - 1)
fh << "\t";
}
@@ -4936,7 +4995,7 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
if (loc->annotation.length() > 0)
id << "\t" << loc->id << "\t" << loc->annotation;
else if (strlen(loc->loc.chr) > 0)
- id << "\t" << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp;
+ id << "\t" << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp +1;
else
id << "\t" << loc->id << "\t";
}
@@ -4954,7 +5013,7 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
if (d[i] == NULL)
fh << "-";
- else
+ else {
if (write_gtypes) {
fh << d[i]->gtype;
} else {
@@ -4964,6 +5023,7 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
obshap = obshap.substr(0, obshap.length()-1);
fh << obshap;
}
+ }
}
fh << "\n";
@@ -4974,3863 +5034,129 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
return 0;
}
-int
-write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
-{
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".markers.tsv";
- string file = in_path + pop_name.str();
-
- cerr << "Writing SQL markers file to '" << file << "'\n";
+int load_marker_list(string path, set<int> &list) {
+ char line[id_len];
+ ifstream fh(path.c_str(), ifstream::in);
- ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
- cerr << "Error opening markers SQL file '" << file << "'\n";
+ cerr << "Error opening white/black list file '" << path << "'\n";
exit(1);
}
- fh.precision(fieldw);
- fh.setf(std::ios::fixed);
-
- fh << "# SQL ID" << "\t"
- << "Batch ID" << "\t"
- << "Catalog Locus ID" << "\t"
- << "\t"
- << "Total Genotypes" << "\t"
- << "Max" << "\t"
- << "Genotype Freqs" << "\t"
- << "F" << "\t"
- << "Mean Log Likelihood" << "\t"
- << "Genotype Map" << "\t"
- << "\n";
- map<int, CSLocus *>::iterator it;
- CSLocus *loc;
- stringstream gtype_map;
+ int marker;
+ char *p, *e;
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ while (fh.good()) {
+ fh.getline(line, id_len);
- string freq = "";
- double max = 0.0;
- int total = 0;
- gtype_map.str("");
+ if (strlen(line) == 0) continue;
- if (loc->marker.length() > 0) {
- tally_haplotype_freq(loc, pmap, total, max, freq);
+ //
+ // Skip commented lines.
+ //
+ for (p = line; isspace(*p) && *p != '\0'; p++);
+ if (*p == '#') continue;
- //
- // Record the haplotype to genotype map.
- //
- map<string, string>::iterator j;
- for (j = loc->gmap.begin(); j != loc->gmap.end(); j++)
- gtype_map << j->first << ":" << j->second << ";";
- }
+ marker = (int) strtol(line, &e, 10);
- fh << 0 << "\t"
- << batch_id << "\t"
- << loc->id << "\t"
- << "\t" // Marker
- << total << "\t"
- << max << "\t"
- << freq << "\t"
- << loc->f << "\t"
- << loc->lnl << "\t"
- << gtype_map.str() << "\t"
- << "\n";
+ if (*e == '\0')
+ list.insert(marker);
}
fh.close();
- return 0;
-}
-
-int
-write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int, string> &samples, vector<int> &sample_ids)
-{
- //
- // Write a FASTA file containing each allele from each locus from
- // each sample in the population.
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".fa";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population alleles to FASTA file '" << file << "'\n";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening FASTA file '" << file << "'\n";
+ if (list.size() == 0) {
+ cerr << "Unable to load any markers from '" << path << "'\n";
exit(1);
}
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- char *seq;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- d = pmap->locus(loc->id);
- seq = new char[loc->len + 1];
- strcpy(seq, loc->con);
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL)
- continue;
-
- for (uint k = 0; k < d[j]->obshap.size(); k++) {
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
- }
-
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << k
- << " [" << samples[pmap->rev_sample_index(j)];
-
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
- }
- }
- delete [] seq;
- }
- }
-
- fh.close();
-
return 0;
}
-int
-write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int, string> &samples, vector<int> &sample_ids)
-{
- //
- // Write a FASTA file containing each allele from each locus from
- // each sample in the population.
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".strict.fa";
- string file = in_path + pop_name.str();
-
- cerr << "Writing strict population alleles to FASTA file '" << file << "'\n";
-
- ofstream fh(file.c_str(), ofstream::out);
+int load_marker_column_list(string path, map<int, set<int> > &list) {
+ char line[id_len];
+ ifstream fh(path.c_str(), ifstream::in);
if (fh.fail()) {
- cerr << "Error opening strict FASTA file '" << file << "'\n";
+ cerr << "Error opening white/black list file '" << path << "'\n";
exit(1);
}
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- char *seq;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- d = pmap->locus(loc->id);
- seq = new char[loc->len + 1];
- strcpy(seq, loc->con);
+ vector<string> parts;
+ uint marker, col;
+ char *p, *e;
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL)
- continue;
- if (d[j]->obshap.size() > 2)
- continue;
+ uint line_num = 1;
+ while (fh.good()) {
+ fh.getline(line, id_len);
- if (d[j]->obshap.size() == 1) {
+ if (strlen(line) == 0) continue;
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- seq[col] = col < loc->len ? d[j]->obshap[0][i] : loc->con[col];
- }
+ //
+ // Skip commented lines.
+ //
+ for (p = line; isspace(*p) && *p != '\0'; p++);
+ if (*p == '#') continue;
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << 0
- << " [" << samples[pmap->rev_sample_index(j)];
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
-
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << 1
- << " [" << samples[pmap->rev_sample_index(j)];
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
+ //
+ // Parse the whitelist, we expect:
+ // <marker>[<tab><snp column>]
+ //
+ parse_tsv(line, parts);
- } else {
- for (uint k = 0; k < d[j]->obshap.size(); k++) {
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
- }
+ if (parts.size() > 2) {
+ cerr << "Too many columns in whitelist " << path << "' at line " << line_num << "\n";
+ exit(1);
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << k
- << " [" << samples[pmap->rev_sample_index(j)];
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
- }
- }
+ } else if (parts.size() == 2) {
+ marker = (int) strtol(parts[0].c_str(), &e, 10);
+ if (*e != '\0') {
+ cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
+ exit(1);
+ }
+ col = (int) strtol(parts[1].c_str(), &e, 10);
+ if (*e != '\0') {
+ cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
+ exit(1);
}
+ list[marker].insert(col);
- delete [] seq;
+ } else {
+ marker = (int) strtol(parts[0].c_str(), &e, 10);
+ if (*e != '\0') {
+ cerr << "Unable to parse whitelist, '" << path << "' at line " << line << "\n";
+ exit(1);
+ }
+ list.insert(make_pair(marker, std::set<int>()));
}
+
+ line_num++;
}
fh.close();
- return 0;
-}
-
-int
-write_vcf_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, string> &samples, vector<int> &sample_ids,
- map<int, pair<merget, int> > &merge_map, ofstream &log_fh)
-{
- //
- // Write a VCF file as defined here: http://www.1000genomes.org/node/101
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".vcf";
- string file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening VCF file '" << file << "'\n";
- exit(1);
+ if (list.size() == 0) {
+ cerr << "Unable to load any markers from '" << path << "'\n";
+ help();
}
- //
- // Load SNP data so that model likelihoods can be output to VCF file.
- //
- cerr << "In preparation for VCF export, loading SNP data for " << samples.size() << " samples.\n";
-
- populate_snp_calls(catalog, pmap, samples, sample_ids, merge_map);
-
- cerr << "Writing population data to VCF file '" << file << "'\n";
-
- log_fh << "\n#\n# Generating SNP-based VCF export.\n#\n";
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%Y%m%d", timeinfo);
-
- //
- // Output the header.
- //
- fh << "##fileformat=VCFv4.0\n"
- << "##fileDate=" << date << "\n"
- << "##source=\"Stacks v" << VERSION << "\"\n"
- << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">\n"
- << "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Allele Frequency\">\n"
- << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
- << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n"
- << "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Allele Depth\">\n"
- << "##FORMAT=<ID=GL,Number=.,Type=Float,Description=\"Genotype Likelihood\">\n"
- << "#CHROM" << "\t" << "POS" << "\t" << "ID" << "\t" << "REF" << "\t" << "ALT" << "\t"
- << "QUAL" << "\t" << "FILTER" << "\t" << "INFO" << "\t" << "FORMAT";
-
- for (int i = 0; i < pmap->sample_cnt(); i++)
- fh << "\t" << samples[pmap->rev_sample_index(i)];
- fh << "\n";
-
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- int gt_1, gt_2, dp_1, dp_2;
- char p_allele, q_allele, p_str[32], q_str[32];
- uint16_t col;
- int snp_index;
-
- //
- // We need to order the SNPs taking into account overlapping loci.
- //
- OLocTally<NucTally> *ord = new OLocTally<NucTally>(psum, log_fh);
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> sites;
- ord->order(sites, it->second);
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- if (catalog.count(sites[pos]->loc_id) == 0) {
- cerr << "Unable to find locus id " << sites[pos]->loc_id << "\n";
- continue;
- }
- loc = catalog[sites[pos]->loc_id];
- col = sites[pos]->col;
-
- sprintf(p_str, "%0.3f", sites[pos]->p_freq);
- sprintf(q_str, "%0.3f", 1 - sites[pos]->p_freq);
-
- //
- // If on the negative strand, complement the alleles.
- //
- p_allele = loc->loc.strand == minus ? reverse(sites[pos]->p_allele) : sites[pos]->p_allele;
- q_allele = loc->loc.strand == minus ? reverse(sites[pos]->q_allele) : sites[pos]->q_allele;
-
- fh << loc->loc.chr << "\t"
- << loc->sort_bp(col) + 1 << "\t"
- << loc->id << "\t"
- << p_allele << "\t" // REFerence allele
- << q_allele << "\t" // ALTernate allele
- << "." << "\t" // QUAL
- << "PASS" << "\t" // FILTER
- << "NS=" << sites[pos]->num_indv << ";" // INFO
- << "AF=" << p_str << "," << q_str << "\t" // INFO
- << "GT:DP:AD:GL"; // FORMAT
-
- snp_index = loc->snp_index(col);
- if (snp_index < 0) {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << "\n";
- fh << "\n";
- continue;
- }
+ return 0;
+}
- d = pmap->locus(loc->id);
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
+bool hap_compare(pair<string, int> a, pair<string, int> b) {
+ return (a.second > b.second);
+}
- if (d[j] == NULL || col >= d[j]->len) {
- //
- // Data does not exist.
- //
- fh << "./.:0:.,.:.,.,.";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
+int parse_command_line(int argc, char* argv[]) {
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- find_datum_allele_depths(d[j], snp_index, sites[pos]->p_allele, sites[pos]->q_allele, p_allele+q_allele, dp_1, dp_2);
-
- if (p_allele == 0) {
- gt_1 = q_allele == sites[pos]->p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else if (q_allele == 0) {
- gt_1 = p_allele == sites[pos]->p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else {
- gt_1 = p_allele == sites[pos]->p_allele ? 0 : 1;
- gt_2 = q_allele == sites[pos]->p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_2 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- }
- //
- // Output the likelihood for this model call.
- //
- if (col < d[j]->snps.size()) {
- fh << ":.," << d[j]->snps[col]->lratio << ",.";
- } else {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
- fh << ":.,.,.";
- }
- }
- }
- }
- fh << "\n";
- }
- }
- fh.close();
-
- return 0;
-}
-
-int
-write_vcf(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, string> &samples, vector<int> &sample_ids,
- map<int, pair<merget, int> > &merge_map)
-{
- //
- // Write a VCF file as defined here: http://www.1000genomes.org/node/101
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".vcf";
- string file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening VCF file '" << file << "'\n";
- exit(1);
- }
-
- cerr << "In preparation for VCF export, loading SNP data for " << samples.size() << " samples.\n";
- //
- // Load SNP data so that model likelihoods can be output to VCF file.
- //
- populate_snp_calls(catalog, pmap, samples, sample_ids, merge_map);
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%Y%m%d", timeinfo);
-
- cerr << "Writing population data to VCF file '" << file << "'\n";
-
- //
- // Output the header.
- //
- fh << "##fileformat=VCFv4.0\n"
- << "##fileDate=" << date << "\n"
- << "##source=\"Stacks v" << VERSION << "\"\n"
- << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">\n"
- << "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Allele Frequency\">\n"
- << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
- << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n"
- << "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Allele Depth\">\n"
- << "##FORMAT=<ID=GL,Number=.,Type=Float,Description=\"Genotype Likelihood\">\n"
- << "#CHROM" << "\t" << "POS" << "\t" << "ID" << "\t" << "REF" << "\t" << "ALT" << "\t"
- << "QUAL" << "\t" << "FILTER" << "\t" << "INFO" << "\t" << "FORMAT";
-
- for (int i = 0; i < pmap->sample_cnt(); i++)
- fh << "\t" << samples[pmap->rev_sample_index(i)];
- fh << "\n";
-
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- LocTally *t;
- int gt_1, gt_2, dp_1, dp_2;
- double num_indv;
- char p_allele, q_allele, p_str[32], q_str[32];
- int snp_index;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-
- //
- // We need to order the SNPs so negative and positive strand SNPs are properly ordered.
- //
- vector<GenPos> ordered_loci;
- uint col;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
- }
- }
- sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
- t = psum->locus_tally(loc->id);
-
- num_indv = (double) t->nucs[col].num_indv;
-
- sprintf(p_str, "%0.3f", t->nucs[col].p_freq);
- sprintf(q_str, "%0.3f", 1 - t->nucs[col].p_freq);
-
- //
- // If on the negative strand, complement the alleles.
- //
- p_allele = loc->loc.strand == minus ? reverse(t->nucs[col].p_allele) : t->nucs[col].p_allele;
- q_allele = loc->loc.strand == minus ? reverse(t->nucs[col].q_allele) : t->nucs[col].q_allele;
-
- fh << loc->loc.chr << "\t"
- << loc->sort_bp(col) + 1 << "\t"
- << loc->id << "\t"
- << p_allele << "\t" // REFerence allele
- << q_allele << "\t" // ALTernate allele
- << "." << "\t" // QUAL
- << "PASS" << "\t" // FILTER
- << "NS=" << num_indv << ";" // INFO
- << "AF=" << p_str << "," << q_str << "\t" // INFO
- << "GT:DP:AD:GL"; // FORMAT
-
- snp_index = loc->snp_index(col);
- if (snp_index < 0) {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << "\n";
- fh << "\n";
- continue;
- }
-
- d = pmap->locus(loc->id);
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
-
- if (d[j] == NULL || col >= d[j]->len) {
- //
- // Data does not exist.
- //
- fh << "./.:0:.,.:.,.,.";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- find_datum_allele_depths(d[j], snp_index, t->nucs[col].p_allele, t->nucs[col].q_allele, p_allele+q_allele, dp_1, dp_2);
-
- if (p_allele == 0) {
- gt_1 = q_allele == t->nucs[col].p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else if (q_allele == 0) {
- gt_1 = p_allele == t->nucs[col].p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else {
- gt_1 = p_allele == t->nucs[col].p_allele ? 0 : 1;
- gt_2 = q_allele == t->nucs[col].p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_2 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- }
- //
- // Output the likelihood measure for this model call.
- //
- if (snp_index >= 0) {
- fh << ":.," << d[j]->snps[snp_index]->lratio << ",.";
- } else {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
- fh << ":.,.,.";
- }
- }
- }
- }
- fh << "\n";
- }
- }
- fh.close();
-
- return 0;
-}
-
-int
-populate_snp_calls(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
- map<int, string> &samples, vector<int> &sample_ids,
- map<int, pair<merget, int> > &merge_map)
-{
- map<int, CSLocus *>::iterator cit;
- map<int, SNPRes *>::iterator sit;
- CSLocus *loc;
- Datum *datum;
- SNPRes *snpr;
- SNP *snp;
-
- for (uint i = 0; i < sample_ids.size(); i++) {
- map<int, SNPRes *> snpres;
- load_snp_calls(in_path + samples[sample_ids[i]], snpres);
-
- for (cit = catalog.begin(); cit != catalog.end(); cit++) {
- loc = cit->second;
- datum = pmap->datum(loc->id, sample_ids[i]);
-
- if (datum != NULL && snpres.count(datum->id)) {
-
- if (merge_sites && merge_map.count(loc->id)) {
- datum_adjust_snp_positions(merge_map, loc, datum, snpres);
- } else {
- //
- // Deep copy the SNP objects.
- //
- snpr = snpres[datum->id];
- for (uint j = 0; j < snpr->snps.size(); j++) {
- snp = new SNP;
- snp->col = snpr->snps[j]->col;
- snp->lratio = snpr->snps[j]->lratio;
- snp->rank_1 = snpr->snps[j]->rank_1;
- snp->rank_2 = snpr->snps[j]->rank_2;
- snp->rank_3 = snpr->snps[j]->rank_3;
- snp->rank_4 = snpr->snps[j]->rank_4;
-
- datum->snps.push_back(snp);
- }
- }
- }
- }
-
- for (sit = snpres.begin(); sit != snpres.end(); sit++)
- delete sit->second;
- }
-
- return 0;
-}
-
-int
-find_datum_allele_depths(Datum *d, int snp_index, char p_allele, char q_allele, int allele_cnt, int &dp_1, int &dp_2)
-{
- dp_1 = 0;
- dp_2 = 0;
-
- if (allele_cnt == 1) {
-
- //
- // There is a single observed haplotype for this locus, e.g. GA.
- //
- if (d->obshap.size() == 1) {
- if (d->obshap[0][snp_index] == p_allele) {
- dp_1 = d->depth[0];
- dp_2 = 0;
- } else {
- dp_1 = 0;
- dp_2 = d->depth[0];
- }
- } else {
- //
- // This SNP position is homozygous, but the locus is heterozygous, so there is more
- // than one observed haplotype, e.g. GA / TA.
- //
- if (d->obshap[0][snp_index] == p_allele) {
- dp_1 = d->tot_depth;
- dp_2 = 0;
- } else {
- dp_1 = 0;
- dp_2 = d->tot_depth;
- }
- }
-
- } else {
- //
- // This SNP position is heterozygous.
- //
- for (uint i = 0; i < d->obshap.size(); i++) {
- if (d->obshap[i][snp_index] == p_allele)
- dp_1 = d->depth[i];
- else if (d->obshap[i][snp_index] == q_allele)
- dp_2 = d->depth[i];
- }
- }
-
- if (dp_1 == 0 && dp_2 == 0)
- cerr << "Warning: Unable to find allele depths for datum " << d->id << "\n";
-
- return 0;
-}
-
-int
-write_vcf_haplotypes(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, string> &samples, vector<int> &sample_ids)
-{
- //
- // Write a VCF file as defined here: http://samtools.github.io/hts-specs/
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".haplotypes.vcf";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population data haplotypes to VCF file '" << file << "'\n";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening VCF file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%Y%m%d", timeinfo);
-
- //
- // Output the header.
- //
- fh << "##fileformat=VCFv4.2\n"
- << "##fileDate=" << date << "\n"
- << "##source=\"Stacks v" << VERSION << "\"\n"
- << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">\n"
- << "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Allele Frequency\">\n"
- << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
- << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n"
- << "#CHROM" << "\t" << "POS" << "\t" << "ID" << "\t" << "REF" << "\t" << "ALT" << "\t"
- << "QUAL" << "\t" << "FILTER" << "\t" << "INFO" << "\t" << "FORMAT";
-
- for (int i = 0; i < pmap->sample_cnt(); i++)
- fh << "\t" << samples[pmap->rev_sample_index(i)];
- fh << "\n";
-
- map<string, vector<CSLocus *> >::iterator it;
- map<string, double>::iterator hit;
- map<string, double> hap_freq;
- map<string, int> hap_index;
- vector<pair<string, int> > ordered_hap;
- CSLocus *loc;
- Datum **d;
- double num_indv, num_hap;
- char allele[id_len];
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- d = pmap->locus(loc->id);
-
- hap_freq.clear();
- hap_index.clear();
- ordered_hap.clear();
-
- num_hap = count_haplotypes_at_locus(0, pmap->sample_cnt() - 1, d, hap_freq);
-
- if (num_hap == 0 || hap_freq.size() == 1)
- continue;
-
- num_indv = num_hap / 2.0;
-
- //
- // Order the haplotypes according to most frequent. Record the ordered position or each
- // haplotype and convert them from counts to frequencies.
- //
- for (hit = hap_freq.begin(); hit != hap_freq.end(); hit++) {
- ordered_hap.push_back(make_pair(hit->first, hit->second));
- hit->second = hit->second / num_hap;
- }
- sort(ordered_hap.begin(), ordered_hap.end(), compare_pair_haplotype);
- for (uint i = 0; i < ordered_hap.size(); i++)
- hap_index[ordered_hap[i].first] = i;
-
- string alt_str, freq_str;
- for (uint i = 1; i < ordered_hap.size(); i++) {
- alt_str += ordered_hap[i].first;
- sprintf(allele, "%0.3f", hap_freq[ordered_hap[i].first]);
- freq_str += allele;
- if (i < ordered_hap.size() - 1) {
- alt_str += ",";
- freq_str += ",";
- }
- }
-
- fh << loc->loc.chr << "\t"
- << loc->sort_bp() + 1 << "\t"
- << loc->id << "\t"
- << ordered_hap[0].first << "\t" // REFerence haplotypes
- << alt_str << "\t" // ALTernate haplotypes
- << "." << "\t" // QUAL
- << "PASS" << "\t" // FILTER
- << "NS=" << num_indv << ";" // INFO
- << "AF=" << freq_str << "\t" // INFO
- << "GT:DP"; // FORMAT
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
-
- if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "./.:0";
-
- } else if (d[j]->obshap.size() > 2) {
- fh << "./.:" << d[j]->tot_depth;
-
- } else if (d[j]->obshap.size() == 1) {
- if(uncalled_haplotype(d[j]->obshap[0]))
- fh << "./.:" << d[j]->tot_depth;
- else
- fh << hap_index[d[j]->obshap[0]] << "/" << hap_index[d[j]->obshap[0]] << ":" << d[j]->tot_depth;
- } else {
- if(!uncalled_haplotype(d[j]->obshap[0]) &&
- !uncalled_haplotype(d[j]->obshap[1]))
- fh << hap_index[d[j]->obshap[0]] << "/" << hap_index[d[j]->obshap[1]] << ":" << d[j]->tot_depth;
- else if (!uncalled_haplotype(d[j]->obshap[0]))
- fh << hap_index[d[j]->obshap[0]] << "/" << "." << ":" << d[j]->tot_depth;
- else if (!uncalled_haplotype(d[j]->obshap[1]))
- fh << "." << "/" << hap_index[d[j]->obshap[1]] << ":" << d[j]->tot_depth;
- }
- }
- fh << "\n";
- }
- }
-
- fh.close();
-
- return 0;
-}
-
-int
-write_genepop(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
-{
- //
- // Write a GenePop file as defined here: http://kimura.univ-montp2.fr/~rousset/Genepop.htm
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".genepop";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population data to GenePop file '" << file << "'\n";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening GenePop file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%B %d, %Y", timeinfo);
-
- //
- // Output the header line.
- //
- fh << "Stacks version " << VERSION << "; Genepop version 4.1.3; " << date << "\n";
-
- map<int, pair<int, int> >::iterator pit;
- map<int, CSLocus *>::iterator it;
- CSLocus *loc;
- Datum **d;
- LocSum **s;
- LocTally *t;
- int start_index, end_index, col, pop_id;
- char p_allele, q_allele;
-
- //
- // Determine how many loci will be output, then output all the loci on the second line, comma-separated.
- //
- uint cnt = 0;
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- for (uint j = 0; j < loc->snps.size(); j++) {
- col = loc->snps[j]->col;
- t = psum->locus_tally(loc->id);
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
- cnt++;
- }
- }
-
- uint i = 0;
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- for (uint j = 0; j < loc->snps.size(); j++) {
- col = loc->snps[j]->col;
- t = psum->locus_tally(loc->id);
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
- i++;
- fh << loc->id << "_" << col;
- if (i < cnt) fh << ",";
- }
- }
- fh << "\n";
-
- map<char, string> nuc_map;
- nuc_map['A'] = "01";
- nuc_map['C'] = "02";
- nuc_map['G'] = "03";
- nuc_map['T'] = "04";
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- fh << "pop\n";
-
- for (int j = start_index; j <= end_index; j++) {
-
- fh << samples[pmap->rev_sample_index(j)] << ",";
-
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->locus(loc->id);
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t0000";
- } else if (d[j] == NULL || col >= d[j]->len) {
- //
- // Data does not exist.
- //
- fh << "\t0000";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t0000";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "\t0000";
- } else if (p_allele == 0) {
- fh << "\t" << nuc_map[q_allele] << nuc_map[q_allele];
-
- } else if (q_allele == 0) {
- fh << "\t" << nuc_map[p_allele] << nuc_map[p_allele];
-
- } else {
- fh << "\t" << nuc_map[p_allele] << nuc_map[q_allele];
- }
- }
- }
- }
- fh << "\n";
- }
- }
-
- fh.close();
-
- return 0;
-}
-
-int
-write_genepop_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples, ofstream &log_fh)
-{
- //
- // Write a GenePop file as defined here: http://kimura.univ-montp2.fr/~rousset/Genepop.htm
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".genepop";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population data to GenePop file '" << file << "'\n";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening GenePop file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%B %d, %Y", timeinfo);
-
- //
- // Output the header line.
- //
- fh << "Stacks version " << VERSION << "; Genepop version 4.1.3; " << date << "\n";
-
- map<string, vector<NucTally *> > genome_sites;
- map<int, pair<int, int> >::iterator pit;
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- LocSum **s;
- int start_index, end_index, pop_id;
- uint col, snp_index;
- char p_allele, q_allele;
-
- //
- // We need to order the SNPs to take into account overlapping loci.
- //
- OLocTally<NucTally> *ord = new OLocTally<NucTally>(psum, log_fh);
-
- //
- // Output all the loci on the second line, comma-separated.
- //
- int chrs = pmap->ordered_loci.size();
- int cnt = 0;
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
- ord->order(sites, it->second);
- cnt++;
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- fh << sites[pos]->loc_id << "_" << sites[pos]->col;
- if (cnt < chrs || pos < sites.size() - 1) fh << ",";
- }
- }
- fh << "\n";
-
- map<char, string> nuc_map;
- nuc_map['A'] = "01";
- nuc_map['C'] = "02";
- nuc_map['G'] = "03";
- nuc_map['T'] = "04";
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- fh << "pop\n";
-
- for (int j = start_index; j <= end_index; j++) {
-
- fh << samples[pmap->rev_sample_index(j)] << ",";
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- loc = catalog[sites[pos]->loc_id];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- col = sites[pos]->col;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t0000";
- } else if (d[j] == NULL || col >= d[j]->len) {
- //
- // Data does not exist.
- //
- fh << "\t0000";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t0000";
- } else {
- snp_index = loc->snp_index(col);
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "\t0000";
- } else if (p_allele == 0) {
- fh << "\t" << nuc_map[q_allele] << nuc_map[q_allele];
-
- } else if (q_allele == 0) {
- fh << "\t" << nuc_map[p_allele] << nuc_map[p_allele];
-
- } else {
- fh << "\t" << nuc_map[p_allele] << nuc_map[q_allele];
- }
- }
- }
- }
- fh << "\n";
- }
- }
-
- fh.close();
-
- return 0;
-}
-
-int
-write_structure(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
-{
- //
- // Write a Structure file as defined here: http://pritch.bsd.uchicago.edu/structure.html
- //
- // To avoid linked SNPs (which Structure can't handle), we will only output the first
- // SNP from each variable locus.
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".structure.tsv";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population data to Structure file '" << file << "'...";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening Structure file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%B %d, %Y", timeinfo);
-
- //
- // Output the header.
- //
- fh << "# Stacks v" << VERSION << "; " << " Structure v2.3; " << date << "\n";
-
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- LocSum **s;
- LocTally *t;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- fh << "\t" << loc->id << "_" << col;
- }
- }
- }
- fh << "\n";
-
- map<char, string> nuc_map;
- nuc_map['A'] = "1";
- nuc_map['C'] = "2";
- nuc_map['G'] = "3";
- nuc_map['T'] = "4";
-
- map<int, pair<int, int> >::iterator pit;
- int start_index, end_index, pop_id, p;
- char p_allele, q_allele;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
- pop_id = pit->first;
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output all the loci for this sample, printing only the p allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t" << "0";
- } else if (d[j] == NULL || col >= d[j]->len) {
- //
- // Data does not exist.
- //
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t" << "0";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (p_allele == 0)
- fh << "\t" << nuc_map[q_allele];
- else
- fh << "\t" << nuc_map[p_allele];
- }
- }
- }
- }
- fh << "\n";
-
- //
- // Output all the loci for this sample again, now for the q allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- fh << "\t" << "0";
- } else if (d[j] == NULL || col >= d[j]->len) {
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- fh << "\t" << "0";
- } else {
- tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (q_allele == 0)
- fh << "\t" << nuc_map[p_allele];
- else
- fh << "\t" << nuc_map[q_allele];
- }
- }
- }
- }
- fh << "\n";
- }
- }
-
- fh.close();
-
- cerr << "done.\n";
-
- return 0;
-}
-
-int
-write_structure_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples, ofstream &log_fh)
-{
- //
- // Write a Structure file as defined here: http://pritch.bsd.uchicago.edu/structure.html
- //
- // To avoid linked SNPs (which Structure can't handle), we will only output the first
- // SNP from each variable locus.
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".structure.tsv";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population data to Structure file '" << file << "'...";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening Structure file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%B %d, %Y", timeinfo);
-
- //
- // Output the header.
- //
- fh << "# Stacks v" << VERSION << "; " << " Structure v2.3; " << date << "\n";
-
- map<string, vector<NucTally *> > genome_sites;
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- LocSum **s;
-
- //
- // We need to order the SNPs to take into account overlapping loci.
- //
- OLocTally<NucTally> *ord = new OLocTally<NucTally>(psum, log_fh);
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
- ord->order(sites, it->second);
-
- for (uint pos = 0; pos < sites.size(); pos++)
- fh << "\t" << sites[pos]->loc_id << "_" << sites[pos]->col;
- }
- fh << "\n";
-
- map<char, string> nuc_map;
- nuc_map['A'] = "1";
- nuc_map['C'] = "2";
- nuc_map['G'] = "3";
- nuc_map['T'] = "4";
-
- map<int, pair<int, int> >::iterator pit;
- int start_index, end_index, pop_id, p;
- char p_allele, q_allele;
- uint col, snp_index;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
- pop_id = pit->first;
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output all the loci for this sample, printing only the p allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- loc = catalog[sites[pos]->loc_id];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- col = sites[pos]->col;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t" << "0";
- } else if (d[j] == NULL || col >= d[j]->len) {
- //
- // Data does not exist.
- //
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t" << "0";
- } else {
- snp_index = loc->snp_index(col);
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (p_allele == 0)
- fh << "\t" << nuc_map[q_allele];
- else
- fh << "\t" << nuc_map[p_allele];
- }
- }
- }
- fh << "\n";
-
- //
- // Output all the loci for this sample again, now for the q allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- loc = catalog[sites[pos]->loc_id];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- col = sites[pos]->col;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- fh << "\t" << "0";
- } else if (d[j] == NULL || col >= d[j]->len) {
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- fh << "\t" << "0";
- } else {
- snp_index = loc->snp_index(col);
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (q_allele == 0)
- fh << "\t" << nuc_map[p_allele];
- else
- fh << "\t" << nuc_map[q_allele];
- }
- }
- }
- fh << "\n";
- }
- }
-
- fh.close();
-
- cerr << "done.\n";
-
- return 0;
-}
-
-int
-write_hzar(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
-{
- //
- // Write a Hybrid Zone Analysis using R (HZAR) file as defined here:
- // http://cran.r-project.org/web/packages/hzar/hzar.pdf
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".hzar.csv";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population data to HZAR file '" << file << "'...";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening HZAR file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%B %d, %Y", timeinfo);
-
- //
- // Output the header.
- //
- fh << "# Stacks v" << VERSION << "; " << " HZAR v0.2-5; " << date << "\n"
- << "Population" << "," << "Distance";
-
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- LocSum **s;
- LocTally *t;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2) {
- fh << "," << loc->id << "_" << col << ".A"
- << "," << loc->id << "_" << col << ".B"
- << "," << loc->id << "_" << col << ".N";
- }
- }
- }
- }
- fh << "\n";
-
- map<int, pair<int, int> >::iterator pit;
- int pop_id, p;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
- pop_id = pit->first;
-
- fh << pop_key[pop_id] << ",";
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[p]->nucs[col].num_indv == 0 ||
- s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- fh << ",0,0,0";
- continue;
- }
-
- if (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc)
- fh << "," << s[p]->nucs[col].p << "," << 1 - s[p]->nucs[col].p << ",";
- else
- fh << "," << 1 - s[p]->nucs[col].p << "," << s[p]->nucs[col].p << ",";
-
- fh << s[p]->nucs[col].num_indv * 2;
- }
- }
- }
- fh << "\n";
- }
-
- fh.close();
-
- cerr << "done.\n";
-
- return 0;
-}
-
-int
-write_treemix(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
-{
- //
- // Write a TreeMix file (Pickrell and Pritchard, 2012 PLoS Genetics)
- // https://bitbucket.org/nygcresearch/treemix/wiki/Home
- //
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".treemix";
- string file = in_path + pop_name.str();
-
- cerr << "Writing population data to TreeMix file '" << file << "'; ";
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening TreeMix file '" << file << "'\n";
- exit(1);
- }
-
- pop_name << ".log";
- file = in_path + pop_name.str();
-
- cerr << "logging nucleotide positions to '" << file << "'...";
-
- ofstream log_fh(file.c_str(), ofstream::out);
-
- if (log_fh.fail()) {
- cerr << "Error opening Phylip Log file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Obtain the current date.
- //
- time_t rawtime;
- struct tm *timeinfo;
- char date[32];
- time(&rawtime);
- timeinfo = localtime(&rawtime);
- strftime(date, 32, "%B %d, %Y", timeinfo);
-
- log_fh << "# Stacks v" << VERSION << "; " << " TreeMix v1.1; " << date << "\n"
- << "# Line\tLocus ID\tColumn\tChr\tBasepair\n";
-
- //
- // Output the header.
- //
- fh << "# Stacks v" << VERSION << "; " << " TreeMix v1.1; " << date << "\n";
-
- map<string, vector<CSLocus *> >::iterator it;
- map<int, pair<int, int> >::iterator pit;
- CSLocus *loc;
- LocSum **s;
- LocTally *t;
- int p;
-
- //
- // Output a space-separated list of the populations on the first line.
- //
- stringstream sstr;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- sstr << pop_key[pit->first] << " ";
-
- fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
-
- double p_freq, p_cnt, q_cnt, allele_cnt;
- long int line = 1;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- sstr.str("");
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
-
- if (s[p]->nucs[col].num_indv == 0 ||
- s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- sstr << "0,0 ";
- continue;
- }
-
- p_freq = (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc) ?
- s[p]->nucs[col].p :
- 1 - s[p]->nucs[col].p;
-
- allele_cnt = s[p]->nucs[col].num_indv * 2;
- p_cnt = round(allele_cnt * p_freq);
- q_cnt = allele_cnt - p_cnt;
- sstr << (int) p_cnt << "," << (int) q_cnt << " ";
- }
-
- if (sstr.str().length() == 0)
- continue;
-
- fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
- log_fh << line << "\t" << loc->id << "\t" << col << "\t" << loc->loc.chr << "\t" << loc->sort_bp(col) + 1 << "\n";
- line++;
- }
- }
- }
-
- fh.close();
- log_fh.close();
-
- cerr << "done.\n";
-
- return 0;
-}
-
-int
-write_fastphase(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
-{
- //
- // Write a fastPHASE file as defined here: http://stephenslab.uchicago.edu/software.html
- //
- // Data will be written as independent, bi-allelic SNPs. We will write one file per chromosome.
- //
- cerr << "Writing population data to fastPHASE files...";
-
- map<string, vector<CSLocus *> >::iterator it;
- CSLocus *loc;
- Datum **d;
- LocSum **s;
- LocTally *t;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-
- stringstream pop_name;
- pop_name << "batch_" << batch_id << "." << it->first << ".fastphase.inp";
- string file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening fastPHASE file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Tally up the number of sites
- //
- int total_sites = 0;
- uint col;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- total_sites++;
- }
- }
-
- //
- // Output the total number of SNP sites and the number of individuals.
- //
- fh << samples.size() << "\n"
- << total_sites << "\n";
-
- //
- // We need to determine an ordering that can take into account overlapping RAD sites.
- //
- vector<GenPos> ordered_loci;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
- }
- }
- sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
- //
- // Output the position of each site according to its basepair.
- //
- fh << "P";
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
- fh << " " << ordered_loci[pos].bp;
- }
- fh << "\n";
-
- //
- // Output a line of 'S' characters, one per site, indicating that these are SNP markers.
- //
- string snp_markers, gtypes_str;
- snp_markers.assign(total_sites, 'S');
- fh << snp_markers << '\n';
-
- //
- // Now output each sample name followed by a new line, then all of the genotypes for that sample
- // on two lines.
- //
-
- map<int, pair<int, int> >::iterator pit;
- int start_index, end_index, pop_id;
- char p_allele, q_allele;
- stringstream gtypes;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output all the loci for this sample, printing only the p allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\n";
-
- gtypes.str("");
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- gtypes << "? ";
-
- } else if (d[j] == NULL || col >= d[j]->len) {
- //
- // Data does not exist.
- //
- gtypes << "? ";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- gtypes << "? ";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- gtypes << "? ";
- else if (p_allele == 0)
- gtypes << q_allele << " ";
- else
- gtypes << p_allele << " ";
- }
- }
- gtypes_str = gtypes.str();
- fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
-
- //
- // Output all the loci for this sample again, now for the q allele
- //
- gtypes.str("");
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
-
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- gtypes << "? ";
-
- } else if (d[j] == NULL || col >= d[j]->len) {
- gtypes << "? ";
-
- } else if (d[j]->model[col] == 'U') {
- gtypes << "? ";
-
- } else {
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- gtypes << "? ";
- else if (q_allele == 0)
- gtypes << p_allele << " ";
- else
- gtypes << q_allele << " ";
- }
- }
- gtypes_str = gtypes.str();
- fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
- }
- }
-
- fh.close();
- }
-
- cerr << "done.\n";
-
- return 0;
-}
-
-//
-// write_phase()
-//
-// Export the population data in the input format of the PHASE program
-// (http://stephenslab.uchicago.edu/software.html). One file is written per
-// chromosome, named batch_<batch_id>.<chr>.phase.inp under in_path.
-//
-// A locus holding a single SNP is written as a bi-allelic SNP marker ('S')
-// when exactly two alleles were tallied for it. A locus holding several SNPs
-// is written as a multi-allelic haplotype marker ('M'), since SNPs within a
-// single RAD locus are already phased. Each sample is written as its name
-// followed by two rows of alleles. Missing data is coded '?' for SNP markers
-// and -1 for haplotype markers.
-//
-// Exits the program if an output file cannot be opened; returns 0 otherwise.
-//
-int
-write_phase(map<int, CSLocus *> &catalog,
-            PopMap<CSLocus> *pmap,
-            PopSum<CSLocus> *psum,
-            map<int, pair<int, int> > &pop_indexes,
-            map<int, string> &samples)
-{
-    cerr << "Writing population data to PHASE files...";
-
-    map<string, vector<CSLocus *> >::iterator it;
-    map<int, pair<int, int> >::iterator pit;
-    CSLocus  *loc;
-    Datum   **d;
-    LocSum  **s;
-    LocTally *t;
-
-    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-
-        stringstream pop_name;
-        pop_name << "batch_" << batch_id << "." << it->first << ".phase.inp";
-        string file = in_path + pop_name.str();
-
-        ofstream fh(file.c_str(), ofstream::out);
-
-        if (fh.fail()) {
-            cerr << "Error opening PHASE file '" << file << "'\n";
-            exit(1);
-        }
-
-        //
-        // We need to determine an ordering for all legitimate loci/SNPs.
-        //
-        vector<GenPos> ordered_loci;
-        for (uint pos = 0; pos < it->second.size(); pos++) {
-            loc = it->second[pos];
-            t   = psum->locus_tally(loc->id);
-
-            if (loc->snps.size() == 0) continue;
-
-            if (loc->snps.size() > 1) {
-                //
-                // Haplotype marker. Skip loci with more than 40 alleles; the
-                // original author notes that PHASE has a maximum of 50.
-                //
-                if (loc->alleles.size() > 40) continue;
-
-                //
-                // Keep the locus only if at least one sample has one or two
-                // observed haplotypes at it.
-                //
-                d = pmap->locus(loc->id);
-                for (int j = 0; j < pmap->sample_cnt(); j++) {
-                    if (d[j] != NULL &&
-                        d[j]->obshap.size() > 0 &&
-                        d[j]->obshap.size() <= 2) {
-                        ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
-                        break;
-                    }
-                }
-            } else {
-                //
-                // SNP marker; only bi-allelic sites are exported.
-                //
-                uint col = loc->snps[0]->col;
-
-                if (t->nucs[col].allele_cnt == 2)
-                    ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(col), snp));
-            }
-        }
-        sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
-        //
-        // Output the number of individuals and the total number of sites.
-        //
-        fh << samples.size()      << "\n"
-           << ordered_loci.size() << "\n";
-
-        //
-        // Output the position of each site according to its basepair.
-        //
-        fh << "P";
-        for (uint pos = 0; pos < ordered_loci.size(); pos++)
-            fh << " " << ordered_loci[pos].bp;
-        fh << "\n";
-
-        //
-        // Output a line of 'S' characters for SNP markers, 'M' characters for multiallelic haplotypes.
-        //
-        for (uint pos = 0; pos < ordered_loci.size(); pos++) {
-            if (pos > 0) fh << " ";
-            fh << (ordered_loci[pos].type == snp ? "S" : "M");
-        }
-        fh << "\n";
-
-        //
-        // Now output each sample name followed by a new line, then all of the
-        // genotypes for that sample on two lines.
-        //
-        stringstream gtypes;
-        string       gtypes_str;
-        char         p_allele, q_allele;
-
-        for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
-            int pop_id      = psum->pop_index(pit->first);
-            int start_index = pit->second.first;
-            int end_index   = pit->second.second;
-
-            for (int j = start_index; j <= end_index; j++) {
-                fh << samples[pmap->rev_sample_index(j)] << "\n";
-
-                //
-                // pass 0 writes the first (p) allele row, pass 1 the second (q) allele row.
-                //
-                for (int pass = 0; pass < 2; pass++) {
-                    gtypes.str("");
-
-                    for (uint pos = 0; pos < ordered_loci.size(); pos++) {
-                        loc = catalog[ordered_loci[pos].id];
-
-                        s = psum->locus(loc->id);
-                        d = pmap->locus(loc->id);
-
-                        if (ordered_loci[pos].type == haplotype) {
-                            //
-                            // No data, no haplotypes, or too many haplotypes: missing.
-                            // Testing for an empty obshap here prevents reading obshap[0]
-                            // out of bounds below.
-                            //
-                            if (d[j] == NULL ||
-                                d[j]->obshap.size() == 0 ||
-                                d[j]->obshap.size() > 2) {
-                                gtypes << "-1 ";
-                                continue;
-                            }
-
-                            //
-                            // Haplotypes are numbered by their position in the loc->strings
-                            // vector. A homozygous sample has a single observed haplotype,
-                            // which is written on both rows.
-                            //
-                            uint h     = (pass == 1 && d[j]->obshap.size() > 1) ? 1 : 0;
-                            bool found = false;
-                            for (uint k = 0; k < loc->strings.size(); k++) {
-                                if (d[j]->obshap[h] == loc->strings[k].first) {
-                                    found = true;
-                                    gtypes << k + 1 << " ";
-                                    break;
-                                }
-                            }
-                            if (found == false) {
-                                cerr << "Unable to find haplotype " << d[j]->obshap[h] << " from individual "
-                                     << samples[pmap->rev_sample_index(j)] << "; catalog locus: " << loc->id << "\n";
-                                //
-                                // Still emit a (missing) value so the remaining columns of
-                                // this row stay aligned with the marker list.
-                                //
-                                gtypes << "-1 ";
-                            }
-
-                        } else {
-                            uint col = loc->snps[ordered_loci[pos].snp_index]->col;
-
-                            //
-                            // Missing if the site has more than two alleles in this population
-                            // or was filtered, if there is no data for this sample at this
-                            // column, or if the model call was uncertain.
-                            //
-                            if (s[pop_id]->nucs[col].incompatible_site ||
-                                s[pop_id]->nucs[col].filtered_site     ||
-                                d[j] == NULL                           ||
-                                col >= d[j]->len                       ||
-                                d[j]->model[col] == 'U') {
-                                gtypes << "? ";
-                                continue;
-                            }
-
-                            //
-                            // Tally up the nucleotide calls.
-                            //
-                            tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
-                            if (p_allele == 0 && q_allele == 0)
-                                gtypes << "? ";
-                            else if (pass == 0)
-                                gtypes << (p_allele == 0 ? q_allele : p_allele) << " ";
-                            else
-                                gtypes << (q_allele == 0 ? p_allele : q_allele) << " ";
-                        }
-                    }
-
-                    //
-                    // Drop the trailing space before writing the row.
-                    //
-                    gtypes_str = gtypes.str();
-                    if (gtypes_str.length() > 0)
-                        gtypes_str.erase(gtypes_str.length() - 1);
-                    fh << gtypes_str << "\n";
-                }
-            }
-        }
-
-        fh.close();
-    }
-
-    cerr << "done.\n";
-
-    return 0;
-}
-
-//
-// write_plink()
-//
-// Export the population data as PLINK files as defined here:
-//   http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
-//
-// Two files are written under in_path:
-//   batch_<batch_id>.plink.map  one row per exported SNP: chromosome, marker
-//                               name (<locus id>_<column>), a 0 centiMorgan
-//                               field, and the basepair position.
-//   batch_<batch_id>.plink.ped  one row per sample: population ID, sample name,
-//                               four 0 placeholder fields, then two allele
-//                               columns per exported SNP.
-//
-// Only SNPs with exactly two tallied alleles are exported; both files apply the
-// same test, in the same locus order, so their marker columns correspond.
-//
-// Exits the program if an output file cannot be opened; returns 0 otherwise.
-//
-int
-write_plink(map<int, CSLocus *> &catalog,
-            PopMap<CSLocus> *pmap,
-            PopSum<CSLocus> *psum,
-            map<int, pair<int, int> > &pop_indexes,
-            map<int, string> &samples)
-{
-    cerr << "Writing population data to PLINK files...";
-
-    //
-    // Obtain the current date.
-    //
-    time_t     rawtime;
-    struct tm *timeinfo;
-    char       date[32];
-    time(&rawtime);
-    timeinfo = localtime(&rawtime);
-    strftime(date, sizeof(date), "%B %d, %Y", timeinfo);
-
-    map<string, vector<CSLocus *> >::iterator it;
-    CSLocus  *loc;
-    Datum   **d;
-    LocSum  **s;
-    LocTally *t;
-    string    chr;
-
-    //
-    // First, write a markers file containing each marker, the chromosome it falls on,
-    // an empty centiMorgan field, and finally its genomic position in basepairs.
-    //
-    stringstream pop_name;
-    pop_name << "batch_" << batch_id << ".plink.map";
-    string file = in_path + pop_name.str();
-
-    ofstream fh(file.c_str(), ofstream::out);
-
-    if (fh.fail()) {
-        cerr << "Error opening PLINK markers file '" << file << "'\n";
-        exit(1);
-    }
-
-    //
-    // Output the header.
-    //
-    fh << "# Stacks v" << VERSION << "; " << " PLINK v1.07; " << date << "\n";
-
-    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-        chr = it->first;
-
-        for (uint pos = 0; pos < it->second.size(); pos++) {
-            loc = it->second[pos];
-            t   = psum->locus_tally(loc->id);
-
-            for (uint i = 0; i < loc->snps.size(); i++) {
-                uint col = loc->snps[i]->col;
-                if (t->nucs[col].allele_cnt == 2)
-                    fh << chr << "\t"
-                       << loc->id << "_" << col << "\t"
-                       << "0\t"
-                       << loc->sort_bp(col) << "\n";
-            }
-        }
-    }
-    fh.close();
-
-    //
-    // Now output the genotypes in a separate file.
-    //
-    pop_name.str("");
-    pop_name << "batch_" << batch_id << ".plink.ped";
-    file = in_path + pop_name.str();
-
-    fh.open(file.c_str(), ofstream::out);
-
-    if (fh.fail()) {
-        cerr << "Error opening PLINK genotypes file '" << file << "'\n";
-        exit(1);
-    }
-
-    fh << "# Stacks v" << VERSION << "; " << " PLINK v1.07; " << date << "\n";
-
-    map<int, pair<int, int> >::iterator pit;
-    int  start_index, end_index, pop_id;
-    char p_allele, q_allele;
-
-    //
-    // For each sample, output the six leading fields and then, for each
-    // marker, the genotype in two successive columns.
-    //
-    for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
-        pop_id      = psum->pop_index(pit->first);
-        start_index = pit->second.first;
-        end_index   = pit->second.second;
-
-        for (int j = start_index; j <= end_index; j++) {
-
-            fh << pit->first << "\t"
-               << samples[pmap->rev_sample_index(j)] << "\t"
-               << "0\t"  // Paternal ID
-               << "0\t"  // Maternal ID
-               << "0\t"  // Sex
-               << "0";   // Phenotype
-
-            for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-                for (uint pos = 0; pos < it->second.size(); pos++) {
-                    loc = it->second[pos];
-
-                    s = psum->locus(loc->id);
-                    d = pmap->locus(loc->id);
-                    t = psum->locus_tally(loc->id);
-
-                    for (uint i = 0; i < loc->snps.size(); i++) {
-                        uint col = loc->snps[i]->col;
-
-                        //
-                        // If this site is fixed in all populations or has too many alleles don't output it.
-                        //
-                        if (t->nucs[col].allele_cnt != 2)
-                            continue;
-
-                        //
-                        // The genotype is missing ("0 0") if the site has more than two
-                        // alleles in this population or was filtered, if there is no data
-                        // for this sample at this column, or if the model call was uncertain.
-                        //
-                        if (s[pop_id]->nucs[col].incompatible_site ||
-                            s[pop_id]->nucs[col].filtered_site     ||
-                            d[j] == NULL                           ||
-                            col >= d[j]->len                       ||
-                            d[j]->model[col] == 'U') {
-                            fh << "\t" << "0" << "\t" << "0";
-                            continue;
-                        }
-
-                        //
-                        // Tally up the nucleotide calls; a single observed allele is
-                        // written twice (homozygous).
-                        //
-                        tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
-                        if (p_allele == 0 && q_allele == 0)
-                            fh << "\t" << "0" << "\t" << "0";
-                        else if (p_allele == 0)
-                            fh << "\t" << q_allele << "\t" << q_allele;
-                        else if (q_allele == 0)
-                            fh << "\t" << p_allele << "\t" << p_allele;
-                        else
-                            fh << "\t" << p_allele << "\t" << q_allele;
-                    }
-                }
-            }
-            fh << "\n";
-        }
-    }
-
-    fh.close();
-
-    cerr << "done.\n";
-
-    return 0;
-}
-
-//
-// write_beagle()
-//
-// Export unphased genotypes in the Beagle format described at
-//   http://faculty.washington.edu/browning/beagle/beagle.html
-//
-// For every chromosome and every population two files are written under
-// in_path: a markers file (*.unphased.bgl.markers) and a genotypes file
-// (*.unphased.bgl). Only SNPs with exactly two tallied alleles are considered,
-// and a SNP whose pi is 0.0 in the population being written is left out of
-// that population's files.
-//
-// Exits the program if an output file cannot be opened; returns 0 otherwise.
-//
-int
-write_beagle(map<int, CSLocus *> &catalog,
-             PopMap<CSLocus> *pmap,
-             PopSum<CSLocus> *psum,
-             map<int, pair<int, int> > &pop_indexes,
-             map<int, string> &samples)
-{
-    cerr << "Writing population data to unphased Beagle files...";
-
-    //
-    // Format today's date once; it is stamped into every file header.
-    //
-    char       date[32];
-    time_t     now;
-    struct tm *local;
-    time(&now);
-    local = localtime(&now);
-    strftime(date, 32, "%B %d, %Y", local);
-
-    stringstream name;
-    string       path;
-    uint         col;
-
-    map<string, vector<CSLocus *> >::iterator chr;
-    map<int, pair<int, int> >::iterator       pop;
-
-    for (chr = pmap->ordered_loci.begin(); chr != pmap->ordered_loci.end(); chr++) {
-        vector<CSLocus *> &chr_loci = chr->second;
-
-        //
-        // Collect every bi-allelic SNP on this chromosome and sort the sites,
-        // so that overlapping RAD sites come out in a consistent order.
-        //
-        vector<GenPos> sites;
-        for (uint n = 0; n < chr_loci.size(); n++) {
-            CSLocus  *cur   = chr_loci[n];
-            LocTally *tally = psum->locus_tally(cur->id);
-
-            for (uint k = 0; k < cur->snps.size(); k++) {
-                col = cur->snps[k]->col;
-                if (tally->nucs[col].allele_cnt == 2)
-                    sites.push_back(GenPos(cur->id, k, cur->sort_bp(col)));
-            }
-        }
-        sort(sites.begin(), sites.end(), compare_genpos);
-
-        //
-        // One markers file and one genotypes file per population.
-        //
-        for (pop = pop_indexes.begin(); pop != pop_indexes.end(); pop++) {
-            int pop_id = psum->pop_index(pop->first);
-            int first  = pop->second.first;
-            int last   = pop->second.second;
-
-            //
-            // Markers file: marker name, basepair position and the two alleles.
-            //
-            name.str("");
-            name << "batch_" << batch_id << "." << pop_key[pop->first] << "-" << chr->first << ".unphased.bgl.markers";
-            path = in_path + name.str();
-
-            ofstream mfh(path.c_str(), ofstream::out);
-            if (mfh.fail()) {
-                cerr << "Error opening Beagle markers file '" << path << "'\n";
-                exit(1);
-            }
-            mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
-            //
-            // Genotypes file.
-            //
-            name.str("");
-            name << "batch_" << batch_id << "." << pop_key[pop->first] << "-" << chr->first << ".unphased.bgl";
-            path = in_path + name.str();
-
-            ofstream fh(path.c_str(), ofstream::out);
-            if (fh.fail()) {
-                cerr << "Error opening Beagle genotypes file '" << path << "'\n";
-                exit(1);
-            }
-            fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
-            //
-            // Sample names, each repeated once per allele column.
-            //
-            fh << "I\tid";
-            for (int j = first; j <= last; j++) {
-                fh << "\t" << samples[pmap->rev_sample_index(j)];
-                fh << "\t" << samples[pmap->rev_sample_index(j)];
-            }
-            fh << "\n";
-
-            //
-            // Population ID of every sample, again once per allele column.
-            //
-            fh << "S\tid";
-            for (int j = first; j <= last; j++) {
-                fh << "\t" << pop->first;
-                fh << "\t" << pop->first;
-            }
-            fh << "\n";
-
-            //
-            // One 'M' row per marker, holding two allele columns per sample.
-            //
-            char p_allele, q_allele;
-
-            for (uint n = 0; n < sites.size(); n++) {
-                CSLocus  *cur   = catalog[sites[n].id];
-                LocSum  **sums  = psum->locus(cur->id);
-                Datum   **data  = pmap->locus(cur->id);
-                LocTally *tally = psum->locus_tally(cur->id);
-                col = cur->snps[sites[n].snp_index]->col;
-
-                //
-                // Skip sites that are fixed in all populations or have too many
-                // alleles, and sites that are monomorphic in this population.
-                //
-                if (tally->nucs[col].allele_cnt != 2)
-                    continue;
-                if (sums[pop_id]->nucs[col].pi == 0.0)
-                    continue;
-
-                mfh << cur->id << "_" << col << "\t"
-                    << cur->sort_bp(col) << "\t"
-                    << tally->nucs[col].p_allele << "\t"
-                    << tally->nucs[col].q_allele << "\n";
-
-                fh << "M" << "\t" << cur->id << "_" << col;
-
-                for (int j = first; j <= last; j++) {
-                    //
-                    // Missing when the site is incompatible or filtered in this
-                    // population, when the sample has no data at this column, or
-                    // when the model call was uncertain.
-                    //
-                    bool missing = sums[pop_id]->nucs[col].incompatible_site ||
-                                   sums[pop_id]->nucs[col].filtered_site     ||
-                                   data[j] == NULL                           ||
-                                   col >= data[j]->len                       ||
-                                   data[j]->model[col] == 'U';
-
-                    if (!missing) {
-                        tally_observed_haplotypes(data[j]->obshap, sites[n].snp_index, p_allele, q_allele);
-                        missing = (p_allele == 0 && q_allele == 0);
-                    }
-
-                    if (missing) {
-                        fh << "\t" << "?";
-                        fh << "\t" << "?";
-                    } else {
-                        //
-                        // A single observed allele fills both columns.
-                        //
-                        fh << "\t" << (p_allele == 0 ? q_allele : p_allele);
-                        fh << "\t" << (q_allele == 0 ? p_allele : q_allele);
-                    }
-                }
-                fh << "\n";
-            }
-
-            fh.close();
-            mfh.close();
-        }
-    }
-
-    cerr << "done.\n";
-
-    return 0;
-}
-
-//
-// write_beagle_phased()
-//
-// Export the population data as a set of haplotypes in the Beagle format
-// described at http://faculty.washington.edu/browning/beagle/beagle.html
-//
-// For every chromosome and every population two files are written under
-// in_path: a markers file (*.phased.bgl.markers) listing each locus, its
-// basepair position and its catalog haplotypes, and a haplotypes file
-// (*.phased.bgl) holding two haplotype columns per sample. A locus at which
-// exactly one distinct haplotype is observed in the population is left out of
-// that population's files.
-//
-// Exits the program if an output file cannot be opened; returns 0 otherwise.
-//
-int
-write_beagle_phased(map<int, CSLocus *> &catalog,
-                    PopMap<CSLocus> *pmap,
-                    PopSum<CSLocus> *psum,
-                    map<int, pair<int, int> > &pop_indexes,
-                    map<int, string> &samples)
-{
-    cerr << "Writing population data to phased Beagle files...";
-
-    //
-    // Obtain the current date.
-    //
-    time_t     rawtime;
-    struct tm *timeinfo;
-    char       date[32];
-    time(&rawtime);
-    timeinfo = localtime(&rawtime);
-    strftime(date, sizeof(date), "%B %d, %Y", timeinfo);
-
-    map<string, vector<CSLocus *> >::iterator it;
-    CSLocus *loc;
-    Datum  **d;
-
-    stringstream pop_name;
-    string       file;
-
-    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-
-        //
-        // We need to determine an ordering for all legitimate loci.
-        //
-        vector<GenPos> ordered_loci;
-        for (uint pos = 0; pos < it->second.size(); pos++) {
-            loc = it->second[pos];
-
-            if (loc->snps.size() == 0) continue;
-
-            //
-            // Skip loci with too many haplotypes. NOTE(review): this threshold of 40
-            // is the same one used by the PHASE exporter; confirm it is intended
-            // for Beagle output as well.
-            //
-            if (loc->alleles.size() > 40) continue;
-
-            //
-            // Keep the locus only if at least one sample has data with the
-            // correct number of haplotypes (one or two).
-            //
-            d = pmap->locus(loc->id);
-            for (int j = 0; j < pmap->sample_cnt(); j++) {
-                if (d[j] != NULL &&
-                    d[j]->obshap.size() > 0 &&
-                    d[j]->obshap.size() <= 2) {
-                    ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
-                    break;
-                }
-            }
-        }
-        sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
-        //
-        // Now output the haplotypes in a separate file for each population.
-        //
-        map<int, pair<int, int> >::iterator pit;
-        int start_index, end_index;
-
-        for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
-            start_index = pit->second.first;
-            end_index   = pit->second.second;
-
-            //
-            // Open a file for writing the markers: their genomic position in basepairs
-            // and the alternative haplotypes at this locus.
-            //
-            pop_name.str("");
-            pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".phased.bgl.markers";
-            file = in_path + pop_name.str();
-
-            ofstream mfh(file.c_str(), ofstream::out);
-            if (mfh.fail()) {
-                cerr << "Error opening Beagle markers file '" << file << "'\n";
-                exit(1);
-            }
-            mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
-            //
-            // Now output the haplotypes in a separate file.
-            //
-            pop_name.str("");
-            pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".phased.bgl";
-            file = in_path + pop_name.str();
-
-            ofstream fh(file.c_str(), ofstream::out);
-            if (fh.fail()) {
-                cerr << "Error opening Beagle genotypes file '" << file << "'\n";
-                exit(1);
-            }
-            fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
-            //
-            // Output a list of all the samples in this population.
-            //
-            fh << "I\tid";
-            for (int j = start_index; j <= end_index; j++)
-                fh << "\t" << samples[pmap->rev_sample_index(j)] << "\t" << samples[pmap->rev_sample_index(j)];
-            fh << "\n";
-
-            //
-            // Output population IDs for each sample. The population ID itself
-            // (the key of pop_indexes) is written, as in write_beagle(), rather
-            // than the internal index that psum assigns to the population.
-            //
-            fh << "S\tid";
-            for (int j = start_index; j <= end_index; j++)
-                fh << "\t" << pit->first << "\t" << pit->first;
-            fh << "\n";
-
-            for (uint pos = 0; pos < ordered_loci.size(); pos++) {
-                loc = catalog[ordered_loci[pos].id];
-                d   = pmap->locus(loc->id);
-
-                //
-                // If this locus is monomorphic in this population don't output it.
-                // Samples without any observed haplotype are ignored, which also
-                // keeps obshap[0] from being read out of bounds.
-                //
-                set<string> haplotypes;
-                for (int j = start_index; j <= end_index; j++) {
-                    if (d[j] == NULL || d[j]->obshap.size() == 0) continue;
-
-                    haplotypes.insert(d[j]->obshap[0]);
-                    if (d[j]->obshap.size() == 2)
-                        haplotypes.insert(d[j]->obshap[1]);
-                }
-                if (haplotypes.size() == 1) continue;
-
-                //
-                // Output this locus to the markers file.
-                //
-                mfh << loc->id << "\t"
-                    << loc->sort_bp();
-                for (uint j = 0; j < loc->strings.size(); j++)
-                    mfh << "\t" << loc->strings[j].first;
-                mfh << "\n";
-
-                //
-                // For each marker, output the haplotypes for each sample in two successive columns.
-                //
-                fh << "M" << "\t" << loc->id;
-
-                for (int j = start_index; j <= end_index; j++) {
-                    if (d[j] == NULL ||
-                        d[j]->obshap.size() == 0 ||
-                        d[j]->obshap.size() > 2) {
-                        //
-                        // No data, no haplotypes, or too many haplotypes: missing.
-                        //
-                        fh << "\t" << "?" << "\t" << "?";
-                    } else if (d[j]->obshap.size() == 2) {
-                        fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[1];
-                    } else {
-                        //
-                        // A single observed haplotype fills both columns.
-                        //
-                        fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[0];
-                    }
-                }
-                fh << "\n";
-            }
-            fh.close();
-            mfh.close();
-        }
-    }
-
-    cerr << "done.\n";
-
-    return 0;
-}
-
-//
-// phylip_iupac_code()
-//
-// Combine the p and q nucleotides of a site into a single IUPAC character:
-//   http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
-// A q nucleotide of 0 yields the p nucleotide itself. 'N' is returned when the
-// p nucleotide is 0 and for any combination that is not a recognised pair of
-// upper-case A/C/G/T.
-//
-static char
-phylip_iupac_code(char p_nuc, char q_nuc)
-{
-    switch (p_nuc) {
-    case 'A':
-        switch (q_nuc) {
-        case 0:   return 'A';
-        case 'C': return 'M';
-        case 'G': return 'R';
-        case 'T': return 'W';
-        }
-        break;
-    case 'C':
-        switch (q_nuc) {
-        case 0:   return 'C';
-        case 'A': return 'M';
-        case 'G': return 'S';
-        case 'T': return 'Y';
-        }
-        break;
-    case 'G':
-        switch (q_nuc) {
-        case 0:   return 'G';
-        case 'A': return 'R';
-        case 'C': return 'S';
-        case 'T': return 'K';
-        }
-        break;
-    case 'T':
-        switch (q_nuc) {
-        case 0:   return 'T';
-        case 'A': return 'W';
-        case 'C': return 'Y';
-        case 'G': return 'K';
-        }
-        break;
-    }
-
-    return 'N';
-}
-
-//
-// write_phylip()
-//
-// Write one concatenated sequence per population to a Phylip sequential file
-// (batch_<batch_id>.phylip) as defined here:
-//   http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
-// and log the origin of every sequence position to batch_<batch_id>.phylip.log.
-//
-// When phylip_var is false, only SNPs that are fixed within each population
-// but variable between populations are written. When phylip_var is true, every
-// SNP with exactly two tallied alleles is written, and sites that are variable
-// within a population are encoded in IUPAC notation.
-//
-// Exits the program if an output file cannot be opened; returns 0 otherwise,
-// including when there was no data to write.
-//
-int
-write_phylip(map<int, CSLocus *> &catalog,
-             PopMap<CSLocus> *pmap,
-             PopSum<CSLocus> *psum,
-             map<int, pair<int, int> > &pop_indexes,
-             map<int, string> &samples)
-{
-    stringstream pop_name;
-    pop_name << "batch_" << batch_id << ".phylip";
-    string file = in_path + pop_name.str();
-
-    cerr << "Writing population data to Phylip file '" << file << "'; ";
-
-    ofstream fh(file.c_str(), ofstream::out);
-
-    if (fh.fail()) {
-        cerr << "Error opening Phylip file '" << file << "'\n";
-        exit(1);
-    }
-
-    //
-    // The log file name is the Phylip file name with ".log" appended.
-    //
-    pop_name << ".log";
-    file = in_path + pop_name.str();
-
-    cerr << "logging nucleotide positions to '" << file << "'...";
-
-    ofstream log_fh(file.c_str(), ofstream::out);
-
-    if (log_fh.fail()) {
-        cerr << "Error opening Phylip Log file '" << file << "'\n";
-        exit(1);
-    }
-
-    //
-    // Obtain the current date.
-    //
-    time_t     rawtime;
-    struct tm *timeinfo;
-    char       date[32];
-    time(&rawtime);
-    timeinfo = localtime(&rawtime);
-    strftime(date, sizeof(date), "%B %d, %Y", timeinfo);
-
-    log_fh << "# Stacks v" << VERSION << "; " << " Phylip sequential; " << date << "\n"
-           << "# Seq Pos\tLocus ID\tColumn\tPopulation\n";
-
-    map<string, vector<CSLocus *> >::iterator it;
-    CSLocus  *loc;
-    LocSum  **s;
-    LocTally *t;
-
-    map<int, pair<int, int> >::iterator pit;
-    int  pop_cnt = psum->pop_cnt();
-    int  pop_id;
-    char nuc;
-
-    //
-    // A map storing, for each population, the concatenated list of interspecific nucleotides.
-    //
-    map<int, string> interspecific_nucs;
-
-    int index = 0;
-    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-        for (uint pos = 0; pos < it->second.size(); pos++) {
-            loc = it->second[pos];
-
-            s = psum->locus(loc->id);
-            t = psum->locus_tally(loc->id);
-
-            for (uint i = 0; i < loc->snps.size(); i++) {
-                uint col = loc->snps[i]->col;
-
-                if (phylip_var == false) {
-                    //
-                    // We are looking for loci that are fixed within each population, but are
-                    // variable between one or more populations.
-                    //
-                    if (t->nucs[col].fixed == true || t->nucs[col].allele_cnt != 2 || t->nucs[col].pop_cnt < 2)
-                        continue;
-
-                    //
-                    // Populations without any individuals at this site are not considered.
-                    //
-                    bool fixed_within = true;
-                    for (int j = 0; j < pop_cnt; j++) {
-                        if (s[j]->nucs[col].num_indv == 0)
-                            continue;
-                        if (s[j]->nucs[col].fixed == false) {
-                            fixed_within = false;
-                            break;
-                        }
-                    }
-                    if (fixed_within == false) continue;
-
-                    log_fh << index << "\t" << loc->id << "\t" << col << "\t";
-
-                    for (int j = 0; j < pop_cnt; j++) {
-                        pop_id = psum->rev_pop_index(j);
-
-                        if (s[j]->nucs[col].num_indv > 0) {
-                            interspecific_nucs[pop_id] += s[j]->nucs[col].p_nuc;
-                            log_fh << pop_key[pop_id] << ":" << s[j]->nucs[col].p_nuc << ",";
-                        } else {
-                            interspecific_nucs[pop_id] += 'N';
-                            log_fh << pop_key[pop_id] << ":N" << ",";
-                        }
-                    }
-                    log_fh << "\n";
-                    index++;
-
-                } else {
-                    //
-                    // Encode SNPs that are variable within a population as well, using IUPAC notation.
-                    //
-                    if (t->nucs[col].allele_cnt != 2)
-                        continue;
-
-                    log_fh << index << "\t" << loc->id << "\t" << col << "\t";
-
-                    for (int j = 0; j < pop_cnt; j++) {
-                        pop_id = psum->rev_pop_index(j);
-
-                        //
-                        // The helper always returns a defined character, so an
-                        // unexpected nucleotide pair can no longer leave nuc holding
-                        // the previous population's value (or no value at all).
-                        //
-                        nuc = phylip_iupac_code(s[j]->nucs[col].p_nuc, s[j]->nucs[col].q_nuc);
-
-                        interspecific_nucs[pop_id] += nuc;
-                        log_fh << pop_key[pop_id] << ":" << nuc << ",";
-                    }
-                    log_fh << "\n";
-                    index++;
-                }
-            }
-        }
-    }
-
-    if (interspecific_nucs.size() == 0) {
-        cerr << " No data is available to write to the Phylip file.\n";
-        return 0;
-    }
-
-    //
-    // First line: number of sequences and their length.
-    //
-    fh << pop_indexes.size() << "    " << interspecific_nucs.begin()->second.length() << "\n";
-
-    for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
-        pop_id = pit->first;
-
-        //
-        // The name field is the population name truncated or space-padded to nine
-        // characters, followed by one space. Building it in a std::string avoids
-        // writing past the end of a fixed-size buffer for long names.
-        //
-        string id_str = pop_key[pop_id].substr(0, 9);
-        id_str.resize(9, ' ');
-
-        fh << id_str << " " << interspecific_nucs[pop_id] << "\n";
-    }
-
-    //
-    // Output the Stacks version stamp as the last line of the file; the first
-    // line is taken by the sequence count and length.
-    //
-    fh << "# Stacks v" << VERSION << "; " << " Phylip sequential; " << date << "\n";
-
-    fh.close();
-    log_fh.close();
-
-    cerr << "done.\n";
-
-    return 0;
-}
-
-//
-// fullseq_iupac_code()
-//
-// Combine the p and q nucleotides of a site into a single IUPAC character:
-//   http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
-// A q nucleotide of 0 yields the p nucleotide itself. 'N' is returned when the
-// p nucleotide is 0 and for any combination that is not a recognised pair of
-// upper-case A/C/G/T.
-//
-static char
-fullseq_iupac_code(char p_nuc, char q_nuc)
-{
-    switch (p_nuc) {
-    case 'A':
-        switch (q_nuc) {
-        case 0:   return 'A';
-        case 'C': return 'M';
-        case 'G': return 'R';
-        case 'T': return 'W';
-        }
-        break;
-    case 'C':
-        switch (q_nuc) {
-        case 0:   return 'C';
-        case 'A': return 'M';
-        case 'G': return 'S';
-        case 'T': return 'Y';
-        }
-        break;
-    case 'G':
-        switch (q_nuc) {
-        case 0:   return 'G';
-        case 'A': return 'R';
-        case 'C': return 'S';
-        case 'T': return 'K';
-        }
-        break;
-    case 'T':
-        switch (q_nuc) {
-        case 0:   return 'T';
-        case 'A': return 'W';
-        case 'C': return 'Y';
-        case 'G': return 'K';
-        }
-        break;
-    }
-
-    return 'N';
-}
-
-//
-// write_fullseq_phylip()
-//
-// Write the full consensus sequence of every exported locus, once per
-// population, in Phylip interleaved format as defined here:
-//   http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
-// Polymorphic positions are encoded using IUPAC notation. A locus is exported
-// only if every one of its SNPs has exactly two tallied alleles.
-//
-// Three files are written under in_path:
-//   batch_<batch_id>.fullseq.phylip             the sequences
-//   batch_<batch_id>.fullseq.partitions.phylip  one "DNA, p<n>=<from>-<to>" row
-//                                               per locus
-//   batch_<batch_id>.fullseq.phylip.log         line counter and locus ID (plus
-//                                               chromosome and basepair when
-//                                               loci_ordered is set)
-//
-// Exits the program if an output file cannot be opened; returns 0 otherwise.
-//
-int
-write_fullseq_phylip(map<int, CSLocus *> &catalog,
-                     PopMap<CSLocus> *pmap,
-                     PopSum<CSLocus> *psum,
-                     map<int, pair<int, int> > &pop_indexes,
-                     map<int, string> &samples)
-{
-    stringstream pop_name;
-    pop_name << "batch_" << batch_id << ".fullseq.phylip";
-    string file = in_path + pop_name.str();
-
-    cerr << "Writing full sequence population data to Phylip file '" << file << "'; ";
-
-    ofstream fh(file.c_str(), ofstream::out);
-
-    if (fh.fail()) {
-        cerr << "Error opening Phylip file '" << file << "'\n";
-        exit(1);
-    }
-
-    //
-    // We will also write a file that allows us to specify each RAD locus as a separate partition
-    // for use in phylogenetics programs.
-    //
-    pop_name.str("");
-    pop_name << "batch_" << batch_id << ".fullseq.partitions.phylip";
-    file = in_path + pop_name.str();
-
-    ofstream par_fh(file.c_str(), ofstream::out);
-
-    if (par_fh.fail()) {
-        cerr << "Error opening Phylip partitions file '" << file << "'\n";
-        exit(1);
-    }
-
-    //
-    // The log file name carries the same "batch_<id>." prefix as the other two files.
-    //
-    pop_name.str("");
-    pop_name << "batch_" << batch_id << ".fullseq.phylip.log";
-    file = in_path + pop_name.str();
-
-    cerr << "logging nucleotide positions to '" << file << "'...";
-
-    ofstream log_fh(file.c_str(), ofstream::out);
-
-    if (log_fh.fail()) {
-        cerr << "Error opening Phylip Log file '" << file << "'\n";
-        exit(1);
-    }
-
-    //
-    // Obtain the current date.
-    //
-    time_t     rawtime;
-    struct tm *timeinfo;
-    char       date[32];
-    time(&rawtime);
-    timeinfo = localtime(&rawtime);
-    strftime(date, sizeof(date), "%B %d, %Y", timeinfo);
-
-    //
-    // The column header lists the fields in the order they are written below:
-    // the line counter first, then the locus ID.
-    //
-    log_fh << "# Stacks v" << VERSION << "; " << " Phylip interleaved; " << date << "\n"
-           << "# Line Number\tLocus ID";
-    if (loci_ordered) log_fh << "\tChr\tBasepair";
-    log_fh << "\n";
-
-    map<string, vector<CSLocus *> >::iterator it;
-    CSLocus  *loc;
-    LocSum  **s;
-    LocTally *t;
-
-    map<int, pair<int, int> >::iterator pit;
-    int  pop_cnt = psum->pop_cnt();
-    int  pop_id;
-    bool include;
-
-    //
-    // Determine the length of sequence we will output.
-    //
-    uint total_len = 0;
-    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-        for (uint pos = 0; pos < it->second.size(); pos++) {
-            loc = it->second[pos];
-
-            t = psum->locus_tally(loc->id);
-
-            include = true;
-            for (uint i = 0; i < loc->snps.size(); i++) {
-                uint col = loc->snps[i]->col;
-
-                if (t->nucs[col].allele_cnt != 2)
-                    include = false;
-            }
-
-            if (include)
-                total_len += strlen(loc->con);
-        }
-    }
-
-    map<int, string> outstrs;
-
-    fh << pop_indexes.size() << "    " << total_len << "\n";
-
-    //
-    // The first interleaved block starts each row with the population name,
-    // truncated or space-padded to nine characters and followed by one space.
-    // Building it in a std::string avoids writing past the end of a fixed-size
-    // buffer for long names.
-    //
-    for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
-        pop_id = pit->first;
-
-        string id_str = pop_key[pop_id].substr(0, 9);
-        id_str.resize(9, ' ');
-
-        outstrs[pop_id] = id_str + " ";
-    }
-
-    int line  = 1;
-    int index = 1;
-    int cnt   = 1;
-
-    for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-        for (uint pos = 0; pos < it->second.size(); pos++) {
-            loc = it->second[pos];
-
-            s = psum->locus(loc->id);
-            t = psum->locus_tally(loc->id);
-
-            include = true;
-            for (uint i = 0; i < loc->snps.size(); i++) {
-                uint col = loc->snps[i]->col;
-
-                if (t->nucs[col].allele_cnt != 2)
-                    include = false;
-            }
-
-            if (!include)
-                continue;
-
-            //
-            // Start from the consensus sequence and, for each population in turn,
-            // overwrite every SNP column with that population's IUPAC code. Every
-            // SNP column is rewritten for every population, so nothing carries
-            // over from one population to the next.
-            //
-            string seq(loc->con);
-
-            for (int j = 0; j < pop_cnt; j++) {
-                pop_id = psum->rev_pop_index(j);
-
-                for (uint i = 0; i < loc->snps.size(); i++) {
-                    uint col = loc->snps[i]->col;
-
-                    if (col < seq.length())
-                        seq[col] = fullseq_iupac_code(s[j]->nucs[col].p_nuc, s[j]->nucs[col].q_nuc);
-                }
-
-                outstrs[pop_id] += seq;
-            }
-
-            log_fh << line << "\t" << loc->id;
-            if (loci_ordered) log_fh << "\t" << loc->loc.chr << "\t" << loc->sort_bp() + 1;
-            log_fh << "\n";
-
-            //
-            // One row per population, then a blank line to end the interleaved block.
-            //
-            for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
-                pop_id = pit->first;
-                fh << outstrs[pop_id] << "\n";
-                outstrs[pop_id] = "";
-                line++;
-            }
-            fh << "\n";
-            line++;
-
-            par_fh << "DNA, p" << cnt << "=" << index << "-" << index + loc->len - 1 << "\n";
-            index += loc->len;
-            cnt++;
-        }
-    }
-
-    //
-    // Output the Stacks version stamp as the last line of the file; the first
-    // line is taken by the sequence count and length.
-    //
-    fh << "# Stacks v" << VERSION << "; " << " Phylip interleaved; " << date << "\n";
-
-    fh.close();
-    par_fh.close();
-    log_fh.close();
-
-    cerr << "done.\n";
-
-    return 0;
-}
-
-//
-// tally_ref_alleles()
-//
-// Count, across the first pop_cnt entries of s, which of the nucleotides
-// A, C, G and T (either case) occur as the p_nuc or q_nuc stored at
-// nucs[snp_index], and report them through p_allele and q_allele.
-//
-// If more than two different nucleotides occur, both alleles are set to 0 and
-// 0 is returned. Otherwise p_allele receives the first nucleotide present, in
-// the order A, C, G, T, and q_allele the second one (0 if there is none), and
-// 1 is returned.
-//
-int
-tally_ref_alleles(LocSum **s, int pop_cnt, int snp_index, char &p_allele, char &q_allele)
-{
-    static const char bases[4] = {'A', 'C', 'G', 'T'};
-    int tally[4] = {0, 0, 0, 0};
-
-    //
-    // Count every recognised nucleotide; anything else is ignored.
-    //
-    for (int pop = 0; pop < pop_cnt; pop++) {
-        char called[2];
-        called[0] = s[pop]->nucs[snp_index].p_nuc;
-        called[1] = s[pop]->nucs[snp_index].q_nuc;
-
-        for (int k = 0; k < 2; k++) {
-            char c = called[k];
-
-            if (c == 'A' || c == 'a')
-                tally[0]++;
-            else if (c == 'C' || c == 'c')
-                tally[1]++;
-            else if (c == 'G' || c == 'g')
-                tally[2]++;
-            else if (c == 'T' || c == 't')
-                tally[3]++;
-        }
-    }
-
-    p_allele = 0;
-    q_allele = 0;
-
-    //
-    // More than two alternative alleles cannot be represented.
-    //
-    int present = 0;
-    for (int b = 0; b < 4; b++)
-        if (tally[b] > 0) present++;
-
-    if (present > 2)
-        return 0;
-
-    //
-    // The first nucleotide present becomes the P allele, the next one the Q allele.
-    //
-    for (int b = 0; b < 4; b++) {
-        if (tally[b] == 0)
-            continue;
-
-        if (p_allele == 0) {
-            p_allele = bases[b];
-        } else {
-            q_allele = bases[b];
-            break;
-        }
-    }
-
-    return 1;
-}
-
-int
-tally_observed_haplotypes(vector<char *> &obshap, int snp_index, char &p_allele, char &q_allele)
-{
- int nucs[4] = {0};
- char nuc;
-
- //
- // Pull each allele for this SNP from the observed haplotype.
- //
- for (uint j = 0; j < obshap.size(); j++) {
- nuc = obshap[j][snp_index];
-
- switch(nuc) {
- case 'A':
- case 'a':
- nucs[0]++;
- break;
- case 'C':
- case 'c':
- nucs[1]++;
- break;
- case 'G':
- case 'g':
- nucs[2]++;
- break;
- case 'T':
- case 't':
- nucs[3]++;
- break;
- }
- }
-
- //
- // Determine how many alleles are present at this position in this population.
- // We cannot deal with more than two alternative alleles, if there are more than two
- // in a single population, print a warning and exclude this nucleotide position.
- //
- int i;
- int allele_cnt = 0;
- for (i = 0; i < 4; i++)
- if (nucs[i] > 0) allele_cnt++;
-
- if (allele_cnt > 2) {
- p_allele = 0;
- q_allele = 0;
- return -1;
- }
-
- //
- // Record which nucleotide is the P allele and which is the Q allele.
- //
- p_allele = 0;
- q_allele = 0;
-
- i = 0;
- while (p_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 0:
- p_allele = 'A';
- break;
- case 1:
- p_allele = 'C';
- break;
- case 2:
- p_allele = 'G';
- break;
- case 3:
- p_allele = 'T';
- break;
- }
- }
- i++;
- }
- while (q_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 1:
- q_allele = 'C';
- break;
- case 2:
- q_allele = 'G';
- break;
- case 3:
- q_allele = 'T';
- break;
- }
- }
- i++;
- }
-
- return 0;
-}
-
-int load_marker_list(string path, set<int> &list) {
- char line[id_len];
- ifstream fh(path.c_str(), ifstream::in);
-
- if (fh.fail()) {
- cerr << "Error opening white/black list file '" << path << "'\n";
- exit(1);
- }
-
- int marker;
- char *p, *e;
-
- while (fh.good()) {
- fh.getline(line, id_len);
-
- if (strlen(line) == 0) continue;
-
- //
- // Skip commented lines.
- //
- for (p = line; isspace(*p) && *p != '\0'; p++);
- if (*p == '#') continue;
-
- marker = (int) strtol(line, &e, 10);
-
- if (*e == '\0')
- list.insert(marker);
- }
-
- fh.close();
-
- if (list.size() == 0) {
- cerr << "Unable to load any markers from '" << path << "'\n";
- exit(1);
- }
-
- return 0;
-}
-
-int load_marker_column_list(string path, map<int, set<int> > &list) {
- char line[id_len];
- ifstream fh(path.c_str(), ifstream::in);
-
- if (fh.fail()) {
- cerr << "Error opening white/black list file '" << path << "'\n";
- exit(1);
- }
-
- vector<string> parts;
- uint marker, col;
- char *p, *e;
-
- uint line_num = 1;
- while (fh.good()) {
- fh.getline(line, id_len);
-
- if (strlen(line) == 0) continue;
-
- //
- // Skip commented lines.
- //
- for (p = line; isspace(*p) && *p != '\0'; p++);
- if (*p == '#') continue;
-
- //
- // Parse the whitelist, we expect:
- // <marker>[<tab><snp column>]
- //
- parse_tsv(line, parts);
-
- if (parts.size() > 2) {
- cerr << "Too many columns in whitelist " << path << "' at line " << line_num << "\n";
- exit(1);
-
- } else if (parts.size() == 2) {
- marker = (int) strtol(parts[0].c_str(), &e, 10);
- if (*e != '\0') {
- cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
- exit(1);
- }
- col = (int) strtol(parts[1].c_str(), &e, 10);
- if (*e != '\0') {
- cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
- exit(1);
- }
- list[marker].insert(col);
-
- } else {
- marker = (int) strtol(parts[0].c_str(), &e, 10);
- if (*e != '\0') {
- cerr << "Unable to parse whitelist, '" << path << "' at line " << line << "\n";
- exit(1);
- }
- list.insert(make_pair(marker, std::set<int>()));
- }
-
- line_num++;
- }
-
- fh.close();
-
- if (list.size() == 0) {
- cerr << "Unable to load any markers from '" << path << "'\n";
- help();
- }
-
- return 0;
-}
-
-int
-build_file_list(vector<pair<int, string> > &files,
- map<int, pair<int, int> > &pop_indexes,
- map<int, vector<int> > &grp_members)
-{
- char line[max_len];
- vector<string> parts;
- map<string, int> pop_key_rev, grp_key_rev;
- set<string> pop_names, grp_names;
- string f;
- uint len;
-
- if (pmap_path.length() > 0) {
- cerr << "Parsing population map.\n";
-
- ifstream fh(pmap_path.c_str(), ifstream::in);
-
- if (fh.fail()) {
- cerr << "Error opening population map '" << pmap_path << "'\n";
- return 0;
- }
-
- uint pop_id = 0;
- uint grp_id = 0;
-
- while (fh.good()) {
- fh.getline(line, max_len);
-
- len = strlen(line);
- if (len == 0) continue;
-
- //
- // Check that there is no carraige return in the buffer.
- //
- if (line[len - 1] == '\r') line[len - 1] = '\0';
-
- //
- // Ignore comments
- //
- if (line[0] == '#') continue;
-
- //
- // Parse the population map, we expect:
- // <file name><tab><population string>[<tab><group string>]
- //
- parse_tsv(line, parts);
-
- if (parts.size() < 2 || parts.size() > 3) {
- cerr << "Population map is not formated correctly: expecting two or three, tab separated columns, found " << parts.size() << ".\n";
- return 0;
- }
-
- //
- // Have we seen this population or group before?
- //
- if (pop_names.count(parts[1]) == 0) {
- pop_names.insert(parts[1]);
- pop_id++;
- pop_key[pop_id] = parts[1];
- pop_key_rev[parts[1]] = pop_id;
-
- //
- // If this is the first time we have seen this population, but not the
- // first time we have seen this group, add the population to the group list.
- //
- if (parts.size() == 3 && grp_key_rev.count(parts[2]) > 0)
- grp_members[grp_key_rev[parts[2]]].push_back(pop_id);
- }
- if (parts.size() == 3 && grp_names.count(parts[2]) == 0) {
- grp_names.insert(parts[2]);
- grp_id++;
- grp_key[grp_id] = parts[2];
- grp_key_rev[parts[2]] = grp_id;
-
- //
- // Associate the current population with the group.
- //
- grp_members[grp_id].push_back(pop_id);
- }
-
- //
- // Test that file exists before adding to list.
- //
- ifstream test_fh;
- gzFile gz_test_fh;
-
- f = in_path.c_str() + parts[0] + ".matches.tsv";
- test_fh.open(f.c_str());
-
- if (test_fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = in_path.c_str() + parts[0] + ".matches.tsv.gz";
- gz_test_fh = gzopen(f.c_str(), "rb");
- if (!gz_test_fh) {
- cerr << " Unable to find " << f.c_str() << ", excluding it from the analysis.\n";
- } else {
- gzclose(gz_test_fh);
- files.push_back(make_pair(pop_key_rev[parts[1]], parts[0]));
- }
- } else {
- test_fh.close();
- files.push_back(make_pair(pop_key_rev[parts[1]], parts[0]));
- }
- }
-
- fh.close();
- } else {
- cerr << "No population map specified, building file list.\n";
-
- //
- // If no population map is specified, read all the files from the Stacks directory.
- //
- uint pos;
- string file;
- struct dirent *direntry;
-
- DIR *dir = opendir(in_path.c_str());
-
- if (dir == NULL) {
- cerr << "Unable to open directory '" << in_path << "' for reading.\n";
- exit(1);
- }
-
- while ((direntry = readdir(dir)) != NULL) {
- file = direntry->d_name;
-
- if (file == "." || file == "..")
- continue;
-
- if (file.substr(0, 6) == "batch_")
- continue;
-
- pos = file.rfind(".tags.tsv");
- if (pos < file.length()) {
- files.push_back(make_pair(1, file.substr(0, pos)));
- } else {
- pos = file.rfind(".tags.tsv.gz");
- if (pos < file.length())
- files.push_back(make_pair(1, file.substr(0, pos)));
- }
- }
-
- pop_key[1] = "1";
-
- closedir(dir);
- }
-
- if (files.size() == 0) {
- cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
- return 0;
- }
-
- //
- // Sort the files according to population ID.
- //
- sort(files.begin(), files.end(), compare_pop_map);
-
- cerr << "Found " << files.size() << " input file(s).\n";
-
- //
- // Determine the start/end index for each population in the files array.
- //
- int start = 0;
- int end = 0;
- int pop_id = files[0].first;
-
- do {
- end++;
- if (pop_id != files[end].first) {
- pop_indexes[pop_id] = make_pair(start, end - 1);
- start = end;
- pop_id = files[end].first;
- }
- } while (end < (int) files.size());
-
- pop_indexes.size() == 1 ?
- cerr << " " << pop_indexes.size() << " population found\n" :
- cerr << " " << pop_indexes.size() << " populations found\n";
-
- if (population_limit > (int) pop_indexes.size()) {
- cerr << "Population limit ("
- << population_limit
- << ") larger than number of popualtions present, adjusting parameter to "
- << pop_indexes.size() << "\n";
- population_limit = pop_indexes.size();
- }
-
- map<int, pair<int, int> >::iterator it;
- for (it = pop_indexes.begin(); it != pop_indexes.end(); it++) {
- start = it->second.first;
- end = it->second.second;
- cerr << " " << pop_key[it->first] << ": ";
- for (int i = start; i <= end; i++) {
- cerr << files[i].second;
- if (i < end) cerr << ", ";
- }
- cerr << "\n";
- }
-
- //
- // If no group membership is specified in the population map, create a default
- // group with each population ID as a member.
- //
- if (grp_members.size() == 0) {
- for (it = pop_indexes.begin(); it != pop_indexes.end(); it++)
- grp_members[1].push_back(it->first);
- grp_key[1] = "1";
- }
-
- grp_members.size() == 1 ?
- cerr << " " << grp_members.size() << " group of populations found\n" :
- cerr << " " << grp_members.size() << " groups of populations found\n";
- map<int, vector<int> >::iterator git;
- for (git = grp_members.begin(); git != grp_members.end(); git++) {
- cerr << " " << grp_key[git->first] << ": ";
- for (uint i = 0; i < git->second.size(); i++) {
- cerr << pop_key[git->second[i]];
- if (i < git->second.size() - 1) cerr << ", ";
- }
- cerr << "\n";
- }
-
- return 1;
-}
-
-bool compare_pop_map(pair<int, string> a, pair<int, string> b) {
- if (a.first == b.first)
- return (a.second < b.second);
- return (a.first < b.first);
-}
-
-bool hap_compare(pair<string, int> a, pair<string, int> b) {
- return (a.second > b.second);
-}
-
-bool compare_genpos(GenPos a, GenPos b) {
- return (a.bp < b.bp);
-}
-
-int parse_command_line(int argc, char* argv[]) {
- int c;
-
while (1) {
static struct option long_options[] = {
{"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
{"verbose", no_argument, NULL, 'd'},
{"sql", no_argument, NULL, 's'},
- {"vcf", no_argument, NULL, 'V'},
+ {"vcf", no_argument, NULL, 1004},
{"vcf_haplotypes", no_argument, NULL, 'n'},
{"fasta", no_argument, NULL, 'F'},
{"fasta_strict", no_argument, NULL, 'J'},
@@ -8849,23 +5175,25 @@ int parse_command_line(int argc, char* argv[]) {
{"treemix", no_argument, NULL, 'U'},
{"merge_sites", no_argument, NULL, 'D'},
{"window_size", required_argument, NULL, 'w'},
- {"num_threads", required_argument, NULL, 't'},
+ {"threads", required_argument, NULL, 't'},
{"batch_id", required_argument, NULL, 'b'},
{"in_path", required_argument, NULL, 'P'},
+ {"out_path", required_argument, NULL, 'O'},
+ {"in_vcf", required_argument, NULL, 'V'},
{"progeny", required_argument, NULL, 'r'},
{"min_depth", required_argument, NULL, 'm'},
{"renz", required_argument, NULL, 'e'},
- {"pop_map", required_argument, NULL, 'M'},
+ {"popmap", required_argument, NULL, 'M'},
{"whitelist", required_argument, NULL, 'W'},
{"blacklist", required_argument, NULL, 'B'},
{"write_single_snp", no_argument, NULL, 'I'},
{"write_random_snp", no_argument, NULL, 'j'},
- {"ordered_export", no_argument, NULL, 'N'},
+ {"ordered_export", no_argument, NULL, 1002},
{"kernel_smoothed", no_argument, NULL, 'k'},
{"fstats", no_argument, NULL, '6'},
{"log_fst_comp", no_argument, NULL, 'l'},
- {"bootstrap_type", required_argument, NULL, 'O'},
- {"bootstrap_reps", required_argument, NULL, 'R'},
+ {"bootstrap_type", required_argument, NULL, 1001},
+ {"bootstrap_reps", required_argument, NULL, 1003},
{"bootstrap_wl", required_argument, NULL, 'Q'},
{"bootstrap", no_argument, NULL, '1'},
{"bootstrap_fst", no_argument, NULL, '2'},
@@ -8879,13 +5207,12 @@ int parse_command_line(int argc, char* argv[]) {
{"merge_prune_lim", required_argument, NULL, 'i'},
{"fst_correction", required_argument, NULL, 'f'},
{"p_value_cutoff", required_argument, NULL, 'u'},
+ {"debug_flags", required_argument, NULL, 1000},
{0, 0, 0, 0}
};
// getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "ACDEFGHJKLNSTUVYZ123456dghjklnsva:b:c:e:f:i:m:o:p:q:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, &option_index);
+ int c = getopt_long(argc, argv, "ACDEFGHJKLNSTUVYZ123456dghjklnsva:b:c:e:f:i:m:o:p:q:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, NULL);
// Detect the end of the options.
if (c == -1)
@@ -8903,6 +5230,16 @@ int parse_command_line(int argc, char* argv[]) {
break;
case 'P':
in_path = optarg;
+ if (!in_path.empty() && in_path.back() != '/')
+ in_path += "/";
+ break;
+ case 'O':
+ out_path = optarg;
+ if (!out_path.empty() && out_path.back() != '/')
+ out_path += "/";
+ break;
+ case 'V':
+ in_vcf_path = optarg;
break;
case 'M':
pmap_path = optarg;
@@ -8912,9 +5249,23 @@ int parse_command_line(int argc, char* argv[]) {
break;
case 'i':
merge_prune_lim = is_double(optarg);
+ if (merge_prune_lim > 1.0)
+ merge_prune_lim = merge_prune_lim / 100;
+
+ if (merge_prune_lim < 0 || merge_prune_lim > 1.0) {
+ cerr << "Unable to parse the merge sites pruning limit.\n";
+ help();
+ }
break;
case 'q':
max_obs_het = is_double(optarg);
+ if (max_obs_het > 1)
+ max_obs_het = max_obs_het / 100;
+
+ if (max_obs_het < 0 || max_obs_het > 1.0) {
+ cerr << "Unable to parse the maximum observed heterozygosity.\n";
+ help();
+ }
break;
case 'b':
batch_id = is_integer(optarg);
@@ -8925,6 +5276,13 @@ int parse_command_line(int argc, char* argv[]) {
break;
case 'r':
sample_limit = atof(optarg);
+ if (sample_limit > 1)
+ sample_limit = sample_limit / 100;
+
+ if (sample_limit > 1.0) {
+ cerr << "Unable to parse the sample limit frequency\n";
+ help();
+ }
break;
case 'p':
population_limit = atoi(optarg);
@@ -8958,7 +5316,7 @@ int parse_command_line(int argc, char* argv[]) {
case '5':
bootstrap_pifis = true;
break;
- case 'O':
+ case 1001:
if (strcasecmp(optarg, "exact") == 0)
bootstrap_type = bs_exact;
else if (strcasecmp(optarg, "approx") == 0)
@@ -8968,7 +5326,7 @@ int parse_command_line(int argc, char* argv[]) {
help();
}
break;
- case 'R':
+ case 1003:
bootstrap_reps = atoi(optarg);
break;
case 'Q':
@@ -8985,13 +5343,13 @@ int parse_command_line(int argc, char* argv[]) {
case 'j':
write_random_snp = true;
break;
- case 'N':
+ case 1002:
ordered_export = true;
break;
case 's':
sql_out = true;
break;
- case 'V':
+ case 1004:
vcf_out = true;
break;
case 'n':
@@ -9053,6 +5411,13 @@ int parse_command_line(int argc, char* argv[]) {
break;
case 'a':
minor_allele_freq = atof(optarg);
+ if (minor_allele_freq > 1)
+ minor_allele_freq = minor_allele_freq / 100;
+
+ if (minor_allele_freq < 0 || minor_allele_freq > 0.5) {
+ cerr << "Unable to parse the minor allele frequency.\n";
+ help();
+ }
break;
case 'f':
if (strcasecmp(optarg, "p_value") == 0)
@@ -9071,17 +5436,44 @@ int parse_command_line(int argc, char* argv[]) {
break;
case 'e':
enz = optarg;
+ if (renz.count(enz) == 0) {
+ cerr << "Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
+ help();
+ }
break;
case 'w':
sigma = atof(optarg);
break;
case 'v':
version();
+ exit(0);
break;
case '?':
// getopt_long already printed an error message.
help();
break;
+ case 1000:
+ {
+ static const set<string> known_debug_flags = {"VCFCOMP"};
+ stringstream ss (optarg);
+ string s;
+ while (std::getline(ss, s, ',')) {
+ if (known_debug_flags.count(s)) {
+ debug_flags.insert(s);
+ } else {
+ cerr << "DEBUG> Error: Unknown error flag '" << s << "'.\n";
+ return -1;
+ }
+ }
+ cerr << "DEBUG> Debug flag(s) : '" << optarg << "'.\n";
+
+ if (debug_flags.count("VCFCOMP") && not write_random_snp) {
+ write_single_snp = true;
+ cerr << "DEBUG> Added --write_single_snp.\n";
+ }
+
+ break;
+ }
default:
cerr << "Unknown command line option: '" << (char) c << "'\n";
help();
@@ -9089,68 +5481,57 @@ int parse_command_line(int argc, char* argv[]) {
}
}
- if (in_path.length() == 0) {
- cerr << "You must specify a path to the directory containing Stacks output files.\n";
- help();
- }
-
- if (in_path.at(in_path.length() - 1) != '/')
- in_path += "/";
-
- if (pmap_path.length() == 0) {
- cerr << "A population map was not specified, all samples will be read from '" << in_path << "' as a single popultaion.\n";
- }
+ //
+ // Check argument constraints.
+ //
- if (batch_id < 0) {
- cerr << "You must specify a batch ID.\n";
+ if (not in_path.empty() && not in_vcf_path.empty()) {
+ cerr << "Error: Please specify either '-P' or '--in_vcf', not both.\n";
help();
- }
-
- if (enz.length() > 0 && renz.count(enz) == 0) {
- cerr << "Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
+ } else if (not in_path.empty()) {
+ input_mode = InputMode::stacks;
+ } else if (not in_vcf_path.empty()) {
+ input_mode = InputMode::vcf;
+ } else {
+ cerr << "Error: One of '--in_path' or '--in_vcf' is required.\n";
help();
}
- if (merge_prune_lim != 1.0) {
- if (merge_prune_lim > 1.0)
- merge_prune_lim = merge_prune_lim / 100;
-
- if (merge_prune_lim < 0 || merge_prune_lim > 1.0) {
- cerr << "Unable to parse the merge sites pruning limit.\n";
- help();
- }
- }
+ if (input_mode == InputMode::stacks) {
- if (minor_allele_freq > 0) {
- if (minor_allele_freq > 1)
- minor_allele_freq = minor_allele_freq / 100;
+ if (pmap_path.empty())
+ cerr << "A population map was not specified, all samples will be read from '" << in_path << "' as a single popultaion.\n";
- if (minor_allele_freq > 0.5) {
- cerr << "Unable to parse the minor allele frequency.\n";
+ if (batch_id < 0) {
+ cerr << "You must specify a batch ID.\n";
help();
}
- }
- if (max_obs_het != 1.0) {
- if (max_obs_het > 1)
- max_obs_het = max_obs_het / 100;
+ if (out_path.empty())
+ out_path = in_path;
- if (max_obs_het < 0 || max_obs_het > 1.0) {
- cerr << "Unable to parse the maximum observed heterozygosity.\n";
- help();
- }
- }
+ out_prefix = string("batch_") + to_string(batch_id);
- if (sample_limit > 0) {
- if (sample_limit > 1)
- sample_limit = sample_limit / 100;
+ } else if (input_mode == InputMode::vcf) {
- if (sample_limit > 1.0) {
- cerr << "Unable to parse the sample limit frequency\n";
+ if (out_path.empty()) {
+ cerr << "Error: Malformed arguments: input mode 'vcf' requires an output directory (--out_path).\n";
help();
}
+
+ // Determine out_prefix
+ string fname = in_vcf_path;
+ if (in_vcf_path.find_last_of('/') != string::npos && in_vcf_path.back() != '/')
+ fname = in_vcf_path.substr(in_vcf_path.find_last_of('/')+1);
+ size_t trim = 0;
+ if (fname.length() > 4 && fname.substr(fname.length()-4) == ".vcf")
+ trim = 4;
+ else if (fname.length() > 7 && fname.substr(fname.length()-7) == ".vcf.gz")
+ trim = 7;
+ out_prefix = fname.substr(0, fname.length()-trim);
}
+ // Other
if (write_single_snp && write_random_snp) {
cerr << "Please specify either '--write_single_snp' or '--write_random_snp', not both.\n";
help();
@@ -9165,74 +5546,83 @@ int parse_command_line(int argc, char* argv[]) {
}
void version() {
- std::cerr << "populations " << VERSION << "\n\n";
-
- exit(0);
+ cerr << "populations " << VERSION << "\n\n";
}
void help() {
- std::cerr << "populations " << VERSION << "\n"
- << "populations -b batch_id -P path -M path [-r min] [-m min] [-B blacklist] [-W whitelist] [-s] [-e renz] [-t threads] [-v] [-h]" << "\n"
- << " b: Batch ID to examine when exporting from the catalog.\n"
- << " P: path to the Stacks output files.\n"
- << " M: path to the population map, a tab separated file describing which individuals belong in which population.\n"
- << " s: output a file to import results into an SQL database.\n"
- << " B: specify a file containing Blacklisted markers to be excluded from the export.\n"
- << " W: specify a file containing Whitelisted markers to include in the export.\n"
- << " e: restriction enzyme, required if generating 'genomic' output.\n"
- << " t: number of threads to run in parallel sections of code.\n"
- << " v: print program version." << "\n"
- << " h: display this help messsage." << "\n\n"
- << " Merging and Phasing:\n"
- << " --merge_sites: merge loci that were produced from the same restriction enzyme cutsite (requires reference-aligned data).\n"
- << " --merge_prune_lim: when merging adjacent loci, if at least X% samples posses both loci prune the remaining samples out of the analysis.\n"
- << " Data Filtering:\n"
- << " r: minimum percentage of individuals in a population required to process a locus for that population.\n"
- << " p: minimum number of populations a locus must be present in to process a locus.\n"
- << " m: specify a minimum stack depth required for individuals at a locus.\n"
- << " f: specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'.\n"
- << " --min_maf: specify a minimum minor allele frequency required to process a nucleotide site at a locus (0 < min_maf < 0.5).\n"
- << " --max_obs_het: specify a maximum observed heterozygosity required to process a nucleotide site at a locus.\n"
- << " --p_value_cutoff [num]: required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction.\n"
- << " --lnl_lim [num]: filter loci with log likelihood values below this threshold.\n"
- << " --write_single_snp: restrict data analysis to only the first SNP per locus.\n"
- << " --write_random_snp: restrict data analysis to one random SNP per locus.\n\n"
- << " Fstats:\n"
- << " --fstats: enable SNP and haplotype-based F statistics.\n\n"
- << " Kernel-smoothing algorithm:\n"
- << " k: enable kernel-smoothed Pi, Fis, Fst, Fst', and Phi_st calculations.\n"
- << " --window_size [num]: distance over which to average values (sigma, default 150,000bp; window is 3sigma in length).\n\n"
- << " Bootstrap Resampling:\n"
- << " --bootstrap: turn on boostrap resampling for all smoothed statistics.\n"
- << " --bootstrap_pifis: turn on boostrap resampling for smoothed SNP-based Pi and Fis calculations.\n"
- << " --bootstrap_fst: turn on boostrap resampling for smoothed Fst calculations based on pairwise population comparison of SNPs.\n"
- << " --bootstrap_div: turn on boostrap resampling for smoothed haplotype diveristy and gene diversity calculations based on haplotypes.\n"
- << " --bootstrap_phist: turn on boostrap resampling for smoothed Phi_st calculations based on haplotypes.\n"
- << " --bootstrap_reps [num]: number of bootstrap resamplings to calculate (default 100).\n"
- << " --bootstrap_wl [path]: only bootstrap loci contained in this whitelist.\n\n"
- << " File ouput options:\n"
- << " --ordered_export: if data is reference aligned, exports will be ordered; only a single representative of each overlapping site.\n"
- << " --genomic: output each nucleotide position (fixed or polymorphic) in all population members to a file.\n"
- << " --fasta: output full sequence for each unique haplotype, from each sample locus in FASTA format, regardless of plausibility.\n"
- << " --fasta_strict: output full sequence for each haplotype, from each sample locus in FASTA format, only for biologically plausible loci.\n"
- << " --vcf: output SNPs in Variant Call Format (VCF).\n"
- << " --vcf_haplotypes: output haplotypes in Variant Call Format (VCF).\n"
- << " --genepop: output results in GenePop format.\n"
- << " --structure: output results in Structure format.\n"
- << " --phase: output genotypes in PHASE format.\n"
- << " --fastphase: output genotypes in fastPHASE format.\n"
- << " --beagle: output genotypes in Beagle format.\n"
- << " --beagle_phased: output haplotypes in Beagle format.\n"
- << " --plink: output genotypes in PLINK format.\n"
- << " --hzar: output genotypes in Hybrid Zone Analysis using R (HZAR) format.\n"
- << " --phylip: output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction.\n"
- << " --phylip_var: include variable sites in the phylip output encoded using IUPAC notation.\n"
- << " --phylip_var_all: include all sequence as well as variable sites in the phylip output encoded using IUPAC notation.\n"
- << " --treemix: output SNPs in a format useable for the TreeMix program (Pickrell and Pritchard).\n\n"
- << " Debugging:\n"
- << " --verbose: turn on additional logging.\n"
- << " --log_fst_comp: log components of Fst/Phi_st calculations to a file.\n";
- // << " --bootstrap_type [exact|approx]: enable bootstrap resampling for population statistics (reference genome required).\n"
+ cerr << "populations " << VERSION << "\n"
+ << "Usage:\n"
+ << "populations -P dir -b batch_id [-O dir] [-M popmap] (filters) [--fstats] [-k [--window_size=150000] [--bootstrap [-N 100]]] (output formats)\n"
+ << "populations -V vcf -O dir [-M popmap] (filters) [--fstats] [-k [--window_size=150000] [--bootstrap [-N 100]]] (output formats)\n"
+ << "\n"
+ << " -P,--in_path: path to the directory containing the Stacks files.\n"
+ << " -b,--batch_id: Batch ID to examine when exporting from the catalog (required by -P).\n"
+ << " -V,--in_vcf: path to an input VCF file.\n"
+ << " -O,--out_path: path to a directory where to white the output files. (Required by -V; otherwise defaults to value of -P.)\n"
+ << " -M,--popmap: path to a population map. (Format is 'SAMPLE1\tPOP1\\n...'.)\n"
+ << " -t,--threads: number of threads to run in parallel sections of code.\n"
+ << " -s,--sql_out: output a file to import results into an SQL database.\n"
+ << "\n"
+ << "Data Filtering:\n"
+ << " -p [int]: minimum number of populations a locus must be present in to process a locus.\n"
+ << " -r [float]: minimum percentage of individuals in a population required to process a locus for that population.\n"
+ << " --min_maf [float]: specify a minimum minor allele frequency required to process a nucleotide site at a locus (0 < min_maf < 0.5).\n"
+ << " --max_obs_het [float]: specify a maximum observed heterozygosity required to process a nucleotide site at a locus.\n"
+ << " -m [int]: specify a minimum stack depth required for individuals at a locus.\n"
+ << " --lnl_lim [float]: filter loci with log likelihood values below this threshold.\n"
+ << " --write_single_snp: restrict data analysis to only the first SNP per locus.\n"
+ << " --write_random_snp: restrict data analysis to one random SNP per locus.\n"
+ << " -B: path to a file containing Blacklisted markers to be excluded from the export.\n"
+ << " -W: path to a file containing Whitelisted markers to include in the export.\n"
+ << "\n"
+ << "Merging and Phasing:\n"
+ << " -e,--renz: restriction enzyme name.\n"
+ << " --merge_sites: merge loci that were produced from the same restriction enzyme cutsite (requires reference-aligned data).\n"
+ << " --merge_prune_lim: when merging adjacent loci, if at least X% samples posses both loci prune the remaining samples out of the analysis.\n"
+ << "\n"
+ << "Fstats:\n"
+ << " --fstats: enable SNP and haplotype-based F statistics.\n"
+ << " --fst_correction: specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'. Default: off.\n"
+ << " --p_value_cutoff [float]: maximum p-value to keep an Fst measurement. Default: 0.05. (Also used as base for Bonferroni correction.)\n"
+ << "\n"
+ << "Kernel-smoothing algorithm:\n"
+ << " -k,--kernel_smoothed: enable kernel-smoothed Pi, Fis, Fst, Fst', and Phi_st calculations.\n"
+ << " --sigma [int]: standard deviation of the kernel smoothing weight distribution. Default 150kb.\n"
+ << " --bootstrap: turn on boostrap resampling for all smoothed statistics.\n"
+ << " -N,--bootstrap_reps [int]: number of bootstrap resamplings to calculate (default 100).\n"
+ << " --bootstrap_pifis: turn on boostrap resampling for smoothed SNP-based Pi and Fis calculations.\n"
+ << " --bootstrap_fst: turn on boostrap resampling for smoothed Fst calculations based on pairwise population comparison of SNPs.\n"
+ << " --bootstrap_div: turn on boostrap resampling for smoothed haplotype diveristy and gene diversity calculations based on haplotypes.\n"
+ << " --bootstrap_phist: turn on boostrap resampling for smoothed Phi_st calculations based on haplotypes.\n"
+ << " --bootstrap_wl [path]: only bootstrap loci contained in this whitelist.\n"
+ << "\n"
+ << "File output options:\n"
+ << " --ordered_export: if data is reference aligned, exports will be ordered; only a single representative of each overlapping site.\n"
+ << " --genomic: output each nucleotide position (fixed or polymorphic) in all population members to a file (requires --renz).\n"
+ << " --fasta: output full sequence for each unique haplotype, from each sample locus in FASTA format, regardless of plausibility.\n"
+ << " --fasta_strict: output full sequence for each haplotype, from each sample locus in FASTA format, only for biologically plausible loci.\n"
+ << " --vcf: output SNPs in Variant Call Format (VCF).\n"
+ << " --vcf_haplotypes: output haplotypes in Variant Call Format (VCF).\n"
+ << " --genepop: output results in GenePop format.\n"
+ << " --structure: output results in Structure format.\n"
+ << " --phase: output genotypes in PHASE format.\n"
+ << " --fastphase: output genotypes in fastPHASE format.\n"
+ << " --beagle: output genotypes in Beagle format.\n"
+ << " --beagle_phased: output haplotypes in Beagle format.\n"
+ << " --plink: output genotypes in PLINK format.\n"
+ << " --hzar: output genotypes in Hybrid Zone Analysis using R (HZAR) format.\n"
+ << " --phylip: output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction.\n"
+ << " --phylip_var: include variable sites in the phylip output encoded using IUPAC notation.\n"
+ << " --phylip_var_all: include all sequence as well as variable sites in the phylip output encoded using IUPAC notation.\n"
+ << " --treemix: output SNPs in a format useable for the TreeMix program (Pickrell and Pritchard).\n"
+ << "\n"
+ << "Additional options:\n"
+ << " -h,--help: display this help messsage.\n"
+ << " -v,--version: print program version.\n"
+ << " --verbose: turn on additional logging.\n"
+ << (" --log_fst_comp: log components of Fst/Phi_st calculations to a file.\n");
+
+ // << " --bootstrap_type [exact|approx]: enable bootstrap resampling for population statistics (reference genome required).\n"
exit(0);
}
diff --git a/src/populations.h b/src/populations.h
index b10a831..38dbb56 100644
--- a/src/populations.h
+++ b/src/populations.h
@@ -75,108 +75,103 @@ enum corr_type {p_value, bonferroni_win, bonferroni_gen, no_correction};
enum bs_type {bs_exact, bs_approx, bs_none};
enum merget {merge_sink, merge_src};
enum phaset {merge_failure, simple_merge, complex_phase, nomapping_fail, multimapping_fail, multiple_fails};
+enum class InputMode {stacks, vcf};
const int max_snp_dist = 500;
-class GenPos {
-public:
- uint id;
- uint bp;
- uint snp_index;
- loc_type type;
-
- GenPos(int id, int snp_index, int bp) {
- this->id = id;
- this->snp_index = snp_index;
- this->bp = bp;
- this->type = snp;
- }
- GenPos(int id, int snp_index, int bp, loc_type type) {
- this->id = id;
- this->snp_index = snp_index;
- this->bp = bp;
- this->type = type;
- }
-};
-
void help( void );
void version( void );
int parse_command_line(int, char**);
-int build_file_list(vector<pair<int, string> > &, map<int, pair<int, int> > &, map<int, vector<int> > &);
+int build_file_list();
int load_marker_list(string, set<int> &);
int load_marker_column_list(string, map<int, set<int> > &);
-int apply_locus_constraints(map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, pair<int, int> > &, ofstream &);
-int prune_polymorphic_sites(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, set<int> > &, set<int> &, ofstream &);
+int apply_locus_constraints(map<int, CSLocus *> &, PopMap<CSLocus> *, ofstream &);
+int prune_polymorphic_sites(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, set<int> > &, set<int> &, ofstream &);
int log_haplotype_cnts(map<int, CSLocus *> &, ofstream &);
bool order_unordered_loci(map<int, CSLocus *> &);
int merge_shared_cutsite_loci(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<merget, int> > &, ofstream &);
phaset merge_and_phase_loci(PopMap<CSLocus> *, CSLocus *, CSLocus *, set<int> &, ofstream &);
int merge_datums(int, int, Datum **, Datum **, set<string> &, int);
int merge_csloci(CSLocus *, CSLocus *, set<string> &);
-int datum_adjust_snp_positions(map<int, pair<merget, int> > &, CSLocus *, Datum *, map<int, SNPRes *> &);
int tabulate_haplotypes(map<int, CSLocus *> &, PopMap<CSLocus> *);
int create_genotype_map(CSLocus *, PopMap<CSLocus> *);
int call_population_genotypes(CSLocus *, PopMap<CSLocus> *);
-int tally_haplotype_freq(CSLocus *, PopMap<CSLocus> *, int &, double &, string &);
-int translate_genotypes(map<string, string> &, map<string, map<string, string> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, set<int> &);
+int translate_genotypes(map<string, string> &, map<string, map<string, string> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, set<int> &); // This function doesn't exist (March 24, 2016)
int correct_fst_bonferroni_win(vector<PopPair *> &);
-int bootstrap_fst_approximate_dist(vector<double> &, vector<int> &, double *, int *, map<int, vector<double> > &);
+int bootstrap_fst_approximate_dist(vector<double> &, vector<int> &, double *, int *, map<int, vector<double> > &); // not used (March 23, 2016)
int kernel_smoothed_popstats(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, int, ofstream &);
-int bootstrap_popstats_approximate_dist(vector<double> &, vector<double> &, vector<int> &, double *, int *, int, map<int, vector<double> > &, map<int, vector<double> > &);
+int bootstrap_popstats_approximate_dist(vector<double> &, vector<double> &, vector<int> &, double *, int *, int, map<int, vector<double> > &, map<int, vector<double> > &); // not used (March 23, 2016)
double bootstrap_approximate_pval(int, double, map<int, vector<double> > &);
-int calculate_summary_stats(vector<pair<int, string> > &, map<int, pair<int, int> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
-int calculate_haplotype_stats(vector<pair<int, string> > &, map<int, pair<int, int> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int calculate_summary_stats(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int calculate_haplotype_stats(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
int kernel_smoothed_hapstats(vector<CSLocus *> &, PopSum<CSLocus> *, int, double *);
-int calculate_haplotype_divergence(vector<pair<int, string> > &, map<int, pair<int, int> > &, map<int, vector<int> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
-int calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &, map<int, pair<int, int> > &, map<int, vector<int> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
-double count_haplotypes_at_locus(int, int, Datum **, map<string, double> &);
-bool fixed_locus(map<int, pair<int, int> > &, Datum **, vector<int> &);
-bool uncalled_haplotype(const char *);
+int calculate_haplotype_divergence(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+int calculate_haplotype_divergence_pairwise(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *);
+bool fixed_locus(Datum **, vector<int> &);
int nuc_substitution_dist(map<string, int> &, double **);
int nuc_substitution_identity(map<string, int> &, double **);
int nuc_substitution_identity_max(map<string, int> &, double **);
-HapStat *haplotype_amova(map<int, int> &, map<int, pair<int, int> > &, Datum **, LocSum **, vector<int> &);
+HapStat *haplotype_amova(Datum **, LocSum **, vector<int> &);
double amova_ssd_total(vector<string> &, map<string, int> &, double **);
double amova_ssd_wp(vector<int> &, map<int, vector<int> > &, map<string, int> &, map<int, vector<string> > &, double **);
double amova_ssd_ap_wg(vector<int> &, map<int, vector<int> > &, map<string, int> &, map<int, vector<string> > &, double **, double **);
double amova_ssd_ag(vector<int> &, map<int, vector<int> > &, map<string, int> &, map<int, vector<string> > &, double **, double);
-double haplotype_d_est(map<int, pair<int, int> > &, Datum **, LocSum **, vector<int> &);
+double haplotype_d_est(Datum **, LocSum **, vector<int> &);
LocStat *haplotype_diversity(int, int, Datum **);
+double count_haplotypes_at_locus(int, int, Datum**, map<string, double>&);
-int write_sql(map<int, CSLocus *> &, PopMap<CSLocus> *);
-int write_fst_stats(vector<pair<int, string> > &, map<int, pair<int, int> > &, map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, ofstream &);
-int write_generic(map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, bool);
-int write_genomic(map<int, CSLocus *> &, PopMap<CSLocus> *);
-int write_fasta(map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, vector<int> &);
-int write_strict_fasta(map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, vector<int> &);
-int write_vcf(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, string> &, vector<int> &, map<int, pair<merget, int> > &);
-int write_vcf_ordered(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, string> &, vector<int> &, map<int, pair<merget, int> > &, ofstream &);
-int write_vcf_haplotypes(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, string> &, vector<int> &);
-int populate_snp_calls(map<int, CSLocus *> &, PopMap<CSLocus> *, map<int, string> &, vector<int> &, map<int, pair<merget, int> > &);
-int find_datum_allele_depths(Datum *, int, char, char, int, int &, int &);
-int write_genepop(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_genepop_ordered(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &, ofstream &);
-int write_structure(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_structure_ordered(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &, ofstream &);
-int write_phase(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_fastphase(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_beagle(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_beagle_phased(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_plink(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_hzar(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_treemix(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_phylip(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int write_fullseq_phylip(map<int, CSLocus *> &, PopMap<CSLocus> *, PopSum<CSLocus> *, map<int, pair<int, int> > &, map<int, string> &);
-int tally_observed_haplotypes(vector<char *> &, int, char &, char &);
-int tally_ref_alleles(LocSum **, int, int, char &, char &);
-int load_snp_calls(string, PopMap<CSLocus> *);
-
-bool compare_pop_map(pair<int, string>, pair<int, string>);
+//int tally_ref_alleles(LocSum **, int, int, char &, char &); //unused; also commented out in the .cc
+//int load_snp_calls(string, PopMap<CSLocus> *); //no implementation
+
+//bool compare_pop_map(pair<int, string>, pair<int, string>); //no implementation; the function is in [sql_utilities.h]
bool hap_compare(pair<string, int>, pair<string, int>);
-bool compare_genpos(GenPos, GenPos);
+
+void vcfcomp_simplify_pmap (map<int, CSLocus*>& catalog, PopMap<CSLocus>* pmap);
+
+//
+// Report whether a haplotype string contains at least one uncalled
+// nucleotide, i.e. an 'N' or 'n' anywhere before the terminating NUL.
+//
+inline
+bool uncalled_haplotype(const char *haplotype)
+{
+    const char *cursor = haplotype;
+
+    while (*cursor != '\0') {
+        const char nuc = *cursor++;
+        if (nuc == 'n' || nuc == 'N')
+            return true;
+    }
+
+    return false;
+}
+
+//
+// Tally the haplotypes observed in samples d[start..end] (both bounds
+// inclusive) into hap_cnts, keyed by haplotype sequence.
+//
+// A sample with exactly one observed haplotype is treated as a homozygote
+// and contributes two copies of it; otherwise each observed haplotype
+// contributes one copy. Samples with a NULL datum or with more than two
+// observed haplotypes are skipped, as is any haplotype that contains an
+// uncalled nucleotide ('N'/'n').
+//
+// Returns the total number of haplotype copies counted. hap_cnts is not
+// cleared here, so counts accumulate onto whatever entries it already holds.
+//
+inline
+double count_haplotypes_at_locus(int start, int end, Datum **d, map<string, double> &hap_cnts)
+{
+    double n = 0.0;
+
+    for (int i = start; i <= end; i++) {
+        if (d[i] == NULL)
+            // No data, ignore this sample.
+            continue;
+
+        const vector<char*>& haps = d[i]->obshap;
+        if (haps.size() > 2) {
+            // Too many haplotypes, ignore this sample.
+            continue;
+        } else if (haps.size() == 1) {
+            // Homozygote.
+            if (!uncalled_haplotype(haps[0])) {
+                n += 2;
+                hap_cnts[haps[0]] += 2;
+            }
+        } else {
+            // Heterozygote. Each haplotype is checked on its own: testing
+            // only the first one would count an uncalled second haplotype
+            // and drop a called one whenever the first contains an N.
+            for (uint j = 0; j < haps.size(); j++) {
+                if (!uncalled_haplotype(haps[j])) {
+                    n++;
+                    hap_cnts[haps[j]]++;
+                }
+            }
+        }
+    }
+    return n;
+}
#endif // __POPULATIONS_H__
diff --git a/src/pstacks.cc b/src/pstacks.cc
index 45d3159..11bb13b 100644
--- a/src/pstacks.cc
+++ b/src/pstacks.cc
@@ -52,17 +52,17 @@ int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
cerr << "Min depth of coverage to report a stack: " << min_stack_cov << "\n"
- << "Model type: ";
+ << "Model type: ";
switch (model_type) {
case snp:
- cerr << "SNP\n";
- break;
+ cerr << "SNP\n";
+ break;
case fixed:
- cerr << "Fixed\n";
- break;
+ cerr << "Fixed\n";
+ break;
case bounded:
- cerr << "Bounded; lower epsilon bound: " << bound_low << "; upper bound: " << bound_high << "\n";
- break;
+ cerr << "Bounded; lower epsilon bound: " << bound_low << "; upper bound: " << bound_high << "\n";
+ break;
}
cerr << "Alpha significance level for model: " << alpha << "\n";
@@ -72,17 +72,17 @@ int main (int argc, char* argv[]) {
// http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value
//
if (alpha == 0.1) {
- heterozygote_limit = -2.71;
- homozygote_limit = 2.71;
+ heterozygote_limit = -2.71;
+ homozygote_limit = 2.71;
} else if (alpha == 0.05) {
- heterozygote_limit = -3.84;
- homozygote_limit = 3.84;
+ heterozygote_limit = -3.84;
+ homozygote_limit = 3.84;
} else if (alpha == 0.01) {
- heterozygote_limit = -6.64;
- homozygote_limit = 6.64;
+ heterozygote_limit = -6.64;
+ homozygote_limit = 6.64;
} else if (alpha == 0.001) {
- heterozygote_limit = -10.83;
- homozygote_limit = 10.83;
+ heterozygote_limit = -10.83;
+ homozygote_limit = 10.83;
}
//
@@ -132,30 +132,30 @@ int call_alleles(MergedStack *mtag, vector<DNANSeq *> &reads) {
DNANSeq *d;
for (row = 0; row < height; row++) {
- allele.clear();
+ allele.clear();
- uint snp_cnt = 0;
+ uint snp_cnt = 0;
- for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
- if ((*snp)->type != snp_type_het) continue;
+ for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
+ if ((*snp)->type != snp_type_het) continue;
- snp_cnt++;
+ snp_cnt++;
d = reads[row];
- base = (*d)[(*snp)->col];
-
- //
- // Check to make sure the nucleotide at the location of this SNP is
- // of one of the two possible states the multinomial model called.
- //
- if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
- allele += base;
- else
- break;
- }
-
- if (snp_cnt > 0 && allele.length() == snp_cnt)
- mtag->alleles[allele]++;
+ base = (*d)[(*snp)->col];
+
+ //
+ // Check to make sure the nucleotide at the location of this SNP is
+ // of one of the two possible states the multinomial model called.
+ //
+ if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
+ allele += base;
+ else
+ break;
+ }
+
+ if (snp_cnt > 0 && allele.length() == snp_cnt)
+ mtag->alleles[allele]++;
}
return 0;
@@ -169,90 +169,90 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, PStack *> &unique,
map<int, MergedStack *>::iterator it;
vector<int> keys;
for (it = merged.begin(); it != merged.end(); it++)
- keys.push_back(it->first);
+ keys.push_back(it->first);
int i;
#pragma omp parallel private(i)
{
#pragma omp for schedule(dynamic)
- for (i = 0; i < (int) keys.size(); i++) {
- MergedStack *mtag;
- PStack *utag;
-
- mtag = merged[keys[i]];
-
- //
- // Create a two-dimensional array, each row containing one read. For
- // each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
- //
- vector<int>::iterator j;
- vector<DNANSeq *> reads;
-
- for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
- utag = unique[*j];
-
- for (uint k = 0; k < utag->count; k++) {
- reads.push_back(utag->seq);
- }
- }
-
- //
- // Iterate over each column of the array and call the consensus base.
- //
- int row, col;
- int length = reads[0]->size();
- int height = reads.size();
- string con;
- map<char, int> nuc;
- map<char, int>::iterator max, n;
- DNANSeq *d;
-
- for (col = 0; col < length; col++) {
- nuc['A'] = 0;
- nuc['C'] = 0;
- nuc['G'] = 0;
- nuc['T'] = 0;
- nuc['N'] = 0;
-
- for (row = 0; row < height; row++) {
- d = reads[row];
- if (nuc.count((*d)[col]))
- nuc[(*d)[col]]++;
- }
-
- //
- // Find the base with a plurality of occurances and call it.
- //
- max = nuc.end();
-
- for (n = nuc.begin(); n != nuc.end(); n++) {
- if (n->first == 'N')
- continue;
- if (max == nuc.end() || n->second > max->second)
- max = n;
- }
- con += max->second == 0 ? 'N' : max->first;
-
- //
- // Search this column for the presence of a SNP
- //
- if (invoke_model)
- switch(model_type) {
- case snp:
+ for (i = 0; i < (int) keys.size(); i++) {
+ MergedStack *mtag;
+ PStack *utag;
+
+ mtag = merged[keys[i]];
+
+ //
+ // Create a two-dimensional array, each row containing one read. For
+ // each unique tag that has been merged together, add the sequence for
+ // that tag into our array as many times as it originally occurred.
+ //
+ vector<int>::iterator j;
+ vector<DNANSeq *> reads;
+
+ for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
+ utag = unique[*j];
+
+ for (uint k = 0; k < utag->count; k++) {
+ reads.push_back(utag->seq);
+ }
+ }
+
+ //
+ // Iterate over each column of the array and call the consensus base.
+ //
+ int row, col;
+ int length = reads[0]->size();
+ int height = reads.size();
+ string con;
+ map<char, int> nuc;
+ map<char, int>::iterator max, n;
+ DNANSeq *d;
+
+ for (col = 0; col < length; col++) {
+ nuc['A'] = 0;
+ nuc['C'] = 0;
+ nuc['G'] = 0;
+ nuc['T'] = 0;
+ nuc['N'] = 0;
+
+ for (row = 0; row < height; row++) {
+ d = reads[row];
+ if (nuc.count((*d)[col]))
+ nuc[(*d)[col]]++;
+ }
+
+ //
+ // Find the base with a plurality of occurrences and call it.
+ //
+ max = nuc.end();
+
+ for (n = nuc.begin(); n != nuc.end(); n++) {
+ if (n->first == 'N')
+ continue;
+ if (max == nuc.end() || n->second > max->second)
+ max = n;
+ }
+ con += max->second == 0 ? 'N' : max->first;
+
+ //
+ // Search this column for the presence of a SNP
+ //
+ if (invoke_model)
+ switch(model_type) {
+ case snp:
call_multinomial_snp(mtag, col, nuc, true);
- break;
- case bounded:
- call_bounded_multinomial_snp(mtag, col, nuc, true);
- break;
- case fixed:
+ break;
+ case bounded:
+ call_bounded_multinomial_snp(mtag, col, nuc, true);
+ break;
+ case fixed:
call_multinomial_fixed(mtag, col, nuc);
- break;
- }
- }
+ break;
+ }
+ }
- if (invoke_model) {
- call_alleles(mtag, reads);
+ if (invoke_model) {
+ call_alleles(mtag, reads);
if (model_type == fixed) {
//
@@ -260,27 +260,27 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, PStack *> &unique,
//
vector<SNP *>::iterator s;
for (s = mtag->snps.begin(); s != mtag->snps.end(); s++) {
- if ((*s)->type == snp_type_unk)
- con.replace((*s)->col, 1, "N");
+ if ((*s)->type == snp_type_unk)
+ con.replace((*s)->col, 1, "N");
}
}
}
- mtag->add_consensus(con.c_str());
-
- //
- // If SNPs were called at this locus but no alleles could be determined,
- // blacklist this tag. This can occur if there are two many uncalled bases
- // in the locus (Ns), such that haplotypes can't be consistently read
- // due to the presence of the Ns in the reads.
- //
- if (mtag->alleles.empty())
- for (uint j = 0; j < mtag->snps.size(); j++)
- if (mtag->snps[j]->type == snp_type_het) {
- mtag->blacklisted = 1;
- break;
- }
- }
+ mtag->add_consensus(con.c_str());
+
+ //
+ // If SNPs were called at this locus but no alleles could be determined,
+ // blacklist this tag. This can occur if there are too many uncalled bases
+ // in the locus (Ns), such that haplotypes can't be consistently read
+ // due to the presence of the Ns in the reads.
+ //
+ if (mtag->alleles.empty())
+ for (uint j = 0; j < mtag->snps.size(); j++)
+ if (mtag->snps[j]->type == snp_type_het) {
+ mtag->blacklisted = 1;
+ break;
+ }
+ }
}
return 0;
@@ -298,19 +298,19 @@ double calc_coverage_distribution(map<int, PStack *> &unique, map<int, MergedSta
double stdev = 0.0;
for (it = merged.begin(); it != merged.end(); it++) {
- depth = 0.0;
- for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
- tag = unique[*k];
- depth += tag->count;
- }
-
- if (depth < min_stack_cov)
- continue;
- if (depth > max)
- max = depth;
-
- sum += depth;
- total++;
+ depth = 0.0;
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ depth += tag->count;
+ }
+
+ if (depth < min_stack_cov)
+ continue;
+ if (depth > max)
+ max = depth;
+
+ sum += depth;
+ total++;
}
mean = sum / total;
@@ -319,16 +319,16 @@ double calc_coverage_distribution(map<int, PStack *> &unique, map<int, MergedSta
// Calculate the standard deviation
//
for (it = merged.begin(); it != merged.end(); it++) {
- depth = 0.0;
- for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
- tag = unique[*k];
- depth += tag->count;
- }
+ depth = 0.0;
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ depth += tag->count;
+ }
- if (depth < min_stack_cov)
- continue;
+ if (depth < min_stack_cov)
+ continue;
- sum += pow((depth - mean), 2);
+ sum += pow((depth - mean), 2);
}
stdev = sqrt(sum / (total - 1));
@@ -345,10 +345,10 @@ int count_raw_reads(map<int, PStack *> &unique, map<int, MergedStack *> &merged)
long int m = 0;
for (it = merged.begin(); it != merged.end(); it++) {
- for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
- tag = unique[*k];
- m += tag->count;
- }
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ m += tag->count;
+ }
m += it->second->remtags.size();
}
@@ -380,9 +380,9 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
string mod_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".models.tsv";
if (gzip) {
- tag_file += ".gz";
- snp_file += ".gz";
- all_file += ".gz";
+ tag_file += ".gz";
+ snp_file += ".gz";
+ all_file += ".gz";
mod_file += ".gz";
}
@@ -392,14 +392,14 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
gzFile gz_tags, gz_snps, gz_alle, gz_mods;
ofstream tags, snps, alle, mods;
if (gzip) {
- gz_tags = gzopen(tag_file.c_str(), "wb");
- if (!gz_tags) {
- cerr << "Error: Unable to open gzipped tag file '" << tag_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gz_tags = gzopen(tag_file.c_str(), "wb");
+ if (!gz_tags) {
+ cerr << "Error: Unable to open gzipped tag file '" << tag_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_tags, libz_buffer_size);
- #endif
+ gzbuffer(gz_tags, libz_buffer_size);
+ #endif
gz_mods = gzopen(mod_file.c_str(), "wb");
if (!gz_mods) {
cerr << "Error: Unable to open gzipped tag file '" << tag_file << "': " << strerror(errno) << ".\n";
@@ -408,43 +408,43 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
#if ZLIB_VERNUM >= 0x1240
gzbuffer(gz_mods, libz_buffer_size);
#endif
- gz_snps = gzopen(snp_file.c_str(), "wb");
- if (!gz_snps) {
- cerr << "Error: Unable to open gzipped snps file '" << snp_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gz_snps = gzopen(snp_file.c_str(), "wb");
+ if (!gz_snps) {
+ cerr << "Error: Unable to open gzipped snps file '" << snp_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_snps, libz_buffer_size);
- #endif
- gz_alle = gzopen(all_file.c_str(), "wb");
- if (!gz_alle) {
- cerr << "Error: Unable to open gzipped alleles file '" << all_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gzbuffer(gz_snps, libz_buffer_size);
+ #endif
+ gz_alle = gzopen(all_file.c_str(), "wb");
+ if (!gz_alle) {
+ cerr << "Error: Unable to open gzipped alleles file '" << all_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_alle, libz_buffer_size);
- #endif
+ gzbuffer(gz_alle, libz_buffer_size);
+ #endif
} else {
- tags.open(tag_file.c_str());
- if (tags.fail()) {
- cerr << "Error: Unable to open tag file for writing.\n";
- exit(1);
- }
+ tags.open(tag_file.c_str());
+ if (tags.fail()) {
+ cerr << "Error: Unable to open tag file for writing.\n";
+ exit(1);
+ }
mods.open(mod_file.c_str());
if (mods.fail()) {
cerr << "Error: Unable to open tag file for writing.\n";
exit(1);
}
- snps.open(snp_file.c_str());
- if (snps.fail()) {
- cerr << "Error: Unable to open SNPs file for writing.\n";
- exit(1);
- }
- alle.open(all_file.c_str());
- if (alle.fail()) {
- cerr << "Error: Unable to open allele file for writing.\n";
- exit(1);
- }
+ snps.open(snp_file.c_str());
+ if (snps.fail()) {
+ cerr << "Error: Unable to open SNPs file for writing.\n";
+ exit(1);
+ }
+ alle.open(all_file.c_str());
+ if (alle.fail()) {
+ cerr << "Error: Unable to open allele file for writing.\n";
+ exit(1);
+ }
}
//
@@ -468,8 +468,8 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
} else {
tags << log.str();
mods << log.str();
- snps << log.str();
- alle << log.str();
+ snps << log.str();
+ alle << log.str();
}
int id;
@@ -480,147 +480,147 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
int blacklisted = 0;
for (i = m.begin(); i != m.end(); i++) {
- tag_1 = i->second;
+ tag_1 = i->second;
- float total = 0;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++)
- total += u[*k]->count;
+ float total = 0;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++)
+ total += u[*k]->count;
- if (total < min_stack_cov) {
- excluded++;
- continue;
- }
+ if (total < min_stack_cov) {
+ excluded++;
+ continue;
+ }
- //
- // Calculate the log likelihood of this merged stack.
- //
- tag_1->gen_matrix(u);
- tag_1->calc_likelihood();
+ //
+ // Calculate the log likelihood of this merged stack.
+ //
+ tag_1->gen_matrix(u);
+ tag_1->calc_likelihood();
- wrote++;
+ wrote++;
- if (tag_1->blacklisted) blacklisted++;
+ if (tag_1->blacklisted) blacklisted++;
- // First write the consensus sequence
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ // First write the consensus sequence
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
<< tag_1->loc.chr << "\t"
<< tag_1->loc.bp << "\t"
- << (tag_1->loc.strand == plus ? "+" : "-") << "\t"
- << "consensus\t" << "\t\t"
- << tag_1->con << "\t"
- << tag_1->deleveraged << "\t"
- << tag_1->blacklisted << "\t"
- << tag_1->lumberjackstack << "\t"
- << tag_1->lnl << "\n";
-
- //
- // Write a sequence recording the output of the SNP model for each nucleotide.
- //
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
+ << (tag_1->loc.strand == strand_plus ? "+" : "-") << "\t"
+ << "consensus\t" << "\t\t"
+ << tag_1->con << "\t"
+ << tag_1->deleveraged << "\t"
+ << tag_1->blacklisted << "\t"
+ << tag_1->lumberjackstack << "\t"
+ << tag_1->lnl << "\n";
+
+ //
+ // Write a sequence recording the output of the SNP model for each nucleotide.
+ //
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ << "\t"
+ << "\t"
+ << "\t"
+ << "model\t" << "\t"
+ << "\t";
+ for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
+ switch((*s)->type) {
+ case snp_type_het:
+ sstr << "E";
+ break;
+ case snp_type_hom:
+ sstr << "O";
+ break;
+ default:
+ sstr << "U";
+ break;
+ }
+ }
+ sstr << "\t"
<< "\t"
<< "\t"
<< "\t"
- << "model\t" << "\t"
- << "\t";
- for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
- switch((*s)->type) {
- case snp_type_het:
- sstr << "E";
- break;
- case snp_type_hom:
- sstr << "O";
- break;
- default:
- sstr << "U";
- break;
- }
- }
- sstr << "\t"
- << "\t"
- << "\t"
- << "\t"
- << "\n";
-
- if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ << "\n";
+
+ if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
if (gzip) gzputs(gz_mods, sstr.str().c_str()); else mods << sstr.str();
- sstr.str("");
-
- // Now write out the components of each unique tag merged into this one.
- id = 0;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
- tag_2 = u[*k];
- buf = tag_2->seq->seq();
-
- for (j = tag_2->map.begin(); j != tag_2->map.end(); j++) {
- sstr << "0" << "\t" << sql_id << "\t" << tag_1->id << "\t\t\t\t" << "primary\t" << id << "\t" << *j << "\t" << buf << "\t\t\t\t\n";
- if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
- sstr.str("");
- }
- id++;
- delete [] buf;
- }
-
- //
- // Write out the model calls for each nucleotide in this locus.
- //
- for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- << (*s)->col << "\t";
-
- switch((*s)->type) {
- case snp_type_het:
- sstr << "E\t";
- break;
- case snp_type_hom:
- sstr << "O\t";
- break;
- default:
- sstr << "U\t";
- break;
- }
-
- sstr << std::fixed << std::setprecision(2)
- << (*s)->lratio << "\t"
- << (*s)->rank_1 << "\t"
- << (*s)->rank_2 << "\t\t\n";
- }
-
- if (gzip) gzputs(gz_snps, sstr.str().c_str()); else snps << sstr.str();
- sstr.str("");
-
- // Write the expressed alleles seen for the recorded SNPs and
- // the percentage of tags a particular allele occupies.
- //
+ sstr.str("");
+
+ // Now write out the components of each unique tag merged into this one.
+ id = 0;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
+ tag_2 = u[*k];
+ buf = tag_2->seq->seq();
+
+ for (j = tag_2->map.begin(); j != tag_2->map.end(); j++) {
+ sstr << "0" << "\t" << sql_id << "\t" << tag_1->id << "\t\t\t\t" << "primary\t" << id << "\t" << *j << "\t" << buf << "\t\t\t\t\n";
+ if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ sstr.str("");
+ }
+ id++;
+ delete [] buf;
+ }
+
+ //
+ // Write out the model calls for each nucleotide in this locus.
+ //
+ for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ << (*s)->col << "\t";
+
+ switch((*s)->type) {
+ case snp_type_het:
+ sstr << "E\t";
+ break;
+ case snp_type_hom:
+ sstr << "O\t";
+ break;
+ default:
+ sstr << "U\t";
+ break;
+ }
+
+ sstr << std::fixed << std::setprecision(2)
+ << (*s)->lratio << "\t"
+ << (*s)->rank_1 << "\t"
+ << (*s)->rank_2 << "\t\t\n";
+ }
+
+ if (gzip) gzputs(gz_snps, sstr.str().c_str()); else snps << sstr.str();
+ sstr.str("");
+
+ // Write the expressed alleles seen for the recorded SNPs and
+ // the percentage of tags a particular allele occupies.
+ //
char pct[id_len];
- for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
+ for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
sprintf(pct, "%.2f", ((t->second/total) * 100));
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- << t->first << "\t"
- << pct << "\t"
- << t->second << "\n";
- }
- if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
- sstr.str("");
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ << t->first << "\t"
+ << pct << "\t"
+ << t->second << "\n";
+ }
+ if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
+ sstr.str("");
}
if (gzip) {
- gzclose(gz_tags);
+ gzclose(gz_tags);
gzclose(gz_mods);
- gzclose(gz_snps);
- gzclose(gz_alle);
+ gzclose(gz_snps);
+ gzclose(gz_alle);
} else {
- tags.close();
+ tags.close();
mods.close();
- snps.close();
- alle.close();
+ snps.close();
+ alle.close();
}
cerr << " Wrote " << wrote << " loci, excluded " << excluded << " loci due to insuffient depth of coverage; blacklisted " << blacklisted << " loci.\n";
@@ -644,23 +644,23 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
//
for (i = unique.begin(); i != unique.end(); i++) {
snprintf(id, id_len - 1, "%s|%d|%s",
- i->second->loc.chr,
- i->second->loc.bp,
- i->second->loc.strand == plus ? "+" : "-");
+ i->second->loc.chr,
+ i->second->loc.bp,
+ i->second->loc.strand == strand_plus ? "+" : "-");
locations[id].insert(i->second->id);
}
it_old = merged.begin();
for (k = locations.begin(); k != locations.end(); k++) {
- m = new MergedStack;
- m->id = global_id;
+ m = new MergedStack;
+ m->id = global_id;
//
// Record the consensus and physical location for this stack.
//
s = k->second.begin();
- m->add_consensus(unique[*s]->seq);
+ m->add_consensus(unique[*s]->seq);
m->loc.set(unique[*s]->loc.chr, unique[*s]->loc.bp, unique[*s]->loc.strand);
//
@@ -672,13 +672,13 @@ int populate_merged_tags(map<int, PStack *> &unique, map<int, MergedStack *> &me
m->utags.push_back(u->id);
}
- //
- // Insert the new MergedStack giving a hint as to which position
- // to insert it at.
- //
- it_new = merged.insert(it_old, pair<int, MergedStack *>(global_id, m));
- it_old = it_new;
- global_id++;
+ //
+ // Insert the new MergedStack giving a hint as to which position
+ // to insert it at.
+ //
+ it_new = merged.insert(it_old, pair<int, MergedStack *>(global_id, m));
+ it_old = it_new;
+ global_id++;
}
cerr << " Merged " << unique.size() << " unique Stacks into " << merged.size() << " loci.\n";
@@ -732,6 +732,8 @@ int reduce_radtags(HashMap &radtags, map<int, PStack *> &unique) {
}
}
+ cerr << " " << radtags.size() << " unique stacks were aligned to " << unique.size() << " genomic locations.\n";
+
return 0;
}
@@ -757,9 +759,9 @@ int load_radtags(string in_file, HashMap &radtags) {
int i = 1;
while ((c = fh->next_seq()) != NULL) {
if (i % 10000 == 0) cerr << "Loading aligned sequence " << i << " \r";
- // cerr << "Loading aligned sequence " << i << " \n";
+ // cerr << "Loading aligned sequence " << i << " \n";
- radtags[c->seq].push_back(c);
+ radtags[c->seq].push_back(c);
i++;
}
@@ -788,14 +790,14 @@ int dump_stacks(map<int, PStack *> &u) {
for (it = u.begin(); it != u.end(); it++) {
- cerr << "Stack ID: " << (*it).second->id << "\n"
- << " Seq: " << (*it).second->seq->seq() << "\n"
- << " IDs: ";
+ cerr << "Stack ID: " << (*it).second->id << "\n"
+ << " Seq: " << (*it).second->seq->seq() << "\n"
+ << " IDs: ";
- for (fit = (*it).second->map.begin(); fit != (*it).second->map.end(); fit++)
- cerr << *fit << " ";
+ for (fit = (*it).second->map.begin(); fit != (*it).second->map.end(); fit++)
+ cerr << *fit << " ";
- cerr << "\n\n";
+ cerr << "\n\n";
}
return 0;
@@ -808,24 +810,24 @@ int dump_merged_stacks(map<int, MergedStack *> &m) {
for (it = m.begin(); it != m.end(); it++) {
- cerr << "MergedStack ID: " << it->second->id << "\n"
- << " Consensus: ";
- if (it->second->con != NULL)
- cerr << it->second->con << "\n";
- else
- cerr << "\n";
- cerr << " IDs: ";
+ cerr << "MergedStack ID: " << it->second->id << "\n"
+ << " Consensus: ";
+ if (it->second->con != NULL)
+ cerr << it->second->con << "\n";
+ else
+ cerr << "\n";
+ cerr << " IDs: ";
- for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
- cerr << (*fit) << " ";
+ for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
+ cerr << (*fit) << " ";
- cerr << "\n"
- << " Distances: ";
+ cerr << "\n"
+ << " Distances: ";
- for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
- cerr << (*pit).first << ": " << (*pit).second << ", ";
+ for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
+ cerr << (*pit).first << ": " << (*pit).second << ", ";
- cerr << "\n\n";
+ cerr << "\n\n";
}
return 0;
@@ -835,38 +837,38 @@ int parse_command_line(int argc, char* argv[]) {
int c;
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
- {"infile_type", required_argument, NULL, 't'},
- {"outfile_type", required_argument, NULL, 'y'},
- {"file", required_argument, NULL, 'f'},
- {"outpath", required_argument, NULL, 'o'},
- {"id", required_argument, NULL, 'i'},
- {"min_cov", required_argument, NULL, 'm'},
- {"num_threads", required_argument, NULL, 'p'},
- {"bc_err_freq", required_argument, NULL, 'e'},
- {"model_type", required_argument, NULL, 'T'},
- {"bound_low", required_argument, NULL, 'L'},
- {"bound_high", required_argument, NULL, 'U'},
- {"alpha", required_argument, NULL, 'A'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
-
- c = getopt_long(argc, argv, "hvOT:A:L:U:f:o:i:e:p:m:s:f:t:y:", long_options, &option_index);
-
- // Detect the end of the options.
- if (c == -1)
- break;
-
- switch (c) {
- case 'h':
- help();
- break;
- case 't':
+ {"infile_type", required_argument, NULL, 't'},
+ {"outfile_type", required_argument, NULL, 'y'},
+ {"file", required_argument, NULL, 'f'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"id", required_argument, NULL, 'i'},
+ {"min_cov", required_argument, NULL, 'm'},
+ {"num_threads", required_argument, NULL, 'p'},
+ {"bc_err_freq", required_argument, NULL, 'e'},
+ {"model_type", required_argument, NULL, 'T'},
+ {"bound_low", required_argument, NULL, 'L'},
+ {"bound_high", required_argument, NULL, 'U'},
+ {"alpha", required_argument, NULL, 'A'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hvOT:A:L:U:f:o:i:e:p:m:s:f:t:y:", long_options, &option_index);
+
+ // Detect the end of the options.
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 't':
if (strcmp(optarg, "bowtie") == 0)
in_file_type = FileT::bowtie;
else if (strcmp(optarg, "sam") == 0)
@@ -877,33 +879,33 @@ int parse_command_line(int argc, char* argv[]) {
in_file_type = FileT::tsv;
else
in_file_type = FileT::unknown;
- break;
- case 'y':
+ break;
+ case 'y':
if (strcmp(optarg, "sam") == 0)
out_file_type = FileT::sam;
else
out_file_type = FileT::sql;
- break;
- case 'f':
- in_file = optarg;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'i':
- sql_id = is_integer(optarg);
- if (sql_id < 0) {
- cerr << "SQL ID (-i) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 'm':
- min_stack_cov = atoi(optarg);
- break;
- case 'e':
- barcode_err_freq = atof(optarg);
- break;
- case 'T':
+ break;
+ case 'f':
+ in_file = optarg;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'i':
+ sql_id = is_integer(optarg);
+ if (sql_id < 0) {
+ cerr << "SQL ID (-i) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 'm':
+ min_stack_cov = atoi(optarg);
+ break;
+ case 'e':
+ barcode_err_freq = atof(optarg);
+ break;
+ case 'T':
if (strcmp(optarg, "snp") == 0) {
model_type = snp;
} else if (strcmp(optarg, "fixed") == 0) {
@@ -914,66 +916,66 @@ int parse_command_line(int argc, char* argv[]) {
cerr << "Unknown model type specified '" << optarg << "'\n";
help();
}
- case 'L':
- bound_low = atof(optarg);
- break;
- case 'U':
- bound_high = atof(optarg);
- break;
- case 'A':
- alpha = atof(optarg);
- break;
- case 'p':
- num_threads = atoi(optarg);
- break;
+ case 'L':
+ bound_low = atof(optarg);
+ break;
+ case 'U':
+ bound_high = atof(optarg);
+ break;
+ case 'A':
+ alpha = atof(optarg);
+ break;
+ case 'p':
+ num_threads = atoi(optarg);
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (alpha != 0.1 && alpha != 0.05 && alpha != 0.01 && alpha != 0.001) {
- cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
- help();
+ cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
+ help();
}
if (bound_low != 0 && (bound_low < 0 || bound_low >= 1.0)) {
- cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
- help();
+ cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
+ help();
}
if (bound_high != 1 && (bound_high <= 0 || bound_high > 1.0)) {
- cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
- help();
+ cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
+ help();
}
if (bound_low > 0 || bound_high < 1.0) {
- model_type = bounded;
+ model_type = bounded;
}
if (in_file.length() == 0 || in_file_type == FileT::unknown) {
- cerr << "You must specify an input file of a supported type.\n";
- help();
+ cerr << "You must specify an input file of a supported type.\n";
+ help();
}
if (out_path.length() == 0)
- out_path = ".";
+ out_path = ".";
if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ out_path += "/";
if (model_type == fixed && barcode_err_freq == 0) {
- cerr << "You must specify the barcode error frequency.\n";
- help();
+ cerr << "You must specify the barcode error frequency.\n";
+ help();
}
return 0;
@@ -988,22 +990,22 @@ void version() {
void help() {
std::cerr << "pstacks " << VERSION << "\n"
<< "pstacks -t file_type -f file_path [-o path] [-i id] [-m min_cov] [-p num_threads] [-h]" << "\n"
- << " t: input file Type. Supported types: bowtie, sam, or bam.\n"
+ << " t: input file Type. Supported types: bowtie, sam, or bam.\n"
<< " f: input file path.\n"
- << " o: output path to write results.\n"
- << " i: SQL ID to insert into the output to identify this sample.\n"
- << " m: minimum depth of coverage to report a stack (default 1).\n"
+ << " o: output path to write results.\n"
+ << " i: SQL ID to insert into the output to identify this sample.\n"
+ << " m: minimum depth of coverage to report a stack (default 1).\n"
<< " p: enable parallel execution with num_threads threads.\n"
- << " h: display this help messsage.\n"
- << " Model options:\n"
- << " --model_type <type>: either 'snp' (default), 'bounded', or 'fixed'\n"
- << " For the SNP or Bounded SNP model:\n"
- << " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
- << " For the Bounded SNP model:\n"
- << " --bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
- << " --bound_high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
- << " For the Fixed model:\n"
- << " --bc_err_freq <num>: specify the barcode error frequency, between 0 and 1.0.\n";
+ << " h: display this help messsage.\n"
+ << " Model options:\n"
+ << " --model_type <type>: either 'snp' (default), 'bounded', or 'fixed'\n"
+ << " For the SNP or Bounded SNP model:\n"
+ << " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
+ << " For the Bounded SNP model:\n"
+ << " --bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
+ << " --bound_high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
+ << " For the Fixed model:\n"
+ << " --bc_err_freq <num>: specify the barcode error frequency, between 0 and 1.0.\n";
exit(0);
}
diff --git a/src/renz.h b/src/renz.cc
similarity index 80%
copy from src/renz.h
copy to src/renz.cc
index b009c52..c153069 100644
--- a/src/renz.h
+++ b/src/renz.cc
@@ -1,31 +1,11 @@
-// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
-//
-// Copyright 2011-2015, Julian Catchen <jcatchen at uoregon.edu>
-//
-// This file is part of Stacks.
-//
-// Stacks is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// Stacks is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
-//
-
-#ifndef __RENZ_H__
-#define __RENZ_H__
+#include "renz.h"
-#include <map>
-using std::map;
-#include <string>
-using std::string;
+using namespace std;
+//
+// First line of static array contains each enzyme's cut sites. Second
+// line is the reverse complement of each cut site.
+//
const char *aciI[] = {"CGC", "CGG", // C/CGC, AciI
"GCG", "CCG"};
const char *ageI[] = {"CCGGT", // A/CCGGT, AgeI
@@ -33,81 +13,89 @@ const char *ageI[] = {"CCGGT", // A/CCGGT, AgeI
const char *aluI[] = {"CT", // AG/CT, AluI
"AG"};
const char *apeKI[] = {"CAGC", "CTGC", // G/CWGC, ApeKI; W=A or T
- "GTCG", "GACG"};
+ "GTCG", "GACG"};
const char *apoI[] = {"AATTC", "AATTT", // R/AATTY, ApoI (also known as XapI)
"GAATT", "AAATT"};
const char *aseI[] = {"TAAT", // AT/TAAT, AseI
"ATTA"};
const char *bamHI[] = {"GATCC", // G/GATCC, BamHI
- "GGATC"};
+ "GGATC"};
const char *bfaI[] = {"TAG", // C/TAG, BfaI
"CTA"};
const char *bgIII[] = {"GATCT", // A/GATCT, BgIII
- "AGATC"};
+ "AGATC"};
+const char *bsaHI[] = {"CGCC", "CGTC", // GR/CGYC, BsaHI
+ "GGCG", "GACG"};
const char *bspDI[] = {"CGAT", // AT/CGAT, BspDI
- "ATCG"};
+ "ATCG"};
const char *bstYI[] = {"GATCC", "GATCT", // R/GATCY, BstYI (also known as PsuI)
"GGATC", "AGATC"};
const char *claI[] = {"CGAT", // AT/CGAT, ClaI
"ATCG"};
+const char *csp6I[] = {"TAC", // G/TAC, Csp6I
+ "GTA"};
const char *ddeI[] = {"TAAG", "TCAG", "TGAG", "TTAG", // C/TNAG, DdeI
- "CTTA", "CTGA", "CTCA", "CTAA"};
+ "CTTA", "CTGA", "CTCA", "CTAA"};
const char *dpnII[] = {"GATC", // GATC, DpnII
- "GATC"};
+ "GATC"};
const char *eaeI[] = {"GGCCA", "GGCCG", // Y/GGCCR, EaeI
- "TGGCC", "CGGCC"};
+ "TGGCC", "CGGCC"};
const char *ecoRI[] = {"AATTC", // G/AATTC, EcoRI
- "GAATT"};
+ "GAATT"};
const char *ecoRV[] = {"ATC", // GAT/ATC, EcoRV
- "GAT"};
+ "GAT"};
const char *ecoT22I[] = {"TGCAT", // A/TGCAT, EcoT22I
- "ATGCA"};
+ "ATGCA"};
const char *hindIII[] = {"AGCTT", // A/AGCTT, HindIII
- "TCGAA"};
+ "TCGAA"};
+const char *hpaII[] = {"CGG", // C/CGG, HpaII
+ "CCG"};
const char *kpnI[] = {"GTACC", // C/CATGG, KpnI
- "GGTAC"};
+ "GGTAC"};
const char *mluCI[] = {"AATT", // AATT, MluCI
- "AATT"};
+ "AATT"};
const char *mseI[] = {"TAA", // T/TAA, MseI
- "TTA"};
+ "TTA"};
const char *mspI[] = {"CGG", // C/CGG, MspI
- "CCG"};
-const char *ndeI[] = {"TA", // CA/TATG, NdeI
- "TA"};
+ "CCG"};
+const char *ncoI[] = {"CATGG", // C/CATGG, NcoI
+ "CCATG"};
+const char *ndeI[] = {"TATG", // CA/TATG, NdeI
+ "CATA"};
const char *nheI[] = {"CTAGC", // G/CTAGC, NheI
"GCTAG"};
const char *nlaIII[] = {"CATG", // CATG, NlaIII
- "CATG"};
+ "CATG"};
const char *notI[] = {"GGCCGC", // GC/GGCCGC, NotI
- "GCGGCC"};
+ "GCGGCC"};
const char *nsiI[] = {"TGCAT", // ATGCA/T, NsiI
- "ATGCA"};
+ "ATGCA"};
const char *pstI[] = {"TGCAG", // CTGCA/G, PstI
- "CTGCA"};
+ "CTGCA"};
const char *rsaI[] = {"AC", // GT/AC, RsaI
"GT"};
const char *sacI[] = {"AGCTC", // GAGCT/C, SacI
- "GAGCT"};
+ "GAGCT"};
const char *sau3AI[] = {"GATC", // GATC, Sau3AI
- "GATC"};
+ "GATC"};
const char *sbfI[] = {"TGCAGG", // CCTGCA/GG, SbfI
- "CCTGCA"};
+ "CCTGCA"};
const char *sexAI[] = {"CCAGGT", "CCTGGT", // A/CCWGGT, SexAI; W=A or T
- "ACCTGG", "ACCAGG"};
+ "ACCTGG", "ACCAGG"};
const char *sgrAI[] = {"CCGGCG", "CCGGTG", // CR/CCGGYG, SgrAI; R=A or G; Y=C or T
- "CGCCGG", "CACCGG"};
+ "CGCCGG", "CACCGG"};
const char *speI[] = {"CTAGT", // A/CTAGT, SpeI
"ACTAG"};
const char *sphI[] = {"CATGC", // GCATG/C, SphI
- "GCATG"};
+ "GCATG"};
const char *taqI[] = {"CGA", // T/CGA, TaqI
- "TCG"};
+ "TCG"};
const char *xbaI[] = {"CTAGA", // T/CTAGA, XbaI
- "TCTAG"};
+ "TCTAG"};
const char *xhoI[] = {"TCGAG", // C/TCGAG, XhoI
"CTCGA"};
-void
+void
initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len) {
renz["sbfI"] = sbfI; // CCTGCA/GG, SbfI
@@ -150,7 +138,11 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz["bfaI"] = bfaI; // C/TAG, BfaI
renz["aseI"] = aseI; // AT/TAAT, AseI
renz["bspDI"] = bspDI; // AT/CGAT, BspDI
-
+ renz["csp6I"] = csp6I; // G/TAC, Csp6I
+ renz["bsaHI"] = bsaHI; // GR/CGYC, BsaHI
+ renz["hpaII"] = hpaII; // C/CGG, HpaII
+ renz["ncoI"] = ncoI; // C/CATGG, NcoI
+
renz_cnt["sbfI"] = 1;
renz_cnt["pstI"] = 1;
renz_cnt["notI"] = 1;
@@ -191,6 +183,10 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_cnt["bfaI"] = 1;
renz_cnt["aseI"] = 1;
renz_cnt["bspDI"] = 1;
+ renz_cnt["csp6I"] = 1;
+ renz_cnt["bsaHI"] = 2;
+ renz_cnt["hpaII"] = 1;
+ renz_cnt["ncoI"] = 1;
renz_len["sbfI"] = 6;
renz_len["pstI"] = 5;
@@ -204,7 +200,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_len["nlaIII"] = 4;
renz_len["mluCI"] = 4;
renz_len["ecoT22I"] = 5;
- renz_len["ndeI"] = 2;
+ renz_len["ndeI"] = 4;
renz_len["nsiI"] = 5;
renz_len["mseI"] = 3;
renz_len["mspI"] = 3;
@@ -232,11 +228,14 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_len["bfaI"] = 3;
renz_len["aseI"] = 4;
renz_len["bspDI"] = 4;
+ renz_len["csp6I"] = 3;
+ renz_len["bsaHI"] = 4;
+ renz_len["hpaII"] = 3;
+ renz_len["ncoI"] = 5;
}
-void
+void
initialize_renz_olap(map<string, int> &renz_olap) {
renz_olap["sbfI"] = 4;
}
-#endif // __RENZ_H__
diff --git a/src/renz.h b/src/renz.h
index b009c52..63bc8c4 100644
--- a/src/renz.h
+++ b/src/renz.h
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2015, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2011-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -22,221 +22,12 @@
#define __RENZ_H__
#include <map>
-using std::map;
#include <string>
-using std::string;
-
-const char *aciI[] = {"CGC", "CGG", // C/CGC, AciI
- "GCG", "CCG"};
-const char *ageI[] = {"CCGGT", // A/CCGGT, AgeI
- "ACCGG"};
-const char *aluI[] = {"CT", // AG/CT, AluI
- "AG"};
-const char *apeKI[] = {"CAGC", "CTGC", // G/CWGC, ApeKI; W=A or T
- "GTCG", "GACG"};
-const char *apoI[] = {"AATTC", "AATTT", // R/AATTY, ApoI (also known as XapI)
- "GAATT", "AAATT"};
-const char *aseI[] = {"TAAT", // AT/TAAT, AseI
- "ATTA"};
-const char *bamHI[] = {"GATCC", // G/GATCC, BamHI
- "GGATC"};
-const char *bfaI[] = {"TAG", // C/TAG, BfaI
- "CTA"};
-const char *bgIII[] = {"GATCT", // A/GATCT, BgIII
- "AGATC"};
-const char *bspDI[] = {"CGAT", // AT/CGAT, BspDI
- "ATCG"};
-const char *bstYI[] = {"GATCC", "GATCT", // R/GATCY, BstYI (also known as PsuI)
- "GGATC", "AGATC"};
-const char *claI[] = {"CGAT", // AT/CGAT, ClaI
- "ATCG"};
-const char *ddeI[] = {"TAAG", "TCAG", "TGAG", "TTAG", // C/TNAG, DdeI
- "CTTA", "CTGA", "CTCA", "CTAA"};
-const char *dpnII[] = {"GATC", // GATC, DpnII
- "GATC"};
-const char *eaeI[] = {"GGCCA", "GGCCG", // Y/GGCCR, EaeI
- "TGGCC", "CGGCC"};
-const char *ecoRI[] = {"AATTC", // G/AATTC, EcoRI
- "GAATT"};
-const char *ecoRV[] = {"ATC", // GAT/ATC, EcoRV
- "GAT"};
-const char *ecoT22I[] = {"TGCAT", // A/TGCAT, EcoT22I
- "ATGCA"};
-const char *hindIII[] = {"AGCTT", // A/AGCTT, HindIII
- "TCGAA"};
-const char *kpnI[] = {"GTACC", // C/CATGG, KpnI
- "GGTAC"};
-const char *mluCI[] = {"AATT", // AATT, MluCI
- "AATT"};
-const char *mseI[] = {"TAA", // T/TAA, MseI
- "TTA"};
-const char *mspI[] = {"CGG", // C/CGG, MspI
- "CCG"};
-const char *ndeI[] = {"TA", // CA/TATG, NdeI
- "TA"};
-const char *nheI[] = {"CTAGC", // G/CTAGC, NheI
- "GCTAG"};
-const char *nlaIII[] = {"CATG", // CATG, NlaIII
- "CATG"};
-const char *notI[] = {"GGCCGC", // GC/GGCCGC, NotI
- "GCGGCC"};
-const char *nsiI[] = {"TGCAT", // ATGCA/T, NsiI
- "ATGCA"};
-const char *pstI[] = {"TGCAG", // CTGCA/G, PstI
- "CTGCA"};
-const char *rsaI[] = {"AC", // GT/AC, RsaI
- "GT"};
-const char *sacI[] = {"AGCTC", // GAGCT/C, SacI
- "GAGCT"};
-const char *sau3AI[] = {"GATC", // GATC, Sau3AI
- "GATC"};
-const char *sbfI[] = {"TGCAGG", // CCTGCA/GG, SbfI
- "CCTGCA"};
-const char *sexAI[] = {"CCAGGT", "CCTGGT", // A/CCWGGT, SexAI; W=A or T
- "ACCTGG", "ACCAGG"};
-const char *sgrAI[] = {"CCGGCG", "CCGGTG", // CR/CCGGYG, SgrAI; R=A or G; Y=C or T
- "CGCCGG", "CACCGG"};
-const char *speI[] = {"CTAGT", // A/CTAGT, SpeI
- "ACTAG"};
-const char *sphI[] = {"CATGC", // GCATG/C, SphI
- "GCATG"};
-const char *taqI[] = {"CGA", // T/CGA, TaqI
- "TCG"};
-const char *xbaI[] = {"CTAGA", // T/CTAGA, XbaI
- "TCTAG"};
-const char *xhoI[] = {"TCGAG", // C/TCGAG, XhoI
- "CTCGA"};
-
-void
-initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len) {
- renz["sbfI"] = sbfI; // CCTGCA/GG, SbfI
- renz["pstI"] = pstI; // CTGCA/G, PstI
- renz["notI"] = notI; // GC/GGCCGC, NotI
- renz["ecoRI"] = ecoRI; // G/AATTC, EcoRI
- renz["sgrAI"] = sgrAI; // CR/CCGGYG, SgrAI; R=A or G; Y=C or T
- renz["apeKI"] = apeKI; // G/CWGC, ApeKI; W=A or T
- renz["hindIII"] = hindIII; // A/AGCTT, HindIII
- renz["dpnII"] = dpnII; // GATC, DpnII
- renz["sphI"] = sphI; // GCATG/C, SphI
- renz["nlaIII"] = nlaIII; // CATG, NlaIII
- renz["mluCI"] = mluCI; // AATT, MluCI
- renz["ecoT22I"] = ecoT22I; // A/TGCAT, EcoT22I
- renz["ndeI"] = ndeI; // CA/TATG, NdeI
- renz["nsiI"] = nsiI; // ATGCA/T, NsiI
- renz["mseI"] = mseI; // T/TAA, MseI
- renz["mspI"] = mspI; // C/CGG, MspI
- renz["sexAI"] = sexAI; // A/CCWGGT, SexAI; W=A or T
- renz["sau3AI"] = sau3AI; // GATC, Sau3AI
- renz["bamHI"] = bamHI; // G/GATCC, BamHI
- renz["xbaI"] = xbaI; // T/CTAGA, XbaI
- renz["eaeI"] = eaeI; // Y/GGCCR, EaeI
- renz["taqI"] = taqI; // T/CGA, TaqI
- renz["claI"] = claI; // AT/CGAT, ClaI
- renz["nheI"] = nheI; // G/CTAGC, NheI
- renz["speI"] = speI; // A/CTAGT, SpeI
- renz["apoI"] = apoI; // R/AATTY, ApoI, XapI
- renz["bstYI"] = bstYI; // R/GATCY, BstYI, PsuI
- renz["xhoI"] = xhoI; // C/TCGAG, XhoI
- renz["sacI"] = sacI; // GAGCT/C, SacI
- renz["bgIII"] = bgIII; // A/GATCT, BgIII
- renz["ecoRV"] = ecoRV; // GAT/ATC, EcoRV
- renz["kpnI"] = kpnI; // C/CATGG, KpnI
- renz["ddeI"] = ddeI; // C/TNAG, DdeI
- renz["aluI"] = aluI; // AG/CT, AluI
- renz["ageI"] = ageI; // A/CCGGT, AgeI
- renz["rsaI"] = rsaI; // GT/AC, RsaI
- renz["aciI"] = aciI; // C/CGC, AciI
- renz["bfaI"] = bfaI; // C/TAG, BfaI
- renz["aseI"] = aseI; // AT/TAAT, AseI
- renz["bspDI"] = bspDI; // AT/CGAT, BspDI
-
- renz_cnt["sbfI"] = 1;
- renz_cnt["pstI"] = 1;
- renz_cnt["notI"] = 1;
- renz_cnt["ecoRI"] = 1;
- renz_cnt["sgrAI"] = 2;
- renz_cnt["apeKI"] = 2;
- renz_cnt["hindIII"] = 1;
- renz_cnt["dpnII"] = 1;
- renz_cnt["sphI"] = 1;
- renz_cnt["nlaIII"] = 1;
- renz_cnt["mluCI"] = 1;
- renz_cnt["ecoT22I"] = 1;
- renz_cnt["ndeI"] = 1;
- renz_cnt["nsiI"] = 1;
- renz_cnt["mseI"] = 1;
- renz_cnt["mspI"] = 1;
- renz_cnt["sexAI"] = 2;
- renz_cnt["sau3AI"] = 1;
- renz_cnt["bamHI"] = 1;
- renz_cnt["xbaI"] = 1;
- renz_cnt["eaeI"] = 2;
- renz_cnt["taqI"] = 1;
- renz_cnt["claI"] = 1;
- renz_cnt["nheI"] = 1;
- renz_cnt["speI"] = 1;
- renz_cnt["apoI"] = 2;
- renz_cnt["bstYI"] = 2;
- renz_cnt["xhoI"] = 1;
- renz_cnt["sacI"] = 1;
- renz_cnt["bgIII"] = 1;
- renz_cnt["ecoRV"] = 1;
- renz_cnt["kpnI"] = 1;
- renz_cnt["ddeI"] = 4;
- renz_cnt["aluI"] = 1;
- renz_cnt["ageI"] = 1;
- renz_cnt["rsaI"] = 1;
- renz_cnt["aciI"] = 2;
- renz_cnt["bfaI"] = 1;
- renz_cnt["aseI"] = 1;
- renz_cnt["bspDI"] = 1;
-
- renz_len["sbfI"] = 6;
- renz_len["pstI"] = 5;
- renz_len["notI"] = 6;
- renz_len["ecoRI"] = 5;
- renz_len["sgrAI"] = 6;
- renz_len["apeKI"] = 4;
- renz_len["hindIII"] = 5;
- renz_len["dpnII"] = 4;
- renz_len["sphI"] = 5;
- renz_len["nlaIII"] = 4;
- renz_len["mluCI"] = 4;
- renz_len["ecoT22I"] = 5;
- renz_len["ndeI"] = 2;
- renz_len["nsiI"] = 5;
- renz_len["mseI"] = 3;
- renz_len["mspI"] = 3;
- renz_len["sexAI"] = 6;
- renz_len["sau3AI"] = 4;
- renz_len["bamHI"] = 5;
- renz_len["xbaI"] = 5;
- renz_len["eaeI"] = 5;
- renz_len["taqI"] = 3;
- renz_len["claI"] = 4;
- renz_len["nheI"] = 5;
- renz_len["speI"] = 5;
- renz_len["apoI"] = 5;
- renz_len["bstYI"] = 5;
- renz_len["xhoI"] = 5;
- renz_len["sacI"] = 5;
- renz_len["bgIII"] = 5;
- renz_len["ecoRV"] = 3;
- renz_len["kpnI"] = 5;
- renz_len["ddeI"] = 4;
- renz_len["aluI"] = 2;
- renz_len["ageI"] = 5;
- renz_len["rsaI"] = 2;
- renz_len["aciI"] = 3;
- renz_len["bfaI"] = 3;
- renz_len["aseI"] = 4;
- renz_len["bspDI"] = 4;
-}
+using std::map;
+using std::string;
-void
-initialize_renz_olap(map<string, int> &renz_olap) {
- renz_olap["sbfI"] = 4;
-}
+void initialize_renz_olap(map<string, int> &renz_olap);
+void initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len);
#endif // __RENZ_H__
diff --git a/src/rxstacks.cc b/src/rxstacks.cc
index 3ca8781..b800df1 100644
--- a/src/rxstacks.cc
+++ b/src/rxstacks.cc
@@ -23,8 +23,12 @@
// across a population of samples.
//
+#include "MetaPopInfo.h"
+
#include "rxstacks.h"
+typedef MetaPopInfo::Sample Sample;
+
// Global variables to hold command-line options.
int num_threads = 1;
int batch_id = 0;
@@ -89,9 +93,12 @@ int main (int argc, char* argv[]) {
omp_set_num_threads(num_threads);
#endif
- vector<pair<int, string> > files;
- if (!build_file_list(files))
- exit(1);
+ MetaPopInfo mpopi;
+ mpopi.init_directory(in_path);
+ if (mpopi.samples().empty()) {
+ cerr << "Error: Failed to find sample files in directory '" << in_path << "'.\n";
+ return -1;
+ }
//
// Open and initialize the log files.
@@ -124,33 +131,46 @@ int main (int argc, char* argv[]) {
// Load matches to the catalog
//
vector<vector<CatMatch *> > catalog_matches;
- map<int, string> samples;
- vector<int> sample_ids;
- for (uint i = 0; i < files.size(); i++) {
- vector<CatMatch *> m;
- load_catalog_matches(in_path + files[i].second, m);
-
- if (m.size() == 0) {
- cerr << "Warning: unable to find any matches in file '" << files[i].second << "', excluding this sample from population analysis.\n";
- continue;
- }
+ vector<size_t> samples_to_remove;
+ set<size_t> seen_samples;
+ for (size_t i=0; i<mpopi.samples().size(); ++i) {
+ catalog_matches.push_back(vector<CatMatch*>());
+ vector<CatMatch *>& m = catalog_matches.back();
+
+ const Sample& sample = mpopi.samples()[i];
+ load_catalog_matches(in_path + sample.name, m);
+
+ if (m.size() == 0) {
+ cerr << "Warning: Absent or malformed matches file '"
+ << sample.name << ".matches.tsv(.gz)"
+ <<"', excluding this sample from population analysis.\n";
+ samples_to_remove.push_back(i);
+ catalog_matches.pop_back(); // n.b. Index shift will be resolved by the call to MetaPopInfo::delete_samples().
+ continue;
+ }
- catalog_matches.push_back(m);
- if (samples.count(m[0]->sample_id) == 0) {
- samples[m[0]->sample_id] = files[i].second;
- sample_ids.push_back(m[0]->sample_id);
- } else {
- cerr << "Fatal error: sample ID " << m[0]->sample_id << " occurs twice in this data set, likely the pipeline was run incorrectly.\n";
- exit(0);
- }
+ size_t sample_id = m[0]->sample_id;
+ if (seen_samples.count(sample_id) > 0) {
+ cerr << "Error: sample ID " << sample_id << " occurs twice in this data set, likely the pipeline was run incorrectly.\n";
+ return -1;
+ }
+ seen_samples.insert(sample_id);
+ mpopi.set_sample_id(i, sample_id);
+ }
+ mpopi.delete_samples(samples_to_remove);
+ if (mpopi.samples().size() == 0) {
+ cerr << "Error: Couln't find any matches files.\n";
+ return -1;
}
+ // [mpopi] is definitive.
+ cerr << "Working on " << mpopi.samples().size() << " samples.\n";
//
// Create the population map
//
- cerr << "Populating observed haplotypes for " << sample_ids.size() << " samples, " << catalog.size() << " loci.\n";
- PopMap<CSLocus> *pmap = new PopMap<CSLocus>(sample_ids.size(), catalog.size());
- pmap->populate(sample_ids, catalog, catalog_matches);
+ cerr << "Populating observed haplotypes for " << mpopi.samples().size() << " samples, " << catalog.size() << " loci.\n";
+ PopMap<CSLocus> *pmap = new PopMap<CSLocus>(mpopi, catalog.size());
+ pmap->populate(catalog, catalog_matches);
//
// Sum haplotype counts across the population for each catalog locus.
@@ -162,17 +182,15 @@ int main (int argc, char* argv[]) {
//
calc_lnl_means(catalog, pmap);
- int catalog_id, sample_id, tag_id;
- string file;
+ int catalog_id, tag_id;
//
// Process samples matched to the catalog, one by one.
//
- for (uint i = 0; i < catalog_matches.size(); i++) {
- sample_id = catalog_matches[i][0]->sample_id;
- file = samples[sample_id];
+ for (uint i = 0; i < mpopi.samples().size(); i++) {
+ const Sample& sample = mpopi.samples()[i];
- cerr << "Loading stacks from sample " << file << " [" << i+1 << " of " << catalog_matches.size() << "]...\n";
+ cerr << "Loading stacks from sample " << sample.name << " [" << i+1 << " of " << mpopi.samples().size() << "]...\n";
//////
//////
@@ -182,12 +200,12 @@ int main (int argc, char* argv[]) {
map<int, Locus *> stacks;
int res;
- if ((res = load_loci(in_path + file, stacks, true, true, compressed)) == 0) {
- cerr << "Unable to load sample file '" << file << "'\n";
+ if ((res = load_loci(in_path + sample.name, stacks, true, true, compressed)) == 0) {
+ cerr << "Unable to load sample file '" << sample.name << "'\n";
continue;
}
- cerr << "Making corrections to sample " << file << "...";
+ cerr << "Making corrections to sample " << sample.name << "...";
set<pair<int, int> > uniq_matches;
set<pair<int, int> >::iterator it;
@@ -262,7 +280,7 @@ int main (int argc, char* argv[]) {
continue;
}
- d = pmap->datum(catalog_id, sample_id);
+ d = pmap->datum(catalog_id, sample.id);
if (d == NULL) continue;
@@ -350,7 +368,7 @@ int main (int argc, char* argv[]) {
<< "Blacklisted: " << lnl_cnt << " loci due to log likelihoods below threshold.\n"
<< "Blacklisted: " << conf_loci_cnt << " confounded loci.\n";
- log_fh << file << "\t"
+ log_fh << sample.name << "\t"
<< nuc_cnt << "\t"
<< total << "\t"
<< unk_hom_cnt << "\t"
@@ -370,7 +388,7 @@ int main (int argc, char* argv[]) {
//
// Rewrite stacks, model outputs, and haplotypes.
//
- write_results(file, stacks);
+ write_results(sample.name, stacks);
//
// Free up memory
@@ -1300,7 +1318,7 @@ write_results(string file, map<int, Locus *> &m)
<< tag_1->id << "\t"
<< tag_1->loc.chr << "\t"
<< tag_1->loc.bp << "\t"
- << (tag_1->loc.strand == plus ? "+" : "-") << "\t"
+ << (tag_1->loc.strand == strand_plus ? "+" : "-") << "\t"
<< "consensus" << "\t"
<< "\t"
<< "\t"
@@ -1430,50 +1448,6 @@ write_results(string file, map<int, Locus *> &m)
return 0;
}
-int build_file_list(vector<pair<int, string> > &files) {
- vector<string> parts;
- string f;
-
- //
- // Read all the files from the Stacks directory.
- //
- uint pos;
- string file;
- struct dirent *direntry;
-
- DIR *dir = opendir(in_path.c_str());
-
- if (dir == NULL) {
- cerr << "Unable to open directory '" << in_path << "' for reading.\n";
- exit(1);
- }
-
- while ((direntry = readdir(dir)) != NULL) {
- file = direntry->d_name;
-
- if (file == "." || file == "..")
- continue;
-
- if (file.substr(0, 6) == "batch_")
- continue;
-
- pos = file.rfind(".tags.tsv");
- if (pos < file.length())
- files.push_back(make_pair(1, file.substr(0, pos)));
- }
-
- closedir(dir);
-
- if (files.size() == 0) {
- cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
- return 0;
- }
-
- cerr << "Found " << files.size() << " input file(s).\n";
-
- return 1;
-}
-
int
fill_catalog_snps(map<int, CSLocus *> &catalog)
{
diff --git a/src/smoothing.h b/src/smoothing.h
index 7c95913..b0ce73f 100644
--- a/src/smoothing.h
+++ b/src/smoothing.h
@@ -60,7 +60,8 @@ KSmooth<StatT>::smooth(vector<StatT *> &popstats)
// By default, sigma = 150Kb, for computational efficiency, only calculate average out to 3sigma.
//
#pragma omp parallel
- {
+ {
+ int limit = 3 * sigma;
int dist;
uint pos_l, pos_u;
double sum, final_weight;
@@ -94,16 +95,16 @@ KSmooth<StatT>::smooth(vector<StatT *> &popstats)
#pragma omp critical
{
cerr << "ERROR: current basepair is out of the sliding window.\n"
- << " Calculating sliding window; start position: " << pos_l << ", " << (popstats[pos_l] == NULL ? -1 : popstats[pos_l]->bp) << "bp; end position: "
- << pos_u << ", " << (popstats[pos_u] == NULL ? -1 : popstats[pos_u]->bp) << "bp; center: "
- << pos_c << ", " << popstats[pos_c]->bp << "bp\n"
- << " Current position: " << pos_p << ", " << popstats[pos_p]->bp << "; Dist: " << dist << "\n"
+ << " Calculating sliding window; start position: " << pos_l << ", " << (popstats[pos_l] == NULL ? -1 : popstats[pos_l]->bp +1) << "bp; end position: "
+ << pos_u << ", " << (popstats[pos_u] == NULL ? -1 : popstats[pos_u]->bp +1) << "bp; center: "
+ << pos_c << ", " << popstats[pos_c]->bp +1 << "bp\n"
+ << " Current position: " << pos_p << ", " << popstats[pos_p]->bp +1 << "; Dist: " << dist << "\n"
<< " Window positions:\n";
for (uint j = pos_l; j < pos_u; j++) {
p = popstats[j];
if (p == NULL) continue;
- cerr << " Position: " << j << "; " << p->bp << "bp\n";
+ cerr << " Position: " << j << "; " << p->bp +1 << "bp\n";
}
//exit(0);
}
diff --git a/src/smoothing_utils.h b/src/smoothing_utils.h
index 86b09e6..685e20d 100644
--- a/src/smoothing_utils.h
+++ b/src/smoothing_utils.h
@@ -26,8 +26,8 @@
using std::vector;
extern double sigma;
-int limit = 3 * sigma;
+inline
double *
calc_weights()
{
@@ -38,6 +38,7 @@ calc_weights()
// genetic statistic at position p to the region average was weighted by the Gaussian function:
// exp( (-1 * (p - c)^2) / (2 * sigma^2))
//
+ int limit = 3 * sigma;
double *weights = new double[limit + 1];
for (int i = 0; i <= limit; i++)
@@ -50,6 +51,7 @@ template<class StatT>
inline int
determine_window_limits(vector<StatT *> &sites, uint center_bp, uint &pos_l, uint &pos_u)
{
+ int limit = 3 * sigma;
int limit_l = center_bp - limit > 0 ? center_bp - limit : 0;
int limit_u = center_bp + limit;
diff --git a/src/sql_utilities.cc b/src/sql_utilities.cc
new file mode 100644
index 0000000..3143f2c
--- /dev/null
+++ b/src/sql_utilities.cc
@@ -0,0 +1,299 @@
+#include "sql_utilities.h"
+
+using namespace std;
+
+//
+// Read <sample>.matches.tsv, or <sample>.matches.tsv.gz when the plain file cannot be
+// opened, and append one CatMatch per data line to matches. Always returns 0.
+//
+int load_catalog_matches(string sample, vector<CatMatch *> &matches) {
+    CatMatch      *m;
+    string         f;
+    vector<string> parts;
+    long int       line_num = 0;
+    ifstream       fh;
+    gzFile         gz_fh = NULL;
+    // The line buffer comes from malloc(), so every exit path below hands it back with free().
+    char *line      = (char *) malloc(sizeof(char) * max_len);
+    int   size      = max_len;
+    int   cnt       = 0;
+    bool  gzip      = false;
+    int   fh_status = 1;
+
+    f = sample + ".matches.tsv";
+    fh.open(f.c_str(), ifstream::in);
+    if (fh.fail()) {
+        // Test for a gzipped file.
+        f = sample + ".matches.tsv.gz";
+        gz_fh = gzopen(f.c_str(), "rb");
+        if (!gz_fh) {
+            free(line);
+            return 0;
+        }
+        #if ZLIB_VERNUM >= 0x1240
+        gzbuffer(gz_fh, libz_buffer_size);
+        #endif
+        gzip = true;
+    }
+    cerr << " Parsing " << f.c_str() << "\n";
+
+    while (fh_status) {
+        fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
+        line_num++;  // starts at 0, so the first line read is reported as line 1
+
+        if (!fh_status && strlen(line) == 0)
+            continue;
+        if (is_comment(line)) continue;
+
+        parse_tsv(line, parts);
+        cnt = parts.size();
+
+        if (cnt != num_matches_fields && cnt != num_matches_fields - 1) {
+            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
+            break;  // stop loading, but still fall through to the cleanup below
+        }
+
+        m = new CatMatch;
+        m->batch_id  = atoi(parts[1].c_str());
+        m->cat_id    = atoi(parts[2].c_str());
+        m->sample_id = atoi(parts[3].c_str());
+        m->tag_id    = atoi(parts[4].c_str());
+        m->haplotype = new char[parts[5].length() + 1];
+        strcpy(m->haplotype, parts[5].c_str());
+        m->depth     = atoi(parts[6].c_str());
+        m->lnl       = is_double(parts[7].c_str());
+
+        if (cnt == num_matches_fields && parts[8].length() > 0) {
+            m->cigar = new char[parts[8].length() + 1];
+            strcpy(m->cigar, parts[8].c_str());
+        }
+
+        matches.push_back(m);
+    }
+
+    if (gzip)
+        gzclose(gz_fh);
+    else
+        fh.close();
+    free(line);
+    return 0;
+}
+
+//
+// Load the model calls for a sample into modres, keyed by tag ID. Candidate files are tried
+// in order: .models.tsv, .models.tsv.gz, .tags.tsv, .tags.tsv.gz. Only records whose seventh
+// field is "model" are stored. Returns 1 on success, 0 if no file could be opened or a line
+// does not have num_tags_fields fields.
+//
+int load_model_results(string sample, map<int, ModRes *> &modres) {
+    string         f;
+    vector<string> parts;
+    long int       line_num = 0;
+    ifstream       fh;
+    gzFile         gz_fh = NULL;
+
+    char *line      = (char *) malloc(sizeof(char) * max_len);
+    int   size      = max_len;
+    bool  gzip      = false;
+    bool  open_fail = false;
+    int   fh_status = 1;
+    int   ok        = 1;
+
+    //
+    // Parse the models file (if it exists), otherwise parse the tag file to
+    // pull in the model calls for each locus.
+    //
+    f = sample + ".models.tsv";
+    fh.open(f.c_str(), ifstream::in);
+    if (fh.fail())
+        open_fail = true;
+
+    if (open_fail) {
+        //
+        // Test for a gzipped MODELs file.
+        //
+        f = sample + ".models.tsv.gz";
+        gz_fh = gzopen(f.c_str(), "rb");
+        if (gz_fh) {
+            open_fail = false;
+            #if ZLIB_VERNUM >= 0x1240
+            gzbuffer(gz_fh, libz_buffer_size);
+            #endif
+            gzip = true;
+        }
+    }
+
+    if (open_fail) {
+        //
+        // Test for a TAGs file. The failed open() above left failbit set on fh; reset it
+        // explicitly, since open() itself only clears the state from C++11 on.
+        //
+        f = sample + ".tags.tsv";
+        fh.clear();
+        fh.open(f.c_str(), ifstream::in);
+        open_fail = fh.fail();
+    }
+
+    if (open_fail) {
+        //
+        // Test for a gzipped TAGs file.
+        //
+        f = sample + ".tags.tsv.gz";
+        gz_fh = gzopen(f.c_str(), "rb");
+        if (gz_fh) {
+            open_fail = false;
+            #if ZLIB_VERNUM >= 0x1240
+            gzbuffer(gz_fh, libz_buffer_size);
+            #endif
+            gzip = true;
+        }
+    }
+
+    if (open_fail) {
+        cerr << " Unable to open '" << sample << "'\n";
+        free(line);
+        return 0;
+    }
+
+    cerr << " Parsing " << f.c_str() << "\n";
+
+    ModRes *mod;
+    uint    tag_id, samp_id;
+
+    while (fh_status) {
+        fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
+        line_num++;  // starts at 0, so the first line read is reported as line 1
+
+        if (!fh_status && strlen(line) == 0)
+            continue;
+        if (is_comment(line)) continue;
+
+        parse_tsv(line, parts);
+
+        if (parts.size() != num_tags_fields) {
+            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
+            ok = 0;
+            break;
+        }
+
+        //
+        // Read the model sequence, a series of letters specifying if the model called a
+        // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
+        //
+        if (parts[6] != "model") continue;
+
+        samp_id = atoi(parts[1].c_str());
+        tag_id  = atoi(parts[2].c_str());
+        mod     = new ModRes(samp_id, tag_id, parts[9].c_str());
+
+        modres[tag_id] = mod;
+    }
+
+    // Shared cleanup for the success and parse-error paths. The line buffer was obtained
+    // with malloc(), so it is released with free() rather than delete [].
+    if (gzip)
+        gzclose(gz_fh);
+    else
+        fh.close();
+    free(line);
+    return ok;
+}
+
+//
+// Load the SNP calls for a sample from <sample>.snps.tsv (or the .gz variant) into snpres,
+// collecting the SNP objects under one SNPRes per ID (third field). A line has num_snps_fields
+// fields, or two fewer when the rank_3/rank_4 columns are absent. Returns 1 on success,
+// 0 if the file cannot be opened or a line has an unexpected number of fields.
+//
+int load_snp_calls(string sample, map<int, SNPRes *> &snpres) {
+    string         f;
+    int            id, samp_id;
+    vector<string> parts;
+    long int       line_num = 0;
+    SNP           *snp;
+    SNPRes        *snpr;
+    ifstream       fh;
+    gzFile         gz_fh = NULL;
+
+    char *line      = (char *) malloc(sizeof(char) * max_len);
+    int   size      = max_len;
+    bool  gzip      = false;
+    int   fh_status = 1;
+    int   ok        = 1;
+
+    // Parse the SNP file
+    f = sample + ".snps.tsv";
+    fh.open(f.c_str(), ifstream::in);
+    if (fh.fail()) {
+        // Test for a gzipped file.
+        f = sample + ".snps.tsv.gz";
+        gz_fh = gzopen(f.c_str(), "rb");
+        if (!gz_fh) {
+            cerr << " Unable to open '" << sample << "'\n";
+            free(line);
+            return 0;
+        }
+        #if ZLIB_VERNUM >= 0x1240
+        gzbuffer(gz_fh, libz_buffer_size);
+        #endif
+        gzip = true;
+    }
+    cerr << " Parsing " << f.c_str() << "\n";
+
+    while (fh_status) {
+        fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
+        line_num++;  // counted for every line read, skipped comments included
+
+        if (!fh_status && strlen(line) == 0)
+            continue;
+        if (is_comment(line)) continue;
+
+        parse_tsv(line, parts);
+
+        if (parts.size() != num_snps_fields && parts.size() != num_snps_fields - 2) {
+            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
+            ok = 0;
+            break;
+        }
+
+        samp_id = atoi(parts[1].c_str());
+        id      = atoi(parts[2].c_str());
+        snp      = new SNP;
+        snp->col = atoi(parts[3].c_str());
+
+        if (parts[4] == "O")
+            snp->type = snp_type_hom;
+        else if (parts[4] == "E")
+            snp->type = snp_type_het;
+        else
+            snp->type = snp_type_unk;
+
+        snp->lratio = atof(parts[5].c_str());
+        snp->rank_1 = parts[6].at(0);
+        snp->rank_2 = parts[7].at(0) == '-' ? 0 : parts[7].at(0);
+
+        if (parts.size() == num_snps_fields) {
+            if (parts[8].length() == 0 || parts[8].at(0) == '-')
+                snp->rank_3 = 0;
+            else
+                snp->rank_3 = parts[8].at(0);
+            if (parts[9].length() == 0 || parts[9].at(0) == '-')
+                snp->rank_4 = 0;
+            else
+                snp->rank_4 = parts[9].at(0);
+        }
+
+        if (snpres.count(id) == 0) {
+            snpr = new SNPRes(samp_id, id);
+            snpres[id] = snpr;
+        }
+        snpres[id]->snps.push_back(snp);
+    }
+
+    if (gzip)
+        gzclose(gz_fh);
+    else
+        fh.close();
+    free(line);  // buffer came from malloc(), so free() it rather than delete []
+    return ok;
+}
diff --git a/src/sql_utilities.h b/src/sql_utilities.h
index 8fc23d1..9b92b8f 100644
--- a/src/sql_utilities.h
+++ b/src/sql_utilities.h
@@ -35,6 +35,10 @@ const uint num_snps_fields = 10;
const uint num_alleles_fields = 6;
const uint num_matches_fields = 9;
+int load_catalog_matches(string sample, vector<CatMatch *> &matches);
+int load_model_results(string sample, map<int, ModRes *> &modres);
+int load_snp_calls(string sample, map<int, SNPRes *> &snpres);
+
template <class LocusT>
int
load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_all_model_calls, bool &compressed)
@@ -221,7 +225,7 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
//
// Parse the physical genome location of this locus.
//
- c->loc.set(parts[3].c_str(), atoi(parts[4].c_str()), (parts[5] == "+" ? plus : minus));
+ c->loc.set(parts[3].c_str(), atoi(parts[4].c_str()), (parts[5] == "+" ? strand_plus : strand_minus));
//
// Parse the components of this stack (either the Illumina ID, or the catalog constituents)
@@ -423,7 +427,7 @@ int dump_loci(map<int, LocusT *> &u) {
cerr << "Locus ID: " << i->second->id << "\n"
<< " Consensus: " << i->second->con << "\n"
- << " Genomic Location: " << i->second->loc.chr << "; " << i->second->loc.bp << "bp\n"
+ << " Genomic Location: " << i->second->loc.chr << "; " << i->second->loc.bp +1 << "bp\n"
<< " SNPs:\n";
for (s = i->second->snps.begin(); s != i->second->snps.end(); s++)
@@ -435,301 +439,4 @@ int dump_loci(map<int, LocusT *> &u) {
return 0;
}
-int load_catalog_matches(string sample, vector<CatMatch *> &matches) {
- CatMatch *m;
- string f;
- vector<string> parts;
- long int line_num;
- ifstream fh;
- gzFile gz_fh;
-
- char *line = (char *) malloc(sizeof(char) * max_len);
- int size = max_len;
- int cnt = 0;
- bool gzip = false;
- int fh_status = 1;
-
- f = sample + ".matches.tsv";
- fh.open(f.c_str(), ifstream::in);
- if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".matches.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
- }
- cerr << " Parsing " << f.c_str() << "\n";
-
- line_num = 1;
- while (fh_status) {
- fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
- line_num++;
-
- if (!fh_status && strlen(line) == 0)
- continue;
-
- if (is_comment(line)) continue;
-
- parse_tsv(line, parts);
-
- cnt = parts.size();
-
- if (cnt != num_matches_fields && cnt != num_matches_fields - 1) {
- cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
- return 0;
- }
-
- m = new CatMatch;
- m->batch_id = atoi(parts[1].c_str());
- m->cat_id = atoi(parts[2].c_str());
- m->sample_id = atoi(parts[3].c_str());
- m->tag_id = atoi(parts[4].c_str());
- m->haplotype = new char[parts[5].length() + 1];
- strcpy(m->haplotype, parts[5].c_str());
- m->depth = atoi(parts[6].c_str());
- m->lnl = is_double(parts[7].c_str());
-
- if (cnt == num_matches_fields && parts[8].length() > 0) {
- m->cigar = new char[parts[8].length() + 1];
- strcpy(m->cigar, parts[8].c_str());
- }
-
- matches.push_back(m);
- }
-
- if (gzip)
- gzclose(gz_fh);
- else
- fh.close();
-
- return 0;
-}
-
-int load_model_results(string sample, map<int, ModRes *> &modres) {
- string f;
- vector<string> parts;
- long int line_num;
- ifstream fh;
- gzFile gz_fh;
-
- char *line = (char *) malloc(sizeof(char) * max_len);
- int size = max_len;
- bool gzip = false;
- bool open_fail = false;
- int fh_status = 1;
-
- //
- // Parse the models file (if it exists), otherwise parse the tag file to
- // pull in the model calls for each locus.
- //
- gzip = false;
- fh_status = 1;
- line_num = 1;
-
- f = sample + ".models.tsv";
- fh.open(f.c_str(), ifstream::in);
- if (fh.fail())
- open_fail = true;
-
- if (open_fail) {
- //
- // Test for a gzipped MODELs file.
- //
- f = sample + ".models.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- open_fail = true;
- } else {
- open_fail = false;
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
- }
- }
-
- if (open_fail) {
- //
- // Test for a TAGs file.
- //
- f = sample + ".tags.tsv";
- fh.open(f.c_str(), ifstream::in);
- if (fh.fail())
- open_fail = true;
- else
- open_fail = false;
- }
-
- if (open_fail) {
- //
- // Test for a gzipped TAGs file.
- //
- f = sample + ".tags.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- open_fail = true;
- } else {
- open_fail = false;
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
- }
- }
-
- if (open_fail) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
-
- cerr << " Parsing " << f.c_str() << "\n";
-
- ModRes *mod;
- uint tag_id, samp_id;
-
- while (fh_status) {
- fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
- line_num++;
-
- if (!fh_status && strlen(line) == 0)
- continue;
- if (is_comment(line)) continue;
-
- parse_tsv(line, parts);
-
- if (parts.size() != num_tags_fields) {
- cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
- return 0;
- }
-
- //
- // Read the model sequence, a series of letters specifying if the model called a
- // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
- //
- if (parts[6] != "model") continue;
-
- samp_id = atoi(parts[1].c_str());
- tag_id = atoi(parts[2].c_str());
- mod = new ModRes(samp_id, tag_id, parts[9].c_str());
-
- modres[tag_id] = mod;
- }
-
- if (gzip)
- gzclose(gz_fh);
- else
- fh.close();
-
- delete [] line;
-
- return 1;
-}
-
-int load_snp_calls(string sample, map<int, SNPRes *> &snpres) {
- string f;
- int id, samp_id;
- vector<string> parts;
- long int line_num;
- SNP *snp;
- SNPRes *snpr;
- ifstream fh;
- gzFile gz_fh;
-
- char *line = (char *) malloc(sizeof(char) * max_len);
- int size = max_len;
- bool gzip = false;
- int fh_status = 1;
-
- //
- // Parse the SNP file
- //
- f = sample + ".snps.tsv";
- fh.open(f.c_str(), ifstream::in);
- if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".snps.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
- }
- cerr << " Parsing " << f.c_str() << "\n";
-
- line_num = 1;
- while (fh_status) {
- fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
-
- if (!fh_status && strlen(line) == 0)
- continue;
- if (is_comment(line)) continue;
-
- parse_tsv(line, parts);
-
- if (parts.size() != num_snps_fields && parts.size() != num_snps_fields - 2) {
- cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
- return 0;
- }
-
- samp_id = atoi(parts[1].c_str());
- id = atoi(parts[2].c_str());
-
- snp = new SNP;
- snp->col = atoi(parts[3].c_str());
-
- if (parts[4] == "O")
- snp->type = snp_type_hom;
- else if (parts[4] == "E")
- snp->type = snp_type_het;
- else
- snp->type = snp_type_unk;
-
- snp->lratio = atof(parts[5].c_str());
- snp->rank_1 = parts[6].at(0);
- snp->rank_2 = parts[7].at(0) == '-' ? 0 : parts[7].at(0);
-
- if (parts.size() == 10) {
- if (parts[8].length() == 0 || parts[8].at(0) == '-')
- snp->rank_3 = 0;
- else
- snp->rank_3 = parts[8].at(0);
- if (parts[9].length() == 0 || parts[9].at(0) == '-')
- snp->rank_4 = 0;
- else
- snp->rank_4 = parts[9].at(0);
- }
-
- if (snpres.count(id) == 0) {
- snpr = new SNPRes(samp_id, id);
- snpres[id] = snpr;
- }
- snpres[id]->snps.push_back(snp);
-
- line_num++;
- }
-
- if (gzip)
- gzclose(gz_fh);
- else
- fh.close();
-
- delete [] line;
-
- return 1;
-}
-
#endif // __SQL_UTILITIES_H__
diff --git a/src/sstacks.cc b/src/sstacks.cc
index 3b10560..d51331f 100644
--- a/src/sstacks.cc
+++ b/src/sstacks.cc
@@ -167,7 +167,7 @@ find_matches_by_genomic_loc(map<int, Locus *> &sample_1, map<int, QLocus *> &sam
snprintf(id, id_len - 1, "%s|%d|%c",
j->second->loc.chr,
j->second->loc.bp,
- j->second->loc.strand == plus ? '+' : '-');
+ j->second->loc.strand == strand_plus ? '+' : '-');
locations[id].insert(j->second->id);
}
@@ -198,7 +198,7 @@ find_matches_by_genomic_loc(map<int, Locus *> &sample_1, map<int, QLocus *> &sam
snprintf(id, id_len - 1, "%s|%d|%c",
i->second->loc.chr,
i->second->loc.bp,
- i->second->loc.strand == plus ? '+' : '-');
+ i->second->loc.strand == strand_plus ? '+' : '-');
if (locations.count(id) > 0) {
Locus *tag;
@@ -830,7 +830,7 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
// generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
//
uniq_kmers.clear();
- for (uint j = 0; j < num_kmers; j++)
+ for (int j = 0; j < num_kmers; j++)
uniq_kmers.insert(kmers[j]);
hits.clear();
@@ -869,7 +869,7 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
hit_cnt = 0;
allele_id = prev_id;
- while (hits[index] == prev_id) {
+ while ((uint)hits[index] == prev_id) {
hit_cnt++;
index++;
}
@@ -877,7 +877,7 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
if (index < hits_size)
prev_id = hits[index];
- if (hit_cnt >= min_hits)
+ if (hit_cnt >= (uint)min_hits)
ordered_hits.push_back(make_pair(allele_id, hit_cnt));
} while (index < hits_size);
@@ -896,7 +896,7 @@ search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
top_hit = ordered_hits[0].second;
stop = 1;
for (uint j = 1; j < ordered_hits.size(); j++)
- if (ordered_hits[j].second < top_hit) {
+ if ((uint)ordered_hits[j].second < top_hit) {
stop = j;
break;
}
@@ -1025,7 +1025,7 @@ verify_gapped_match(map<int, Locus *> &catalog, QLocus *query,
qseq = apply_cigar_to_seq(query->con, cigar);
query->add_consensus(qseq.c_str());
- int min_tag_len = query_len > cat->len ? query_len : cat->len;
+ int min_tag_len = (uint)query_len > cat->len ? query_len : cat->len;
vector<SNP *>::iterator i, j;
bool found;
@@ -1035,7 +1035,7 @@ verify_gapped_match(map<int, Locus *> &catalog, QLocus *query,
//
// SNP occurs in a column that is beyond the length of the catalog
//
- if ((*i)->col > min_tag_len - 1)
+ if ((int)(*i)->col > min_tag_len - 1)
continue;
for (j = cat->snps.begin(); j != cat->snps.end(); j++) {
diff --git a/src/stacks.h b/src/stacks.h
index dc9958d..d9b0ba0 100644
--- a/src/stacks.h
+++ b/src/stacks.h
@@ -49,7 +49,7 @@ typedef string allele_type;
enum snp_type {snp_type_het, snp_type_hom, snp_type_unk};
enum read_type {primary, secondary};
-enum strand_type {plus, minus};
+enum strand_type {strand_plus, strand_minus};
enum searcht {sequence, genomic_loc};
class PhyLoc {
@@ -69,12 +69,12 @@ public:
PhyLoc() {
chr = NULL;
bp = 0;
- strand = plus;
+ strand = strand_plus;
}
PhyLoc(const char *chr, uint bp) {
this->chr = new char[strlen(chr) + 1];
this->bp = bp;
- this->strand = plus;
+ this->strand = strand_plus;
strcpy(this->chr, chr);
}
PhyLoc(const char *chr, uint bp, strand_type strnd) {
diff --git a/src/ustacks.cc b/src/ustacks.cc
index b0bd7ae..f46d584 100644
--- a/src/ustacks.cc
+++ b/src/ustacks.cc
@@ -246,7 +246,7 @@ merge_gapped_alns(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Merg
if (tag_2->alns.size() > 1 && tag_2->alns[0].pct_id == tag_2->alns[1].pct_id)
continue;
- if (tag_1->id != tag_2->alns[0].id)
+ if (tag_1->id != (int)tag_2->alns[0].id)
continue;
cigar_1 = invert_cigar(tag_1->alns[0].cigar);
@@ -557,7 +557,7 @@ search_for_gaps(map<int, MergedStack *> &merged, double min_match_len)
//
// Free the k-mers we generated for this query.
//
- for (int j = 0; j < query_kmers.size(); j++)
+ for (uint j = 0; j < query_kmers.size(); j++)
delete [] query_kmers[j];
query_kmers.clear();
@@ -633,7 +633,7 @@ merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem)
//
// Lookup the occurances of each remainder k-mer in the MergedStack k-mer map
//
- for (uint k = 0; k < num_kmers; k++) {
+ for (int k = 0; k < num_kmers; k++) {
h = kmer_map.find(rem_kmers[k]);
if (h != kmer_map.end())
@@ -1591,7 +1591,7 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
//
// Free the k-mers we generated for this query
//
- for (int j = 0; j < query_kmers.size(); j++)
+ for (uint j = 0; j < query_kmers.size(); j++)
delete [] query_kmers[j];
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/stacks.git
More information about the debian-med-commit
mailing list