[med-svn] [wgs-assembler] 01/02: Imported Upstream version 8.1
Afif Elghraoui
afif-guest at moszumanska.debian.org
Sun May 3 06:06:20 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository wgs-assembler.
commit 1c238c01fafaf9e2ccc828448d2ea01f63721a3a
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Sat May 2 23:01:36 2015 -0700
Imported Upstream version 8.1
---
samtools | 1 +
samtools-0.1.19/.gitignore | 4 +
samtools-0.1.19/AUTHORS | 20 +
samtools-0.1.19/COPYING | 21 +
samtools-0.1.19/ChangeLog.old | 3875 +++++++++++++++++++++++++++++++
samtools-0.1.19/INSTALL | 30 +
samtools-0.1.19/Makefile | 101 +
samtools-0.1.19/Makefile.mingw | 63 +
samtools-0.1.19/NEWS | 836 +++++++
samtools-0.1.19/bam.c | 474 ++++
samtools-0.1.19/bam.h | 793 +++++++
samtools-0.1.19/bam2bcf.c | 467 ++++
samtools-0.1.19/bam2bcf.h | 67 +
samtools-0.1.19/bam2bcf_indel.c | 498 ++++
samtools-0.1.19/bam2depth.c | 143 ++
samtools-0.1.19/bam_aux.c | 217 ++
samtools-0.1.19/bam_cat.c | 185 ++
samtools-0.1.19/bam_color.c | 145 ++
samtools-0.1.19/bam_endian.h | 42 +
samtools-0.1.19/bam_import.c | 489 ++++
samtools-0.1.19/bam_index.c | 726 ++++++
samtools-0.1.19/bam_lpileup.c | 198 ++
samtools-0.1.19/bam_mate.c | 128 +
samtools-0.1.19/bam_md.c | 389 ++++
samtools-0.1.19/bam_pileup.c | 437 ++++
samtools-0.1.19/bam_plcmd.c | 606 +++++
samtools-0.1.19/bam_reheader.c | 62 +
samtools-0.1.19/bam_rmdup.c | 206 ++
samtools-0.1.19/bam_rmdupse.c | 159 ++
samtools-0.1.19/bam_sort.c | 571 +++++
samtools-0.1.19/bam_stat.c | 77 +
samtools-0.1.19/bam_tview.c | 368 +++
samtools-0.1.19/bam_tview.h | 75 +
samtools-0.1.19/bam_tview_curses.c | 297 +++
samtools-0.1.19/bam_tview_html.c | 349 +++
samtools-0.1.19/bamshuf.c | 141 ++
samtools-0.1.19/bamtk.c | 119 +
samtools-0.1.19/bcftools/Makefile | 51 +
samtools-0.1.19/bcftools/README | 36 +
samtools-0.1.19/bcftools/bcf.c | 396 ++++
samtools-0.1.19/bcftools/bcf.h | 197 ++
samtools-0.1.19/bcftools/bcf.tex | 77 +
samtools-0.1.19/bcftools/bcf2qcall.c | 91 +
samtools-0.1.19/bcftools/bcfutils.c | 504 ++++
samtools-0.1.19/bcftools/call1.c | 633 +++++
samtools-0.1.19/bcftools/em.c | 310 +++
samtools-0.1.19/bcftools/fet.c | 112 +
samtools-0.1.19/bcftools/index.c | 336 +++
samtools-0.1.19/bcftools/kfunc.c | 162 ++
samtools-0.1.19/bcftools/kmin.c | 209 ++
samtools-0.1.19/bcftools/kmin.h | 46 +
samtools-0.1.19/bcftools/main.c | 191 ++
samtools-0.1.19/bcftools/mut.c | 127 +
samtools-0.1.19/bcftools/prob1.c | 988 ++++++++
samtools-0.1.19/bcftools/prob1.h | 49 +
samtools-0.1.19/bcftools/vcf.c | 249 ++
samtools-0.1.19/bcftools/vcfutils.pl | 567 +++++
samtools-0.1.19/bedcov.c | 127 +
samtools-0.1.19/bedidx.c | 162 ++
samtools-0.1.19/bgzf.c | 694 ++++++
samtools-0.1.19/bgzf.h | 207 ++
samtools-0.1.19/bgzip.c | 206 ++
samtools-0.1.19/cut_target.c | 193 ++
samtools-0.1.19/errmod.c | 130 ++
samtools-0.1.19/errmod.h | 24 +
samtools-0.1.19/examples/00README.txt | 23 +
samtools-0.1.19/examples/Makefile | 50 +
samtools-0.1.19/examples/bam2bed.c | 51 +
samtools-0.1.19/examples/calDepth.c | 62 +
samtools-0.1.19/examples/chk_indel.c | 83 +
samtools-0.1.19/examples/ex1.fa | 56 +
samtools-0.1.19/examples/ex1.sam.gz | Bin 0 -> 114565 bytes
samtools-0.1.19/examples/toy.fa | 4 +
samtools-0.1.19/examples/toy.sam | 14 +
samtools-0.1.19/faidx.c | 437 ++++
samtools-0.1.19/faidx.h | 103 +
samtools-0.1.19/kaln.c | 486 ++++
samtools-0.1.19/kaln.h | 67 +
samtools-0.1.19/khash.h | 528 +++++
samtools-0.1.19/klist.h | 96 +
samtools-0.1.19/knetfile.c | 632 +++++
samtools-0.1.19/knetfile.h | 75 +
samtools-0.1.19/kprobaln.c | 280 +++
samtools-0.1.19/kprobaln.h | 49 +
samtools-0.1.19/kseq.h | 235 ++
samtools-0.1.19/ksort.h | 285 +++
samtools-0.1.19/kstring.c | 212 ++
samtools-0.1.19/kstring.h | 169 ++
samtools-0.1.19/misc/HmmGlocal.java | 178 ++
samtools-0.1.19/misc/Makefile | 69 +
samtools-0.1.19/misc/ace2sam.c | 249 ++
samtools-0.1.19/misc/bamcheck.c | 1521 ++++++++++++
samtools-0.1.19/misc/blast2sam.pl | 92 +
samtools-0.1.19/misc/bowtie2sam.pl | 92 +
samtools-0.1.19/misc/export2sam.pl | 545 +++++
samtools-0.1.19/misc/interpolate_sam.pl | 125 +
samtools-0.1.19/misc/maq2sam.c | 173 ++
samtools-0.1.19/misc/md5.c | 296 +++
samtools-0.1.19/misc/md5.h | 57 +
samtools-0.1.19/misc/md5fa.c | 58 +
samtools-0.1.19/misc/novo2sam.pl | 281 +++
samtools-0.1.19/misc/plot-bamcheck | 882 +++++++
samtools-0.1.19/misc/psl2sam.pl | 65 +
samtools-0.1.19/misc/r2plot.lua | 83 +
samtools-0.1.19/misc/sam2vcf.pl | 270 +++
samtools-0.1.19/misc/samtools.pl | 528 +++++
samtools-0.1.19/misc/soap2sam.pl | 109 +
samtools-0.1.19/misc/varfilter.py | 205 ++
samtools-0.1.19/misc/vcfutils.lua | 694 ++++++
samtools-0.1.19/misc/wgsim.c | 419 ++++
samtools-0.1.19/misc/wgsim_eval.pl | 91 +
samtools-0.1.19/misc/zoom2sam.pl | 97 +
samtools-0.1.19/padding.c | 479 ++++
samtools-0.1.19/phase.c | 687 ++++++
samtools-0.1.19/razf.c | 853 +++++++
samtools-0.1.19/razf.h | 134 ++
samtools-0.1.19/razip.c | 141 ++
samtools-0.1.19/sam.c | 186 ++
samtools-0.1.19/sam.h | 99 +
samtools-0.1.19/sam_header.c | 810 +++++++
samtools-0.1.19/sam_header.h | 48 +
samtools-0.1.19/sam_view.c | 441 ++++
samtools-0.1.19/sample.c | 107 +
samtools-0.1.19/sample.h | 17 +
samtools-0.1.19/samtools.1 | 1066 +++++++++
samtools-0.1.19/win32/libcurses.a | Bin 0 -> 114764 bytes
samtools-0.1.19/win32/libz.a | Bin 0 -> 74266 bytes
samtools-0.1.19/win32/xcurses.h | 1377 +++++++++++
samtools-0.1.19/win32/zconf.h | 332 +++
samtools-0.1.19/win32/zlib.h | 1357 +++++++++++
130 files changed, 39763 insertions(+)
diff --git a/samtools b/samtools
new file mode 120000
index 0000000..3bfd233
--- /dev/null
+++ b/samtools
@@ -0,0 +1 @@
+samtools-0.1.19
\ No newline at end of file
diff --git a/samtools-0.1.19/.gitignore b/samtools-0.1.19/.gitignore
new file mode 100644
index 0000000..bb605d4
--- /dev/null
+++ b/samtools-0.1.19/.gitignore
@@ -0,0 +1,4 @@
+*.o
+.*.swp
+*.a
+*.dSYM
diff --git a/samtools-0.1.19/AUTHORS b/samtools-0.1.19/AUTHORS
new file mode 100644
index 0000000..95afabb
--- /dev/null
+++ b/samtools-0.1.19/AUTHORS
@@ -0,0 +1,20 @@
+Heng Li from the Sanger Institute wrote most of the initial source code
+of SAMtools and various converters.
+
+Bob Handsaker from the Broad Institute is a major contributor to the
+SAM/BAM specification. He designed and implemented the BGZF format, the
+underlying indexable compression format for the BAM format. BGZF does
+not support arithmetic between file offsets.
+
+Jue Ruan from the Beijing Genome Institute designed and implemented the
+RAZF format, an alternative indexable compression format. RAZF supports
+arithmetic between file offsets, at the cost of increased index file
+size and the full compatibility with gzip. RAZF is optional and only
+used in `faidx' for indexing RAZF compressed fasta files.
+
+Colin Hercus updated novo2sam.pl to support gapped alignment by
+novoalign.
+
+Petr Danecek contributed the header parsing library sam_header.c and
+sam2vcf.pl script and added knet support to the RAZF library.
+
diff --git a/samtools-0.1.19/COPYING b/samtools-0.1.19/COPYING
new file mode 100644
index 0000000..82fa2f4
--- /dev/null
+++ b/samtools-0.1.19/COPYING
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2008-2009 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/samtools-0.1.19/ChangeLog.old b/samtools-0.1.19/ChangeLog.old
new file mode 100644
index 0000000..19aefae
--- /dev/null
+++ b/samtools-0.1.19/ChangeLog.old
@@ -0,0 +1,3875 @@
+commit db2ad3e19068cbafde72ecde75d0638bbb3598ba
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 16 14:45:17 2012 -0500
+
+ removed downsample.c
+
+commit 6c55c576903992c6fef148fe3b606fbc8bd10655
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 16 14:45:06 2012 -0500
+
+ print to output
+
+commit db1044a34e6049c87eaa63c39ed6e56f03e7d4c1
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 16 14:39:34 2012 -0500
+
+ removed sample
+
+ Downsampling already exists in "view". View also keeps pairing while "sample" does not.
+
+commit ffdeed3e5d4a530bfdf6f9ba97fff0ba7add6cba
+Merge: 2daad7b accf026
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 16 14:22:15 2012 -0500
+
+ Merge branch 'master' of github.com:lh3/samtools
+
+commit accf0260fd1117e10047344345d40b31a9ec31bb
+Merge: 9134e0d c554160
+Author: Heng Li <lh3 at me.com>
+Date: Thu Feb 16 11:21:14 2012 -0800
+
+ Merge pull request #8 from nh13/master
+
+ Patches
+
+commit c554160df16ec7748cfdda4c7b54c641be7b809f
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 14:06:52 2012 -0500
+
+ * more README.md work
+
+commit 2a81ffe349208d917666808fbc9f3041e0cb57de
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 14:06:10 2012 -0500
+
+ * more README work
+
+commit fb3125f732715f62cded8685a23a002a96ce009b
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 14:05:19 2012 -0500
+
+ * more README work
+
+commit 444d41002c37e1c3d0f9208b4a88126c47276386
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 14:02:13 2012 -0500
+
+ * updating README
+
+commit dec53cb1043fe7efadfde75fa2fd39b76de22e54
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 13:55:01 2012 -0500
+
+ updating the README for markdown syntax
+
+commit 798da18c346dca8ec6005582a0ddb1d5420b04ca
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 13:48:35 2012 -0500
+
+ adding a README with the current differences between this repository and
+ the official one
+
+commit 4d22d86c0f28636662f2144a88cd168e104c4275
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 13:35:03 2012 -0500
+
+ adding "samtools sample" to the main
+
+commit 893c25a37c21005dc42f45d45e9ad78ddc5f29bb
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 13:33:51 2012 -0500
+
+ * removing some compile flags to work with OS X
+
+commit 7ac22f72fdc32edd5c24af6baebfa7db5faf8e7b
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:47:14 2012 -0500
+
+ Check write filehandle after opening for write. tamw/tamr is a union type, so change is only semantic.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit fef53330416631690f60fdff42b6e43d764170dc
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:44:59 2012 -0500
+
+ Catch and report invalid BAM header, instead of segfaulting later on.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit 5cc013fe4930bf9b6e7963aab1cd4a3c94f695bc
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:44:16 2012 -0500
+
+ Add downsample to examples.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit b3fa9e7071532905a81dc7aa48eadc24b8c8846b
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:43:48 2012 -0500
+
+ Adjust for leading hard clip on colorspace reads.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit 1a9296c1389469d1c1db5b8069f0e11ffcc8abb2
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:42:52 2012 -0500
+
+ Add samtools sample command, contributed by Davide Cittaro <davide.cittaro at ifom-ieo-campus.it>.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit 2a804f3379748aeba944f1dec306dd726ff3235e
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:42:07 2012 -0500
+
+ Add samtools qa command, contributed by Roman Valls Guimera <roman.valls.guimera at scilifelab.se>.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit 0f3207fe8fd93e44d40fcf57204079c8c06d24a6
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:39:08 2012 -0500
+
+ Makefile cleanup - allow CC, CFLAGS, LDFLAGS to be passed on make command line. Use LDFLAGS in samtools compile.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit 6e7df604025f6a86881bf7f4a16f30e15d31538a
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:31:15 2012 -0500
+
+ Allow max_mem for sort to be specified with units.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit f12ebcaf6e60d34180a27d70e09b743cef140b98
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:29:11 2012 -0500
+
+ Allow user defined [lowercase] tags in header elements.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit 50b931fa3312dc109537a4260698ddecd0f06a05
+Author: Jonathan Manning <jonathan.manning at lifetech.com>
+Date: Thu Feb 16 10:27:11 2012 -0500
+
+ Check lowerbound in text entry box to avoid segfault in tview. Remove redundant call to bam_aux_get.
+
+ Signed-off-by: Nils Homer <nils.homer at lifetech.com>
+
+commit 5e729da5190949a813d20d329eab7ddb661816bd
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 10:31:48 2012 -0500
+
+ * fixing overflow/underflow in integer parsing
+
+commit fa50a4330b9abedaf07c26e13d31f05e57f1d319
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 10:30:40 2012 -0500
+
+ * updating help message for samtools depth
+
+commit 79e52c9624b6dd3bdfdf439f4b4bc6f774c230a4
+Author: Nils Homer <nils.homer at lifetech.com>
+Date: Thu Feb 16 10:29:32 2012 -0500
+
+ * adding support for outputting a circos histogram file in "samtools depth". Use
+ the "-c/-B" options.
+
+commit 2daad7b52daa86561c0fb65fe366691fad9f5ed3
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 16 09:31:57 2012 -0500
+
+ bugfix: wrong SP; missing DV in the VCF hdr
+
+commit 9134e0d5047c281ef3bd53da91771d4814a5131c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 8 11:19:12 2012 -0500
+
+ missing support of DV
+
+commit 34ebf12078c1d1015a0b8b9a9221243a60b22893
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 8 11:08:56 2012 -0500
+
+ new BCF DV format: number of variant reads
+
+commit 9589d3312fa2d076f48bdd68e2a5edd419c8070c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Jan 10 10:30:27 2012 -0500
+
+ scale depth to quality (hidden option)
+
+commit 704473e14668333ecaca5fb7b238af405c43e3b1
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Jan 10 10:18:17 2012 -0500
+
+ really nothing
+
+commit 01b307fd287962372bbf07461c88b54f41636817
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Dec 7 13:07:42 2011 -0500
+
+ added an example containing 'B'
+
+commit c678791f0451ceb9205c1ab5c52c84641863c99a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Dec 3 12:10:30 2011 -0500
+
+ 'B' now moves backward w.r.t. the query
+
+commit 152119bc06a073933ca830e8e1407538e44626cc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Dec 2 10:50:12 2011 -0500
+
+ better consensus; a little more robust
+
+commit 454da4754ac503edda5b1329b67757d797e46e07
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Dec 2 00:20:22 2011 -0500
+
+ in pileup call remove_B()
+
+commit ff2bcac1cc078ba1879f18c89cfae314439d7086
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Dec 2 00:17:32 2011 -0500
+
+ working on a few toy examples
+
+commit 745ca7260158d6df7897b52598033ffb055a9e4f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Dec 1 22:55:39 2011 -0500
+
+ bam_remove_B(); not tested
+
+commit 07e4cdc7300abfcc82e03105b4689f95cab551cd
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Nov 10 12:58:55 2011 -0500
+
+ baseQ threshold on plain pileup; removed -E
+
+commit 322ebf2082dfa91df44b3a996d26c85357e5d5a2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Oct 19 09:28:04 2011 -0400
+
+ fixed two gcc warnings
+
+commit a632457b4c4adc50d833b56b5a5231feafaf8193
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Oct 4 10:13:23 2011 -0400
+
+ change size_t to uint32_t in bam_header_t
+
+ This may cause issues on 64-bit big-endian machines. Reported and fixed by Paolo Emilio Mazzon.
+
+commit af31bf5a78aea03baf6eb90fe50076549d499f6e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Sep 26 20:17:57 2011 -0400
+
+ rename pad2unpad to depad
+
+commit 77b198b73dfad1048e5d1c5a64aa75ee7b90f596
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Sep 23 01:22:40 2011 -0400
+
+ convert padded BAM to unpadded BAM
+
+commit adb9e2342b7b7501d9527d3c23afab10469ae2c6
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Sep 7 11:40:50 2011 -0400
+
+ generate template cigar with "fixmate"
+
+commit 46e5ab445a0fe880216cbc0daf1225725b569d7a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Sep 2 12:50:18 2011 -0400
+
+ update kseq.h to the latest version
+
+commit 68e9e4a73eb91405bb3e56bf0cdaf12d1b487abb
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Sep 2 12:44:45 2011 -0400
+
+ Release samtools-0.1.18
+
+commit aa06bdadb2d109a79f927f478102f96a1f5fd258
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Sep 2 12:14:17 2011 -0400
+
+ updated the revision number
+
+commit 267e1e1b6e54c0ab24f94cd9aee9cbd2d1923f9f
+Merge: 19ff1d3 aebab30
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Sep 2 12:13:08 2011 -0400
+
+ Merge https://github.com/lh3/samtools into reduce
+
+ Conflicts:
+ bam_md.c
+
+ Fixed a few typos in the merge
+
+commit aebab302399c24eaa6c5ab79d13d6bd5e2e9ea9a
+Merge: c2c63d0 da62663
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Sep 2 09:03:49 2011 -0700
+
+ Merge pull request #4 from peterjc/x_equals2
+
+ Implement basic support for =/X CIGAR operations
+
+commit 19ff1d3d7f47d7e61b121292aefe5a74bb8a18d2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Aug 25 16:38:12 2011 -0400
+
+ reduce BAM size (experimental)
+
+commit da626630fd98fd4e07ceb4d58c5c9a42d312a85d
+Author: peterjc <p.j.a.cock at googlemail.com>
+Date: Mon Aug 22 06:58:08 2011 +0100
+
+ Support =/X CIGAR operations (treated like M)
+
+commit 461d8003529db77a4d5ecbd108312e868b051a3d
+Author: peterjc <p.j.a.cock at googlemail.com>
+Date: Mon Aug 22 05:52:56 2011 +0100
+
+ Define CIGAR equals and X operations (7 and 8)
+
+commit c2c63d067113baab41f3bc35fb28f4f00578accb
+Merge: 7ab3ef3 9a0ed9a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Aug 18 17:21:54 2011 -0700
+
+ Merge pull request #3 from peterjc/x_equals
+
+ Accept SAM files using = in CIGAR (treats X and = as M)
+
+commit 9a0ed9a6b85c7981465f459300208dbd93e3c6f5
+Author: peterjc <p.j.a.cock at googlemail.com>
+Date: Thu Aug 18 19:28:52 2011 +0100
+
+ Accept SAM files using = in CIGAR (treats X and = as M)
+
+commit 7ab3ef388c1eb34d7912fd70cc5656c955240263
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Aug 8 10:22:22 2011 -0400
+
+ bugfix: indexing takes huge memory
+
+ This happens when an unmapped mate has coordinate 1. Thank Joel Martin for the fix.
+
+commit a3f6738593e944354a8f75306687d8b3acf08bf1
+Merge: a8bdca9 bc67ea2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Aug 8 09:52:26 2011 -0400
+
+ Merge branch 'master' of github.com:lh3/samtools
+
+commit bc67ea225da653f36a70b38382d6111dd494f659
+Author: Petr Danecek <pd3 at sanger.ac.uk>
+Date: Thu Jul 28 20:03:16 2011 +0100
+
+ Variant Distance Bias
+
+commit deb578f0c49d0b7d8c3bc6be220b4d67e2e7dfdf
+Author: Petr Danecek <pd3 at sanger.ac.uk>
+Date: Tue Jul 26 09:57:37 2011 +0100
+
+ If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but use the tag instead.
+
+commit a8bdca9cf482a637b89ee4f98469a93e0ab5e69b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Jul 25 10:10:55 2011 -0400
+
+ bugfix: LRT2=nan
+
+commit 0afe33137d046a3e849eeb4a54590f27cbad4228
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jul 22 21:55:38 2011 -0400
+
+ fixed a bug/typo
+
+commit 62d5849658c10222d40308c6b53ab4f99a448494
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jul 15 16:04:19 2011 -0400
+
+ allow setting the seed in subsampling
+
+commit 5f46243824cc9435b167973e1d51e13128794ea1
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jul 15 15:54:47 2011 -0400
+
+ support subsampling
+
+commit 5e55b6f34fc86cba7cf98d52ccaed405c3ffabbc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jul 15 15:53:38 2011 -0400
+
+ support indels
+
+commit f31c162926d6f43e8b60171789a258d02e1f9be5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jul 7 17:02:33 2011 -0400
+
+ do not count indel with "view -Y"
+
+commit e412dae587883b4c17e5fbf4b7c33f38bfa8458a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jul 7 00:35:25 2011 -0400
+
+ for WIN32 compatibility
+
+commit 70a52501bcfa63824749893a5ab8ed3c38e34958
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jul 7 00:32:46 2011 -0400
+
+ for WIN32 compatibility
+
+commit 00438f14ed5984f08e8f7645a9b95644a812f969
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jul 6 23:41:45 2011 -0400
+
+ fixed an uninitialized variable
+
+commit 7609c4a01059c326544b3d0142dfe9c4229d68c6
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jul 6 23:39:31 2011 -0400
+
+ fixed an uninitialized variable
+
+commit cec7189a412f80ccb068a73bd28528915c16b0bf
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jul 6 22:53:19 2011 -0400
+
+ Release samtools-0.1.17
+
+commit 93c06a249de3bb666029bf07b66de5e8e5e314fa
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jul 6 09:46:09 2011 -0400
+
+ bugfix: incorrect idxstats for the last seq
+
+ Again, this bug is caused by 3rd-party code for the sorting order checking.
+
+commit 84f6ca62db6e27b8c4c711e7b5f3ca704bf27b4f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Jul 5 23:30:23 2011 -0400
+
+ output mapping quality in the old pileup format
+
+commit 362e05fd670886acaede69b864903d730b9db3ca
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Jul 5 21:59:22 2011 -0400
+
+ added a brief description of the VCF format
+
+commit e690a696468205e0cc4560016361c997660dd496
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Jul 5 16:23:10 2011 -0400
+
+ improved samtools manual page
+
+commit 362b4a1408ef3c32311d638aa8d85ce39c1c7b2d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Jul 5 15:58:29 2011 -0400
+
+ merge bcftools.1 to samtools.1
+
+commit 643e0e61ba7266efbc9e5bfcb8e41f369ba2ce0a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Jul 5 13:39:02 2011 -0400
+
+ mpileup: when region set, set reference properly
+
+commit 613e4d67624a94f62563935fbd5cc294df69605a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Jul 4 23:29:02 2011 -0400
+
+ compute the min PL diff
+
+commit 5b7d5d3f52b97ca42c8500eede808dab88a46a53
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Jul 4 22:57:48 2011 -0400
+
+ rename trio.c to mut.c
+
+commit 84fe96ad64b0365ead93a4115d1684b9bebb98fc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Jul 3 15:38:51 2011 -0400
+
+ added pair caller interface; not tested
+
+commit 2f2867b87b84c35319cc416d6173819d5c8a4e8c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Jul 3 15:24:23 2011 -0400
+
+ initial implementation of a pair caller
+
+commit e97653cf2ad653c95886933c42a2b5492ccab5ff
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Jul 3 00:06:28 2011 -0400
+
+ convert bam to single-end fastq
+
+commit e8013e11f7a8db0a8d18c60d130169cca39bf2bd
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Jul 2 14:39:18 2011 -0400
+
+ improve BED parsing
+
+commit 1025714325fdc636aeee47a76db8dafbbbfde64b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jul 1 14:19:54 2011 -0400
+
+ update the manual page
+
+commit 8022d0039dff47b1c11b2421357d510c1f28ae15
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jul 1 14:17:03 2011 -0400
+
+ output the best constrained genotypes in trio
+
+commit 18c87295e12f5bebafdcae00d52000fb94c8a566
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jul 1 11:18:14 2011 -0400
+
+ added documentations for view -T
+
+commit daf7a8d96bd495296bf7c7d99cddb808a3ced7d5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jun 30 22:45:20 2011 -0400
+
+ fixed a bug in writing SP
+
+commit e5c32bf9b28c6e3e861db88de56b5dbe11058b61
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jun 30 22:35:25 2011 -0400
+
+ optionally output read positions in mpileup
+
+commit 1008051155ec994c1901e18f3eb03ea32a62e5d7
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jun 30 22:17:25 2011 -0400
+
+ make faidx work with <2GB lines
+
+commit 2daebb63762425dd3074ddf71582ad189001e394
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jun 30 17:28:58 2011 -0400
+
+ fixed an issue in the trio caller and the indel caller
+
+commit 9fdd52cf0716fb342a94946433d564b28b230835
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jun 30 13:34:01 2011 -0400
+
+ Added trio caller; NOT tested yet
+
+commit ea22a8ed83625e9c82382b56acc42a2d9cfd17e5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jun 30 11:42:29 2011 -0400
+
+ convert PL to 10-likelihood GL
+
+commit 10d7065267b0d12c2bfcb6c70204fb6944cd395d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jun 30 10:49:05 2011 -0400
+
+ fix a compatibility issue with the new bcftools
+
+commit d340f01f609c61b719d38a6a55629a3fc899e1cd
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Jun 26 23:41:20 2011 -0400
+
+ allow to ignore RG
+
+commit d6321faf98ebfe899b9409fb23c90a4aa8c6b542
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Jun 5 23:05:21 2011 -0400
+
+ fixed a bug in SO checking due to a recent change
+
+commit bc995abf666d0c9ab4258f6c1b3518a45a89209f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jun 3 14:45:36 2011 -0400
+
+ update the version number
+
+commit 9e7cd83a08383858d008e0ccb2238a2b93831d6c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Jun 3 14:43:12 2011 -0400
+
+ smarter way to parse a region string
+
+commit e58a90a0fde54053dac65352b34c13c3fea815fc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jun 1 14:36:22 2011 -0400
+
+ output LRT2 instead of LRT1
+
+commit 08f78c9af3e5661f04f80bef424232de721dba03
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jun 1 14:02:28 2011 -0400
+
+ genotype test, but assuming 1-degree
+
+commit 587b852340d7e60f6f7cf474a92ef77aeab46018
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jun 1 12:55:19 2011 -0400
+
+ perform 2-degree test by default
+
+commit 3d38e403c5c830478b7eb157a484776997440501
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jun 1 12:44:34 2011 -0400
+
+ fixed a typo; but the result is still not good
+
+commit 06291624f7dcc57445676f3be25d0bc355dd7110
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jun 1 12:24:18 2011 -0400
+
+ fixed a typo
+
+commit 63b98aa33636b0d82a435bf49153c8c1502e7d42
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jun 1 12:23:37 2011 -0400
+
+ added HWE+F<0 filter
+
+commit 37d926e8999999b593d0637ab7dc379dbd3d6006
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed May 4 10:11:59 2011 -0400
+
+ improved sorting order checking in index
+
+ Patches from Jonathan Manning
+
+commit 1c2dc6762c5f7cd946046b53346513f2f9761dbf
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue May 3 23:09:05 2011 -0400
+
+ added r^2 estimate; added Brent's method
+
+commit c2d3bcd8f98e31668b5f1321222fbc6fd6336e75
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun May 1 23:45:23 2011 -0400
+
+ combine several utilities into vcfutils.lua
+
+commit be2e7362d7593ea4d03fb33cdb6af2aa096ca6c4
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 27 21:09:22 2011 -0400
+
+ minor warning
+
+commit 683ef0443860813d743cf84fa86dda9bfaf5445a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 27 10:10:38 2011 -0400
+
+ added versioning
+
+commit ed72f25ec85671f7646dbc92fa7b5b1dda427f7d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 27 10:04:02 2011 -0400
+
+ Output ML allele count
+
+commit 2a9e36d2d6c405b2411ca47458f028ada8fe1000
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Apr 26 16:14:20 2011 -0400
+
+ use ar -s
+
+commit 7a4f54e6dbcd7c94acbb3f1050a93f94b8a07949
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Apr 23 01:22:31 2011 -0400
+
+ added another type of LRT
+
+commit b9c5e84762a4aacce3a3771b51ea80967c79a2e5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 22 16:00:31 2011 -0400
+
+ added version
+
+commit 8fad6677c5952efd67391581d64e67e02e7f6e68
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 22 00:30:19 2011 -0400
+
+ remove the pileup command
+
+commit 3a962fb6ebf779de70f9e6effb2d8701a9aa3dd9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 21 23:10:45 2011 -0400
+
+ Release 0.1.16 (r963:234)
+
+commit b4d683cffbd98c43f05aff8610b37d63dd7e54aa
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 21 12:44:44 2011 -0400
+
+ fixed a bug when coordinate-less reads are on the reverse strand
+
+commit c5ec45a128f409debc6a56a798024f53004037dc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 20 11:36:52 2011 -0400
+
+ added option '-f' to merge to avoid overwriting
+
+commit 68d431531370d24907c01a27f166f2341d7c4d35
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 20 10:26:58 2011 -0400
+
+ do not print a warning
+
+commit 32922607e51ad2260c337eb022b9e4aedacb049f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 20 10:21:06 2011 -0400
+
+ Added ldpair to compute LD between requested pairs
+
+commit b8d6fa71b91678fa02338257e0707d1e5ca098dd
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Apr 17 21:51:43 2011 -0400
+
+ On a toy sample, type "B" seems to be accepted
+
+commit 0e7ee9a6bb4029184202aa6e6738105ba0c0510b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Apr 17 21:21:20 2011 -0400
+
+ added type "B"; not tested yet
+
+commit a513dfad0ac0062b03871eb6ecf26cb8d18dc895
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Apr 17 19:25:54 2011 -0400
+
+ fixed a bug in bedidx.c: input BED not sorted
+
+commit de1e192bb0a8a762a54a6eee81d882fab01c3d32
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Apr 17 18:51:08 2011 -0400
+
+ by default, always perform posterior chi^2
+
+commit df6e0d1099895fc6cd7a19dc89fba95ed6654d35
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Apr 16 12:33:28 2011 -0400
+
+ added debugging
+
+commit 8ce52e024dc2ef361dbd5399c232163055057e70
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Apr 16 00:59:05 2011 -0400
+
+ avoid a segfault given wrong input
+
+commit e66b6684fc9a397f91ec29fdeecae9f8eb986a55
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 15 19:55:39 2011 -0400
+
+ do not segfault when there is no PL
+
+commit 9ce3c584ec0cebfa45576f2ef538df4dad2b7e55
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 15 11:59:55 2011 -0400
+
+ remove another unused part
+
+commit f53a051d68bf312ac8d5865210fae7a9808c0fb9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 15 10:41:25 2011 -0400
+
+ print G3 if HWE is small
+
+commit 4b2c08bb86ca4ed4959e4cb77a28f7d6fc19f5c9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 15 10:04:34 2011 -0400
+
+ fixed a bug
+
+ actually not fix, but hide it
+
+commit 088e13c32453fb533b7bb1c65a573f9b90a23625
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 15 09:48:47 2011 -0400
+
+ added LRT based permutation; not used though
+
+commit 1e3c2001afcb80b5eaa4c3f88df9da7b01b62524
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 15 09:28:55 2011 -0400
+
+ Perform posterior contrast for small LRT
+
+ Posterior contrast is much slower than LRT. Nonetheless, posterior P-value is
+ more robust to sequencing artifacts. Thus we may combine the two to achieve a
+ balance between speed and low FPR.
+
+commit 6f1b066270902198a7175ff6c1b05ebc8d1919be
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 15 01:36:06 2011 -0400
+
+ Added Brent's method
+
+commit 3d061e5db25b67b25f6ff87afe4162e121354232
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 23:30:10 2011 -0400
+
+ fixed a typo in printing
+
+commit 7fd14ceb5990bb350b8e97346ef3537d80058def
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 23:14:23 2011 -0400
+
+ fixed a stupid bug
+
+commit f5b2c3459ec098b3cafd9619b9077132516baf58
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 22:42:35 2011 -0400
+
+ separate EM and posterior
+
+ Now, contrast is not performed unless -C is in use. EM can be invoked
+ independently with -e without computing the posterior.
+
+commit 9eefcac963697fae554789b11ae3cb2c23f224d0
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 22:00:19 2011 -0400
+
+ further code cleanup; prepare to add EM interface
+
+commit c2cce52355262743711e4742b0c8542bfcab1cdd
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 21:44:03 2011 -0400
+
+ drop EM from prob1
+
+commit 24016f04bd3bdffb7eeb50cb25854f5007feb70f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 21:08:33 2011 -0400
+
+ drop posterior LRT; prepare for clean up
+
+commit 3670d8bd88c3eb22873f0a80e2a5913f64ca8c9a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 20:57:43 2011 -0400
+
+ better initial values for LD
+
+commit d48a8873c060b18b57799cfe3a0e5496ba069457
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Apr 14 20:36:25 2011 -0400
+
+ finished EM
+
+commit b101f2db476188a950c23f5c1b6185fdb7f8f40b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 13 01:19:04 2011 -0400
+
+ genotype frequency estimate
+
+commit d79bdcbf6242ecfb8accba9ac9a22fbcbd543cf2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 13 00:37:22 2011 -0400
+
+ prepare for code clean up
+
+commit e0ce416abfc094f0c090957080b1404fd0edf752
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 13 00:34:15 2011 -0400
+
+ rename ld.c to em.c
+
+commit 45ede3ad181f35c1be24bed5d75841e472357ab7
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Apr 13 00:22:10 2011 -0400
+
+ implemented EM likelihood ratio test
+
+ The idea is learned from a brief chat with Rasmus Nielsen.
+
+commit 0454a346b60e42b75a2f742272089810279c7131
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Apr 12 15:45:52 2011 -0400
+
+ added likelihood-ratio test (idea from Nick)
+
+commit f6287c8646c690440a1554c8958e7268f4134dc2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Apr 10 18:24:37 2011 -0400
+
+ Release samtools-0.1.15 (r949:203)
+
+commit de6023f38f4d652438557cf7a0ac6eec324e7416
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Apr 10 15:54:58 2011 -0400
+
+ improved help information
+
+commit d3b337f2b7eda1e6f8f5575a19d1b5ed55cae279
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Apr 9 16:28:01 2011 -0400
+
+ fixed a minor issue
+
+commit 82f6e4f49247e75fbd8ec08c285b8d3047b3d235
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Apr 9 15:49:04 2011 -0400
+
+ separate QC-pass and QC-fail reads
+
+commit 8362b4a255081ee7ca0a4ca2eabc8c76758b6863
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 8 17:45:19 2011 -0400
+
+ added verbose level
+
+commit f7bf419c290462be7d289249a4a6d28f825b4c93
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 8 16:08:14 2011 -0400
+
+ fixed a bug
+
+commit 890cbb1ac93b3004fb6cf42ff47195077dcfc8ad
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 8 16:00:37 2011 -0400
+
+ drop unrelated @RG when "-R" is in use
+
+commit a62dc929c950fb51311b705f5b5bfba8e3f704d7
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 8 16:00:14 2011 -0400
+
+ skip header validation
+
+commit 39da810e2c56c8f0eff1ab726600b41f26d3d8e9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Apr 5 23:52:22 2011 -0400
+
+ change error message
+
+commit c0c50a34df250ef8a7a29b172058cd229be582b5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Apr 5 23:50:46 2011 -0400
+
+ fixed a bug caused by recent modifications
+
+commit 25226e8c468404cb5e1b5272efcea57e4193c762
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Apr 5 13:31:19 2011 -0400
+
+ reduce the indel filtering window
+
+commit 5e18d7014437734f9dac9ab45a95e43ec2526101
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Apr 4 13:56:20 2011 -0400
+
+ only output hwe if it is small enough
+
+commit 614941fb7dd276de662e7820eb8c7bae871a18cc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Apr 4 13:34:02 2011 -0400
+
+ added HWE back
+
+commit 7abe8825aa0bacccdeb38125934ae94d18f0ad4d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Apr 4 12:46:24 2011 -0400
+
+ EM estimate of genotype frequency
+
+commit 2bfeff9c645d177416664f1cb811e85cac3ff9e3
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Apr 4 11:29:12 2011 -0400
+
+ minor
+
+commit 401e40647e7e3abbac6e4ec3d8bb68eb6f2d401b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Apr 4 11:24:04 2011 -0400
+
+ Added genotype freq estimate and association test
+
+commit 6cc226df6e3b480f1bd6e763ce8ef47f785bbb74
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Apr 3 20:57:23 2011 -0400
+
+ minor changes
+
+commit 7e47a39630e812f09b80369f14606245976f687e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 1 15:21:59 2011 -0400
+
+ print the grayscale
+
+commit 2f675d9c0dde3c166c99e335fa17c7873a5ae8d5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 1 08:55:16 2011 -0400
+
+ change to comment
+
+commit 0592bb514994544ed84f51e509b233cf8821e0cf
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Apr 1 08:54:35 2011 -0400
+
+ added base quality filtering
+
+commit fc1b47e04a7b94f6362c45856cbeb89d9d0b5ca5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 31 23:31:14 2011 -0400
+
+ fixed a few typos in comments
+
+commit 60be79bc8f0d24656e5e8a329af7e9b5b91d4c8b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 31 23:13:23 2011 -0400
+
+ comments
+
+commit 2432864acc25ebe5cee4217dbb0120439077a7f8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 31 22:42:46 2011 -0400
+
+ added bam2depth.c, a demo program
+
+commit 39625f7c6bea9ccbfd9af0feb22348d52079f012
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 31 16:37:22 2011 -0400
+
+ added bgzf_check_bgzf() (used by tabix)
+
+commit 6de6bd3fb67fd22753a5f07d4cc25bf94e1b5a8c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 31 16:37:08 2011 -0400
+
+ fixed a bug in bedidx.c
+
+commit 3b9e257d25b2e81eed1625bc5d2882ed486ef20e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 30 13:27:15 2011 -0400
+
+ added bed support to bcftools
+
+commit 47bcce3d14ec4d205283b61e5e653803996c42e0
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 30 12:56:40 2011 -0400
+
+ Added BED support to "samtools view"
+
+commit a812386017faedfc86c0e6562adbb2138329cfeb
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 30 12:47:04 2011 -0400
+
+ support BED file
+
+commit 3052dddc929f1825e6e7f7f6f6724d9465d6cf9a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Mar 28 15:51:55 2011 -0400
+
+ relax RG matching; proper mismatching message
+
+commit f86d60c8fe25785523f01fae1486d2a6df4ee6ef
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Mar 26 10:38:23 2011 -0400
+
+ Avoid reporting association when something unexpected, which I do not understand, happens.
+
+commit dd41e6b26fd9fe30218748b9a0a1f49bdb1862b9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Mar 26 10:38:01 2011 -0400
+
+ Added -1 to merge
+
+commit 4a0364b0d7f87f1c88d71ec5857a1f1d40710681
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 23 16:56:55 2011 -0400
+
+ plot pairwise r^2
+
+commit 452629a711582e612bec22b3b082e234bd37039b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 23 14:31:01 2011 -0400
+
+ pairwise LD; case-control AF2
+
+commit 52862951adcaecde26ba8f0d9c1897944640a674
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Mar 21 23:03:14 2011 -0400
+
+ Release samtools-0.1.14 (r933:170)
+
+commit 59a5a8ba8e2940f0e38238f9339f02c91a8a0ce4
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Mar 21 13:52:55 2011 -0400
+
+ optionally skip loci with too low sample coverage
+
+commit 6434264b5c69514d4fafe62cbd30b3bbaddc1d41
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Mar 19 14:38:25 2011 -0400
+
+ mpileup support Illumina1.3+ quality; skip non-variant sites when "view -v" is in use
+
+commit 5f59e01987e1d5eca7d6359cae64a9734b18beea
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 18 17:19:18 2011 -0400
+
+ update version to r933:167
+
+commit 4d2c3c950910aa3d2c87760c3532e458fe01c0fa
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 18 16:25:01 2011 -0400
+
+ added "-1" to the command-line help
+
+commit 55313a015a7bd6369cf5a66fed7fab2333201dc9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 18 16:22:12 2011 -0400
+
+ added the "cat" command (by Chris Saunders)
+
+commit b670272cadf3efa4dc456ac4c76104f73477d60d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 18 15:59:46 2011 -0400
+
+ support varying the compression level
+
+commit c5dd3c9ca5f75f880e52c8cd2beae983bcb8d3b1
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 16 14:33:45 2011 -0400
+
+ update the manual pages
+
+commit 12fb4b596dc51bccd154fc4bd0593442f7937a46
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 16 12:49:26 2011 -0400
+
+ update changelog
+
+commit e7fe4fd66e02d60a1ca7952ad1938809e77729a9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 16 12:10:05 2011 -0400
+
+ do not call indels when the depth is very high
+
+commit 7455eeaa32b949bb3856f75810890aabf7cacb18
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 16 11:56:56 2011 -0400
+
+ code clean up
+
+commit 5f16679e54ced8e67a75d949f9175c50480b914e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Mar 15 14:45:24 2011 -0400
+
+ when -s is specified, change the sample order
+
+commit 7ba95adee09d3b06a7eaf797d25efef837e592f5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Mar 15 14:11:42 2011 -0400
+
+ compute the rank in permutation
+
+commit d219783cea7643fc7e10e1bd3a98e9b3165b4506
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Mar 13 21:35:13 2011 -0400
+
+ I have found a SERIOUS BUG!!!
+
+commit 8e20d04ecdac1a7788eef71c4bb91b8479cf7150
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Mar 13 17:04:04 2011 -0400
+
+ optionally shuffle samples in a BCF (debugging)
+
+commit fc7b261f181f2a411427bc9ee5d586c883ca9cdc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 11 09:34:20 2011 -0500
+
+ fixed a bug
+
+commit b3bbcc3d40994ae85705ab6fef9866ec8c142201
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 10 20:25:59 2011 -0500
+
+ use mode instead of mean
+
+commit f1161262d137098a19143b5cb0de810e5db3243e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 10 20:09:16 2011 -0500
+
+ start from the mean instead of the mode
+
+commit 2ba56f5e99e90674855c4ffc8bf583340b932e1e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 10 17:13:34 2011 -0500
+
+ fixed an error in Chi^2 test
+
+commit b4ce7ae400290bc43dd287240479667f99b3b11e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Mar 10 00:23:39 2011 -0500
+
+ minor
+
+commit 8487fa5d3a73a43443964e731ea2a4c873c9d4e5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 21:33:19 2011 -0500
+
+ added -F to accept BCFs generated by old samtools
+
+commit fd51d2093f7fd775a7eaaeea57fa34716ab59ac2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 17:39:09 2011 -0500
+
+ update version
+
+commit b6da54335df943015a998a934075331b467abb5b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 17:37:14 2011 -0500
+
+ compute pseudo-chi2 probability
+
+commit 9f73cefdb8935421d872b989dd98fbc8e1295029
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 15:54:04 2011 -0500
+
+ remove a comment which is wrong
+
+commit b10b1e47ece522e97ab8ef23417bcb6454f8b9db
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 15:51:12 2011 -0500
+
+ clean up
+
+commit 353bfae2c6ff59205bd9223db04084cf7f507f01
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 15:45:29 2011 -0500
+
+ for backup
+
+commit 53915d1c6410c2537d18bfa8eb8c657a2233c35e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 15:27:56 2011 -0500
+
+ having debugging code
+
+commit 0d0dbf66995b1511390d593981eae7b5d36fe17b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Mar 9 14:58:23 2011 -0500
+
+ temporary backup
+
+commit 5b74a174a8b637dee43b7f30250df6fb96580e12
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Mar 8 15:46:11 2011 -0500
+
+ the output makes sense, but there may be a typo...
+
+commit d81ec654b6c0c1eef6b0625d96f14b3155cee7c6
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Mar 8 15:19:09 2011 -0500
+
+ added contrast2(); fixed a bug in haploid mode
+
+commit 0cfd896fad5f7737cca49efa94a11892dafcd812
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Mar 7 21:40:17 2011 -0500
+
+ fixed a bug in haploid genotyping
+
+commit ccd52155ef61273f2b42ad9c7b31ff1915f81b24
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Mar 5 18:10:35 2011 -0500
+
+ fixed a few bugs; still not fully working
+
+commit edc3af753f96f831968ae32f2e0f915b74f31e6e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 4 17:31:33 2011 -0500
+
+ drop HWE calculation
+
+commit 92dac194debb66ca0718c21c871822dda2dd5bc1
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 4 17:28:35 2011 -0500
+
+ implemented hap/diploid mode; probably BUGGY!
+
+commit 7f26804bc27937e36fdc967e5c76514653ea40f5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 4 16:01:27 2011 -0500
+
+ read ploidy
+
+commit e7b7213475b5e61a69aab77ffb02b4983c8e7678
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 4 14:12:14 2011 -0500
+
+ added math notes
+
+commit 46023e2f21321da83fc8e83e9229757a4e821acb
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Mar 4 13:34:10 2011 -0500
+
+ update BCF spec
+
+commit 13190c49eeb006ad7013b7f1e9fc1b3beca3ae78
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Mar 1 14:45:19 2011 -0500
+
+ Release samtools-0.1.13 (r926:134)
+
+commit be8fabbb6001d9fd5263a70a3e21ed6dfe5a9837
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Mar 1 14:07:15 2011 -0500
+
+ prepare to finalize 0.1.13
+
+commit 1e8c753660978bed7e9289fe50becd596d9314bb
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Mar 1 09:40:17 2011 -0500
+
+ allow to change whether to drop ambiguous reads
+
+commit 412210bfdb46606023f2e4b9086f2787f0cf1c62
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 22:01:29 2011 -0500
+
+ revert to the old behavior of phase
+
+commit 46035589518cf84738de8666b866e2619457c1fb
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 16:46:23 2011 -0500
+
+ change version number
+
+commit 7f40c33e37fc16fcb0a375ce46ae1d09cafb6d50
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 16:37:42 2011 -0500
+
+ bugfix in indel calling: integer overflow
+
+commit 75849470efbe30042e5ddd516f9bcbe3b9bf6062
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 15:35:47 2011 -0500
+
+ fixed a typo
+
+commit 9e6fb569885f906fabaab7fc2f02eae82f4bd602
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 15:34:09 2011 -0500
+
+ minor changes to heuristic rules
+
+commit 30a799a91f5e2c10b761aa5437f902c6649fceb3
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 15:20:26 2011 -0500
+
+ fixed a bug in the latest change
+
+commit e21ba9df950ea37f5c1b35c2af9ba9a4e0bba02a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 12:47:06 2011 -0500
+
+ put version in bam.h
+
+commit 918b14780c1dceb39c7010638ecd61c626e17166
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 12:00:38 2011 -0500
+
+ frag_t::phased==0 reads are dumped to chimera.bam
+
+commit 657293c7bdba3ac69f53cd1ffa2874ed8756475e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 11:05:29 2011 -0500
+
+ change default -q to 37 (previously 40)
+
+commit 33d8d3bea76e466798ea322d68d34deb8d2dff06
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 28 10:39:57 2011 -0500
+
+ fixed a minor bug in BAM reading
+
+commit daa25d426d42465d76c7317c95772bbb36bb3f47
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Feb 26 21:07:24 2011 -0500
+
+ suppress gzopen64() warning
+
+commit 9cec4256eb9e7848d4711adb67b540659c141e32
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 22:14:52 2011 -0500
+
+ fixed a long existing bug in vcf2fq
+
+commit 304487c83067a733add71cbc3886fa8c49f7ef2a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 16:37:40 2011 -0500
+
+ change version number
+
+commit 10ba6bf4f16692760f696f7b17f3719065786f77
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 16:34:08 2011 -0500
+
+ Change the order of PL; change SP to int32_t
+
+commit c5cc2a8036a9c3579fbfde651efec4f6763b0228
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 14:52:03 2011 -0500
+
+ claim X defined in the header
+
+commit 4ee8cb29f6092fd14a89f0cc5d3575112a204f39
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 14:40:24 2011 -0500
+
+ minor changes
+
+commit 00065e9336a2831dc53bee7da2f4719845be1a2a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 11:39:06 2011 -0500
+
+ fixed an error in the BCF spec
+
+commit 1e2a73afcb72a02aa448718cb017c0438de89f90
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 11:36:40 2011 -0500
+
+ update BCF spec
+
+commit dbf8eedaa38a405cb2fba5b3952b85776f51d035
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 11:28:43 2011 -0500
+
+ update BCF spec
+
+commit eed1d91af9fad3c9d965333a55e623757f9c4e9d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 09:51:39 2011 -0500
+
+ fixed a flaw in targetcut
+
+commit 59bc980bb832b92a8b0cc244cf106e6150e4db6f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 00:54:35 2011 -0500
+
+ update manual page
+
+commit fcc4738c4abdca79e3de159e21208df1b98ac76c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 00:45:39 2011 -0500
+
+ update version format
+
+commit 5748639ae542b7f6b853562edc2bb3faf43030e4
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 00:45:12 2011 -0500
+
+ update version number
+
+commit 06b44cc366cf27ce8976ee6a05810a0b3c48b56d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 00:44:21 2011 -0500
+
+ update version number
+
+commit ab7f4529d12739ff66fd4c09af9d992ab59c53ef
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 25 00:42:55 2011 -0500
+
+ various help message
+
+commit a092e1f6f963272f8bb23616986ddaf604fd0f82
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 24 23:43:13 2011 -0500
+
+ disable unfinished functionality
+
+commit f00a78db72b14ee4c6689fc13f20ed31aeaecd40
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 24 10:04:56 2011 -0500
+
+ added "const" to bcf_p1_cal()
+
+commit 91049c4a8db3bf50dcc9d07506f22fa4ca5b5a96
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 23 11:53:47 2011 -0500
+
+ randomly allocate unphased reads
+
+commit f4405354a8d4cb3441141fa734573031059d7f57
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 22 15:36:07 2011 -0500
+
+ fixed a typo
+
+commit 3075e4dc5c7c9d954426aabda6a73fa788357100
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 22 15:33:40 2011 -0500
+
+ make output more informative
+
+commit 628cf3235e2815a40acf089fb1d3357be6437787
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 22 14:50:06 2011 -0500
+
+ change the scoring rule; change default k to 13
+
+commit f22fd99831e4b5c74f898719216f359dbe987bbf
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 22 14:45:15 2011 -0500
+
+ update scoring in masking
+
+commit 2f23547b81984555032aa0eefd064b8e07986fdc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 22 14:37:17 2011 -0500
+
+ remove dropreg()
+
+commit 4d8b6b1f1f331ca9041983c66e34a857c3b8f1bb
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 22 13:10:16 2011 -0500
+
+ accept files from stdin
+
+commit 9b50c5038e6fc0185e29ca5b50fe0806a9a939b9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 22 11:16:57 2011 -0500
+
+ fixed a bug in consensus generation
+
+commit 1332ab32fb788fdc81b2ba8653b905d106238fad
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 22:53:23 2011 -0500
+
+ print dropped fragments
+
+commit a288761b4ca1584e51076a71cbc4d72fe923dda1
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 22:37:04 2011 -0500
+
+ bugfix: singletons are not phased
+
+commit 683365f534c0223dea7d72532015ac16a45ba22b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 17:27:10 2011 -0500
+
+ output singleton blocks
+
+commit 841a4609084d81f1bc81e0b00dd806002461e7d9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 15:58:55 2011 -0500
+
+ fixed a bug; not working with -l right now
+
+commit fdd57ea31732b5516dc212d72174b60206952636
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 15:17:00 2011 -0500
+
+ skip mapQ==0 reads
+
+commit 4eb6ba75c23c1c9be5f76814fa1b93a2e304b2af
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 14:03:03 2011 -0500
+
+ print the "targetcut" command
+
+commit 0123d9559ba58b026c0dfd15bc26019a193cd21a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 11:22:13 2011 -0500
+
+ allow to set the maximum depth
+
+commit 0f92eb248a4d06645b2c3d736a0faea8a7a9f731
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 09:56:41 2011 -0500
+
+ use a proper error model to call hets
+
+commit 587a01504af5aea6288740d121dccf48fb8a75f4
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 21 09:16:38 2011 -0500
+
+ phase is UNFINISHED; strip RG when merging
+
+commit 723bf3cd79e4f4a558373d4c707fa6b3db0fb357
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Feb 19 23:38:11 2011 -0500
+
+ use a proper model to compute consensus
+
+commit 891a6b02d4a9af2ed98fbaac4915bf1f0da4f6c8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Feb 19 22:14:19 2011 -0500
+
+ added comment
+
+commit 8b55e0a581ecc9e4ba754d1f3c8784f3038b6e48
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 18 17:23:39 2011 -0500
+
+ change the output format
+
+commit 75c36e8c563eddd0a362ba3b38cf0aea21aafb1f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 15 20:31:00 2011 -0500
+
+ fixed a bug in writing BAM
+
+commit bb0ce52f066cfebaa35a125d57b353bb717a5165
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 23:39:09 2011 -0500
+
+ skip uncovered; unknown alleles taken as X
+
+commit ba67f4d119c7d06907db3015d337d9a01a3fc9fe
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 23:21:19 2011 -0500
+
+ fixed a bug
+
+commit e4448d49e6129a5e1ee9c7f04f43612f12d6aad6
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 22:43:09 2011 -0500
+
+ prepare to read hets from a list; unfinished
+
+commit 129ea29c1f12177c0a7c3e21676f6210370fc59b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 16:32:22 2011 -0500
+
+ updated khash.h to 0.2.5
+
+commit 15b44ed93bd949dffcf79ac8dbea6d9b7dfcb58c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 16:15:04 2011 -0500
+
+ use the latest version of khash
+
+commit 486c05f06f44d981dfb2069bcb43e4b35fd8389c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 15:04:40 2011 -0500
+
+ change the default -k to 11
+
+commit 07cf9d1e443d73cf053de38dd01671e3781f6e29
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 14:50:51 2011 -0500
+
+ sort fragments by vpos instead of by beg
+
+commit d0d3e7faabf5cbb7e5ff7b294f7e220da807c4c0
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 14:45:41 2011 -0500
+
+ shuffling the two haplotypes for better randomness
+
+commit 3be28eaf5f6033229aedf12ddb11a0084ba01cd8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 14:09:17 2011 -0500
+
+ write chimeras to a separate BAM
+
+commit 80ccbc26f43918fe42be123cc1da9d3d7ce30816
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 13:54:13 2011 -0500
+
+ no mem leak/violation on small files; correctness is not checked
+
+commit 5c923867432fa14c26a19e3782e7f48d4080f6ac
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 14 13:50:25 2011 -0500
+
+ bam separation; at least not immediate segfault
+
+commit cea2643ec30a59735bf89b2f562b563bf7263e79
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Feb 13 23:24:11 2011 -0500
+
+ on the way to implement BAM separation; unfinished
+
+commit 964269cd15036a470ca89e43d0952201a0825671
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Feb 13 18:07:56 2011 -0500
+
+ keep singletons in the hash table
+
+commit 2d4aa649bd670d5e038a1acaefd33c5fe24ae0e8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Feb 13 17:42:24 2011 -0500
+
+ Revert "prepare to add bam separation"
+
+ This reverts commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f.
+
+commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Feb 13 00:24:28 2011 -0500
+
+ prepare to add bam separation
+
+commit d211e652d93791d2e112d334added243ffe5fc3e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Feb 12 18:50:20 2011 -0500
+
+ accelerate kstrtok
+
+commit 2d6af49d331ff5afe7b9e9b102e79d7d4512fdbe
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 11 21:08:21 2011 -0500
+
+ split unlinked blocks
+
+commit 68e4cd1b560b0a6fd4c77e5e51eadde9fda26ea4
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 11 10:47:58 2011 -0500
+
+ remove leading and trailing ambiguous positions
+
+commit d2b685141426a902ae76660c1fbe8020da150cf8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 11 10:02:21 2011 -0500
+
+ code clean up for further features
+
+commit c6980e062d55928b59f287c03e599dd5a37ed509
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 11 08:00:08 2011 -0500
+
+ change /64 to >>6; the latter is faster
+
+commit 91635b9c2687f24d72ee6a8aad2050a79bb8400f
+Merge: 41d4df2 9a7e155
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 11 01:22:55 2011 -0500
+
+ Merge branch 'master' into devel
+
+commit 9a7e155cc591c1b6c9f7f9cb939364a6becb65b2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 11 01:21:07 2011 -0500
+
+ output an unrecognized field as '.'; autofix GL/PL
+
+commit 41d4df2e9545e9abe97151cfe5d6c763f3d00db1
+Merge: c00c41c aacce0c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 23:00:14 2011 -0500
+
+ Merge branch 'master' into devel
+
+commit aacce0ce7276f451e4fddf81832f9e5f7f65198b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 22:57:53 2011 -0500
+
+ finished VCF->BCF conversion
+
+commit 0e875df643e41d848b709e2fa877de8ae53cdd4c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 21:57:28 2011 -0500
+
+ fixed a bug in reading VCF files
+
+commit c00c41c2a5da69cccea64adb542a0b365e56b4fc
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 16:28:37 2011 -0500
+
+ suppress one-allele blocks
+
+commit 2e2354b673722e2f00d72970a043f80a66270da1
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 16:06:56 2011 -0500
+
+ fixed the bug in filtering
+
+commit d971e1fe24de4ecaf94055efffc5f641e2bdb563
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 12:24:23 2011 -0500
+
+ prepare to add filtering; buggy right now
+
+commit a0a5a3fbf504c3b02f7b9212e72315c1047cc249
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 11:55:02 2011 -0500
+
+ make masking optional
+
+commit 28db71ccd95054a5f8a47c2332794f8968f6a822
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 10 11:40:47 2011 -0500
+
+ routine to mask poorly called regions
+
+commit a3f6c439262bc10a4067860440f4d4dde9e0c515
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 17:18:33 2011 -0500
+
+ code clean up: remove globals
+
+commit 0b711978492f6ad39d459d78723c299468906818
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 16:52:54 2011 -0500
+
+ output more information
+
+commit f69d217ae5b691bf42ad07a97f29a7cc6456046f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 16:11:54 2011 -0500
+
+ fixed another bug in flipping
+
+commit d47882d549337fbcc251597508a2c7faf1bb92e2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 16:01:35 2011 -0500
+
+ fixed a stupid bug in flipping
+
+commit e33f89de499496537f5fbde396a66557f0353f1b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 15:54:42 2011 -0500
+
+ fix chimeras; a little weird...
+
+commit 03d3c1d0b945245108ce0942d4772536a32212c7
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 13:27:35 2011 -0500
+
+ no effective change; prepare to fix chimera
+
+commit 6bc0a4676dd2252085a6e67bb06daa5ae05a554f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 11:52:58 2011 -0500
+
+ better count output
+
+commit dcac515439d25f71125d6de8111da417776ab9ce
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 9 10:31:07 2011 -0500
+
+ prepare for another way of filtering
+
+commit ca7e4f1899b86d2e077994c789e8f69d699b3cd9
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 8 16:10:08 2011 -0500
+
+ fixed the bug; I can do better.
+
+commit 0733f77b98af121bdcb198cea6151d159831bb9c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 8 15:55:38 2011 -0500
+
+ fixed two bugs; still not working...
+
+commit 80f18cba9ba73c9592380fc1ecd53c351d294782
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 8 15:42:58 2011 -0500
+
+ filter false SNPs; NOT working right now
+
+commit 69a66e2f96d5b102cd712ff1527a3802fa84c590
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 8 14:39:09 2011 -0500
+
+ write sequence in the SAM format for debugging
+
+commit b6f1c9d160822af2b713be206f37bd6dde00546a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 7 11:51:21 2011 -0500
+
+ fixed two bugs
+
+commit 400aa5c06100af9c47cd5e4ce8b95b7deb84f54b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 7 11:22:38 2011 -0500
+
+ Optionally apply BAQ
+
+commit 4c82e0e19682e424f5cdb8381364114c307b329e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Feb 7 01:23:31 2011 -0500
+
+ improved output; the result makes sense at a glance
+
+commit dc7853a581ab24bcc496e96b123ccf637e32ed1d
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Feb 6 14:12:43 2011 -0500
+
+ process per linked block instead of per chr
+
+commit e867d9c6c2e61d9e748e78163e5481dca5697a36
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sun Feb 6 00:45:46 2011 -0500
+
+ DP seems to work on toy examples
+
+commit 445ad72fc43d4354d56f5f759790e8ae0be73d02
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Feb 5 01:24:42 2011 -0500
+
+ implemented backtrack; not tested
+
+commit ba38e180b9cd545956583b22e97e09b4bb12073e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 4 23:55:23 2011 -0500
+
+ More "correct" DP; backtrack not implemented
+
+commit d69761fd9351273ccd37ea431b10509add91e7cf
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 4 17:22:31 2011 -0500
+
+ scratch of dynamic programming; unfinished...
+
+commit 769ffcb44e26e59300791658801d321559b33858
+Author: Heng Li <lh3 at live.co.uk>
+Date: Fri Feb 4 16:29:55 2011 -0500
+
+ UNFINISHED commit.
+
+commit 9adab9591317c3467f3d8cdf2d19ec1f65d1b5b7
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 3 16:20:59 2011 -0500
+
+ another way of counting; can be even faster
+
+commit bbafbdc01ed1ceaab44927def1ad47c4c78aeb9c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Feb 3 14:48:20 2011 -0500
+
+ for backup
+
+commit eba7446389cad62a19133bced1386a4334dcab79
+Merge: a44a98e f01a593
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 2 14:06:07 2011 -0500
+
+ Merge branch 'master' into devel
+
+commit f01a5930445b5fda7e6b5b813ed63c652160ada2
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 2 11:31:54 2011 -0500
+
+ Better truncation warning when EOF is absent
+
+commit dd3ee5ed26c8bbef4a62fa5b2bfb0a75833f2c31
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 2 10:38:28 2011 -0500
+
+ fixed a typo in BCF/VCF headers
+
+commit b9d1137c55f401387113d1ad8a387489afe741db
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Feb 2 09:13:44 2011 -0500
+
+ fixed an out-of-boundary bug (fixed by Roel Kluin)
+
+commit a44a98e16559b9672e8a3492c8f8c640074b7ee2
+Merge: ef68a14 d0443d5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 1 21:54:48 2011 -0500
+
+ Merge branch 'master' into devel
+
+commit d0443d5c2f648e0f69bd4c56eaac7868e501c18b
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 1 17:31:52 2011 -0500
+
+ improved sorting order checking
+
+commit ef68a14fab91399b2ecd38345936c3d6e7391cf3
+Merge: 1e597b3 1a39a2e
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 1 15:12:37 2011 -0500
+
+ Merge branch 'master' into devel
+
+commit 1a39a2eb08a270e20a34a0983e8bed6ffb3e2008
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 1 15:12:14 2011 -0500
+
+ more precise error message
+
+commit e028e7a47c02232e06a9dd3009262c00dede1060
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 1 14:48:01 2011 -0500
+
+ improved sorting order validation in index
+
+commit 1e597b3356744e2b791b12c9187f91c8054511d5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Tue Feb 1 14:44:27 2011 -0500
+
+ testing only; not working
+
+commit 5753ace1e54228822d8ee95f69943f586e42f6e8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Jan 31 17:37:08 2011 -0500
+
+ reduce the effect of seq errors at the cost of SN
+
+commit 6f239ce5e0abd47babee33174476d48b723260d8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Jan 31 17:29:34 2011 -0500
+
+ added testing code
+
+commit 3db42fe22d27d61ab5735cd2308f73d93def8ebe
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Jan 31 14:33:21 2011 -0500
+
+ routine for phasing fosmid resequencing (incomplete)
+
+commit ed88f2797323229ae8f38fbcd107b231007956a8
+Author: Heng Li <lh3 at live.co.uk>
+Date: Mon Jan 31 10:12:53 2011 -0500
+
+ SAM output
+
+commit abc6acae28dc4794f6422255f077cf370d34e414
+Merge: f1985a9 b133dbf
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Jan 29 22:56:10 2011 -0500
+
+ Merge branch 'master' into devel
+
+commit b133dbf82de4e8cea5eb56e5bbf0c4b3e9368fd5
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Jan 29 22:37:11 2011 -0500
+
+ fixed a bug in tview on big-endian by Nathan Weeks
+
+commit 9d3fdaef29f91e21dbfcb9ff0165b9573e7c1042
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Jan 29 22:24:00 2011 -0500
+
+ update INSTALL
+
+commit 9d074a38bde53961f96157b6fb3683b6dded38d7
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Jan 29 21:56:25 2011 -0500
+
+ avoid a segfault when network connect fails
+
+commit f1985a93f7455b3ea1b0ef9b959d50b896ccd620
+Author: Heng Li <lh3 at live.co.uk>
+Date: Sat Jan 29 21:53:18 2011 -0500
+
+ fixed a bug about bit ordering
+
+commit d09797db6fef648a6823cbe718d67664660c6ebe
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jan 27 16:53:19 2011 -0500
+
+ point out there are 4 or fewer free parameters
+
+commit 5fd1717650ed68ab6c55d094d1648c16a054891a
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jan 27 16:09:18 2011 -0500
+
+ updated .gitignore
+
+commit fccb19fbe8f9de91f59d85bb49a248683dc6266c
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jan 27 16:08:14 2011 -0500
+
+ fixed a bug; better scoring
+
+commit b4dcb844bde3d09eedcd9f6832186ece60ae5afd
+Merge: ffc3e89 6f502de
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jan 27 14:50:30 2011 -0500
+
+ Merge branch 'master' into devel
+
+commit 6f502dec46b18dae4bb5b2319715d028b5e193d0
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jan 27 14:47:31 2011 -0500
+
+ skip unmapped and ref-skip reads in indel calling
+
+commit 3639f37dd8257b24560c35effcc3b6c16c3c1bcb
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jan 27 14:19:15 2011 -0500
+
+ fixed an out-of-boundary bug in rare cases
+
+commit ffc3e89678ab9052b84f403da1e43044b045e73f
+Author: Heng Li <lh3 at live.co.uk>
+Date: Thu Jan 27 14:00:17 2011 -0500
+
+ targetcut can be compiled, though probably buggy
+
+commit f452b3ac51306865ddde31a8d715b155d4d3e6e6
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jan 26 18:58:43 2011 -0500
+
+ this is for a very special application...
+
+commit ca1451c6406c7ee757cb31349ea0b8de70db0656
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jan 26 18:48:09 2011 -0500
+
+ fixed compiling errors
+
+commit 085b87a7642865f17239fb6a436e626e25417838
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jan 26 18:45:09 2011 -0500
+
+ This script was put in a wrong place...
+
+commit 090d360828622520de60385af4928ce1aebe0e48
+Author: Heng Li <lh3 at live.co.uk>
+Date: Wed Jan 26 18:33:58 2011 -0500
+
+ Imported from samtools-r902
+------------------------------------------------------------------------
+r108 | lh3lh3 | 2009-01-20 11:56:45 +0000 (Tue, 20 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/examples/Makefile
+
+made it a little more convenient
+
+------------------------------------------------------------------------
+r107 | lh3lh3 | 2009-01-20 11:53:30 +0000 (Tue, 20 Jan 2009) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/examples/Makefile
+
+added a Makefile
+
+------------------------------------------------------------------------
+r106 | lh3lh3 | 2009-01-20 11:25:05 +0000 (Tue, 20 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/maq2sam.c
+
+support RG tag
+
+------------------------------------------------------------------------
+r105 | lh3lh3 | 2009-01-18 17:37:20 +0000 (Sun, 18 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+
+update changelog
+
+------------------------------------------------------------------------
+r104 | lh3lh3 | 2009-01-18 17:31:21 +0000 (Sun, 18 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_lpileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-18
+ * fixed a bug in bam_lpileup.c: segment start and end are not correctly recognized
+
+------------------------------------------------------------------------
+r103 | lh3lh3 | 2009-01-18 16:34:03 +0000 (Sun, 18 Jan 2009) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-17
+ * fixed a bug when there are reads without coordinates
+ * also recognize type 'c' as 'A'
+ * found a bug in bam_lpileup.c; NOT fixed yet
+
+------------------------------------------------------------------------
+r102 | lh3lh3 | 2009-01-17 19:46:49 +0000 (Sat, 17 Jan 2009) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/INSTALL
+
+Instruction for compilation
+
+------------------------------------------------------------------------
+r101 | lh3lh3 | 2009-01-17 19:31:36 +0000 (Sat, 17 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ A /branches/dev/samtools/Makefile.lite
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/faidx.c
+ M /branches/dev/samtools/misc/Makefile
+ M /branches/dev/samtools/razf.c
+
+ * replaced HAVE_RAZF with _NO_RAZF
+ * added Makefile.lite for people who have trouble with razf.c
+
+------------------------------------------------------------------------
+r100 | lh3lh3 | 2009-01-16 10:03:37 +0000 (Fri, 16 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_mate.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/misc/wgsim.c
+
+ * samtools-0.1.1-15
+ * fixed another bug in fixmate: unmapped pair has non-zero isize
+
+------------------------------------------------------------------------
+r99 | lh3lh3 | 2009-01-16 09:13:36 +0000 (Fri, 16 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+ M /branches/dev/samtools/bam_mate.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-14
+ * fixed a bug in fixmate: isize not equal to zero if two ends mapped to
+ different chr
+
+------------------------------------------------------------------------
+r98 | lh3lh3 | 2009-01-15 16:47:41 +0000 (Thu, 15 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-13
+ * fixed the prior for hom indels (Richard pointed this out)
+
+------------------------------------------------------------------------
+r97 | lh3lh3 | 2009-01-15 16:38:47 +0000 (Thu, 15 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/COPYING
+ M /branches/dev/samtools/bam_sort.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/source.dot
+
+ * samtools-0.1.1-12
+ * fixed a bug in sort
+ * update source file graph and copyright information
+
+------------------------------------------------------------------------
+r96 | lh3lh3 | 2009-01-14 21:46:14 +0000 (Wed, 14 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/glf.c
+
+fixed a typo
+
+------------------------------------------------------------------------
+r95 | lh3lh3 | 2009-01-14 21:44:53 +0000 (Wed, 14 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/glf.c
+
+added a main function for glf.c
+
+------------------------------------------------------------------------
+r94 | lh3lh3 | 2009-01-14 17:14:59 +0000 (Wed, 14 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.h
+ A /branches/dev/samtools/glf.c
+ M /branches/dev/samtools/glf.h
+
+ * samtools-0.1.1-11
+ * generate binary GLFv2
+ * added glfview command to dump GLFv2 binary file
+
+------------------------------------------------------------------------
+r93 | lh3lh3 | 2009-01-14 15:07:44 +0000 (Wed, 14 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_rmdup.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/glf.h
+
+ * samtools-0.1.1-10
+ * fixed several bugs in rmdup
+ * prepare to generate GLF2
+
+------------------------------------------------------------------------
+r92 | lh3lh3 | 2009-01-14 13:27:44 +0000 (Wed, 14 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_import.c
+ A /branches/dev/samtools/bam_rmdup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-9
+ * implemented rmdup; NOT tested yet
+
+------------------------------------------------------------------------
+r91 | lh3lh3 | 2009-01-13 20:15:43 +0000 (Tue, 13 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/examples/00README.txt
+
+update README for typos
+
+------------------------------------------------------------------------
+r90 | lh3lh3 | 2009-01-13 19:57:50 +0000 (Tue, 13 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/examples/ex1.sam.gz
+
+update example
+
+------------------------------------------------------------------------
+r89 | lh3lh3 | 2009-01-13 17:21:38 +0000 (Tue, 13 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.c
+ A /branches/dev/samtools/bam_mate.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-8
+ * added fixmate command
+
+------------------------------------------------------------------------
+r88 | lh3lh3 | 2009-01-13 10:48:23 +0000 (Tue, 13 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-7
+ * change the reported indel position to the previous way
+
+------------------------------------------------------------------------
+r87 | lh3lh3 | 2009-01-12 22:12:12 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-6
+ * added glt output
+ * allow to change indel calling parameters at the command line
+
+------------------------------------------------------------------------
+r86 | lh3lh3 | 2009-01-12 21:16:48 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-5
+ * added two more flags
+ * allowed to select reads shown in pileup with a mask
+
+------------------------------------------------------------------------
+r85 | lh3lh3 | 2009-01-12 20:47:51 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-4
+ * fixed a bug in indexing (linear index)
+ * prepare to add glt output from pileup
+
+------------------------------------------------------------------------
+r84 | lh3lh3 | 2009-01-12 09:22:35 +0000 (Mon, 12 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-3
+ * fixed a bug in outputting the coordinate of an indel
+
+------------------------------------------------------------------------
+r83 | lh3lh3 | 2009-01-11 15:18:01 +0000 (Sun, 11 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-2
+ * pileup: allows to output indel sites only
+
+------------------------------------------------------------------------
+r82 | lh3lh3 | 2009-01-10 23:34:31 +0000 (Sat, 10 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_maqcns.h
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-1
+ * implemented a Bayesian indel caller
+
+------------------------------------------------------------------------
+r81 | lh3lh3 | 2009-01-09 09:54:28 +0000 (Fri, 09 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/examples/00README.txt
+ D /branches/dev/samtools/examples/ex1.fa.fai
+
+Let users generate ex1.fa.fai.
+
+------------------------------------------------------------------------
+r80 | lh3lh3 | 2009-01-08 16:10:08 +0000 (Thu, 08 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/bowtie2sam.pl
+
+make the bowtie converter work for "-k 2"
+
+------------------------------------------------------------------------
+r78 | lh3lh3 | 2009-01-03 17:25:24 +0000 (Sat, 03 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/export2sam.pl
+
+fixed a bug for "QC" reads
+
+------------------------------------------------------------------------
+r77 | lh3lh3 | 2009-01-01 18:32:06 +0000 (Thu, 01 Jan 2009) | 3 lines
+Changed paths:
+ A /branches/dev/samtools/misc/bowtie2sam.pl
+ M /branches/dev/samtools/misc/soap2sam.pl
+
+ * soap2sam.pl: added NM tag
+ * bowtie2sam.pl: converter for bowtie
+
+------------------------------------------------------------------------
+r76 | lh3lh3 | 2008-12-31 23:24:24 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/misc/soap2sam.pl
+
+soap2sam.pl: convert soap output to SAM
+
+------------------------------------------------------------------------
+r75 | lh3lh3 | 2008-12-31 17:54:32 +0000 (Wed, 31 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/misc/wgsim_eval.pl
+
+ * wgsim_eval.pl-0.1.1
+ * fixed a bug for a contig name like "NT_012345"
+
+------------------------------------------------------------------------
+r74 | lh3lh3 | 2008-12-31 16:38:21 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/misc/wgsim_eval.pl
+
+ * evaluate alignment for reads generated by wgsim
+
+------------------------------------------------------------------------
+r73 | lh3lh3 | 2008-12-31 15:11:22 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/Makefile
+ M /branches/dev/samtools/misc/wgsim.c
+
+fixed compiling warnings for wgsim
+
+------------------------------------------------------------------------
+r72 | lh3lh3 | 2008-12-31 13:40:51 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_tview.c
+
+remove an unused variable (a compiler warning only)
+
+------------------------------------------------------------------------
+r71 | lh3lh3 | 2008-12-31 13:37:16 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/Makefile
+ A /branches/dev/samtools/misc/wgsim.c
+
+wgsim: Paired-end reads simulator
+
+------------------------------------------------------------------------
+r70 | bhandsaker | 2008-12-29 20:27:16 +0000 (Mon, 29 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_tview.c
+
+Move definition of bam_nt16_nt4_table so we can build without curses.
+
+------------------------------------------------------------------------
+r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/NEWS
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+Release samtools-0.1.1
+
+------------------------------------------------------------------------
+r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines
+Changed paths:
+ M /branches/dev/samtools/bam_aux.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/razf.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-66
+ * fixed a bug in razf.c: reset z_eof when razf_seek() is called
+ * fixed a memory leak in parsing a region
+ * changed pileup a little bit when -s is in use: output ^ and $
+ * when a bam is not indexed, output more meaningful error message
+ * fixed a bug in indexing for small alignment
+ * fixed a bug in the viewer when we come to the end of a reference file
+ * updated documentation
+ * prepare to release 0.1.1
+
+------------------------------------------------------------------------
+r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/examples
+ A /branches/dev/samtools/examples/00README.txt
+ A /branches/dev/samtools/examples/ex1.fa
+ A /branches/dev/samtools/examples/ex1.fa.fai
+ A /branches/dev/samtools/examples/ex1.sam.gz
+
+example
+
+------------------------------------------------------------------------
+r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+
+update ChangeLog
+
+------------------------------------------------------------------------
+r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/misc/export2sam.pl
+
+ * added comments
+ * fixed several bugs
+
+------------------------------------------------------------------------
+r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/misc/export2sam.pl
+
+convert Export format to SAM; not thoroughly tested
+
+------------------------------------------------------------------------
+r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/source.dot
+
+ * samtools-0.1.0-65
+ * pileup: generate maq-like simple output
+ * pileup: allow to output pileup at required sites
+ * source.dot: source file relationship graph
+ * tview: fixed a minor bug
+
+------------------------------------------------------------------------
+r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines
+Changed paths:
+ D /branches/dev/samtools/misc/all2sam.pl
+
+remove all2sam.pl
+
+------------------------------------------------------------------------
+r54 | lh3lh3 | 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/COPYING
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/faidx.h
+ M /branches/dev/samtools/khash.h
+ M /branches/dev/samtools/kseq.h
+ M /branches/dev/samtools/ksort.h
+ M /branches/dev/samtools/samtools.1
+
+Added copyright information and a bit more documentation. No code change.
+
+------------------------------------------------------------------------
+r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-64
+ * improved efficiency of the indel caller for spliced alignments
+
+------------------------------------------------------------------------
+r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_aux.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-63
+ * a bit of code cleanup: reduce the dependency between source files
+
+------------------------------------------------------------------------
+r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-62
+ * fixed a memory leak
+
+------------------------------------------------------------------------
+r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/samtools.1
+
+update documentation, ChangeLog and a comment
+
+------------------------------------------------------------------------
+r49 | lh3lh3 | 2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_maqcns.h
+ M /branches/dev/samtools/bam_pileup.c
+ A /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-61
+ * moved pileup command to a separate source file
+ * added indel caller
+ * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!)
+ * updated documentation
+
+------------------------------------------------------------------------
+r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-60
+ * fixed another bug in maqcns when there is a nearby deletion
+
+------------------------------------------------------------------------
+r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-59
+ * pileup: outputting consensus is now optional
+ * fixed a bug in glfgen. This bug also exists in maq's glfgen. However,
+ I am not quite sure why the previous version may have problem.
+
+------------------------------------------------------------------------
+r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-58
+ * add maq consensus to pileup. However, I will move this part to a new
+ command as strictly speaking, consensus calling is not part of pileup,
+ and imposing it would make it harder to generate for other language
+ bindings.
+
+------------------------------------------------------------------------
+r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bgzf.c
+
+Fix bug in tell() after reads that consume to the exact end of a block.
+
+------------------------------------------------------------------------
+r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-57
+ * fixed a bug in parser when there are auxiliary fields
+ * made the parser a bit more robust
+
+------------------------------------------------------------------------
+r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-56
+ * fixed a bug in bgzf (only reading is affected)
+ * fixed a typo in bam_index.c
+ * in bam_index.c, check potential bugs in the underlying I/O library
+
+------------------------------------------------------------------------
+r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-55
+ * tried to make pileup work with clipping (previously not), though NOT tested
+ * removed -v from pileup
+ * made pileup take the reference sequence
+
+------------------------------------------------------------------------
+r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-54
+ * in parser, recognize "=", rather than ",", as a match
+ * in parser, correctly parse "=" at the MRNM field.
+
+------------------------------------------------------------------------
+r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/maq2sam.c
+
+fixed a bug in handling maq flag 64 and 192
+
+------------------------------------------------------------------------
+r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/md5fa.c
+
+also calculate unordered md5sum check
+
+------------------------------------------------------------------------
+r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/md5fa.c
+
+fixed a minor bug when there are spaces in the sequence
+
+------------------------------------------------------------------------
+r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/md5fa.c
+
+fixed a potential memory leak
+
+------------------------------------------------------------------------
+r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+
+ * fixed a bug in import: bin is wrongly calculated
+
+------------------------------------------------------------------------
+r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/all2sam.pl
+
+nothing, really
+
+------------------------------------------------------------------------
+r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/kseq.h
+ M /branches/dev/samtools/misc/Makefile
+ A /branches/dev/samtools/misc/md5.c
+ A /branches/dev/samtools/misc/md5.h
+ A /branches/dev/samtools/misc/md5fa.c
+
+ * fixed two warnings in kseq.h
+ * added md5sum utilities
+
+------------------------------------------------------------------------
+r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/kseq.h
+ D /branches/dev/samtools/kstream.h
+
+ * samtools-0.1.0-52
+ * replace kstream with kseq. kseq is a superset of kstream. I need the
+ extra functions in kseq.h.
+ * also compile stand-alone faidx
+
+------------------------------------------------------------------------
+r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_sort.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-51
+ * sorting by read names is available
+
+------------------------------------------------------------------------
+r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bam_sort.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/misc/maq2sam.c
+
+ * samtools-0.1.0-50
+ * format change to meet the latest specification
+
+------------------------------------------------------------------------
+r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/misc/maq2sam.c
+
+ * minor change in maqcns: special care when n==0
+ * change maq2sam to meet the latest specification
+
+------------------------------------------------------------------------
+r27 | lh3lh3 | 2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+ M /branches/dev/samtools/razf.h
+
+considerable code clean up in razf
+
+------------------------------------------------------------------------
+r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/faidx.c
+
+make RAZF optional in faidx.c
+
+------------------------------------------------------------------------
+r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_aux.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-49
+ * added routines for retrieving aux data, NOT TESTED YET!
+
+------------------------------------------------------------------------
+r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam.c
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-48
+ * bgzf: fixed a potential integer overflow on 32-bit machines
+ * maqcns: set the minimum combined quality as 0
+ * supporting hex strings
+
+------------------------------------------------------------------------
+r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-47
+ * fixed the bug in maqcns
+
+------------------------------------------------------------------------
+r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ A /branches/dev/samtools/bam_maqcns.c
+ A /branches/dev/samtools/bam_maqcns.h
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/glf.h
+
+ * samtools-0.1.0-46
+ * add MAQ consensus caller, currently BUGGY!
+
+------------------------------------------------------------------------
+r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-45
+ * tview: display padded alignment (but not P operation)
+ * better coordinates and reference sequence
+
+------------------------------------------------------------------------
+r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/ChangeLog
+
+new ChangeLog
+
+------------------------------------------------------------------------
+r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+ D /branches/dev/samtools/ChangeLog
+ A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6)
+
+Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from
+the log of my personal SVN repository.
+
+------------------------------------------------------------------------
+r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-44
+ * declare fseeko and ftello as some Linux may not do this by default and
+ missing these declarations will make bgzf buggy
+ * get rid of some harmless warnings
+ * use BGZF by default, now
+
+------------------------------------------------------------------------
+r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/razf.c
+
+ * samtools-0.1.0-43
+ * fixed a bug in razf_read()
+ * give more warnings when the file is truncated (or due to bugs in I/O library)
+
+------------------------------------------------------------------------
+r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bgzf.c
+
+fixed a bug in bgzf.c at the end of the file
+
+------------------------------------------------------------------------
+r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-42
+ * a lot happened to RAZF, although samtools itself is untouched. Better
+ also update the version number anyway to avoid confusion
+
+------------------------------------------------------------------------
+r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+
+a change from Jue, but I think it should not matter
+
+------------------------------------------------------------------------
+r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+
+fixed a potential bug in razf. However, it seems still buggy, just
+rarely happens, very rarely.
+
+------------------------------------------------------------------------
+r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/razf.c
+
+fixed a bug in razf, with the help of Jue
+
+------------------------------------------------------------------------
+r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+
+remove a comment
+
+------------------------------------------------------------------------
+r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/razf.c
+ M /branches/dev/samtools/razf.h
+
+ * Jue has updated razf to realize Bob's scheme
+
+------------------------------------------------------------------------
+r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/samtools.1
+
+the manual page
+
+------------------------------------------------------------------------
+r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines
+Changed paths:
+ A /branches/dev/samtools/ChangeLog
+ A /branches/dev/samtools/Makefile
+ A /branches/dev/samtools/bam.c
+ A /branches/dev/samtools/bam.h
+ A /branches/dev/samtools/bam_aux.c
+ A /branches/dev/samtools/bam_endian.h
+ A /branches/dev/samtools/bam_import.c
+ A /branches/dev/samtools/bam_index.c
+ A /branches/dev/samtools/bam_lpileup.c
+ A /branches/dev/samtools/bam_pileup.c
+ A /branches/dev/samtools/bam_sort.c
+ A /branches/dev/samtools/bam_tview.c
+ A /branches/dev/samtools/bamtk.c
+ A /branches/dev/samtools/bgzf.c
+ A /branches/dev/samtools/bgzf.h
+ A /branches/dev/samtools/bgzip.c
+ A /branches/dev/samtools/faidx.c
+ A /branches/dev/samtools/faidx.h
+ A /branches/dev/samtools/khash.h
+ A /branches/dev/samtools/ksort.h
+ A /branches/dev/samtools/kstream.h
+ A /branches/dev/samtools/misc
+ A /branches/dev/samtools/misc/Makefile
+ A /branches/dev/samtools/misc/all2sam.pl
+ A /branches/dev/samtools/misc/maq2sam.c
+ A /branches/dev/samtools/razf.c
+ A /branches/dev/samtools/razf.h
+ A /branches/dev/samtools/razip.c
+ A /branches/dev/samtools/zutil.h
+
+The initial version of samtools, replicated from my local SVN repository.
+The current version is: 0.1.0-42. All future development will happen here.
+
+------------------------------------------------------------------------
+r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools
+
+samtools (C version)
+
+------------------------------------------------------------------------
+------------------------------------------------------------------------
+r703 | lh3 | 2008-11-25 20:20:02 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/samtools.1
+
+rename bamtk to samtools
+
+------------------------------------------------------------------------
+r702 | lh3 | 2008-11-25 20:15:09 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ D /branches/prog/bam/bamtk.1
+ A /branches/prog/bam/samtools.1 (from /branches/prog/bam/bamtk.1:679)
+
+rename bamtk.1 to samtools.1
+
+------------------------------------------------------------------------
+r701 | lh3 | 2008-11-25 13:29:10 +0000 (Tue, 25 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/misc/Makefile
+
+ * samtools-0.1.0-41
+ * small (but a bit dangerous) changes to meet the latest specification
+
+------------------------------------------------------------------------
+r700 | lh3 | 2008-11-25 13:15:11 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam/misc/all2sam.pl (from /branches/prog/bam/misc/all2tam.pl:649)
+ D /branches/prog/bam/misc/all2tam.pl
+ A /branches/prog/bam/misc/maq2sam.c (from /branches/prog/bam/misc/maq2tam.c:699)
+ D /branches/prog/bam/misc/maq2tam.c
+
+rename tam to sam
+
+------------------------------------------------------------------------
+r699 | lh3 | 2008-11-25 13:14:49 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/misc/maq2tam.c
+
+change for the new specification
+
+------------------------------------------------------------------------
+r698 | lh3 | 2008-11-24 13:15:20 +0000 (Mon, 24 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/razf.c
+ M /branches/prog/bam/razf.h
+
+ * add a fake BGZF mode to razf. It is fake in that it loads razf index into
+ memory but gives BGZF like virtual offset
+
+------------------------------------------------------------------------
+r697 | lh3 | 2008-11-24 09:53:44 +0000 (Mon, 24 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam/ChangeLog
+
+change log
+
+------------------------------------------------------------------------
+r696 | lh3 | 2008-11-24 09:53:23 +0000 (Mon, 24 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bgzf.c
+
+updated bgzf, on behalf of Bob
+
+------------------------------------------------------------------------
+r695 | lh3 | 2008-11-23 11:40:31 +0000 (Sun, 23 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/razf.c
+
+fixed a bug in razf
+
+------------------------------------------------------------------------
+r694 | lh3 | 2008-11-22 16:23:52 +0000 (Sat, 22 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_lpileup.c
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-40
+ * fixed two small memory leaks
+ * fixed a memory problem when seek outside the length of the sequence
+
+------------------------------------------------------------------------
+r693 | lh3 | 2008-11-22 16:10:04 +0000 (Sat, 22 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-39
+ * fixed an uninitialized warning. This does not matter in fact
+
+------------------------------------------------------------------------
+r692 | lh3 | 2008-11-22 15:44:05 +0000 (Sat, 22 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/razf.c
+ M /branches/prog/bam/razf.h
+
+Jue's new razf
+
+------------------------------------------------------------------------
+r691 | lh3 | 2008-11-21 21:30:39 +0000 (Fri, 21 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/bgzip.c
+
+ * bam-0.1.0-38
+ * get rid of some warnings in bgzip.c
+ * potentially improve performance in indexing for BGZF
+
+------------------------------------------------------------------------
+r690 | lh3 | 2008-11-21 21:15:51 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bgzf.c
+
+I think I have fixed the bug in bgzf
+
+------------------------------------------------------------------------
+r689 | lh3 | 2008-11-21 20:48:56 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bgzf.c
+
+bug fix by Bob
+
+------------------------------------------------------------------------
+r688 | lh3 | 2008-11-21 20:37:27 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+
+fixed a bug due to the name change in _IOLIB
+
+------------------------------------------------------------------------
+r687 | lh3 | 2008-11-21 14:42:56 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bgzf.c
+
+fix small things
+
+------------------------------------------------------------------------
+r686 | lh3 | 2008-11-21 14:37:59 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam/bgzf.c
+ A /branches/prog/bam/bgzf.h
+ A /branches/prog/bam/bgzip.c
+
+Bob's BGZF format, although currently buggy
+
+------------------------------------------------------------------------
+r685 | lh3 | 2008-11-21 09:48:20 +0000 (Fri, 21 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-37
+ * improve interface a little bit
+
+------------------------------------------------------------------------
+r684 | lh3 | 2008-11-21 09:30:18 +0000 (Fri, 21 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-36
+ * improve the interface of tview, a little bit
+
+------------------------------------------------------------------------
+r683 | lh3 | 2008-11-20 22:33:54 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam_tview.c
+
+a little better viewer
+
+------------------------------------------------------------------------
+r682 | lh3 | 2008-11-20 22:27:01 +0000 (Thu, 20 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-35
+ * better viewer
+
+------------------------------------------------------------------------
+r681 | lh3 | 2008-11-20 20:51:16 +0000 (Thu, 20 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-34
+ * tview is now a component of bamtk
+
+------------------------------------------------------------------------
+r680 | lh3 | 2008-11-20 19:17:30 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam/bam_tview.c
+
+text alignment viewer
+
+------------------------------------------------------------------------
+r679 | lh3 | 2008-11-20 19:17:15 +0000 (Thu, 20 Nov 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_lpileup.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.1
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-33
+ * added routines to reset pileup buffers
+ * fixed a bug in faidx
+ * add text alignment viewer
+
+------------------------------------------------------------------------
+r678 | lh3 | 2008-11-20 11:05:02 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ A /branches/prog/bam/bam_lpileup.c (from /branches/prog/bam/bam_tview.c:668)
+ D /branches/prog/bam/bam_tview.c
+
+rename tview as lpileup
+
+------------------------------------------------------------------------
+r677 | lh3 | 2008-11-20 10:08:52 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/razf.c
+
+fixed a bug in razf
+
+------------------------------------------------------------------------
+r676 | lh3 | 2008-11-19 22:52:20 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/faidx.h
+
+add documentation
+
+------------------------------------------------------------------------
+r674 | lh3 | 2008-11-19 21:39:17 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bamtk.1
+ M /branches/prog/bam/faidx.h
+
+update documentation
+
+------------------------------------------------------------------------
+r673 | lh3 | 2008-11-19 21:19:03 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam/bamtk.1
+
+add manual page
+
+------------------------------------------------------------------------
+r672 | lh3 | 2008-11-19 16:40:49 +0000 (Wed, 19 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-32
+ * make faidx more error resistant
+
+------------------------------------------------------------------------
+r671 | lh3 | 2008-11-19 16:09:55 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/faidx.h
+
+add index
+
+------------------------------------------------------------------------
+r670 | lh3 | 2008-11-19 16:02:39 +0000 (Wed, 19 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-31
+ * show reference sequence in pileup -v (not in the default pileup)
+
+------------------------------------------------------------------------
+r669 | lh3 | 2008-11-19 14:51:17 +0000 (Wed, 19 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-30
+ * put faidx in bamtk and remove faidx_main.c
+
+------------------------------------------------------------------------
+r668 | lh3 | 2008-11-19 14:15:05 +0000 (Wed, 19 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+ A /branches/prog/bam/faidx.c
+ A /branches/prog/bam/faidx.h
+ M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-29
+ * fixed a bug in tview.c
+ * prepare to add faidx
+
+------------------------------------------------------------------------
+r667 | lh3 | 2008-11-19 10:20:45 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/razf.c
+ M /branches/prog/bam/razf.h
+
+gzip-compatible razf
+
+------------------------------------------------------------------------
+r664 | lh3 | 2008-11-18 12:50:23 +0000 (Tue, 18 Nov 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-28
+ * fetch: fixed a bug at an array boundary
+ * fetch: fixed a bug when the whole chromosome is retrieved
+ * add linear index
+
+------------------------------------------------------------------------
+r663 | lh3 | 2008-11-17 21:29:22 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-27
+ * put l_qseq into core and move l_aux to bam1_t
+
+------------------------------------------------------------------------
+r662 | lh3 | 2008-11-17 20:55:16 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-26
+ * save seq and qual separately
+
+------------------------------------------------------------------------
+r661 | lh3 | 2008-11-17 13:09:37 +0000 (Mon, 17 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+
+little
+
+------------------------------------------------------------------------
+r660 | lh3 | 2008-11-17 13:06:14 +0000 (Mon, 17 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+
+more documentation
+
+------------------------------------------------------------------------
+r659 | lh3 | 2008-11-17 12:55:08 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-25
+ * make tview work for TAM
+
+------------------------------------------------------------------------
+r658 | lh3 | 2008-11-17 12:50:21 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-24
+ * make tview as an independent module
+
+------------------------------------------------------------------------
+r657 | lh3 | 2008-11-17 11:26:06 +0000 (Mon, 17 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_pileup.c
+
+change little
+
+------------------------------------------------------------------------
+r656 | lh3 | 2008-11-16 21:33:19 +0000 (Sun, 16 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-23
+ * also add tview for TAM
+
+------------------------------------------------------------------------
+r655 | lh3 | 2008-11-16 21:29:46 +0000 (Sun, 16 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-22
+ * make tview more efficient for deep depth
+
+------------------------------------------------------------------------
+r654 | lh3 | 2008-11-16 20:52:19 +0000 (Sun, 16 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_pileup.c
+ A /branches/prog/bam/bam_tview.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-21
+ * fixed bug in the TAM parser: lowercase not recognized
+ * unfinished function to leveled pileup (tview)
+
+------------------------------------------------------------------------
+r653 | lh3 | 2008-11-15 12:58:36 +0000 (Sat, 15 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-20
+ * pileup now display deleted bases as '*'
+
+------------------------------------------------------------------------
+r652 | lh3 | 2008-11-15 09:58:39 +0000 (Sat, 15 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-19
+ * fixed a bug in fetch()
+ * reduce memory in indexing
+
+------------------------------------------------------------------------
+r651 | lh3 | 2008-11-14 21:56:05 +0000 (Fri, 14 Nov 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-18
+ * important changes are made to index: the index size is increased, but
+ now we have no limit on file sizes and the new method potentially
+ works with BGZF, Bob's new compression format.
+
+------------------------------------------------------------------------
+r650 | lh3 | 2008-11-14 16:03:22 +0000 (Fri, 14 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-17
+ * more comments in bam.h
+ * fixed a bug in bam_index.c
+
+------------------------------------------------------------------------
+r649 | lh3 | 2008-11-13 16:04:18 +0000 (Thu, 13 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bam_sort.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-16
+ * use macros to retrieve pointers from bam1_t and thus reduce the size
+ of bam1_t struct.
+
+------------------------------------------------------------------------
+r648 | lh3 | 2008-11-13 13:21:39 +0000 (Thu, 13 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_sort.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-15
+ * make more things work over pipe
+
+------------------------------------------------------------------------
+r647 | lh3 | 2008-11-13 12:49:28 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/misc/maq2tam.c
+
+fixed a bug in maq2tam
+
+------------------------------------------------------------------------
+r646 | lh3 | 2008-11-13 11:46:59 +0000 (Thu, 13 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/Makefile
+ M /branches/prog/bam/misc/Makefile
+ M /branches/prog/bam/misc/maq2tam.c
+
+ * bug fix in maq2tam.c
+ * improve Makefile
+
+------------------------------------------------------------------------
+r645 | lh3 | 2008-11-13 11:39:46 +0000 (Thu, 13 Nov 2008) | 3 lines
+Changed paths:
+ A /branches/prog/bam/misc/Makefile
+ M /branches/prog/bam/misc/maq2tam.c
+
+ * corrected maq2tam
+ * add Makefile
+
+------------------------------------------------------------------------
+r644 | lh3 | 2008-11-13 11:25:45 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/razf.c
+
+fixed the bug in buffered write (on behalf of Jue)
+
+------------------------------------------------------------------------
+r643 | lh3 | 2008-11-13 10:53:42 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+ D /branches/prog/bam/all2tam.pl
+ A /branches/prog/bam/misc/all2tam.pl (from /branches/prog/bam/all2tam.pl:642)
+
+move to misc
+
+------------------------------------------------------------------------
+r642 | lh3 | 2008-11-13 10:53:23 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/all2tam.pl
+
+change tag
+
+------------------------------------------------------------------------
+r641 | lh3 | 2008-11-13 10:53:12 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+ D /branches/prog/bam/utils
+
+has been renamed
+
+------------------------------------------------------------------------
+r640 | lh3 | 2008-11-13 10:52:50 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam/misc (from /branches/prog/bam/utils:639)
+
+rename
+
+------------------------------------------------------------------------
+r639 | lh3 | 2008-11-13 10:52:35 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam/utils
+ A /branches/prog/bam/utils/maq2tam.c
+
+utilities (converters and so on)
+
+------------------------------------------------------------------------
+r638 | lh3 | 2008-11-12 22:24:22 +0000 (Wed, 12 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-14
+ * copy the text header to BAM
+ * add BAM1 header flag
+
+------------------------------------------------------------------------
+r637 | lh3 | 2008-11-12 14:56:08 +0000 (Wed, 12 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-13
+ * fixed a bug in razf
+ * improved and fixed potential bugs in index
+
+------------------------------------------------------------------------
+r636 | lh3 | 2008-11-12 11:57:13 +0000 (Wed, 12 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+update documentation in the HeaderDOC format
+
+------------------------------------------------------------------------
+r635 | lh3 | 2008-11-12 10:08:38 +0000 (Wed, 12 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-12
+ * more documentation
+ * rename baf1_core_t as bam1_core_t
+
+------------------------------------------------------------------------
+r634 | lh3 | 2008-11-11 23:00:35 +0000 (Tue, 11 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_pileup.c
+
+documentation
+
+------------------------------------------------------------------------
+r633 | lh3 | 2008-11-11 21:23:49 +0000 (Tue, 11 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-11
+ * give up regional pileup. We can now use pipe to mimic that.
+ * for index file, change suffix .idx to .bmi
+
+------------------------------------------------------------------------
+r632 | lh3 | 2008-11-11 21:00:11 +0000 (Tue, 11 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-10
+ * make pileup work on TAM
+
+------------------------------------------------------------------------
+r631 | lh3 | 2008-11-11 09:20:29 +0000 (Tue, 11 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/razf.c
+ M /branches/prog/bam/razf.h
+ M /branches/prog/bam/razip.c
+
+ * bamtk-0.1.0-9
+ * razf now supports streaming
+ * prepare to improve pileup (have not yet)
+
+------------------------------------------------------------------------
+r630 | lh3 | 2008-11-10 18:34:40 +0000 (Mon, 10 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-8
+ * improve the interface of TAM parser
+
+------------------------------------------------------------------------
+r629 | lh3 | 2008-11-10 13:06:13 +0000 (Mon, 10 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-7
+ * almost nothing
+
+------------------------------------------------------------------------
+r628 | lh3 | 2008-11-10 12:56:36 +0000 (Mon, 10 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-6
+ * fixed a bug in bam_pileup.c
+
+------------------------------------------------------------------------
+r627 | lh3 | 2008-11-10 11:32:46 +0000 (Mon, 10 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bamtk.c
+ M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-5
+ * fixed a bug in razf.c, caused by my modifications
+ * improve the interface of pileup. Now it will be slower but more flexible
+
+------------------------------------------------------------------------
+r626 | lh3 | 2008-11-09 20:51:04 +0000 (Sun, 09 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-4
+ * view: dumping binary output
+
+------------------------------------------------------------------------
+r625 | lh3 | 2008-11-09 20:31:54 +0000 (Sun, 09 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bam_pileup.c
+ M /branches/prog/bam/bam_sort.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-3
+ * rename functions
+
+------------------------------------------------------------------------
+r624 | lh3 | 2008-11-09 15:07:32 +0000 (Sun, 09 Nov 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bam/bam.h
+
+add comments
+
+------------------------------------------------------------------------
+r623 | lh3 | 2008-11-08 22:32:49 +0000 (Sat, 08 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-2
+ * improve indexing for a mixture of long and short reads, although currently
+ I do not know whether it really works...
+
+------------------------------------------------------------------------
+r622 | lh3 | 2008-11-08 22:13:58 +0000 (Sat, 08 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bam/bam_index.c
+ M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-1
+ * prepare for improving indexing algorithm
+
+------------------------------------------------------------------------
+r621 | lh3 | 2008-11-08 20:28:09 +0000 (Sat, 08 Nov 2008) | 4 lines
+Changed paths:
+ A /branches/prog/bam/all2tam.pl
+ M /branches/prog/bam/bam.c
+ M /branches/prog/bam/bam.h
+ M /branches/prog/bam/bam_import.c
+ M /branches/prog/bam/bamtk.c
+ D /branches/prog/bam/tam_utils.pl
+
+ * bamtk-0.1.0
+ * smarter integers
+ * rename tam_utils.pl to all2tam.pl
+
+------------------------------------------------------------------------
+r620 | lh3 | 2008-11-08 17:17:22 +0000 (Sat, 08 Nov 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bam
+ A /branches/prog/bam/Makefile
+ A /branches/prog/bam/bam.c
+ A /branches/prog/bam/bam.h
+ A /branches/prog/bam/bam_endian.h
+ A /branches/prog/bam/bam_import.c
+ A /branches/prog/bam/bam_index.c
+ A /branches/prog/bam/bam_pileup.c
+ A /branches/prog/bam/bam_sort.c
+ A /branches/prog/bam/bamtk.c
+ A /branches/prog/bam/khash.h
+ A /branches/prog/bam/ksort.h
+ A /branches/prog/bam/kstream.h
+ A /branches/prog/bam/razf.c
+ A /branches/prog/bam/razf.h
+ A /branches/prog/bam/razip.c
+ A /branches/prog/bam/tam_utils.pl
+ A /branches/prog/bam/zutil.h
+
+The Binary Alignment/Mapping format.
+
+------------------------------------------------------------------------
diff --git a/samtools-0.1.19/INSTALL b/samtools-0.1.19/INSTALL
new file mode 100644
index 0000000..37d84a9
--- /dev/null
+++ b/samtools-0.1.19/INSTALL
@@ -0,0 +1,30 @@
+System Requirements
+===================
+
+SAMtools depends on the zlib library <http://www.zlib.net>. Version 1.2.3+ is
+preferred and with 1.2.3+ you can compile razip and use it to compress a FASTA
+file. SAMtools' faidx is able to index a razip-compressed FASTA file to save
+disk space. Older zlib also works with SAMtools, but razip cannot be compiled.
+
+The text-based viewer (tview) requires the GNU ncurses library
+<http://www.gnu.org/software/ncurses/>, which comes with Mac OS X and most of
+the modern Linux/Unix distributions. If you do not have this library installed,
+you can still compile the rest of SAMtools by manually editing the Makefile:
+change `-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' on the line starting with
+`DFLAGS=', and comment out the line starting with `LIBCURSES='.
+
+
+Compilation
+===========
+
+Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can compile
+razip with `make razip'.
+
+
+Installation
+============
+
+Copy `samtools', `bcftools/bcftools' and other executables/scripts in `misc' to
+a location you want (e.g. a directory in your $PATH). You may also copy
+`samtools.1' and `bcftools/bcftools.1' to a directory in your $MANPATH such
+that the `man' command may find the manual.
diff --git a/samtools-0.1.19/Makefile b/samtools-0.1.19/Makefile
new file mode 100644
index 0000000..2f51bfc
--- /dev/null
+++ b/samtools-0.1.19/Makefile
@@ -0,0 +1,101 @@
+CC= gcc
+CFLAGS= -g -Wall -O2
+#LDFLAGS= -Wl,-rpath,\$$ORIGIN/../lib
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1
+KNETFILE_O= knetfile.o
+LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
+ bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \
+ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o
+AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
+ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
+ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
+ cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o \
+ bam_tview_curses.o bam_tview_html.o
+PROG= samtools
+INCLUDES= -I.
+SUBDIRS= . bcftools misc
+LIBPATH=
+LIBCURSES= -lcurses # -lXCurses
+
+.SUFFIXES:.c .o
+.PHONY: all lib
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+all:$(PROG)
+
+.PHONY:all lib clean cleanlocal
+.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
+lib:libbam.a
+
+libbam.a:$(LOBJS)
+ $(AR) -csru $@ $(LOBJS)
+
+samtools:lib-recur $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LDFLAGS) libbam.a -Lbcftools -lbcf $(LIBPATH) $(LIBCURSES) -lm -lz -lpthread
+
+razip:razip.o razf.o $(KNETFILE_O)
+ $(CC) $(CFLAGS) -o $@ $^ -lz
+
+bgzip:bgzip.o bgzf.o $(KNETFILE_O)
+ $(CC) $(CFLAGS) -o $@ $^ -lz -lpthread
+
+bgzf.o:bgzf.c bgzf.h
+ $(CC) -c $(CFLAGS) $(DFLAGS) -DBGZF_CACHE $(INCLUDES) bgzf.c -o $@
+
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
+sam.o:sam.h bam.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h bam_tview.h
+bam_tview_curses.o:bam.h faidx.h bam_tview.h
+bam_tview_html.o:bam.h faidx.h bam_tview.h
+bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
+sam_header.o:sam_header.h khash.h
+bcf.o:bcftools/bcf.h
+bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h
+bam2bcf_indel.o:bam2bcf.h
+errmod.o:errmod.h
+phase.o:bam.h khash.h ksort.h
+bamtk.o:bam.h
+
+faidx.o:faidx.h razf.h khash.h
+faidx_main.o:faidx.h razf.h
+
+
+libbam.1.dylib-local:$(LOBJS)
+ libtool -dynamic $(LOBJS) -o libbam.1.dylib -lc -lz
+
+libbam.so.1-local:$(LOBJS)
+ $(CC) -shared -Wl,-soname,libbam.so -o libbam.so.1 $(LOBJS) -lc -lz
+
+dylib:
+ @$(MAKE) cleanlocal; \
+ case `uname` in \
+ Linux) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.so.1-local;; \
+ Darwin) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.1.dylib-local;; \
+ *) echo 'Unknown OS';; \
+ esac
+
+
+cleanlocal:
+ rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib
+
+clean:cleanlocal-recur
diff --git a/samtools-0.1.19/Makefile.mingw b/samtools-0.1.19/Makefile.mingw
new file mode 100644
index 0000000..7a57ffc
--- /dev/null
+++ b/samtools-0.1.19/Makefile.mingw
@@ -0,0 +1,63 @@
+CC= gcc.exe
+AR= ar.exe
+CFLAGS= -g -Wall -O2
+DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2
+KNETFILE_O= knetfile.o
+LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
+ bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \
+ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o
+AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
+ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
+ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
+ cut_target.o phase.o bam_cat.o bam2depth.o
+BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \
+ bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \
+ bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o
+PROG= samtools.exe bcftools.exe
+INCLUDES= -I. -Iwin32
+SUBDIRS= .
+LIBPATH=
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+.PHONY:all lib clean cleanlocal
+.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
+lib:libbam.a
+
+libbam.a:$(LOBJS)
+ $(AR) -cru $@ $(LOBJS)
+
+samtools.exe:$(AOBJS) libbam.a $(BCFOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32
+
+bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o
+ $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32
+
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
+sam.o:sam.h bam.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h
+bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
+sam_header.o:sam_header.h khash.h
+bcf.o:bcftools/bcf.h
+bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h
+bam2bcf_indel.o:bam2bcf.h
+errmod.o:errmod.h
+
+faidx.o:faidx.h razf.h khash.h
+faidx_main.o:faidx.h razf.h
+
+clean:
+ rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib
diff --git a/samtools-0.1.19/NEWS b/samtools-0.1.19/NEWS
new file mode 100644
index 0000000..121485e
--- /dev/null
+++ b/samtools-0.1.19/NEWS
@@ -0,0 +1,836 @@
+Beta Release 0.1.19 (15 March, 2013)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in samtools and bcftools:
+
+ * The latest source code and development moved to github,
+ http://github.com/samtools/samtools
+
+ * Many important bugfixes and contributions by many people. Thanks to all!
+
+ * Performance improvements (multi-threading)
+
+ * Important changes in calling, see
+ - samtools mpileup -p
+ - bcftools view -m
+
+ * New annotations useful for filtering (RPB, HWE, QBD, MDV)
+
+ * New tools, bamcheck and plot-bamcheck
+
+ * New features in samtools tview
+
+ * And much more..
+
+For a detailed list of commits, please see
+http://github.com/samtools/samtools/commits/master
+
+(0.1.19: 15 March 2013, commit 96b5f2294ac0054230e88913c4983d548069ea4e)
+
+
+Beta Release 0.1.18 (2 September, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in samtools:
+
+ * Support the new =/X CIGAR operators (by Peter Cock).
+
+ * Allow to subsample BAM while keeping the pairing intact (view -s).
+
+ * Implemented variant distance bias as a new filter (by Petr Danecek).
+
+ * Bugfix: huge memory usage during indexing
+
+ * Bugfix: use of uninitialized variable in mpileup (rare)
+
+ * Bugfix: wrong BAQ probability (rare)
+
+Notable changes in bcftools:
+
+ * Support indel in the contrast caller.
+
+ * Bugfix: LRT2=nan in rare cases
+
+(0.1.18: 2 September 2011, r982:295)
+
+
+
+Beta Release 0.1.17 (6 July, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With the maturity of `mpileup' and the lack of update in the `pileup' command,
+the `pileup' command is now formally dropped. Most of the pileup functionality,
+such as outputting mapping quality and read positions, has been added to
+`mpileup'.
+
+Since this release, `bcftools view' is able to perform contrast SNP calling
+(option -T) for discovering de novo and/or somatic mutations between a pair of
+samples or in a family trio. Potential mutations are scored by a log likelihood
+ratio, which is very simple in math, but should be comparable to more
+sophisticated methods. Note that getting the score is only the very first step.
+A lot more needs to be done to reduce systematic errors due to mapping and
+reference errors and structural variations.
+
+Other notable changes in samtools:
+
+ * Improved sorting order checking during indexing.
+
+ * Improved region parsing. Colons in reference sequence names are parsed
+ properly.
+
+ * Fixed an issue where mpileup does not apply BAQ for the first few reads when
+ a region is specified.
+
+ * Fixed an issue where `faidx' does not work with FASTA files with long lines.
+
+ * Bugfix: wrong SP genotype information in the BCF output.
+
+Other notable changes in bcftools:
+
+ * Output the ML estimate of the allele count.
+
+ * Added the HWE plus F<0 filter to varFilter. For multiple samples, it
+ effectively filters false heterozygous calls around centromeres.
+
+ * For association mapping, perform both 1-degree and 2-degree test. The
+ 2-degree test is conservative but more robust to HWE violation.
+
+(0.1.17: 6 July 2011, r973:277)
+
+
+
+Beta Release 0.1.16 (21 April, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in samtools:
+
+ * Support the new SAM/BAM type `B' in the latest SAM spec v1.4.
+
+ * When the output file of `samtools merge' exists, do not overwrite it unless
+ a new command-line option `-f' is applied.
+
+ * Bugfix: BED support is not working when the input BED is not sorted.
+
+ * Bugfix: some reads without coordinates but given on the reverse strand are
+ lost in merging.
+
+Notable changes in bcftools:
+
+ * Code cleanup: separated max-likelihood inference and Bayesian inference.
+
+ * Test Hardy-Weinberg equilibrium with a likelihood-ratio test.
+
+ * Provided another association test P-value by likelihood-ratio test.
+
+ * Use Brent's method to estimate the site allele frequency when EM converges
+ slowly. The resulting ML estimate of allele frequency is more accurate.
+
+ * Added the `ldpair' command, which computes r^2 between SNP pairs given in
+ an input file.
+
+Also, the `pileup' command, which has been deprecated by `mpileup' since
+version 0.1.10, will be dropped in the next release. The old `pileup' command
+is substandard and causing a lot of confusion.
+
+(0.1.16: 21 April 2011, r963:234)
+
+
+
+Beta Release 0.1.15 (10 April, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Allow to perform variant calling or to extract information in multiple
+ regions specified by a BED file (`samtools mpileup -l', `samtools view -L'
+ and `bcftools view -l').
+
+ * Added the `depth' command to samtools to compute the per-base depth with a
+ simpler interface. File `bam2depth.c', which implements this command, is the
+ recommended example on how to use the mpileup APIs.
+
+ * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg
+ test using this estimate.
+
+ * For `samtools view', when `-R' is specified, drop read groups in the header
+ that are not contained in the specified file.
+
+ * For `samtools flagstat', separate QC-pass and QC-fail reads.
+
+ * Improved the command line help of `samtools mpileup' and `bcftools view'.
+
+ * Use a global variable to control the verbose level of samtools stderr
+ output. Nonetheless, it has not been fully utilized.
+
+ * Fixed an issue in association test which may report false associations,
+ possibly due to floating point underflow.
+
+(0.1.15: 10 April 2011, r949:203)
+
+
+
+Beta release 0.1.14 (21 March, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release implements a method for testing associations for case-control
+data. The method does not call genotypes but instead sums over all genotype
+configurations to compute a chi^2 based test statistics. It can be potentially
+applied to comparing a pair of samples (e.g. a tumor-normal pair), but this
+has not been evaluated on real data.
+
+Another new feature is to make X chromosome variant calls when female and male
+samples are both present. The user needs to provide a file indicating the
+ploidy of each sample (see also manual bcftools/bcftools.1).
+
+Other notable changes:
+
+ * Added `bcftools view -F' to parse BCF files generated by samtools r921 or
+ older which encodes PL in a different way.
+
+ * Changed the behavior of `bcftools view -s'. Now when a list of samples is
+ provided, the samples in the output will be reordered to match the ordering
+ in the sample list. This change is mainly designed for association test.
+
+ * Sped up `bcftools view -v' for target sequencing given thousands of samples.
+ Also added a new option `view -d' to skip loci where only a few samples are
+ covered by reads.
+
+ * Dropped HWE test. This feature has never been implemented properly. An EM
+ should be much better. To be implemented in future.
+
+ * Added the `cat' command to samtools. This command concatenates BAMs with
+ identical sequence dictionaries in an efficient way. Modified from bam_cat.c
+ written by Chris Saunders.
+
+ * Added `samtools view -1' to write BAMs at a low compression level but twice
+ faster to create. The `sort' command generates temporary files at a low
+ compression level as well.
+
+ * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality
+ strings (strictly speaking, such a file is not BAM).
+
+ * Added `samtools mpileup -L' to skip INDEL calling in regions with
+ excessively high coverage. Such regions dramatically slow down mpileup.
+
+ * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc.
+
+(0.1.14: 21 March 2011, r933:170)
+
+
+
+Beta release 0.1.13 (1 March, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The most important though largely invisible modification is the change of the
+order of genotypes in the PL VCF/BCF tag. This is to conform to the upcoming VCF
+spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF
+generated by samtools older than r921 inclusive. VCF/BCF generated by the new
+samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools
+version number.
+
+Single Individual Haplotyping (SIH) is added as an experimental feature. It
+originally aims to produce haploid consensus from fosmid pool sequencing, but
+also works with short-read data. For short reads, phased blocks are usually too
+short to be useful in many applications, but they can help to rule out part of
+SNPs close to INDELs or between copies of CNVs.
+
+
+Other notable changes in samtools:
+
+ * Construct per-sample consensus to reduce the effect of nearby SNPs in INDEL
+ calling. This reduces the power but improves specificity.
+
+ * Improved sorting order checking in indexing. Now indexing is the preferred way
+ to check if a BAM is sorted.
+
+ * Added a switch `-E' to mpileup and calmd. This option uses an alternative way
+ to apply BAQ, which increases sensitivity, especially to MNPs, at the cost of
+ a little loss in specificity.
+
+ * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling.
+
+ * Added `mpileup -m' to allow fine control of the collection of INDEL candidates.
+
+ * Added `mpileup -S' to compute per-sample strand bias P-value.
+
+ * Added `mpileup -G' to exclude read groups in variant calling.
+
+ * Fixed segfault in indel calling related to unmapped and refskip reads.
+
+ * Fixed an integer overflow in INDEL calling. This bug produces wrong INDEL
+ genotypes for longer short INDELs, typically over 10bp.
+
+ * Fixed a bug in tview on big-endian machines.
+
+ * Fixed a very rare memory issue in bam_md.c
+
+ * Fixed an out-of-boundary bug in mpileup when the read base is `N'.
+
+ * Fixed a compiling error when the knetfile library is not used. Fixed a
+ library compiling error due to the lack of bam_nt16_nt4_table[] table.
+ Suppress a compiling warning related to the latest zlib.
+
+
+Other notable changes in bcftools:
+
+ * Updated the BCF spec.
+
+ * Added the `FQ' VCF INFO field, which gives the phred-scaled probability
+ of all samples being the same (identical to the reference or all homozygous
+ variants). Option `view -f' has been dropped.
+
+ * Implemented "vcfutils.pl vcf2fq" to generate a consensus sequence
+ similar to "samtools.pl pileup2fq".
+
+ * Make sure the GT FORMAT field is always the first FORMAT to conform to the VCF
+ spec. Drop bcf-fix.pl.
+
+ * Output bcftools specific INFO and FORMAT in the VCF header.
+
+ * Added `view -s' to call variants from a subset of samples.
+
+ * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless,
+ custom fields are still unparsed and will be stored as a missing value.
+
+ * Fixed a minor bug in Fisher's exact test; the results are rarely changed.
+
+
+(0.1.13: 1 March 2011, r926:134)
+
+
+
+Beta release 0.1.12a (2 December, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is another bug fix release:
+
+ * Fixed a memory violation in mpileup, which causes segfault. Release
+ 0.1.9 and above are affected.
+
+ * Fixed a memory violation in the indel caller, which does not cause
+ segfault, but may potentially affect deletion calls in an unexpected
+ way. Release 0.1.10 and above are affected.
+
+ * Fixed a bug in computing r-square in bcftools. Few are using this
+ functionality and it only has minor effect.
+
+ * Fixed a memory leak in bam_fetch().
+
+ * Fixed a bug in writing meta information to the BAM index for the last
+ sequence. This bug is invisible to most users, but it is a bug anyway.
+
+ * Fixed a bug in bcftools which causes false "DP4=0,0,0,0" annotations.
+
+(0.1.12: 2 December 2010, r862)
+
+
+
+Beta release 0.1.11 (21 November, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is mainly a bug fix release:
+
+ * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads
+ are retrieved from a small region containing no reads.
+
+ * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion
+ failure when the first CIGAR operation is a deletion.
+
+ * Improved fault tolerance in remote access.
+
+One minor feature has been implemented in bcftools:
+
+ * Added a reference-free variant calling mode. In this mode, a site is
+ regarded as a variant iff the sample(s) contains two or more alleles;
+ the meaning of the QUAL field in the VCF output is changed
+ accordingly. Effectively, the reference allele is irrelevant to the
+ result in the new mode, although the reference sequence has to be
+ used in realignment when SAMtools computes genotype likelihoods.
+
+In addition, since 0.1.10, the `pileup' command has been deprecated by
+`mpileup' which is more powerful and more accurate. The `pileup' command
+will not be removed in the next few releases, but new features will not
+be added.
+
+(0.1.11: 21 November 2010, r851)
+
+
+
+Beta Release 0.1.10 (16 November, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release is featured as the first major improvement to the indel
+caller. The method is similar to the old one implemented in the pileup
+command, but the details are handled more carefully both in theory and
+in practice. As a result, the new indel caller usually gives more
+accurate indel calls, though at the cost of sensitivity. The caller is
+implemented in the mpileup command and is invoked by default. It works
+with multiple samples.
+
+Other notable changes:
+
+ * With the -r option, the calmd command writes the difference between
+ the original base quality and the BAQ capped base quality at the BQ
+ tag but does not modify the base quality. Please use -Ar to overwrite
+ the original base quality (the 0.1.9 behavior).
+
+ * Allow to set a maximum per-sample read depth to reduce memory. In
+ 0.1.9, most of memory is wasted for the ultra high read depth in some
+ regions (e.g. the chr1 centromere).
+
+ * Optionally write per-sample read depth and per-sample strand bias
+ P-value.
+
+ * Compute equal-tail (Bayesian) credible interval of site allele
+ frequency at the CI95 VCF annotation.
+
+ * Merged the vcfutils.pl varFilter and filter4vcf for better SNP/indel
+ filtering.
+
+(0.1.10: 16 November 2010, r829)
+
+
+
+Beta Release 0.1.9 (27 October, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release is featured as the first major improvement to the samtools'
+SNP caller. It comes with a revised MAQ error model, the support of
+multi-sample SNP calling and the computation of base alignment quality
+(BAQ).
+
+The revised MAQ error model is based on the original model. It solves an
+issue of miscalling SNPs in repetitive regions. Although such SNPs can
+usually be filtered at a later step, they mess up unfiltered calls. This
+is a theoretical flaw in the original model. The revised MAQ model
+deprecates the original MAQ model and the simplified SOAPsnp model.
+
+Multi-sample SNP calling is separated in two steps. The first is done by
+samtools mpileup and the second by a new program, bcftools, which is
+included in the samtools source code tree. Multi-sample SNP calling also
+works for single sample and has the advantage of enabling more powerful
+filtration. It is likely to deprecate pileup in future once a proper
+indel calling method is implemented.
+
+BAQ is the Phred-scaled probability of a read base being wrongly
+aligned. Capping base quality by BAQ has been shown to be very effective
+in suppressing false SNPs caused by misalignments around indels or in
+low-complexity regions with acceptable compromise on computation
+time. This strategy is highly recommended and can be used with other SNP
+callers as well.
+
+In addition to the three major improvements, other notable changes are:
+
+ * Changes to the pileup format. A reference skip (the N CIGAR operator)
+ is shown as '<' or '>' depending on the strand. Tview is also changed
+ accordingly.
+
+ * Accelerated pileup. The plain pileup is about 50% faster.
+
+ * Regional merge. The merge command now accepts a new option to merge
+ files in a specified region.
+
+ * Fixed a bug in bgzip and razip which causes source files to be
+ deleted even if option -c is applied.
+
+ * In APIs, propagate errors to downstream callers and make samtools
+ return non-zero values once errors occur.
+
+(0.1.9: 27 October 2010, r783)
+
+
+
+Beta Release 0.1.8 (11 July, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable functional changes:
+
+ * Added the `reheader' command which replaces a BAM header with a new
+ header. This command is much faster than replacing header by
+ BAM->SAM->BAM conversions.
+
+ * Added the `mpileup' command which computes the pileup of multiple
+ alignments.
+
+ * The `index' command now stores the number of mapped and unmapped
+ reads in the index file. This information can be retrieved quickly by
+ the new `idxstats' command.
+
+ * By default, pileup used the SOAPsnp model for SNP calling. This
+ avoids the floating overflow in the MAQ model which leads to spurious
+ calls in repetitive regions, although these calls will be immediately
+ filtered by varFilter.
+
+ * The `tview' command now correctly handles CIGARs like 7I10M and
+ 10M1P1I10M which cause assertion failure in earlier versions.
+
+ * Tview accepts a region like `=10,000' where `=' stands for the
+ current sequence name. This saves typing for long sequence names.
+
+ * Added the `-d' option to `pileup' which avoids slow indel calling
+ in ultradeep regions by subsampling reads locally.
+
+ * Added the `-R' option to `view' which retrieves alignments in read
+ groups listed in the specified file.
+
+Performance improvements:
+
+ * The BAM->SAM conversion is up to twice faster, depending on the
+ characteristic of the input.
+
+ * Parsing SAM headers with a lot of reference sequences is now much
+ faster.
+
+ * The number of lseek() calls per query is reduced when the query
+ region contains no read alignments.
+
+Bug fixes:
+
+ * Fixed an issue in the indel caller that leads to miscall of indels.
+ Note that this solution may not work well when the sequencing indel
+ error rate is higher than the rate of SNPs.
+
+ * Fixed another issue in the indel caller which may lead to incorrect
+ genotype.
+
+ * Fixed a bug in `sort' when option `-o' is applied.
+
+ * Fixed a bug in `view -r'.
+
+APIs and other changes:
+
+ * Added iterator interfaces to random access and pileup. The callback
+ interfaces directly call the iterator interfaces.
+
+ * The BGZF blocks holding the BAM header are independent of alignment
+ BGZF blocks. Alignment records shorter than 64kB are guaranteed to be
+ fully contained in one BGZF block. This change is fully compatible
+ with the old version of samtools/picard.
+
+Changes in other utilities:
+
+ * Updated export2sam.pl by Chris Saunders.
+
+ * Improved the sam2vcf.pl script.
+
+ * Added a Python version of varfilter.py by Aylwyn Scally.
+
+(0.1.8: 11 July 2010, r613)
+
+
+
+Beta Release 0.1.7 (10 November, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Improved the indel caller in complex scenarios, in particular for
+ long reads. The indel caller is now able to make reasonable indel
+ calls from Craig Venter capillary reads.
+
+ * Rewrote single-end duplicate removal with improved
+ performance. Paired-end reads are not touched.
+
+ * Duplicate removal is now library aware. Samtools removes potential
+ PCR/optical duplicates inside a library rather than across libraries.
+
+ * SAM header is now fully parsed, although this functionality is not
+ used in merging and so on.
+
+ * In samtools merge, optionally take the input file name as RG-ID and
+ attach the RG tag to each alignment.
+
+ * Added FTP support in the RAZF library. RAZF-compressed reference
+ sequence can be retrieved remotely.
+
+ * Improved network support for Win32.
+
+ * Samtools sort and merge are now stable.
+
+Changes in other utilities:
+
+ * Implemented sam2vcf.pl that converts the pileup format to the VCF
+ format.
+
+ * This release of samtools is known to work with the latest
+ Bio-Samtools Perl module.
+
+(0.1.7: 10 November 2009, r510)
+
+
+
+Beta Release 0.1.6 (2 September, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * In tview, do not show a blank screen when no reads mapped to the
+ corresponding region.
+
+ * Implemented native HTTP support in the BGZF library. Samtools is now
+ able to directly open a BAM file on HTTP. HTTP proxy is also
+ supported via the "http_proxy" environmental variable.
+
+ * Samtools is now compatible with the MinGW (win32) compiler and the
+ PDCurses library.
+
+ * The calmd (or fillmd) command now calculates the NM tag and replaces
+ MD tags if they are wrong.
+
+ * The view command now recognizes and optionally prints FLAG in HEXs or
+ strings to make a SAM file more friendly to human eyes. This is a
+ samtools-C extension, not implemented in Picard for the time
+ being. Please type `samtools view -?' for more information.
+
+ * BAM files now have an end-of-file (EOF) marker to facilitate
+ truncation detection. A warning will be given if an on-disk BAM file
+ does not have this marker. The warning will be seen on BAM files
+ generated by an older version of samtools. It does NO harm.
+
+ * New key bindings in tview: `r' to show read names and `s' to show
+ reference skip (N operation) as deletions.
+
+ * Fixed a bug in `samtools merge -n'.
+
+ * Samtools merge now optionally copies the header of a user specified
+ SAM file to the resultant BAM output.
+
+ * Samtools pileup/tview works with a CIGAR whose first or last
+ operation is an indel.
+
+ * Fixed a bug in bam_aux_get().
+
+
+Changes in other utilities:
+
+ * Fixed wrong FLAG in maq2sam.
+
+
+(0.1.6: 2 September 2009, r453)
+
+
+
+Beta Release 0.1.5 (7 July, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Support opening a BAM alignment on FTP. Users can now use "tview" to
+ view alignments at the NCBI ftp site. Please read manual for more
+ information.
+
+ * In library, propagate errors rather than exit or complain assertion
+ failure.
+
+ * Simplified the building system and fixed compiling errors caused by
+ zlib<1.2.2.1.
+
+ * Fixed an issue about lost header information when a SAM is imported
+ with "view -t".
+
+ * Implemented "samtools.pl varFilter" which filters both SNPs and short
+ indels. This command replaces "indelFilter".
+
+ * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from
+ pileup output.
+
+ * In pileup, cap mapping quality at 60. This helps filtering when
+ different aligners are in use.
+
+ * In pileup, allow to output variant sites only.
+
+ * Made pileup generate correct calls in repetitive region. At the same
+ time, I am considering to implement a simplified model in SOAPsnp,
+ although this has not happened yet.
+
+ * In view, added '-u' option to output BAM without compression. This
+ option is preferred when the output is piped to other commands.
+
+ * In view, added '-l' and '-r' to get the alignments for one library or
+ read group. The "@RG" header lines are now partially parsed.
+
+ * Do not include command line utilities to libbam.a.
+
+ * Fixed memory leaks in pileup and bam_view1().
+
+ * Made faidx more tolerant to empty lines right before or after FASTA >
+ lines.
+
+
+Changes in other utilities:
+
+ * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign.
+
+
+This release involves several modifications to the key code base which
+may potentially introduce new bugs even though we have tried to minimize
+this by testing on several examples. Please let us know if you catch
+bugs.
+
+(0.1.5: 7 July 2009, r373)
+
+
+
+Beta Release 0.1.4 (21 May, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Added the 'rmdupse' command: removing duplicates for SE reads.
+
+ * Fixed a critical bug in the indel caller: clipped alignments are not
+ processed correctly.
+
+ * Fixed a bug in the tview: gapped alignment may be incorrectly
+ displayed.
+
+ * Unified the interface to BAM and SAM I/O. This is done by
+ implementing a wrapper on top of the old APIs and therefore old APIs
+ are still valid. The new I/O APIs also recognize the @SQ header
+ lines.
+
+ * Generate the MD tag.
+
+ * Generate "=" bases. However, the indel caller will not work when "="
+ bases are present.
+
+ * Enhanced support of color-read display (by Nils Homer).
+
+ * Implemented the GNU building system. However, currently the building
+ system does not generate libbam.a. We will improve this later. For
+ the time being, `make -f Makefile.generic' is preferred.
+
+ * Fixed a minor bug in pileup: the first read in a chromosome may be
+ skipped.
+
+ * Fixed bugs in bam_aux.c. These bugs do not affect other components as
+ they were not used previously.
+
+ * Output the 'SM' tag from maq2sam.
+
+(0.1.4: 21 May 2009, r297)
+
+
+
+Beta Release 0.1.3 (15 April, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in SAMtools:
+
+ * SAMtools is more consistent with the specification: a) '*' in the
+ QUAL field is allowed; b) the field separator is TAB only and SPACE
+ is treated as a character in a field; c) empty header is allowed.
+
+ * Implemented GLFv3 support in pileup.
+
+ * Fixed a severe bug in fixmate: strand information is wrongly
+ overwritten.
+
+ * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are
+ not correctly retrieved sometimes.
+
+ * Fixed a bug in rmdup: segfault if unmapped reads are present.
+
+ * Move indel_filter.pl to samtools.pl and improved the filtering by
+ checking the actual number of alignments containing indels. The indel
+ pileup line is also changed a little to make this filtration easier.
+
+ * Fixed a minor bug in indexing: the bin number of an unmapped read is
+ wrongly calculated.
+
+ * Added `flagstat' command to show statistics on the FLAG field.
+
+ * Improved indel caller by setting the maximum window size in local
+ realignment.
+
+Changes in other utilities:
+
+ * Fixed a bug in maq2sam: a tag name is obsolete.
+
+ * Improvement to wgsim: a) added support for SOLiD read simulation; b)
+ show the number of substitutions/indels/errors in read name; c)
+ considerable code clean up.
+
+ * Various converters: improved functionality in general.
+
+ * Updated the example SAM due to the previous bug in fixmate.
+
+(0.1.3: 15 April 2009, r227)
+
+
+
+Beta Release 0.1.2 (28 January, 2009)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in SAMtools:
+
+ * Implemented a Bayesian indel caller. The new caller generates scores
+ and genotype and is potentially more accurate than Maq's indel
+ caller. The pileup format is also changed accordingly.
+
+ * Implemented rmdup command: remove potential PCR duplicates. Note that
+ this command ONLY works for FR orientation and requires ISIZE is
+ correctly set.
+
+ * Added fixmate command: fill in mate coordinates, ISIZE and mate
+ related flags from a name-sorted alignment.
+
+ * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved.
+
+ * Allow to select reads shown in the pileup output with a mask.
+
+ * Generate GLFv2 from pileup.
+
+ * Added two more flags for flagging PCR/optical duplicates and for QC
+ failure.
+
+ * Fixed a bug in sort command: name sorting for large alignment did not
+ work.
+
+ * Allow to completely disable RAZF (using Makefile.lite) as some people
+ have problem to compile it.
+
+ * Fixed a bug in import command when there are reads without
+ coordinates.
+
+ * Fixed a bug in tview: clipping broke the alignment viewer.
+
+ * Fixed a compiling error when _NO_CURSES is applied.
+
+ * Fixed a bug in merge command.
+
+Changes in other utilities:
+
+ * Added wgsim, a paired-end reads simulator. Wgsim was adapted from
+ maq's reads simulator. Colin Hercus further improved it to allow
+ longer indels.
+
+ * Added wgsim_eval.pl, a script that evaluates the accuracy of
+ alignment on reads generated by wgsim.
+
+ * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not
+ work properly when multiple hits are output.
+
+ * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will
+ be retained when multiple hits are present.
+
+ * Fixed a bug in export2sam.pl for QC reads.
+
+ * Support RG tag at MAQ->SAM converter.
+
+ * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and
+ indel are not properly handled, though.
+
+ * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the
+ default Illumina output.
+
+(0.1.2: 28 January 2009; r116)
+
+
+
+Beta Release 0.1.1 (22 December, 2008)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is the first public release of samtools. For more information,
+please check the manual page `samtools.1' and the samtools website
+http://samtools.sourceforge.net
diff --git a/samtools-0.1.19/bam.c b/samtools-0.1.19/bam.c
new file mode 100644
index 0000000..b00d6a6
--- /dev/null
+++ b/samtools-0.1.19/bam.c
@@ -0,0 +1,474 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+#include "bam.h"
+#include "bam_endian.h"
+#include "kstring.h"
+#include "sam_header.h"
+
+int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0;
+char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
+
+/**************************
+ * CIGAR related routines *
+ **************************/
+
+// Compute the reference coordinate reached after applying the CIGAR,
+// starting from c->pos. Operations whose bam_cigar_type() has bit 2 set
+// (consume reference) advance the coordinate; a 'B' (BAM_CBACK) operation
+// moves it backward by however many reference bases correspond to the
+// requested number of query bases.
+//   c      core record; supplies pos and n_cigar
+//   cigar  array of c->n_cigar packed CIGAR operations
+// Returns the resulting coordinate (computed as int, returned as uint32_t).
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
+{
+ int k, end = c->pos;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op = bam_cigar_op(cigar[k]);
+ int len = bam_cigar_oplen(cigar[k]);
+ if (op == BAM_CBACK) { // move backward
+ // u: query bases walked back so far; v: reference bases walked back so far
+ int l, u, v;
+ if (k == c->n_cigar - 1) break; // skip trailing 'B'
+ for (l = k - 1, u = v = 0; l >= 0; --l) {
+ int op1 = bam_cigar_op(cigar[l]);
+ int len1 = bam_cigar_oplen(cigar[l]);
+ if (bam_cigar_type(op1)&1) { // consume query
+ if (u + len1 >= len) { // stop
+ // only part of this operation is undone: len - u query bases
+ if (bam_cigar_type(op1)&2) v += len - u;
+ break;
+ } else u += len1;
+ }
+ if (bam_cigar_type(op1)&2) v += len1;
+ }
+ // l < 0 means the scan ran past the first operation; fall back to the start position
+ end = l < 0? c->pos : end - v;
+ } else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]);
+ }
+ return end;
+}
+
+// Query length implied by a CIGAR: the summed lengths of every operation
+// whose bam_cigar_type() has bit 1 (consume query) set.
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
+{
+    int32_t qlen = 0;
+    uint32_t idx = 0;
+    while (idx < c->n_cigar) {
+        const uint32_t packed = cigar[idx++];
+        if ((bam_cigar_type(bam_cigar_op(packed)) & 1) != 0)
+            qlen += bam_cigar_oplen(packed);
+    }
+    return qlen;
+}
+
+/********************
+ * BAM I/O routines *
+ ********************/
+
+// Allocate a zero-filled header. As a side effect the global bam_is_be is
+// refreshed from bam_is_big_endian(). Returns the calloc() result as is.
+bam_header_t *bam_header_init()
+{
+    bam_header_t *hdr;
+    bam_is_be = bam_is_big_endian();
+    hdr = (bam_header_t*)calloc(1, sizeof(bam_header_t));
+    return hdr;
+}
+
+// Release a header and everything it owns: the target name/length arrays,
+// the plain text, the parsed dictionary, the RG->library table and the
+// name hash. A null header is accepted and ignored.
+void bam_header_destroy(bam_header_t *header)
+{
+    extern void bam_destroy_header_hash(bam_header_t *header);
+    if (header == 0) return;
+    if (header->target_name != 0) {
+        int32_t idx = 0;
+        while (idx < header->n_targets)
+            free(header->target_name[idx++]);
+        free(header->target_name);
+        free(header->target_len);
+    }
+    free(header->text);
+    if (header->dict != 0) sam_header_free(header->dict);
+    if (header->rg2lib != 0) sam_tbl_destroy(header->rg2lib);
+    bam_destroy_header_hash(header);
+    free(header);
+}
+
+// Read the BAM header (magic, plain text, reference names and lengths)
+// from fp, which must be positioned at the start of the file.
+//
+// Returns a newly allocated header, or 0 when the magic number is wrong,
+// the header is truncated or carries impossible lengths, or memory runs
+// out. On failure a message is printed to stderr and every partial
+// allocation is released.
+bam_header_t *bam_header_read(bamFile fp)
+{
+    bam_header_t *header;
+    char buf[4];
+    int magic_len;
+    int32_t i, name_len;
+    // check EOF
+    i = bgzf_check_EOF(fp);
+    if (i < 0) {
+        // If the file is a pipe, checking the EOF marker will *always* fail
+        // with ESPIPE. Suppress the error message in this case.
+        if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
+    }
+    else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
+    // read "BAM1"
+    magic_len = bam_read(fp, buf, 4);
+    if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
+        fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
+        return 0;
+    }
+    header = bam_header_init();
+    if (header == 0) goto no_memory;
+    // read plain text and the number of reference sequences
+    if (bam_read(fp, &header->l_text, 4) != 4) goto truncated;
+    if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+    // the read length and its return value are signed, so larger values cannot be honoured
+    if (header->l_text > INT32_MAX) goto invalid;
+    header->text = (char*)calloc((size_t)header->l_text + 1, 1);
+    if (header->text == 0) goto no_memory;
+    if ((int64_t)bam_read(fp, header->text, header->l_text) != (int64_t)header->l_text) goto truncated;
+    if (bam_read(fp, &header->n_targets, 4) != 4) goto truncated;
+    if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+    if (header->n_targets < 0) goto invalid;
+    // read reference sequence names and lengths
+    if (header->n_targets > 0) {
+        header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+        header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+        if (header->target_name == 0 || header->target_len == 0) goto no_memory;
+    }
+    for (i = 0; i != header->n_targets; ++i) {
+        if (bam_read(fp, &name_len, 4) != 4) goto truncated;
+        if (bam_is_be) bam_swap_endian_4p(&name_len);
+        if (name_len < 0) goto invalid;
+        // one spare zero byte keeps the name terminated even if the file omits the NUL
+        header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
+        if (header->target_name[i] == 0) goto no_memory;
+        if (bam_read(fp, header->target_name[i], name_len) != name_len) goto truncated;
+        if (bam_read(fp, &header->target_len[i], 4) != 4) goto truncated;
+        if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+    }
+    return header;
+
+no_memory:
+    fprintf(stderr, "[bam_header_read] out of memory.\n");
+    goto fail;
+invalid:
+    fprintf(stderr, "[bam_header_read] invalid length field in the BAM header.\n");
+    goto fail;
+truncated:
+    fprintf(stderr, "[bam_header_read] truncated BAM header.\n");
+fail:
+    // bam_header_destroy() frees target_len only when target_name is set
+    if (header != 0 && header->target_name == 0) {
+        free(header->target_len);
+        header->target_len = 0;
+    }
+    bam_header_destroy(header);
+    return 0;
+}
+
+// Serialise a header to fp: magic, text length and text, number of
+// references, then for each reference its name length (including the
+// terminating NUL), name and sequence length. All 32-bit fields are
+// byte-swapped first when bam_is_be is set. Flushes fp and returns 0.
+int bam_header_write(bamFile fp, const bam_header_t *header)
+{
+    char magic[4];
+    uint32_t word;
+    int32_t tid;
+    // write "BAM1"
+    strncpy(magic, "BAM\001", 4);
+    bam_write(fp, magic, 4);
+    // write plain text and the number of reference sequences
+    word = bam_is_be ? bam_swap_endian_4(header->l_text) : header->l_text;
+    bam_write(fp, &word, 4);
+    if (header->l_text) bam_write(fp, header->text, header->l_text);
+    word = bam_is_be ? bam_swap_endian_4(header->n_targets) : (uint32_t)header->n_targets;
+    bam_write(fp, &word, 4);
+    // write sequence names and lengths
+    for (tid = 0; tid != header->n_targets; ++tid) {
+        char *name = header->target_name[tid];
+        int32_t name_len = strlen(name) + 1;
+        word = bam_is_be ? bam_swap_endian_4(name_len) : (uint32_t)name_len;
+        bam_write(fp, &word, 4);
+        bam_write(fp, name, name_len);
+        word = bam_is_be ? bam_swap_endian_4(header->target_len[tid]) : header->target_len[tid];
+        bam_write(fp, &word, 4);
+    }
+    bgzf_flush(fp);
+    return 0;
+}
+
+// Byte-swap, in place, the multi-byte parts of a record's variable-length
+// data: every CIGAR word and every multi-byte auxiliary value.
+//   c         core fields locating the CIGAR and the start of the aux data
+//   data_len  number of valid bytes in data
+//   data      the concatenated qname-cigar-seq-qual-aux buffer
+//   is_host   non-zero when data is currently in host byte order (about to
+//             be written), zero when it is in file order (just read). It
+//             decides whether a 'B' array count is usable before or after
+//             it has been swapped.
+// Parsing stops at the first aux field that does not fit in the buffer, so
+// a malformed record cannot drive the scan out of bounds. Both directions
+// stop at the same field, so swapping twice restores the original bytes.
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data, int is_host)
+{
+    uint8_t *s, *end = data + data_len;
+    uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
+    s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+    for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
+    while (s < end && end - s >= 3) { // need a two-byte key plus a type byte
+        uint8_t type;
+        s += 2; // skip key
+        type = toupper(*s); ++s; // skip type
+        if (type == 'C' || type == 'A') ++s;
+        else if (type == 'S') { if (end - s < 2) break; bam_swap_endian_2p(s); s += 2; }
+        else if (type == 'I' || type == 'F') { if (end - s < 4) break; bam_swap_endian_4p(s); s += 4; }
+        else if (type == 'D') { if (end - s < 8) break; bam_swap_endian_8p(s); s += 8; }
+        else if (type == 'Z' || type == 'H') { while (s < end && *s) ++s; ++s; }
+        else if (type == 'B') {
+            int32_t n, Bsize;
+            if (end - s < 5) break; // sub-type byte plus 32-bit count
+            Bsize = bam_aux_type2size(*s);
+            // the count is only meaningful while it is in host order
+            if (is_host) { memcpy(&n, s + 1, 4); bam_swap_endian_4p(s + 1); }
+            else { bam_swap_endian_4p(s + 1); memcpy(&n, s + 1, 4); }
+            s += 5; // now at the first array element
+            if (Bsize <= 0 || n < 0 || (int64_t)n * Bsize > (int64_t)(end - s)) break;
+            if (2 == Bsize) {
+                for (i = 0; i < (uint32_t)n; ++i, s += 2) bam_swap_endian_2p(s);
+            } else if (4 == Bsize) {
+                for (i = 0; i < (uint32_t)n; ++i, s += 4) bam_swap_endian_4p(s);
+            } else if (8 == Bsize) {
+                for (i = 0; i < (uint32_t)n; ++i, s += 8) bam_swap_endian_8p(s);
+            } else s += (size_t)n * Bsize; // single-byte elements need no swapping
+        }
+    }
+}
+
+// Read one alignment record from fp into b, growing b->data as needed.
+// Returns the number of bytes consumed (4 + block length) on success,
+// -1 at a clean end of file, -2 when the length word is truncated, -3 when
+// the fixed-size core is truncated, and -4 when the variable-length data
+// is truncated, the record is internally inconsistent, or memory cannot
+// be obtained.
+int bam_read1(bamFile fp, bam1_t *b)
+{
+    bam1_core_t *c = &b->core;
+    int32_t block_len, ret, i;
+    int64_t l_aux;
+    uint32_t x[8];
+
+    assert(BAM_CORE_SIZE == 32);
+    if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+        if (ret == 0) return -1; // normal end-of-file
+        else return -2; // truncated
+    }
+    if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
+    if (bam_is_be) {
+        bam_swap_endian_4p(&block_len);
+        for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+    }
+    if (block_len < (int32_t)BAM_CORE_SIZE) return -4; // a block is never shorter than its core
+    c->tid = x[0]; c->pos = x[1];
+    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+    c->l_qseq = x[4];
+    c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+    if (c->l_qseq < 0) return -4;
+    b->data_len = block_len - BAM_CORE_SIZE;
+    if (b->m_data < b->data_len) {
+        int m = b->data_len;
+        uint8_t *grown;
+        if (m <= 0x40000000) kroundup32(m); // rounding a larger value would overflow int
+        grown = (uint8_t*)realloc(b->data, m);
+        if (grown == 0) return -4; // b->data is still valid and still owned by b
+        b->data = grown;
+        b->m_data = m;
+    }
+    if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+    l_aux = (int64_t)b->data_len - (int64_t)c->n_cigar * 4 - c->l_qname
+          - (int64_t)c->l_qseq - ((int64_t)c->l_qseq + 1) / 2;
+    if (l_aux < 0) return -4; // declared field lengths exceed the block
+    b->l_aux = (int)l_aux;
+    if (bam_is_be) swap_endian_data(c, b->data_len, b->data, 0);
+    if (bam_no_B) bam_remove_B(b);
+    return 4 + block_len;
+}
+
+// Write one alignment record (length word, packed core, variable-length
+// data) to fp and return the number of bytes that make up the record.
+// On a big-endian host the data buffer is swapped to file order for the
+// write and swapped back afterwards, so the caller's buffer is unchanged
+// on return.
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
+{
+    uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y;
+    int i;
+    assert(BAM_CORE_SIZE == 32);
+    x[0] = c->tid;
+    x[1] = c->pos;
+    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
+    x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
+    x[4] = c->l_qseq;
+    x[5] = c->mtid;
+    x[6] = c->mpos;
+    x[7] = c->isize;
+    bgzf_flush_try(fp, 4 + block_len);
+    if (bam_is_be) {
+        for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+        y = block_len;
+        bam_write(fp, bam_swap_endian_4p(&y), 4);
+        swap_endian_data(c, data_len, data, 1); // host order -> file order
+    } else bam_write(fp, &block_len, 4);
+    bam_write(fp, x, BAM_CORE_SIZE);
+    bam_write(fp, data, data_len);
+    if (bam_is_be) swap_endian_data(c, data_len, data, 0); // file order -> host order
+    return 4 + block_len;
+}
+
+// Write alignment b to fp by handing its core and data buffer to
+// bam_write1_core(); returns whatever that call returns.
+int bam_write1(bamFile fp, const bam1_t *b)
+{
+    const bam1_core_t *core = &b->core;
+    return bam_write1_core(fp, core, b->data_len, b->data);
+}
+
+// Format one alignment as a SAM text line (without a trailing newline).
+//   header  supplies reference names; when null the numeric tid/mtid is
+//           printed instead
+//   b       the alignment to format
+//   of      flag style: BAM_OFDEC (decimal), BAM_OFHEX (hexadecimal);
+//           any other value prints the letters of bam_flag2char_table
+// Returns the kstring buffer holding the line; bam_view1() releases it
+// with free().
+//
+// Auxiliary values sit at arbitrary byte offsets inside b->data, so each
+// multi-byte value is copied out with memcpy() rather than read through a
+// cast pointer, which would be a misaligned access.
+char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
+{
+    uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
+    int i;
+    const bam1_core_t *c = &b->core;
+    kstring_t str;
+    str.l = str.m = 0; str.s = 0;
+
+    kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str);
+    if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); }
+    else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
+    else { // BAM_OFSTR
+        for (i = 0; i < 16; ++i)
+            if ((c->flag & 1<<i) && bam_flag2char_table[i])
+                kputc(bam_flag2char_table[i], &str);
+        kputc('\t', &str);
+    }
+    if (c->tid < 0) kputsn("*\t", 2, &str);
+    else {
+        if (header) kputs(header->target_name[c->tid] , &str);
+        else kputw(c->tid, &str);
+        kputc('\t', &str);
+    }
+    kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
+    if (c->n_cigar == 0) kputc('*', &str);
+    else {
+        uint32_t *cigar = bam1_cigar(b);
+        for (i = 0; i < c->n_cigar; ++i) {
+            kputw(cigar[i]>>BAM_CIGAR_SHIFT, &str);
+            kputc(bam_cigar_opchr(cigar[i]), &str);
+        }
+    }
+    kputc('\t', &str);
+    if (c->mtid < 0) kputsn("*\t", 2, &str);
+    else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
+    else {
+        if (header) kputs(header->target_name[c->mtid], &str);
+        else kputw(c->mtid, &str);
+        kputc('\t', &str);
+    }
+    kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
+    if (c->l_qseq) {
+        for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
+        kputc('\t', &str);
+        if (t[0] == 0xff) kputc('*', &str); // 0xff in the first quality byte means no qualities
+        else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+    } else kputsn("*\t*", 3, &str);
+    s = bam1_aux(b);
+    while (s < b->data + b->data_len) {
+        uint8_t type, key[2];
+        key[0] = s[0]; key[1] = s[1];
+        s += 2; type = *s; ++s;
+        kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
+        if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
+        else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }
+        else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }
+        else if (type == 'S') { uint16_t v; memcpy(&v, s, 2); kputsn("i:", 2, &str); kputw(v, &str); s += 2; }
+        else if (type == 's') { int16_t v; memcpy(&v, s, 2); kputsn("i:", 2, &str); kputw(v, &str); s += 2; }
+        else if (type == 'I') { uint32_t v; memcpy(&v, s, 4); kputsn("i:", 2, &str); kputuw(v, &str); s += 4; }
+        else if (type == 'i') { int32_t v; memcpy(&v, s, 4); kputsn("i:", 2, &str); kputw(v, &str); s += 4; }
+        else if (type == 'f') { float v; memcpy(&v, s, 4); ksprintf(&str, "f:%g", v); s += 4; }
+        else if (type == 'd') { double v; memcpy(&v, s, 8); ksprintf(&str, "d:%lg", v); s += 8; }
+        else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
+        else if (type == 'B') {
+            uint8_t sub_type = *(s++);
+            int32_t n;
+            memcpy(&n, s, 4);
+            s += 4; // now points to the start of the array
+            kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing
+            for (i = 0; i < n; ++i) {
+                kputc(',', &str);
+                if ('c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
+                else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
+                else if ('s' == sub_type) { int16_t v; memcpy(&v, s, 2); kputw(v, &str); s += 2; }
+                else if ('S' == sub_type) { uint16_t v; memcpy(&v, s, 2); kputw(v, &str); s += 2; }
+                else if ('i' == sub_type) { int32_t v; memcpy(&v, s, 4); kputw(v, &str); s += 4; }
+                else if ('I' == sub_type) { uint32_t v; memcpy(&v, s, 4); kputuw(v, &str); s += 4; }
+                else if ('f' == sub_type) { float v; memcpy(&v, s, 4); ksprintf(&str, "%g", v); s += 4; }
+            }
+        }
+    }
+    return str.s;
+}
+
+// Format b as a SAM line with the flag printed in decimal (BAM_OFDEC).
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
+{
+    char *line = bam_format1_core(header, b, BAM_OFDEC);
+    return line;
+}
+
+// Print b to stdout as one SAM line, then release the formatted string.
+void bam_view1(const bam_header_t *header, const bam1_t *b)
+{
+    char *line;
+    line = bam_format1(header, b);
+    puts(line);
+    free(line);
+}
+
+// Plausibility check of a record. Returns 0 when tid or mtid is below -1,
+// when a header is given and either id is not below n_targets, when the
+// data buffer is shorter than the query name, or when the first NUL in the
+// name is not exactly its last byte; returns 1 otherwise.
+int bam_validate1(const bam_header_t *header, const bam1_t *b)
+{
+    const bam1_core_t *core = &b->core;
+    char *name, *nul;
+
+    if (core->tid < -1 || core->mtid < -1) return 0;
+    if (header != 0) {
+        if (core->tid >= header->n_targets) return 0;
+        if (core->mtid >= header->n_targets) return 0;
+    }
+
+    if (b->data_len < core->l_qname) return 0;
+    name = bam1_qname(b);
+    nul = memchr(name, '\0', core->l_qname);
+
+    // FIXME: Other fields could also be checked, especially the auxiliary data
+
+    return nul == &name[core->l_qname-1] ? 1 : 0;
+}
+
+// FIXME: we should also check the LB tag associated with each alignment
+// Look up the library for b through its RG tag. The header dictionary and
+// the RG-ID -> LB table are built on first use and cached in h. Returns 0
+// when the alignment carries no RG tag.
+const char *bam_get_library(bam_header_t *h, const bam1_t *b)
+{
+    const uint8_t *rg_tag;
+    if (!h->dict) h->dict = sam_header_parse2(h->text);
+    if (!h->rg2lib) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
+    rg_tag = bam_aux_get(b, "RG");
+    if (rg_tag == 0) return 0;
+    // skip the type byte in front of the tag's string value
+    return sam_tbl_get(h->rg2lib, (const char*)(rg_tag + 1));
+}
+
+/************
+ * Remove B *
+ ************/
+
+// Rewrite an alignment in place so that its CIGAR no longer contains 'B'
+// (BAM_CBACK) operations. Query bases covered twice because of a backward
+// step are merged into one consensus base, and the CIGAR, sequence,
+// quality and auxiliary data are re-packed into b->data.
+//
+// Returns 0 when the record is unmapped, has no 'B' operation, or was
+// rewritten. Returns -1, after setting BAM_FUNMAP in the flag, when the
+// CIGAR starts with 'B' or a 'B' steps back further than the query bases
+// consumed so far.
+// NOTE(review): the realloc() result below is not checked.
+int bam_remove_B(bam1_t *b)
+{
+ // i: read offset into the original query; j: write offset into the merged query;
+ // end_j: value of j just before the latest backward step (overlap ends there);
+ // l: number of operations currently in new_cigar
+ int i, j, end_j, k, l, no_qual;
+ uint32_t *cigar, *new_cigar;
+ uint8_t *seq, *qual, *p;
+ // test if removal is necessary
+ if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
+ cigar = bam1_cigar(b);
+ for (k = 0; k < b->core.n_cigar; ++k)
+ if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
+ if (k == b->core.n_cigar) return 0; // no 'B'
+ if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
+ // allocate memory for the new CIGAR
+ if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
+ b->m_data = b->data_len + b->core.n_cigar * 4;
+ kroundup32(b->m_data);
+ b->data = (uint8_t*)realloc(b->data, b->m_data);
+ cigar = bam1_cigar(b); // after realloc, cigar may be changed
+ }
+ new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
+ // the core loop
+ seq = bam1_seq(b); qual = bam1_qual(b);
+ no_qual = (qual[0] == 0xff); // test whether base quality is available
+ i = j = 0; end_j = -1;
+ for (k = l = 0; k < b->core.n_cigar; ++k) {
+ int op = bam_cigar_op(cigar[k]);
+ int len = bam_cigar_oplen(cigar[k]);
+ if (op == BAM_CBACK) { // the backward operation
+ int t, u;
+ if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
+ if (len > j) goto rmB_err; // an excessively long backward
+ // shorten or drop already emitted operations until len query bases are undone
+ for (t = l - 1, u = 0; t >= 0; --t) { // look back
+ int op1 = bam_cigar_op(new_cigar[t]);
+ int len1 = bam_cigar_oplen(new_cigar[t]);
+ if (bam_cigar_type(op1)&1) { // consume the query
+ if (u + len1 >= len) { // stop
+ new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
+ break;
+ } else u += len1;
+ }
+ }
+ if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
+ l = t + 1;
+ end_j = j; j -= len;
+ } else { // other CIGAR operations
+ new_cigar[l++] = cigar[k];
+ if (bam_cigar_type(op)&1) { // consume the query
+ if (i != j) { // no need to copy if i == j
+ int u, c, c0;
+ for (u = 0; u < len; ++u) { // construct the consensus
+ c = bam1_seqi(seq, i+u);
+ if (j + u < end_j) { // in an overlap
+ c0 = bam1_seqi(seq, j+u);
+ if (c != c0) { // a mismatch; choose the better base
+ if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
+ bam1_seq_seti(seq, j+u, c);
+ qual[j+u] = qual[i+u] - qual[j+u];
+ } else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
+ } else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
+ } else { // not in an overlap; copy over
+ bam1_seq_seti(seq, j+u, c);
+ qual[j+u] = qual[i+u];
+ }
+ }
+ }
+ i += len, j += len;
+ }
+ }
+ }
+ if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
+ // merge adjacent operations if possible
+ // the earlier operation's length is added to the later one, then cleared so the next pass drops it
+ for (k = 1; k < l; ++k)
+ if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
+ new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
+ // kill zero length operations
+ for (k = i = 0; k < l; ++k)
+ if (new_cigar[k] >> BAM_CIGAR_SHIFT)
+ new_cigar[i++] = new_cigar[k];
+ l = i;
+ // update b
+ // the core fields are changed only after the moves: bam1_aux() below still
+ // needs the old n_cigar and l_qseq to locate the original auxiliary data
+ memcpy(cigar, new_cigar, l * 4); // set CIGAR
+ p = b->data + b->core.l_qname + l * 4;
+ memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
+ memmove(p, qual, j); p += j; // set QUAL
+ memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields
+ b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
+ b->data_len = p - b->data; // update record length
+ return 0;
+
+rmB_err:
+ b->core.flag |= BAM_FUNMAP;
+ return -1;
+}
diff --git a/samtools-0.1.19/bam.h b/samtools-0.1.19/bam.h
new file mode 100644
index 0000000..6add34b
--- /dev/null
+++ b/samtools-0.1.19/bam.h
@@ -0,0 +1,793 @@
+/* The MIT License
+
+ Copyright (c) 2008-2010 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#ifndef BAM_BAM_H
+#define BAM_BAM_H
+
+/*!
+ @header
+
+ BAM library provides I/O and various operations on manipulating files
+ in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
+ format. It now supports importing from or exporting to SAM, sorting,
+ merging, generating pileup, and quick retrieval of reads overlapping
+ a specified region.
+
+ @copyright Genome Research Ltd.
+ */
+
+#define BAM_VERSION "0.1.19-96b5f2294a"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifndef BAM_LITE
+#define BAM_VIRTUAL_OFFSET16
+#include "bgzf.h"
+/*! @abstract BAM file handler */
+typedef BGZF *bamFile;
+#define bam_open(fn, mode) bgzf_open(fn, mode)
+#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
+#define bam_close(fp) bgzf_close(fp)
+#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
+#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
+#define bam_tell(fp) bgzf_tell(fp)
+#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
+#else
+#define BAM_TRUE_OFFSET
+#include <zlib.h>
+typedef gzFile bamFile;
+#define bam_open(fn, mode) gzopen(fn, mode)
+#define bam_dopen(fd, mode) gzdopen(fd, mode)
+#define bam_close(fp) gzclose(fp)
+#define bam_read(fp, buf, size) gzread(fp, buf, size)
+/* no bam_write/bam_tell/bam_seek() here */
+#endif
+
+/*! @typedef
+ @abstract Structure for the alignment header.
+ @field n_targets number of reference sequences
+ @field target_name names of the reference sequences
+ @field target_len lengths of the reference sequences
+ @field dict header dictionary
+ @field hash hash table for fast name lookup
+ @field rg2lib hash table for @RG-ID -> LB lookup
+ @field l_text length of the plain text in the header
+ @field text plain text
+
+ @discussion Field hash points to null by default. It is a private
+ member.
+ */
+typedef struct {
+ int32_t n_targets; // number of reference sequences
+ char **target_name; // n_targets reference names, each freed individually by bam_header_destroy()
+ uint32_t *target_len; // n_targets reference lengths
+ void *dict, *hash, *rg2lib; // parsed header dictionary; private name hash; @RG-ID -> LB table
+ uint32_t l_text, n_text; // l_text: length of text. NOTE(review): n_text is not used in bam.c; purpose not visible here
+ char *text; // plain header text; bam_header_read() allocates l_text + 1 zeroed bytes for it
+} bam_header_t;
+
+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
+#define BAM_FPAIRED 1
+/*! @abstract the read is mapped in a proper pair */
+#define BAM_FPROPER_PAIR 2
+/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
+#define BAM_FUNMAP 4
+/*! @abstract the mate is unmapped */
+#define BAM_FMUNMAP 8
+/*! @abstract the read is mapped to the reverse strand */
+#define BAM_FREVERSE 16
+/*! @abstract the mate is mapped to the reverse strand */
+#define BAM_FMREVERSE 32
+/*! @abstract this is read1 */
+#define BAM_FREAD1 64
+/*! @abstract this is read2 */
+#define BAM_FREAD2 128
+/*! @abstract not primary alignment */
+#define BAM_FSECONDARY 256
+/*! @abstract QC failure */
+#define BAM_FQCFAIL 512
+/*! @abstract optical or PCR duplicate */
+#define BAM_FDUP 1024
+
+#define BAM_OFDEC 0
+#define BAM_OFHEX 1
+#define BAM_OFSTR 2
+
+/*! @abstract default mask for pileup */
+#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
+
+#define BAM_CORE_SIZE sizeof(bam1_core_t)
+
+/**
+ * Describing how CIGAR operation/length is packed in a 32-bit integer.
+ */
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+
+/*
+ CIGAR operations.
+ */
+/*! @abstract CIGAR: M = match or mismatch*/
+#define BAM_CMATCH 0
+/*! @abstract CIGAR: I = insertion to the reference */
+#define BAM_CINS 1
+/*! @abstract CIGAR: D = deletion from the reference */
+#define BAM_CDEL 2
+/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
+#define BAM_CREF_SKIP 3
+/*! @abstract CIGAR: S = clip on the read with clipped sequence
+ present in qseq */
+#define BAM_CSOFT_CLIP 4
+/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
+#define BAM_CHARD_CLIP 5
+/*! @abstract CIGAR: P = padding */
+#define BAM_CPAD 6
+/*! @abstract CIGAR: equals = match */
+#define BAM_CEQUAL 7
+/*! @abstract CIGAR: X = mismatch */
+#define BAM_CDIFF 8
+#define BAM_CBACK 9
+
+#define BAM_CIGAR_STR "MIDNSHP=XB"
+#define BAM_CIGAR_TYPE 0x3C1A7
+
+#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
+#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
+#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)])
+#define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
+#define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
+
+/*! @typedef
+ @abstract Structure for core alignment information.
+ @field tid chromosome ID, defined by bam_header_t
+ @field pos 0-based leftmost coordinate
+ @field bin bin calculated by bam_reg2bin()
+ @field qual mapping quality
+ @field l_qname length of the query name
+ @field flag bitwise flag
+ @field n_cigar number of CIGAR operations
+ @field l_qseq length of the query sequence (read)
+ */
+typedef struct {
+ int32_t tid; // chromosome ID; negative values are printed as '*' by bam_format1_core()
+ int32_t pos; // 0-based leftmost coordinate
+ uint32_t bin:16, qual:8, l_qname:8; // packed into one 32-bit word of the on-disk core
+ uint32_t flag:16, n_cigar:16; // packed into one 32-bit word of the on-disk core
+ int32_t l_qseq; // length of the query sequence
+ int32_t mtid; // printed as the 7th SAM column ('=' when equal to tid); presumably the mate's chromosome ID
+ int32_t mpos; // printed 1-based as the 8th SAM column; presumably the mate's 0-based position
+ int32_t isize; // printed as the 9th SAM column; presumably the insert size
+} bam1_core_t;
+
+/*! @typedef
+ @abstract Structure for one alignment.
+ @field core core information about the alignment
+ @field l_aux length of auxiliary data
+ @field data_len current length of bam1_t::data
+ @field m_data maximum length of bam1_t::data
+ @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+
+ @discussion Notes:
+
+ 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
+ 2. l_qseq is calculated from the total length of an alignment block
+ on reading or from CIGAR.
+ 3. cigar data is encoded 4 bytes per CIGAR operation.
+ 4. seq is nybble-encoded according to bam_nt16_table.
+ */
+typedef struct {
+ bam1_core_t core; // fixed-size fields of the alignment
+ int l_aux, data_len, m_data; // aux length; bytes used in data; bytes allocated for data
+ uint8_t *data; // qname-cigar-seq-qual-aux, concatenated; released by bam_destroy1()
+} bam1_t;
+
+typedef struct __bam_iter_t *bam_iter_t;
+
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
+#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
+
+/*! @function
+ @abstract Get the CIGAR array
+ @param b pointer to an alignment
+ @return pointer to the CIGAR array
+
+ @discussion In the CIGAR array, each element is a 32-bit integer. The
+ lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
+ length of a CIGAR.
+ */
+#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
+
+/*! @function
+ @abstract Get the name of the query
+ @param b pointer to an alignment
+ @return pointer to the name string, null terminated
+ */
+#define bam1_qname(b) ((char*)((b)->data))
+
+/*! @function
+ @abstract Get query sequence
+ @param b pointer to an alignment
+ @return pointer to sequence
+
+ @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+ 8 for T and 15 for N. Two bases are packed in one byte with the base
+ at the higher 4 bits having smaller coordinate on the read. It is
+ recommended to use bam1_seqi() macro to get the base.
+ */
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
+
+/*! @function
+ @abstract Get query quality
+ @param b pointer to an alignment
+ @return pointer to quality string
+ */
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
+
+/*! @function
+ @abstract Get a base on read
+ @param s Query sequence returned by bam1_seq()
+ @param i The i-th position, 0-based
+ @return 4-bit integer representing the base.
+ */
+//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
+
+#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )
+
+/*! @function
+ @abstract Get auxiliary data
+ @param b pointer to an alignment
+ @return pointer to the concatenated auxiliary data
+ */
+#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
+
+#ifndef kroundup32
+/*! @function
+ @abstract Round an integer to the next closest power-2 integer.
+ @param x integer to be rounded (in place)
+ @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*!
+ @abstract Whether the machine is big-endian; modified only in
+ bam_header_init().
+ */
+extern int bam_is_be;
+
+/*!
+ @abstract Verbose level between 0 and 3; 0 is supposed to disable all
+ debugging information, though this may not have been implemented.
+ */
+extern int bam_verbose;
+
+extern int bam_no_B;
+
+/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
+extern unsigned char bam_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
+extern char *bam_nt16_rev_table;
+
+extern char bam_nt16_nt4_table[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*********************
+ * Low-level SAM I/O *
+ *********************/
+
+ /*! @abstract TAM file handler */
+ typedef struct __tamFile_t *tamFile;
+
+ /*!
+ @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
+ @param fn SAM file name
+ @return SAM file handler
+ */
+ tamFile sam_open(const char *fn);
+
+ /*!
+ @abstract Close a SAM file handler
+ @param fp SAM file handler
+ */
+ void sam_close(tamFile fp);
+
+ /*!
+ @abstract Read one alignment from a SAM file handler
+ @param fp SAM file handler
+ @param header header information (ordered names of chromosomes)
+ @param b read alignment; all members in b will be updated
+ @return 0 if successful; otherwise negative
+ */
+ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
+
+ /*!
+ @abstract Read header information from a TAB-delimited list file.
+ @param fn_list file name for the list
+ @return a pointer to the header structure
+
+ @discussion Each line in this file consists of chromosome name and
+ the length of chromosome.
+ */
+ bam_header_t *sam_header_read2(const char *fn_list);
+
+ /*!
+ @abstract Read header from a SAM file (if present)
+ @param fp SAM file handler
+ @return pointer to header struct; 0 if no @SQ lines available
+ */
+ bam_header_t *sam_header_read(tamFile fp);
+
+ /*!
+ @abstract Parse @SQ lines and update a header struct
+ @param h pointer to the header struct to be updated
+ @return number of target sequences
+
+ @discussion bam_header_t::{n_targets,target_len,target_name} will
+ be destroyed in the first place.
+ */
+ int sam_header_parse(bam_header_t *h);
+ int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+ /*!
+ @abstract Parse @RG lines and update a header struct
+ @param h pointer to the header struct to be updated
+ @return number of @RG lines
+
+ @discussion bam_header_t::rg2lib will be destroyed in the first
+ place.
+ */
+ int sam_header_parse_rg(bam_header_t *h);
+
+#define sam_write1(header, b) bam_view1(header, b)
+
+
+ /********************************
+ * APIs for string dictionaries *
+ ********************************/
+
+ int bam_strmap_put(void *strmap, const char *rg, const char *lib);
+ const char *bam_strmap_get(const void *strmap, const char *rg);
+ void *bam_strmap_dup(const void*);
+ void *bam_strmap_init();
+ void bam_strmap_destroy(void *strmap);
+
+
+ /*********************
+ * Low-level BAM I/O *
+ *********************/
+
+ /*!
+ @abstract Initialize a header structure.
+ @return the pointer to the header structure
+
+ @discussion This function also modifies the global variable
+ bam_is_be.
+ */
+ bam_header_t *bam_header_init();
+
+ /*!
+ @abstract Destroy a header structure.
+ @param header pointer to the header
+ */
+ void bam_header_destroy(bam_header_t *header);
+
+ /*!
+ @abstract Read a header structure from BAM.
+ @param fp BAM file handler, opened by bam_open()
+ @return pointer to the header structure
+
+ @discussion The file position indicator must be placed at the
+ beginning of the file. Upon success, the position indicator will
+ be set at the start of the first alignment.
+ */
+ bam_header_t *bam_header_read(bamFile fp);
+
+ /*!
+ @abstract Write a header structure to BAM.
+ @param fp BAM file handler
+ @param header pointer to the header structure
+ @return always 0 currently
+ */
+ int bam_header_write(bamFile fp, const bam_header_t *header);
+
+ /*!
+ @abstract Read an alignment from BAM.
+ @param fp BAM file handler
+ @param b read alignment; all members are updated.
+ @return number of bytes read from the file
+
+ @discussion The file position indicator must be
+ placed right before an alignment. Upon success, this function
+ will set the position indicator to the start of the next
+ alignment. This function is not affected by the machine
+ endianness.
+ */
+ int bam_read1(bamFile fp, bam1_t *b);
+
+ int bam_remove_B(bam1_t *b);
+
+ /*!
+ @abstract Write an alignment to BAM.
+ @param fp BAM file handler
+ @param c pointer to the bam1_core_t structure
+ @param data_len total length of variable size data related to
+ the alignment
+ @param data pointer to the concatenated data
+ @return number of bytes written to the file
+
+ @discussion This function is not affected by the machine
+ endianness.
+ */
+ int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
+
+ /*!
+ @abstract Write an alignment to BAM.
+ @param fp BAM file handler
+ @param b alignment to write
+ @return number of bytes written to the file
+
+ @discussion It is equivalent to:
+ bam_write1_core(fp, &b->core, b->data_len, b->data)
+ */
+ int bam_write1(bamFile fp, const bam1_t *b);
+
+ /*! @function
+ @abstract Initiate a pointer to bam1_t struct
+ */
+#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
+
+ /*! @function
+ @abstract Free the memory allocated for an alignment.
+ @param b pointer to an alignment
+ */
+#define bam_destroy1(b) do { \
+ if (b) { free((b)->data); free(b); } \
+ } while (0)
+
+ /*!
+ @abstract Format a BAM record in the SAM format
+ @param header pointer to the header structure
+ @param b alignment to print
+ @return a pointer to the SAM string
+ */
+ char *bam_format1(const bam_header_t *header, const bam1_t *b);
+
+ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
+
+ /*!
+ @abstract Check whether a BAM record is plausibly valid
+ @param header associated header structure, or NULL if unavailable
+ @param b alignment to validate
+ @return 0 if the alignment is invalid; non-zero otherwise
+
+ @discussion Simple consistency check of some of the fields of the
+ alignment record. If the header is provided, several additional checks
+ are made. Not all fields are checked, so a non-zero result is not a
+ guarantee that the record is valid. However it is usually good enough
+ to detect when bam_seek() has been called with a virtual file offset
+ that is not the offset of an alignment record.
+ */
+ int bam_validate1(const bam_header_t *header, const bam1_t *b);
+
+ const char *bam_get_library(bam_header_t *header, const bam1_t *b);
+
+
+ /***************
+ * pileup APIs *
+ ***************/
+
+ /*! @typedef
+ @abstract Structure for one alignment covering the pileup position.
+ @field b pointer to the alignment
+ @field qpos position of the read base at the pileup site, 0-based
+ @field indel indel length; 0 for no indel, positive for ins and negative for del
+ @field is_del 1 iff the base on the padded read is a deletion
+ @field is_head NOTE(review): presumably 1 at the first pileup position of the read; confirm in bam_pileup.c
+ @field is_tail NOTE(review): presumably 1 at the last pileup position of the read; confirm in bam_pileup.c
+ @field is_refskip NOTE(review): presumably 1 inside a reference skip (CIGAR N); confirm in bam_pileup.c
+ @field aux 28-bit auxiliary value; bcf_call_glfgen() reads indel qualities and the indel type from it
+ @field level the level of the read in the "viewer" mode
+
+ @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+ difference between the two functions is that the former does not
+ set bam_pileup1_t::level, while the latter does. Level helps the
+ implementation of alignment viewers, but calculating this has some
+ overhead.
+ */
+ typedef struct {
+ bam1_t *b;
+ int32_t qpos;
+ int indel, level;
+ uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
+ } bam_pileup1_t;
+
+ typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
+
+ struct __bam_plp_t;
+ typedef struct __bam_plp_t *bam_plp_t;
+
+ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
+ int bam_plp_push(bam_plp_t iter, const bam1_t *b);
+ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
+ void bam_plp_set_mask(bam_plp_t iter, int mask);
+ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
+ void bam_plp_reset(bam_plp_t iter);
+ void bam_plp_destroy(bam_plp_t iter);
+
+ struct __bam_mplp_t;
+ typedef struct __bam_mplp_t *bam_mplp_t;
+
+ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
+ void bam_mplp_destroy(bam_mplp_t iter);
+ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
+ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
+
+ /*! @typedef
+ @abstract Type of function to be called by bam_plbuf_push().
+ @param tid chromosome ID as is defined in the header
+ @param pos start coordinate of the alignment, 0-based
+ @param n number of elements in pl array
+ @param pl array of alignments
+ @param data user provided data
+ @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
+ */
+ typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
+
+ typedef struct {
+ bam_plp_t iter;
+ bam_pileup_f func;
+ void *data;
+ } bam_plbuf_t;
+
+ void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+ void bam_plbuf_reset(bam_plbuf_t *buf);
+ bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
+ void bam_plbuf_destroy(bam_plbuf_t *buf);
+ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
+
+ int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
+
+ struct __bam_lplbuf_t;
+ typedef struct __bam_lplbuf_t bam_lplbuf_t;
+
+ void bam_lplbuf_reset(bam_lplbuf_t *buf);
+
+ /*! @abstract bam_plbuf_init() equivalent with level calculated. */
+ bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
+
+ /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */
+ void bam_lplbuf_destroy(bam_lplbuf_t *tv);
+
+ /*! @abstract bam_plbuf_push() equivalent with level calculated. */
+ int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
+
+
+ /*********************
+ * BAM indexing APIs *
+ *********************/
+
+ struct __bam_index_t;
+ typedef struct __bam_index_t bam_index_t;
+
+ /*!
+ @abstract Build index for a BAM file.
+ @discussion Index file "fn.bai" will be created.
+ @param fn name of the BAM file
+ @return always 0 currently
+ */
+ int bam_index_build(const char *fn);
+
+ /*!
+ @abstract Load index from file "fn.bai".
+ @param fn name of the BAM file (NOT the index file)
+ @return pointer to the index structure
+ */
+ bam_index_t *bam_index_load(const char *fn);
+
+ /*!
+ @abstract Destroy an index structure.
+ @param idx pointer to the index structure
+ */
+ void bam_index_destroy(bam_index_t *idx);
+
+ /*! @typedef
+ @abstract Type of function to be called by bam_fetch().
+ @param b the alignment
+ @param data user provided data
+ */
+ typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
+
+ /*!
+ @abstract Retrieve the alignments that are overlapped with the
+ specified region.
+
+ @discussion A user defined function will be called for each
+ retrieved alignment ordered by its start position.
+
+ @param fp BAM file handler
+ @param idx pointer to the alignment index
+ @param tid chromosome ID as is defined in the header
+ @param beg start coordinate, 0-based
+ @param end end coordinate, 0-based
+ @param data user provided data (will be transferred to func)
+ @param func user defined function
+ */
+ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
+
+ bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
+ int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
+ void bam_iter_destroy(bam_iter_t iter);
+
+ /*!
+ @abstract Parse a region in the format: "chr2:100,000-200,000".
+ @discussion bam_header_t::hash will be initialized if empty.
+ @param header pointer to the header structure
+ @param str string to be parsed
+ @param ref_id the returned chromosome ID
+ @param begin the returned start coordinate
+ @param end the returned end coordinate
+ @return 0 on success; -1 on failure
+ */
+ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+
+
+ /**************************
+ * APIs for optional tags *
+ **************************/
+
+ /*!
+ @abstract Retrieve data of a tag
+ @param b pointer to an alignment struct
+ @param tag two-character tag to be retrieved
+
+ @return pointer to the type and data. The first character is the
+ type that can be 'iIsScCdfAZH'.
+
+ @discussion Use bam_aux2?() series to convert the returned data to
+ the corresponding type.
+ */
+ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
+
+ int32_t bam_aux2i(const uint8_t *s);
+ float bam_aux2f(const uint8_t *s);
+ double bam_aux2d(const uint8_t *s);
+ char bam_aux2A(const uint8_t *s);
+ char *bam_aux2Z(const uint8_t *s);
+
+ int bam_aux_del(bam1_t *b, uint8_t *s);
+ void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
+ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
+
+
+ /*****************
+ * Miscellaneous *
+ *****************/
+
+ /*!
+ @abstract Calculate the rightmost coordinate of an alignment on the
+ reference genome.
+
+ @param c pointer to the bam1_core_t structure
+ @param cigar the corresponding CIGAR array (from bam1_t::cigar)
+ @return the rightmost coordinate, 0-based
+ */
+ uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
+
+ /*!
+ @abstract Calculate the length of the query sequence from CIGAR.
+ @param c pointer to the bam1_core_t structure
+ @param cigar the corresponding CIGAR array (from bam1_t::cigar)
+ @return length of the query sequence
+ */
+ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*!
+ @abstract Find the smallest bin that contains the whole region [beg,end).
+ @param beg start of the region, 0-based
+ @param end end of the region, 0-based and exclusive
+ @return bin number; 0 when no finer bin holds the whole region
+ */
+static inline int bam_reg2bin(uint32_t beg, uint32_t end)
+{
+    /* One entry per bin level, finest first: the shift that maps a
+     * coordinate to its bin within the level, and the number of the
+     * first bin of that level. */
+    static const int level_shift[5] = { 14, 17, 20, 23, 26 };
+    static const int level_first[5] = { 4681, 585, 73, 9, 1 };
+    const uint32_t last = end - 1; /* last base covered by the half-open region */
+    int lv;
+    for (lv = 0; lv < 5; ++lv) {
+        const uint32_t bin_of_beg = beg >> level_shift[lv];
+        if (bin_of_beg == last >> level_shift[lv])
+            return level_first[lv] + bin_of_beg;
+    }
+    return 0;
+}
+
+/*!
+ @abstract Copy an alignment
+ @param bdst destination alignment struct
+ @param bsrc source alignment struct
+ @return pointer to the destination alignment struct, or NULL if the
+ destination buffer could not be enlarged (bdst is then unchanged)
+
+ @discussion The destination keeps its own data buffer; it is enlarged
+ only when it is smaller than bsrc->data_len.
+ */
+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+{
+    uint8_t *data = bdst->data;
+    int m_data = bdst->m_data; // backup data and m_data
+    if (bdst == bsrc) return bdst; // nothing to copy; memcpy() must not see overlapping buffers
+    if (m_data < bsrc->data_len) { // buffer too small: grow to data_len rounded up by kroundup32()
+        uint8_t *grown;
+        m_data = bsrc->data_len; kroundup32(m_data);
+        grown = (uint8_t*)realloc(data, m_data);
+        if (grown == NULL) return NULL; // bdst still owns its old, untouched buffer
+        data = grown;
+    }
+    if (bsrc->data_len > 0) // skip the call for empty records, where data may be NULL
+        memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
+    *bdst = *bsrc; // copy the rest
+    // restore the backup
+    bdst->m_data = m_data;
+    bdst->data = data;
+    return bdst;
+}
+
+/*!
+ @abstract Duplicate an alignment
+ @param src source alignment struct
+ @return pointer to the newly allocated copy, or NULL if memory could
+ not be allocated
+
+ @discussion The copy owns a private buffer of exactly src->data_len
+ bytes (no buffer when data_len is 0).
+ */
+static inline bam1_t *bam_dup1(const bam1_t *src)
+{
+    bam1_t *b;
+    b = bam_init1();
+    if (b == NULL) return NULL;
+    *b = *src;
+    b->m_data = b->data_len;
+    b->data = NULL; // the struct copy aliased src->data; the copy must never own or free that
+    if (b->data_len > 0) {
+        b->data = (uint8_t*)malloc(b->data_len);
+        if (b->data == NULL) { free(b); return NULL; }
+        memcpy(b->data, src->data, b->data_len);
+    }
+    return b;
+}
+
+/*!
+ @abstract Size in bytes of a fixed-width auxiliary value.
+ @param x type character of the auxiliary field
+ @return 1 for 'A', 'c', 'C'; 2 for 's', 'S'; 4 for 'i', 'I', 'f', 'F';
+ 0 for every other character
+ */
+static inline int bam_aux_type2size(int x)
+{
+    switch (x) {
+    case 'A': case 'c': case 'C':
+        return 1;
+    case 's': case 'S':
+        return 2;
+    case 'i': case 'I': case 'f': case 'F':
+        return 4;
+    default:
+        return 0;
+    }
+}
+
+/*********************************
+ *** Compatibility with htslib ***
+ *********************************/
+
+// The names below are plain aliases: each one forwards to the identifier
+// of this header shown on its right-hand side.
+typedef bam_header_t bam_hdr_t;
+
+#define bam_get_qname(b) bam1_qname(b)
+#define bam_get_cigar(b) bam1_cigar(b)
+
+#define bam_hdr_read(fp) bam_header_read(fp)
+#define bam_hdr_write(fp, h) bam_header_write(fp, h)
+// NOTE(review): the macro parameter is called fp, but it is passed straight
+// to bam_header_destroy(), so presumably it is a header pointer -- confirm.
+#define bam_hdr_destroy(fp) bam_header_destroy(fp)
+
+#endif
diff --git a/samtools-0.1.19/bam2bcf.c b/samtools-0.1.19/bam2bcf.c
new file mode 100644
index 0000000..340b10b
--- /dev/null
+++ b/samtools-0.1.19/bam2bcf.c
@@ -0,0 +1,467 @@
+#include <math.h>
+#include <stdint.h>
+#include <assert.h>
+#include "bam.h"
+#include "kstring.h"
+#include "bam2bcf.h"
+#include "errmod.h"
+#include "bcftools/bcf.h"
+
+extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
+
+#define CALL_ETA 0.03f
+#define CALL_MAX 256
+#define CALL_DEFTHETA 0.83f
+#define DEF_MAPQ 20
+
+#define CAP_DIST 25
+
+/*
+ * Allocate a calling context and set its default parameters.
+ *
+ * theta      handed to errmod_init() as 1 - theta; values <= 0 select
+ *            CALL_DEFTHETA
+ * min_baseQ  stored in the context; bcf_call_glfgen() skips bases whose
+ *            quality is below it
+ *
+ * Returns the new context, to be released with bcf_call_destroy(), or NULL
+ * if memory could not be allocated.
+ */
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+{
+    bcf_callaux_t *bca;
+    if (theta <= 0.) theta = CALL_DEFTHETA;
+    bca = calloc(1, sizeof(bcf_callaux_t));
+    if (bca == NULL) return NULL;
+    bca->capQ = 60;
+    bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
+    bca->min_baseQ = min_baseQ;
+    bca->e = errmod_init(1. - theta);
+    bca->min_frac = 0.002;
+    bca->min_support = 1;
+    bca->per_sample_flt = 0;
+    bca->npos = 100; // number of bins in the read-position histograms
+    bca->ref_pos = calloc(bca->npos, sizeof(int));
+    bca->alt_pos = calloc(bca->npos, sizeof(int));
+    if (bca->ref_pos == NULL || bca->alt_pos == NULL) {
+        // npos is non-zero here, so bcf_call_destroy() frees whichever
+        // histogram was obtained along with the rest of the context
+        bcf_call_destroy(bca);
+        return NULL;
+    }
+    return bca;
+}
+
+
+/*
+ * Locate the pileup base inside its read for the read-position statistics.
+ *
+ * p    pileup entry; p->qpos is the 0-based index of the base in the read
+ * len  output: number of read bases covered by M, I, = and X operations
+ *
+ * Returns qpos + 1 reduced by the length of every soft clip that ends at
+ * or before qpos.
+ */
+static int get_position(const bam_pileup1_t *p, int *len)
+{
+    int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
+    for (icig=0; icig<p->b->core.n_cigar; icig++)
+    {
+        // CIGAR operation codes are numbered in the order MIDNSHP=X:
+        // 012345678
+        // MIDNSHP=X
+        int cig = bam1_cigar(p->b)[icig] & BAM_CIGAR_MASK;
+        int ncig = bam1_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
+        switch (cig)
+        {
+        case 0: // M
+        case 1: // I
+        case 7: // '=' and 'X' consume read bases exactly like M; leaving them
+        case 8: // out under-counts *len and lets the caller's bin index overflow
+            n_tot_bases += ncig;
+            iread += ncig;
+            break;
+        case 4: // S
+            iread += ncig;
+            if ( iread<=p->qpos ) edist -= ncig;
+            break;
+        default: // D, N, H, P do not advance along the read
+            break;
+        }
+    }
+    *len = n_tot_bases;
+    return edist;
+}
+
+/* Release a calling context and everything it owns; a null pointer is ignored. */
+void bcf_call_destroy(bcf_callaux_t *bca)
+{
+    if (!bca)
+        return;
+    errmod_destroy(bca->e);
+    if (bca->npos != 0) {
+        /* the position histograms are only released while npos is non-zero */
+        free(bca->ref_pos);
+        free(bca->alt_pos);
+        bca->npos = 0;
+    }
+    free(bca->bases);
+    free(bca->inscns);
+    free(bca);
+}
+/* Collect the usable bases of one pileup column for one sample, accumulate
+ * the per-sample annotations and run the error model on them.
+ *
+ * _n, pl    number of pileup entries and the entries themselves
+ * ref_base  the 4-bit representation of the reference base. It is
+ *           negative if we are looking at an indel.
+ * bca       calling context; bca->bases is enlarged on demand and the
+ *           ref_pos/alt_pos histograms are incremented
+ * r         output; cleared first, then depth, ori_depth, n_supp, qsum[]
+ *           and anno[] are filled and r->p is passed to errmod_cal()
+ *
+ * Returns r->depth (number of entries that passed all filters), or -1
+ * when _n is 0. */
+int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
+{
+ int i, n, ref4, is_indel, ori_depth = 0;
+ memset(r, 0, sizeof(bcf_callret1_t));
+ if (ref_base >= 0) {
+ ref4 = bam_nt16_nt4_table[ref_base];
+ is_indel = 0;
+ } else ref4 = 4, is_indel = 1;
+ if (_n == 0) return -1;
+ // enlarge the bases array if necessary
+ // NOTE(review): the realloc() result is not checked for NULL.
+ if (bca->max_bases < _n) {
+ bca->max_bases = _n;
+ kroundup32(bca->max_bases);
+ bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
+ }
+ // fill the bases array
+ for (i = n = r->n_supp = 0; i < _n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+ // set base
+ if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+ ++ori_depth;
+ // for indels both qualities come from p->aux: bits 0-7 and bits 8-15
+ baseQ = q = is_indel? p->aux&0xff : (int)bam1_qual(p->b)[p->qpos]; // base/indel quality
+ seqQ = is_indel? (p->aux>>8&0xff) : 99;
+ if (q < bca->min_baseQ) continue;
+ // q becomes min(q, seqQ, capped mapQ), then is clamped to [4, 63]
+ if (q > seqQ) q = seqQ;
+ mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
+ mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
+ if (q > mapQ) q = mapQ;
+ if (q > 63) q = 63;
+ if (q < 4) q = 4;
+ if (!is_indel) {
+ b = bam1_seqi(bam1_seq(p->b), p->qpos); // base
+ b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base
+ is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+ } else {
+ // indel type is kept in bits 16-21 of p->aux; type 0 counts as "no difference"
+ b = p->aux>>16&0x3f;
+ is_diff = (b != 0);
+ }
+ if (is_diff) ++r->n_supp;
+ // packed entry: quality from bit 5 upwards, strand in bit 4, b in the low bits
+ bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
+ // collect annotations
+ // anno[] index = group<<2 | is_diff<<1 | low bit. Group 0 counts reads with
+ // the strand as low bit; groups 1-3 hold the sum (low bit 0) and the sum of
+ // squares (low bit 1) of baseQ, mapQ and min_dist respectively.
+ if (b < 4) r->qsum[b] += q;
+ ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)];
+ // min_dist: distance of the base to the nearer read end, capped at CAP_DIST
+ min_dist = p->b->core.l_qseq - 1 - p->qpos;
+ if (min_dist > p->qpos) min_dist = p->qpos;
+ if (min_dist > CAP_DIST) min_dist = CAP_DIST;
+ r->anno[1<<2|is_diff<<1|0] += baseQ;
+ r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
+ r->anno[2<<2|is_diff<<1|0] += mapQ;
+ r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
+ r->anno[3<<2|is_diff<<1|0] += min_dist;
+ r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
+
+ // collect read positions for ReadPosBias
+ // NOTE(review): epos is used as an index without a range check; it stays
+ // below npos only while get_position() returns pos <= len.
+ int len, pos = get_position(p, &len);
+ int epos = (double)pos/(len+1) * bca->npos;
+ if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base )
+ bca->ref_pos[epos]++;
+ else
+ bca->alt_pos[epos]++;
+ }
+ r->depth = n; r->ori_depth = ori_depth;
+ // glfgen
+ errmod_cal(bca->e, n, 5, bca->bases, r->p);
+ return r->depth;
+}
+
+/*
+ * Recursive evaluation of
+ *   p(n,m,U) = n/(n+m) * p(n-1,m,U-m) + m/(n+m) * p(n,m-1,U)
+ * with p = 0 for U < 0 and, once n or m is 0, p = 1 if U is 0 and 0 otherwise.
+ * The running time grows quickly with n and m; calc_ReadPosBias() only calls
+ * it when both are below 8.
+ */
+double mann_whitney_1947(int n, int m, int U)
+{
+    const int total = n + m;
+    if (U < 0)
+        return 0;
+    if (n == 0 || m == 0) {
+        if (U == 0)
+            return 1;
+        return 0;
+    }
+    return (double)n/total*mann_whitney_1947(n-1, m, U-m)
+         + (double)m/total*mann_whitney_1947(n, m-1, U);
+}
+
+// Compute the read-position-bias score from the ref_pos/alt_pos histograms
+// filled by bcf_call_glfgen() and store it in call->read_pos_bias; the value
+// is -1 when either allele has no reads. Both histograms are reset to zero,
+// so anything else that needs them has to run before this function.
+void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call)
+{
+ int i, nref = 0, nalt = 0;
+ unsigned long int U = 0;
+ for (i=0; i<bca->npos; i++)
+ {
+ // nref already includes bin i here, so every alt read in bin i is
+ // paired with every ref read in bins 0..i
+ nref += bca->ref_pos[i];
+ nalt += bca->alt_pos[i];
+ U += nref*bca->alt_pos[i];
+ bca->ref_pos[i] = 0;
+ bca->alt_pos[i] = 0;
+ }
+#if 0
+//todo
+ // disabled code: mean and spread of the per-bin read counts
+ double var = 0, avg = (double)(nref+nalt)/bca->npos;
+ for (i=0; i<bca->npos; i++)
+ {
+ double ediff = bca->ref_pos[i] + bca->alt_pos[i] - avg;
+ var += ediff*ediff;
+ bca->ref_pos[i] = 0;
+ bca->alt_pos[i] = 0;
+ }
+ call->read_pos.avg = avg;
+ call->read_pos.var = sqrt(var/bca->npos);
+ call->read_pos.dp = nref+nalt;
+#endif
+ if ( !nref || !nalt )
+ {
+ call->read_pos_bias = -1;
+ return;
+ }
+
+ if ( nref>=8 || nalt>=8 )
+ {
+ // normal approximation
+ double mean = ((double)nref*nalt+1.0)/2.0;
+ double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0;
+ double z = (U-mean)/sqrt(var2);
+ call->read_pos_bias = z;
+ //fprintf(stderr,"nref=%d nalt=%d U=%ld mean=%e var=%e zval=%e\n", nref,nalt,U,mean,sqrt(var2),call->read_pos_bias);
+ }
+ else
+ {
+ // both counts are below 8: take p from the recursion and turn it into a
+ // z value by inverting the normal density with variance var2
+ double p = mann_whitney_1947(nalt,nref,U);
+ // biased form claimed by GATK to behave better empirically
+ // double var2 = (1.0+1.0/(nref+nalt+1.0))*(double)nref*nalt*(nref+nalt+1.0)/12.0;
+ double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0;
+ double z;
+ if ( p >= 1./sqrt(var2*2*M_PI) ) z = 0; // equal to mean
+ else
+ {
+ // the sign of z follows the side of nref*nalt/2 on which U lies
+ if ( U >= nref*nalt/2. ) z = sqrt(-2*log(sqrt(var2*2*M_PI)*p));
+ else z = -sqrt(-2*log(sqrt(var2*2*M_PI)*p));
+ }
+ call->read_pos_bias = z;
+ //fprintf(stderr,"nref=%d nalt=%d U=%ld p=%e var2=%e zval=%e\n", nref,nalt,U, p,var2,call->read_pos_bias);
+ }
+}
+
+/*
+ * Map the mean absolute deviation of the variant read positions to a
+ * density value.
+ *
+ * mdiff    mean absolute deviation of the positions
+ * dp       number of reads the deviation was computed from
+ * readlen  length the positions are scaled to
+ *
+ * dp == 2 uses a closed-form expression; every other depth evaluates a
+ * normal density whose mean and spread come from the table (dp < 24) or
+ * from a fitted power law (dp >= 24, with dp capped at 100).
+ */
+float mean_diff_to_prob(float mdiff, int dp, int readlen)
+{
+    // {mean, spread} per depth, rescaled by readlen/100 before use. Rows 0-2
+    // are zero placeholders.
+    // This is crude empirical approximation and is not very accurate for
+    // shorter read lengths (<100bp). There certainly is a room for
+    // improvement.
+    static const float mv[24][2] = { {0,0}, {0,0}, {0,0},
+        { 9.108, 4.934}, { 9.999, 3.991}, {10.273, 3.485}, {10.579, 3.160},
+        {10.828, 2.889}, {11.014, 2.703}, {11.028, 2.546}, {11.244, 2.391},
+        {11.231, 2.320}, {11.323, 2.138}, {11.403, 2.123}, {11.394, 1.994},
+        {11.451, 1.928}, {11.445, 1.862}, {11.516, 1.815}, {11.560, 1.761},
+        {11.544, 1.728}, {11.605, 1.674}, {11.592, 1.652}, {11.674, 1.613},
+        {11.641, 1.570} };
+    float m, v, dev;
+
+    if ( dp==2 )
+    {
+        const float len_sq = (float)readlen*readlen;
+        return mdiff==0 ? (2.0*readlen + 4.0*(readlen-1.0))/len_sq
+                        : 8.0*(readlen - 4.0*mdiff)/len_sq;
+    }
+
+    if ( dp<24 )
+    {
+        m = mv[dp][0]*readlen/100.;
+        v = mv[dp][1]*readlen/100.;
+        v *= 1.2; // allow more variability
+    }
+    else
+    {
+        const int capped = dp>100 ? 100 : dp;
+        m = readlen/8.;
+        v = 1.476/(0.182*pow(capped,0.514));
+        v = v*(readlen/100.);
+    }
+    dev = (mdiff-m)/v;
+    return 1.0/(v*sqrt(2*M_PI)) * exp(-0.5*dev*dev);
+}
+
+/*
+ * Fill call->vdb from the alt_pos histogram: fold each bin onto its
+ * distance from the nearer end of the histogram, take the mean absolute
+ * deviation of those distances over all counted reads and pass it to
+ * mean_diff_to_prob(). With fewer than two counted reads vdb is set to -1.
+ */
+void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call)
+{
+    float pos_mean = 0, dev_mean = 0;
+    int bin, n_alt = 0;
+
+    /* first pass: number of reads and mean folded position */
+    for (bin = 0; bin < bca->npos; bin++)
+    {
+        if ( bca->alt_pos[bin] )
+        {
+            int folded = bin<bca->npos/2 ? bin : bca->npos - bin;
+            n_alt += bca->alt_pos[bin];
+            pos_mean += bca->alt_pos[bin]*folded;
+        }
+    }
+    if ( n_alt<2 )
+    {
+        call->vdb = -1;
+        return;
+    }
+    pos_mean /= n_alt;
+
+    /* second pass: mean absolute deviation from that mean */
+    for (bin = 0; bin < bca->npos; bin++)
+    {
+        if ( bca->alt_pos[bin] )
+        {
+            int folded = bin<bca->npos/2 ? bin : bca->npos - bin;
+            dev_mean += bca->alt_pos[bin] * fabs(folded - pos_mean);
+        }
+    }
+    dev_mean /= n_alt;
+    call->vdb = mean_diff_to_prob(dev_mean, n_alt, bca->npos);
+}
+
+/**
+ * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
+ * @n: number of samples
+ * @calls: each sample's calls
+ * @bca: auxiliary data structure for holding temporary values
+ * @ref_base: the reference base
+ * @call: filled with the annotations
+ *
+ * Returns 0 on success, or -1 at an indel site (ref_base < 0) where no
+ * alternate allele was found; in that case PL and the annotations are not
+ * updated.
+ */
+int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
+{
+ int ref4, i, j, qsum[4];
+ int64_t tmp;
+ if (ref_base >= 0) {
+ call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
+ if (ref4 > 4) ref4 = 4;
+ } else call->ori_ref = -1, ref4 = 0;
+ // calculate qsum
+ memset(qsum, 0, 4 * sizeof(int));
+ for (i = 0; i < n; ++i)
+ for (j = 0; j < 4; ++j)
+ qsum[j] += calls[i].qsum[j];
+ int qsum_tot=0;
+ for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; }
+ // keep the allele index in the two low bits so it survives the sort
+ for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j;
+ // find the top 2 alleles
+ for (i = 1; i < 4; ++i) // insertion sort
+ for (j = i; j > 0 && qsum[j] < qsum[j-1]; --j)
+ tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
+ // set the reference allele and alternative allele(s)
+ for (i = 0; i < 5; ++i) call->a[i] = -1;
+ call->unseen = -1;
+ call->a[0] = ref4;
+ // walk from the largest quality sum down; the first non-reference allele
+ // with a zero sum ends the walk
+ // NOTE(review): qsum_tot is used as a divisor without a zero check.
+ for (i = 3, j = 1; i >= 0; --i) {
+ if ((qsum[i]&3) != ref4) {
+ if (qsum[i]>>2 != 0)
+ {
+ if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4
+ call->a[j++] = qsum[i]&3;
+ }
+ else break;
+ }
+ else
+ call->qsum[0] = (float)(qsum[i]>>2)/qsum_tot;
+ }
+ if (ref_base >= 0) { // for SNPs, find the "unseen" base
+ if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
+ call->unseen = j, call->a[j++] = qsum[i]&3;
+ call->n_alleles = j;
+ } else {
+ call->n_alleles = j;
+ if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
+ }
+ // set the PL array
+ // 15 bytes per sample: room for the 15 genotypes of five alleles
+ // NOTE(review): the realloc() result is not checked for NULL.
+ if (call->n < n) {
+ call->n = n;
+ call->PL = realloc(call->PL, 15 * n);
+ }
+ {
+ int x, g[15], z;
+ double sum_min = 0.;
+ x = call->n_alleles * (call->n_alleles + 1) / 2;
+ // get the possible genotypes
+ // g[] holds, for each allele pair, its index into the 5x5 array r->p
+ for (i = z = 0; i < call->n_alleles; ++i)
+ for (j = 0; j <= i; ++j)
+ g[z++] = call->a[j] * 5 + call->a[i];
+ for (i = 0; i < n; ++i) {
+ uint8_t *PL = call->PL + x * i;
+ const bcf_callret1_t *r = calls + i;
+ float min = 1e37;
+ for (j = 0; j < x; ++j)
+ if (min > r->p[g[j]]) min = r->p[g[j]];
+ sum_min += min;
+ // PL values are stored relative to the sample's minimum, rounded and capped at 255
+ for (j = 0; j < x; ++j) {
+ int y;
+ y = (int)(r->p[g[j]] - min + .499);
+ if (y > 255) y = 255;
+ PL[j] = y;
+ }
+ }
+// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
+ call->shift = (int)(sum_min + .499);
+ }
+ // combine annotations
+ memset(call->anno, 0, 16 * sizeof(int));
+ for (i = call->depth = call->ori_depth = 0, tmp = 0; i < n; ++i) {
+ call->depth += calls[i].depth;
+ call->ori_depth += calls[i].ori_depth;
+ for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
+ }
+
+ // calc_vdb() reads alt_pos and must come first: calc_ReadPosBias() zeroes
+ // both position histograms
+ calc_vdb(bca, call);
+ calc_ReadPosBias(bca, call);
+
+ return 0;
+}
+
+// Serialise a combined call into the BCF record b.
+//
+// b->str is rebuilt as a sequence of NUL-terminated fields: an empty field,
+// REF, ALT, another empty field, INFO and the FORMAT keys (the two empty
+// fields are presumably ID and FILTER -- confirm against bcf_sync()). After
+// bcf_sync(b) the per-sample arrays are filled: PL from bc->PL and, when bcr
+// is given and fmt_flag is non-zero, DP/DV/SP from the per-sample results.
+//
+// tid, pos  reference index and 0-based position stored in the record
+// bc        combined call produced by bcf_call_combine()
+// bcr       per-sample results, or NULL to write PL only
+// fmt_flag  combination of B2B_FMT_DP, B2B_FMT_DV and B2B_FMT_SP
+// bca       calling context; supplies the indel types and insertion consensus
+// ref       reference sequence, indexed by pos when writing indels
+//
+// Always returns 0.
+int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
+ const bcf_callaux_t *bca, const char *ref)
+{
+ extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+ kstring_t s;
+ int i, j;
+ b->n_smpl = bc->n;
+ b->tid = tid; b->pos = pos; b->qual = 0;
+ // write into the record's own string buffer, starting from scratch
+ s.s = b->str; s.m = b->m_str; s.l = 0;
+ kputc('\0', &s);
+ if (bc->ori_ref < 0) { // an indel
+ // write REF
+ kputc(ref[pos], &s);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
+ kputc('\0', &s);
+ // write ALT
+ kputc(ref[pos], &s);
+ for (i = 1; i < 4; ++i) {
+ if (bc->a[i] < 0) break;
+ if (i > 1) {
+ kputc(',', &s); kputc(ref[pos], &s);
+ }
+ if (bca->indel_types[bc->a[i]] < 0) { // deletion
+ // skip the deleted bases and keep the rest of the indel region
+ for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
+ kputc(ref[pos+1+j], &s);
+ } else { // insertion; cannot be a reference unless a bug
+ // inserted bases come from the consensus row of this indel type
+ char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
+ for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
+ kputc("ACGTN"[(int)inscns[j]], &s);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
+ }
+ }
+ kputc('\0', &s);
+ } else { // a SNP
+ kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
+ for (i = 1; i < 5; ++i) {
+ if (bc->a[i] < 0) break;
+ if (i > 1) kputc(',', &s);
+ // the "unseen" allele is written as X
+ kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
+ }
+ kputc('\0', &s);
+ }
+ kputc('\0', &s);
+ // INFO
+ if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac);
+ kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s);
+ for (i = 0; i < 16; ++i) {
+ if (i) kputc(',', &s);
+ kputw(bc->anno[i], &s);
+ }
+ //ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var);
+ ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]);
+ // -1 marks a statistic that could not be computed; such tags are omitted
+ if (bc->vdb != -1)
+ ksprintf(&s, ";VDB=%e", bc->vdb);
+ if (bc->read_pos_bias != -1 )
+ ksprintf(&s, ";RPB=%e", bc->read_pos_bias);
+ kputc('\0', &s);
+ // FMT
+ kputs("PL", &s);
+ if (bcr && fmt_flag) {
+ if (fmt_flag & B2B_FMT_DP) kputs(":DP", &s);
+ if (fmt_flag & B2B_FMT_DV) kputs(":DV", &s);
+ if (fmt_flag & B2B_FMT_SP) kputs(":SP", &s);
+ }
+ kputc('\0', &s);
+ // hand the (possibly reallocated) buffer back to the record
+ b->m_str = s.m; b->str = s.s; b->l_str = s.l;
+ bcf_sync(b);
+ memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);
+ if (bcr && fmt_flag) {
+ // b->gi[] follows the key order written above (PL, then DP, DV, SP), so
+ // the index of each array depends on which earlier keys are present
+ uint16_t *dp = (fmt_flag & B2B_FMT_DP)? b->gi[1].data : 0;
+ uint16_t *dv = (fmt_flag & B2B_FMT_DV)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0)].data : 0;
+ int32_t *sp = (fmt_flag & B2B_FMT_SP)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0) + ((fmt_flag & B2B_FMT_DV) != 0)].data : 0;
+ for (i = 0; i < bc->n; ++i) {
+ bcf_callret1_t *p = bcr + i;
+ // depth and supporting-read counts saturate at 0xffff
+ if (dp) dp[i] = p->depth < 0xffff? p->depth : 0xffff;
+ if (dv) dv[i] = p->n_supp < 0xffff? p->n_supp : 0xffff;
+ if (sp) {
+ // anno[0..3] form the 2x2 table strand x (same as / different from
+ // the reference); tables with a row or column sum below 2 get 0
+ if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2
+ || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2)
+ {
+ sp[i] = 0;
+ } else {
+ double left, right, two;
+ int x;
+ kt_fisher_exact(p->anno[0], p->anno[1], p->anno[2], p->anno[3], &left, &right, &two);
+ // -4.343*ln(x) equals -10*log10(x): the two-sided p-value on the Phred scale
+ x = (int)(-4.343 * log(two) + .499);
+ if (x > 255) x = 255;
+ sp[i] = x;
+ }
+ }
+ }
+ }
+ return 0;
+}
diff --git a/samtools-0.1.19/bam2bcf.h b/samtools-0.1.19/bam2bcf.h
new file mode 100644
index 0000000..b2b1825
--- /dev/null
+++ b/samtools-0.1.19/bam2bcf.h
@@ -0,0 +1,67 @@
+#ifndef BAM2BCF_H
+#define BAM2BCF_H
+
+#include <stdint.h>
+#include "errmod.h"
+#include "bcftools/bcf.h"
+
+#define B2B_INDEL_NULL 10000
+
+#define B2B_FMT_DP 0x1
+#define B2B_FMT_SP 0x2
+#define B2B_FMT_DV 0x4
+
+// Calling context shared by the bcf_call_*() functions; created by
+// bcf_call_init() and released by bcf_call_destroy().
+typedef struct __bcf_callaux_t {
+ int capQ, min_baseQ; // cap applied to mapping qualities; bases below min_baseQ are skipped
+ int openQ, extQ, tandemQ; // for indels
+ int min_support, max_support; // for collecting indel candidates
+ double min_frac, max_frac; // for collecting indel candidates
+ int per_sample_flt; // indel filtering strategy
+ int *ref_pos, *alt_pos, npos; // for ReadPosBias
+ // for internal uses
+ int max_bases; // number of entries allocated in bases
+ int indel_types[4]; // indel length per type; negative values are deletions
+ int maxins, indelreg; // row length of inscns; number of reference bases written after an indel
+ int read_len;
+ char *inscns; // insertion consensus, maxins entries per indel type
+ uint16_t *bases; // packed per-read entries built by bcf_call_glfgen()
+ errmod_t *e; // error model from errmod_init()
+ void *rghash;
+} bcf_callaux_t;
+
+// Per-sample result of bcf_call_glfgen() for one site.
+typedef struct {
+ // depth: reads that passed all filters; n_supp: those differing from the
+ // reference; ori_depth: reads counted before the base-quality filter;
+ // qsum: sum of the capped qualities per base (A, C, G, T)
+ int depth, n_supp, ori_depth, qsum[4];
+ // index = group<<2 | is_diff<<1 | low bit; see bcf_call_glfgen()
+ unsigned int anno[16];
+ // values written by errmod_cal(); bcf_call_combine() reads them as a 5x5 array
+ float p[25];
+} bcf_callret1_t;
+
+// Combined call for one site, filled by bcf_call_combine().
+typedef struct {
+ int a[5]; // alleles: ref first, then the alternates; unused slots are -1
+ float qsum[4]; // quality sums as a fraction of their total: ref first, then the alternates
+ // n: number of samples PL has room for; shift: rounded sum of the
+ // per-sample minimum likelihoods; ori_ref: reference base, -1 for an
+ // indel; unseen: index in a[] of the "unseen" allele, or -1
+ int n, n_alleles, shift, ori_ref, unseen;
+ int n_supp; // number of supporting non-reference reads
+ unsigned int anno[16], depth, ori_depth; // sums of the per-sample values
+ uint8_t *PL; // n_alleles*(n_alleles+1)/2 values per sample, each capped at 255
+ float vdb; // variant distance bias
+ float read_pos_bias; // -1 when it could not be computed
+ struct { float avg, var; int dp; } read_pos; // only referenced by disabled code in bam2bcf.c
+} bcf_call_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ);
+ void bcf_call_destroy(bcf_callaux_t *bca);
+ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
+ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
+ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
+ const bcf_callaux_t *bca, const char *ref);
+ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
+ const void *rghash);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/bam2bcf_indel.c b/samtools-0.1.19/bam2bcf_indel.c
new file mode 100644
index 0000000..30b3f46
--- /dev/null
+++ b/samtools-0.1.19/bam2bcf_indel.c
@@ -0,0 +1,498 @@
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include "bam.h"
+#include "bam2bcf.h"
+#include "kaln.h"
+#include "kprobaln.h"
+#include "khash.h"
+KHASH_SET_INIT_STR(rg)
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint32_t)
+
+#define MINUS_CONST 0x10000000 // bias added to signed indel lengths so they can be stored and sorted as uint32_t
+#define INDEL_WINDOW_SIZE 50 // reference bases kept on each side of the position for realignment
+
+/*
+ * Scan the @RG lines of a SAM header text and add to a string set the ID of
+ * every read group whose PL (platform) value occurs as a substring of `list`.
+ *
+ * _hash   existing set, or NULL to have one created
+ * hdtext  header text; NULL makes the call a no-op
+ * list    platform list to match against; NULL makes the call a no-op
+ *
+ * Returns the (possibly newly created) set. Keys are heap-allocated and are
+ * released by bcf_call_del_rghash().
+ */
+void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
+{
+    khash_t(rg) *hash;
+    const char *rec, *next, *id, *pl, *e;
+
+    if (list == 0 || hdtext == 0) return _hash;
+    if (_hash == 0) _hash = kh_init(rg);
+    hash = (khash_t(rg)*)_hash;
+
+    rec = strstr(hdtext, "@RG\t");
+    while (rec) {
+        next = strstr(rec + 4, "@RG\t"); // start of the following @RG record, if any
+        id = strstr(rec, "\tID:");
+        if (id) id += 4;
+        pl = strstr(rec, "\tPL:");
+        if (pl) pl += 4;
+        // both tags must exist and belong to the current record
+        if (id && pl && (next == 0 || (id < next && pl < next))) {
+            int id_len, pl_len;
+            char *buf;
+            for (e = id; *e && *e != '\t' && *e != '\n'; ++e);
+            id_len = e - id;
+            for (e = pl; *e && *e != '\t' && *e != '\n'; ++e);
+            pl_len = e - pl;
+            // one zero-filled buffer, big enough for either value
+            buf = calloc((id_len > pl_len? id_len : pl_len) + 1, 1);
+            memcpy(buf, pl, pl_len);
+            if (strstr(list, buf) == 0) {
+                free(buf);
+            } else { // platform matched: reuse the buffer for the ID
+                int ret;
+                memcpy(buf, id, id_len);
+                buf[id_len] = 0;
+                if (kh_get(rg, hash, buf) == kh_end(hash)) kh_put(rg, hash, buf, &ret);
+                else free(buf); // already present
+            }
+        }
+        rec = next;
+    }
+    return hash;
+}
+
+/* Free every key stored in a read-group set and then the set itself; NULL is accepted. */
+void bcf_call_del_rghash(void *_hash)
+{
+    khash_t(rg) *set = (khash_t(rg)*)_hash;
+    khint_t it;
+
+    if (set != 0) {
+        it = kh_begin(set);
+        while (it < kh_end(set)) {
+            if (kh_exist(set, it)) free((char*)kh_key(set, it));
+            ++it;
+        }
+        kh_destroy(rg, set);
+    }
+}
+
+/*
+ * Map reference coordinate tpos to an offset in the read by walking the CIGAR.
+ *
+ * c, cigar  alignment core and its CIGAR array
+ * tpos      reference coordinate to look up
+ * is_left   when tpos falls in a deletion/ref-skip, selects which edge of
+ *           that gap is reported through *_tpos (non-zero: its start)
+ * _tpos     out: the reference coordinate the returned offset corresponds to
+ *
+ * Returns the query offset. If the alignment ends before tpos, the offset
+ * just past the last aligned (M/=/X) base is returned and *_tpos is the
+ * reference end of the alignment.
+ */
+static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
+{
+    int idx;
+    int rpos = c->pos;      // current reference coordinate
+    int qpos = 0;           // current query offset
+    int aligned_end = 0;    // query offset after the most recent aligned run
+
+    *_tpos = c->pos;
+    for (idx = 0; idx < c->n_cigar; ++idx) {
+        const int op = cigar[idx] & BAM_CIGAR_MASK;
+        const int len = cigar[idx] >> BAM_CIGAR_SHIFT;
+
+        if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { // consumes query only
+            qpos += len;
+            continue;
+        }
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            if (c->pos > tpos) return qpos; // target lies left of the alignment
+            if (rpos + len > tpos) {
+                *_tpos = tpos;
+                return qpos + (tpos - rpos);
+            }
+            rpos += len;
+            qpos += len;
+            aligned_end = qpos;
+            continue;
+        }
+        if (op == BAM_CDEL || op == BAM_CREF_SKIP) { // consumes reference only
+            if (rpos + len > tpos) {
+                *_tpos = is_left? rpos : rpos + len;
+                return qpos;
+            }
+            rpos += len;
+        }
+    }
+    *_tpos = rpos;
+    return aligned_end;
+}
+// FIXME: check if the inserted sequence is consistent with the homopolymer run
+/*
+ * Sequencing-quality estimate for a gap.
+ *
+ * l      signed gap length (only its magnitude is used)
+ * l_run  length of the homopolymer run on the reference
+ *
+ * Returns the smaller of the affine gap quality (openQ + extQ per extra base)
+ * and, for runs of at least 3 bases, the tandemQ value scaled by gap length
+ * over run length.
+ */
+static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
+{
+    const int gap_len = abs(l);
+    const int gap_q = bca->openQ + bca->extQ * (gap_len - 1);
+    int run_q = 1000; // effectively "no homopolymer cap"
+
+    if (l_run >= 3)
+        run_q = (int)(bca->tandemQ * (double)gap_len / l_run + .499);
+    return gap_q < run_q? gap_q : run_q;
+}
+
+/*
+ * Estimate how far to the right of `pos` the reference keeps repeating the
+ * indel unit, i.e. the span over which the indel placement is ambiguous.
+ *
+ * pos   reference coordinate immediately before the indel
+ * ref   NUL-terminated reference sequence
+ * l     indel length; the sign is ignored, must be non-zero
+ * ins4  for insertions, the inserted bases encoded 0..4 (A,C,G,T,N);
+ *       NULL for deletions, in which case the reference bases following
+ *       pos serve as the repeat unit
+ *
+ * Scoring is +1 per base matching the cyclically repeated unit and -10 per
+ * mismatch; the scan stops once the running score turns negative. Returns
+ * the distance from pos to the best-scoring position (0 if none).
+ */
+static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
+{
+    int i, j, max = 0, max_i = pos, score = 0;
+    l = abs(l);
+    for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
+        // <ctype.h> functions require an unsigned char value (or EOF)
+        const int cur = toupper((unsigned char)ref[i]);
+        const int unit = ins4? "ACGTN"[(int)ins4[j%l]] : toupper((unsigned char)ref[pos+1+j%l]);
+        score += (cur != unit)? -10 : 1;
+        if (score < 0) break;
+        if (max < score) max = score, max_i = i;
+    }
+    return max_i - pos;
+}
+
+/*
+ * Prepare indel calling at reference position `pos`.
+ *
+ * @n:      number of samples
+ * @n_plp:  n_plp[s] is the number of pileup entries of sample s
+ * @plp:    plp[s] is the pileup array of sample s; p->aux of every entry is
+ *          overwritten (see below)
+ * @pos:    0-based reference coordinate of the pileup column
+ * @bca:    caller state; max_support, max_frac, indelreg, maxins, inscns and
+ *          indel_types[] are updated
+ * @ref:    NUL-terminated reference sequence of the current chromosome
+ * @rghash: optional set of read-group IDs (from bcf_call_add_rg()); when
+ *          given, reads whose RG tag is not in the set are ignored when
+ *          collecting indel candidates
+ *
+ * Steps: collect the distinct indel lengths ("types") seen at pos, build a
+ * per-sample consensus around pos, build a consensus for each insertion,
+ * realign every read against the reference modified by each type, and turn
+ * the alignment scores into per-read qualities.
+ *
+ * On return each p->aux holds (j<<16 | seqQ<<8 | indelQ), where j indexes
+ * bca->indel_types[] (j == 4 with zero qualities when the read's best type is
+ * not among the four kept).
+ *
+ * Returns 0 if at least one read supports a kept non-reference type, and -1
+ * otherwise (no indel here, insufficient support, too many types, missing
+ * arguments, or an N-rich reference window).
+ */
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
+                      const void *rghash)
+{
+    int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+    int N, K, l_run, ref_type, n_alt;
+    char *inscns = 0, *ref2, *query, **ref_sample;
+    khash_t(rg) *hash = (khash_t(rg)*)rghash;
+    if (ref == 0 || bca == 0) return -1;
+    // mark filtered reads
+    if (rghash) {
+        N = 0;
+        for (s = N = 0; s < n; ++s) {
+            for (i = 0; i < n_plp[s]; ++i) {
+                bam_pileup1_t *p = plp[s] + i;
+                const uint8_t *rg = bam_aux_get(p->b, "RG");
+                p->aux = 1; // filtered by default
+                if (rg) {
+                    khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
+                    if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
+                }
+            }
+        }
+        if (N == 0) return -1; // no reads left
+    }
+    // determine if there is a gap
+    for (s = N = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i)
+            if (plp[s][i].indel != 0) break;
+        if (i < n_plp[s]) break;
+    }
+    if (s == n) return -1; // there is no indel at this position.
+    for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
+    { // find out how many types of indels are present
+        bca->max_support = bca->max_frac = 0;
+        int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
+        uint32_t *aux;
+        aux = calloc(N + 1, 4);
+        m = max_rd_len = 0;
+        aux[m++] = MINUS_CONST; // zero indel is always a type
+        for (s = 0; s < n; ++s) {
+            int na = 0, nt = 0; // reads with an indel / reads considered, in this sample
+            for (i = 0; i < n_plp[s]; ++i) {
+                const bam_pileup1_t *p = plp[s] + i;
+                if (rghash == 0 || p->aux == 0) {
+                    ++nt;
+                    if (p->indel != 0) {
+                        ++na;
+                        aux[m++] = MINUS_CONST + p->indel;
+                    }
+                }
+                j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
+                if (j > max_rd_len) max_rd_len = j;
+            }
+            float frac = (float)na/nt;
+            if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
+                indel_support_ok = 1;
+            if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
+            n_alt += na;
+            n_tot += nt;
+        }
+        // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
+        // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
+        int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
+        // compare against the number of bases scanned (i-pos), not the absolute coordinate i
+        if ( nN*2>(i-pos) ) { free(aux); return -1; }
+
+        ks_introsort(uint32_t, m, aux);
+        // squeeze out identical types
+        for (i = 1, n_types = 1; i < m; ++i)
+            if (aux[i] != aux[i-1]) ++n_types;
+        // Taking totals makes it hard to call rare indels
+        if ( !bca->per_sample_flt )
+            indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
+        if ( n_types == 1 || !indel_support_ok ) { // then skip
+            free(aux); return -1;
+        }
+        if (n_types >= 64) { // type indices are later packed into 6 bits
+            free(aux);
+            if (bam_verbose >= 2)
+                fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
+            return -1;
+        }
+        types = (int*)calloc(n_types, sizeof(int));
+        t = 0;
+        types[t++] = aux[0] - MINUS_CONST;
+        for (i = 1; i < m; ++i)
+            if (aux[i] != aux[i-1])
+                types[t++] = aux[i] - MINUS_CONST;
+        free(aux);
+        for (t = 0; t < n_types; ++t)
+            if (types[t] == 0) break;
+        ref_type = t; // the index of the reference type (0)
+    }
+    { // calculate left and right boundary
+        left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+        right = pos + INDEL_WINDOW_SIZE;
+        if (types[0] < 0) right -= types[0]; // types[] is sorted, so types[0] is the longest deletion
+        // in case the alignments stand out the reference
+        for (i = pos; i < right; ++i)
+            if (ref[i] == 0) break;
+        right = i;
+    }
+    /* The following block fixes a long-existing flaw in the INDEL
+     * calling model: the interference of nearby SNPs. However, it also
+     * reduces the power because sometimes, substitutions caused by
+     * indels are not distinguishable from true mutations. Multiple
+     * sequence realignment helps to increase the power.
+     *
+     * Masks mismatches present in at least 70% of the reads with 'N'.
+     */
+    { // construct per-sample consensus
+        int L = right - left + 1, max_i, max2_i;
+        uint32_t *cns, max, max2;
+        char *ref0, *r;
+        ref_sample = calloc(n, sizeof(void*));
+        cns = calloc(L, 4);
+        ref0 = calloc(L, 1);
+        for (i = 0; i < right - left; ++i)
+            ref0[i] = bam_nt16_table[(int)ref[i+left]];
+        for (s = 0; s < n; ++s) {
+            r = ref_sample[s] = calloc(L, 1);
+            memset(cns, 0, sizeof(int) * L);
+            // collect ref and non-ref counts
+            // (low 16 bits of cns[]: matches to the reference; high 16 bits: mismatches)
+            for (i = 0; i < n_plp[s]; ++i) {
+                bam_pileup1_t *p = plp[s] + i;
+                bam1_t *b = p->b;
+                uint32_t *cigar = bam1_cigar(b);
+                uint8_t *seq = bam1_seq(b);
+                int x = b->core.pos, y = 0;
+                for (k = 0; k < b->core.n_cigar; ++k) {
+                    int op = cigar[k]&0xf;
+                    int j, l = cigar[k]>>4;
+                    if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                        for (j = 0; j < l; ++j)
+                            if (x + j >= left && x + j < right)
+                                cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
+                        x += l; y += l;
+                    } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
+                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+                }
+            }
+            // determine the consensus
+            for (i = 0; i < right - left; ++i) r[i] = ref0[i];
+            // find the two columns with the highest mismatch counts
+            max = max2 = 0; max_i = max2_i = -1;
+            for (i = 0; i < right - left; ++i) {
+                if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+                else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
+            }
+            // keep the reference base where it is supported by >= 70% of the reads
+            if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
+            if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
+            if (max_i >= 0) r[max_i] = 15;
+            if (max2_i >= 0) r[max2_i] = 15;
+            //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
+        }
+        free(ref0); free(cns);
+    }
+    { // the length of the homopolymer run around the current position
+        int c = bam_nt16_table[(int)ref[pos + 1]];
+        if (c == 15) l_run = 1;
+        else {
+            for (i = pos + 2; ref[i]; ++i)
+                if (bam_nt16_table[(int)ref[i]] != c) break;
+            l_run = i;
+            for (i = pos; i >= 0; --i)
+                if (bam_nt16_table[(int)ref[i]] != c) break;
+            l_run -= i + 1;
+        }
+    }
+    // construct the consensus sequence
+    max_ins = types[n_types - 1]; // max_ins is at least 0
+    if (max_ins > 0) {
+        int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
+        // count the number of occurrences of each base at each position for each type of insertion
+        for (t = 0; t < n_types; ++t) {
+            if (types[t] > 0) {
+                for (s = 0; s < n; ++s) {
+                    for (i = 0; i < n_plp[s]; ++i) {
+                        bam_pileup1_t *p = plp[s] + i;
+                        if (p->indel == types[t]) {
+                            uint8_t *seq = bam1_seq(p->b);
+                            for (k = 1; k <= p->indel; ++k) {
+                                int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)];
+                                assert(c<5);
+                                ++inscns_aux[(t*max_ins+(k-1))*5 + c];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        // use the majority rule to construct the consensus
+        inscns = calloc(n_types * max_ins, 1);
+        for (t = 0; t < n_types; ++t) {
+            for (j = 0; j < types[t]; ++j) {
+                int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+                for (k = 0; k < 5; ++k)
+                    if (ia[k] > max)
+                        max = ia[k], max_k = k;
+                inscns[t*max_ins + j] = max? max_k : 4;
+                if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
+            }
+        }
+        free(inscns_aux);
+    }
+    // compute the likelihood given each type of indel for each read
+    max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
+    ref2 = calloc(max_ref2, 1);
+    query = calloc(right - left + max_rd_len + max_ins + 2, 1);
+    score1 = calloc(N * n_types, sizeof(int));
+    score2 = calloc(N * n_types, sizeof(int));
+    bca->indelreg = 0;
+    for (t = 0; t < n_types; ++t) {
+        int l, ir;
+        kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+        apf1.bw = apf2.bw = abs(types[t]) + 3;
+        // compute indelreg
+        if (types[t] == 0) ir = 0;
+        else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+        else ir = est_indelreg(pos, ref, -types[t], 0);
+        if (ir > bca->indelreg) bca->indelreg = ir;
+//      fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
+        // realignment
+        for (s = K = 0; s < n; ++s) { // K is the read index across all samples
+            // write ref2
+            for (k = 0, j = left; j <= pos; ++j)
+                ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
+            if (types[t] <= 0) j += -types[t]; // deletion: skip the deleted reference bases
+            else for (l = 0; l < types[t]; ++l) // insertion: splice in the insertion consensus
+                ref2[k++] = inscns[t*max_ins + l];
+            for (; j < right && ref[j]; ++j)
+                ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
+            for (; k < max_ref2; ++k) ref2[k] = 4;
+            if (j < right) right = j;
+            // align each read to ref2
+            for (i = 0; i < n_plp[s]; ++i, ++K) {
+                bam_pileup1_t *p = plp[s] + i;
+                int qbeg, qend, tbeg, tend, sc, kk;
+                uint8_t *seq = bam1_seq(p->b);
+                uint32_t *cigar = bam1_cigar(p->b);
+                if (p->b->core.flag&4) continue; // unmapped reads
+                // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
+                for (kk = 0; kk < p->b->core.n_cigar; ++kk)
+                    if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
+                if (kk < p->b->core.n_cigar) continue;
+                // FIXME: the following skips soft clips, but using them may be more sensitive.
+                // determine the start and end of sequences for alignment
+                qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg);
+                qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend);
+                if (types[t] < 0) {
+                    int l = -types[t];
+                    tbeg = tbeg - l > left? tbeg - l : left;
+                }
+                // write the query sequence
+                for (l = qbeg; l < qend; ++l)
+                    query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)];
+                { // do realignment; this is the bottleneck
+                    const uint8_t *qual = bam1_qual(p->b), *bq;
+                    uint8_t *qq;
+                    qq = calloc(qend - qbeg, 1);
+                    bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+                    if (bq) ++bq; // skip type
+                    for (l = qbeg; l < qend; ++l) {
+                        qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
+                        // clamp the base quality to [7,30]
+                        if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
+                        if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
+                    }
+                    sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+                                    (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+                    l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
+                    if (l > 255) l = 255;
+                    // packed as (alignment score << 8 | length-normalised score)
+                    score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
+                    if (sc > 5) { // realign with the second parameter set
+                        sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+                                        (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+                        l = (int)(100. * sc / (qend - qbeg) + .499);
+                        if (l > 255) l = 255;
+                        score2[K*n_types + t] = sc<<8 | l;
+                    }
+                    free(qq);
+                }
+/*
+                for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
+                    fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
+                fputc('\n', stderr);
+                for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
+                fputc('\n', stderr);
+                fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
+*/
+            }
+        }
+    }
+    free(ref2); free(query);
+    { // compute indelQ
+        int *sc, tmp, *sumq;
+        sc = alloca(n_types * sizeof(int));
+        sumq = alloca(n_types * sizeof(int));
+        memset(sumq, 0, sizeof(int) * n_types);
+        for (s = K = 0; s < n; ++s) {
+            for (i = 0; i < n_plp[s]; ++i, ++K) {
+                bam_pileup1_t *p = plp[s] + i;
+                int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
+                // low 6 bits carry the type index so it survives the sort
+                for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+                for (t = 1; t < n_types; ++t) // insertion sort
+                    for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                        tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+                /* errmod_cal() assumes that if the call is wrong, the
+                 * likelihoods of other events are equal. This is about
+                 * right for substitutions, but is not desired for
+                 * indels. To reuse errmod_cal(), I have to make
+                 * compromise for multi-allelic indels.
+                 */
+                if ((sc[0]&0x3f) == ref_type) {
+                    indelQ1 = (sc[1]>>14) - (sc[0]>>14);
+                    seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+                } else {
+                    for (t = 0; t < n_types; ++t) // look for the reference type
+                        if ((sc[t]&0x3f) == ref_type) break;
+                    indelQ1 = (sc[t]>>14) - (sc[0]>>14);
+                    seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+                }
+                tmp = sc[0]>>6 & 0xff;
+                indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
+                sct = &score2[K*n_types];
+                for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+                for (t = 1; t < n_types; ++t) // insertion sort
+                    for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                        tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+                if ((sc[0]&0x3f) == ref_type) {
+                    indelQ2 = (sc[1]>>14) - (sc[0]>>14);
+                } else {
+                    for (t = 0; t < n_types; ++t) // look for the reference type
+                        if ((sc[t]&0x3f) == ref_type) break;
+                    indelQ2 = (sc[t]>>14) - (sc[0]>>14);
+                }
+                tmp = sc[0]>>6 & 0xff;
+                indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
+                // pick the smaller between indelQ1 and indelQ2
+                indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
+                if (indelQ > 255) indelQ = 255;
+                if (seqQ > 255) seqQ = 255;
+                p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
+                sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+//              fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
+            }
+        }
+        // determine bca->indel_types[] and bca->inscns
+        bca->maxins = max_ins;
+        bca->inscns = realloc(bca->inscns, bca->maxins * 4);
+        for (t = 0; t < n_types; ++t)
+            sumq[t] = sumq[t]<<6 | t;
+        for (t = 1; t < n_types; ++t) // insertion sort (descending by summed quality)
+            for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+                tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+        for (t = 0; t < n_types; ++t) // look for the reference type
+            if ((sumq[t]&0x3f) == ref_type) break;
+        if (t) { // then move the reference type to the first
+            tmp = sumq[t];
+            for (; t > 0; --t) sumq[t] = sumq[t-1];
+            sumq[0] = tmp;
+        }
+        for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+        for (t = 0; t < 4 && t < n_types; ++t) {
+            bca->indel_types[t] = types[sumq[t]&0x3f];
+            memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+        }
+        // update p->aux
+        for (s = n_alt = 0; s < n; ++s) {
+            for (i = 0; i < n_plp[s]; ++i) {
+                bam_pileup1_t *p = plp[s] + i;
+                int x = types[p->aux>>16&0x3f];
+                for (j = 0; j < 4; ++j)
+                    if (x == bca->indel_types[j]) break;
+                p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+                if ((p->aux>>16&0x3f) > 0) ++n_alt;
+//              fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d q=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>16&63, bca->indel_types[p->aux>>16&63], p->aux&0xff, p->aux>>8&0xff);
+            }
+        }
+    }
+    free(score1); free(score2);
+    // free
+    for (i = 0; i < n; ++i) free(ref_sample[i]);
+    free(ref_sample);
+    free(types); free(inscns);
+    return n_alt > 0? 0 : -1;
+}
diff --git a/samtools-0.1.19/bam2depth.c b/samtools-0.1.19/bam2depth.c
new file mode 100644
index 0000000..02311ef
--- /dev/null
+++ b/samtools-0.1.19/bam2depth.c
@@ -0,0 +1,143 @@
+/* This program demonstrates how to generate pileup from multiple BAMs
+ * simultaneously, to achieve random access and to use the BED interface.
+ * To compile this program separately, you may:
+ *
+ * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "bam.h"
+
+/* Per-input state handed to read_bam() through the pileup callback pointer. */
+typedef struct {
+    bamFile fp;        /* the file handler */
+    bam_iter_t iter;   /* NULL if a region not specified */
+    int min_mapQ;      /* mapQ filter */
+    int min_len;       /* length filter */
+} aux_t;
+
+// BED helpers; declared here because they are not defined in this file.
+void *bed_read(const char *fn); // read a BED or position list file
+void bed_destroy(void *_h); // destroy the BED data structure
+int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps
+
+/*
+ * Pileup read callback: fetch the next alignment of one input into b.
+ * `data` is the aux_t of that input. Alignments failing the mapping-quality
+ * or query-length filter are flagged BAM_FUNMAP rather than dropped
+ * (read level filters better go here to avoid pileup).
+ * Returns whatever the underlying reader returned.
+ */
+static int read_bam(void *data, bam1_t *b)
+{
+    aux_t *aux = (aux_t*)data;
+    int ret;
+
+    if (aux->iter) ret = bam_iter_read(aux->fp, aux->iter, b);
+    else ret = bam_read1(aux->fp, b);
+
+    if (b->core.flag & BAM_FUNMAP) return ret; // already unmapped: nothing to filter
+
+    if ((int)b->core.qual < aux->min_mapQ
+        || (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len))
+        b->core.flag |= BAM_FUNMAP;
+    return ret;
+}
+
+// Reads a list of file names from file_list into a newly allocated array (*argv, *n entries).
+// The caller below treats a non-zero return as failure and frees each entry and the array.
+int read_file_list(const char *file_list,int *n,char **argv[]);
+
+/*
+ * Entry point of "samtools depth": print, for every covered position, the
+ * per-input depth after the mapping-quality, read-length and base-quality
+ * filters. Output is tab-separated: chromosome, 1-based position, then one
+ * depth column per input BAM.
+ *
+ * Returns 0 on success and 1 on a usage error or when an input, its header or
+ * (for region queries) its index cannot be opened.
+ */
+#ifdef _MAIN_BAM2DEPTH
+int main(int argc, char *argv[])
+#else
+int main_depth(int argc, char *argv[])
+#endif
+{
+    int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles;
+    const bam_pileup1_t **plp;
+    char *reg = 0; // specified region
+    void *bed = 0; // BED data structure
+    char *file_list = NULL, **fn = NULL;
+    bam_header_t *h = 0; // BAM header of the 1st input
+    aux_t **data;
+    bam_mplp_t mplp;
+
+    // parse the command line
+    while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
+        switch (n) {
+            case 'l': min_len = atoi(optarg); break; // minimum query length
+            case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
+            case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
+            case 'q': baseQ = atoi(optarg); break;   // base quality threshold
+            case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
+            case 'f': file_list = optarg; break;
+        }
+    }
+    if (optind == argc && !file_list) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
+        fprintf(stderr, "Options:\n");
+        fprintf(stderr, " -b <bed> list of positions or regions\n");
+        fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
+        fprintf(stderr, " -l <int> minQLen\n");
+        fprintf(stderr, " -q <int> base quality threshold\n");
+        fprintf(stderr, " -Q <int> mapping quality threshold\n");
+        fprintf(stderr, " -r <chr:from-to> region\n");
+        fprintf(stderr, "\n");
+        // release what option parsing may have allocated
+        free(reg);
+        if (bed) bed_destroy(bed);
+        return 1;
+    }
+
+    // initialize the auxiliary data structures
+    if (file_list)
+    {
+        if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+        n = nfiles;
+        argv = fn;
+        optind = 0;
+    }
+    else
+        n = argc - optind; // the number of BAMs on the command line
+    data = calloc(n, sizeof(void*)); // data[i] for the i-th input
+    beg = 0; end = 1<<30; tid = -1;  // set the default region
+    for (i = 0; i < n; ++i) {
+        bam_header_t *htmp;
+        data[i] = calloc(1, sizeof(aux_t));
+        data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
+        if (data[i]->fp == 0) { // do not dereference a failed open below
+            fprintf(stderr, "[%s] fail to open '%s'.\n", __func__, argv[optind+i]);
+            return 1;
+        }
+        data[i]->min_mapQ = mapQ;    // set the mapQ filter
+        data[i]->min_len = min_len;  // set the qlen filter
+        htmp = bam_header_read(data[i]->fp); // read the BAM header
+        if (htmp == 0) {
+            fprintf(stderr, "[%s] fail to read the header of '%s'.\n", __func__, argv[optind+i]);
+            return 1;
+        }
+        if (i == 0) {
+            h = htmp; // keep the header of the 1st BAM
+            if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region
+        } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
+        if (tid >= 0) { // if a region is specified and parsed successfully
+            bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index
+            if (idx == 0) {
+                fprintf(stderr, "[%s] fail to load the index of '%s'.\n", __func__, argv[optind+i]);
+                return 1;
+            }
+            data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
+            bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
+        }
+    }
+
+    // the core multi-pileup loop
+    mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
+    n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
+    plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
+    while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
+        if (pos < beg || pos >= end) continue; // out of range; skip
+        if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
+        fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
+        for (i = 0; i < n; ++i) { // base level filters have to go here
+            int j, m = 0;
+            for (j = 0; j < n_plp[i]; ++j) {
+                const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know
+                if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
+                else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
+            }
+            printf("\t%d", n_plp[i] - m); // this the depth to output
+        }
+        putchar('\n');
+    }
+    free(n_plp); free(plp);
+    bam_mplp_destroy(mplp);
+
+    bam_header_destroy(h);
+    for (i = 0; i < n; ++i) {
+        bam_close(data[i]->fp);
+        if (data[i]->iter) bam_iter_destroy(data[i]->iter);
+        free(data[i]);
+    }
+    free(data); free(reg);
+    if (bed) bed_destroy(bed);
+    if ( file_list )
+    {
+        for (i=0; i<n; i++) free(fn[i]);
+        free(fn);
+    }
+    return 0;
+}
diff --git a/samtools-0.1.19/bam_aux.c b/samtools-0.1.19/bam_aux.c
new file mode 100644
index 0000000..4bbf975
--- /dev/null
+++ b/samtools-0.1.19/bam_aux.c
@@ -0,0 +1,217 @@
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+typedef char *str_p;
+KHASH_MAP_INIT_STR(s, int)
+KHASH_MAP_INIT_STR(r2l, str_p)
+
+/*
+ * Append one auxiliary field to the end of an alignment record.
+ *
+ * tag   two-character tag name
+ * type  one-character type code
+ * len   number of payload bytes in data
+ * data  payload, copied verbatim
+ *
+ * The record buffer is grown (capacity rounded up by kroundup32) when needed.
+ */
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+    const int start = b->data_len; // offset at which the new field begins
+    const int grow = 3 + len;      // 2 tag bytes + 1 type byte + payload
+    uint8_t *dst;
+
+    b->data_len += grow;
+    b->l_aux += grow;
+    if (b->m_data < b->data_len) {
+        b->m_data = b->data_len;
+        kroundup32(b->m_data);
+        b->data = (uint8_t*)realloc(b->data, b->m_data);
+    }
+    dst = b->data + start;
+    dst[0] = tag[0];
+    dst[1] = tag[1];
+    dst[2] = type;
+    memcpy(dst + 3, data, len);
+}
+
+// Thin wrapper taking a non-const record: forwards to bam_aux_get() and returns its result unchanged.
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
+{
+ return bam_aux_get(b, tag);
+}
+
+// Advance the pointer s past one aux value. On entry s points at the type
+// byte (i.e. just after the two tag characters); on exit it points at the
+// first byte after the value.
+//   Z/H  : skip up to and including the terminating NUL
+//   B    : skip the sub-type byte, the 32-bit element count read from (s)+1,
+//          and count * element-size bytes
+//   other: skip bam_aux_type2size(type) bytes
+// s is evaluated several times and assigned to, so it must be a plain lvalue.
+#define __skip_tag(s) do { \
+ int type = toupper(*(s)); \
+ ++(s); \
+ if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \
+ else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \
+ else (s) += bam_aux_type2size(type); \
+ } while(0)
+
+/*
+ * Look up an auxiliary field by its two-character tag.
+ * Returns a pointer to the field's type byte (the byte following the tag),
+ * or 0 when the record has no such tag.
+ */
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+{
+    const int want = tag[0]<<8 | tag[1];
+    uint8_t *const end = b->data + b->data_len;
+    uint8_t *cur = bam1_aux(b);
+
+    while (cur < end) {
+        const int have = (int)cur[0]<<8 | cur[1];
+        cur += 2;                      // now at the type byte
+        if (have == want) return cur;
+        __skip_tag(cur);               // step over this field's value
+    }
+    return 0;
+}
+// s MUST BE returned by bam_aux_get()
+/*
+ * Remove one auxiliary field from the record by shifting the bytes that
+ * follow it down over it, then shrinking data_len and l_aux. Always returns 0.
+ */
+int bam_aux_del(bam1_t *b, uint8_t *s)
+{
+    uint8_t *const aux_start = bam1_aux(b);
+    uint8_t *const field = s - 2; // the field starts at its two tag bytes
+    int removed;
+
+    __skip_tag(s);                // s now points just past the field
+    memmove(field, s, b->l_aux - (s - aux_start));
+    removed = s - field;
+    b->data_len -= removed;
+    b->l_aux -= removed;
+    return 0;
+}
+
+/*
+ * Keep only the auxiliary field s points into (a pointer as returned by
+ * bam_aux_get()) and discard all others; with s == 0 every auxiliary field
+ * is discarded. Always returns 0.
+ */
+int bam_aux_drop_other(bam1_t *b, uint8_t *s)
+{
+    uint8_t *aux_start, *field;
+    int kept;
+
+    if (s == 0) { // nothing to keep
+        b->data_len -= b->l_aux;
+        b->l_aux = 0;
+        return 0;
+    }
+    aux_start = bam1_aux(b);
+    field = s - 2;                 // include the two tag bytes
+    __skip_tag(s);                 // s now points just past the field
+    kept = s - field;
+    memmove(aux_start, field, kept);
+    b->data_len -= b->l_aux - kept;
+    b->l_aux = kept;
+    return 0;
+}
+
+/*
+ * Build the name -> target-index table of a header on first use.
+ * Does nothing when header->hash is already set.
+ */
+void bam_init_header_hash(bam_header_t *header)
+{
+    khash_t(s) *names;
+    int tid, absent;
+
+    if (header->hash != 0) return;
+    names = kh_init(s);
+    header->hash = names;
+    for (tid = 0; tid < header->n_targets; ++tid) {
+        khiter_t slot = kh_put(s, names, header->target_name[tid], &absent);
+        kh_value(names, slot) = tid;
+    }
+}
+
+/* Release the name table built by bam_init_header_hash(), if one exists. */
+void bam_destroy_header_hash(bam_header_t *header)
+{
+    if (header->hash == 0) return;
+    kh_destroy(s, (khash_t(s)*)header->hash);
+}
+
+/*
+ * Translate a sequence name to its target index using the header's name
+ * table. Returns -1 when the name is unknown. The table must already have
+ * been built (see bam_init_header_hash()).
+ */
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
+{
+    khash_t(s) *names = (khash_t(s)*)header->hash;
+    khint_t slot = kh_get(s, names, seq_name);
+
+    if (slot == kh_end(names)) return -1;
+    return kh_value(names, slot);
+}
+
+/*
+ * Parse a region string of the form "name", "name:beg" or "name:beg-end".
+ *
+ * header  BAM header; its name table is built on demand
+ * str     region string; white space is ignored and commas inside the
+ *         numbers are removed
+ * ref_id  out: target index of the sequence, -1 on failure
+ * beg     out: 0-based start (the 1-based value in str minus one); 0 when
+ *         no interval is given
+ * end     out: end coordinate as written; 1<<29 when omitted
+ *
+ * The last ':' separates the name from the interval, unless what follows is
+ * not a well-formed interval, in which case the whole string is tried as a
+ * sequence name.
+ *
+ * Returns 0 on success, -1 if the sequence name is unknown or beg > end.
+ */
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
+{
+    char *s;
+    int i, l, k, name_end;
+    khiter_t iter;
+    khash_t(s) *h;
+
+    bam_init_header_hash(header);
+    h = (khash_t(s)*)header->hash;
+
+    *ref_id = *beg = *end = -1;
+    name_end = l = strlen(str);
+    s = (char*)malloc(l+1);
+    // remove space
+    // (ctype functions need an unsigned char value; a negative plain char is undefined)
+    for (i = k = 0; i < l; ++i)
+        if (!isspace((unsigned char)str[i])) s[k++] = str[i];
+    s[k] = 0; l = k;
+    // determine the sequence name
+    for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+    if (i >= 0) name_end = i;
+    if (name_end < l) { // check if this is really the end
+        int n_hyphen = 0;
+        for (i = name_end + 1; i < l; ++i) {
+            if (s[i] == '-') ++n_hyphen;
+            else if (!isdigit((unsigned char)s[i]) && s[i] != ',') break;
+        }
+        if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
+        s[name_end] = 0;
+        iter = kh_get(s, h, s);
+        if (iter == kh_end(h)) { // cannot find the sequence name
+            iter = kh_get(s, h, str); // try str as the name
+            if (iter == kh_end(h)) {
+                if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
+                free(s); return -1;
+            } else s[name_end] = ':', name_end = l;
+        }
+    } else iter = kh_get(s, h, str);
+    if (iter == kh_end(h)) {
+        free(s);
+        return -1;
+    }
+    *ref_id = kh_val(h, iter);
+    // parse the interval
+    if (name_end < l) {
+        for (i = k = name_end + 1; i < l; ++i) // strip thousands separators in place
+            if (s[i] != ',') s[k++] = s[i];
+        s[k] = 0;
+        *beg = atoi(s + name_end + 1);
+        for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+        *end = i < k? atoi(s + i + 1) : 1<<29;
+        if (*beg > 0) --*beg; // convert the start to 0-based
+    } else *beg = 0, *end = 1<<29;
+    free(s);
+    return *beg <= *end? 0 : -1;
+}
+
+/*
+ * Decode an integer auxiliary value.
+ *
+ * s  pointer to the field's type byte, as returned by bam_aux_get(); may be 0
+ *
+ * Returns the value for types c, C, s, S, i and I (an 'I' value is returned
+ * reinterpreted as int32_t); returns 0 for a NULL pointer or any other type.
+ *
+ * The payload is copied out with memcpy because aux data carries no
+ * alignment guarantee; dereferencing it through a wider integer pointer is
+ * undefined behaviour on strict-alignment targets.
+ */
+int32_t bam_aux2i(const uint8_t *s)
+{
+    int type;
+    if (s == 0) return 0;
+    type = *s++;
+    if (type == 'c') { int8_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+    else if (type == 'C') return (int32_t)*s;
+    else if (type == 's') { int16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+    else if (type == 'S') { uint16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
+    else if (type == 'i' || type == 'I') { int32_t v; memcpy(&v, s, sizeof v); return v; }
+    else return 0;
+}
+
+/*
+ * Decode a single-precision ('f') auxiliary value.
+ * s points at the field's type byte (see bam_aux_get()). Returns 0.0 for a
+ * NULL pointer or a field of any other type.
+ */
+float bam_aux2f(const uint8_t *s)
+{
+    int type;
+    float v;
+    if (s == 0) return 0.0; // must be tested before s is dereferenced
+    type = *s++;
+    if (type != 'f') return 0.0;
+    memcpy(&v, s, sizeof v); // aux payload is not guaranteed to be aligned
+    return v;
+}
+
+/*
+ * Decode a double-precision ('d') auxiliary value.
+ * s points at the field's type byte (see bam_aux_get()). Returns 0.0 for a
+ * NULL pointer or a field of any other type.
+ */
+double bam_aux2d(const uint8_t *s)
+{
+    int type;
+    double v;
+    if (s == 0) return 0.0; // must be tested before s is dereferenced
+    type = *s++;
+    if (type != 'd') return 0.0;
+    memcpy(&v, s, sizeof v); // aux payload is not guaranteed to be aligned
+    return v;
+}
+
+/*
+ * Decode a single-character ('A') auxiliary value.
+ * s points at the field's type byte (see bam_aux_get()). Returns 0 for a
+ * NULL pointer or a field of any other type.
+ */
+char bam_aux2A(const uint8_t *s)
+{
+    int type;
+    if (s == 0) return 0; // must be tested before s is dereferenced
+    type = *s++;
+    if (type == 'A') return *(char*)s;
+    else return 0;
+}
+
+/*
+ * Return the string payload of a 'Z' or 'H' auxiliary field.
+ * s points at the field's type byte (see bam_aux_get()). The returned
+ * pointer refers into the record itself (nothing is copied). Returns 0 for
+ * a NULL pointer or a field of any other type.
+ */
+char *bam_aux2Z(const uint8_t *s)
+{
+    int type;
+    if (s == 0) return 0; // must be tested before s is dereferenced
+    type = *s++;
+    if (type == 'Z' || type == 'H') return (char*)s;
+    else return 0;
+}
+
+#ifdef _WIN32
+/*
+ * Fallback for platforms without drand48(): a pseudo-random double built on
+ * rand(). Dividing by RAND_MAX + 1 keeps the result in [0, 1); dividing by
+ * RAND_MAX alone could return exactly 1.0.
+ */
+double drand48(void)
+{
+    return (double)rand() / ((double)RAND_MAX + 1.0);
+}
+#endif
diff --git a/samtools-0.1.19/bam_cat.c b/samtools-0.1.19/bam_cat.c
new file mode 100644
index 0000000..a7502b9
--- /dev/null
+++ b/samtools-0.1.19/bam_cat.c
@@ -0,0 +1,185 @@
+/*
+
+bam_cat -- efficiently concatenates bam files
+
+bam_cat can be used to concatenate BAM files. Under special
+circumstances, it can be used as an alternative to 'samtools merge' to
+concatenate multiple sorted files into a single sorted file. For this
+to work each file must be sorted, and the sorted files must be given
+as command line arguments in order such that the final read in file i
+is less than or equal to the first read in file i+1.
+
+This code is derived from the bam_reheader function in samtools 0.1.8
+and modified to perform concatenation by Chris Saunders on behalf of
+Illumina.
+
+
+########## License:
+
+The MIT License
+
+Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
+Modified SAMtools work copyright (c) 2010 Illumina, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+
+/*
+makefile:
+"""
+CC=gcc
+CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR)
+LDFLAGS+=-L$(SAMTOOLS_DIR)
+LDLIBS+=-lbam -lz
+
+all:bam_cat
+"""
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "knetfile.h"
+#include "bgzf.h"
+#include "bam.h"
+
+#define BUF_SIZE 0x10000 // size in bytes of the raw copy buffer used by bam_cat()
+
+// The two bytes bam_cat() expects at the start of each input's final block
+// (the gzip member identification bytes).
+#define GZIPID1 31
+#define GZIPID2 139
+
+// Number of trailing bytes of every input that bam_cat() withholds and checks
+// for an empty block (NOTE(review): 28 matches the BGZF end-of-file marker size -- confirm against the BAM spec).
+#define BGZF_EMPTY_BLOCK_SIZE 28
+
+
+/*
+ * Concatenate BAM files at the compressed-block level.
+ *
+ * nfn     number of input files
+ * fn      input file names; "-" means stdin
+ * h       header to write to the output, or NULL to reuse the header of the
+ *         first input
+ * outbam  output file name; "-" means stdout
+ *
+ * For each input, whatever is left of the block holding the end of its
+ * header is re-compressed into the output; all following bytes are copied
+ * raw. The last BGZF_EMPTY_BLOCK_SIZE bytes of each input are withheld and
+ * dropped when they form an empty gzip block, so that only the terminator
+ * written by the final bgzf_close() remains.
+ *
+ * Returns 0 on success, 1 if the output cannot be opened, -1 on an input
+ * error.
+ * NOTE(review): the error returns inside the loop leave buf, fp and in
+ * unreleased; kept as in the original because closing fp would also write
+ * to a partially produced output.
+ */
+int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
+{
+    BGZF *fp;
+    FILE* fp_file;
+    uint8_t *buf;
+    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; // always the last `es` bytes read so far
+    const int es=BGZF_EMPTY_BLOCK_SIZE;
+    int i;
+
+    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
+    if (fp == 0) {
+        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
+        return 1;
+    }
+    if (h) bam_header_write(fp, h);
+
+    buf = (uint8_t*) malloc(BUF_SIZE);
+    for(i = 0; i < nfn; ++i){
+        BGZF *in;
+        bam_header_t *old;
+        int len,j;
+
+        in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r");
+        if (in == 0) {
+            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+            return -1;
+        }
+        if (in->is_write) return -1;
+
+        old = bam_header_read(in);
+        if (h == 0 && i == 0) bam_header_write(fp, old);
+
+        // alignments sharing a block with the end of the header must be re-compressed
+        if (in->block_offset < in->block_length) {
+            bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
+            bgzf_flush(fp);
+        }
+
+        j=0; // becomes 1 once ebuf holds data
+#ifdef _USE_KNETFILE
+        fp_file = fp->fp;
+        while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) {
+#else
+        fp_file = fp->fp;
+        while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
+#endif
+            if(len<es){
+                int diff=es-len;
+                if(j==0) {
+                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
+                    return -1;
+                }
+                // the oldest `len` withheld bytes are no longer part of the tail
+                fwrite(ebuf, 1, len, fp_file);
+                // source and destination overlap when diff > len, so memmove is required
+                memmove(ebuf,ebuf+len,diff);
+                memcpy(ebuf+diff,buf,len);
+            } else {
+                if(j!=0) fwrite(ebuf, 1, es, fp_file);
+                len-= es;
+                memcpy(ebuf,buf+len,es);
+                fwrite(buf, 1, len, fp_file);
+            }
+            j=1;
+        }
+
+        /* check final gzip block */
+        {
+            const uint8_t gzip1=ebuf[0];
+            const uint8_t gzip2=ebuf[1];
+            const uint32_t isize=*((uint32_t*)(ebuf+es-4));
+            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
+                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
+                fprintf(stderr, " Possible output corruption.\n");
+                fwrite(ebuf, 1, es, fp_file);
+            }
+        }
+        bam_header_destroy(old);
+        bgzf_close(in);
+    }
+    free(buf);
+    bgzf_close(fp);
+    return 0;
+}
+
+
+
+/*
+ * Entry point of "samtools cat".
+ *
+ * Options:
+ *   -h FILE  take the output header from this SAM file
+ *   -o FILE  write to FILE instead of stdout
+ * At least two input BAM files are required.
+ *
+ * Returns the result of bam_cat(), or 1 on a usage error or when the header
+ * file cannot be opened.
+ */
+int main_cat(int argc, char *argv[])
+{
+    bam_header_t *h = 0;
+    char *outfn = 0;
+    int c, ret;
+    while ((c = getopt(argc, argv, "h:o:")) >= 0) {
+        switch (c) {
+            case 'h': {
+                tamFile fph = sam_open(optarg);
+                if (fph == 0) {
+                    // report the header file that failed, not argv[1]
+                    fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
+                    free(outfn);
+                    return 1;
+                }
+                h = sam_header_read(fph);
+                sam_close(fph);
+                break;
+            }
+            case 'o': outfn = strdup(optarg); break;
+        }
+    }
+    if (argc - optind < 2) {
+        fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
+        if (h) bam_header_destroy(h);
+        free(outfn);
+        return 1;
+    }
+    ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+    if (h) bam_header_destroy(h); // bam_cat() takes the header as const and does not own it
+    free(outfn);
+    return ret;
+}
diff --git a/samtools-0.1.19/bam_color.c b/samtools-0.1.19/bam_color.c
new file mode 100644
index 0000000..8b86e2f
--- /dev/null
+++ b/samtools-0.1.19/bam_color.c
@@ -0,0 +1,145 @@
+#include <ctype.h>
+#include "bam.h"
+
+/*!
+  @abstract     Fetch one color from the CS tag of an alignment
+  @param b      pointer to an alignment
+  @param i      The i-th position, 0-based
+  @return       the color character, or 0 if the alignment carries no CS tag
+
+  @discussion   On the forward strand the color is read at offset i+1 of the
+                tag string (offset 0 is skipped as the leading adaptor). On the
+                reverse strand the offset is counted back from the end of the
+                string and reduced by the length of a leading hard clip.
+ */
+char bam_aux_getCSi(bam1_t *b, int i)
+{
+    uint8_t *tag = bam_aux_get(b, "CS");
+    char *colors;
+    uint32_t first_op;
+
+    // no CS tag: nothing to report
+    if (tag == 0) return 0;
+
+    colors = bam_aux2Z(tag);
+    if (!bam1_strand(b)) {
+        // forward strand: skip the leading adaptor
+        return colors[i + 1];
+    }
+    // reverse strand: index from the end of the color string
+    i = strlen(colors) - 1 - i;
+    first_op = bam1_cigar(b)[0];
+    if ((first_op & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
+        // adjust for leading hard clip
+        i -= first_op >> BAM_CIGAR_SHIFT;
+    }
+    return colors[i];
+}
+
+/*!
+  @abstract     Fetch one color quality from the CQ tag of an alignment
+  @param b      pointer to an alignment
+  @param i      The i-th position, 0-based
+  @return       the color quality character, or 0 if the alignment carries no CQ tag
+
+  @discussion   On the forward strand the quality is read at offset i. On the
+                reverse strand the offset is counted back from the end of the
+                string and reduced by the length of a leading hard clip.
+ */
+char bam_aux_getCQi(bam1_t *b, int i)
+{
+    uint8_t *tag = bam_aux_get(b, "CQ");
+    char *quals;
+    uint32_t first_op;
+
+    // no CQ tag: nothing to report
+    if (tag == 0) return 0;
+
+    quals = bam_aux2Z(tag);
+    if (!bam1_strand(b)) return quals[i];
+
+    // reverse strand: index from the end of the quality string
+    i = strlen(quals) - 1 - i;
+    first_op = bam1_cigar(b)[0];
+    if ((first_op & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
+        // adjust for leading hard clip
+        i -= (first_op >> BAM_CIGAR_SHIFT);
+    }
+    return quals[i];
+}
+
+/*
+ * Map a nucleotide letter (case-insensitive) to a small integer:
+ * A -> 0, C -> 1, G -> 2, T -> 3, anything else -> 4.
+ */
+char bam_aux_nt2int(char a)
+{
+    const int base = toupper(a);
+    if (base == 'A') return 0;
+    if (base == 'C') return 1;
+    if (base == 'G') return 2;
+    if (base == 'T') return 3;
+    return 4;
+}
+
+/*
+ * Return the color character ('0'..'3') for the transition between two
+ * nucleotides, computed as the XOR of their bam_aux_nt2int() codes.
+ * Returns '4' when either nucleotide is not one of A, C, G, T.
+ */
+char bam_aux_ntnt2cs(char a, char b)
+{
+    const char first = bam_aux_nt2int(a);
+    const char second = bam_aux_nt2int(b);
+    if (first == 4 || second == 4) return '4';
+    return "0123"[(int)(first ^ second)];
+}
+
+/*!
+  @abstract     Get the color error profile at the given position
+  @param b      pointer to an alignment
+  @param i      The i-th position, 0-based
+  @return       the original color if the color was an error, '-' (dash) otherwise
+
+  @discussion   Returns 0 if no color information is found. The color stored
+                in the CS tag is compared with the color implied by the
+                previous and current bases of the read sequence.
+ */
+char bam_aux_getCEi(bam1_t *b, int i)
+{
+    uint8_t *tag = bam_aux_get(b, "CS");
+    char *cs;
+    char prev_base, cur_base;
+    char observed, expected;
+    int cs_i;
+
+    // no CS tag: nothing to report
+    if (tag == 0) return 0;
+
+    cs = bam_aux2Z(tag);
+
+    if (bam1_strand(b)) { // reverse strand
+        uint32_t first_op = bam1_cigar(b)[0];
+        cs_i = strlen(cs) - 1 - i;
+        // adjust for leading hard clip
+        if ((first_op & BAM_CIGAR_MASK) == BAM_CHARD_CLIP)
+            cs_i -= first_op >> BAM_CIGAR_SHIFT;
+        // previous base; next to the adaptor the adaptor itself is used, complemented
+        if (cs_i == 1)
+            prev_base = "TGCAN"[(int)bam_aux_nt2int(cs[0])];
+        else
+            prev_base = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+    } else { // forward strand
+        cs_i = i + 1;
+        // previous base; at the first position it is the adaptor stored in cs[0]
+        if (i == 0)
+            prev_base = cs[0];
+        else
+            prev_base = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+    }
+
+    observed = cs[cs_i];
+    cur_base = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+
+    // color implied by the base sequence
+    expected = bam_aux_ntnt2cs(prev_base, cur_base);
+
+    return (observed == expected) ? '-' : observed;
+}
diff --git a/samtools-0.1.19/bam_endian.h b/samtools-0.1.19/bam_endian.h
new file mode 100644
index 0000000..0fc74a8
--- /dev/null
+++ b/samtools-0.1.19/bam_endian.h
@@ -0,0 +1,42 @@
+#ifndef BAM_ENDIAN_H
+#define BAM_ENDIAN_H
+
+#include <stdint.h>
+
+/* Return 1 when the lowest-addressed byte of a long holding 1 is zero (big-endian host), else 0. */
+static inline int bam_is_big_endian()
+{
+    const long probe = 1;
+    const char *lowest_byte = (const char *)&probe;
+    return *lowest_byte == 0;
+}
+/* Return v with its two bytes exchanged. */
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+    const unsigned lo = v & 0xFFU;
+    const unsigned hi = (v >> 8) & 0xFFU;
+    return (uint16_t)((lo << 8) | hi);
+}
+/* Byte-swap the 16-bit value stored at x in place and return x. */
+static inline void *bam_swap_endian_2p(void *x)
+{
+    uint16_t *word = (uint16_t*)x;
+    *word = bam_swap_endian_2(*word);
+    return x;
+}
+/* Return v with its four bytes in reverse order. */
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+    return ((v & 0x000000FFU) << 24)
+         | ((v & 0x0000FF00U) << 8)
+         | ((v & 0x00FF0000U) >> 8)
+         | ((v & 0xFF000000U) >> 24);
+}
+/* Byte-swap the 32-bit value stored at x in place and return x. */
+static inline void *bam_swap_endian_4p(void *x)
+{
+    uint32_t *word = (uint32_t*)x;
+    *word = bam_swap_endian_4(*word);
+    return x;
+}
+/* Return v with its eight bytes in reverse order. */
+static inline uint64_t bam_swap_endian_8(uint64_t v)
+{
+    uint64_t reversed = 0;
+    int byte;
+    // peel bytes off the low end of v and push them onto the low end of the result
+    for (byte = 0; byte < 8; ++byte) {
+        reversed = (reversed << 8) | (v & 0xFFU);
+        v >>= 8;
+    }
+    return reversed;
+}
+/* Byte-swap the 64-bit value stored at x in place and return x. */
+static inline void *bam_swap_endian_8p(void *x)
+{
+    uint64_t *word = (uint64_t*)x;
+    *word = bam_swap_endian_8(*word);
+    return x;
+}
+
+#endif
diff --git a/samtools-0.1.19/bam_import.c b/samtools-0.1.19/bam_import.c
new file mode 100644
index 0000000..da2bf94
--- /dev/null
+++ b/samtools-0.1.19/bam_import.c
@@ -0,0 +1,489 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#ifdef _WIN32
+#include <fcntl.h>
+#endif
+#include "kstring.h"
+#include "bam.h"
+#include "sam_header.h"
+#include "kseq.h"
+#include "khash.h"
+
+KSTREAM_INIT(gzFile, gzread, 16384)
+KHASH_MAP_INIT_STR(ref, uint64_t)
+
+void bam_init_header_hash(bam_header_t *header);
+void bam_destroy_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+/*
+ * Lookup table indexed by character code, giving the 4-bit base code that
+ * sam_read1() packs into the sequence field.
+ * '=' maps to 0; the digits '0'..'3' map to 1, 2, 4, 8; letters map
+ * identically in upper and lower case (A=1, C=2, G=4, T=8, plus the
+ * ambiguity letters M, R, S, V, W, Y, H, K, D, B); every other character,
+ * including N, maps to 15. The codes match the positions of the letters in
+ * bam_nt16_rev_table.
+ */
+unsigned char bam_nt16_table[256] = {
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+/*
+ * Lookup table indexed by character code, giving the BAM flag bit for a
+ * flag written as a string of letters (used by sam_read1() when the FLAG
+ * field is not a plain number):
+ *   '1' READ1, '2' READ2, 'P' PROPER_PAIR, 'R' MREVERSE, 'U' MUNMAP,
+ *   'd' DUP, 'f' QCFAIL, 'p' PAIRED, 'r' REVERSE, 's' SECONDARY, 'u' UNMAP.
+ * Every other character contributes no bits.
+ */
+unsigned short bam_char2flag_table[256] = {
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
+ BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
+};
+
+/* Reverse of bam_nt16_table: the character at index k is the letter for 4-bit base code k. */
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
+
+/* State of an open SAM text stream (created by sam_open(), released by sam_close()). */
+struct __tamFile_t {
+ gzFile fp; // underlying zlib handle
+ kstream_t *ks; // buffered tokenizer reading from fp
+ kstring_t *str; // scratch buffer holding the most recently read token
+ uint64_t n_lines; // lines consumed so far; used in parse error messages
+ int is_first; // set by sam_header_read(): str already holds the first field of the first record
+};
+
+/*
+ * Read the lines of a (possibly gzip-compressed) text file into a newly
+ * allocated array of newly allocated strings. "-" means standard input.
+ * A trailing '\r' is stripped from each line. Reading stops at the first
+ * empty line or at end of file.
+ *
+ * On return *_n holds the number of lines. The caller owns the array and
+ * every string in it. Returns 0 with *_n == 0 when the file cannot be
+ * opened or contains no lines.
+ */
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
+{
+    char **list = 0, *s;
+    int n = 0, dret, m = 0;
+    gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+    kstream_t *ks;
+    kstring_t *str;
+    if (fp == 0) { // do not hand a null handle to the stream reader
+        fprintf(stderr, "[%s] fail to open file '%s'.\n", __func__, fn);
+        *_n = 0;
+        return 0;
+    }
+    str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    ks = ks_init(fp);
+    while (ks_getuntil(ks, '\n', str, &dret) > 0) {
+        if (n == m) { // grow the array geometrically
+            m = m? m << 1 : 16;
+            list = (char**)realloc(list, m * sizeof(char*));
+        }
+        if (str->s[str->l-1] == '\r')
+            str->s[--str->l] = '\0';
+        s = list[n++] = (char*)calloc(str->l + 1, 1);
+        strcpy(s, str->s);
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    *_n = n;
+    return list;
+}
+
+/*
+ * Build a BAM header from a name -> value hash in which each value packs
+ * the sequence length in its upper 32 bits and the target index in its
+ * lower 32 bits. The header borrows the key strings of the hash; they are
+ * not copied.
+ */
+static bam_header_t *hash2header(const kh_ref_t *hash)
+{
+    khiter_t it;
+    bam_header_t *hdr = bam_header_init();
+    hdr->n_targets = kh_size(hash);
+    hdr->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
+    hdr->target_len = (uint32_t*)calloc(kh_size(hash), 4);
+    for (it = kh_begin(hash); it != kh_end(hash); ++it) {
+        uint64_t packed;
+        int slot;
+        if (!kh_exist(hash, it)) continue;
+        packed = kh_value(hash, it);
+        slot = (int)packed; // low 32 bits: target index
+        hdr->target_name[slot] = (char*)kh_key(hash, it);
+        hdr->target_len[slot] = packed>>32; // high 32 bits: length
+    }
+    bam_init_header_hash(hdr);
+    return hdr;
+}
+/*
+ * Build a BAM header from a whitespace-separated list file whose first two
+ * columns are a sequence name and its length. "-" means standard input.
+ * The rest of each line is ignored.
+ *
+ * Returns the new header, or 0 if fn is null, the file cannot be opened,
+ * or a sequence name occurs more than once.
+ */
+bam_header_t *sam_header_read2(const char *fn)
+{
+    bam_header_t *header;
+    int c, dret, ret, error = 0;
+    gzFile fp;
+    kstream_t *ks;
+    kstring_t *str;
+    kh_ref_t *hash;
+    khiter_t k;
+    if (fn == 0) return 0;
+    fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+    if (fp == 0) return 0;
+    hash = kh_init(ref);
+    ks = ks_init(fp);
+    str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    while (ks_getuntil(ks, 0, str, &dret) > 0) {
+        char *s = strdup(str->s);
+        int len, i;
+        i = kh_size(hash); // index of this sequence = number of names seen so far
+        ks_getuntil(ks, 0, str, &dret);
+        len = atoi(str->s);
+        k = kh_put(ref, hash, s, &ret);
+        if (ret == 0) {
+            fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s);
+            error = 1;
+            free(s); // the hash kept its existing key, so this copy is unowned
+        } else kh_value(hash, k) = (uint64_t)len<<32 | i;
+        if (dret != '\n')
+            while ((c = ks_getc(ks)) != '\n' && c != -1);
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+    if (error) {
+        // no header will take ownership of the names: release them and the hash
+        for (k = kh_begin(hash); k != kh_end(hash); ++k)
+            if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
+        kh_destroy(ref, hash);
+        return 0;
+    }
+    header = hash2header(hash);
+    kh_destroy(ref, hash);
+    return header;
+}
+/*
+ * Make sure b->data can hold at least `size` bytes, growing it with
+ * realloc() to `size` rounded by kroundup32 when needed.
+ * Returns b->data, which may have moved.
+ */
+static inline uint8_t *alloc_data(bam1_t *b, int size)
+{
+    if (b->m_data >= size) return b->data; // already large enough
+    b->m_data = size;
+    kroundup32(b->m_data);
+    b->data = (uint8_t*)realloc(b->data, b->m_data);
+    return b->data;
+}
+/* Print a parse error for the given line number to stderr and abort the process. */
+static inline void parse_error(int64_t n_lines, const char * __restrict msg)
+{
+    const long long line_no = (long long)n_lines;
+    fprintf(stderr, "Parse error at line %lld: %s\n", line_no, msg);
+    abort();
+}
+/*
+ * Append str->l + 1 bytes of str->s (the token plus the delimiter the
+ * caller stored after it) to header->text, growing the buffer when the
+ * rounded-up size changes, and keep the text null-terminated.
+ * Aborts the process when the buffer cannot be allocated.
+ */
+static inline void append_text(bam_header_t *header, kstring_t *str)
+{
+    size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
+    kroundup32(x); kroundup32(y);
+    if (x < y)
+    {
+        header->n_text = y;
+        header->text = (char*)realloc(header->text, y);
+        if ( !header->text )
+        {
+            // size_t values are cast to match the %ld conversions
+            fprintf(stderr,"realloc failed to alloc %ld bytes\n", (long)y);
+            abort();
+        }
+    }
+    // Sanity check
+    if ( header->l_text+str->l+1 >= header->n_text )
+    {
+        fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", (long)(header->l_text+str->l+1),(long)header->n_text,(long)x,(long)y);
+        abort();
+    }
+    strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
+    header->l_text += str->l + 1;
+    header->text[header->l_text] = 0;
+}
+
+/*
+ * Rebuild h->target_name and h->target_len from the @SQ lines of the
+ * header text (SN and LN tags), parsing the text into h->dict first when
+ * no dictionary exists yet.
+ * Returns the number of targets; 0 when the text is shorter than 3 bytes
+ * or holds no @SQ SN entries.
+ */
+int sam_header_parse(bam_header_t *h)
+{
+    char **values;
+    int t;
+
+    // drop the previous target arrays
+    free(h->target_len);
+    free(h->target_name);
+    h->n_targets = 0;
+    h->target_len = 0;
+    h->target_name = 0;
+
+    if (h->l_text < 3) return 0;
+    if (h->dict == 0) h->dict = sam_header_parse2(h->text);
+
+    // sequence names
+    values = sam_header2list(h->dict, "SQ", "SN", &h->n_targets);
+    if (h->n_targets == 0) return 0;
+    h->target_name = calloc(h->n_targets, sizeof(void*));
+    for (t = 0; t < h->n_targets; ++t)
+        h->target_name[t] = strdup(values[t]);
+    free(values);
+
+    // sequence lengths
+    values = sam_header2list(h->dict, "SQ", "LN", &h->n_targets);
+    h->target_len = calloc(h->n_targets, 4);
+    for (t = 0; t < h->n_targets; ++t)
+        h->target_len[t] = atoi(values[t]);
+    free(values);
+
+    return h->n_targets;
+}
+
+/*
+ * Read the leading '@' lines of a SAM stream into a new header, then parse
+ * them into the target arrays and the name hash.
+ * The first token that does not start with '@' is left in fp->str and
+ * fp->is_first is set so that sam_read1() picks it up.
+ */
+bam_header_t *sam_header_read(tamFile fp)
+{
+    bam_header_t *hdr = bam_header_init();
+    kstring_t *tok = fp->str;
+    int delim;
+    for (;;) {
+        if (ks_getuntil(fp->ks, KS_SEP_TAB, tok, &delim) < 0) break;
+        if (tok->s[0] != '@') break; // first record reached
+        tok->s[tok->l] = delim; // note that tok->s is NOT null terminated!!
+        append_text(hdr, tok);
+        if (delim != '\n') { // copy the remainder of the header line
+            ks_getuntil(fp->ks, '\n', tok, &delim);
+            tok->s[tok->l] = '\n'; // NOT null terminated!!
+            append_text(hdr, tok);
+        }
+        ++fp->n_lines;
+    }
+    sam_header_parse(hdr);
+    bam_init_header_hash(hdr);
+    fp->is_first = 1;
+    return hdr;
+}
+
+/*
+ * Parse one SAM text record from fp into b.
+ *
+ * Fields are read in SAM column order: name, flag, reference, position,
+ * mapping quality, CIGAR, mate reference, mate position, insert size,
+ * sequence, quality, then optional auxiliary tags. The encoded fields are
+ * appended to b->data in that order.
+ *
+ * Returns the accumulated length of the tokens read (each plus one), -1 at
+ * end of input, or -2 .. -6 when the input ends in the middle of a record.
+ * Malformed fields abort the process via parse_error(); a record that
+ * names a reference while the header has no targets exits with status 1.
+ *
+ * Characters are cast to unsigned char before being used as table indices
+ * or passed to <ctype.h> functions, so bytes >= 0x80 are safe.
+ */
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+    int ret, doff, doff0, dret, z = 0;
+    bam1_core_t *c = &b->core;
+    kstring_t *str = fp->str;
+    kstream_t *ks = fp->ks;
+
+    if (fp->is_first) { // the first field was already read by sam_header_read()
+        fp->is_first = 0;
+        ret = str->l;
+    } else {
+        do { // special consideration for empty lines
+            ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+            if (ret >= 0) z += str->l + 1;
+        } while (ret == 0);
+    }
+    if (ret < 0) return -1;
+    ++fp->n_lines;
+    doff = 0;
+
+    { // name
+        c->l_qname = strlen(str->s) + 1;
+        memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
+        doff += c->l_qname;
+    }
+    { // flag
+        long flag;
+        char *s;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        flag = strtol((char*)str->s, &s, 0);
+        if (*s) { // not the end of the string
+            flag = 0;
+            for (s = str->s; *s; ++s)
+                flag |= bam_char2flag_table[(unsigned char)*s];
+        }
+        c->flag = flag;
+    }
+    { // tid, pos, qual
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+        if (c->tid < 0 && strcmp(str->s, "*")) {
+            if (header->n_targets == 0) {
+                fprintf(stderr, "[sam_read1] missing header? Abort!\n");
+                exit(1);
+            } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
+        }
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit((unsigned char)str->s[0])? atoi(str->s) - 1 : -1;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit((unsigned char)str->s[0])? atoi(str->s) : 0;
+        if (ret < 0) return -2;
+    }
+    { // cigar
+        char *s, *t;
+        int i, op;
+        long x;
+        c->n_cigar = 0;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
+        z += str->l + 1;
+        if (str->s[0] != '*') {
+            uint32_t *cigar;
+            // first pass: count the operations and reject stray characters
+            for (s = str->s; *s; ++s) {
+                if ((isalpha((unsigned char)*s)) || (*s=='=')) ++c->n_cigar;
+                else if (!isdigit((unsigned char)*s)) parse_error(fp->n_lines, "invalid CIGAR character");
+            }
+            b->data = alloc_data(b, doff + c->n_cigar * 4);
+            cigar = bam1_cigar(b);
+            // second pass: encode each <length><op> pair
+            for (i = 0, s = str->s; i != c->n_cigar; ++i) {
+                x = strtol(s, &t, 10);
+                op = toupper((unsigned char)*t);
+                if (op == 'M') op = BAM_CMATCH;
+                else if (op == 'I') op = BAM_CINS;
+                else if (op == 'D') op = BAM_CDEL;
+                else if (op == 'N') op = BAM_CREF_SKIP;
+                else if (op == 'S') op = BAM_CSOFT_CLIP;
+                else if (op == 'H') op = BAM_CHARD_CLIP;
+                else if (op == 'P') op = BAM_CPAD;
+                else if (op == '=') op = BAM_CEQUAL;
+                else if (op == 'X') op = BAM_CDIFF;
+                else if (op == 'B') op = BAM_CBACK;
+                else parse_error(fp->n_lines, "invalid CIGAR operation");
+                s = t + 1;
+                cigar[i] = bam_cigar_gen(x, op);
+            }
+            if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
+            c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar));
+            doff += c->n_cigar * 4;
+        } else {
+            if (!(c->flag&BAM_FUNMAP)) {
+                fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
+                c->flag |= BAM_FUNMAP;
+            }
+            c->bin = bam_reg2bin(c->pos, c->pos + 1);
+        }
+    }
+    { // mtid, mpos, isize
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->mpos = isdigit((unsigned char)str->s[0])? atoi(str->s) - 1 : -1;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->isize = (str->s[0] == '-' || isdigit((unsigned char)str->s[0]))? atoi(str->s) : 0;
+        if (ret < 0) return -4;
+    }
+    { // seq and qual
+        int i;
+        uint8_t *p = 0;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
+        z += str->l + 1;
+        if (strcmp(str->s, "*")) {
+            c->l_qseq = strlen(str->s);
+            if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
+                fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
+                        (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
+                parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+            }
+            p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+            memset(p, 0, (c->l_qseq+1)/2);
+            // two bases per byte, first base in the high nibble
+            for (i = 0; i < c->l_qseq; ++i)
+                p[i/2] |= bam_nt16_table[(unsigned char)str->s[i]] << 4*(1-i%2);
+        } else c->l_qseq = 0;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
+        z += str->l + 1;
+        if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
+            parse_error(fp->n_lines, "sequence and quality are inconsistent");
+        p += (c->l_qseq+1)/2;
+        if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
+        else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
+        doff += c->l_qseq + (c->l_qseq+1)/2;
+    }
+    doff0 = doff;
+    if (dret != '\n' && dret != '\r') { // aux
+        while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
+            uint8_t *s, type, key[2];
+            z += str->l + 1;
+            if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
+                parse_error(fp->n_lines, "missing colon in auxiliary data");
+            key[0] = str->s[0]; key[1] = str->s[1];
+            type = str->s[3];
+            s = alloc_data(b, doff + 3) + doff;
+            s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
+            if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
+                s = alloc_data(b, doff + 2) + doff;
+                *s++ = 'A'; *s = str->s[5];
+                doff += 2;
+            } else if (type == 'I' || type == 'i') {
+                // store the integer in the smallest type that holds it
+                long long x;
+                s = alloc_data(b, doff + 5) + doff;
+                x = (long long)atoll(str->s + 5);
+                if (x < 0) {
+                    if (x >= -127) {
+                        *s++ = 'c'; *(int8_t*)s = (int8_t)x;
+                        s += 1; doff += 2;
+                    } else if (x >= -32767) {
+                        *s++ = 's'; *(int16_t*)s = (int16_t)x;
+                        s += 2; doff += 3;
+                    } else {
+                        *s++ = 'i'; *(int32_t*)s = (int32_t)x;
+                        s += 4; doff += 5;
+                        if (x < -2147483648ll)
+                            fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                    (long long)fp->n_lines, x);
+                    }
+                } else {
+                    if (x <= 255) {
+                        *s++ = 'C'; *s++ = (uint8_t)x;
+                        doff += 2;
+                    } else if (x <= 65535) {
+                        *s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+                        s += 2; doff += 3;
+                    } else {
+                        *s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+                        s += 4; doff += 5;
+                        if (x > 4294967295ll)
+                            fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                    (long long)fp->n_lines, x);
+                    }
+                }
+            } else if (type == 'f') {
+                s = alloc_data(b, doff + 5) + doff;
+                *s++ = 'f';
+                *(float*)s = (float)atof(str->s + 5);
+                s += 4; doff += 5;
+            } else if (type == 'd') {
+                // the value starts at offset 5 like every other type and is
+                // stored as a full 8-byte double
+                double d = atof(str->s + 5);
+                s = alloc_data(b, doff + 9) + doff;
+                *s++ = 'd';
+                memcpy(s, &d, 8);
+                s += 8; doff += 9;
+            } else if (type == 'Z' || type == 'H') {
+                int size = 1 + (str->l - 5) + 1;
+                if (type == 'H') { // check whether the hex string is valid
+                    int i;
+                    if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+                    for (i = 0; i < str->l - 5; ++i) {
+                        int ch = toupper((unsigned char)str->s[5 + i]);
+                        if (!((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F')))
+                            parse_error(fp->n_lines, "invalid hex character");
+                    }
+                }
+                s = alloc_data(b, doff + size) + doff;
+                *s++ = type;
+                memcpy(s, str->s + 5, str->l - 5);
+                s[str->l - 5] = 0;
+                doff += size;
+            } else if (type == 'B') {
+                int32_t n = 0, Bsize, k = 0, size;
+                char *p, *end = str->s + str->l;
+                if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B");
+                Bsize = bam_aux_type2size(str->s[5]); // the size of each element
+                for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array
+                    if (*p == ',') ++n;
+                p = str->s + 7; // now p points to the first number in the array
+                size = 6 + Bsize * n; // total number of bytes allocated to this tag
+                s = alloc_data(b, doff + size) + doff; // allocate exactly the bytes written below
+                *s++ = 'B'; *s++ = str->s[5];
+                memcpy(s, &n, 4); s += 4; // write the number of elements
+                // never store more than n elements, even when a value fails to parse
+                if (str->s[5] == 'c')      while (p < end && k < n) ((int8_t*)s)[k++]   = (int8_t)strtol(p, &p, 0),   ++p;
+                else if (str->s[5] == 'C') while (p < end && k < n) ((uint8_t*)s)[k++]  = (uint8_t)strtol(p, &p, 0),  ++p;
+                else if (str->s[5] == 's') while (p < end && k < n) ((int16_t*)s)[k++]  = (int16_t)strtol(p, &p, 0),  ++p; // FIXME: avoid unaligned memory
+                else if (str->s[5] == 'S') while (p < end && k < n) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p;
+                else if (str->s[5] == 'i') while (p < end && k < n) ((int32_t*)s)[k++]  = (int32_t)strtol(p, &p, 0),  ++p;
+                else if (str->s[5] == 'I') while (p < end && k < n) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p;
+                else if (str->s[5] == 'f') while (p < end && k < n) ((float*)s)[k++]    = (float)strtod(p, &p),       ++p;
+                else parse_error(fp->n_lines, "unrecognized array type");
+                s += Bsize * n; doff += size;
+            } else parse_error(fp->n_lines, "unrecognized type");
+            if (dret == '\n' || dret == '\r') break;
+        }
+    }
+    b->l_aux = doff - doff0;
+    b->data_len = doff;
+    if (bam_no_B) bam_remove_B(b);
+    return z;
+}
+
+/*
+ * Open a SAM text file (optionally gzip-compressed) for reading.
+ * "-" reads from standard input. Returns 0 if the file cannot be opened.
+ */
+tamFile sam_open(const char *fn)
+{
+    tamFile handle;
+    gzFile gz;
+    if (strcmp(fn, "-") == 0) gz = gzdopen(fileno(stdin), "rb");
+    else gz = gzopen(fn, "rb");
+    if (gz == 0) return 0;
+    handle = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+    handle->str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    handle->fp = gz;
+    handle->ks = ks_init(handle->fp);
+    return handle;
+}
+
+/* Release everything owned by a handle returned by sam_open(); a null handle is ignored. */
+void sam_close(tamFile fp)
+{
+    if (!fp) return;
+    ks_destroy(fp->ks);
+    gzclose(fp->fp);
+    free(fp->str->s);
+    free(fp->str);
+    free(fp);
+}
diff --git a/samtools-0.1.19/bam_index.c b/samtools-0.1.19/bam_index.c
new file mode 100644
index 0000000..f916e04
--- /dev/null
+++ b/samtools-0.1.19/bam_index.c
@@ -0,0 +1,726 @@
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+/*!
+ @header
+
+ Alignment indexing. Before indexing, BAM must be sorted based on the
+ leftmost coordinate of alignments. In indexing, BAM uses two indices:
+ a UCSC binning index and a simple linear index. The binning index is
+ efficient for alignments spanning long distance, while the auxiliary
+ linear index helps to reduce unnecessary seek calls especially for
+ short alignments.
+
+ The UCSC binning scheme was suggested by Richard Durbin and Lincoln
+ Stein and is explained by Kent et al. (2002). In this scheme, each bin
+ represents a contiguous genomic region which can be fully contained in
+ another bin; each alignment is associated with a bin which represents
+ the smallest region containing the entire alignment. The binning
+ scheme is essentially another representation of R-tree. A distinct bin
+ uniquely corresponds to a distinct internal node in a R-tree. Bin A is
+ a child of Bin B if region A is contained in B.
+
+ In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
+ 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
+ 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
+ find the alignments overlapping a region [rbeg,rend), we need to
+ calculate the list of bins that may overlap the region and test
+ the alignments in the bins to confirm the overlaps. If the specified
+ region is short, typically only a few alignments in six bins need to
+ be retrieved. The overlapping alignments can be quickly fetched.
+
+ */
+
+#define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
+#define BAM_LIDX_SHIFT 14
+
+#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
+
+// One chunk of the file: u is its start offset and v its end offset.
+typedef struct {
+ uint64_t u, v;
+} pair64_t;
+
+// chunks are ordered by their start offset
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(off, pair64_t, pair64_lt)
+
+// Growable list of chunks belonging to one bin: n entries used, m allocated.
+typedef struct {
+ uint32_t m, n;
+ pair64_t *list;
+} bam_binlist_t;
+
+// Linear index of one reference: n entries used, m allocated, one file
+// offset per window of 1<<BAM_LIDX_SHIFT bases.
+typedef struct {
+ int32_t n, m;
+ uint64_t *offset;
+} bam_lidx_t;
+
+// hash from bin number to its chunk list
+KHASH_MAP_INIT_INT(i, bam_binlist_t)
+
+// The complete index: one binning hash and one linear index per reference.
+struct __bam_index_t {
+ int32_t n; // number of references
+ uint64_t n_no_coor; // unmapped reads without coordinate
+ khash_t(i) **index; // binning index, one hash per reference
+ bam_lidx_t *index2; // linear index, one entry per reference
+};
+
+/*
+ * Append the chunk [beg, end) to the chunk list of `bin` in hash h,
+ * creating the list on first use and doubling its capacity when full.
+ */
+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
+{
+    int absent;
+    khint_t slot = kh_put(i, h, bin, &absent);
+    bam_binlist_t *chunks = &kh_value(h, slot);
+    if (absent) { // first chunk of this bin: start with room for one
+        chunks->m = 1;
+        chunks->n = 0;
+        chunks->list = (pair64_t*)calloc(chunks->m, 16);
+    }
+    if (chunks->n == chunks->m) { // full: double the capacity
+        chunks->m <<= 1;
+        chunks->list = (pair64_t*)realloc(chunks->list, chunks->m * 16);
+    }
+    chunks->list[chunks->n].u = beg;
+    chunks->list[chunks->n].v = end;
+    ++chunks->n;
+}
+
+/*
+ * Record `offset` in the linear index for every window of
+ * 1<<BAM_LIDX_SHIFT bases that alignment b overlaps, unless the window
+ * already has an offset. The offset array grows (zero-filled) as needed.
+ *
+ * index2->n only ever grows: an alignment that ends before an earlier,
+ * longer one must not drop the windows that earlier alignment recorded.
+ */
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
+{
+    int i, beg, end;
+    beg = b->core.pos >> BAM_LIDX_SHIFT;
+    end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+    if (index2->m < end + 1) {
+        int old_m = index2->m;
+        index2->m = end + 1;
+        kroundup32(index2->m);
+        index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+        memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // new windows start empty
+    }
+    for (i = beg; i <= end; ++i)
+        if (index2->offset[i] == 0) index2->offset[i] = offset;
+    if (index2->n < end + 1) index2->n = end + 1;
+}
+
+/*
+ * Merge adjacent chunks within every bin of every reference to shorten the
+ * chunk lists. The body is compiled only when BAM_TRUE_OFFSET or
+ * BAM_VIRTUAL_OFFSET16 is defined; otherwise this function does nothing.
+ * The meta bin BAM_MAX_BIN is never merged.
+ */
+static void merge_chunks(bam_index_t *idx)
+{
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+ khash_t(i) *index;
+ int i, l, m;
+ khint_t k;
+ for (i = 0; i < idx->n; ++i) {
+ index = idx->index[i];
+ for (k = kh_begin(index); k != kh_end(index); ++k) {
+ bam_binlist_t *p;
+ if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
+ p = &kh_value(index, k);
+ // m is the last chunk kept so far; l scans the remaining chunks
+ m = 0;
+ for (l = 1; l < p->n; ++l) {
+#ifdef BAM_TRUE_OFFSET
+ // merge when the next chunk starts less than BAM_MIN_CHUNK_GAP after the kept chunk ends
+ if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
+#else
+ // merge when both offsets agree above the low 16 bits (presumably the same compressed block -- TODO confirm)
+ if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
+#endif
+ else p->list[++m] = p->list[l];
+ } // ~for(l)
+ p->n = m + 1;
+ } // ~for(k)
+ } // ~for(i)
+#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+}
+
+/*
+ * In every linear index, give each window that has no offset (value 0)
+ * the offset of the window before it. Window 0 is left untouched.
+ */
+static void fill_missing(bam_index_t *idx)
+{
+    int ref, win;
+    for (ref = 0; ref < idx->n; ++ref) {
+        bam_lidx_t *lin = idx->index2 + ref;
+        for (win = 1; win < lin->n; ++win) {
+            if (lin->offset[win] != 0) continue;
+            lin->offset[win] = lin->offset[win-1];
+        }
+    }
+}
+
+/*
+ * Build the index of a coordinate-sorted BAM file by reading it from the
+ * current position of fp (header first, then every record).
+ *
+ * For each reference a binning index and a linear index are filled; the
+ * pseudo-bin BAM_MAX_BIN of each reference stores two extra pairs: the
+ * file range of the reference and its mapped/unmapped read counts.
+ *
+ * Returns the new index (release it with bam_index_destroy()), or NULL
+ * when the header is invalid, the records are not sorted, or the file
+ * offset fails to advance. Everything allocated here is released before
+ * NULL is returned. A truncated file only produces a warning.
+ */
+bam_index_t *bam_index_core(bamFile fp)
+{
+    bam1_t *b;
+    bam_header_t *h;
+    int i, ret;
+    bam_index_t *idx;
+    uint32_t last_bin, save_bin;
+    int32_t last_coor, last_tid, save_tid;
+    bam1_core_t *c;
+    uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
+
+    h = bam_header_read(fp);
+    if (h == 0) {
+        fprintf(stderr, "[bam_index_core] Invalid BAM header.\n");
+        return NULL;
+    }
+
+    idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+    b = (bam1_t*)calloc(1, sizeof(bam1_t));
+    c = &b->core;
+
+    idx->n = h->n_targets;
+    bam_header_destroy(h);
+    idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+    for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
+    idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+
+    save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
+    save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+    n_mapped = n_unmapped = n_no_coor = off_end = 0;
+    off_beg = off_end = bam_tell(fp);
+    while ((ret = bam_read1(fp, b)) >= 0) {
+        if (c->tid < 0) ++n_no_coor;
+        if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
+            last_tid = c->tid;
+            last_bin = 0xffffffffu;
+        } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
+            fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
+                    bam1_qname(b), last_tid+1, c->tid+1);
+            goto fail;
+        } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
+            fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
+                    bam1_qname(b), last_coor, c->pos, c->tid+1);
+            goto fail;
+        }
+        if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+        if (c->bin != last_bin) { // then possibly write the binning index
+            if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+                insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
+            if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
+                off_end = last_off;
+                insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
+                insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+                n_mapped = n_unmapped = 0;
+                off_beg = off_end;
+            }
+            save_off = last_off;
+            save_bin = last_bin = c->bin;
+            save_tid = c->tid;
+            if (save_tid < 0) break; // reached the reads without coordinates
+        }
+        if (bam_tell(fp) <= last_off) {
+            fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
+                    (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
+            goto fail;
+        }
+        if (c->flag & BAM_FUNMAP) ++n_unmapped;
+        else ++n_mapped;
+        last_off = bam_tell(fp);
+        last_coor = b->core.pos;
+    }
+    if (save_tid >= 0) { // flush the last bin and the meta element of the last reference
+        insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+        insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
+        insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+    }
+    merge_chunks(idx);
+    fill_missing(idx);
+    if (ret >= 0) { // count the remaining reads without coordinates
+        while ((ret = bam_read1(fp, b)) >= 0) {
+            ++n_no_coor;
+            if (c->tid >= 0 && n_no_coor) {
+                fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
+                goto fail;
+            }
+        }
+    }
+    if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
+    free(b->data); free(b);
+    idx->n_no_coor = n_no_coor;
+    return idx;
+
+fail:
+    // release the record buffer and the partially built index
+    free(b->data); free(b);
+    bam_index_destroy(idx);
+    return NULL;
+}
+
+/* Free an index together with every chunk list, hash and linear index it owns; a null index is ignored. */
+void bam_index_destroy(bam_index_t *idx)
+{
+    int ref;
+    if (idx == 0) return;
+    for (ref = 0; ref < idx->n; ++ref) {
+        khash_t(i) *bins = idx->index[ref];
+        khint_t it = kh_begin(bins);
+        // release the chunk list of every occupied bin
+        while (it != kh_end(bins)) {
+            if (kh_exist(bins, it)) free(kh_value(bins, it).list);
+            ++it;
+        }
+        kh_destroy(i, bins);
+        free(idx->index2[ref].offset);
+    }
+    free(idx->index);
+    free(idx->index2);
+    free(idx);
+}
+
+/*
+ * Write an index to fp in the BAI layout produced here: the magic "BAI\1",
+ * the number of references, then per reference the binning index (bin
+ * count, and for each bin its number, chunk count and chunks) followed by
+ * the linear index (entry count and offsets), and finally the number of
+ * reads without coordinates.
+ *
+ * When bam_is_be is set, every integer is byte-swapped before it is
+ * written; arrays are swapped in place, written, and swapped back so the
+ * in-memory index is left unchanged. Return values of fwrite() are not
+ * checked.
+ */
+void bam_index_save(const bam_index_t *idx, FILE *fp)
+{
+ int32_t i, size;
+ khint_t k;
+ fwrite("BAI\1", 1, 4, fp);
+ if (bam_is_be) {
+ uint32_t x = idx->n;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&idx->n, 4, 1, fp);
+ for (i = 0; i < idx->n; ++i) {
+ khash_t(i) *index = idx->index[i];
+ bam_lidx_t *index2 = idx->index2 + i;
+ // write binning index
+ size = kh_size(index);
+ if (bam_is_be) { // big endian
+ uint32_t x = size;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&size, 4, 1, fp);
+ for (k = kh_begin(index); k != kh_end(index); ++k) {
+ if (kh_exist(index, k)) {
+ bam_binlist_t *p = &kh_value(index, k);
+ if (bam_is_be) { // big endian
+ uint32_t x;
+ x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ // swap the chunk list in place, write it, then swap it back
+ for (x = 0; (int)x < p->n; ++x) {
+ bam_swap_endian_8p(&p->list[x].u);
+ bam_swap_endian_8p(&p->list[x].v);
+ }
+ fwrite(p->list, 16, p->n, fp);
+ for (x = 0; (int)x < p->n; ++x) {
+ bam_swap_endian_8p(&p->list[x].u);
+ bam_swap_endian_8p(&p->list[x].v);
+ }
+ } else {
+ fwrite(&kh_key(index, k), 4, 1, fp);
+ fwrite(&p->n, 4, 1, fp);
+ fwrite(p->list, 16, p->n, fp);
+ }
+ }
+ }
+ // write linear index (index2)
+ if (bam_is_be) {
+ int x = index2->n;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&index2->n, 4, 1, fp);
+ if (bam_is_be) { // big endian
+ int x;
+ // same swap / write / swap-back sequence for the offsets
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ fwrite(index2->offset, 8, index2->n, fp);
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ } else fwrite(index2->offset, 8, index2->n, fp);
+ }
+ { // write the number of reads coor-less records.
+ uint64_t x = idx->n_no_coor;
+ if (bam_is_be) bam_swap_endian_8p(&x);
+ fwrite(&x, 8, 1, fp);
+ }
+ fflush(fp);
+}
+
+/*
+ * Read an index written by bam_index_save() from fp.
+ *
+ * Returns the new index, or 0 when fp is null or the file does not start
+ * with the magic "BAI\1". The stream is never closed here: it stays owned
+ * by the caller, which closes it whether or not loading succeeds.
+ * A missing trailing count of reads without coordinates is read as 0.
+ */
+static bam_index_t *bam_index_load_core(FILE *fp)
+{
+    int i;
+    char magic[4];
+    bam_index_t *idx;
+    if (fp == 0) {
+        fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+        return 0;
+    }
+    // a short read leaves magic incomplete, so treat it like a wrong magic
+    if (fread(magic, 1, 4, fp) != 4 || strncmp(magic, "BAI\1", 4)) {
+        fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+        return 0; // do not fclose(fp): the caller closes it
+    }
+    idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+    fread(&idx->n, 4, 1, fp);
+    if (bam_is_be) bam_swap_endian_4p(&idx->n);
+    idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+    idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+    for (i = 0; i < idx->n; ++i) {
+        khash_t(i) *index;
+        bam_lidx_t *index2 = idx->index2 + i;
+        uint32_t key, size;
+        khint_t k;
+        int j, ret;
+        bam_binlist_t *p;
+        index = idx->index[i] = kh_init(i);
+        // load binning index
+        fread(&size, 4, 1, fp);
+        if (bam_is_be) bam_swap_endian_4p(&size);
+        for (j = 0; j < (int)size; ++j) {
+            fread(&key, 4, 1, fp);
+            if (bam_is_be) bam_swap_endian_4p(&key);
+            k = kh_put(i, index, key, &ret);
+            p = &kh_value(index, k);
+            fread(&p->n, 4, 1, fp);
+            if (bam_is_be) bam_swap_endian_4p(&p->n);
+            p->m = p->n;
+            p->list = (pair64_t*)malloc(p->m * 16);
+            fread(p->list, 16, p->n, fp);
+            if (bam_is_be) {
+                int x;
+                for (x = 0; x < p->n; ++x) {
+                    bam_swap_endian_8p(&p->list[x].u);
+                    bam_swap_endian_8p(&p->list[x].v);
+                }
+            }
+        }
+        // load linear index
+        fread(&index2->n, 4, 1, fp);
+        if (bam_is_be) bam_swap_endian_4p(&index2->n);
+        index2->m = index2->n;
+        index2->offset = (uint64_t*)calloc(index2->m, 8);
+        fread(index2->offset, index2->n, 8, fp);
+        if (bam_is_be)
+            for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+    }
+    // the trailing count is optional
+    if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
+    if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
+    return idx;
+}
+
+/*
+ * Load the index that accompanies BAM file _fn from the local file system.
+ *
+ * For an ftp:// or http:// name only the part after the last '/' is used, i.e.
+ * the index is looked up in the current working directory.  Two candidate names
+ * are tried in order: "<name>.bai", then, when <name> ends in "bam", the same
+ * name with the final 'm' replaced by 'i' ("x.bam" -> "x.bai").
+ *
+ * Returns the index produced by bam_index_load_core(), or 0 when neither file
+ * can be opened, memory runs out, or parsing fails.
+ */
+bam_index_t *bam_index_load_local(const char *_fn)
+{
+    FILE *fp;
+    char *fnidx, *fn;
+    size_t len;
+    bam_index_t *idx = 0;
+
+    if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
+        // the scheme prefix guarantees that at least one '/' is present
+        fn = strdup(strrchr(_fn, '/') + 1);
+    } else fn = strdup(_fn);
+    if (fn == 0) return 0;
+    len = strlen(fn);
+    fnidx = (char*)calloc(len + 5, 1);
+    if (fnidx == 0) { free(fn); return 0; }
+    strcpy(fnidx, fn); strcat(fnidx, ".bai");
+    fp = fopen(fnidx, "rb");
+    // try "{base}.bai"; test the suffix itself, since the first "bam" in the
+    // name need not be the extension (e.g. "bam_sorted.bam")
+    if (fp == 0 && len >= 3 && strcmp(fn + len - 3, "bam") == 0) {
+        strcpy(fnidx, fn);
+        fnidx[len - 1] = 'i';
+        fp = fopen(fnidx, "rb");
+    }
+    free(fnidx); free(fn);
+    if (fp) {
+        idx = bam_index_load_core(fp);
+        fclose(fp);
+    }
+    return idx;
+}
+
+#ifdef _USE_KNETFILE
+/*
+ * Copy the remote file at url (ftp:// or http:// only) into the current working
+ * directory, named after the part of url following its last '/'.  Any other
+ * kind of url is ignored.  Failures are reported on stderr; nothing is returned.
+ */
+static void download_from_remote(const char *url)
+{
+    const int buf_size = 1 * 1024 * 1024;
+    const char *fn;
+    FILE *fp;
+    uint8_t *buf;
+    knetFile *fp_remote;
+    int l;
+    if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
+    fn = strrchr(url, '/') + 1; // file name; a '/' always exists thanks to the scheme prefix
+    fp_remote = knet_open(url, "r");
+    if (fp_remote == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+        return;
+    }
+    if ((fp = fopen(fn, "wb")) == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+        knet_close(fp_remote);
+        return;
+    }
+    buf = (uint8_t*)calloc(buf_size, 1);
+    if (buf == 0) {
+        fprintf(stderr, "[download_from_remote] fail to allocate the download buffer.\n");
+        fclose(fp);
+        knet_close(fp_remote);
+        return;
+    }
+    // stop on 0 (end of data) and on any negative value: a negative length
+    // must never reach fwrite(), where it would turn into a huge size_t
+    while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+        fwrite(buf, 1, l, fp);
+    free(buf);
+    fclose(fp);
+    knet_close(fp_remote);
+}
+#else
+/* Built without knetfile support: remote download is a no-op. */
+static void download_from_remote(const char *url)
+{
+    return;
+}
+#endif
+
+/*
+ * Load the index for BAM file fn.  The local lookup is tried first; if it fails
+ * and fn is an ftp:// or http:// URL, "<fn>.bai" is downloaded into the working
+ * directory and the local lookup is repeated.  Returns 0, after a message on
+ * stderr, when no index could be loaded.
+ */
+bam_index_t *bam_index_load(const char *fn)
+{
+    bam_index_t *loaded = bam_index_load_local(fn);
+    if (loaded != 0) return loaded;
+
+    if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) {
+        char *remote_idx = calloc(strlen(fn) + 5, 1);
+        strcpy(remote_idx, fn);
+        strcat(remote_idx, ".bai");
+        fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
+        download_from_remote(remote_idx);
+        free(remote_idx);
+        loaded = bam_index_load_local(fn);
+    }
+    if (loaded == 0)
+        fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
+    return loaded;
+}
+
+/*
+ * Build the index of BAM file fn and save it to _fnidx, or to "<fn>.bai" when
+ * _fnidx is 0.  Returns 0 on success and -1 when the BAM file cannot be opened,
+ * indexing fails, or the index file cannot be created.
+ */
+int bam_index_build2(const char *fn, const char *_fnidx)
+{
+    bamFile bam;
+    bam_index_t *built;
+    char *out_name;
+    FILE *out;
+    int status = -1;
+
+    bam = bam_open(fn, "r");
+    if (bam == 0) {
+        fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
+        return -1;
+    }
+    built = bam_index_core(bam);
+    bam_close(bam);
+    if (built == 0) {
+        fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
+        return -1;
+    }
+
+    if (_fnidx != 0) {
+        out_name = strdup(_fnidx);
+    } else {
+        out_name = (char*)calloc(strlen(fn) + 5, 1);
+        strcpy(out_name, fn);
+        strcat(out_name, ".bai");
+    }
+
+    out = fopen(out_name, "wb");
+    if (out != 0) {
+        bam_index_save(built, out);
+        status = 0;
+    } else {
+        fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
+    }
+
+    // shared teardown for both outcomes
+    bam_index_destroy(built);
+    if (out != 0) fclose(out);
+    free(out_name);
+    return status;
+}
+
+/* Build the index of fn under the default name "<fn>.bai"; see bam_index_build2(). */
+int bam_index_build(const char *fn)
+{
+    const char *default_name = 0; // 0 makes bam_index_build2() derive the name from fn
+    return bam_index_build2(fn, default_name);
+}
+
+/*
+ * Entry point of "samtools index <in.bam> [out.index]".
+ * Returns 1 on a usage error or when building the index fails, 0 on success,
+ * so that a failed build is visible in the value handed back to the caller.
+ */
+int bam_index(int argc, char *argv[])
+{
+    int ret;
+    if (argc < 2) {
+        fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
+        return 1;
+    }
+    if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
+    else ret = bam_index_build(argv[1]);
+    return ret < 0 ? 1 : 0; // the build functions report failure as -1
+}
+
+/*
+ * Entry point of "samtools idxstats <in.bam>".
+ *
+ * Prints one tab-separated line per reference: name, length, and the two
+ * counters stored in the second chunk of the BAM_MAX_BIN bin ("0 0" when that
+ * bin is absent or has fewer than two chunks), followed by a final "*" line
+ * carrying the number of coordinate-less reads.
+ * Returns 0 on success and 1 on a usage error or when the BAM file, its header
+ * or its index cannot be loaded.
+ */
+int bam_idxstats(int argc, char *argv[])
+{
+    bam_index_t *idx;
+    bam_header_t *header;
+    bamFile fp;
+    int i;
+    if (argc < 2) {
+        fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
+        return 1;
+    }
+    fp = bam_open(argv[1], "r");
+    if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+    header = bam_header_read(fp);
+    bam_close(fp);
+    if (header == 0) { fprintf(stderr, "[%s] fail to read the header.\n", __func__); return 1; }
+    idx = bam_index_load(argv[1]);
+    if (idx == 0) {
+        fprintf(stderr, "[%s] fail to load the index.\n", __func__);
+        bam_header_destroy(header); // was leaked on this path
+        return 1;
+    }
+    // stay inside both tables in case the index and the header disagree
+    for (i = 0; i < idx->n && i < header->n_targets; ++i) {
+        khint_t k;
+        khash_t(i) *h = idx->index[i];
+        printf("%s\t%d", header->target_name[i], header->target_len[i]);
+        k = kh_get(i, h, BAM_MAX_BIN);
+        if (k != kh_end(h) && kh_val(h, k).n >= 2) // list[1] exists only with two chunks
+            printf("\t%llu\t%llu", (unsigned long long)kh_val(h, k).list[1].u, (unsigned long long)kh_val(h, k).list[1].v);
+        else printf("\t0\t0");
+        putchar('\n');
+    }
+    // %llu requires an unsigned long long argument
+    printf("*\t0\t0\t%llu\n", (unsigned long long)idx->n_no_coor);
+    bam_header_destroy(header);
+    bam_index_destroy(idx);
+    return 0;
+}
+
+/*
+ * List the bins that can overlap the half-open interval [beg,end).
+ * Bin 0 is always listed, followed by the bins of the five finer levels, whose
+ * numbering starts at 1, 9, 73, 585 and 4681 and whose shifts are 26, 23, 20,
+ * 17 and 14 bits.  end is clamped to 1<<29.  Returns the number of entries
+ * written to list[], or 0 for an empty interval.
+ */
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
+{
+    static const struct { int first; int shift; } level[5] = {
+        { 1, 26 }, { 9, 23 }, { 73, 20 }, { 585, 17 }, { 4681, 14 }
+    };
+    int n = 0, lv;
+    if (beg >= end) return 0;
+    if (end >= 1u<<29) end = 1u<<29;
+    --end; // make the right end inclusive
+    list[n++] = 0;
+    for (lv = 0; lv < 5; ++lv) {
+        int bin = level[lv].first + (beg >> level[lv].shift);
+        int last = level[lv].first + (end >> level[lv].shift);
+        while (bin <= last) list[n++] = bin++;
+    }
+    return n;
+}
+
+/*
+ * Return 1 when alignment b overlaps [beg,end) and 0 otherwise.  The alignment
+ * spans pos .. bam_calend(); a record without CIGAR counts as one base long.
+ */
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+    uint32_t aln_beg = b->core.pos;
+    uint32_t aln_end;
+    if (b->core.n_cigar) aln_end = bam_calend(&b->core, bam1_cigar(b));
+    else aln_end = b->core.pos + 1;
+    if (aln_end <= beg) return 0; // ends before the region starts
+    return aln_beg < end;
+}
+
+/* State of one region query; filled by bam_iter_query(), advanced by bam_iter_read(). */
+struct __bam_iter_t {
+    int from_first;    // read from the first record; no random access
+    int tid;           // reference id of the queried region
+    int beg;           // region start
+    int end;           // region end
+    int n_off;         // number of entries in off[]
+    int i;             // index of the chunk being read; -1 before the first seek
+    int finished;      // set once the iterator is exhausted or a read failed
+    uint64_t curr_off; // file offset after the last record read; 0 before any read
+    pair64_t *off;     // candidate chunks, sorted and merged; 0 when there are none
+};
+
+// bam_fetch helper: build the iterator that bam_iter_read() walks to visit the
+// alignments overlapping [beg,end) on reference tid.
+//
+// The chunks of every bin that can overlap the region are collected, chunks
+// ending at or before the linear-index offset for beg are dropped, and the rest
+// are sorted and merged.  Returns 0 when end < beg (beg is first clamped to 0)
+// or when the iterator cannot be allocated; otherwise returns an iterator whose
+// chunk list is empty (off == 0) if nothing can overlap or tid is not covered
+// by the index.  Release the result with bam_iter_destroy().
+bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
+{
+    uint16_t *bins;
+    int i, l, n_bins, n_off;
+    pair64_t *off;
+    khint_t k;
+    khash_t(i) *index;
+    uint64_t min_off;
+    bam_iter_t iter = 0;
+
+    if (beg < 0) beg = 0;
+    if (end < beg) return 0;
+    // initialize iter
+    iter = calloc(1, sizeof(struct __bam_iter_t));
+    if (iter == 0) return 0;
+    iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
+    // a tid outside the index has no bins: return an iterator without chunks
+    // instead of indexing idx->index[] / idx->index2[] out of bounds
+    if (tid < 0 || tid >= idx->n) return iter;
+    //
+    bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
+    n_bins = reg2bins(beg, end, bins);
+    index = idx->index[tid];
+    if (idx->index2[tid].n > 0) {
+        min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+            : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+        if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
+            int n = beg>>BAM_LIDX_SHIFT;
+            if (n > idx->index2[tid].n) n = idx->index2[tid].n;
+            for (i = n - 1; i >= 0; --i)
+                if (idx->index2[tid].offset[i] != 0) break;
+            if (i >= 0) min_off = idx->index2[tid].offset[i];
+        }
+    } else min_off = 0; // tabix 0.1.2 may produce such index files
+    // first pass: upper bound on the number of chunks
+    for (i = n_off = 0; i < n_bins; ++i) {
+        if ((k = kh_get(i, index, bins[i])) != kh_end(index))
+            n_off += kh_value(index, k).n;
+    }
+    if (n_off == 0) {
+        free(bins); return iter;
+    }
+    off = (pair64_t*)calloc(n_off, 16);
+    // second pass: keep the chunks that end after min_off
+    for (i = n_off = 0; i < n_bins; ++i) {
+        if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
+            int j;
+            bam_binlist_t *p = &kh_value(index, k);
+            for (j = 0; j < p->n; ++j)
+                if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+        }
+    }
+    free(bins);
+    if (n_off == 0) {
+        free(off); return iter;
+    }
+    ks_introsort(off, n_off, off);
+    // resolve completely contained adjacent blocks
+    for (i = 1, l = 0; i < n_off; ++i)
+        if (off[l].v < off[i].v)
+            off[++l] = off[i];
+    n_off = l + 1;
+    // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+    for (i = 1; i < n_off; ++i)
+        if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+    { // merge adjacent blocks
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+        for (i = 1, l = 0; i < n_off; ++i) {
+#ifdef BAM_TRUE_OFFSET
+            if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
+#else
+            if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+#endif
+            else off[++l] = off[i];
+        }
+        n_off = l + 1;
+#endif
+    }
+    iter->n_off = n_off; iter->off = off;
+    return iter;
+}
+
+/*
+ * For pysam compatibility: return the chunk list bam_iter_query() computes for
+ * [beg,end) on tid and store its length in *cnt_off.  The caller owns the
+ * returned array and frees it.  When the query yields no iterator (for example
+ * end < beg), 0 is returned and *cnt_off is set to 0.
+ */
+pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
+{
+    bam_iter_t iter;
+    pair64_t *off;
+    iter = bam_iter_query(idx, tid, beg, end);
+    if (iter == 0) { // bam_iter_query() returns 0 for an invalid region
+        *cnt_off = 0;
+        return 0;
+    }
+    off = iter->off; *cnt_off = iter->n_off;
+    free(iter); // only the wrapper; the chunk array is handed to the caller
+    return off;
+}
+
+/* Free an iterator and its chunk list; a null iterator is accepted. */
+void bam_iter_destroy(bam_iter_t iter)
+{
+    if (iter == 0) return;
+    free(iter->off);
+    free(iter);
+}
+
+int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
+{ /* Read into b the next alignment selected by iter.
+   * A null iter, or one with from_first set, reads fp sequentially without
+   * any filtering.  Otherwise the chunks in iter->off[] are visited in
+   * order and only records overlapping [iter->beg, iter->end) on iter->tid
+   * are returned.
+   * Returns the bam_read1() result (>= 0) for a delivered record and a
+   * negative value otherwise.  When the loop ends the iterator is marked
+   * finished and every later call returns -1. */
+ int ret;
+ if (iter && iter->finished) return -1;
+ if (iter == 0 || iter->from_first) {
+ ret = bam_read1(fp, b);
+ if (ret < 0 && iter) iter->finished = 1;
+ return ret;
+ }
+ if (iter->off == 0) return -1; // the query produced no candidate chunks
+ for (;;) {
+ if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
+ if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
+ if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
+ if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
+ bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
+ iter->curr_off = bam_tell(fp);
+ }
+ ++iter->i;
+ }
+ if ((ret = bam_read1(fp, b)) >= 0) {
+ iter->curr_off = bam_tell(fp); // remember where the next record starts
+ if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
+ ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
+ break;
+ }
+ else if (is_overlap(iter->beg, iter->end, b)) return ret; // records that do not overlap are skipped
+ } else break; // end of file or error
+ }
+ iter->finished = 1;
+ return ret;
+}
+
+/*
+ * Call func(b, data) for every alignment bam_iter_read() yields for [beg,end)
+ * on reference tid.  Returns 0 when iteration ends with -1, otherwise the
+ * negative code that stopped the iteration.
+ */
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+    bam1_t *rec = bam_init1();
+    bam_iter_t it = bam_iter_query(idx, tid, beg, end);
+    int status;
+    for (;;) {
+        status = bam_iter_read(fp, it, rec);
+        if (status < 0) break;
+        func(rec, data);
+    }
+    bam_iter_destroy(it);
+    bam_destroy1(rec);
+    if (status == -1) return 0;
+    return status;
+}
diff --git a/samtools-0.1.19/bam_lpileup.c b/samtools-0.1.19/bam_lpileup.c
new file mode 100644
index 0000000..d4dd63b
--- /dev/null
+++ b/samtools-0.1.19/bam_lpileup.c
@@ -0,0 +1,198 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "bam.h"
+#include "ksort.h"
+
+#define TV_GAP 2
+
+typedef struct __freenode_t {
+ uint32_t level:28, cnt:4; // level: the level this free slot stands for; cnt: countdown, lowered by one per pileup position until it reaches 0
+ struct __freenode_t *next; // next node of the singly linked free-slot list
+} freenode_t, *freenode_p;
+
+#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) // order by cnt, ties broken by level
+KSORT_INIT(node, freenode_p, freenode_lt) // instantiates the sort routines used below as ks_introsort(node, ...)
+
+/* Memory pool, similar to the one in bam_pileup.c */
+typedef struct {
+    int cnt;         // nodes handed out and not yet returned
+    int n;           // recycled nodes currently parked in buf[]
+    int max;         // capacity of buf[]
+    freenode_t **buf; // stack of recycled nodes
+} mempool_t;
+
+/* Create an empty, zero-initialised pool. */
+static mempool_t *mp_init()
+{
+    mempool_t *pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+    return pool;
+}
+/* Free every node parked in the pool, then the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+    int slot = 0;
+    while (slot < mp->n) free(mp->buf[slot++]);
+    free(mp->buf);
+    free(mp);
+}
+/* Hand out a node: a recycled one when available, otherwise a fresh zeroed one. */
+static inline freenode_t *mp_alloc(mempool_t *mp)
+{
+    ++mp->cnt;
+    if (mp->n != 0) {
+        --mp->n;
+        return mp->buf[mp->n];
+    }
+    return (freenode_t*)calloc(1, sizeof(freenode_t));
+}
+/*
+ * Return node p to the pool.  Its link is cleared and its countdown reset to
+ * TV_GAP; the stack grows (256 slots first, then doubling) when it is full.
+ */
+static inline void mp_free(mempool_t *mp, freenode_t *p)
+{
+    --mp->cnt;
+    p->next = 0;
+    p->cnt = TV_GAP;
+    if (mp->n == mp->max) {
+        if (mp->max) mp->max = mp->max<<1;
+        else mp->max = 256;
+        mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
+    }
+    mp->buf[mp->n] = p;
+    ++mp->n;
+}
+
+/* core part */
+struct __bam_lplbuf_t {
+ int max, n_cur, n_pre; // max: capacity of cur_level/pre_level; n_cur: reads at the current position; n_pre: levels carried over from the previous position
+ int max_level, *cur_level, *pre_level; // highest level in use; level of each read at the current / previous position
+ mempool_t *mp; // pool that recycles freenode_t objects
+ freenode_t **aux, *head, *tail; // aux: scratch array used to sort the list; head/tail: free-slot list, where tail is an always-present empty end node
+ int n_nodes, m_aux; // n_nodes: free slots in the list, not counting tail; m_aux: capacity of aux
+ bam_pileup_f func; // user callback invoked from tview_func()
+ void *user_data; // passed unchanged to func
+ bam_plbuf_t *plbuf; // underlying pileup buffer that drives tview_func()
+};
+
+/*
+ * Reset the buffer: reset the underlying pileup buffer, give every free-slot
+ * node except the tail back to the pool, and zero the level bookkeeping.
+ */
+void bam_lplbuf_reset(bam_lplbuf_t *buf)
+{
+    freenode_t *node, *after;
+    bam_plbuf_reset(buf->plbuf);
+    node = buf->head;
+    while (node->next) { // stops at the tail, which is kept
+        after = node->next;
+        mp_free(buf->mp, node);
+        node = after;
+    }
+    buf->head = buf->tail;
+    buf->n_nodes = 0;
+    buf->max_level = 0;
+    buf->n_pre = 0;
+    buf->n_cur = 0;
+}
+
+static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{ /* Pileup callback registered with bam_plbuf_init(): assigns a level to each
+   * of the n reads in pl[], stores it in pl[i].level, then forwards the call
+   * to the user callback tv->func.  A read keeps its level from the previous
+   * position; a read that starts here takes the free slot at the head of the
+   * list when that slot's countdown is 0, otherwise a new level.  The level
+   * of a read that ends here is appended to the free-slot list.
+   * data is the bam_lplbuf_t.  Always returns 0. */
+ bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
+ freenode_t *p;
+ int i, l, max_level;
+ // allocate memory if necessary
+ if (tv->max < n) { // enlarge
+ tv->max = n;
+ kroundup32(tv->max);
+ tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
+ tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
+ }
+ tv->n_cur = n;
+ // update cnt
+ for (p = tv->head; p->next; p = p->next)
+ if (p->cnt > 0) --p->cnt;
+ // calculate cur_level[]
+ max_level = 0;
+ for (i = l = 0; i < n; ++i) { // l counts the reads continued from the previous position
+ const bam_pileup1_t *p = pl + i; // shadows the freenode_t *p declared above
+ if (p->is_head) {
+ if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
+ freenode_t *p = tv->head->next;
+ tv->cur_level[i] = tv->head->level;
+ mp_free(tv->mp, tv->head);
+ tv->head = p;
+ --tv->n_nodes;
+ } else tv->cur_level[i] = ++tv->max_level;
+ } else {
+ tv->cur_level[i] = tv->pre_level[l++];
+ if (p->is_tail) { // then return a free slot
+ tv->tail->level = tv->cur_level[i];
+ tv->tail->next = mp_alloc(tv->mp); // the new node becomes the empty tail
+ tv->tail = tv->tail->next;
+ ++tv->n_nodes;
+ }
+ }
+ if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
+ ((bam_pileup1_t*)p)->level = tv->cur_level[i]; // const is cast away to publish the level to the callback
+ }
+ assert(l == tv->n_pre);
+ tv->func(tid, pos, n, pl, tv->user_data);
+ // sort the linked list
+ if (tv->n_nodes) {
+ freenode_t *q;
+ if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
+ tv->m_aux = tv->n_nodes + 1;
+ kroundup32(tv->m_aux);
+ tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
+ }
+ for (p = tv->head, i = l = 0; p->next;) {
+ if (p->level > max_level) { // then discard this entry
+ q = p->next;
+ mp_free(tv->mp, p);
+ p = q;
+ } else {
+ tv->aux[i++] = p;
+ p = p->next;
+ }
+ }
+ tv->aux[i] = tv->tail; // add a proper tail for the loop below
+ tv->n_nodes = i;
+ if (tv->n_nodes) {
+ ks_introsort(node, tv->n_nodes, tv->aux);
+ for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; // relink in sorted order
+ tv->head = tv->aux[0];
+ } else tv->head = tv->tail;
+ }
+ // clean up
+ tv->max_level = max_level;
+ memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); // NOTE(review): the literal 4 assumes sizeof(int) == 4
+ // squeeze out terminated levels
+ for (i = l = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (!p->is_tail)
+ tv->pre_level[l++] = tv->pre_level[i];
+ }
+ tv->n_pre = l;
+/*
+ fprintf(stderr, "%d\t", pos+1);
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->is_head) fprintf(stderr, "^");
+ if (p->is_tail) fprintf(stderr, "$");
+ fprintf(stderr, "%d,", p->level);
+ }
+ fprintf(stderr, "\n");
+*/
+ return 0;
+}
+
+/*
+ * Create a level-assigning pileup buffer.  func is invoked with data for each
+ * position after levels have been filled in.  The free-slot list starts as a
+ * single empty node that is both head and tail.
+ */
+bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
+{
+    bam_lplbuf_t *lp = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
+    lp->func = func;
+    lp->user_data = data;
+    lp->mp = mp_init();
+    lp->tail = mp_alloc(lp->mp);
+    lp->head = lp->tail;
+    lp->plbuf = bam_plbuf_init(tview_func, lp);
+    return lp;
+}
+
+/*
+ * Release everything owned by tv: the level arrays, the underlying pileup
+ * buffer, the sort scratch array, every list node and the pool.  Asserts that
+ * no node is still outstanding once the list has been returned to the pool.
+ */
+void bam_lplbuf_destroy(bam_lplbuf_t *tv)
+{
+    freenode_t *node, *after;
+    free(tv->cur_level);
+    free(tv->pre_level);
+    bam_plbuf_destroy(tv->plbuf);
+    free(tv->aux);
+    // return the whole list, tail included, to the pool
+    for (node = tv->head; node != 0; node = after) {
+        after = node->next; // saved first: mp_free() clears node->next
+        mp_free(tv->mp, node);
+    }
+    assert(tv->mp->cnt == 0);
+    mp_destroy(tv->mp);
+    free(tv);
+}
+
+/* Feed alignment b to the underlying pileup buffer and return its result. */
+int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
+{
+    int status = bam_plbuf_push(b, tv->plbuf);
+    return status;
+}
diff --git a/samtools-0.1.19/bam_mate.c b/samtools-0.1.19/bam_mate.c
new file mode 100644
index 0000000..b947c9d
--- /dev/null
+++ b/samtools-0.1.19/bam_mate.c
@@ -0,0 +1,128 @@
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "kstring.h"
+#include "bam.h"
+
+/*
+ * Build a "template CIGAR" for a pair in str and attach it as a CT:Z tag to
+ * whichever record has the smaller coordinate.
+ *
+ * The string is: segment index ('1' or '2'), strand ('F' or 'R') and CIGAR of
+ * the leftmost record, the gap between its end and the start of the other
+ * record followed by 'T', then the same three items for the other record.
+ * str is emptied first; nothing more happens when the two records are on
+ * different references or have no reference.
+ */
+void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
+{
+    bam1_t *left = b1, *right = b2;
+    uint32_t *ops;
+    int k, left_end;
+
+    str->l = 0;
+    if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return; // coordinateless or not on the same chr; skip
+    if (b1->core.pos > b2->core.pos) { // make sure left has the smaller coordinate
+        left = b2;
+        right = b1;
+    }
+
+    // leftmost segment
+    kputc((left->core.flag & BAM_FREAD1)? '1' : '2', str);
+    kputc((left->core.flag & BAM_FREVERSE)? 'R' : 'F', str);
+    ops = bam1_cigar(left);
+    for (k = 0; k < left->core.n_cigar; ++k) {
+        kputw(bam_cigar_oplen(ops[k]), str);
+        kputc(bam_cigar_opchr(ops[k]), str);
+    }
+
+    // gap between the two segments
+    left_end = bam_calend(&left->core, ops);
+    kputw(right->core.pos - left_end, str);
+    kputc('T', str);
+
+    // other segment
+    kputc((right->core.flag & BAM_FREAD1)? '1' : '2', str);
+    kputc((right->core.flag & BAM_FREVERSE)? 'R' : 'F', str);
+    ops = bam1_cigar(right);
+    for (k = 0; k < right->core.n_cigar; ++k) {
+        kputw(bam_cigar_oplen(ops[k]), str);
+        kputc(bam_cigar_opchr(ops[k]), str);
+    }
+
+    bam_aux_append(left, "CT", 'Z', str->l+1, (uint8_t*)str->s);
+}
+
+// currently, this function ONLY works if each read has one hit
+void bam_mating_core(bamFile in, bamFile out, int remove_reads)
+{ /* Copy the header from in to out, then stream the records, pairing each
+   * record with the previous one when their query names are identical and
+   * filling in the mate fields (mtid, mpos, isize and the mate flags) of
+   * both.  Reference-less and secondary records are passed through, or
+   * dropped when remove_reads is set.  b[0] and b[1] alternate as the
+   * current and the previous record. */
+ bam_header_t *header;
+ bam1_t *b[2];
+ int curr, has_prev, pre_end = 0, cur_end;
+ kstring_t str;
+
+ str.l = str.m = 0; str.s = 0;
+ header = bam_header_read(in);
+ bam_header_write(out, header);
+
+ b[0] = bam_init1();
+ b[1] = bam_init1();
+ curr = 0; has_prev = 0;
+ while (bam_read1(in, b[curr]) >= 0) {
+ bam1_t *cur = b[curr], *pre = b[1-curr];
+ if (cur->core.tid < 0) // no reference: never considered for pairing
+ {
+ if ( !remove_reads ) bam_write1(out, cur);
+ continue;
+ }
+ cur_end = bam_calend(&cur->core, bam1_cigar(cur));
+ if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; // alignment runs past the reference end: flag it unmapped
+ if (cur->core.flag & BAM_FSECONDARY)
+ {
+ if ( !remove_reads ) bam_write1(out, cur);
+ continue; // skip secondary alignments
+ }
+ if (has_prev) {
+ if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
+ cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
+ pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
+ if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
+ && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE
+ {
+ uint32_t cur5, pre5; // 5' coordinate: alignment end on the reverse strand, start otherwise
+ cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
+ pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
+ cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
+ } else cur->core.isize = pre->core.isize = 0;
+ if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
+ else cur->core.flag &= ~BAM_FMREVERSE;
+ if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
+ else pre->core.flag &= ~BAM_FMREVERSE;
+ if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
+ if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
+ bam_template_cigar(pre, cur, &str);
+ bam_write1(out, pre);
+ bam_write1(out, cur);
+ has_prev = 0; // both records are written; start afresh
+ } else { // unpaired or singleton
+ pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
+ if (pre->core.flag & BAM_FPAIRED) {
+ pre->core.flag |= BAM_FMUNMAP;
+ pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
+ }
+ bam_write1(out, pre); // has_prev stays 1: cur becomes the pending record
+ }
+ } else has_prev = 1;
+ curr = 1 - curr; // swap the roles of the two buffers
+ pre_end = cur_end;
+ }
+ if (has_prev) bam_write1(out, b[1-curr]); // flush the record still pending at end of input, as is
+ bam_header_destroy(header);
+ bam_destroy1(b[0]);
+ bam_destroy1(b[1]);
+ free(str.s);
+}
+
+/* Print the fixmate help text on stderr and terminate with exit status 1. */
+void usage()
+{
+    fputs("Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n", stderr);
+    fputs("Options:\n", stderr);
+    fputs(" -r remove unmapped reads and secondary alignments\n", stderr);
+    exit(1);
+}
+
+/*
+ * Entry point of "samtools fixmate [-r] <in.nameSrt.bam> <out.nameSrt.bam>".
+ * "-" selects stdin or stdout.  -r is passed to bam_mating_core() as
+ * remove_reads.  Returns 0 on success and 1 when either file cannot be opened;
+ * a missing argument ends the process through usage().
+ */
+int bam_mating(int argc, char *argv[])
+{
+    bamFile in, out;
+    int c, remove_reads=0;
+    while ((c = getopt(argc, argv, "r")) >= 0) {
+        switch (c) {
+        case 'r': remove_reads=1; break;
+        }
+    }
+    if (optind+1 >= argc) usage();
+    in = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
+    if (in == 0) { // do not hand a null handle to bam_mating_core()
+        fprintf(stderr, "[bam_mating] fail to open \"%s\" for reading.\n", argv[optind]);
+        return 1;
+    }
+    out = (strcmp(argv[optind+1], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[optind+1], "w");
+    if (out == 0) {
+        fprintf(stderr, "[bam_mating] fail to open \"%s\" for writing.\n", argv[optind+1]);
+        bam_close(in);
+        return 1;
+    }
+    bam_mating_core(in, out, remove_reads);
+    bam_close(in); bam_close(out);
+    return 0;
+}
+
+
diff --git a/samtools-0.1.19/bam_md.c b/samtools-0.1.19/bam_md.c
new file mode 100644
index 0000000..ce40a12
--- /dev/null
+++ b/samtools-0.1.19/bam_md.c
@@ -0,0 +1,389 @@
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+#include "faidx.h"
+#include "sam.h"
+#include "kstring.h"
+#include "kaln.h"
+#include "kprobaln.h"
+
+#define USE_EQUAL 1 // -e: blank the base nibble of bases that match the reference ('=' per the usage text)
+#define DROP_TAG 2 // -d: drop all tags but RG
+#define BIN_QUAL 4 // -q: reduce the resolution of base quality
+#define UPDATE_NM 8 // recompute the NM tag; on by default, cleared by -N
+#define UPDATE_MD 16 // recompute the MD tag; on by default, cleared by -N
+#define HASH_QNM 32 // -h: set by the option parser; not consulted by the code visible in this file section
+
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; // 4-bit base code -> 0..3 for codes 1, 2, 4, 8; every other code -> 4
+
+int bam_aux_drop_other(bam1_t *b, uint8_t *s);
+
+/*
+ * Recompute the MD string and the NM edit distance of alignment b against ref
+ * and apply the edits selected by flag.
+ *
+ *   b       alignment, modified in place
+ *   ref     NUL-terminated sequence of the reference b is aligned to; scanning
+ *           stops at the terminating NUL
+ *   flag    bitwise OR of USE_EQUAL, DROP_TAG, BIN_QUAL, UPDATE_NM, UPDATE_MD
+ *   max_nm  when > 0 and NM >= max_nm, matching bases are overwritten with
+ *           code 15 and their qualities set to 0
+ *
+ * For an unmapped read with UPDATE_NM or UPDATE_MD requested, the tag updates
+ * and every later step (DROP_TAG, BIN_QUAL) are skipped.  The scratch string is
+ * released on every exit path.
+ */
+void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
+{
+    uint8_t *seq = bam1_seq(b);
+    uint32_t *cigar = bam1_cigar(b);
+    bam1_core_t *c = &b->core;
+    int i, x, y, u = 0; // x: reference position, y: query position, u: current run of matches
+    kstring_t *str;
+    int32_t old_nm_i = -1, nm = 0;
+
+    str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            for (j = 0; j < l; ++j) {
+                int z = y + j;
+                int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+                if (ref[x+j] == 0) break; // out of boundary
+                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+                    ++u;
+                } else {
+                    kputw(u, str); kputc(ref[x+j], str);
+                    u = 0; ++nm;
+                }
+            }
+            if (j < l) break;
+            x += l; y += l;
+        } else if (op == BAM_CDEL) {
+            kputw(u, str); kputc('^', str);
+            for (j = 0; j < l; ++j) {
+                if (ref[x+j] == 0) break;
+                kputc(ref[x+j], str);
+            }
+            u = 0;
+            if (j < l) break;
+            x += l; nm += l;
+        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+            y += l;
+            if (op == BAM_CINS) nm += l;
+        } else if (op == BAM_CREF_SKIP) {
+            x += l;
+        }
+    }
+    kputw(u, str);
+    // apply max_nm
+    if (max_nm > 0 && nm >= max_nm) {
+        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                for (j = 0; j < l; ++j) {
+                    int z = y + j;
+                    int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+                    if (ref[x+j] == 0) break; // out of boundary
+                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
+                        bam1_qual(b)[z] = 0;
+                    }
+                }
+                if (j < l) break;
+                x += l; y += l;
+            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
+            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+        }
+    }
+    // update NM
+    if (flag & UPDATE_NM) {
+        uint8_t *old_nm = bam_aux_get(b, "NM");
+        if (c->flag & BAM_FUNMAP) goto done; // was a bare return that leaked str
+        if (old_nm) old_nm_i = bam_aux2i(old_nm);
+        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+        else if (nm != old_nm_i) {
+            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+            bam_aux_del(b, old_nm);
+            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+        }
+    }
+    // update MD
+    if (flag & UPDATE_MD) {
+        uint8_t *old_md = bam_aux_get(b, "MD");
+        if (c->flag & BAM_FUNMAP) goto done; // was a bare return that leaked str
+        if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+        else {
+            int is_diff = 0;
+            if (strlen((char*)old_md+1) == str->l) {
+                for (i = 0; i < str->l; ++i)
+                    if (toupper(old_md[i+1]) != toupper(str->s[i]))
+                        break;
+                if (i < str->l) is_diff = 1;
+            } else is_diff = 1;
+            if (is_diff) {
+                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+                bam_aux_del(b, old_md);
+                bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+            }
+        }
+    }
+    // drop all tags but RG
+    if (flag&DROP_TAG) {
+        uint8_t *q = bam_aux_get(b, "RG");
+        bam_aux_drop_other(b, q);
+    }
+    // reduce the resolution of base quality
+    if (flag&BIN_QUAL) {
+        uint8_t *qual = bam1_qual(b);
+        for (i = 0; i < b->core.l_qseq; ++i)
+            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
+    }
+done:
+    free(str->s); free(str);
+}
+
+/* Same as bam_fillmd1_core() with the max_nm masking switched off (max_nm = 0). */
+void bam_fillmd1(bam1_t *b, char *ref, int flag)
+{
+    const int no_nm_cap = 0;
+    bam_fillmd1_core(b, ref, flag, no_nm_cap);
+}
+
+int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
+{ /* Compute a cap for the mapping quality of b from its mismatches against
+   * ref (NUL-terminated; scanning stops at the NUL) and from its clipped
+   * bases.  thres < 0 selects the default of 40.
+   * Returns -1 when the penalty t exceeds thres, otherwise
+   * sqrt((thres - t) / thres) * thres rounded to an int. */
+ uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
+ uint32_t *cigar = bam1_cigar(b);
+ bam1_core_t *c = &b->core;
+ int i, x, y, mm, q, len, clip_l, clip_q; // mm: mismatches; q: sum of their qualities; len: length term; clip_*: clipped length / quality sum
+ double t;
+ if (thres < 0) thres = 40; // set the default
+ mm = q = len = clip_l = clip_q = 0;
+ for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { // x: reference position, y: query position
+ int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ for (j = 0; j < l; ++j) {
+ int z = y + j;
+ int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+ if (ref[x+j] == 0) break; // out of boundary
+ if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
+ ++len;
+ if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
+ ++mm;
+ q += qual[z] > 33? 33 : qual[z]; // each mismatch contributes at most 33
+ }
+ }
+ }
+ if (j < l) break;
+ x += l; y += l; len += l; // NOTE(review): len was already incremented per base above; confirm the second addition is intended
+ } else if (op == BAM_CDEL) {
+ for (j = 0; j < l; ++j)
+ if (ref[x+j] == 0) break;
+ if (j < l) break;
+ x += l;
+ } else if (op == BAM_CSOFT_CLIP) {
+ for (j = 0; j < l; ++j) clip_q += qual[y+j];
+ clip_l += l;
+ y += l;
+ } else if (op == BAM_CHARD_CLIP) {
+ clip_q += 13 * l; // hard-clipped bases have no stored quality; 13 is charged per base
+ clip_l += l;
+ } else if (op == BAM_CINS) y += l;
+ else if (op == BAM_CREF_SKIP) x += l;
+ }
+ for (i = 0, t = 1; i < mm; ++i) // t = len^mm / mm!
+ t *= (double)len / (i+1);
+ t = q - 4.343 * log(t) + clip_q / 5.;
+ if (t > thres) return -1;
+ if (t < 0) t = 0;
+ t = sqrt((thres - t) / thres) * thres;
+// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
+ return (int)(t + .499);
+}
+
+int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
+{ /* Compute per-base alignment quality (BAQ) for b against ref with
+   * kpa_glocal() and either store it as a BQ tag or apply it to the base
+   * qualities and store a ZQ tag.
+   * flag bits: 1 = apply BAQ to qual[] (apply_baq), 2 = extended BAQ
+   * (extend_baq), 4 = discard an existing BQ tag and recompute (redo_baq).
+   * Returns -1 when nothing is done (unmapped read, empty sequence, or a
+   * reference skip in the CIGAR), -3 when an existing tag already matches
+   * the requested state, and 0 otherwise. */
+ int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
+ uint32_t *cigar = bam1_cigar(b);
+ bam1_core_t *c = &b->core;
+ kpa_par_t conf = kpa_par_def;
+ uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
+ if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
+ // test if BQ or ZQ is present
+ if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; // step past the type byte to the string itself
+ if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
+ if (bq && redo_baq)
+ {
+ bam_aux_del(b, bq-1);
+ bq = 0;
+ }
+ if (bq && zq) { // remove the ZQ tag
+ bam_aux_del(b, zq-1);
+ zq = 0;
+ }
+ if (bq || zq) {
+ if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
+ if (bq && apply_baq) { // then convert BQ to ZQ
+ for (i = 0; i < c->l_qseq; ++i)
+ qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
+ *(bq - 3) = 'Z'; // overwrite the first letter of the tag name: "BQ" -> "ZQ"
+ } else if (zq && !apply_baq) { // then convert ZQ to BQ
+ for (i = 0; i < c->l_qseq; ++i)
+ qual[i] += (int)zq[i] - 64;
+ *(zq - 3) = 'B'; // overwrite the first letter of the tag name: "ZQ" -> "BQ"
+ }
+ return 0;
+ }
+ // find the start and end of the alignment
+ x = c->pos, y = 0, yb = ye = xb = xe = -1;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op, l;
+ op = cigar[k]&0xf; l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ if (yb < 0) yb = y;
+ if (xb < 0) xb = x;
+ ye = y + l; xe = x + l;
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
+ }
+ // set bandwidth and the start and the end
+ bw = 7;
+ if (abs((xe - xb) - (ye - yb)) > bw)
+ bw = abs((xe - xb) - (ye - yb)) + 3;
+ conf.bw = bw;
+ xb -= yb + bw/2; if (xb < 0) xb = 0;
+ xe += c->l_qseq - ye + bw/2;
+ if (xe - xb - c->l_qseq > bw)
+ xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
+ { // glocal
+ uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq; // this bq shadows the tag pointer declared above
+ int *state;
+ bq = calloc(c->l_qseq + 1, 1);
+ memcpy(bq, qual, c->l_qseq);
+ s = calloc(c->l_qseq, 1);
+ for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
+ r = calloc(xe - xb, 1);
+ for (i = xb; i < xe; ++i) {
+ if (ref[i] == 0) { xe = i; break; } // the window is cut at the end of ref
+ r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
+ }
+ state = calloc(c->l_qseq, sizeof(int));
+ q = calloc(c->l_qseq, 1);
+ kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
+ if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
+ for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k]&0xf, l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ for (i = y; i < y + l; ++i) {
+ if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; // state disagrees with the CIGAR placement of this base
+ else bq[i] = bq[i] < q[i]? bq[i] : q[i];
+ }
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ }
+ for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
+ } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
+ uint8_t *left, *rght; // running maxima of bq[] from the left / right end of each match block
+ left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
+ for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k]&0xf, l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ for (i = y; i < y + l; ++i)
+ bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
+ for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
+ left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
+ for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
+ rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
+ for (i = y; i < y + l; ++i)
+ bq[i] = left[i] < rght[i]? left[i] : rght[i];
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ }
+ for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
+ free(left); free(rght);
+ }
+ if (apply_baq) {
+ for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
+ bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
+ } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
+ free(bq); free(s); free(r); free(q); free(state);
+ }
+ return 0;
+}
+
+/* Run bam_prob_realn_core() with flag 1, i.e. with BAQ applied to the base qualities. */
+int bam_prob_realn(bam1_t *b, const char *ref)
+{
+    const int apply_baq_only = 1;
+    return bam_prob_realn_core(b, ref, apply_baq_only);
+}
+
+/*
+ * Entry point of "samtools fillmd [options] <aln.bam> <ref.fasta>".
+ *
+ * Reads alignments, fetches the reference sequence of each record's target
+ * from the FASTA index, optionally runs BAQ (-r, with -A / -E), caps the
+ * mapping quality (-C) and regenerates MD/NM, then writes every record to
+ * standard output.  A record whose reference sequence is missing is written
+ * unchanged.
+ * Returns 0 on success and 1 on bad usage or when the input, the output or
+ * the reference cannot be opened.
+ */
+int bam_fillmd(int argc, char *argv[])
+{
+    int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+    samfile_t *fp, *fpout = 0;
+    faidx_t *fai;
+    char *ref = 0, mode_w[8], mode_r[8];
+    bam1_t *b;
+
+    flt_flag = UPDATE_NM | UPDATE_MD;
+    is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
+    mode_w[0] = mode_r[0] = 0;
+    strcpy(mode_r, "r"); strcpy(mode_w, "w");
+    while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
+        switch (c) {
+        case 'r': is_realn = 1; break;
+        case 'e': flt_flag |= USE_EQUAL; break;
+        case 'd': flt_flag |= DROP_TAG; break;
+        case 'q': flt_flag |= BIN_QUAL; break;
+        case 'h': flt_flag |= HASH_QNM; break;
+        case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
+        case 'b': is_bam_out = 1; break;
+        case 'u': is_uncompressed = is_bam_out = 1; break;
+        case 'S': is_sam_in = 1; break;
+        case 'n': max_nm = atoi(optarg); break;
+        case 'C': capQ = atoi(optarg); break;
+        case 'A': baq_flag |= 1; break;
+        case 'E': baq_flag |= 2; break;
+        default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+        }
+    }
+    if (!is_sam_in) strcat(mode_r, "b");
+    if (is_bam_out) strcat(mode_w, "b");
+    else strcat(mode_w, "h");
+    if (is_uncompressed) strcat(mode_w, "u");
+    if (optind + 1 >= argc) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
+        fprintf(stderr, "Options: -e change identical bases to '='\n");
+        fprintf(stderr, " -u uncompressed BAM output (for piping)\n");
+        fprintf(stderr, " -b compressed BAM output\n");
+        fprintf(stderr, " -S the input is SAM with header\n");
+        fprintf(stderr, " -A modify the quality string\n");
+        fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
+        fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n");
+        return 1;
+    }
+    fp = samopen(argv[optind], mode_r, 0);
+    if (fp == 0) return 1;
+    if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
+        fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
+        return 1;
+    }
+    fpout = samopen("-", mode_w, fp->header);
+    if (fpout == 0) { // samwrite() must not be handed a null stream
+        fprintf(stderr, "[bam_fillmd] fail to open the output stream.\n");
+        samclose(fp);
+        return 1;
+    }
+    fai = fai_load(argv[optind+1]);
+    if (fai == 0) { // fai_fetch() must not be handed a null index
+        fprintf(stderr, "[bam_fillmd] fail to load the reference '%s'.\n", argv[optind+1]);
+        samclose(fp); samclose(fpout);
+        return 1;
+    }
+
+    b = bam_init1();
+    while ((ret = samread(fp, b)) >= 0) {
+        if (b->core.tid >= 0) {
+            if (tid != b->core.tid) { // new target: replace the cached reference sequence
+                free(ref);
+                ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
+                tid = b->core.tid;
+                if (ref == 0)
+                    fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
+                            fp->header->target_name[tid]);
+            }
+            // every step below reads ref[], so all of them need the sequence
+            if (ref) {
+                if (is_realn) bam_prob_realn_core(b, ref, baq_flag);
+                if (capQ > 10) {
+                    int q = bam_cap_mapQ(b, ref, capQ);
+                    if (b->core.qual > q) b->core.qual = q;
+                }
+                bam_fillmd1_core(b, ref, flt_flag, max_nm);
+            }
+        }
+        samwrite(fpout, b);
+    }
+    bam_destroy1(b);
+
+    free(ref);
+    fai_destroy(fai);
+    samclose(fp); samclose(fpout);
+    return 0;
+}
diff --git a/samtools-0.1.19/bam_pileup.c b/samtools-0.1.19/bam_pileup.c
new file mode 100644
index 0000000..57434e0
--- /dev/null
+++ b/samtools-0.1.19/bam_pileup.c
@@ -0,0 +1,437 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include "sam.h"
+
+/* Cached CIGAR-walking state of one buffered read (consumed by resolve_cigar2). */
+typedef struct {
+ int k, x, y, end; /* k: index of the CIGAR op last processed (-1: none yet); x, y: reference / query coordinate at the start of op k; end: set by bam_plp_push() to the node's end - 1 and compared with the pileup position to flag is_tail */
+} cstate_t;
+
+/* Initial state copied into every newly pushed read; k == -1 means "never processed". */
+static cstate_t g_cstate_null = { -1, 0, 0, 0 };
+
+/* Singly linked list node holding one buffered alignment. */
+typedef struct __linkbuf_t {
+ bam1_t b; /* private copy of the alignment (filled by bam_copy1 in bam_plp_push) */
+ uint32_t beg, end; /* beg = b.core.pos, end = bam_calend() of the alignment; both set in bam_plp_push() */
+ cstate_t s; /* CIGAR traversal state for this read */
+ struct __linkbuf_t *next; /* following node; reset to 0 by mp_free() */
+} lbnode_t;
+
+/* --- BEGIN: Memory pool */
+
+/* Free-list pool of lbnode_t objects. */
+typedef struct {
+ int cnt, n, max; /* cnt: nodes currently handed out (++ in mp_alloc, -- in mp_free); n: recycled nodes stored in buf; max: capacity of buf */
+ lbnode_t **buf; /* stack of recycled nodes, grown by doubling in mp_free() */
+} mempool_t;
+
+/* Create an empty node pool. All counters and the slot array start zeroed
+ * because the object comes straight from calloc(); the calloc() result is
+ * returned unchecked. */
+static mempool_t *mp_init()
+{
+    return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+/* Free every node parked in the pool together with the record buffer each
+ * node owns, then the slot array and the pool object itself. Nodes that are
+ * still handed out (not returned through mp_free) are not touched. */
+static void mp_destroy(mempool_t *mp)
+{
+    int i = 0;
+    while (i < mp->n) {
+        lbnode_t *node = mp->buf[i];
+        free(node->b.data);
+        free(node);
+        ++i;
+    }
+    free(mp->buf);
+    free(mp);
+}
+/* Hand out one node: reuse the most recently recycled node when the pool
+ * holds any, otherwise calloc() a fresh zeroed one. The outstanding-node
+ * counter is bumped in either case. */
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+    lbnode_t *node;
+    mp->cnt += 1;
+    if (mp->n != 0) {
+        mp->n -= 1;
+        node = mp->buf[mp->n];
+    } else {
+        node = (lbnode_t*)calloc(1, sizeof(lbnode_t));
+    }
+    return node;
+}
+/* Return node p to the pool for later reuse.
+ *
+ * The outstanding-node counter is decremented and p->next is cleared. The
+ * slot array doubles (starting at 256 entries) when it is full. If that
+ * growth fails, the node is released outright instead of being recycled, so
+ * the pool stays consistent and nothing is written through a NULL array. */
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+    --mp->cnt; p->next = 0; // clear lbnode_t::next here
+    if (mp->n == mp->max) {
+        int new_max = mp->max? mp->max<<1 : 256;
+        lbnode_t **grown = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * new_max);
+        if (grown == 0) { // cannot park the node: free it; mp->buf is still valid
+            free(p->b.data);
+            free(p);
+            return;
+        }
+        mp->buf = grown;
+        mp->max = new_max;
+    }
+    mp->buf[mp->n++] = p;
+}
+
+/* --- END: Memory pool */
+
+/* --- BEGIN: Auxiliary functions */
+
+/* s->k: the index of the CIGAR operator that has just been processed.
+ s->x: the reference coordinate of the start of s->k
+ s->y: the query coordinate of the start of s->k
+ */
+/* Fill the per-position fields of pileup entry p (qpos, indel, is_del,
+ * is_refskip, is_head, is_tail) for reference position pos, using and
+ * updating the cached CIGAR state s of the read p->b.
+ *
+ * On the first call for a read (s->k == -1) the first M/D/=/X operation is
+ * located; on later calls the state advances to the next M/D/N/=/X operation
+ * once pos has moved past the current one. Always returns 1. */
+static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
+{
+#define _cop(c) ((c)&BAM_CIGAR_MASK)
+#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
+
+ bam1_t *b = p->b;
+ bam1_core_t *c = &b->core;
+ uint32_t *cigar = bam1_cigar(b);
+ int k, is_head = 0;
+ // determine the current CIGAR operation
+// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y);
+ if (s->k == -1) { // never processed
+ is_head = 1;
+ if (c->n_cigar == 1) { // just one operation, save a loop
+ // NOTE(review): if the single op is not M/=/X, s->k stays -1 and cigar[s->k] is read below -- confirm callers never pass such reads
+ if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
+ } else { // find the first match or deletion
+ for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
+ int op = _cop(cigar[k]);
+ int l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+ else if (op == BAM_CREF_SKIP) s->x += l;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+ }
+ assert(k < c->n_cigar);
+ s->k = k;
+ }
+ } else { // the read has been processed before
+ int op, l = _cln(cigar[s->k]);
+ if (pos - s->x >= l) { // jump to the next operation
+ // NOTE(review): the assert checks s->k, but the next line reads cigar[s->k+1]; this relies on the function not being called past the last operation
+ assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
+ op = _cop(cigar[s->k+1]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
+ s->x += l;
+ ++s->k;
+ } else { // find the next M/D/N/=/X
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
+ s->x += l;
+ for (k = s->k + 1; k < c->n_cigar; ++k) {
+ op = _cop(cigar[k]), l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+ }
+ s->k = k;
+ }
+ assert(s->k < c->n_cigar); // otherwise a bug
+ } // else, do nothing
+ }
+ { // collect pileup information
+ int op, l;
+ op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
+ p->is_del = p->indel = p->is_refskip = 0;
+ // an indel is reported at the last reference position of the current operation
+ if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
+ int op2 = _cop(cigar[s->k+1]);
+ int l2 = _cln(cigar[s->k+1]);
+ if (op2 == BAM_CDEL) p->indel = -(int)l2;
+ else if (op2 == BAM_CINS) p->indel = l2;
+ else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
+ int l3 = 0;
+ // sum the insertion lengths that follow the pad, up to the next M/D/N/=/X
+ for (k = s->k + 2; k < c->n_cigar; ++k) {
+ op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
+ if (op2 == BAM_CINS) l3 += l2;
+ else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
+ }
+ if (l3 > 0) p->indel = l3;
+ }
+ }
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ p->qpos = s->y + (pos - s->x);
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+ p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
+ p->is_refskip = (op == BAM_CREF_SKIP);
+ } // cannot be other operations; otherwise a bug
+ // note: the local is_head computed above is not used here; the flag is derived from pos
+ p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
+ }
+ return 1;
+}
+
+/* --- END: Auxiliary functions */
+
+/*******************
+ * pileup iterator *
+ *******************/
+
+/* State of a single-input pileup iterator. */
+struct __bam_plp_t {
+ mempool_t *mp; /* pool that all list nodes come from */
+ lbnode_t *head, *tail, *dummy; /* head..tail: buffered reads; tail is the empty node the next pushed read is copied into; dummy: scratch predecessor of head used by bam_plp_next() */
+ int32_t tid, pos, max_tid, max_pos; /* tid/pos: position the next pileup is produced for; max_tid/max_pos: target and start of the last pushed read */
+ int is_eof, flag_mask, max_plp, error, maxcnt; /* is_eof: set by bam_plp_push(iter, 0); flag_mask: reads with any of these flag bits are dropped on push; max_plp: capacity of plp; error: sticky, set on unsorted input; maxcnt: node-count limit tested in bam_plp_push() */
+ bam_pileup1_t *plp; /* output array handed back by bam_plp_next() */
+ // for the "auto" interface only
+ bam1_t *b; /* record buffer passed to func */
+ bam_plp_auto_f func; /* reader callback; a negative return value ends reading */
+ void *data; /* opaque first argument of func */
+};
+
+/* Allocate a pileup iterator.
+ *
+ * The read list starts as a single empty node that is both head and tail,
+ * plus a separate scratch "dummy" node. Defaults: BAM_DEF_MASK as the flag
+ * mask and a node-count limit of 8000. When func is non-NULL the iterator is
+ * prepared for the "auto" interface: func/data are stored and a record
+ * buffer is created with bam_init1(). */
+bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
+{
+    bam_plp_t it = calloc(1, sizeof(struct __bam_plp_t));
+    it->mp = mp_init();
+    it->tail = mp_alloc(it->mp);
+    it->head = it->tail;
+    it->dummy = mp_alloc(it->mp);
+    it->max_pos = -1;
+    it->max_tid = -1;
+    it->flag_mask = BAM_DEF_MASK;
+    it->maxcnt = 8000;
+    if (func == 0) return it;
+    it->func = func;
+    it->data = data;
+    it->b = bam_init1();
+    return it;
+}
+
+/* Tear down an iterator created by bam_plp_init().
+ *
+ * The dummy and head nodes are handed back to the pool first; if the pool
+ * then still counts outstanding nodes, a warning is printed to stderr. The
+ * pool, the optional record buffer, the pileup array and the iterator itself
+ * are released afterwards. */
+void bam_plp_destroy(bam_plp_t iter)
+{
+    mempool_t *pool = iter->mp;
+    mp_free(pool, iter->dummy);
+    mp_free(pool, iter->head);
+    if (pool->cnt != 0)
+        fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", pool->cnt);
+    mp_destroy(pool);
+    if (iter->b != 0) bam_destroy1(iter->b);
+    free(iter->plp);
+    free(iter);
+}
+
+/* Produce the next non-empty pileup column from the reads buffered so far.
+ *
+ * Returns iter->plp (with *_tid, *_pos and *_n_plp filled in) when a column
+ * with at least one read is available. Returns 0 otherwise: *_n_plp is -1 if
+ * the iterator is in the error state or unsorted input is detected here, and
+ * 0 when more reads must be pushed or the input is exhausted.
+ *
+ * A column can only be emitted while the last pushed read starts beyond the
+ * current position (or EOF has been signalled). */
+const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+ if (iter->error) { *_n_plp = -1; return 0; }
+ *_n_plp = 0;
+ if (iter->is_eof && iter->head->next == 0) return 0;
+ while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
+ int n_plp = 0;
+ lbnode_t *p, *q;
+ // write iter->plp at iter->pos
+ iter->dummy->next = iter->head;
+ // p walks the list, q trails one node behind; the loop stops at the empty tail node (p->next == 0)
+ for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
+ if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
+ q->next = p->next; mp_free(iter->mp, p); p = q;
+ } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
+ if (n_plp == iter->max_plp) { // then double the capacity
+ iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
+ iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
+ }
+ iter->plp[n_plp].b = &p->b;
+ if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
+ }
+ }
+ iter->head = iter->dummy->next; // dummy->next may be changed
+ *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
+ // update iter->tid and iter->pos
+ if (iter->head->next) {
+ if (iter->tid > iter->head->b.core.tid) {
+ fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
+ iter->error = 1;
+ *_n_plp = -1;
+ return 0;
+ }
+ }
+ if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
+ iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
+ } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
+ iter->pos = iter->head->beg; // jump to the next position
+ } else ++iter->pos; // scan contiguously
+ // return
+ if (n_plp) return iter->plp;
+ if (iter->is_eof && iter->head->next == 0) break;
+ }
+ return 0;
+}
+
+/* Feed one alignment to the iterator, or signal end of input with b == 0.
+ *
+ * Reads without a target, reads matching the flag mask, and reads starting
+ * at the current position once the node count exceeds maxcnt are silently
+ * ignored. The record is copied into the tail node before the sort-order
+ * checks run. A new empty tail is appended only when the read can still
+ * contribute (it ends after the current position or lies on a later target).
+ *
+ * Returns 0 on success, or -1 when the iterator is already in the error
+ * state or the input turns out to be unsorted (the error flag is then set). */
+int bam_plp_push(bam_plp_t iter, const bam1_t *b)
+{
+    lbnode_t *node;
+    if (iter->error) return -1;
+    if (b == 0) {
+        iter->is_eof = 1;
+        return 0;
+    }
+    if (b->core.tid < 0) return 0;
+    if (b->core.flag & iter->flag_mask) return 0;
+    if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0;
+    node = iter->tail;
+    bam_copy1(&node->b, b);
+    node->beg = b->core.pos;
+    node->end = bam_calend(&b->core, bam1_cigar(b));
+    node->s = g_cstate_null;
+    node->s.end = node->end - 1; // initialize cstate_t
+    if (b->core.tid < iter->max_tid) {
+        fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+        iter->error = 1;
+        return -1;
+    }
+    if ((b->core.tid == iter->max_tid) && (node->beg < iter->max_pos)) {
+        fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+        iter->error = 1;
+        return -1;
+    }
+    iter->max_tid = b->core.tid;
+    iter->max_pos = node->beg;
+    if (node->end > iter->pos || node->b.core.tid > iter->tid) {
+        node->next = mp_alloc(iter->mp);
+        iter->tail = node->next;
+    }
+    return 0;
+}
+
+/* "Auto" interface: return the next pileup column, pulling alignments
+ * through the callback registered in bam_plp_init() as needed.
+ *
+ * Returns 0 with *_n_plp == -1 when no callback is set, the iterator is in
+ * the error state, or a push fails. Returns 0 with *_n_plp == 0 when the
+ * input is exhausted. A negative callback result is treated as end of input. */
+const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+    const bam_pileup1_t *col;
+    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
+    col = bam_plp_next(iter, _tid, _pos, _n_plp);
+    if (col != 0) return col;
+    // no pileup line can be obtained; read alignments
+    *_n_plp = 0;
+    if (iter->is_eof) return 0;
+    for (;;) {
+        if (iter->func(iter->data, iter->b) < 0) break;
+        if (bam_plp_push(iter, iter->b) < 0) {
+            *_n_plp = -1;
+            return 0;
+        }
+        col = bam_plp_next(iter, _tid, _pos, _n_plp);
+        if (col != 0) return col;
+        // otherwise no pileup line can be returned; read the next alignment.
+    }
+    bam_plp_push(iter, 0); // flag end of input, then drain what is buffered
+    return bam_plp_next(iter, _tid, _pos, _n_plp);
+}
+
+/* Rewind the iterator: forget the last pushed coordinates, reset the current
+ * position to target 0 / position 0, clear the EOF flag and recycle every
+ * buffered read, leaving only the empty tail node as the list. */
+void bam_plp_reset(bam_plp_t iter)
+{
+    lbnode_t *cur = iter->head;
+    iter->max_tid = iter->max_pos = -1;
+    iter->tid = iter->pos = 0;
+    iter->is_eof = 0;
+    while (cur->next) {
+        lbnode_t *following = cur->next; // fetch before mp_free() clears cur->next
+        mp_free(iter->mp, cur);
+        cur = following;
+    }
+    iter->head = iter->tail;
+}
+
+/* Choose which flag bits cause a read to be dropped on push. A negative
+ * mask restores BAM_DEF_MASK; any other value is OR-ed with BAM_FUNMAP. */
+void bam_plp_set_mask(bam_plp_t iter, int mask)
+{
+    if (mask < 0)
+        iter->flag_mask = BAM_DEF_MASK;
+    else
+        iter->flag_mask = (BAM_FUNMAP | mask);
+}
+
+/* Set the node-count limit that bam_plp_push() tests before buffering a
+ * further read that starts at the current position. */
+void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
+{
+ iter->maxcnt = maxcnt;
+}
+
+/*****************
+ * callback APIs *
+ *****************/
+
+/* Callback-style pileup over a whole BAM stream.
+ *
+ * Every record read from fp is pushed into a pileup buffer configured with
+ * mask; func is invoked (with func_data) for each column produced. Reading
+ * stops at the first negative bam_read1() result. Push results are not
+ * inspected and the function always returns 0. */
+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
+{
+    bam1_t *rec = bam_init1();
+    bam_plbuf_t *pl = bam_plbuf_init(func, func_data);
+    bam_plbuf_set_mask(pl, mask);
+    for (;;) {
+        if (bam_read1(fp, rec) < 0) break;
+        bam_plbuf_push(rec, pl);
+    }
+    bam_plbuf_push(0, pl); // flush: signal end of input
+    bam_plbuf_destroy(pl);
+    bam_destroy1(rec);
+    return 0;
+}
+
+/* Forward the flag mask to the underlying iterator (see bam_plp_set_mask). */
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+ bam_plp_set_mask(buf->iter, mask);
+}
+
+/* Reset the underlying iterator (see bam_plp_reset). */
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{
+ bam_plp_reset(buf->iter);
+}
+
+/* Create a callback-driven pileup buffer: a fresh iterator without an
+ * "auto" reader, plus the callback and its user data. */
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+    bam_plbuf_t *pl = calloc(1, sizeof(bam_plbuf_t));
+    pl->func = func;
+    pl->data = data;
+    pl->iter = bam_plp_init(0, 0);
+    return pl;
+}
+
+/* Destroy the underlying iterator, then free the buffer object itself. */
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+ bam_plp_destroy(buf->iter);
+ free(buf);
+}
+
+/* Push one alignment (or 0 for end of input) and invoke the callback for
+ * every pileup column that becomes available as a result.
+ * Returns the negative bam_plp_push() result on failure, 0 otherwise. */
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{
+    int tid, pos, n_plp, status;
+    const bam_pileup1_t *col;
+    status = bam_plp_push(buf->iter, b);
+    if (status < 0) return status;
+    for (;;) {
+        col = bam_plp_next(buf->iter, &tid, &pos, &n_plp);
+        if (col == 0) break;
+        buf->func(tid, pos, n_plp, col, buf->data);
+    }
+    return 0;
+}
+
+/***********
+ * mpileup *
+ ***********/
+
+/* State of a multi-input pileup: one bam_plp_t per input, kept in step. */
+struct __bam_mplp_t {
+ int n; /* number of inputs */
+ uint64_t min, *pos; /* pos[i]: (tid << 32 | pos) of the column currently held for input i; min: smallest such key over the inputs that have a column, (uint64_t)-1 initially and when none has */
+ bam_plp_t *iter; /* per-input iterators */
+ int *n_plp; /* per-input read count of the held column */
+ const bam_pileup1_t **plp; /* per-input held column, 0 when the input produced none */
+};
+
+/* Create a multi-input pileup over n inputs. Each input gets its own "auto"
+ * iterator driven by func with data[i] as its argument. Every position key
+ * starts equal to min ((uint64_t)-1), so the first bam_mplp_auto() call
+ * fetches a column from every input. */
+bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+{
+    int k;
+    bam_mplp_t mp = calloc(1, sizeof(struct __bam_mplp_t));
+    mp->pos = calloc(n, sizeof(uint64_t));
+    mp->n_plp = calloc(n, sizeof(int));
+    mp->plp = calloc(n, sizeof(void*));
+    mp->iter = calloc(n, sizeof(void*));
+    mp->n = n;
+    mp->min = (uint64_t)-1;
+    k = 0;
+    while (k < n) {
+        mp->iter[k] = bam_plp_init(func, data[k]);
+        mp->pos[k] = mp->min;
+        ++k;
+    }
+    return mp;
+}
+
+/* Apply the same node-count limit to every per-input iterator. */
+void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+{
+    int k = 0;
+    while (k < iter->n) {
+        iter->iter[k]->maxcnt = maxcnt;
+        ++k;
+    }
+}
+
+/* Destroy every per-input iterator, then the bookkeeping arrays and the
+ * multi-pileup object itself. */
+void bam_mplp_destroy(bam_mplp_t iter)
+{
+    int k;
+    for (k = 0; k < iter->n; ++k) {
+        bam_plp_destroy(iter->iter[k]);
+    }
+    free(iter->iter);
+    free(iter->pos);
+    free(iter->n_plp);
+    free(iter->plp);
+    free(iter);
+}
+
+/* Advance the multi-input pileup to the next position covered by any input.
+ *
+ * Inputs whose held column sits at the previously returned position are
+ * advanced by one column; the smallest (tid, pos) over all inputs that still
+ * hold a column is then reported through *_tid / *_pos. For each input i,
+ * n_plp[i] / plp[i] receive its column at that position, or 0 / 0 when it
+ * has none there.
+ *
+ * Returns the number of inputs contributing a column at the reported
+ * position, or 0 when every input is exhausted.
+ *
+ * bam_plp_auto() leaves tid/pos untouched when it returns no column, so the
+ * key of such an input is set to the (uint64_t)-1 sentinel rather than being
+ * built from indeterminate values. */
+int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+{
+    int i, ret = 0;
+    uint64_t new_min = (uint64_t)-1;
+    for (i = 0; i < iter->n; ++i) {
+        if (iter->pos[i] == iter->min) {
+            int tid = 0, pos = 0;
+            iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
+            // no column => park the key at the sentinel so it never matches a real minimum
+            iter->pos[i] = iter->plp[i]? ((uint64_t)tid<<32 | pos) : (uint64_t)-1;
+        }
+        if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
+    }
+    iter->min = new_min;
+    if (new_min == (uint64_t)-1) return 0;
+    *_tid = new_min>>32; *_pos = (uint32_t)new_min;
+    for (i = 0; i < iter->n; ++i) {
+        if (iter->pos[i] == iter->min) {
+            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
+            ++ret;
+        } else n_plp[i] = 0, plp[i] = 0;
+    }
+    return ret;
+}
diff --git a/samtools-0.1.19/bam_plcmd.c b/samtools-0.1.19/bam_plcmd.c
new file mode 100644
index 0000000..54a4597
--- /dev/null
+++ b/samtools-0.1.19/bam_plcmd.c
@@ -0,0 +1,606 @@
+#include <math.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <getopt.h>
+#include "sam.h"
+#include "faidx.h"
+#include "kstring.h"
+#include "sam_header.h"
+
+/* Write the decimal representation of c to fp without going through printf.
+ *
+ * Returns the fputc() result when c is 0, and 0 otherwise (the fputs()
+ * result is not checked).
+ *
+ * The magnitude is computed in unsigned arithmetic so that the most negative
+ * int is handled without signed overflow. */
+static inline int printw(int c, FILE *fp)
+{
+    char buf[16];
+    int l, x;
+    unsigned int mag;
+    if (c == 0) return fputc('0', fp);
+    mag = c < 0? 0u - (unsigned int)c : (unsigned int)c;
+    // digits are produced least-significant first, then reversed below
+    for (l = 0; mag > 0; mag /= 10) buf[l++] = mag%10 + '0';
+    if (c < 0) buf[l++] = '-';
+    buf[l] = 0;
+    for (x = 0; x < l/2; ++x) {
+        int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y;
+    }
+    fputs(buf, fp);
+    return 0;
+}
+
+/* Print to stdout the pileup symbol(s) of one read at one reference position:
+ *   '^' plus the mapping quality character (capped at 126) if the read starts here;
+ *   the base: '.'/',' when it is '=' or matches ref (forward/reverse strand),
+ *     otherwise the base in upper/lower case; '*' for a deletion, '>'/'<' for
+ *     a reference skip;
+ *   "+<len><inserted bases>" or "-<len><deleted reference bases>" when an
+ *     indel follows (the '-' sign comes from printing the negative length);
+ *   '$' if the read ends here.
+ * ref may be NULL; positions at or beyond ref_len are treated as 'N'. */
+static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
+{
+    const int rev = bam1_strand(p->b)? 1 : 0;
+    int k;
+    if (p->is_head) {
+        putchar('^');
+        putchar(p->b->core.qual > 93? 126 : p->b->core.qual + 33);
+    }
+    if (p->is_del) {
+        if (p->is_refskip) putchar(rev? '<' : '>');
+        else putchar('*');
+    } else {
+        int base = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+        int same = (base == '=');
+        if (!same && ref) {
+            int rb = pos < ref_len? ref[pos] : 'N';
+            same = (bam_nt16_table[base] == bam_nt16_table[rb]);
+        }
+        if (same) base = rev? ',' : '.';
+        else base = rev? tolower(base) : toupper(base);
+        putchar(base);
+    }
+    if (p->indel > 0) {
+        putchar('+'); printw(p->indel, stdout);
+        for (k = 1; k <= p->indel; ++k) {
+            int ins = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)];
+            putchar(rev? tolower(ins) : toupper(ins));
+        }
+    } else if (p->indel < 0) {
+        printw(p->indel, stdout);
+        for (k = 1; k <= -p->indel; ++k) {
+            int del = (ref && (int)pos+k < ref_len)? ref[pos+k] : 'N';
+            putchar(rev? tolower(del) : toupper(del));
+        }
+    }
+    if (p->is_tail) putchar('$');
+}
+
+#include <assert.h>
+#include "bam2bcf.h"
+#include "sample.h"
+
+#define MPLP_GLF 0x10
+#define MPLP_NO_COMP 0x20
+#define MPLP_NO_ORPHAN 0x40
+#define MPLP_REALN 0x80
+#define MPLP_NO_INDEL 0x400
+#define MPLP_REDO_BAQ 0x800
+#define MPLP_ILLUMINA13 0x1000
+#define MPLP_IGNORE_RG 0x2000
+#define MPLP_PRINT_POS 0x4000
+#define MPLP_PRINT_MAPQ 0x8000
+#define MPLP_PER_SAMPLE 0x10000
+
+void *bed_read(const char *fn);
+void bed_destroy(void *_h);
+int bed_overlap(const void *_h, const char *chr, int beg, int end);
+
+/* Command-line configuration of `samtools mpileup`, filled in by bam_mpileup(). */
+typedef struct {
+ int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; /* -M, -q, MPLP_* bit set, -Q, -C, -d, -L, B2B_FMT_* bits (-D/-S/-V) */
+ int rflag_require, rflag_filter; /* --rf / --ff flag masks */
+ int openQ, extQ, tandemQ, min_support; // for indels
+ double min_frac; // for indels
+ char *reg, *pl_list, *fai_fname; /* -r region (strdup'ed), -P platform list (strdup'ed), -f file name (points into argv) */
+ faidx_t *fai; /* reference index loaded for -f */
+ void *bed, *rghash; /* -l regions from bed_read(); -G read-group names to exclude */
+} mplp_conf_t;
+
+/* Per-input-file state handed to mplp_func() through the pileup iterator. */
+typedef struct {
+ bamFile fp; /* open BAM input */
+ bam_iter_t iter; /* region iterator, or 0 to read the file sequentially */
+ bam_header_t *h; /* header used to look up target names */
+ int ref_id; /* target id that ref belongs to */
+ char *ref; /* reference sequence of target ref_id, or 0; set by mpileup() */
+ const mplp_conf_t *conf; /* shared configuration */
+} mplp_aux_t;
+
+/* Pileup entries regrouped per sample (filled by group_smpl). */
+typedef struct {
+ int n; /* number of samples */
+ int *n_plp, *m_plp; /* per sample: entries in use / allocated capacity */
+ bam_pileup1_t **plp; /* per sample: array of pileup entries */
+} mplp_pileup_t;
+
+/* Reader callback for the pileup iterator: fetch the next alignment from one
+ * input that passes all read-level filters.
+ *
+ * data is the input's mplp_aux_t. Records are read (through the region
+ * iterator when one is set) until one survives: unmapped reads, reads failing
+ * the --rf / --ff flag masks, reads outside the BED regions and reads from
+ * excluded read groups are skipped. Surviving reads optionally get their
+ * Illumina-1.3+ qualities shifted, BAQ applied and their mapping quality
+ * capped (-C); the minimum mapping quality (-q) and orphan filters are then
+ * applied.
+ *
+ * The -q and orphan filters are applied regardless of whether the -C branch
+ * ran; they are tested against the (possibly capped) mapping quality.
+ *
+ * Returns the result of the last read call: negative at end of input or on
+ * a read error, otherwise the read call's non-negative result with b filled. */
+static int mplp_func(void *data, bam1_t *b)
+{
+    extern int bam_realn(bam1_t *b, const char *ref);
+    extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
+    extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
+    mplp_aux_t *ma = (mplp_aux_t*)data;
+    int ret, skip = 0;
+    do {
+        int has_ref;
+        ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
+        if (ret < 0) break;
+        if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads
+            skip = 1;
+            continue;
+        }
+        if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
+        if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
+        if (ma->conf->bed) { // test overlap
+            skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
+            if (skip) continue;
+        }
+        if (ma->conf->rghash) { // exclude read groups
+            uint8_t *rg = bam_aux_get(b, "RG");
+            skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0);
+            if (skip) continue;
+        }
+        if (ma->conf->flag & MPLP_ILLUMINA13) {
+            int i;
+            uint8_t *qual = bam1_qual(b);
+            for (i = 0; i < b->core.l_qseq; ++i)
+                qual[i] = qual[i] > 31? qual[i] - 31 : 0;
+        }
+        has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
+        skip = 0;
+        if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+        if (has_ref && ma->conf->capQ_thres > 10) {
+            int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
+            if (q < 0) skip = 1;
+            else if (b->core.qual > q) b->core.qual = q;
+        }
+        // deliberately not chained with `else`: these filters apply with or without -C
+        if (b->core.qual < ma->conf->min_mq) skip = 1;
+        else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1;
+    } while (skip);
+    return ret;
+}
+
+/* Regroup the per-file pileup entries (n files, n_plp[i] entries in plp[i])
+ * into per-sample arrays in m.
+ *
+ * The sample of an entry is looked up from its RG tag unless ignore_rg is
+ * set; when that fails, the lookup is repeated without a read group. If no
+ * valid sample id results, a message is printed and the process exits with
+ * status 1. Per-sample arrays grow by doubling, starting at 8 entries. */
+static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf,
+    int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, int ignore_rg)
+{
+    int f, r;
+    memset(m->n_plp, 0, m->n * sizeof(int));
+    for (f = 0; f < n; ++f) {
+        for (r = 0; r < n_plp[f]; ++r) {
+            const bam_pileup1_t *entry = &plp[f][r];
+            uint8_t *q = 0;
+            int id = -1;
+            if (!ignore_rg) q = bam_aux_get(entry->b, "RG");
+            if (q) id = bam_smpl_rg2smid(sm, fn[f], (char*)q+1, buf);
+            if (id < 0) id = bam_smpl_rg2smid(sm, fn[f], 0, buf);
+            if (id < 0 || id >= m->n) {
+                assert(q); // otherwise a bug
+                fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[f]);
+                exit(1);
+            }
+            if (m->n_plp[id] == m->m_plp[id]) {
+                if (m->m_plp[id]) m->m_plp[id] <<= 1;
+                else m->m_plp[id] = 8;
+                m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
+            }
+            m->plp[id][m->n_plp[id]] = *entry;
+            m->n_plp[id] += 1;
+        }
+    }
+}
+
+/* Run a multi-sample pileup over the n BAM files named in fn.
+ *
+ * Opens every input ("-" means stdin), reads its header, registers its
+ * samples / read groups and, when a region is configured, sets up an index
+ * iterator. The pileup is then walked position by position and written to
+ * stdout either as BCF genotype likelihoods (MPLP_GLF set in conf->flag) or
+ * as text pileup lines.
+ *
+ * Failures to open a file, read a header, load an index or parse the region
+ * terminate the process with exit(1). Returns 0 otherwise. */
+static int mpileup(mplp_conf_t *conf, int n, char **fn)
+{
+ extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
+ extern void bcf_call_del_rghash(void *rghash);
+ mplp_aux_t **data;
+ int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
+ const bam_pileup1_t **plp;
+ bam_mplp_t iter;
+ bam_header_t *h = 0;
+ char *ref;
+ void *rghash = 0;
+
+ bcf_callaux_t *bca = 0;
+ bcf_callret1_t *bcr = 0;
+ bcf_call_t bc;
+ bcf_t *bp = 0;
+ bcf_hdr_t *bh = 0;
+
+ bam_sample_t *sm = 0;
+ kstring_t buf;
+ mplp_pileup_t gplp;
+
+ memset(&gplp, 0, sizeof(mplp_pileup_t));
+ memset(&buf, 0, sizeof(kstring_t));
+ memset(&bc, 0, sizeof(bcf_call_t));
+ data = calloc(n, sizeof(void*));
+ plp = calloc(n, sizeof(void*));
+ // NOTE(review): n_plp is an int array but is sized with sizeof(int*); this only over-allocates where pointers are wider than int
+ n_plp = calloc(n, sizeof(int*));
+ sm = bam_smpl_init();
+
+ // read the header and initialize data
+ for (i = 0; i < n; ++i) {
+ bam_header_t *h_tmp;
+ data[i] = calloc(1, sizeof(mplp_aux_t));
+ data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
+ if ( !data[i]->fp )
+ {
+ fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
+ exit(1);
+ }
+ data[i]->conf = conf;
+ h_tmp = bam_header_read(data[i]->fp);
+ if ( !h_tmp ) {
+ fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
+ exit(1);
+ }
+ data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
+ bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
+ rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
+ if (conf->reg) {
+ int beg, end;
+ bam_index_t *idx;
+ idx = bam_index_load(fn[i]);
+ if (idx == 0) {
+ fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
+ exit(1);
+ }
+ if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
+ fprintf(stderr, "[%s] malformatted region or wrong seqname for %s\n", __func__, fn[i]);
+ exit(1);
+ }
+ // the region bounds of the first file are the ones used to filter output positions below
+ if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
+ data[i]->iter = bam_iter_query(idx, tid, beg, end);
+ bam_index_destroy(idx);
+ }
+ // only the first file's header is kept; every input shares it through data[i]->h
+ if (i == 0) h = h_tmp;
+ else {
+ // FIXME: to check consistency
+ bam_header_destroy(h_tmp);
+ }
+ }
+ gplp.n = sm->n;
+ gplp.n_plp = calloc(sm->n, sizeof(int));
+ gplp.m_plp = calloc(sm->n, sizeof(int));
+ gplp.plp = calloc(sm->n, sizeof(void*));
+
+ fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
+ // write the VCF header
+ if (conf->flag & MPLP_GLF) {
+ kstring_t s;
+ bh = calloc(1, sizeof(bcf_hdr_t));
+ s.l = s.m = 0; s.s = 0;
+ bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
+ // sequence names, NUL-separated
+ for (i = 0; i < h->n_targets; ++i) {
+ kputs(h->target_name[i], &s);
+ kputc('\0', &s);
+ }
+ bh->l_nm = s.l;
+ bh->name = malloc(s.l);
+ memcpy(bh->name, s.s, s.l);
+ s.l = 0;
+ // sample names, NUL-separated
+ for (i = 0; i < sm->n; ++i) {
+ kputs(sm->smpl[i], &s); kputc('\0', &s);
+ }
+ bh->l_smpl = s.l;
+ bh->sname = malloc(s.l);
+ memcpy(bh->sname, s.s, s.l);
+ s.l = 0;
+ ksprintf(&s, "##samtoolsVersion=%s\n", BAM_VERSION);
+ if (conf->fai_fname) ksprintf(&s, "##reference=file://%s\n", conf->fai_fname);
+ h->dict = sam_header_parse2(h->text);
+ int nseq;
+ const char *tags[] = {"SN","LN","UR","M5",NULL};
+ // tbl is indexed as 4 entries per @SQ line, in the order of tags[]
+ char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &nseq);
+ for (i=0; i<nseq; i++)
+ {
+ ksprintf(&s, "##contig=<ID=%s", tbl[4*i]);
+ if ( tbl[4*i+1] ) ksprintf(&s, ",length=%s", tbl[4*i+1]);
+ if ( tbl[4*i+2] ) ksprintf(&s, ",URL=%s", tbl[4*i+2]);
+ if ( tbl[4*i+3] ) ksprintf(&s, ",md5=%s", tbl[4*i+3]);
+ kputs(">\n", &s);
+ }
+ if (tbl) free(tbl);
+ // the header text buffer is handed over to bh here, so s.s is not freed in this block
+ bh->txt = s.s;
+ bh->l_txt = 1 + s.l;
+ bcf_hdr_sync(bh);
+ bcf_hdr_write(bp, bh);
+ bca = bcf_call_init(-1., conf->min_baseQ);
+ bcr = calloc(sm->n, sizeof(bcf_callret1_t));
+ bca->rghash = rghash;
+ bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
+ bca->min_frac = conf->min_frac;
+ bca->min_support = conf->min_support;
+ bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
+ }
+ if (tid0 >= 0 && conf->fai) { // region is set
+ ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
+ ref_tid = tid0;
+ for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
+ } else ref_tid = -1, ref = 0;
+ iter = bam_mplp_init(n, mplp_func, (void**)data);
+ max_depth = conf->max_depth;
+ if (max_depth * sm->n > 1<<20)
+ fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
+ if (max_depth * sm->n < 8000) {
+ max_depth = 8000 / sm->n;
+ fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
+ }
+ max_indel_depth = conf->max_indel_depth * sm->n;
+ bam_mplp_set_maxcnt(iter, max_depth);
+ while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
+ if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
+ // switch the cached reference sequence when the pileup moves to another target
+ if (tid != ref_tid) {
+ free(ref); ref = 0;
+ if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
+ for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
+ ref_tid = tid;
+ }
+ if (conf->flag & MPLP_GLF) {
+ int total_depth, _ref0, ref16;
+ bcf1_t *b = calloc(1, sizeof(bcf1_t));
+ for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
+ group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
+ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
+ ref16 = bam_nt16_table[_ref0];
+ for (i = 0; i < gplp.n; ++i)
+ bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
+ bcf_call_combine(gplp.n, bcr, bca, ref16, &bc);
+ bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0);
+ bcf_write(bp, bh, b);
+ bcf_destroy(b);
+ // call indels
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
+ for (i = 0; i < gplp.n; ++i)
+ bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
+ if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) {
+ b = calloc(1, sizeof(bcf1_t));
+ bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref);
+ bcf_write(bp, bh, b);
+ bcf_destroy(b);
+ }
+ }
+ } else {
+ // text pileup: chromosome, 1-based position, reference base, then per-file columns
+ printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
+ for (i = 0; i < n; ++i) {
+ int j, cnt;
+ // depth column counts only bases passing the base-quality threshold
+ for (j = cnt = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = plp[i] + j;
+ if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
+ }
+ printf("\t%d\t", cnt);
+ if (n_plp[i] == 0) {
+ printf("*\t*"); // FIXME: printf() is very slow...
+ if (conf->flag & MPLP_PRINT_POS) printf("\t*");
+ } else {
+ for (j = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = plp[i] + j;
+ if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ)
+ pileup_seq(plp[i] + j, pos, ref_len, ref);
+ }
+ putchar('\t');
+ for (j = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = plp[i] + j;
+ int c = bam1_qual(p->b)[p->qpos];
+ if (c >= conf->min_baseQ) {
+ c = c + 33 < 126? c + 33 : 126;
+ putchar(c);
+ }
+ }
+ // note: the mapQ and position columns below are not filtered by base quality
+ if (conf->flag & MPLP_PRINT_MAPQ) {
+ putchar('\t');
+ for (j = 0; j < n_plp[i]; ++j) {
+ int c = plp[i][j].b->core.qual + 33;
+ if (c > 126) c = 126;
+ putchar(c);
+ }
+ }
+ if (conf->flag & MPLP_PRINT_POS) {
+ putchar('\t');
+ for (j = 0; j < n_plp[i]; ++j) {
+ if (j > 0) putchar(',');
+ printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
+ }
+ }
+ }
+ }
+ putchar('\n');
+ }
+ }
+
+ // NOTE(review): bp, bh and bca are still 0 here when MPLP_GLF is not set -- presumably the callees below accept NULL; confirm
+ bcf_close(bp);
+ bam_smpl_destroy(sm); free(buf.s);
+ for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
+ free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
+ bcf_call_del_rghash(rghash);
+ bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
+ bam_mplp_destroy(iter);
+ bam_header_destroy(h);
+ for (i = 0; i < n; ++i) {
+ bam_close(data[i]->fp);
+ if (data[i]->iter) bam_iter_destroy(data[i]->iter);
+ free(data[i]);
+ }
+ free(data); free(plp); free(ref); free(n_plp);
+ return 0;
+}
+
+#define MAX_PATH_LEN 1024
+/* Read a list of file names, one per line, from file_list.
+ *
+ * Empty lines are ignored and trailing whitespace is stripped. Every name
+ * must refer to an existing file (checked with stat()).
+ *
+ * On success returns 0 and stores a malloc'ed array of strdup'ed names in
+ * *argv and its length in *n; the caller frees each entry and the array.
+ * On any failure (unreadable list, missing file, empty list, out of memory)
+ * returns 1 with *n == 0 and *argv == NULL; the list file is closed and any
+ * names collected so far are freed before returning. */
+int read_file_list(const char *file_list,int *n,char **argv[])
+{
+    char buf[MAX_PATH_LEN];
+    int len, nfiles = 0;
+    char **files = NULL;
+    struct stat sb;
+    FILE *fh;
+
+    *n = 0;
+    *argv = NULL;
+
+    fh = fopen(file_list,"r");
+    if ( !fh )
+    {
+        fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
+        return 1;
+    }
+
+    while ( fgets(buf,MAX_PATH_LEN,fh) )
+    {
+        char **grown;
+
+        // allow empty lines and trailing spaces
+        len = strlen(buf);
+        while ( len>0 && isspace((unsigned char)buf[len-1]) ) len--;
+        if ( !len ) continue;
+
+        // check sanity of the file list
+        buf[len] = 0;
+        if (stat(buf, &sb) != 0)
+        {
+            // no such file, check if it is safe to print its name
+            int i, safe_to_print = 1;
+            for (i=0; i<len; i++)
+                if (!isprint((unsigned char)buf[i])) { safe_to_print = 0; break; }
+            if ( safe_to_print )
+                fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+            else
+                fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+            goto fail;
+        }
+
+        // grow through a temporary so the existing array survives a failed realloc
+        grown = realloc(files,(nfiles+1)*sizeof(char*));
+        if ( !grown ) goto nomem;
+        files = grown;
+        files[nfiles] = strdup(buf);
+        if ( !files[nfiles] ) goto nomem;
+        nfiles++;
+    }
+    fclose(fh);
+    if ( !nfiles )
+    {
+        fprintf(stderr,"No files read from %s\n", file_list);
+        return 1;
+    }
+    *argv = files;
+    *n = nfiles;
+    return 0;
+
+nomem:
+    fprintf(stderr,"Out of memory while reading %s\n", file_list);
+fail:
+    fclose(fh);
+    while ( nfiles > 0 ) free(files[--nfiles]);
+    free(files);
+    return 1;
+}
+#undef MAX_PATH_LEN
+
+/* Entry point of `samtools mpileup`: parse the command line into an
+ * mplp_conf_t, run mpileup() on the input files (taken from the -b list file
+ * when given, otherwise from the remaining arguments) and release the
+ * configuration.
+ *
+ * Returns 1 after printing the usage text (no arguments), when the reference
+ * index cannot be loaded, or when the file list cannot be read; 0 otherwise.
+ *
+ * -G handling: when the read-group file cannot be opened the warning is
+ * printed and parsing continues with an empty exclusion set; the NULL stream
+ * is no longer read or closed. Names are read with a bounded field width so
+ * an over-long token cannot overflow the local buffer. */
+int bam_mpileup(int argc, char *argv[])
+{
+    int c;
+    const char *file_list = NULL;
+    char **fn = NULL;
+    int nfiles = 0, use_orphan = 0;
+    mplp_conf_t mplp;
+    memset(&mplp, 0, sizeof(mplp_conf_t));
+    mplp.max_mq = 60;
+    mplp.min_baseQ = 13;
+    mplp.capQ_thres = 0;
+    mplp.max_depth = 250; mplp.max_indel_depth = 250;
+    mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
+    mplp.min_frac = 0.002; mplp.min_support = 1;
+    mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
+    static struct option lopts[] =
+    {
+        {"rf",1,0,1}, // require flag
+        {"ff",1,0,2}, // filter flag
+        {0,0,0,0}
+    };
+    while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) {
+        switch (c) {
+        case 1 : mplp.rflag_require = strtol(optarg,0,0); break;
+        case 2 : mplp.rflag_filter = strtol(optarg,0,0); break;
+        case 'f':
+            mplp.fai = fai_load(optarg);
+            if (mplp.fai == 0) return 1;
+            mplp.fai_fname = optarg;
+            break;
+        case 'd': mplp.max_depth = atoi(optarg); break;
+        case 'r': mplp.reg = strdup(optarg); break;
+        case 'l': mplp.bed = bed_read(optarg); break;
+        case 'P': mplp.pl_list = strdup(optarg); break;
+        case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
+        case 'g': mplp.flag |= MPLP_GLF; break;
+        case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break;
+        case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break;
+        case 'B': mplp.flag &= ~MPLP_REALN; break;
+        case 'D': mplp.fmt_flag |= B2B_FMT_DP; break;
+        case 'S': mplp.fmt_flag |= B2B_FMT_SP; break;
+        case 'V': mplp.fmt_flag |= B2B_FMT_DV; break;
+        case 'I': mplp.flag |= MPLP_NO_INDEL; break;
+        case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
+        case '6': mplp.flag |= MPLP_ILLUMINA13; break;
+        case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
+        case 's': mplp.flag |= MPLP_PRINT_MAPQ; break;
+        case 'O': mplp.flag |= MPLP_PRINT_POS; break;
+        case 'C': mplp.capQ_thres = atoi(optarg); break;
+        case 'M': mplp.max_mq = atoi(optarg); break;
+        case 'q': mplp.min_mq = atoi(optarg); break;
+        case 'Q': mplp.min_baseQ = atoi(optarg); break;
+        case 'b': file_list = optarg; break;
+        case 'o': mplp.openQ = atoi(optarg); break;
+        case 'e': mplp.extQ = atoi(optarg); break;
+        case 'h': mplp.tandemQ = atoi(optarg); break;
+        case 'A': use_orphan = 1; break;
+        case 'F': mplp.min_frac = atof(optarg); break;
+        case 'm': mplp.min_support = atoi(optarg); break;
+        case 'L': mplp.max_indel_depth = atoi(optarg); break;
+        case 'G': {
+                FILE *fp_rg;
+                char buf[1024];
+                mplp.rghash = bcf_str2id_init();
+                if ((fp_rg = fopen(optarg, "r")) == 0) {
+                    fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
+                    break; // nothing to read: keep the (empty) exclusion set
+                }
+                // field width is sizeof(buf) - 1 so the terminating NUL always fits
+                while (!feof(fp_rg) && fscanf(fp_rg, "%1023s", buf) > 0) // this is not a good style, but forgive me...
+                    bcf_str2id_add(mplp.rghash, strdup(buf));
+                fclose(fp_rg);
+            }
+            break;
+        }
+    }
+    // -A is applied after parsing so that it wins over -a regardless of option order
+    if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
+    if (argc == 1) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
+        fprintf(stderr, "Input options:\n\n");
+        fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n");
+        fprintf(stderr, " -A count anomalous read pairs\n");
+        fprintf(stderr, " -B disable BAQ computation\n");
+        fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n");
+        fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n");
+        fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
+        fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n");
+        fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n");
+        fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
+        fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n");
+        fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
+        fprintf(stderr, " -r STR region in which pileup is generated [null]\n");
+        fprintf(stderr, " -R ignore RG tags\n");
+        fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
+        fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
+        fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n");
+        fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n");
+        fprintf(stderr, "\nOutput options:\n\n");
+        fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n");
+        fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n");
+        fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n");
+        fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n");
+        fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n");
+        fprintf(stderr, " -u generate uncompress BCF output\n");
+        fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n");
+        fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
+        fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
+        fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ);
+        fprintf(stderr, " -I do not perform indel calling\n");
+        fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
+        fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
+        fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
+        fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n");
+        fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
+        return 1;
+    }
+    bam_no_B = 1;
+    if (file_list) {
+        if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+        mpileup(&mplp,nfiles,fn);
+        for (c=0; c<nfiles; c++) free(fn[c]);
+        free(fn);
+    } else mpileup(&mplp, argc - optind, argv + optind);
+    if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash);
+    free(mplp.reg); free(mplp.pl_list);
+    if (mplp.fai) fai_destroy(mplp.fai);
+    if (mplp.bed) bed_destroy(mplp.bed);
+    return 0;
+}
diff --git a/samtools-0.1.19/bam_reheader.c b/samtools-0.1.19/bam_reheader.c
new file mode 100644
index 0000000..6619428
--- /dev/null
+++ b/samtools-0.1.19/bam_reheader.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "knetfile.h"
+#include "bgzf.h"
+#include "bam.h"
+
+#define BUF_SIZE 0x10000
+
+/*
+ * Copy the BAM stream `in` to file descriptor `fd`, substituting header `h`
+ * for the header stored in `in`.
+ *
+ * The old header is read (and discarded) so that `in` is positioned at the
+ * first alignment.  Whatever is left of the current uncompressed block is
+ * re-written through BGZF; the rest of the input is copied verbatim as raw
+ * bytes without being decompressed.
+ *
+ * Returns 0 on success, -1 if `in` is opened for writing, if the copy
+ * buffer cannot be allocated or if the output stream cannot be created.
+ */
+int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
+{
+    BGZF *fp;
+    bam_header_t *old;
+    int len;
+    uint8_t *buf;
+    if (in->is_write) return -1;
+    buf = malloc(BUF_SIZE);
+    if (buf == 0) return -1;
+    old = bam_header_read(in); // only needed to skip past the old header
+    if (old) bam_header_destroy(old);
+    fp = bgzf_fdopen(fd, "w");
+    if (fp == 0) {
+        free(buf);
+        return -1;
+    }
+    bam_header_write(fp, h);
+    if (in->block_offset < in->block_length) {
+        // flush the unread tail of the block that also contained the old header
+        bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
+        bgzf_flush(fp);
+    }
+#ifdef _USE_KNETFILE
+    while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0)
+        fwrite(buf, 1, len, fp->fp);
+#else
+    while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0)
+        fwrite(buf, 1, len, fp->file);
+#endif
+    free(buf);
+    // reset both offsets so that closing does not emit or expect buffered data
+    fp->block_offset = in->block_offset = 0;
+    bgzf_close(fp);
+    return 0;
+}
+
+/*
+ * Entry point of `samtools reheader <in.header.sam> <in.bam>`.
+ *
+ * Reads the replacement header from the SAM file argv[1], opens the BAM in
+ * argv[2] ("-" means stdin) and streams the re-headered BAM to stdout.
+ * Returns 0 on success and 1 on a usage error or any failure.
+ */
+int main_reheader(int argc, char *argv[])
+{
+    bam_header_t *h;
+    BGZF *in;
+    int ret;
+    if (argc != 3) {
+        fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
+        return 1;
+    }
+    { // read the header
+        tamFile fph = sam_open(argv[1]);
+        if (fph == 0) {
+            fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+            return 1;
+        }
+        h = sam_header_read(fph);
+        sam_close(fph);
+    }
+    in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r");
+    if (in == 0) {
+        fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]);
+        if (h) bam_header_destroy(h);
+        return 1;
+    }
+    ret = bam_reheader(in, h, fileno(stdout));
+    bgzf_close(in);
+    if (h) bam_header_destroy(h); // the header was only borrowed by bam_reheader()
+    return ret < 0? 1 : 0;
+}
diff --git a/samtools-0.1.19/bam_rmdup.c b/samtools-0.1.19/bam_rmdup.c
new file mode 100644
index 0000000..f0d2b5d
--- /dev/null
+++ b/samtools-0.1.19/bam_rmdup.c
@@ -0,0 +1,206 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <unistd.h>
+#include "sam.h"
+
+// Pointer alias so that a bam1_t* can be used as a khash value type.
+typedef bam1_t *bam1_p;
+
+#include "khash.h"
+// Set of read names; used below for reads whose mate must be dropped.
+KHASH_SET_INIT_STR(name)
+// Map from a 64-bit key to the alignment currently kept for that key.
+KHASH_MAP_INIT_INT64(pos, bam1_p)
+
+// Threshold passed to clear_best() and used to pre-size the name set.
+#define BUFFER_SIZE 0x40000
+
+// Per-library state: counters for the final report plus the key->alignment hash.
+typedef struct {
+ uint64_t n_checked, n_removed;
+ khash_t(pos) *best_hash;
+} lib_aux_t;
+// Map from library name to its lib_aux_t.
+KHASH_MAP_INIT_STR(lib, lib_aux_t)
+
+// Growable array of alignments: n entries in use, max entries allocated.
+typedef struct {
+ int n, max;
+ bam1_t **a;
+} tmp_stack_t;
+
+/*
+ * Append alignment `b` to `stack`.  The array starts at 0x10000 slots and
+ * doubles whenever it is full.  The function has no way to report an error,
+ * so an allocation failure terminates the program with a message instead of
+ * dereferencing a NULL array.
+ */
+static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
+{
+    if (stack->n == stack->max) {
+        int new_max = stack->max? stack->max<<1 : 0x10000;
+        bam1_t **grown = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * new_max);
+        if (grown == 0) {
+            fprintf(stderr, "[stack_insert] out of memory\n");
+            exit(1);
+        }
+        stack->a = grown;
+        stack->max = new_max;
+    }
+    stack->a[stack->n++] = b;
+}
+
+/* Write every alignment held on `stack` to `out`, free it, and empty the stack. */
+static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
+{
+    int idx = 0;
+    while (idx != stack->n) {
+        bam1_t *rec = stack->a[idx++];
+        samwrite(out, rec);
+        bam_destroy1(rec);
+    }
+    stack->n = 0;
+}
+
+/* Free every name stored in `del_set` (the keys are heap strings), then empty the set. */
+static void clear_del_set(khash_t(name) *del_set)
+{
+    khint_t it = kh_begin(del_set);
+    while (it < kh_end(del_set)) {
+        if (kh_exist(del_set, it)) free((char*)kh_key(del_set, it));
+        ++it;
+    }
+    kh_clear(name, del_set);
+}
+
+/*
+ * Return the per-library record for `lib`, creating it on first use.
+ * A new record is keyed by a private copy of the name, starts with zeroed
+ * counters and owns a freshly created position hash.
+ */
+static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
+{
+    int absent;
+    lib_aux_t *entry;
+    khint_t it = kh_get(lib, aux, lib);
+    if (it != kh_end(aux)) return &kh_val(aux, it);
+    it = kh_put(lib, aux, strdup(lib), &absent);
+    entry = &kh_val(aux, it);
+    entry->n_checked = entry->n_removed = 0;
+    entry->best_hash = kh_init(pos);
+    return entry;
+}
+
+/* Empty the position hash of every library whose hash holds at least `max` entries. */
+static void clear_best(khash_t(lib) *aux, int max)
+{
+    khint_t it;
+    for (it = kh_begin(aux); it != kh_end(aux); ++it) {
+        khash_t(pos) *tbl;
+        if (!kh_exist(aux, it)) continue;
+        tbl = kh_val(aux, it).best_hash;
+        if (kh_size(tbl) >= max) kh_clear(pos, tbl);
+    }
+}
+
+/* Return the sum of all base-quality bytes of alignment `b`. */
+static inline int sum_qual(const bam1_t *b)
+{
+    const uint8_t *qual = bam1_qual(b);
+    int total = 0, i = 0;
+    while (i < b->core.l_qseq) total += qual[i++];
+    return total;
+}
+
+/*
+ * Remove duplicated read pairs while copying `in` to `out`.
+ *
+ * For every library, pairs are keyed by (pos<<32 | isize) of the read with
+ * isize > 0.  Among pairs sharing a key only the one whose read has the
+ * largest sum_qual() is kept; the names of the losing pairs are remembered in
+ * `del_set` so that the second read of the pair (isize <= 0) is dropped when
+ * it is reached later.  Kept reads are buffered on `stack` and flushed each
+ * time the position or reference changes.
+ *
+ * Reads that are unpaired, unmapped, have an unmapped mate, or whose mate is
+ * on another reference are written through unchanged.  Per-library statistics
+ * are printed to stderr at the end.
+ */
+void bam_rmdup_core(samfile_t *in, samfile_t *out)
+{
+ bam1_t *b;
+ int last_tid = -1, last_pos = -1;
+ tmp_stack_t stack;
+ khint_t k;
+ khash_t(lib) *aux;
+ khash_t(name) *del_set;
+
+ aux = kh_init(lib);
+ del_set = kh_init(name);
+ b = bam_init1();
+ memset(&stack, 0, sizeof(tmp_stack_t));
+
+ kh_resize(name, del_set, 4 * BUFFER_SIZE);
+ while (samread(in, b) >= 0) {
+ bam1_core_t *c = &b->core;
+ if (c->tid != last_tid || last_pos != c->pos) {
+ dump_best(&stack, out); // write the result
+ clear_best(aux, BUFFER_SIZE); // only hashes that have grown to BUFFER_SIZE entries are emptied here
+ if (c->tid != last_tid) {
+ clear_best(aux, 0); // new reference: empty every hash
+ if (kh_size(del_set)) { // check
+ fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
+ clear_del_set(del_set);
+ }
+ if ((int)c->tid == -1) { // append unmapped reads
+ samwrite(out, b);
+ while (samread(in, b) >= 0) samwrite(out, b);
+ break;
+ }
+ last_tid = c->tid;
+ fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
+ }
+ }
+ if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
+ samwrite(out, b);
+ } else if (c->isize > 0) { // paired, head
+ uint64_t key = (uint64_t)c->pos<<32 | c->isize;
+ const char *lib;
+ lib_aux_t *q;
+ int ret;
+ lib = bam_get_library(in->header, b);
+ q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); // "\t" is the key used when no library is reported
+ ++q->n_checked;
+ k = kh_put(pos, q->best_hash, key, &ret);
+ if (ret == 0) { // found in best_hash
+ bam1_t *p = kh_val(q->best_hash, k);
+ ++q->n_removed;
+ if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
+ kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
+ bam_copy1(p, b); // replaced as b
+ } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
+ if (ret == 0)
+ fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+ } else { // not found in best_hash
+ kh_val(q->best_hash, k) = bam_dup1(b);
+ stack_insert(&stack, kh_val(q->best_hash, k)); // the same copy is referenced by the hash and the stack
+ }
+ } else { // paired, tail
+ k = kh_get(name, del_set, bam1_qname(b));
+ if (k != kh_end(del_set)) {
+ free((char*)kh_key(del_set, k));
+ kh_del(name, del_set, k);
+ } else samwrite(out, b);
+ }
+ last_pos = c->pos;
+ }
+
+ // flush what is still buffered, report per-library statistics and release everything
+ for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+ if (kh_exist(aux, k)) {
+ lib_aux_t *q = &kh_val(aux, k);
+ dump_best(&stack, out);
+ fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+ (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+ kh_destroy(pos, q->best_hash);
+ free((char*)kh_key(aux, k));
+ }
+ }
+ kh_destroy(lib, aux);
+
+ clear_del_set(del_set);
+ kh_destroy(name, del_set);
+ free(stack.a);
+ bam_destroy1(b);
+}
+
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
+
+/*
+ * Entry point of `samtools rmdup [-sS] <input.srt.bam> <output.bam>`.
+ *
+ *   -s  run the single-end algorithm
+ *   -S  treat paired-end reads as single-end (implies -s)
+ *
+ * The input is opened and checked before its header is used to create the
+ * output, so a missing input file is reported instead of crashing.
+ * Returns 0 on success, 1 on a usage error or if a file cannot be opened.
+ */
+int bam_rmdup(int argc, char *argv[])
+{
+    int c, is_se = 0, force_se = 0;
+    samfile_t *in, *out;
+    while ((c = getopt(argc, argv, "sS")) >= 0) {
+        switch (c) {
+        case 's': is_se = 1; break;
+        case 'S': force_se = is_se = 1; break;
+        }
+    }
+    if (optind + 2 > argc) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
+        fprintf(stderr, "Option: -s rmdup for SE reads\n");
+        fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n");
+        return 1;
+    }
+    in = samopen(argv[optind], "rb", 0);
+    if (in == 0) {
+        fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+        return 1;
+    }
+    out = samopen(argv[optind+1], "wb", in->header);
+    if (out == 0) {
+        fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+        samclose(in);
+        return 1;
+    }
+    if (is_se) bam_rmdupse_core(in, out, force_se);
+    else bam_rmdup_core(in, out);
+    samclose(in); samclose(out);
+    return 0;
+}
diff --git a/samtools-0.1.19/bam_rmdupse.c b/samtools-0.1.19/bam_rmdupse.c
new file mode 100644
index 0000000..e7dbdc7
--- /dev/null
+++ b/samtools-0.1.19/bam_rmdupse.c
@@ -0,0 +1,159 @@
+#include <math.h>
+#include "sam.h"
+#include "khash.h"
+#include "klist.h"
+
+// dump_alignment() only starts flushing once the queue holds more than this many elements.
+#define QUEUE_CLEAR_SIZE 0x100000
+// Sentinel position meaning "flush everything".
+#define MAX_POS 0x7fffffff
+
+// One queued alignment: its end position, its quality score, a flag telling
+// that a better duplicate replaced it, and the record itself.
+typedef struct {
+ int endpos;
+ uint32_t score:31, discarded:1;
+ bam1_t *b;
+} elem_t, *elem_p;
+// Destructor used by the list for the record owned by each node.
+#define __free_elem(p) bam_destroy1((p)->data.b)
+KLIST_INIT(q, elem_t, __free_elem)
+typedef klist_t(q) queue_t;
+
+// Map from a 32-bit position key to the queue element currently kept for it.
+KHASH_MAP_INIT_INT(best, elem_p)
+typedef khash_t(best) besthash_t;
+
+// Per-library state: counters for the report and one hash per strand
+// (left for forward reads, rght for reverse reads; see bam_rmdupse_core).
+typedef struct {
+ uint64_t n_checked, n_removed;
+ besthash_t *left, *rght;
+} lib_aux_t;
+KHASH_MAP_INIT_STR(lib, lib_aux_t)
+
+/*
+ * Return the per-library record for `lib`, creating it on first use.
+ * A new record is keyed by a private copy of the name and gets two empty
+ * hashes (left/rght) and zeroed counters.
+ */
+static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
+{
+    int absent;
+    lib_aux_t *entry;
+    khint_t it = kh_get(lib, aux, lib);
+    if (it != kh_end(aux)) return &kh_val(aux, it);
+    it = kh_put(lib, aux, strdup(lib), &absent);
+    entry = &kh_val(aux, it);
+    entry->left = kh_init(best);
+    entry->rght = kh_init(best);
+    entry->n_checked = entry->n_removed = 0;
+    return entry;
+}
+
+/* Return the sum of all base-quality bytes of alignment `b`. */
+static inline int sum_qual(const bam1_t *b)
+{
+    const uint8_t *qual = bam1_qual(b);
+    int total = 0, i = 0;
+    while (i < b->core.l_qseq) total += qual[i++];
+    return total;
+}
+
+/*
+ * Append a copy of `b` to the tail of `queue` and return the new element.
+ * The element's record buffer is allocated only if the slot does not already
+ * carry one.
+ */
+static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
+{
+    elem_t *slot = kl_pushp(q, queue);
+    slot->discarded = 0;
+    slot->endpos = endpos;
+    slot->score = score;
+    if (slot->b == 0) slot->b = bam_init1();
+    bam_copy1(slot->b, b);
+    return slot;
+}
+
+/* Delete from `h` every entry whose element ends at or before `pos`. */
+static void clear_besthash(besthash_t *h, int32_t pos)
+{
+    khint_t it;
+    for (it = kh_begin(h); it != kh_end(h); ++it) {
+        if (!kh_exist(h, it)) continue;
+        if (kh_val(h, it)->endpos > pos) continue;
+        kh_del(best, h, it);
+    }
+}
+
+/*
+ * Flush the front of `queue` to `out`.
+ *
+ * Nothing happens unless the queue holds more than QUEUE_CLEAR_SIZE elements
+ * or `pos` is MAX_POS.  Elements are removed from the head: discarded ones
+ * are dropped silently, the others are written.  Flushing stops at the first
+ * live reverse-strand element that ends after `pos`.  Afterwards the
+ * per-library hashes are pruned of entries ending at or before `pos`.
+ */
+static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
+{
+    khint_t k;
+    if (!(queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS)) return;
+    while (queue->head != queue->tail) {
+        elem_t *front = &kl_val(queue->head);
+        if (!front->discarded) {
+            if ((front->b->core.flag&BAM_FREVERSE) && front->endpos > pos) break;
+            samwrite(out, front->b);
+        }
+        front->b->data_len = 0; // mark the record buffer as empty before the node is recycled
+        kl_shift(q, queue, 0);
+    }
+    for (k = kh_begin(h); k != kh_end(h); ++k) {
+        if (!kh_exist(h, k)) continue;
+        clear_besthash(kh_val(h, k).left, pos);
+        clear_besthash(kh_val(h, k).rght, pos);
+    }
+}
+
+/*
+ * Remove duplicates among single-end reads while copying `in` to `out`.
+ *
+ * Reads are buffered in a queue.  For each library, forward reads are keyed
+ * by their start position (hash `left`) and reverse reads by their end
+ * position (hash `rght`).  When two reads share a key the one with the larger
+ * sum_qual() wins: a better forward read overwrites the queued element in
+ * place, a better reverse read marks the old element as discarded and is
+ * queued as a new element.
+ *
+ * Unmapped reads, and paired reads unless `force_se` is set, are queued
+ * without any duplicate check.  Per-library statistics go to stderr.
+ */
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
+{
+ bam1_t *b;
+ queue_t *queue;
+ khint_t k;
+ int last_tid = -2;
+ khash_t(lib) *aux;
+
+ aux = kh_init(lib);
+ b = bam_init1();
+ queue = kl_init(q);
+ while (samread(in, b) >= 0) {
+ bam1_core_t *c = &b->core;
+ int endpos = bam_calend(c, bam1_cigar(b));
+ int score = sum_qual(b);
+
+ if (last_tid != c->tid) {
+ if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); // new reference: flush everything
+ last_tid = c->tid;
+ } else dump_alignment(out, queue, c->pos, aux);
+ if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
+ push_queue(queue, b, endpos, score);
+ } else {
+ const char *lib;
+ lib_aux_t *q;
+ besthash_t *h;
+ uint32_t key;
+ int ret;
+ lib = bam_get_library(in->header, b);
+ q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); // "\t" is the key used when no library is reported
+ ++q->n_checked;
+ h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
+ key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
+ k = kh_put(best, h, key, &ret);
+ if (ret == 0) { // in the hash table
+ elem_t *p = kh_val(h, k);
+ ++q->n_removed;
+ if (p->score < score) {
+ if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
+ p->discarded = 1;
+ kh_val(h, k) = push_queue(queue, b, endpos, score);
+ } else { // replace
+ p->score = score; p->endpos = endpos;
+ bam_copy1(p->b, b);
+ }
+ } // otherwise, discard the alignment
+ } else kh_val(h, k) = push_queue(queue, b, endpos, score);
+ }
+ }
+ dump_alignment(out, queue, MAX_POS, aux);
+
+ // report per-library statistics and release the hashes
+ for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+ if (kh_exist(aux, k)) {
+ lib_aux_t *q = &kh_val(aux, k);
+ fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+ (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+ kh_destroy(best, q->left); kh_destroy(best, q->rght);
+ free((char*)kh_key(aux, k));
+ }
+ }
+ kh_destroy(lib, aux);
+ bam_destroy1(b);
+ kl_destroy(q, queue);
+}
diff --git a/samtools-0.1.19/bam_sort.c b/samtools-0.1.19/bam_sort.c
new file mode 100644
index 0000000..c46bce3
--- /dev/null
+++ b/samtools-0.1.19/bam_sort.c
@@ -0,0 +1,571 @@
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "bam.h"
+#include "ksort.h"
+
+// Non-zero selects ordering by query name instead of by coordinate in heap_lt()
+// and bam1_lt(); set by bam_merge_core2() and bam_sort_core_ext() before they compare.
+static int g_is_by_qname = 0;
+
+/*
+ * Compare two strings so that embedded runs of digits are ordered by numeric
+ * value: "a9" sorts before "a10".  Leading zeros are skipped before numbers
+ * are compared; when the numbers are equal, the string that consumed more
+ * characters (more leading zeros) sorts first.  Other characters are compared
+ * as unsigned bytes.  Returns a negative, zero or positive value.
+ */
+static int strnum_cmp(const char *_a, const char *_b)
+{
+    const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
+    const unsigned char *x = a, *y = b;
+    for (;;) {
+        if (*x == 0 || *y == 0) break;
+        if (!(isdigit(*x) && isdigit(*y))) { // ordinary characters: plain byte comparison
+            if (*x != *y) return (int)*x - (int)*y;
+            ++x; ++y;
+            continue;
+        }
+        // both sides start a number: skip leading zeros, then the common digits
+        while (*x == '0') ++x;
+        while (*y == '0') ++y;
+        while (isdigit(*x) && isdigit(*y) && *x == *y) { ++x; ++y; }
+        if (isdigit(*x) && isdigit(*y)) {
+            // first differing digit: the longer remaining run is the larger number
+            int i = 0;
+            while (isdigit(x[i]) && isdigit(y[i])) ++i;
+            if (isdigit(x[i])) return 1;
+            if (isdigit(y[i])) return -1;
+            return (int)*x - (int)*y;
+        }
+        if (isdigit(*x)) return 1;
+        if (isdigit(*y)) return -1;
+        if (x - a != y - b) return (x - a < y - b)? 1 : -1;
+    }
+    if (*x) return 1;
+    if (*y) return -1;
+    return 0;
+}
+
+// Value of heap1_t.pos for an input that has no more records.
+#define HEAP_EMPTY 0xffffffffffffffffull
+
+// One merge-heap entry per input file.
+typedef struct {
+ int i; // index of the input file this entry reads from
+ uint64_t pos, idx; // packed tid/position/strand key, and a running counter of records read
+ bam1_t *b; // the record currently held for this input
+} heap1_t;
+
+// True when a orders after b: by pos, then by file index, then by read counter.
+#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
+
+/*
+ * Ordering predicate for the merge heap.  It returns non-zero when `a` sorts
+ * AFTER `b`, so the heap keeps the smallest record at its root.  In query-name
+ * mode an entry without a record sorts after everything else; ties on the
+ * name are broken on the 0xc0 bits of the flag field.
+ */
+static inline int heap_lt(const heap1_t a, const heap1_t b)
+{
+    int cmp;
+    if (!g_is_by_qname) return __pos_cmp(a, b);
+    if (a.b == 0) return 1;
+    if (b.b == 0) return 0;
+    cmp = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
+    if (cmp != 0) return cmp > 0;
+    return (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0);
+}
+
+KSORT_INIT(heap, heap1_t, heap_lt)
+
+/* Exchange the target count, target names and target lengths of two headers. */
+static void swap_header_targets(bam_header_t *h1, bam_header_t *h2)
+{
+    bam_header_t saved;
+
+    saved.n_targets = h1->n_targets;
+    saved.target_name = h1->target_name;
+    saved.target_len = h1->target_len;
+
+    h1->n_targets = h2->n_targets;
+    h1->target_name = h2->target_name;
+    h1->target_len = h2->target_len;
+
+    h2->n_targets = saved.n_targets;
+    h2->target_name = saved.target_name;
+    h2->target_len = saved.target_len;
+}
+
+/* Exchange the plain-text part (length and buffer) of two headers. */
+static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
+{
+    int tempi = h1->l_text;
+    char *temps = h1->text;
+
+    h1->l_text = h2->l_text;
+    h1->text = h2->text;
+
+    h2->l_text = tempi;
+    h2->text = temps;
+}
+
+// Bit flags for the `flag` argument of bam_merge_core2(); set from the
+// command-line options of bam_merge().
+#define MERGE_RG 1 // -r: attach an RG tag inferred from the file name
+#define MERGE_UNCOMP 2 // -u: uncompressed BAM output
+#define MERGE_LEVEL1 4 // -1: compression level 1
+#define MERGE_FORCE 8 // -f: overwrite the output file if it exists
+
+/*!
+  @abstract    Merge multiple sorted BAM.
+  @param  by_qname   whether the inputs are sorted by query name
+  @param  out        output BAM file name ("-" for stdout)
+  @param  headers    name of SAM file from which to copy '@' header lines,
+                     or NULL to copy them from the first file to be merged
+  @param  n          number of files to be merged
+  @param  fn         names of files to be merged
+  @param  flag       bitwise OR of the MERGE_* flags
+  @param  reg        region to merge (requires indexed inputs), or NULL for all
+  @param  n_threads  number of BGZF compression threads
+  @param  level      compression level; negative selects the default
+  @return            0 on success, -1 on failure
+
+  @discussion Padding information may NOT be correctly maintained. This
+  function is NOT thread safe.
+ */
+int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level)
+{
+    bamFile fpout, *fp;
+    heap1_t *heap;
+    bam_header_t *hout = 0;
+    bam_header_t *hheaders = NULL;
+    int i, j, *RG_len = 0;
+    uint64_t idx = 0;
+    char **RG = 0, mode[8];
+    bam_iter_t *iter = 0;
+
+    if (headers) {
+        tamFile fpheaders = sam_open(headers);
+        if (fpheaders == 0) {
+            const char *message = strerror(errno);
+            fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+            return -1;
+        }
+        hheaders = sam_header_read(fpheaders);
+        sam_close(fpheaders);
+    }
+
+    g_is_by_qname = by_qname;
+    fp = (bamFile*)calloc(n, sizeof(bamFile));
+    heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+    iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
+    // prepare RG tag
+    if (flag & MERGE_RG) {
+        RG = (char**)calloc(n, sizeof(void*));
+        RG_len = (int*)calloc(n, sizeof(int));
+        for (i = 0; i != n; ++i) {
+            int l = strlen(fn[i]);
+            const char *s = fn[i];
+            // the tag is the file's base name without a trailing ".bam"
+            if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
+            for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
+            ++j; l -= j;
+            RG[i] = calloc(l + 1, 1); // zero-filled, so the copy below stays terminated
+            RG_len[i] = l;
+            memcpy(RG[i], s + j, l);
+        }
+    }
+    // read the first
+    for (i = 0; i != n; ++i) {
+        bam_header_t *hin;
+        fp[i] = bam_open(fn[i], "r");
+        if (fp[i] == 0) {
+            fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+            for (j = 0; j < i; ++j) bam_close(fp[j]);
+            if (RG) for (j = 0; j != n; ++j) free(RG[j]);
+            free(RG); free(RG_len);
+            if (hout) bam_header_destroy(hout);
+            if (hheaders) bam_header_destroy(hheaders);
+            free(fp); free(heap); free(iter);
+            return -1;
+        }
+        hin = bam_header_read(fp[i]);
+        if (i == 0) { // the first BAM
+            hout = hin;
+        } else { // validate multiple baf
+            int min_n_targets = hout->n_targets;
+            if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;
+
+            for (j = 0; j < min_n_targets; ++j)
+                if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
+                    fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
+                            hout->target_name[j], hin->target_name[j], fn[i]);
+                    return -1;
+                }
+
+            // If this input file has additional target reference sequences,
+            // add them to the headers to be output
+            if (hin->n_targets > hout->n_targets) {
+                swap_header_targets(hout, hin);
+                // FIXME Possibly we should also create @SQ text headers
+                // for the newly added reference sequences
+            }
+
+            bam_header_destroy(hin);
+        }
+    }
+
+    if (hheaders) {
+        // If the text headers to be swapped in include any @SQ headers,
+        // check that they are consistent with the existing binary list
+        // of reference information.
+        if (hheaders->n_targets > 0) {
+            // compare only the names both lists have, so a shorter @SQ list is never overrun
+            int n_cmp = hout->n_targets < hheaders->n_targets? hout->n_targets : hheaders->n_targets;
+            if (hout->n_targets != hheaders->n_targets) {
+                fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
+                if (!reg) return -1;
+            }
+            for (j = 0; j < n_cmp; ++j)
+                if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
+                    fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
+                    if (!reg) return -1;
+                }
+        }
+
+        swap_header_text(hout, hheaders);
+        bam_header_destroy(hheaders);
+    }
+
+    if (reg) {
+        int tid, beg, end;
+        if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
+            fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
+            return -1;
+        }
+        for (i = 0; i < n; ++i) {
+            bam_index_t *bidx = bam_index_load(fn[i]);
+            if (bidx == 0) { // a region query is impossible without an index
+                fprintf(stderr, "[%s] fail to load the index of '%s'\n", __func__, fn[i]);
+                return -1;
+            }
+            iter[i] = bam_iter_query(bidx, tid, beg, end);
+            bam_index_destroy(bidx);
+        }
+    }
+
+    // load the first record of every input into the heap
+    for (i = 0; i < n; ++i) {
+        heap1_t *h = heap + i;
+        h->i = i;
+        h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
+        if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
+            h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
+            h->idx = idx++;
+        }
+        else h->pos = HEAP_EMPTY;
+    }
+    if (flag & MERGE_UNCOMP) level = 0;
+    else if (flag & MERGE_LEVEL1) level = 1;
+    strcpy(mode, "w");
+    if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
+    // open with `mode` so that the requested compression level takes effect
+    if ((fpout = strcmp(out, "-")? bam_open(out, mode) : bam_dopen(fileno(stdout), mode)) == 0) {
+        fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
+        return -1;
+    }
+    bam_header_write(fpout, hout);
+    bam_header_destroy(hout);
+    if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
+
+    ks_heapmake(heap, n, heap);
+    while (heap->pos != HEAP_EMPTY) {
+        bam1_t *b = heap->b;
+        if (flag & MERGE_RG) {
+            uint8_t *rg = bam_aux_get(b, "RG");
+            if (rg) bam_aux_del(b, rg);
+            bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
+        }
+        bam_write1_core(fpout, &b->core, b->data_len, b->data);
+        // refill the root from the same input, then restore the heap order
+        if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
+            heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
+            heap->idx = idx++;
+        } else if (j == -1) {
+            heap->pos = HEAP_EMPTY;
+            free(heap->b->data); free(heap->b);
+            heap->b = 0;
+        } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+        ks_heapadjust(heap, 0, n, heap);
+    }
+
+    if (flag & MERGE_RG) {
+        for (i = 0; i != n; ++i) free(RG[i]);
+        free(RG); free(RG_len);
+    }
+    for (i = 0; i != n; ++i) {
+        bam_iter_destroy(iter[i]);
+        bam_close(fp[i]);
+    }
+    bam_close(fpout);
+    free(fp); free(heap); free(iter);
+    return 0;
+}
+
+/* Merge with no extra compression threads and the default compression level. */
+int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
+{
+    const int no_threads = 0;
+    const int default_level = -1;
+    return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, no_threads, default_level);
+}
+
+/*
+ * Entry point of `samtools merge`.  Parses the options, refuses to overwrite
+ * an existing output file unless -f is given, and calls bam_merge_core2().
+ * The strings duplicated for -h and -R are released on every exit path.
+ * Returns 0 on success and 1 on a usage error or failure.
+ */
+int bam_merge(int argc, char *argv[])
+{
+    int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
+    char *fn_headers = NULL, *reg = 0;
+
+    while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) {
+        switch (c) {
+        case 'r': flag |= MERGE_RG; break;
+        case 'f': flag |= MERGE_FORCE; break;
+        case 'h': free(fn_headers); fn_headers = strdup(optarg); break; // last -h wins
+        case 'n': is_by_qname = 1; break;
+        case '1': flag |= MERGE_LEVEL1; break;
+        case 'u': flag |= MERGE_UNCOMP; break;
+        case 'R': free(reg); reg = strdup(optarg); break; // last -R wins
+        case 'l': level = atoi(optarg); break;
+        case '@': n_threads = atoi(optarg); break;
+        }
+    }
+    if (optind + 2 >= argc) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
+        fprintf(stderr, "Options: -n sort by read names\n");
+        fprintf(stderr, " -r attach RG tag (inferred from file names)\n");
+        fprintf(stderr, " -u uncompressed BAM output\n");
+        fprintf(stderr, " -f overwrite the output BAM if exist\n");
+        fprintf(stderr, " -1 compress level 1\n");
+        fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
+        fprintf(stderr, " -@ INT number of BAM compression threads [0]\n");
+        fprintf(stderr, " -R STR merge file in the specified region STR [all]\n");
+        fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n");
+        fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
+        fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n");
+        fprintf(stderr, " the header dictionary in merging.\n\n");
+        free(reg);
+        free(fn_headers);
+        return 1;
+    }
+    if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) {
+        // probe for an existing output file by trying to open it for reading
+        FILE *fp = fopen(argv[optind], "rb");
+        if (fp != NULL) {
+            fclose(fp);
+            fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
+            free(reg);
+            free(fn_headers);
+            return 1;
+        }
+    }
+    if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1;
+    free(reg);
+    free(fn_headers);
+    return ret;
+}
+
+/***************
+ * BAM sorting *
+ ***************/
+
+#include <pthread.h>
+
+typedef bam1_t *bam1_p;
+
+/*
+ * Make the header text of `h` declare sort order `so`.
+ *
+ *  - If the text starts with an @HD line carrying an SO tag whose value is
+ *    exactly `so`, nothing is changed.
+ *  - If the @HD line has a different SO value it is replaced; if it has no SO
+ *    tag, "\tSO:<so>" is appended to that line.
+ *  - If there is no @HD line, "@HD\tVN:1.3\tSO:<so>\n" is prepended.
+ *
+ * h->text is replaced by a newly allocated buffer and h->l_text is updated.
+ * Returns 0 on success, -1 if the @HD line is not newline-terminated or if
+ * memory cannot be allocated (the header is left untouched in both cases).
+ */
+static int change_SO(bam_header_t *h, const char *so)
+{
+    char *p, *q, *beg = 0, *end = 0, *newtext;
+    size_t so_len = strlen(so), new_len;
+    if (h->l_text > 3 && strncmp(h->text, "@HD", 3) == 0) {
+        if ((p = strchr(h->text, '\n')) == 0) return -1;
+        *p = '\0'; // restrict the search to the @HD line
+        q = strstr(h->text, "\tSO:");
+        *p = '\n'; // change back
+        if (q != 0) {
+            beg = q;
+            for (end = q + 4; *end != '\n' && *end != '\t'; ++end);
+            // compare the whole value, not just a prefix of it
+            if ((size_t)(end - (q + 4)) == so_len && strncmp(q + 4, so, so_len) == 0)
+                return 0; // no need to change
+        } else beg = end = p;
+    }
+    if (beg == 0) { // no @HD
+        new_len = h->l_text + so_len + 15; // 15 = strlen("@HD\tVN:1.3\tSO:") + 1 for '\n'
+        newtext = malloc(new_len + 1);
+        if (newtext == 0) return -1;
+        sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so);
+        if (h->text) strcat(newtext, h->text);
+    } else { // has @HD but different or no SO
+        size_t head = beg - h->text;
+        new_len = head + (4 + so_len) + (h->text + h->l_text - end);
+        newtext = malloc(new_len + 1);
+        if (newtext == 0) return -1;
+        memcpy(newtext, h->text, head);
+        sprintf(newtext + head, "\tSO:%s", so);
+        strcat(newtext, end);
+    }
+    h->l_text = new_len;
+    free(h->text);
+    h->text = newtext;
+    return 0;
+}
+
+/*
+ * Coordinate sort key: reference id in the high 32 bits, then (pos+1)<<1,
+ * then the strand bit.  The position part is computed as an unsigned 32-bit
+ * value, exactly as bam_merge_core2() builds its heap key, so sorting and
+ * merging agree and no signed shift can overflow or sign-extend.
+ */
+static inline uint64_t bam1_sort_key(const bam1_p b)
+{
+    return ((uint64_t)b->core.tid<<32) | (uint32_t)((int32_t)b->core.pos+1)<<1 | bam1_strand(b);
+}
+
+/*
+ * "Less than" predicate for sorting alignments: by query name (ties broken on
+ * the 0xc0 bits of the flag) when g_is_by_qname is set, by coordinate key
+ * otherwise.
+ */
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{
+    if (g_is_by_qname) {
+        int t = strnum_cmp(bam1_qname(a), bam1_qname(b));
+        return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0)));
+    } else return bam1_sort_key(a) < bam1_sort_key(b);
+}
+KSORT_INIT(sort, bam1_p, bam1_lt)
+
+// Work item handed to one sorting thread (see worker() and sort_blocks()).
+typedef struct {
+ size_t buf_len; // number of records in buf
+ const char *prefix; // prefix of the temporary file name
+ bam1_p *buf; // slice of the record array this thread sorts
+ const bam_header_t *h; // header written to the temporary file
+ int index; // number used in the temporary file name
+} worker_t;
+
+/*
+ * Write header `h` followed by the `l` records of `buf` to the BAM file `fn`
+ * ("-" means stdout), opened with `mode`.  Multi-threaded compression is
+ * enabled when n_threads > 1.  Returns silently if the file cannot be opened.
+ */
+static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, int n_threads)
+{
+    bamFile fp;
+    size_t i = 0;
+    if (strcmp(fn, "-") != 0) fp = bam_open(fn, mode);
+    else fp = bam_dopen(fileno(stdout), mode);
+    if (fp == 0) return;
+    bam_header_write(fp, h);
+    if (n_threads > 1) bgzf_mt(fp, n_threads, 256);
+    while (i < l) {
+        bam1_p rec = buf[i++];
+        bam_write1_core(fp, &rec->core, rec->data_len, rec->data);
+    }
+    bam_close(fp);
+}
+
+/*
+ * Thread body: sort the records of one worker_t and write them to the
+ * temporary file "<prefix>.<index as 4 digits>.bam" with mode "w1".
+ * Always returns NULL.
+ */
+static void *worker(void *data)
+{
+    worker_t *job = (worker_t*)data;
+    char *tmp_name;
+    ks_mergesort(sort, job->buf_len, job->buf, 0);
+    tmp_name = (char*)calloc(strlen(job->prefix) + 20, 1); // 20 extra bytes for ".NNNN.bam"
+    sprintf(tmp_name, "%s.%.4d.bam", job->prefix, job->index);
+    write_buffer(tmp_name, "w1", job->buf_len, job->buf, job->h, 0);
+    free(tmp_name);
+    return 0;
+}
+
+/*
+ * Sort `k` buffered records and write them out as temporary files.
+ *
+ * The records are split into `n_threads` nearly equal slices; each slice is
+ * sorted and written by its own thread to temporary file number
+ * n_files + i.  A single thread is used when fewer than 64 records per thread
+ * are available.  Returns the new total number of temporary files.
+ */
+static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads)
+{
+    int i;
+    size_t rest;
+    bam1_p *b;
+    pthread_t *tid;
+    pthread_attr_t attr;
+    worker_t *w;
+
+    if (n_threads < 1) n_threads = 1;
+    if (k < (size_t)n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    w = calloc(n_threads, sizeof(worker_t));
+    tid = calloc(n_threads, sizeof(pthread_t));
+    b = buf; rest = k;
+    for (i = 0; i < n_threads; ++i) {
+        // dividing what is left by the threads left spreads the remainder evenly
+        w[i].buf_len = rest / (n_threads - i);
+        w[i].buf = b;
+        w[i].prefix = prefix;
+        w[i].h = h;
+        w[i].index = n_files + i;
+        b += w[i].buf_len; rest -= w[i].buf_len;
+        pthread_create(&tid[i], &attr, worker, &w[i]);
+    }
+    for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+    pthread_attr_destroy(&attr); // pairs with pthread_attr_init() above
+    free(tid); free(w);
+    return n_files + n_threads;
+}
+
+/*!
+  @abstract Sort an unsorted BAM file based on the chromosome order
+  and the leftmost position of an alignment
+
+  @param  is_by_qname whether to sort by query name
+  @param  fn          name of the file to be sorted ("-" for stdin)
+  @param  prefix      prefix of the output and the temporary files; upon
+                      success, prefix.bam will be written.
+  @param  _max_mem    approximate maximum memory per thread (very inaccurate)
+  @param  is_stdout   write the final output to stdout instead of a file
+  @param  n_threads   number of sorting and compression threads
+  @param  level       compression level; negative selects the default
+  @param  full_path   the given output path is the full path and not just the prefix
+
+  @discussion It may create multiple temporary subalignment files
+  and then merge them by calling bam_merge_core2(). This function is
+  NOT thread safe.
+ */
+void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int full_path)
+{
+    int ret, i, n_files = 0;
+    size_t mem, max_k, k, max_mem;
+    bam_header_t *header;
+    bamFile fp;
+    bam1_t *b, **buf;
+    char *fnout = 0;
+    char const *suffix = ".bam";
+    if (full_path) suffix += 4; // point at the terminating NUL: no suffix is appended
+
+    if (n_threads < 2) n_threads = 1;
+    g_is_by_qname = is_by_qname;
+    max_k = k = 0; mem = 0;
+    max_mem = _max_mem * n_threads;
+    buf = 0;
+    fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+    if (fp == 0) {
+        fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
+        return;
+    }
+    header = bam_header_read(fp);
+    if (is_by_qname) change_SO(header, "queryname");
+    else change_SO(header, "coordinate");
+    // write sub files
+    for (;;) {
+        if (k == max_k) {
+            size_t old_max = max_k;
+            max_k = max_k? max_k<<1 : 0x10000;
+            buf = realloc(buf, max_k * sizeof(void*));
+            memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max));
+        }
+        if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
+        b = buf[k];
+        if ((ret = bam_read1(fp, b)) < 0) break;
+        if (b->data_len < b->m_data>>2) { // shrink
+            b->m_data = b->data_len;
+            kroundup32(b->m_data);
+            b->data = realloc(b->data, b->m_data);
+        }
+        mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
+        ++k;
+        if (mem >= max_mem) {
+            n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+            mem = k = 0; // the record structures stay allocated and are reused
+        }
+    }
+    if (ret != -1)
+        fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+    // output file name
+    fnout = calloc(strlen(prefix) + 20, 1);
+    if (is_stdout) sprintf(fnout, "-");
+    else sprintf(fnout, "%s%s", prefix, suffix);
+    // write the final output
+    if (n_files == 0) { // a single block
+        char mode[8];
+        strcpy(mode, "w");
+        if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
+        ks_mergesort(sort, k, buf, 0);
+        write_buffer(fnout, mode, k, buf, header, n_threads);
+    } else { // then merge
+        char **fns;
+        n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+        fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
+        fns = (char**)calloc(n_files, sizeof(char*));
+        for (i = 0; i < n_files; ++i) {
+            fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+            // temporary files are always written as "<prefix>.NNNN.bam" by worker(),
+            // whatever suffix the final output uses
+            sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+        }
+        bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level);
+        for (i = 0; i < n_files; ++i) {
+            unlink(fns[i]);
+            free(fns[i]);
+        }
+        free(fns);
+    }
+    free(fnout);
+    // free
+    for (k = 0; k < max_k; ++k) {
+        if (!buf[k]) continue;
+        free(buf[k]->data);
+        free(buf[k]);
+    }
+    free(buf);
+    bam_header_destroy(header);
+    bam_close(fp);
+}
+
+/* Sort to "<prefix>.bam" with a single thread and the default compression level. */
+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
+{
+    const int to_stdout = 0, threads = 0, level = -1, full_path = 0;
+    bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, to_stdout, threads, level, full_path);
+}
+
+/*
+ * Entry point of `samtools sort [options] <in.bam> <out.prefix>`.
+ * Parses the options (-m accepts a K/M/G suffix), prints the usage and returns
+ * 1 when fewer than two positional arguments are given, otherwise runs the
+ * sort and returns 0.
+ */
+int bam_sort(int argc, char *argv[])
+{
+    size_t max_mem = 768<<20; // 768MB by default
+    int opt, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1, full_path = 0;
+    while ((opt = getopt(argc, argv, "fnom:@:l:")) >= 0) {
+        if (opt == 'f') full_path = 1;
+        else if (opt == 'o') is_stdout = 1;
+        else if (opt == 'n') is_by_qname = 1;
+        else if (opt == '@') n_threads = atoi(optarg);
+        else if (opt == 'l') level = atoi(optarg);
+        else if (opt == 'm') {
+            char *unit;
+            max_mem = strtol(optarg, &unit, 0);
+            switch (*unit) { // optional size suffix
+            case 'k': case 'K': max_mem <<= 10; break;
+            case 'm': case 'M': max_mem <<= 20; break;
+            case 'g': case 'G': max_mem <<= 30; break;
+            }
+        }
+    }
+    if (optind + 2 > argc) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools sort [options] <in.bam> <out.prefix>\n\n");
+        fprintf(stderr, "Options: -n sort by read name\n");
+        fprintf(stderr, " -f use <out.prefix> as full file name instead of prefix\n");
+        fprintf(stderr, " -o final output to stdout\n");
+        fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
+        fprintf(stderr, " -@ INT number of sorting and compression threads [1]\n");
+        fprintf(stderr, " -m INT max memory per thread; suffix K/M/G recognized [768M]\n");
+        fprintf(stderr, "\n");
+        return 1;
+    }
+    bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level, full_path);
+    return 0;
+}
diff --git a/samtools-0.1.19/bam_stat.c b/samtools-0.1.19/bam_stat.c
new file mode 100644
index 0000000..f2de0f1
--- /dev/null
+++ b/samtools-0.1.19/bam_stat.c
@@ -0,0 +1,77 @@
+#include <unistd.h>
+#include <assert.h>
+#include "bam.h"
+
+// Flag statistics.  Every counter is a pair: index 0 counts reads without the
+// BAM_FQCFAIL flag, index 1 counts reads with it.
+typedef struct {
+ long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2];
+ long long n_sgltn[2], n_read1[2], n_read2[2];
+ long long n_dup[2];
+ long long n_diffchr[2], n_diffhigh[2];
+} bam_flagstat_t;
+
+// Update statistics `s` with the core fields `c` of one alignment.
+// Pair-related counters are only touched for reads flagged BAM_FPAIRED;
+// n_diffchr/n_diffhigh count pairs with both ends mapped to different
+// references (n_diffhigh only when qual >= 5).
+#define flagstat_loop(s, c) do { \
+ int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \
+ ++(s)->n_reads[w]; \
+ if ((c)->flag & BAM_FPAIRED) { \
+ ++(s)->n_pair_all[w]; \
+ if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \
+ if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \
+ if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \
+ if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \
+ if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
+ ++(s)->n_pair_map[w]; \
+ if ((c)->mtid != (c)->tid) { \
+ ++(s)->n_diffchr[w]; \
+ if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \
+ } \
+ } \
+ } \
+ if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \
+ if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \
+ } while (0)
+
+/*
+ * Read every alignment from `fp` and accumulate flag statistics.
+ * Returns a newly allocated bam_flagstat_t that the caller must free().
+ * A warning is printed if reading stops with a code other than -1.
+ */
+bam_flagstat_t *bam_flagstat_core(bamFile fp)
+{
+    bam_flagstat_t *s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+    bam1_t *b = bam_init1();
+    bam1_core_t *c = &b->core;
+    int ret;
+    for (;;) {
+        ret = bam_read1(fp, b);
+        if (ret < 0) break;
+        flagstat_loop(s, c);
+    }
+    bam_destroy1(b);
+    if (ret != -1)
+        fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+    return s;
+}
+/*
+ * Entry point of `samtools flagstat <in.bam>` ("-" reads stdin).
+ * Prints one line per statistic to stdout, each as
+ * "<QC-passed count> + <QC-failed count> <description>".
+ * Returns 0 on success, 1 on a usage error or if the input cannot be opened
+ * (reported with a message rather than an assertion, so the check also works
+ * when assertions are compiled out).
+ */
+int bam_flagstat(int argc, char *argv[])
+{
+    bamFile fp;
+    bam_header_t *header;
+    bam_flagstat_t *s;
+    if (argc == optind) {
+        fprintf(stderr, "Usage: samtools flagstat <in.bam>\n");
+        return 1;
+    }
+    fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+    if (fp == 0) {
+        fprintf(stderr, "[bam_flagstat] fail to open file %s\n", argv[optind]);
+        return 1;
+    }
+    header = bam_header_read(fp); // read only to advance past the header
+    s = bam_flagstat_core(fp);
+    printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+    printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
+    printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
+    printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
+    printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
+    printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
+    printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0);
+    printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
+    printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0);
+    printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
+    printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
+    free(s);
+    bam_header_destroy(header);
+    bam_close(fp);
+    return 0;
+}
diff --git a/samtools-0.1.19/bam_tview.c b/samtools-0.1.19/bam_tview.c
new file mode 100644
index 0000000..06d5e33
--- /dev/null
+++ b/samtools-0.1.19/bam_tview.c
@@ -0,0 +1,368 @@
+#include <assert.h>
+#include "bam_tview.h"
+
+int base_tv_init(tview_t* tv,const char *fn, const char *fn_fa, const char *samples) /* shared set-up for every tview front end: opens BAM fn and its index, optionally FASTA fn_fa (may be NULL), and an optional sample/read-group filter; exits on failure, otherwise returns 0 */
+ {
+ assert(tv!=NULL);
+ assert(fn!=NULL);
+ tv->mrow = 24; tv->mcol = 80;
+ tv->color_for = TV_COLOR_MAPQ;
+ tv->is_dot = 1;
+ // open the alignment file
+ tv->fp = bam_open(fn, "r");
+ if(tv->fp==0)
+ {
+ fprintf(stderr,"Cannot open '%s'.\n", fn); // fn_fa is no longer printed: it may be NULL, which must not be passed to %s
+ exit(EXIT_FAILURE);
+ }
+ bgzf_set_cache_size(tv->fp, 8 * 1024 *1024);
+ assert(tv->fp);
+
+ tv->header = bam_header_read(tv->fp);
+ if(tv->header==0)
+ {
+ fprintf(stderr,"Cannot read '%s'.\n", fn);
+ exit(EXIT_FAILURE);
+ }
+ tv->idx = bam_index_load(fn);
+ if (tv->idx == 0)
+ {
+ fprintf(stderr,"Cannot read index for '%s'.\n", fn);
+ exit(EXIT_FAILURE);
+ }
+ tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
+ if (fn_fa) tv->fai = fai_load(fn_fa); // the result is not checked here; users of tv->fai test it for NULL
+ tv->bca = bcf_call_init(0.83, 13);
+ tv->ins = 1;
+ // optional filter: remember every @RG whose ID or SM equals `samples`
+ if ( samples )
+ {
+ if ( !tv->header->dict ) tv->header->dict = sam_header_parse2(tv->header->text);
+ void *iter = tv->header->dict;
+ const char *key, *val;
+ int n = 0;
+ tv->rg_hash = kh_init(kh_rg);
+ while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
+ {
+ if ( !strcmp(samples,key) || (val && !strcmp(samples,val)) )
+ {
+ khiter_t k = kh_get(kh_rg, tv->rg_hash, key);
+ if ( k != kh_end(tv->rg_hash) ) continue; // this read group is already registered
+ int ret;
+ k = kh_put(kh_rg, tv->rg_hash, key, &ret);
+ kh_value(tv->rg_hash, k) = val;
+ n++;
+ }
+ }
+ if ( !n )
+ {
+ fprintf(stderr,"The sample or read group \"%s\" not present.\n", samples);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ return 0;
+ }
+
+
+void base_tv_destroy(tview_t* tv) /* releases everything base_tv_init()/base_draw_aln() attached to tv; the tview_t itself is freed by the caller */
+ {
+ bam_lplbuf_destroy(tv->lplbuf);
+ bcf_call_destroy(tv->bca);
+ bam_index_destroy(tv->idx);
+ if (tv->fai) fai_destroy(tv->fai);
+ if (tv->rg_hash) kh_destroy(kh_rg, tv->rg_hash); // previously leaked; its keys/values were obtained from the header dict and are not freed here
+ free(tv->ref); bam_header_destroy(tv->header);
+ bam_close(tv->fp);
+ }
+
+
+int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) /* pileup callback registered with bam_lplbuf_init(): draws the column(s) for reference position pos (ruler row 0, reference row 1, consensus row 2, reads below); data is the tview_t; always returns 0 */
+{
+ extern unsigned char bam_nt16_table[256];
+ tview_t *tv = (tview_t*)data;
+ int i, j, c, rb, attr, max_ins = 0;
+ uint32_t call = 0;
+ if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
+ // print reference
+ rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
+ for (i = tv->last_pos + 1; i < pos; ++i) {
+ if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1);
+ c = (tv->ref && i - tv->left_pos < tv->l_ref)? tv->ref[i - tv->left_pos] : 'N'; // same l_ref bound as for rb above: the fetched reference can be shorter than the screen
+ tv->my_mvaddch(tv,1, tv->ccol++, c);
+ }
+ if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
+ { // call consensus
+ bcf_callret1_t bcr;
+ int qsum[4], a1, a2, tmp;
+ double p[3], prior = 30;
+ bcf_call_glfgen(n, pl, bam_nt16_table[rb], tv->bca, &bcr);
+ for (i = 0; i < 4; ++i) qsum[i] = bcr.qsum[i]<<2 | i; // keep the base index in the low two bits so it survives the sort
+ for (i = 1; i < 4; ++i) // insertion sort
+ for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
+ tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
+ a1 = qsum[0]&3; a2 = qsum[1]&3; // the two bases with the largest qsum
+ p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
+ if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
+ if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
+ if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499); // call: allele bit mask in the high 16 bits, score difference in the low 16
+ else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
+ else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
+ }
+ attr = tv->my_underline(tv);
+ c = ",ACMGRSVTWYHKDBN"[call>>16&0xf];
+ i = (call&0xffff)/10+1;
+ if (i > 4) i = 4;
+ attr |= tv->my_colorpair(tv,i);
+ if (c == toupper(rb)) c = '.';
+ tv->my_attron(tv,attr);
+ tv->my_mvaddch(tv,2, tv->ccol, c);
+ tv->my_attroff(tv,attr);
+ if(tv->ins) {
+ // calculate maximum insert
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
+ }
+ }
+ // core loop
+ for (j = 0; j <= max_ins; ++j) { // j == 0 is the aligned column, j > 0 are the inserted columns after it
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ int row = TV_MIN_ALNROW + p->level - tv->row_shift;
+ if (j == 0) {
+ if (!p->is_del) {
+ if (tv->base_for == TV_BASE_COLOR_SPACE &&
+ (c = bam_aux_getCSi(p->b, p->qpos))) {
+ // assume that if we found one color, we will be able to get the color error
+ if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.';
+ } else {
+ if (tv->show_name) {
+ char *name = bam1_qname(p->b);
+ c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
+ } else {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+ if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+ }
+ }
+ } else c = p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*';
+ } else { // padding
+ if (j > p->indel) c = '*';
+ else { // insertion
+ if (tv->base_for == TV_BASE_NUCL) {
+ if (tv->show_name) {
+ char *name = bam1_qname(p->b);
+ c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
+ } else {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+ if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; // never true here: this branch only runs with j > 0
+ }
+ } else {
+ c = bam_aux_getCSi(p->b, p->qpos + j);
+ if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.';
+ }
+ }
+ }
+ if (row > TV_MIN_ALNROW && row < tv->mrow) {
+ int x;
+ attr = 0;
+ if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
+ || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
+ if (tv->color_for == TV_COLOR_BASEQ) {
+ x = bam1_qual(p->b)[p->qpos]/10 + 1;
+ if (x > 4) x = 4;
+ attr |= tv->my_colorpair(tv,x);
+ } else if (tv->color_for == TV_COLOR_MAPQ) {
+ x = p->b->core.qual/10 + 1;
+ if (x > 4) x = 4;
+ attr |= tv->my_colorpair(tv,x);
+ } else if (tv->color_for == TV_COLOR_NUCL) {
+ x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5;
+ attr |= tv->my_colorpair(tv,x);
+ } else if(tv->color_for == TV_COLOR_COL) {
+ x = 0;
+ switch(bam_aux_getCSi(p->b, p->qpos)) {
+ case '0': x = 0; break;
+ case '1': x = 1; break;
+ case '2': x = 2; break;
+ case '3': x = 3; break;
+ case '4': x = 4; break;
+ default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break;
+ }
+ x+=5;
+ attr |= tv->my_colorpair(tv,x);
+ } else if(tv->color_for == TV_COLOR_COLQ) {
+ x = bam_aux_getCQi(p->b, p->qpos);
+ if(0 == x) x = bam1_qual(p->b)[p->qpos];
+ x = x/10 + 1;
+ if (x > 4) x = 4;
+ attr |= tv->my_colorpair(tv,x);
+ }
+ tv->my_attron(tv,attr);
+ tv->my_mvaddch(tv,row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); // lower case for reads with the strand bit set
+ tv->my_attroff(tv,attr);
+ }
+ }
+ c = j? '*' : rb; // reference row: the base itself, or '*' padding under inserted columns
+ if (c == '*') {
+ attr = tv->my_colorpair(tv,8);
+ tv->my_attron(tv,attr);
+ tv->my_mvaddch(tv,1, tv->ccol++, c);
+ tv->my_attroff(tv,attr);
+ } else tv->my_mvaddch(tv,1, tv->ccol++, c);
+ }
+ tv->last_pos = pos;
+ return 0;
+}
+
+
+
+
+int tv_fetch_func(const bam1_t *b, void *data) /* bam_fetch() callback: applies the read-group filter and the optional N-to-D rewrite, then pushes the record to the pileup buffer; always returns 0 */
+{
+ tview_t *view = (tview_t*)data;
+ if (view->rg_hash != 0)
+ {
+ // keep only records whose RG tag is one of the registered read groups
+ const uint8_t *rg_tag = bam_aux_get(b, "RG");
+ if (rg_tag == 0) return 0;
+ if (kh_get(kh_rg, view->rg_hash, (const char*)(rg_tag + 1)) == kh_end(view->rg_hash)) return 0;
+ }
+ if (view->no_skip) {
+ // the CIGAR is rewritten in place even though b is declared const
+ uint32_t *op = bam1_cigar(b), *stop = op + b->core.n_cigar;
+ for (; op < stop; ++op) {
+ if ((*op & 0xf) != BAM_CREF_SKIP) continue;
+ *op = (*op & ~(uint32_t)0xf) | BAM_CDEL; // same length, operation replaced by a deletion
+ }
+ }
+ bam_lplbuf_push(b, view->lplbuf);
+ return 0;
+}
+
+int base_draw_aln(tview_t *tv, int tid, int pos) /* redraws the whole screen for target tid starting at 0-based position pos: clears, fetches the reference (if a FASTA is loaded), piles up overlapping reads, then pads rows 0-1 to the right edge; returns 0 */
+ {
+ assert(tv!=NULL);
+ // reset
+ tv->my_clear(tv);
+ tv->curr_tid = tid; tv->left_pos = pos;
+ tv->last_pos = tv->left_pos - 1;
+ tv->ccol = 0;
+ // print ref and consensus
+ if (tv->fai) {
+ char *str;
+ if (tv->ref) free(tv->ref);
+ assert(tv->curr_tid>=0);
+ // region string "name:beg-end" with 1-based coordinates; 30 spare bytes cover ':', '-', two ints and the NUL
+ str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1);
+ assert(str!=NULL);
+ sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol);
+ tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
+ free(str);
+ }
+ // draw aln
+ bam_lplbuf_reset(tv->lplbuf);
+ bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
+ bam_lplbuf_push(0, tv->lplbuf);
+ // fill the columns to the right of the last piled-up position
+ while (tv->ccol < tv->mcol) {
+ int cur = tv->last_pos + 1;
+ if (cur%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", cur+1);
+ tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && cur - tv->left_pos < tv->l_ref)? tv->ref[cur - tv->left_pos] : 'N'); // l_ref bounds the offset into ref, not the absolute position
+ ++tv->last_pos;
+ }
+ return 0;
+}
+
+
+
+
+static void error(const char *format, ...)
+{
+ if ( !format )
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bamtk tview [options] <aln.bam> [ref.fasta]\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -d display output as (H)tml or (C)urses or (T)ext \n");
+ fprintf(stderr, " -p chr:pos go directly to this position\n");
+ fprintf(stderr, " -s STR display only reads from this sample or group\n");
+ fprintf(stderr, "\n\n");
+ }
+ else
+ {
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ }
+ exit(-1);
+}
+
+enum dipsay_mode {display_ncurses,display_html,display_text};
+extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples);
+extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples);
+extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
+
+int bam_tview_main(int argc, char *argv[])
+ {
+ int view_mode=display_ncurses;
+ tview_t* tv=NULL;
+ char *samples=NULL, *position=NULL;
+ int c;
+ while ((c = getopt(argc, argv, "s:p:d:")) >= 0) {
+ switch (c) {
+ case 's': samples=optarg; break;
+ case 'p': position=optarg; break;
+ case 'd':
+ {
+ switch(optarg[0])
+ {
+ case 'H': case 'h': view_mode=display_html;break;
+ case 'T': case 't': view_mode=display_text;break;
+ case 'C': case 'c': view_mode=display_ncurses;break;
+ default: view_mode=display_ncurses;break;
+ }
+ break;
+ }
+ default: error(NULL);
+ }
+ }
+ if (argc==optind) error(NULL);
+
+ switch(view_mode)
+ {
+ case display_ncurses:
+ {
+ tv = curses_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ break;
+ }
+ case display_text:
+ {
+ tv = text_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ break;
+ }
+ case display_html:
+ {
+ tv = html_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples);
+ break;
+ }
+ }
+ if(tv==NULL)
+ {
+ error("cannot create view");
+ return EXIT_FAILURE;
+ }
+
+ if ( position )
+ {
+ int _tid = -1, _beg, _end;
+ bam_parse_region(tv->header, position, &_tid, &_beg, &_end);
+ if (_tid >= 0) { tv->curr_tid = _tid; tv->left_pos = _beg; }
+ }
+ tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
+ tv->my_loop(tv);
+ tv->my_destroy(tv);
+
+ return EXIT_SUCCESS;
+ }
diff --git a/samtools-0.1.19/bam_tview.h b/samtools-0.1.19/bam_tview.h
new file mode 100644
index 0000000..80f0464
--- /dev/null
+++ b/samtools-0.1.19/bam_tview.h
@@ -0,0 +1,75 @@
+#ifndef BAM_TVIEW_H
+#define BAM_TVIEW_H
+
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include "bam.h"
+#include "faidx.h"
+#include "bam2bcf.h"
+#include "sam_header.h"
+#include "khash.h"
+
+KHASH_MAP_INIT_STR(kh_rg, const char *)
+
+typedef struct AbstractTview { /* state shared by all tview front ends plus the callbacks each front end installs */
+ int mrow, mcol; /* screen size in rows and columns */
+
+ bam_index_t *idx;
+ bam_lplbuf_t *lplbuf;
+ bam_header_t *header;
+ bamFile fp;
+ int curr_tid, left_pos; /* target being shown and the position drawn in the leftmost column */
+ faidx_t *fai; /* reference index; stays NULL when no FASTA was given */
+ bcf_callaux_t *bca;
+
+ int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; /* ccol: next column to draw; last_pos: last position drawn; base_for: TV_BASE_*; color_for: TV_COLOR_*; l_ref: length of ref; the rest are display toggles */
+ char *ref; /* reference bases for the current screen, as returned by fai_fetch() */
+ khash_t(kh_rg) *rg_hash; /* read groups to display; NULL means no filtering */
+ /* callbacks */
+ void (*my_destroy)(struct AbstractTview* );
+ void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
+ void (*my_mvaddch)(struct AbstractTview*,int,int,int);
+ void (*my_attron)(struct AbstractTview*,int);
+ void (*my_attroff)(struct AbstractTview*,int);
+ void (*my_clear)(struct AbstractTview*);
+ int (*my_colorpair)(struct AbstractTview*,int);
+ int (*my_drawaln)(struct AbstractTview*,int,int);
+ int (*my_loop)(struct AbstractTview*);
+ int (*my_underline)(struct AbstractTview*);
+} tview_t;
+
+
+char bam_aux_getCEi(bam1_t *b, int i);
+char bam_aux_getCSi(bam1_t *b, int i);
+char bam_aux_getCQi(bam1_t *b, int i);
+
+#define TV_MIN_ALNROW 2
+#define TV_MAX_GOTO 40
+#define TV_LOW_MAPQ 10
+
+#define TV_COLOR_MAPQ 0
+#define TV_COLOR_BASEQ 1
+#define TV_COLOR_NUCL 2
+#define TV_COLOR_COL 3
+#define TV_COLOR_COLQ 4
+
+#define TV_BASE_NUCL 0
+#define TV_BASE_COLOR_SPACE 1
+
+int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
+int base_tv_init(tview_t*,const char *fn, const char *fn_fa, const char *samples);
+void base_tv_destroy(tview_t*);
+int base_draw_aln(tview_t *tv, int tid, int pos);
+
+typedef struct Tixel /* one character cell of the off-screen buffer used by the HTML and text views */
+ {
+ int ch; /* character to print */
+ int attributes; /* bit mask of colour-pair and underline flags active when the cell was written */
+ }tixel_t;
+
+#endif
+
diff --git a/samtools-0.1.19/bam_tview_curses.c b/samtools-0.1.19/bam_tview_curses.c
new file mode 100644
index 0000000..4fdd1fb
--- /dev/null
+++ b/samtools-0.1.19/bam_tview_curses.c
@@ -0,0 +1,297 @@
+#undef _HAVE_CURSES
+
+#if _CURSES_LIB == 0
+#elif _CURSES_LIB == 1
+#include <curses.h>
+#ifndef NCURSES_VERSION
+#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled"
+#else
+#define _HAVE_CURSES
+#endif
+#elif _CURSES_LIB == 2
+#include <xcurses.h>
+#define _HAVE_CURSES
+#else
+#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled"
+#endif
+
+
+#include "bam_tview.h"
+
+#ifdef _HAVE_CURSES
+
+
+
+typedef struct CursesTview { /* curses front end: the shared view plus its two popup windows */
+ tview_t view; /* first member, so a curses_tview_t* can be cast to tview_t* and back */
+ WINDOW *wgoto, *whelp; /* "Goto:" prompt window and help window */
+ } curses_tview_t;
+
+
+
+
+#define FROM_TV(ptr) ((curses_tview_t*)ptr)
+
+static void curses_destroy(tview_t* base) /* my_destroy callback of the curses view */
+ {
+ curses_tview_t* self=FROM_TV(base);
+ // drop both popup windows, then leave curses mode
+ delwin(self->wgoto);
+ delwin(self->whelp);
+ endwin();
+ // release what the shared base view owns
+ base_tv_destroy(base);
+ // base and self are the same allocation
+ free(self);
+ }
+
+/*
+ void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
+ void (*my_)(struct AbstractTview*,int,int,int);
+ void (*my_attron)(struct AbstractTview*,int);
+ void (*my_attroff)(struct AbstractTview*,int);
+ void (*my_clear)(struct AbstractTview*);
+ int (*my_colorpair)(struct AbstractTview*,int);
+*/
+
+static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) /* my_mvprintw callback: formats into a buffer of mcol+2 bytes (longer text is truncated) and prints it at row y, column x */
+ {
+ unsigned int size=tv->mcol+2;
+ char* str=malloc(size);
+ if(str==0) exit(EXIT_FAILURE);
+ va_list argptr;
+ va_start(argptr, fmt);
+ vsnprintf(str,size, fmt, argptr);
+ va_end(argptr);
+ mvprintw(y,x,"%s",str); // str is data, not a format: a '%' in it must not be interpreted again
+ free(str);
+ }
+
+static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch) /* my_mvaddch callback: forwards to curses mvaddch(); tv is unused */
+ {
+ mvaddch(y,x,ch);
+ }
+
+static void curses_attron(struct AbstractTview* tv,int flag) /* my_attron callback: forwards to curses attron(); tv is unused */
+ {
+ attron(flag);
+ }
+static void curses_attroff(struct AbstractTview* tv,int flag) /* my_attroff callback: forwards to curses attroff(); tv is unused */
+ {
+ attroff(flag);
+ }
+static void curses_clear(struct AbstractTview* tv) /* my_clear callback: forwards to curses clear(); tv is unused */
+ {
+ clear();
+ }
+
+static int curses_colorpair(struct AbstractTview* tv,int flag) /* my_colorpair callback: returns the curses attribute for colour pair number `flag` */
+ {
+ return COLOR_PAIR(flag);
+ }
+
+static int curses_drawaln(struct AbstractTview* tv, int tid, int pos) /* my_drawaln callback: the curses view draws directly, so this only delegates to base_draw_aln() */
+ {
+ return base_draw_aln(tv, tid, pos);
+ }
+
+
+
+static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos) /* "Goto:" prompt: reads a region ("chr:pos") or "=N" (position on the current target) and stores the result in *tid / *pos; Esc leaves both untouched */
+ {
+ char str[256] = "", *p; // start empty so that pressing Enter first does not read uninitialised bytes
+ int i, l = 0;
+ tview_t *base=(tview_t*)tv;
+ wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
+ mvwprintw(tv->wgoto, 1, 2, "Goto: ");
+ for (;;) {
+ int c = wgetch(tv->wgoto);
+ wrefresh(tv->wgoto);
+ if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
+ if(l > 0) --l;
+ } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
+ int _tid = -1, _beg, _end;
+ if (str[0] == '=') {
+ _beg = strtol(str+1, &p, 10) - 1;
+ if (_beg > 0) {
+ *pos = _beg;
+ return;
+ }
+ } else {
+ bam_parse_region(base->header, str, &_tid, &_beg, &_end);
+ if (_tid >= 0) {
+ *tid = _tid; *pos = _beg;
+ return;
+ }
+ }
+ } else if (c >= 0 && c < 256 && isgraph(c)) { // curses key codes above 255 are outside the domain of isgraph()
+ if (l < TV_MAX_GOTO) str[l++] = c;
+ } else if (c == '\027') l = 0; // ctrl-W clears the input
+ else if (c == '\033') return;
+ str[l] = '\0';
+ for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
+ mvwprintw(tv->wgoto, 1, 8, "%s", str);
+ }
+}
+
+
+
+
+static void tv_win_help(curses_tview_t *tv) { /* draws the key-binding help into tv->whelp and waits for one key press */
+ int r = 1;
+ /* r is the next window row to write; every mvwprintw() below advances it */
+ WINDOW *win = tv->whelp;
+ wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
+ mvwprintw(win, r++, 2, " -=- Help -=- ");
+ r++;
+ mvwprintw(win, r++, 2, "? This window");
+ mvwprintw(win, r++, 2, "Arrows Small scroll movement");
+ mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement");
+ mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement");
+ mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left");
+ mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right");
+ mvwprintw(win, r++, 2, "space Scroll one screen");
+ mvwprintw(win, r++, 2, "backspace Scroll back one screen");
+ mvwprintw(win, r++, 2, "g Go to specific location");
+ mvwprintw(win, r++, 2, "m Color for mapping qual");
+ mvwprintw(win, r++, 2, "n Color for nucleotide");
+ mvwprintw(win, r++, 2, "b Color for base quality");
+ mvwprintw(win, r++, 2, "c Color for cs color");
+ mvwprintw(win, r++, 2, "z Color for cs qual");
+ mvwprintw(win, r++, 2, ". Toggle on/off dot view");
+ mvwprintw(win, r++, 2, "s Toggle on/off ref skip");
+ mvwprintw(win, r++, 2, "r Toggle on/off rd name");
+ mvwprintw(win, r++, 2, "N Turn on nt view");
+ mvwprintw(win, r++, 2, "C Turn on cs view");
+ mvwprintw(win, r++, 2, "i Toggle on/off ins");
+ mvwprintw(win, r++, 2, "q Exit");
+ r++;
+ mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
+ mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19");
+ mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30");
+ wrefresh(win);
+ wgetch(win);
+}
+
+static int curses_underline(tview_t* tv) /* my_underline callback: returns the curses underline attribute; tv is unused */
+ {
+ return A_UNDERLINE;
+ }
+
+static int curses_loop(tview_t* tv) /* my_loop callback: interactive key loop; every handled key updates the view state and redraws; returns 0 when the user quits */
+ {
+ int tid, pos;
+ curses_tview_t *CTV=(curses_tview_t *)tv;
+ tid = tv->curr_tid; pos = tv->left_pos;
+ while (1) {
+ int c = getch();
+ switch (c) {
+ case '?': tv_win_help(CTV); break;
+ case '\033':
+ case 'q': goto end_loop;
+ case '/':
+ case 'g': tv_win_goto(CTV, &tid, &pos); break;
+ case 'm': tv->color_for = TV_COLOR_MAPQ; break;
+ case 'b': tv->color_for = TV_COLOR_BASEQ; break;
+ case 'n': tv->color_for = TV_COLOR_NUCL; break;
+ case 'c': tv->color_for = TV_COLOR_COL; break;
+ case 'z': tv->color_for = TV_COLOR_COLQ; break;
+ case 's': tv->no_skip = !tv->no_skip; break;
+ case 'r': tv->show_name = !tv->show_name; break;
+ case KEY_LEFT:
+ case 'h': --pos; break;
+ case KEY_RIGHT:
+ case 'l': ++pos; break;
+ case KEY_SLEFT:
+ case 'H': pos -= 20; break;
+ case KEY_SRIGHT:
+ case 'L': pos += 20; break;
+ case '.': tv->is_dot = !tv->is_dot; break;
+ case 'N': tv->base_for = TV_BASE_NUCL; break;
+ case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
+ case 'i': tv->ins = !tv->ins; break;
+ case '\010': pos -= 1000; break; // ctrl-H
+ case '\014': pos += 1000; break; // ctrl-L
+ case ' ': pos += tv->mcol; break;
+ case KEY_UP:
+ case 'j': --tv->row_shift; break;
+ case KEY_DOWN:
+ case 'k': ++tv->row_shift; break;
+ case KEY_BACKSPACE:
+ case '\177': pos -= tv->mcol; break;
+ case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
+ default: continue; // unknown key: wait for the next one without redrawing
+ }
+ if (pos < 0) pos = 0;
+ if (tv->row_shift < 0) tv->row_shift = 0;
+ tv->my_drawaln(tv, tid, pos);
+ }
+end_loop:
+ return 0;
+}
+
+
+
+
+tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples) /* creates the curses view: allocates it, runs the shared init, installs the curses_* callbacks and starts curses; returns NULL only if the allocation fails */
+ {
+ curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
+ tview_t* base=(tview_t*)tv;
+ if(tv==0)
+ {
+ fprintf(stderr,"Calloc failed\n");
+ return 0;
+ }
+
+ base_tv_init(base,fn,fn_fa,samples);
+ /* initialize callbacks */
+#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
+ SET_CALLBACK(destroy);
+ SET_CALLBACK(mvprintw);
+ SET_CALLBACK(mvaddch);
+ SET_CALLBACK(attron);
+ SET_CALLBACK(attroff);
+ SET_CALLBACK(clear);
+ SET_CALLBACK(colorpair);
+ SET_CALLBACK(drawaln);
+ SET_CALLBACK(loop);
+ SET_CALLBACK(underline);
+#undef SET_CALLBACK
+ /* enter curses mode */
+ initscr();
+ keypad(stdscr, TRUE);
+ clear();
+ noecho();
+ cbreak();
+ /* replace the 24x80 default from base_tv_init() with the real terminal size */
+ getmaxyx(stdscr, base->mrow, base->mcol);
+ tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
+ tv->whelp = newwin(29, 40, 5, 5);
+ /* pairs 1-4 are used for quality bins, 5-9 for nucleotide/colour-space colouring and padding (see tv_pl_func) */
+ start_color();
+ init_pair(1, COLOR_BLUE, COLOR_BLACK);
+ init_pair(2, COLOR_GREEN, COLOR_BLACK);
+ init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+ init_pair(4, COLOR_WHITE, COLOR_BLACK);
+ init_pair(5, COLOR_GREEN, COLOR_BLACK);
+ init_pair(6, COLOR_CYAN, COLOR_BLACK);
+ init_pair(7, COLOR_YELLOW, COLOR_BLACK);
+ init_pair(8, COLOR_RED, COLOR_BLACK);
+ init_pair(9, COLOR_BLUE, COLOR_BLACK);
+ return base;
+ }
+
+
+#else // #ifdef _HAVE_CURSES
+#include <stdio.h>
+#warning "No curses library is available; tview with curses is disabled."
+
+extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
+
+tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples) /* build without curses: fall back to the plain-text view */
+ {
+ return text_tv_init(fn,fn_fa,samples);
+ }
+#endif // #ifdef _HAVE_CURSES
+
+
diff --git a/samtools-0.1.19/bam_tview_html.c b/samtools-0.1.19/bam_tview_html.c
new file mode 100644
index 0000000..f52b4c3
--- /dev/null
+++ b/samtools-0.1.19/bam_tview_html.c
@@ -0,0 +1,349 @@
+#include <unistd.h>
+#include "bam_tview.h"
+
+#define UNDERLINE_FLAG 10
+
+typedef struct HtmlTview { /* HTML/text front end: the shared view plus an off-screen character buffer */
+ tview_t view; /* first member, so an html_tview_t* can be cast to tview_t* and back */
+ int row_count; /* number of rows currently allocated in screen */
+ tixel_t** screen; /* row_count rows of view.mcol cells each; NULL while empty */
+ FILE* out; /* destination stream (html_tv_init() sets it to stdout) */
+ int attributes;/* color... */
+ } html_tview_t;
+
+#define FROM_TV(ptr) ((html_tview_t*)ptr)
+
+static void html_destroy(tview_t* base) /* my_destroy callback: frees the screen buffer, the shared base state and the view itself */
+ {
+ html_tview_t* self=FROM_TV(base);
+ if(self->screen!=NULL)
+ {
+ int remaining=self->row_count;
+ while(remaining-- > 0) free(self->screen[remaining]);
+ free(self->screen);
+ }
+ base_tv_destroy(base);
+ free(self);
+ }
+
+/*
+ void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
+ void (*my_)(struct AbstractTview*,int,int,int);
+ void (*my_attron)(struct AbstractTview*,int);
+ void (*my_attroff)(struct AbstractTview*,int);
+ void (*my_clear)(struct AbstractTview*);
+ int (*my_colorpair)(struct AbstractTview*,int);
+*/
+
+static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) /* my_mvprintw callback: formats into a buffer of mcol+2 bytes and writes the characters to row y starting at column x via my_mvaddch */
+ {
+ int i,nchars=0;
+ unsigned int size=tv->mcol+2;
+ char* str=malloc(size);
+ if(str==0) exit(EXIT_FAILURE);
+ va_list argptr;
+ va_start(argptr, fmt);
+ nchars=vsnprintf(str,size, fmt, argptr);
+ va_end(argptr);
+ if(nchars >= (int)size) nchars = (int)size - 1; // vsnprintf reports the untruncated length; never read past what was stored
+ for(i=0;i< nchars;++i)
+ {
+ tv->my_mvaddch(tv,y,x+i,str[i]);
+ }
+ free(str);
+ }
+
+static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch) /* my_mvaddch callback: stores ch with the current attributes at (y,x) of the off-screen buffer, adding blank rows as needed; coordinates outside the screen are ignored */
+ {
+ tixel_t* row=NULL;
+ html_tview_t* ptr=FROM_TV(tv);
+ if( x < 0 || y < 0 || x >= tv->mcol ) return; //out of screen
+ while(ptr->row_count<=y)
+ {
+ int i; // append one blank row per iteration until row y exists
+ row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t));
+ ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1)); // a failure ends the process below, so the overwritten pointer is not missed
+ if(row==0 || ptr->screen==0) exit(EXIT_FAILURE);
+ for(i=0;i<tv->mcol;++i) {row[i].ch=' ';row[i].attributes=0;}
+ ptr->screen[ptr->row_count++]=row;
+ }
+ row=ptr->screen[y];
+ row[x].ch=ch;
+ row[x].attributes=ptr->attributes;
+ }
+
+static void html_attron(struct AbstractTview* tv,int flag) /* my_attron callback: sets the given bits in the attribute mask applied to subsequently written cells */
+ {
+ html_tview_t* ptr=FROM_TV(tv);
+ ptr->attributes |= flag;
+
+
+ }
+
+static void html_attroff(struct AbstractTview* tv,int flag) /* my_attroff callback: clears the given bits from the current attribute mask */
+ {
+ html_tview_t* ptr=FROM_TV(tv);
+ ptr->attributes &= ~(flag);
+ }
+
+static void html_clear(struct AbstractTview* tv) /* my_clear callback: discards every buffered row and resets the current attributes */
+ {
+ html_tview_t* self=FROM_TV(tv);
+ tixel_t** rows=self->screen;
+ int n_rows=self->row_count;
+ // reset the bookkeeping first; the old rows are released below
+ self->screen=NULL;
+ self->row_count=0;
+ self->attributes=0;
+ if(rows==NULL) return;
+ while(n_rows-- > 0) free(rows[n_rows]);
+ free(rows);
+ }
+
+static int html_colorpair(struct AbstractTview* tv,int flag) /* my_colorpair callback: colour pair n is represented as bit n of the attribute mask */
+ {
+ return (1 << (flag));
+ }
+
+static int html_drawaln(struct AbstractTview* tv, int tid, int pos) /* my_drawaln callback of the HTML view: renders the alignment into the off-screen buffer and writes it to ptr->out as a complete HTML page; returns 0 */
+ {
+ int y,x;
+ html_tview_t* ptr=FROM_TV(tv);
+ html_clear(tv);
+ base_draw_aln(tv, tid, pos);
+ fputs("<html><head>",ptr->out);
+ fprintf(ptr->out,"<title>%s:%d</title>",
+ tv->header->target_name[tid],
+ pos+1
+ );
+ //style
+
+ fputs("<style type='text/css'>\n",ptr->out);
+ fputs(".tviewbody { margin:5px; background-color:white;text-align:center;}\n",ptr->out);
+ fputs(".tviewtitle {text-align:center;}\n",ptr->out);
+ fputs(".tviewpre { margin:5px; background-color:white;}\n",ptr->out);
+ #define CSS(id,col) fprintf(ptr->out,".tviewc%d {color:%s;}\n.tviewcu%d {color:%s;text-decoration:underline;}\n",id,col,id,col);
+ CSS(0, "black");
+ CSS(1, "blue");
+ CSS(2, "green");
+ CSS(3, "yellow");
+ CSS(4, "black");
+ CSS(5, "green");
+ CSS(6, "cyan");
+ CSS(7, "yellow");
+ CSS(8, "red");
+ CSS(9, "blue");
+ #undef CSS
+ fputs("</style>",ptr->out);
+
+ fputs("</head><body>",ptr->out);
+
+ fprintf(ptr->out,"<div class='tviewbody'><div class='tviewtitle'>%s:%d</div>",
+ tv->header->target_name[tid],
+ pos+1
+ );
+
+ fputs("<pre class='tviewpre'>",ptr->out);
+ for(y=0;y< ptr->row_count;++y)
+ {
+
+ for(x=0;x< tv->mcol;++x)
+ {
+
+ // open a new <span> whenever the attributes differ from the previous cell
+ if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes)
+ {
+ int css=0;
+ fprintf(ptr->out,"<span");
+ while(css<32)
+ {
+ //if(y>1) fprintf(stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
+ if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
+ {
+ // the lowest set bit selects the colour class; "u" is inserted when the underline bit is set
+ fprintf(ptr->out," class='tviewc%s%d'",
+ (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""),
+ css);
+ break;
+ }
+ ++css;
+ }
+
+
+ fputs(">",ptr->out);
+ }
+
+ int ch=ptr->screen[y][x].ch;
+ switch(ch)
+ {
+ case '<': fputs("&lt;",ptr->out);break; // markup characters must be written as entities ('<' and '>' mark reference skips)
+ case '>': fputs("&gt;",ptr->out);break;
+ case '&': fputs("&amp;",ptr->out);break;
+ default: fputc(ch,ptr->out); break;
+ }
+
+ // close the span before a cell with different attributes, or at the end of the row
+ if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes)
+ {
+ fputs("</span>",ptr->out);
+ }
+ }
+ if(y+1 < ptr->row_count) fputs("<br/>",ptr->out);
+ }
+ fputs("</pre></div></body></html>",ptr->out);
+ return 0;
+ }
+
+
+#define ANSI_COLOR_RED "\x1b[31m"
+#define ANSI_COLOR_GREEN "\x1b[32m"
+#define ANSI_COLOR_YELLOW "\x1b[33m"
+#define ANSI_COLOR_BLUE "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN "\x1b[36m"
+#define ANSI_COLOR_BLACK "\x1b[0m"
+#define ANSI_COLOR_RESET ANSI_COLOR_BLACK
+
+#define ANSI_UNDERLINE_SET "\033[4m"
+#define ANSI_UNDERLINE_UNSET "\033[0m"
+
+static int text_drawaln(struct AbstractTview* tv, int tid, int pos) /* my_drawaln callback of the text view: renders the alignment into the off-screen buffer and prints it row by row to ptr->out; returns 0 */
+ {
+ int y,x;
+ html_tview_t* ptr=FROM_TV(tv);
+ html_clear(tv);
+ base_draw_aln(tv, tid, pos);
+ int is_term= isatty(fileno(ptr->out)); /* ANSI escapes are emitted only when the output is a terminal */
+
+ for(y=0;y< ptr->row_count;++y)
+ {
+ for(x=0;x< tv->mcol;++x)
+ {
+ if(is_term)
+ {
+ int css=0;
+ while(css<32) /* find the lowest attribute bit that is set; 32 means none */
+ {
+ if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
+ {
+ break;
+ }
+ ++css;
+ }
+ switch(css)
+ {
+ //CSS(0, "black");
+ case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break;
+ case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break;
+ case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
+ //CSS(4, "black");
+ case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break;
+ case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break;
+ case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
+ case 8: fputs(ANSI_COLOR_RED,ptr->out); break;
+ case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break;
+ default:break;
+ }
+ if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
+ {
+ fputs(ANSI_UNDERLINE_SET,ptr->out);
+ }
+
+ }
+
+
+ int ch=ptr->screen[y][x].ch;
+
+ fputc(ch,ptr->out);
+ if(is_term) /* attributes are reset after every single character */
+ {
+ fputs(ANSI_COLOR_RESET,ptr->out);
+ if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
+ {
+ fputs(ANSI_UNDERLINE_UNSET,ptr->out);
+ }
+ }
+ }
+ fputc('\n',ptr->out);
+ }
+ return 0;
+ }
+
+
+static int html_loop(tview_t* tv) /* my_loop callback: the HTML/text views are not interactive, so this returns 0 immediately */
+ {
+ //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
+ return 0;
+ }
+
+static int html_underline(tview_t* tv) /* my_underline callback: underline is bit UNDERLINE_FLAG of the attribute mask; tv is unused */
+ {
+ return (1 << UNDERLINE_FLAG);
+ }
+
+/*
+static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper)
+ {
+
+ }
+*/
+
+tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples) /* creates the HTML view: allocates it, runs the shared init and installs the html_* callbacks; output goes to stdout; returns NULL only if the allocation fails */
+ {
+ char* colstr=getenv("COLUMNS");
+ html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
+ tview_t* base=(tview_t*)tv;
+ if(tv==0)
+ {
+ fprintf(stderr,"Calloc failed\n");
+ return 0;
+ }
+ tv->row_count=0;
+ tv->screen=NULL;
+ tv->out=stdout;
+ tv->attributes=0;
+ base_tv_init(base,fn,fn_fa,samples);
+ /* initialize callbacks */
+#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
+ SET_CALLBACK(destroy);
+ SET_CALLBACK(mvprintw);
+ SET_CALLBACK(mvaddch);
+ SET_CALLBACK(attron);
+ SET_CALLBACK(attroff);
+ SET_CALLBACK(clear);
+ SET_CALLBACK(colorpair);
+ SET_CALLBACK(drawaln);
+ SET_CALLBACK(loop);
+ SET_CALLBACK(underline);
+#undef SET_CALLBACK
+
+ /* width: $COLUMNS when set (values below 10 fall back to 80), otherwise the 80 set by base_tv_init() */
+ if(colstr!=0)
+ {
+ base->mcol=atoi(colstr);
+ if(base->mcol<10) base->mcol=80;
+ }
+ base->mrow=99999; /* large row limit: rows are allocated on demand by html_mvaddch() */
+
+/*
+ init_pair(tv,1, "blue", "white");
+ init_pair(tv,2, "green", "white");
+ init_pair(tv,3, "yellow", "white");
+ init_pair(tv,4, "white", "white");
+ init_pair(tv,5, "green", "white");
+ init_pair(tv,6, "cyan", "white");
+ init_pair(tv,7, "yellow", "white");
+ init_pair(tv,8, "red", "white");
+ init_pair(tv,9, "blue", "white");
+ */
+ return base;
+ }
+
+
+tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples) /* creates the plain-text view: an HTML view whose draw callback is replaced by text_drawaln; returns NULL if the view could not be allocated */
+ {
+ tview_t* tv=html_tv_init(fn,fn_fa,samples);
+ if(tv!=0) tv->my_drawaln=text_drawaln; // html_tv_init() returns 0 when its allocation fails
+ return tv;
+ }
+
diff --git a/samtools-0.1.19/bamshuf.c b/samtools-0.1.19/bamshuf.c
new file mode 100644
index 0000000..33a5238
--- /dev/null
+++ b/samtools-0.1.19/bamshuf.c
@@ -0,0 +1,141 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "sam.h"
+#include "ksort.h"
+
+#define DEF_CLEVEL 1
+
+static inline unsigned hash_Wang(unsigned key) /* integer bit-mixing function; used below to scramble the string hash */
+{
+ unsigned v = key;
+ v = v + ~(v << 15);
+ v = v ^ (v >> 10);
+ v = v + (v << 3);
+ v = v ^ (v >> 6);
+ v = v + ~(v << 11);
+ return v ^ (v >> 16);
+}
+
+static inline unsigned hash_X31_Wang(const char *s) /* string hash: multiply-by-31 accumulation over the bytes of s, finished with hash_Wang(); the empty string hashes to 0 */
+{
+ unsigned acc = *s;
+ if (acc == 0) return 0;
+ // (acc << 5) - acc is acc * 31 in unsigned arithmetic
+ while (*++s) acc = (acc << 5) - acc + *s;
+ return hash_Wang(acc);
+}
+
+typedef struct { /* one record to be sorted within a bucket */
+ unsigned key; /* hash_X31_Wang() of the read name */
+ bam1_t *b; /* the alignment record */
+} elem_t;
+
+static inline int elem_lt(elem_t x, elem_t y) /* "less than" for the sort: by hash key, then read name, then the two flag bits at positions 6-7 */
+{
+ int name_cmp;
+ if (x.key != y.key) return x.key < y.key;
+ // equal hashes: fall back to comparing the read names
+ name_cmp = strcmp(bam_get_qname(x.b), bam_get_qname(y.b));
+ if (name_cmp != 0) return name_cmp < 0;
+ // same name: order by flag bits 6 and 7
+ return (x.b->core.flag>>6&3) < (y.b->core.flag>>6&3);
+}
+
+KSORT_INIT(bamshuf, elem_t, elem_lt)
+
+static void bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout) /* splits BAM fn ("-" = stdin) into n_files temporary files "<pre>.NNNN.bam" by read-name hash, then sorts each bucket by hash and appends it to "<pre>.bam" or stdout; n_files must be positive */
+{
+ BGZF *fp, *fpw, **fpt;
+ char **fnt, modew[8];
+ bam1_t *b;
+ int i, l;
+ bam_hdr_t *h;
+ int64_t *cnt;
+
+ // split
+ fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r");
+ if (fp == 0) { fprintf(stderr, "[bamshuf] fail to open \"%s\" for reading\n", fn); exit(1); } // checked explicitly so the test survives NDEBUG builds
+ h = bam_hdr_read(fp);
+ fnt = (char**)calloc(n_files, sizeof(void*));
+ fpt = (BGZF**)calloc(n_files, sizeof(void*));
+ cnt = (int64_t*)calloc(n_files, 8);
+ l = strlen(pre);
+ for (i = 0; i < n_files; ++i) {
+ fnt[i] = (char*)calloc(l + 20, 1); // '.' + up to 11 characters of %d + ".bam" + NUL; the old l + 10 overflowed once i needed five digits
+ snprintf(fnt[i], l + 20, "%s.%.4d.bam", pre, i);
+ fpt[i] = bgzf_open(fnt[i], "w1");
+ bam_hdr_write(fpt[i], h);
+ }
+ b = bam_init1();
+ while (bam_read1(fp, b) >= 0) {
+ uint32_t x;
+ x = hash_X31_Wang(bam_get_qname(b)) % n_files; // bucket chosen by read name, so records sharing a name share a bucket
+ bam_write1(fpt[x], b);
+ ++cnt[x];
+ }
+ bam_destroy1(b);
+ for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]);
+ free(fpt);
+ bgzf_close(fp);
+ // merge
+ sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
+ if (!is_stdout) { // output to a file
+ char *fnw = (char*)calloc(l + 5, 1);
+ sprintf(fnw, "%s.bam", pre);
+ fpw = bgzf_open(fnw, modew);
+ free(fnw);
+ } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout
+ bam_hdr_write(fpw, h);
+ bam_hdr_destroy(h);
+ for (i = 0; i < n_files; ++i) {
+ int64_t j, c = cnt[i];
+ elem_t *a;
+ fp = bgzf_open(fnt[i], "r");
+ bam_hdr_destroy(bam_hdr_read(fp)); // skip the header copy written during the split
+ a = (elem_t*)calloc(c, sizeof(elem_t));
+ for (j = 0; j < c; ++j) {
+ a[j].b = bam_init1();
+ if (bam_read1(fp, a[j].b) < 0) { fprintf(stderr, "[bamshuf] fail to read back \"%s\"\n", fnt[i]); exit(1); } // the read used to sit inside assert() and would vanish under NDEBUG
+ a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
+ }
+ bgzf_close(fp);
+ unlink(fnt[i]);
+ free(fnt[i]);
+ ks_introsort(bamshuf, c, a);
+ for (j = 0; j < c; ++j) {
+ bam_write1(fpw, a[j].b);
+ bam_destroy1(a[j].b);
+ }
+ free(a);
+ }
+ bgzf_close(fpw);
+ free(fnt); free(cnt);
+}
+
+int main_bamshuf(int argc, char *argv[])
+{
+ int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
+ while ((c = getopt(argc, argv, "n:l:uO")) >= 0) {
+ switch (c) {
+ case 'n': n_files = atoi(optarg); break;
+ case 'l': clevel = atoi(optarg); break;
+ case 'u': is_un = 1; break;
+ case 'O': is_stdout = 1; break;
+ }
+ }
+ if (is_un) clevel = 0;
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\nUsage: bamshuf [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n");
+ fprintf(stderr, "Options: -O output to stdout\n");
+ fprintf(stderr, " -u uncompressed BAM output\n");
+ fprintf(stderr, " -l INT compression level [%d]\n", DEF_CLEVEL);
+ fprintf(stderr, " -n INT number of temporary files [%d]\n", n_files);
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout);
+ return 0;
+}
diff --git a/samtools-0.1.19/bamtk.c b/samtools-0.1.19/bamtk.c
new file mode 100644
index 0000000..9df7c11
--- /dev/null
+++ b/samtools-0.1.19/bamtk.c
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <fcntl.h>
+#include "bam.h"
+
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+int bam_taf2baf(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
+int bam_merge(int argc, char *argv[]);
+int bam_index(int argc, char *argv[]);
+int bam_sort(int argc, char *argv[]);
+int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
+int bam_idxstats(int argc, char *argv[]);
+int main_samview(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
+int main_cut_target(int argc, char *argv[]);
+int main_phase(int argc, char *argv[]);
+int main_cat(int argc, char *argv[]);
+int main_depth(int argc, char *argv[]);
+int main_bam2fq(int argc, char *argv[]);
+int main_pad2unpad(int argc, char *argv[]);
+int main_bedcov(int argc, char *argv[]);
+int main_bamshuf(int argc, char *argv[]);
+
+int faidx_main(int argc, char *argv[]);
+
+/*
+ * Write the top-level help text (program name, version, command list) to
+ * stderr.  Always returns 1 so callers can use it directly as an exit status.
+ */
+static int usage()
+{
+    FILE *out = stderr;
+
+    fputs("\n"
+          "Program: samtools (Tools for alignments in the SAM format)\n", out);
+    fprintf(out, "Version: %s\n\n", BAM_VERSION);
+    fputs("Usage: samtools <command> [options]\n\n"
+          "Command: view SAM<->BAM conversion\n"
+          " sort sort alignment file\n"
+          " mpileup multi-way pileup\n"
+          " depth compute the depth\n"
+          " faidx index/extract FASTA\n", out);
+#if _CURSES_LIB != 0
+    fputs(" tview text alignment viewer\n", out);
+#endif
+    // "depad" is deliberately not listed: it is marked as not stable
+    fputs(" index index alignment\n"
+          " idxstats BAM index stats (r595 or later)\n"
+          " fixmate fix mate information\n"
+          " flagstat simple stats\n"
+          " calmd recalculate MD/NM tags and '=' bases\n"
+          " merge merge sorted alignments\n"
+          " rmdup remove PCR duplicates\n"
+          " reheader replace BAM header\n"
+          " cat concatenate BAMs\n"
+          " bedcov read depth per BED region\n"
+          " targetcut cut fosmid regions (for fosmid pool only)\n"
+          " phase phase heterozygotes\n"
+          " bamshuf shuffle and group alignments by name\n"
+          "\n", out);
+#ifdef _WIN32
+    fputs("Note: The Windows version of SAMtools is mainly designed for read-only\n"
+          " operations, such as viewing the alignments and generating the pileup.\n"
+          " Binary files generated by the Windows version may be buggy.\n\n", out);
+#endif
+    return 1;
+}
+
+/*
+ * Program entry point: look up argv[1] in the sub-command table and run the
+ * matching handler with the program name stripped (argc-1, argv+1).
+ *
+ * Returns the handler's result, the result of usage() when no command is
+ * given, and 1 for the removed `pileup' command or an unknown command.
+ */
+int main(int argc, char *argv[])
+{
+    typedef int (*subcommand_fn)(int, char *[]);
+    static const struct { const char *name; subcommand_fn run; } dispatch[] = {
+        { "view", main_samview },
+        { "import", main_import },
+        { "mpileup", bam_mpileup },
+        { "merge", bam_merge },
+        { "sort", bam_sort },
+        { "index", bam_index },
+        { "idxstats", bam_idxstats },
+        { "faidx", faidx_main },
+        { "fixmate", bam_mating },
+        { "rmdup", bam_rmdup },
+        { "flagstat", bam_flagstat },
+        { "calmd", bam_fillmd },
+        { "fillmd", bam_fillmd },
+        { "reheader", main_reheader },
+        { "cat", main_cat },
+        { "targetcut", main_cut_target },
+        { "phase", main_phase },
+        { "depth", main_depth },
+        { "bam2fq", main_bam2fq },
+        { "pad2unpad", main_pad2unpad },
+        { "depad", main_pad2unpad },
+        { "bedcov", main_bedcov },
+        { "bamshuf", main_bamshuf },
+#if _CURSES_LIB != 0
+        { "tview", bam_tview_main },
+#endif
+    };
+    const char *cmd;
+    size_t k;
+#ifdef _WIN32
+    setmode(fileno(stdout), O_BINARY);
+    setmode(fileno(stdin), O_BINARY);
+#ifdef _USE_KNETFILE
+    knet_win32_init();
+#endif
+#endif
+    if (argc < 2) return usage();
+    cmd = argv[1];
+    for (k = 0; k < sizeof(dispatch) / sizeof(dispatch[0]); ++k) {
+        if (strcmp(cmd, dispatch[k].name) == 0)
+            return dispatch[k].run(argc-1, argv+1);
+    }
+    if (strcmp(cmd, "pileup") == 0) {
+        fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
+        return 1;
+    }
+    fprintf(stderr, "[main] unrecognized command '%s'\n", cmd);
+    return 1;
+}
diff --git a/samtools-0.1.19/bcftools/Makefile b/samtools-0.1.19/bcftools/Makefile
new file mode 100644
index 0000000..be831de
--- /dev/null
+++ b/samtools-0.1.19/bcftools/Makefile
@@ -0,0 +1,51 @@
+CC= gcc
+CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
+LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o mut.o bcf2qcall.o
+OMISC= ..
+AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o
+PROG= bcftools
+INCLUDES=
+SUBDIRS= .
+
+.SUFFIXES:.c .o
+# Suffix rule: compile one .c into the matching .o, with the parent directory on the include path
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) -I.. $(INCLUDES) $< -o $@
+# Recursion helper: strip "-recur" from the target name and run that target in every directory of SUBDIRS
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+all:$(PROG)
+
+lib:libbcf.a
+# Static library built from the LOBJS objects
+libbcf.a:$(LOBJS)
+ $(AR) -csru $@ $(LOBJS)
+# Link the command-line tool from the AOBJS objects and libbcf.a
+bcftools:lib $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz -lpthread
+# Header dependencies of the individual objects
+bcf.o:bcf.h
+vcf.o:bcf.h
+index.o:bcf.h
+bcfutils.o:bcf.h
+prob1.o:prob1.h bcf.h
+call1.o:prob1.h bcf.h
+bcf2qcall.o:bcf.h
+main.o:bcf.h
+# Typeset the format description in bcf.tex
+bcf.pdf:bcf.tex
+ pdflatex bcf
+# Remove build products and editor/profiler leftovers from this directory
+cleanlocal:
+ rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so*
+
+clean:cleanlocal-recur
diff --git a/samtools-0.1.19/bcftools/README b/samtools-0.1.19/bcftools/README
new file mode 100644
index 0000000..1d7159d
--- /dev/null
+++ b/samtools-0.1.19/bcftools/README
@@ -0,0 +1,36 @@
+The view command of bcftools calls variants, tests Hardy-Weinberg
+equilibrium (HWE), tests allele balances and estimates allele frequency.
+
+This command calls a site as a potential variant if P(ref|D,F) is below
+0.9 (controlled by the -p option), where D is data and F is the prior
+allele frequency spectrum (AFS).
+
+The view command performs two types of allele balance tests, both based
+on Fisher's exact test for 2x2 contingency tables with the row variable
+being reference allele or not. In the first table, the column variable
+is strand. Two-tail P-value is taken. We test if variant bases tend to
+come from one strand. In the second table, the column variable is
+whether a base appears in the first or the last 11bp of the read.
+One-tail P-value is taken. We test if variant bases tend to occur
+towards the end of reads, which is usually an indication of
+misalignment.
+
+Site allele frequency is estimated in two ways. In the first way, the
+frequency is estimated as \argmax_f P(D|f) under the assumption of
+HWE. Prior AFS is not used. In the second way, the frequency is
+estimated as the posterior expectation of allele counts \sum_k
+kP(k|D,F), divided by the total number of haplotypes. HWE is not
+assumed, but the estimate depends on the prior AFS. The two estimates
+largely agree when the signal is strong, but may differ greatly on weak
+sites as in this case, the prior plays an important role.
+
+To test HWE, we calculate the posterior distribution of genotypes
+(ref-hom, het and alt-hom). Chi-square test is performed. It is worth
+noting that the model used here is prior dependent and assumes HWE,
+which is different from both models for allele frequency estimate. The
+new model actually yields a third estimate of site allele frequency.
+
+The estimated allele frequency spectrum is printed to stderr per 64k
+sites. The estimate is in fact only the first round of an EM
+procedure. The second model (not the model for HWE testing) is used to
+estimate the AFS.
\ No newline at end of file
diff --git a/samtools-0.1.19/bcftools/bcf.c b/samtools-0.1.19/bcftools/bcf.c
new file mode 100644
index 0000000..24728db
--- /dev/null
+++ b/samtools-0.1.19/bcftools/bcf.c
@@ -0,0 +1,396 @@
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include "kstring.h"
+#include "bcf.h"
+
+/*
+ * Open a BCF file.
+ *
+ * fn   - file name; "-" selects stdout when `mode` contains 'w' and stdin
+ *        otherwise
+ * mode - mode string, passed unchanged to bgzf_open()/bgzf_fdopen()
+ *
+ * Returns a newly allocated handle (release it with bcf_close()), or NULL
+ * when the handle cannot be allocated or the underlying stream cannot be
+ * opened.
+ */
+bcf_t *bcf_open(const char *fn, const char *mode)
+{
+    bcf_t *b = calloc(1, sizeof(bcf_t));
+    if (b == 0) return 0;
+    if (strcmp(fn, "-") != 0) b->fp = bgzf_open(fn, mode);
+    else if (strchr(mode, 'w')) b->fp = bgzf_fdopen(fileno(stdout), mode);
+    else b->fp = bgzf_fdopen(fileno(stdin), mode);
+    if (b->fp == 0) { // never hand out a handle that has no stream behind it
+        free(b);
+        return 0;
+    }
+    return b;
+}
+
+/*
+ * Close the stream held by `b` and free the handle.
+ * Returns the result of bgzf_close(), or 0 when `b` is NULL.
+ */
+int bcf_close(bcf_t *b)
+{
+    int status = 0;
+    if (b != 0) {
+        status = bgzf_close(b->fp);
+        free(b);
+    }
+    return status;
+}
+
+/*
+ * Write the BCF header: the 4-byte magic "BCF\4", then for each of the
+ * sequence names, sample names and header text a 4-byte length followed by
+ * the bytes themselves.  The stream is flushed afterwards.
+ *
+ * Returns -1 when `b` or `h` is NULL, otherwise 16 plus the three lengths.
+ */
+int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h)
+{
+    int n_bytes;
+    if (!b || !h) return -1;
+    n_bytes = 16 + h->l_nm + h->l_smpl + h->l_txt; // magic + three length words + payloads
+    bgzf_write(b->fp, "BCF\4", 4);
+    // sequence names
+    bgzf_write(b->fp, &h->l_nm, 4);
+    bgzf_write(b->fp, h->name, h->l_nm);
+    // sample names
+    bgzf_write(b->fp, &h->l_smpl, 4);
+    bgzf_write(b->fp, h->sname, h->l_smpl);
+    // header text
+    bgzf_write(b->fp, &h->l_txt, 4);
+    bgzf_write(b->fp, h->txt, h->l_txt);
+    bgzf_flush(b->fp);
+    return n_bytes;
+}
+
+/*
+ * Read one header field: a 4-byte length followed by that many bytes, which
+ * are stored in a freshly allocated buffer.  Returns 0 on success and -1 on
+ * a short read, a negative length or an allocation failure.
+ */
+static int bcf_hdr_read_field(bcf_t *b, int32_t *len, char **buf)
+{
+    if (bgzf_read(b->fp, len, 4) != 4 || *len < 0) {
+        *len = 0;
+        return -1;
+    }
+    *buf = malloc(*len);
+    if (*buf == 0 && *len > 0) {
+        *len = 0;
+        return -1;
+    }
+    if (bgzf_read(b->fp, *buf, *len) != *len) return -1;
+    return 0;
+}
+
+/*
+ * Read the BCF header from `b`: the 4-byte magic, then the sequence names,
+ * the sample names and the header text, and finally rebuild the derived
+ * name arrays with bcf_hdr_sync().
+ *
+ * Returns a newly allocated header (release it with bcf_hdr_destroy()), or
+ * NULL when `b` is NULL, memory runs out, or the stream ends or fails before
+ * the whole header has been read.  The magic bytes are consumed but their
+ * value is not checked.
+ */
+bcf_hdr_t *bcf_hdr_read(bcf_t *b)
+{
+    uint8_t magic[4];
+    bcf_hdr_t *h;
+    if (b == 0) return 0;
+    h = calloc(1, sizeof(bcf_hdr_t));
+    if (h == 0) return 0;
+    if (bgzf_read(b->fp, magic, 4) != 4
+        || bcf_hdr_read_field(b, &h->l_nm, &h->name) < 0
+        || bcf_hdr_read_field(b, &h->l_smpl, &h->sname) < 0
+        || bcf_hdr_read_field(b, &h->l_txt, &h->txt) < 0)
+    {
+        bcf_hdr_destroy(h); // frees whatever was read so far
+        return 0;
+    }
+    bcf_hdr_sync(h);
+    return h;
+}
+
+/*
+ * Free a header together with its name, sname, txt, ns and sns buffers.
+ * A NULL `h` is ignored.
+ */
+void bcf_hdr_destroy(bcf_hdr_t *h)
+{
+    if (h != 0) {
+        free(h->ns);
+        free(h->sns);
+        free(h->name);
+        free(h->sname);
+        free(h->txt);
+        free(h);
+    }
+}
+
+/*
+ * Index the NUL-terminated strings packed back to back in str[0..l).
+ *
+ * l   - number of bytes in `str`
+ * str - the packed strings
+ * _n  - receives the number of strings found, i.e. the number of NUL bytes
+ *
+ * Returns a calloc()ed array of *_n pointers into `str` (the caller frees the
+ * array, not the strings).  Returns NULL with *_n == 0 when `l` is not
+ * positive, `str` is NULL, the buffer contains no NUL byte, or the array
+ * cannot be allocated.  Bytes after the last NUL do not form a string and
+ * are not indexed.
+ */
+static inline char **cnt_null(int l, char *str, int *_n)
+{
+    int total = 0, filled = 0;
+    char *p, *end, **list;
+    *_n = 0;
+    if (l <= 0 || str == 0) return 0;
+    end = str + l;
+    for (p = str; p != end; ++p)
+        if (*p == 0) ++total;
+    if (total == 0) return 0; // nothing is terminated, so there is nothing to index
+    list = calloc(total, sizeof *list);
+    if (list == 0) return 0;
+    *_n = total;
+    list[filled++] = str;
+    // never store more than `total` pointers, even when the buffer does not end in NUL
+    for (p = str; p < end - 1 && filled < total; ++p)
+        if (*p == 0) list[filled++] = p + 1;
+    return list;
+}
+
+/*
+ * Rebuild the derived name arrays of a header: ns/n_ref from name/l_nm and
+ * sns/n_smpl from sname/l_smpl.  Any previous arrays are freed first.
+ * Returns -1 when `b` is NULL and 0 otherwise.
+ */
+int bcf_hdr_sync(bcf_hdr_t *b)
+{
+    if (!b) return -1;
+    free(b->ns);
+    free(b->sns);
+    b->ns = 0;
+    b->n_ref = 0;
+    if (b->l_nm != 0)
+        b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
+    b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl);
+    return 0;
+}
+/*
+ * Rebuild the derived members of a record from b->str and b->n_smpl.
+ *
+ * - ref, alt, flt, info and fmt are pointed at the five NUL-separated
+ *   strings that follow the first string in b->str;
+ * - n_alleles is 1 when alt is empty, otherwise the number of commas in alt
+ *   plus 2;
+ * - n_gi is the number of ':' characters in fmt plus 1; gi[] is grown (new
+ *   slots zeroed) when needed and every gi[i].fmt is set via bcf_str2int();
+ * - gi[i].len is set for PL, DP, HQ, DV, GQ, GT, SP and GL, and gi[i].data
+ *   is resized to n_smpl * gi[i].len bytes.
+ *
+ * Returns 0 on success, or -1 (after a message on stderr) when b->str does
+ * not contain exactly five such strings.
+ *
+ * NOTE(review): for any other FORMAT key gi[i].len is not assigned here, so
+ * the slot keeps whatever value it already had -- confirm this is intended.
+ */
+int bcf_sync(bcf1_t *b)
+{
+ char *p, *tmp[5];
+ int i, n, n_smpl = b->n_smpl;
+ ks_tokaux_t aux;
+ // set ref, alt, flt, info, fmt
+ b->ref = b->alt = b->flt = b->info = b->fmt = 0;
+ for (p = b->str, n = 0; p < b->str + b->l_str; ++p) {
+ if (*p == 0 && p+1 != b->str + b->l_str) { // a NUL that is not the last byte starts a new string
+ if (n == 5) { // one string too many: make n != 5 so the check below fails
+ ++n;
+ break;
+ } else tmp[n++] = p + 1;
+ }
+ }
+ if (n != 5) {
+ fprintf(stderr, "[%s] incorrect number of fields (%d != 5) at %d:%d\n", __func__, n, b->tid, b->pos);
+ return -1;
+ }
+ b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
+ // set n_alleles
+ if (*b->alt == 0) b->n_alleles = 1;
+ else {
+ for (p = b->alt, n = 1; *p; ++p)
+ if (*p == ',') ++n;
+ b->n_alleles = n + 1;
+ }
+ // set n_gi and gi[i].fmt
+ for (p = b->fmt, n = 1; *p; ++p)
+ if (*p == ':') ++n;
+ if (n > b->m_gi) {
+ int old_m = b->m_gi;
+ b->m_gi = n;
+ kroundup32(b->m_gi);
+ b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t));
+ memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t)); // new slots start with len 0 and data NULL
+ }
+ b->n_gi = n;
+ for (p = kstrtok(b->fmt, ":", &aux), n = 0; p; p = kstrtok(0, 0, &aux))
+ b->gi[n++].fmt = bcf_str2int(p, aux.p - p);
+ // set gi[i].len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2;
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
+ b->gi[i].len = 2;
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) {
+ b->gi[i].len = 1;
+ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
+ b->gi[i].len = 4;
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4;
+ }
+ b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len);
+ }
+ return 0;
+}
+
+/*
+ * Write one record: tid, pos, qual and l_str (4 bytes each), the l_str bytes
+ * of b->str, then for every FORMAT field its data block of
+ * gi[i].len * h->n_smpl bytes.
+ *
+ * Returns -1 when `b` is NULL, otherwise 12 + l_str plus the total size of
+ * the data blocks.
+ * NOTE(review): four 4-byte fixed fields are written while the returned
+ * count starts at 12 -- confirm whether callers rely on the exact value.
+ */
+int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b)
+{
+    int k, n_bytes;
+    if (!b) return -1;
+    bgzf_write(bp->fp, &b->tid, 4);
+    bgzf_write(bp->fp, &b->pos, 4);
+    bgzf_write(bp->fp, &b->qual, 4);
+    bgzf_write(bp->fp, &b->l_str, 4);
+    bgzf_write(bp->fp, b->str, b->l_str);
+    n_bytes = 12 + b->l_str;
+    k = 0;
+    while (k < b->n_gi) {
+        bgzf_write(bp->fp, b->gi[k].data, b->gi[k].len * h->n_smpl);
+        n_bytes += b->gi[k].len * h->n_smpl;
+        ++k;
+    }
+    return n_bytes;
+}
+
+/*
+ * Read one record from `bp` into `b`.
+ *
+ * The fixed fields (tid, pos, qual, l_str) and the string block are read
+ * first, bcf_sync() then derives the field layout, and finally one data
+ * block of gi[i].len * h->n_smpl bytes is read per FORMAT field.
+ *
+ * Returns 12 + l_str plus the total size of the data blocks on success,
+ * -1 when `b` is NULL or the stream is at end of file before the record
+ * starts, and -2 on any error: a truncated or failed read, a negative
+ * string length, an allocation failure, or a record bcf_sync() rejects.
+ */
+int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b)
+{
+    int i, l = 0;
+    long got;
+    if (b == 0) return -1;
+    got = bgzf_read(bp->fp, &b->tid, 4);
+    if (got == 0) return -1; // clean end of file
+    if (got != 4) return -2; // read error or a record cut short
+    b->n_smpl = h->n_smpl;
+    if (bgzf_read(bp->fp, &b->pos, 4) != 4) return -2;
+    if (bgzf_read(bp->fp, &b->qual, 4) != 4) return -2;
+    if (bgzf_read(bp->fp, &b->l_str, 4) != 4) return -2;
+    if (b->l_str < 0) return -2; // corrupt length
+    if (b->l_str > b->m_str) {
+        int32_t m = b->l_str;
+        char *grown;
+        kroundup32(m);
+        grown = realloc(b->str, m);
+        if (grown == 0) return -2; // b->str is still valid and still owned by b
+        b->str = grown;
+        b->m_str = m;
+    }
+    if (bgzf_read(bp->fp, b->str, b->l_str) != b->l_str) return -2;
+    l = 12 + b->l_str;
+    if (bcf_sync(b) < 0) return -2;
+    for (i = 0; i < b->n_gi; ++i) {
+        int sz = b->gi[i].len * h->n_smpl;
+        if (bgzf_read(bp->fp, b->gi[i].data, sz) != sz) return -2;
+        l += sz;
+    }
+    return l;
+}
+
+/*
+ * Free a record: its string block, the data buffer of every allocated gi
+ * slot (all m_gi of them), the gi array and the record itself.
+ * Returns -1 when `b` is NULL and 0 otherwise.
+ */
+int bcf_destroy(bcf1_t *b)
+{
+    int k;
+    if (!b) return -1;
+    k = b->m_gi;
+    while (k-- > 0)
+        free(b->gi[k].data);
+    free(b->gi);
+    free(b->str);
+    free(b);
+    return 0;
+}
+
+/* Append `p` to `s`; an empty string is written as a single '.'. */
+static inline void fmt_str(const char *p, kstring_t *s)
+{
+    if (*p != 0) {
+        kputs(p, s);
+        return;
+    }
+    kputc('.', s);
+}
+/*
+ * Format record `b` as one tab-separated text line in `s` (s->l is reset
+ * first, so previous content is overwritten).
+ *
+ * Columns written: the sequence name h->ns[b->tid] (the numeric tid when the
+ * header has no reference names), pos + 1, the first string of b->str, ref,
+ * alt, qual ("%.3g"), flt and info; empty strings are printed as '.'.  When
+ * fmt is not empty it is appended, and when the record has FORMAT fields
+ * one column per sample follows with the fields joined by ':'.
+ *
+ * Per-sample fields: PL, DP, DV, GQ, SP, GT and GL are decoded from
+ * gi[i].data; any other field is printed as '.'.  A sample's ploidy comes
+ * from b->ploidy[j], or is taken as 2 when b->ploidy is NULL.
+ *
+ * NOTE(review): iPL is computed below but not read again in this function.
+ */
+void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s)
+{
+ int i, j, x;
+ s->l = 0;
+ if (h->n_ref) kputs(h->ns[b->tid], s);
+ else kputw(b->tid, s);
+ kputc('\t', s);
+ kputw(b->pos + 1, s); kputc('\t', s);
+ fmt_str(b->str, s); kputc('\t', s);
+ fmt_str(b->ref, s); kputc('\t', s);
+ fmt_str(b->alt, s); kputc('\t', s);
+ ksprintf(s, "%.3g", b->qual); kputc('\t', s);
+ fmt_str(b->flt, s); kputc('\t', s);
+ fmt_str(b->info, s);
+ if (b->fmt[0]) {
+ kputc('\t', s);
+ fmt_str(b->fmt, s);
+ }
+ x = b->n_alleles * (b->n_alleles + 1) / 2; // number of PL/GL values per sample
+ if (b->n_gi == 0) return;
+ int iPL = -1;
+ if ( b->n_alleles > 2 ) {
+ for (i=0; i<b->n_gi; i++) {
+ if ( b->gi[i].fmt == bcf_str2int("PL", 2) ) {
+ iPL = i;
+ break;
+ }
+ }
+ }
+ for (j = 0; j < h->n_smpl; ++j) {
+ int ploidy = b->ploidy ? b->ploidy[j] : 2;
+ kputc('\t', s);
+ for (i = 0; i < b->n_gi; ++i) {
+ if (i) kputc(':', s);
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ uint8_t *d = (uint8_t*)b->gi[i].data + j * x;
+ int k;
+ if ( ploidy==1 )
+ for (k=0; k<b->n_alleles; k++)
+ {
+ if (k>0) kputc(',', s);
+ kputw(d[(k+1)*(k+2)/2-1], s); // one value per allele, taken at index (k+1)(k+2)/2-1
+ }
+ else
+ for (k = 0; k < x; ++k) {
+ if (k > 0) kputc(',', s);
+ kputw(d[k], s);
+ }
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
+ kputw(((uint16_t*)b->gi[i].data)[j], s);
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
+ kputw(((uint8_t*)b->gi[i].data)[j], s);
+ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
+ kputw(((int32_t*)b->gi[i].data)[j], s);
+ } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+ int y = ((uint8_t*)b->gi[i].data)[j]; // bit 7: missing, bit 6: '|' instead of '/', bits 3-5 and 0-2: the two alleles
+ if ( ploidy==1 )
+ {
+ if ( y>>7&1 )
+ kputc('.', s);
+ else
+ kputc('0' + (y>>3&7), s);
+ }
+ else
+ {
+ if ( y>>7&1 )
+ kputsn("./.", 3, s);
+ else {
+ kputc('0' + (y>>3&7), s);
+ kputc("/|"[y>>6&1], s);
+ kputc('0' + (y&7), s);
+ }
+ }
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ float *d = (float*)b->gi[i].data + j * x;
+ int k;
+ //printf("- %lx\n", d);
+ for (k = 0; k < x; ++k) {
+ if (k > 0) kputc(',', s);
+ ksprintf(s, "%.2f", d[k]);
+ }
+ } else kputc('.', s); // custom fields
+ }
+ }
+}
+
+/*
+ * Format record `b` as text by running bcf_fmt_core() on an empty kstring
+ * and returning that string's buffer.
+ * NOTE(review): the buffer is presumably heap-allocated and owned by the
+ * caller -- confirm against kstring.
+ */
+char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b)
+{
+    kstring_t line;
+    line.s = 0;
+    line.l = 0;
+    line.m = 0;
+    bcf_fmt_core(h, b, &line);
+    return line.s;
+}
+
+/*
+ * Append the `l` bytes at `info` to the INFO string of record `b`.
+ *
+ * The FORMAT string is moved `l` bytes towards the end of b->str, `info` is
+ * copied over the terminator of the old INFO string, and a new terminator is
+ * written after it.  b->fmt and b->l_str are updated; when the buffer had to
+ * move, bcf_sync() re-derives the remaining pointers.
+ *
+ * Returns 0 on success, or -1 when the buffer cannot be enlarged; the record
+ * is left untouched in that case.
+ */
+int bcf_append_info(bcf1_t *b, const char *info, int l)
+{
+    int shift = b->fmt - b->str;
+    int l_fmt = b->l_str - shift;
+    int moved = 0;
+    if (b->l_str + l > b->m_str) { // enlarge if necessary
+        int32_t m = b->l_str + l;
+        char *grown;
+        kroundup32(m);
+        grown = realloc(b->str, m);
+        if (grown == 0) return -1; // b->str is still valid and still owned by b
+        moved = (grown != b->str);
+        b->str = grown;
+        b->m_str = m;
+    }
+    memmove(b->str + shift + l, b->str + shift, l_fmt); // move the FORMAT field
+    memcpy(b->str + shift - 1, info, l); // append to the INFO field
+    b->str[shift + l - 1] = '\0';
+    b->fmt = b->str + shift + l;
+    b->l_str += l;
+    if (moved) bcf_sync(b); // ref/alt/flt/info still point into the old buffer
+    return 0;
+}
+
+/*
+ * Remove every `delim`-separated field of `str` that starts with `tag`.
+ *
+ * A match only counts at the start of the string or right after `delim`.
+ * The field is removed together with one adjacent delimiter, so the
+ * remaining fields stay properly separated, and `str` stays NUL-terminated
+ * after every removal.  When nothing is left, `str` becomes "."; an empty
+ * input is left alone.
+ *
+ * Returns the number of bytes by which `str` became shorter.
+ */
+int remove_tag(char *str, const char *tag, char delim)
+{
+    char *tmp = str, *p;
+    int len_diff = 0, ori_len = strlen(str);
+    while ( *tmp && (p = strstr(tmp,tag)) )
+    {
+        char *q;
+        if ( p>str )
+        {
+            if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring
+            p--; // drop the delimiter in front of the tag as well
+        }
+        q = p+1;
+        while ( *q && *q!=delim ) q++;
+        if ( p==str && *q ) q++; // the tag is first: drop the delimiter that follows it
+        len_diff += q-p;
+        if ( ! *q ) { *p = 0; break; } // the tag was last, no delim follows
+        // move the terminator too, so later searches never see stale bytes
+        memmove(p, q, strlen(q)+1);
+    }
+    if ( ori_len>0 && len_diff==ori_len )
+        str[0]='.', str[1]=0, len_diff--;
+
+    return len_diff;
+}
+
+
+/*
+ * Remove the INFO entries starting with `key` from a packed record string.
+ *
+ * s   - kstring holding the NUL-separated strings of a record; the string
+ *       after the fourth NUL is treated as INFO
+ * key - tag to remove, as passed to remove_tag() with ';' as the delimiter
+ *
+ * The bytes following INFO are moved down by the number of bytes removed
+ * and s->l is reduced accordingly.  Nothing is changed when fewer than four
+ * NULs are found inside the string or when INFO is empty.
+ */
+void rm_info(kstring_t *s, const char *key)
+{
+    char *p = s->s, *end = s->s + s->l;
+    int n = 0;
+    while ( n<4 && p<end ) // stay inside the string while skipping four fields
+    {
+        if ( !*p ) n++;
+        p++;
+    }
+    if ( n<4 || p>=end || !*p ) return; // no INFO field, or nothing in it to remove
+    char *q = p+1;
+    while ( *q && q-s->s<s->l ) q++; // q: terminator of the INFO string
+
+    int nrm = remove_tag(p, key, ';');
+    if ( nrm )
+        memmove(q-nrm, q, s->s+s->l-q+1);
+    s->l -= nrm;
+}
+
+/*
+ * Copy record `b` into record `r`, reusing the buffers `r` already owns.
+ *
+ * All members are copied by struct assignment except str, gi, m_str and
+ * m_gi, which keep referring to r's own storage (so the ploidy pointer is
+ * shared with `b`, not duplicated).  The string block is then copied,
+ * bcf_sync() rebuilds the derived members and resizes the per-field data
+ * buffers, and the per-sample data is copied.
+ *
+ * Returns 0 on success, or -1 when r's string buffer cannot be enlarged (r
+ * is untouched) or when bcf_sync() rejects the copied string (r->n_gi is
+ * reset to 0).
+ */
+int bcf_cpy(bcf1_t *r, const bcf1_t *b)
+{
+    char *t1;
+    bcf_ginfo_t *t2;
+    int i, t3, t4;
+    if (r->m_str < b->m_str) { // grow before touching anything else in r
+        char *grown = realloc(r->str, b->m_str);
+        if (grown == 0) return -1;
+        r->str = grown;
+        r->m_str = b->m_str;
+    }
+    t1 = r->str; t2 = r->gi; t3 = r->m_str; t4 = r->m_gi;
+    *r = *b;
+    r->str = t1; r->gi = t2; r->m_str = t3; r->m_gi = t4;
+    // only l_str bytes of b->str are meaningful, and b's buffer may be smaller than r's
+    if (b->l_str > 0) memcpy(r->str, b->str, b->l_str);
+    if (bcf_sync(r) < 0) { // calling bcf_sync() is simple but inefficient
+        r->n_gi = 0; // gi[] was not sized for this record
+        return -1;
+    }
+    for (i = 0; i < r->n_gi; ++i)
+        memcpy(r->gi[i].data, b->gi[i].data, r->n_smpl * r->gi[i].len);
+    return 0;
+}
+
+/*
+ * Return 1 when ref, or any of the comma-separated alleles in alt, is longer
+ * than one character, and 0 otherwise.
+ */
+int bcf_is_indel(const bcf1_t *b)
+{
+    const char *a;
+    if (b->ref[0] != 0 && b->ref[1] != 0) return 1; // ref has at least two characters
+    for (a = b->alt; *a != 0; ++a) {
+        if (*a == ',') continue;
+        if (a[1] == ',' || a[1] == '\0') continue; // this allele ends here
+        return 1; // two allele characters in a row
+    }
+    return 0;
+}
diff --git a/samtools-0.1.19/bcftools/bcf.h b/samtools-0.1.19/bcftools/bcf.h
new file mode 100644
index 0000000..3315809
--- /dev/null
+++ b/samtools-0.1.19/bcftools/bcf.h
@@ -0,0 +1,197 @@
+/* The MIT License
+
+ Copyright (c) 2010 Broad Institute
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at live.co.uk> */
+
+#ifndef BCF_H
+#define BCF_H
+
+#define BCF_VERSION "0.1.19-96b5f2294a"
+
+#include <stdint.h>
+#include <zlib.h>
+
+#ifndef BCF_LITE
+#include "bgzf.h"
+typedef BGZF *bcfFile;
+#else
+typedef gzFile bcfFile;
+#define bgzf_open(fn, mode) gzopen(fn, mode)
+#define bgzf_fdopen(fd, mode) gzdopen(fd, mode)
+#define bgzf_close(fp) gzclose(fp)
+#define bgzf_read(fp, buf, len) gzread(fp, buf, len)
+#define bgzf_write(fp, buf, len)
+#define bgzf_flush(fp)
+#endif
+
+/*
+ A member in the structs below is said to be "primary" if its content
+ cannot be inferred from other members in any of the structs below; a
+ member is said to be "derived" if its content can be derived from
+ other members. For example, bcf1_t::str is primary as this comes from
+ the input data, while bcf1_t::info is derived as it can always be
+ correctly set if we know bcf1_t::str. Derived members are for quick
+ access to the content and must be synchronized with the primary data.
+ */
+/* One FORMAT field of a record: its key and the values of all individuals. */
+typedef struct {
+ uint32_t fmt; // format of the block, set by bcf_str2int().
+ int len; // length of data for each individual
+ void *data; // concatenated data
+ // derived info: fmt, len (<-bcf1_t::fmt)
+} bcf_ginfo_t;
+/* One record: fixed fields, the packed string block and the per-sample geno fields. */
+typedef struct {
+ int32_t tid, pos; // refID and 0-based position
+ int32_t l_str, m_str; // length and the allocated size of ->str
+ float qual; // SNP quality
+ char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7)
+ char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation
+ int n_gi, m_gi; // number and the allocated size of geno fields
+ bcf_ginfo_t *gi; // array of geno fields
+ int n_alleles, n_smpl; // number of alleles and samples
+ // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
+ uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed.
+} bcf1_t;
+/* File header: sequence names, sample names and the header text. */
+typedef struct {
+ int32_t n_ref, n_smpl; // number of reference sequences and samples
+ int32_t l_nm; // length of concatenated sequence names; 0 padded
+ int32_t l_smpl; // length of concatenated sample names; 0 padded
+ int32_t l_txt; // length of header text (lines started with ##)
+ char *name, *sname, *txt; // concatenated sequence names, sample names and header text
+ char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively
+ // derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname)
+} bcf_hdr_t;
+/* Handle of an open BCF or VCF file. */
+typedef struct {
+ int is_vcf; // if the file in operation is a VCF
+ void *v; // auxiliary data structure for VCF
+ bcfFile fp; // file handler for BCF
+} bcf_t;
+
+struct __bcf_idx_t;
+typedef struct __bcf_idx_t bcf_idx_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // open a BCF file; for BCF file only
+ bcf_t *bcf_open(const char *fn, const char *mode);
+ // close file
+ int bcf_close(bcf_t *b);
+ // read one record from BCF; return -1 on end-of-file, and <-1 for errors
+ int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
+ // call this function if b->str is changed
+ int bcf_sync(bcf1_t *b);
+ // write a BCF record
+ int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
+ // read the BCF header; BCF only
+ bcf_hdr_t *bcf_hdr_read(bcf_t *b);
+ // write the BCF header
+ int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
+ // set bcf_hdr_t::ns and bcf_hdr_t::sns
+ int bcf_hdr_sync(bcf_hdr_t *b);
+ // destroy the header
+ void bcf_hdr_destroy(bcf_hdr_t *h);
+ // destroy a record
+ int bcf_destroy(bcf1_t *b);
+ // BCF->VCF conversion
+ char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
+ // append more info
+ int bcf_append_info(bcf1_t *b, const char *info, int l);
+ // remove tag
+ int remove_tag(char *string, const char *tag, char delim);
+ // remove info tag, string is the kstring holder of bcf1_t.str
+ void rm_info(kstring_t *string, const char *key);
+ // copy
+ int bcf_cpy(bcf1_t *r, const bcf1_t *b);
+
+ // open a VCF or BCF file if "b" is set in "mode"
+ bcf_t *vcf_open(const char *fn, const char *mode);
+ // close a VCF/BCF file
+ int vcf_close(bcf_t *bp);
+ // read the VCF/BCF header
+ bcf_hdr_t *vcf_hdr_read(bcf_t *bp);
+ // read the sequence dictionary from a separate file; required for VCF->BCF conversion
+ int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn);
+ // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors
+ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
+ // write the VCF header
+ int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h);
+ // write a VCF record
+ int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
+
+ // keep the first n alleles and discard the rest
+ int bcf_shrink_alt(bcf1_t *b, int n);
+ // keep the masked alleles and discard the rest
+ void bcf_fit_alt(bcf1_t *b, int mask);
+ // convert GL to PL
+ int bcf_gl2pl(bcf1_t *b);
+ // if the site is an indel
+ int bcf_is_indel(const bcf1_t *b);
+ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list);
+ int bcf_subsam(int n_smpl, int *list, bcf1_t *b);
+ // move GT to the first FORMAT field
+ int bcf_fix_gt(bcf1_t *b);
+ // update PL generated by old samtools
+ int bcf_fix_pl(bcf1_t *b);
+ // convert PL to GLF-like 10-likelihood GL
+ int bcf_gl10(const bcf1_t *b, uint8_t *gl);
+ // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL
+ int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl);
+
+ // string hash table
+ void *bcf_build_refhash(bcf_hdr_t *h);
+ void bcf_str2id_destroy(void *_hash);
+ void bcf_str2id_thorough_destroy(void *_hash);
+ int bcf_str2id_add(void *_hash, const char *str);
+ int bcf_str2id(void *_hash, const char *str);
+ void *bcf_str2id_init();
+
+ // indexing related functions
+ int bcf_idx_build(const char *fn);
+ uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg);
+ int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
+ bcf_idx_t *bcf_idx_load(const char *fn);
+ void bcf_idx_destroy(bcf_idx_t *idx);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ * Pack the leading characters of a FORMAT key into an integer.
+ *
+ * At most min(l, 4) characters are used, stopping early at a NUL; each one
+ * shifts the result left by 8 bits and fills the low byte, so "PL" becomes
+ * 0x504C.  Characters are taken as unsigned bytes, so a byte with the high
+ * bit set cannot sign-extend over the characters packed before it.
+ */
+static inline uint32_t bcf_str2int(const char *str, int l)
+{
+    int i;
+    uint32_t x = 0;
+    for (i = 0; i < l && i < 4; ++i) {
+        if (str[i] == 0) return x;
+        x = x<<8 | (uint8_t)str[i];
+    }
+    return x;
+}
+
+#endif
diff --git a/samtools-0.1.19/bcftools/bcf.tex b/samtools-0.1.19/bcftools/bcf.tex
new file mode 100644
index 0000000..442fc2a
--- /dev/null
+++ b/samtools-0.1.19/bcftools/bcf.tex
@@ -0,0 +1,77 @@
+\documentclass[10pt,pdftex]{article}
+\usepackage{color}
+\definecolor{gray}{rgb}{0.7,0.7,0.7}
+
+\setlength{\topmargin}{0.0cm}
+\setlength{\textheight}{21.5cm}
+\setlength{\oddsidemargin}{0cm}
+\setlength{\textwidth}{16.5cm}
+\setlength{\columnsep}{0.6cm}
+
+\begin{document}
+
+\begin{center}
+\begin{tabular}{|l|l|l|l|l|}
+\hline
+\multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline
+\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline
+\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline
+\multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline
+\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline
+\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline
+\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline
+\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline
+\multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5}
+& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5}
+& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5}
+& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5}
+& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5}
+& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5}
+& \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\
+\hline
+\end{tabular}
+\end{center}
+
+\begin{center}
+\begin{tabular}{clp{9cm}}
+\hline
+\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline
+{\tt DP} & {\tt uint16\_t[n]} & Read depth \\
+{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\
+{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\
+{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set,
+ the allele is not present (e.g. due to different ploidy between samples).} \\
+{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\
+{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\
+{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\
+{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\
+{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\
+{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\
+{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\
+%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\
+\emph{Integer} & {\tt int32\_t[n*X]} & {Fix-sized custom Integer; $X$ defined in the header}\\
+\emph{Numeric} & {\tt double[n*X]} & {Fix-sized custom Numeric}\\
+\emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (the int equals the length) \\
+\hline
+\end{tabular}
+\end{center}
+
+\begin{itemize}
+\item A BCF file is in the {\tt BGZF} format.
+\item All multi-byte numbers are little-endian.
+\item In a string, a missing value `.' is an empty C string ``{\tt
+ \char92 0}'' (not ``{\tt .\char92 0}'')
+\item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the
+ order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt
+ REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt
+ CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original
+ BCF proposal).
+\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields
+ are required to be explicitly defined in the headers.
+\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only.
+ It gives an alternative binary representation of the corresponding VCF field, in case
+ the default representation is unable to keep the genotype information,
+ for example, when the ploidy is not 2 or there are more than 8 alleles.
+\end{itemize}
+
+\end{document}
diff --git a/samtools-0.1.19/bcftools/bcf2qcall.c b/samtools-0.1.19/bcftools/bcf2qcall.c
new file mode 100644
index 0000000..a86bac2
--- /dev/null
+++ b/samtools-0.1.19/bcftools/bcf2qcall.c
@@ -0,0 +1,91 @@
+#include <errno.h>
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include "bcf.h"
+
+static int8_t nt4_table[256] = { // character code -> base code: A/a=0, C/c=1, G/g=2, T/t=3, X/x=-1, every other byte=4
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+/* Parse the 16 integers that follow "I16=" in b->info into anno[].
+ * Returns 0 on success, -1 when the tag is absent, and -2 when one of the
+ * 16 values is missing or strtol() reports EINVAL/ERANGE for a zero result. */
+static int read_I16(bcf1_t *b, int anno[16])
+{
+    char *p, *end;
+    int i;
+    if ((p = strstr(b->info, "I16=")) == 0) return -1;
+    p += 4;
+    for (i = 0; i < 16; ++i) {
+        errno = 0; // strtol() never clears errno, so a stale value would be misread as a failure
+        anno[i] = strtol(p, &end, 10);
+        if (end == p) return -2; // nothing was converted: fewer than 16 values present
+        if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2;
+        p = end;
+        if (*p) ++p; // skip the one-character separator, but never step past the terminating NUL
+    }
+    return 0;
+}
+
+/* Print one QCALL-style line per sample for record b: chromosome, 1-based
+ * position, REF base, a per-sample share of the total I16 depth, the RMS-style
+ * mapping quality derived from I16, a constant 0, the ten genotype values in the
+ * fixed order AA,AC,AG,AT,CC,CG,CT,GG,GT,TT and the sample name.
+ * Returns 0 on success and -1 when the site is skipped (REF/ALT are not single
+ * A/C/G/T/X bases, more than 4 alleles, no PL, no usable I16, zero depth, no ALT). */
+int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b)
+{
+    int a[4], k, g[10], l, map[4], k1, j, i, i0, anno[16], dp, mq, d_rest;
+    char *s;
+    if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base
+    for (i = 0; i < b->n_gi; ++i)
+        if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+    if (i == b->n_gi) return -1; // no PL
+    if (read_I16(b, anno) != 0) return -1; // no I16; FIXME: can be improved
+    d_rest = dp = anno[0] + anno[1] + anno[2] + anno[3];
+    if (dp == 0) return -1; // depth is zero
+    mq = (int)(sqrt((double)(anno[9] + anno[11]) / dp) + .499);
+    i0 = i;
+    a[0] = nt4_table[(uint8_t)b->ref[0]]; // index as unsigned: plain char may be negative
+    if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T ('X' maps to -1 and must not index map[])
+    a[1] = a[2] = a[3] = -2; // -1 has a special meaning
+    if (b->alt[0] == 0) return -1; // no alternate allele
+    map[0] = map[1] = map[2] = map[3] = -2;
+    map[a[0]] = 0;
+    for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
+        if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
+        a[k+1] = nt4_table[(uint8_t)*s];
+        if (a[k+1] > 3) return -1; // ALT is not A/C/G/T/X; code 4 would write past map[3]
+        if (a[k+1] >= 0) map[a[k+1]] = k+1;
+        else k1 = k+1; // 'X': the catch-all allele used for every base not listed
+        if (s[1] == 0) break;
+    }
+    for (k = 0; k < 4; ++k)
+        if (map[k] < 0) map[k] = k1;
+    for (i = 0; i < h->n_smpl; ++i) {
+        int d;
+        uint8_t *p = (uint8_t*)b->gi[i0].data + i * b->gi[i0].len;
+        for (j = 0; j < b->gi[i0].len; ++j)
+            if (p[j]) break;
+        // spread the remaining depth evenly over the remaining samples
+        d = (int)((double)d_rest / (h->n_smpl - i) + .499);
+        if (d == 0) d = 1;
+        if (j == b->gi[i0].len) d = 0; // all-zero PL: sample treated as uncovered
+        d_rest -= d;
+        for (k = j = 0; k < 4; ++k) {
+            for (l = k; l < 4; ++l) {
+                int t, x = map[k], y = map[l];
+                if (x > y) t = x, x = y, y = t; // swap
+                // a base with no allele and no 'X' fallback has no PL entry; emit 255
+                // (the value bcf_gl10_indel uses for missing genotypes) instead of reading p[-1]
+                g[j++] = x < 0? 255 : p[y * (y+1) / 2 + x];
+            }
+        }
+        printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref);
+        printf("\t%d\t%d\t0", d, mq);
+        for (j = 0; j < 10; ++j)
+            printf("\t%d", g[j]);
+        printf("\t%s\n", h->sns[i]);
+    }
+    return 0;
+}
diff --git a/samtools-0.1.19/bcftools/bcfutils.c b/samtools-0.1.19/bcftools/bcfutils.c
new file mode 100644
index 0000000..7638085
--- /dev/null
+++ b/samtools-0.1.19/bcftools/bcfutils.c
@@ -0,0 +1,504 @@
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include "bcf.h"
+#include "kstring.h"
+#include "khash.h"
+KHASH_MAP_INIT_STR(str2id, int)
+
+#ifdef _WIN32
+#define srand48(x) srand(x)
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
+// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
+/* Build a str2id hash table that maps each of the h->n_ref names in h->ns[]
+ * to its array index, and return it as an opaque pointer. */
+void *bcf_build_refhash(bcf_hdr_t *h)
+{
+    khash_t(str2id) *tbl = kh_init(str2id);
+    int idx = 0, absent;
+    while (idx < h->n_ref) {
+        khint_t slot = kh_put(str2id, tbl, h->ns[idx], &absent); // FIXME: check absent
+        kh_val(tbl, slot) = idx;
+        ++idx;
+    }
+    return tbl;
+}
+
+/* Create an empty str2id hash table and hand it back as an opaque pointer. */
+void *bcf_str2id_init()
+{
+    khash_t(str2id) *fresh = kh_init(str2id);
+    return fresh;
+}
+
+/* Release the hash table itself; a null handle is accepted and ignored. */
+void bcf_str2id_destroy(void *_hash)
+{
+    khash_t(str2id) *tbl = (khash_t(str2id)*)_hash;
+    if (tbl == 0) return;
+    kh_destroy(str2id, tbl); // the key strings are not freed here
+}
+
+/* Free every key string stored in the table, then the table itself.
+ * A null handle is accepted and ignored. */
+void bcf_str2id_thorough_destroy(void *_hash)
+{
+    khash_t(str2id) *tbl = (khash_t(str2id)*)_hash;
+    khint_t slot;
+    if (!tbl) return;
+    for (slot = 0; slot < kh_end(tbl); ++slot) {
+        if (!kh_exist(tbl, slot)) continue; // empty or deleted bucket
+        free((char*)kh_key(tbl, slot));
+    }
+    kh_destroy(str2id, tbl);
+}
+
+/* Look str up in the table; returns the stored id, or -1 when the handle is
+ * null or the key is not present. */
+int bcf_str2id(void *_hash, const char *str)
+{
+    khash_t(str2id) *tbl = (khash_t(str2id)*)_hash;
+    khint_t slot;
+    if (tbl == 0) return -1;
+    slot = kh_get(str2id, tbl, str);
+    if (slot == kh_end(tbl)) return -1;
+    return kh_val(tbl, slot);
+}
+
+/* Insert str if it is not yet in the table and return its id. A key that is
+ * already present keeps its id; a new key receives (table size - 1).
+ * Returns -1 when the handle is null. */
+int bcf_str2id_add(void *_hash, const char *str)
+{
+    khash_t(str2id) *tbl = (khash_t(str2id)*)_hash;
+    khint_t slot;
+    int absent;
+    if (tbl == 0) return -1;
+    slot = kh_put(str2id, tbl, str, &absent);
+    if (absent != 0)
+        kh_val(tbl, slot) = kh_size(tbl) - 1; // size already counts the key just added
+    return kh_val(tbl, slot);
+}
+
+/* Remove from record b every allele whose bit is clear in mask (bit i stands
+ * for allele i; bit 0, the REF allele, is always kept). ALT is rewritten in
+ * place, the per-sample PL arrays are compacted to the surviving genotype
+ * pairs, GT codes are renumbered, and bcf_sync() is called.
+ * Nothing happens when every existing allele is kept. */
+void bcf_fit_alt(bcf1_t *b, int mask)
+{
+    mask |= 1; // REF must be always present
+
+    int i,j,nals=0;
+    // count the alleles that survive; iterate over the alleles of the record
+    // (the previous bound, sizeof(int), only ever looked at mask bits 0..3)
+    for (i=0; i<b->n_alleles; i++)
+        if ( mask&1<<i ) nals++;
+    if ( b->n_alleles <= nals ) return;
+
+    // update ALT, in principle any of the alleles can be removed
+    char *p;
+    if ( nals>1 )
+    {
+        char *dst, *src;
+        int n=0, nalts=nals-1;
+        for (src=dst=p=b->alt, i=1; *p; p++)
+        {
+            if ( *p!=',' ) continue;
+
+            if ( mask&1<<i )
+            {
+                n++;
+                if ( src!=dst )
+                {
+                    memmove(dst,src,p-src);
+                    dst += p-src;
+                }
+                else dst = p;
+                if ( n<nalts ) { *dst=','; dst++; }
+            }
+            i++;
+
+            if ( n>=nalts ) { *dst=0; break; }
+            src = p+1;
+        }
+        if ( n<nalts )
+        {
+            memmove(dst,src,p-src);
+            dst += p-src;
+            *dst = 0;
+        }
+        p = dst;
+    }
+    else p = b->alt, *p = '\0';
+    p++;
+    // close the gap: move b->flt and everything after it to just behind the new ALT
+    memmove(p, b->flt, b->str + b->l_str - b->flt);
+    b->l_str -= b->flt - p;
+
+    // locate GT (PL is handled in the loop over all fields below)
+    int igt=-1;
+    for (i = 0; i < b->n_gi; ++i)
+    {
+        bcf_ginfo_t *g = b->gi + i;
+        if (g->fmt == bcf_str2int("GT", 2)) igt = i;
+    }
+
+    // .. create mapping between old and new indexes
+    int npl = nals * (nals+1) / 2;
+    int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
+    int kori=0,knew=0;
+    for (i=0; i<b->n_alleles; i++)
+    {
+        for (j=0; j<=i; j++)
+        {
+            int skip=0;
+            if ( i && !(mask&1<<i) ) skip=1;
+            if ( j && !(mask&1<<j) ) skip=1;
+            if ( !skip ) { map[knew++] = kori; }
+            kori++;
+        }
+    }
+    // .. apply to all samples
+    int n_smpl = b->n_smpl;
+    for (i = 0; i < b->n_gi; ++i)
+    {
+        bcf_ginfo_t *g = b->gi + i;
+        if (g->fmt == bcf_str2int("PL", 2))
+        {
+            g->len = npl;
+            uint8_t *d = (uint8_t*)g->data;
+            int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
+            for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
+            {
+                uint8_t *dl = d + ismpl * npl_ori;
+                for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
+            }
+        } // FIXME: to add GL
+    }
+    // update GTs: map[] is reused as old allele index -> new allele index (-1 = removed)
+    map[0] = 0;
+    for (i=1, knew=0; i<b->n_alleles; i++)
+        map[i] = mask&1<<i ? ++knew : -1;
+    if ( igt>=0 ) // a record without GT has nothing to renumber
+    {
+        for (i=0; i<n_smpl; i++)
+        {
+            uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
+            int a1 = (gt>>3)&7;
+            int a2 = gt&7;
+            assert( map[a1]>=0 && map[a2]>=0 );
+            // keep the two top flag bits of the old code, replace both 3-bit allele indexes
+            ((uint8_t*)b->gi[igt].data)[i] = (gt & (1<<7|1<<6)) | map[a1]<<3 | map[a2];
+        }
+    }
+    free(map);
+    b->n_alleles = nals;
+    bcf_sync(b);
+}
+
+int bcf_shrink_alt(bcf1_t *b, int n) /* Keep only the first n alleles of record b (REF counts as one):
+ * ALT is cut after n-1 entries, every sample's PL array is cut to its first n*(n+1)/2 values,
+ * and bcf_sync() is called. Returns -1, changing nothing, when b has n or fewer alleles; 0 otherwise. */
+{
+ char *p;
+ int i, j, k, n_smpl = b->n_smpl;
+ if (b->n_alleles <= n) return -1;
+ // update ALT
+ if (n > 1) {
+ for (p = b->alt, k = 1; *p; ++p)
+ if (*p == ',' && ++k == n) break; // stop on the comma that follows the last ALT allele to keep
+ *p = '\0';
+ } else p = b->alt, *p = '\0'; // n <= 1: ALT becomes the empty string
+ ++p;
+ memmove(p, b->flt, b->str + b->l_str - b->flt); // pull b->flt and everything after it up behind the shortened ALT
+ b->l_str -= b->flt - p; // the string block shrank by the size of the removed gap
+ // update PL
+ for (i = 0; i < b->n_gi; ++i) {
+ bcf_ginfo_t *g = b->gi + i;
+ if (g->fmt == bcf_str2int("PL", 2)) {
+ int l, x = b->n_alleles * (b->n_alleles + 1) / 2; // x: PL values per sample before shrinking
+ uint8_t *d = (uint8_t*)g->data;
+ g->len = n * (n + 1) / 2;
+ for (l = k = 0; l < n_smpl; ++l) {
+ uint8_t *dl = d + l * x;
+ for (j = 0; j < g->len; ++j) d[k++] = dl[j]; // compact in place; the write index never overtakes the read index
+ }
+ } // FIXME: to add GL
+ }
+ b->n_alleles = n;
+ bcf_sync(b);
+ return 0;
+}
+
+/* Convert the GL field of record b into PL in place: the "GL" tag in b->fmt
+ * becomes "PL" and every float value v is replaced by the byte
+ * (int)(-10*v + .499) clamped to [0, 255].
+ * Returns 0 on success and -1, leaving b untouched, when PL is already
+ * present or GL is missing from b->fmt or from b->gi. */
+int bcf_gl2pl(bcf1_t *b)
+{
+    char *p;
+    int i, n_smpl = b->n_smpl;
+    bcf_ginfo_t *g;
+    float *d0;
+    uint8_t *d1;
+    if (strstr(b->fmt, "PL")) return -1;
+    if ((p = strstr(b->fmt, "GL")) == 0) return -1;
+    for (i = 0; i < b->n_gi; ++i)
+        if (b->gi[i].fmt == bcf_str2int("GL", 2))
+            break;
+    if (i == b->n_gi) return -1; // GL is named in fmt but has no data block: do not touch gi[n_gi]
+    *p = 'P'; // only rename once the data block is known to exist
+    g = b->gi + i;
+    g->fmt = bcf_str2int("PL", 2);
+    g->len /= 4; // 4 == sizeof(float)
+    // both views alias the same buffer; byte i is written only after float i has been read
+    d0 = (float*)g->data; d1 = (uint8_t*)g->data;
+    for (i = 0; i < n_smpl * g->len; ++i) {
+        int x = (int)(-10. * d0[i] + .499);
+        if (x > 255) x = 255;
+        if (x < 0) x = 0;
+        d1[i] = x;
+    }
+    return 0;
+}
+/* FIXME: this function will fail given AB:GTX:GT. BCFtools never
+ * produces such FMT, but others may do. */
+int bcf_fix_gt(bcf1_t *b) /* Make GT the first per-sample field of record b, both in the b->gi array
+ * and in the b->fmt string. Always returns 0; the record is left unchanged when ":GT" does not
+ * occur in b->fmt or no GT entry exists in b->gi. */
+{
+ char *s;
+ int i;
+ uint32_t tmp;
+ bcf_ginfo_t gt;
+ // check the presence of the GT FMT
+ if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
+ assert(s[3] == '\0' || s[3] == ':'); // aborts on a longer tag that merely starts with GT (e.g. ":GTX")
+ tmp = bcf_str2int("GT", 2);
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == tmp) break;
+ if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
+ gt = b->gi[i];
+ // move GT to the first
+ for (; i > 0; --i) b->gi[i] = b->gi[i-1]; // shift the entries that preceded GT up by one slot
+ b->gi[0] = gt;
+ if ( s[3]==0 )
+ memmove(b->fmt + 3, b->fmt, s - b->fmt); // GT was last: shift the fields before ":GT" right by 3 bytes
+ else
+ memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // GT was in the middle: also carry the ':' that preceded it
+ b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
+ return 0;
+}
+
+/* Reorder every sample's PL values in place. The x-th input value, counting
+ * allele pairs (k,l) with k<=l in row-major order, is stored at offset
+ * l*(l+1)/2 + k. Returns 0 always; a record without PL is left unchanged. */
+int bcf_fix_pl(bcf1_t *b)
+{
+    const uint32_t pl_tag = bcf_str2int("PL", 2);
+    bcf_ginfo_t *gi = 0;
+    uint8_t *scratch;
+    int f, smp;
+    for (f = 0; f < b->n_gi; ++f) {
+        if (b->gi[f].fmt == pl_tag) { gi = b->gi + f; break; }
+    }
+    if (gi == 0) return 0;
+    scratch = alloca(gi->len); // one sample's worth of values, on the stack
+    for (smp = 0; smp < b->n_smpl; ++smp) {
+        uint8_t *row = (uint8_t*)gi->data + smp * gi->len;
+        int lo, hi, src = 0;
+        memcpy(scratch, row, gi->len);
+        for (lo = 0; lo < b->n_alleles; ++lo)
+            for (hi = lo; hi < b->n_alleles; ++hi)
+                row[hi*(hi+1)/2 + lo] = scratch[src++];
+    }
+    return 0;
+}
+
+/* Count the samples whose PL array contains at least one non-zero value.
+ * Returns 0 when the record has no PL field. */
+int bcf_smpl_covered(const bcf1_t *b)
+{
+    const uint32_t pl_tag = bcf_str2int("PL", 2);
+    const bcf_ginfo_t *gi = 0;
+    int f, smp, covered = 0;
+    for (f = 0; f < b->n_gi; ++f) {
+        if (b->gi[f].fmt == pl_tag) { gi = b->gi + f; break; }
+    }
+    if (gi == 0) return 0;
+    for (smp = 0; smp < b->n_smpl; ++smp) {
+        const uint8_t *row = (const uint8_t*)gi->data + smp * gi->len;
+        int v, nonzero = 0;
+        for (v = 0; v < gi->len && !nonzero; ++v)
+            nonzero = (row[v] != 0);
+        covered += nonzero;
+    }
+    return covered;
+}
+
+/* Return the data pointer of the per-sample field whose tag equals
+ * bcf_str2int(fmt, l), or a null pointer when b has no such field. */
+static void *locate_field(const bcf1_t *b, const char *fmt, int l)
+{
+    const uint32_t tag = bcf_str2int(fmt, l);
+    int f;
+    for (f = 0; f < b->n_gi; ++f)
+        if (b->gi[f].fmt == tag) return b->gi[f].data;
+    return 0;
+}
+
+/* Append "MXSP=<n>;MXGQ=<n>" to the INFO string of b.
+ * MXGQ is the largest GQ and MXSP the largest SP over samples whose GT byte
+ * has any of its low six bits set; MXSP is then reduced by
+ * (int)(4.343*log(n_het) + .499), where n_het counts samples with exactly one
+ * zero allele index (and GQ >= 20 when GQ exists), and floored at 0.
+ * Returns 0, or -1 when the record has no GT field. */
+int bcf_anno_max(bcf1_t *b)
+{
+    uint8_t *gt, *gq;
+    int32_t *sp;
+    kstring_t out;
+    int smp, best_gq = 0, best_sp = 0, hets = 0;
+    gt = locate_field(b, "GT", 2);
+    if (gt == 0) return -1;
+    gq = locate_field(b, "GQ", 2);
+    sp = locate_field(b, "SP", 2);
+    for (smp = 0; smp < b->n_smpl; ++smp) {
+        int lo = gt[smp]&7, hi = gt[smp]>>3&7;
+        if (gt[smp]&0x3f) {
+            if (sp && (int)sp[smp] > best_sp) best_sp = sp[smp];
+            if (gq && (int)gq[smp] > best_gq) best_gq = gq[smp];
+        }
+        if ((lo == 0) != (hi == 0)) { // exactly one of the two allele indexes is 0: a het
+            if (gq == 0 || gq[smp] >= 20) ++hets;
+        }
+    }
+    if (hets) best_sp -= (int)(4.343 * log(hets) + .499);
+    if (best_sp < 0) best_sp = 0;
+    memset(&out, 0, sizeof(kstring_t));
+    if (*b->info) kputc(';', &out); // separate from whatever INFO already holds
+    ksprintf(&out, "MXSP=%d;MXGQ=%d", best_sp, best_gq);
+    bcf_append_info(b, out.s, out.l);
+    free(out.s);
+    return 0;
+}
+
+// FIXME: only data are shuffled; the header is NOT
+/* Apply one random permutation of the samples to every per-sample field of b.
+ * The generator is reseeded with srand48(seed) only when seed > 0.
+ * Each field's data block is replaced by a newly allocated, permuted copy.
+ * Returns 0 always. */
+int bcf_shuffle(bcf1_t *b, int seed)
+{
+    int *perm, smp, f;
+    if (seed > 0) srand48(seed);
+    perm = malloc(b->n_smpl * sizeof(int));
+    for (smp = 0; smp < b->n_smpl; ++smp) perm[smp] = smp;
+    // shuffle from the back: swap the last unsettled slot with a random earlier-or-equal one
+    for (smp = b->n_smpl; smp > 1; --smp) {
+        int pick = (int)(drand48() * smp);
+        int held = perm[pick];
+        perm[pick] = perm[smp-1];
+        perm[smp-1] = held;
+    }
+    for (f = 0; f < b->n_gi; ++f) {
+        bcf_ginfo_t *gi = &b->gi[f];
+        uint8_t *src = (uint8_t*)gi->data;
+        uint8_t *dst = malloc(gi->len * b->n_smpl);
+        for (smp = 0; smp < b->n_smpl; ++smp)
+            memcpy(dst + gi->len * perm[smp], src + gi->len * smp, gi->len);
+        free(gi->data);
+        gi->data = dst;
+    }
+    free(perm);
+    return 0;
+}
+
+/* Create a copy of header h0 restricted to the n names in samples[].
+ * list[i] receives the index in h0 of the i-th requested sample. The process
+ * exits with status 1 if any requested name is not among h0's samples.
+ * The returned header owns fresh copies of the name and text blocks and the
+ * new NUL-separated sample-name block; bcf_hdr_sync() is called on it. */
+bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
+{
+    khash_t(str2id) *name2idx;
+    kstring_t names;
+    bcf_hdr_t *h;
+    khint_t slot;
+    int i, absent, n_found = 0;
+    memset(&names, 0, sizeof(kstring_t));
+    name2idx = kh_init(str2id);
+    for (i = 0; i < h0->n_smpl; ++i) {
+        slot = kh_put(str2id, name2idx, h0->sns[i], &absent);
+        kh_val(name2idx, slot) = i;
+    }
+    for (i = 0; i < n; ++i) {
+        slot = kh_get(str2id, name2idx, samples[i]);
+        if (slot == kh_end(name2idx)) continue; // unknown name: counted as missing below
+        list[n_found++] = kh_val(name2idx, slot);
+        kputs(samples[i], &names);
+        kputc('\0', &names);
+    }
+    if (n_found < n)
+    {
+        fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - n_found);
+        exit(1);
+    }
+    kh_destroy(str2id, name2idx);
+    h = calloc(1, sizeof(bcf_hdr_t));
+    *h = *h0;
+    h->ns = 0; h->sns = 0; // do not share the pointer arrays of h0
+    h->name = malloc(h->l_nm);
+    memcpy(h->name, h0->name, h->l_nm);
+    h->txt = calloc(1, h->l_txt + 1);
+    memcpy(h->txt, h0->txt, h->l_txt);
+    h->l_smpl = names.l;
+    h->sname = names.s;
+    bcf_hdr_sync(h);
+    return h;
+}
+
+/* Restrict every per-sample field of b to the n_smpl samples whose old
+ * indexes are given in list[], in that order, and set b->n_smpl to n_smpl.
+ * Each field's data block is replaced by a newly allocated one. Returns 0. */
+int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
+{
+    int f;
+    for (f = 0; f < b->n_gi; ++f) {
+        bcf_ginfo_t *gi = &b->gi[f];
+        uint8_t *picked = malloc(gi->len * b->n_smpl); // sized for the old sample count
+        int smp;
+        for (smp = 0; smp < n_smpl; ++smp)
+            memcpy(picked + smp * gi->len, (uint8_t*)gi->data + list[smp] * gi->len, gi->len);
+        free(gi->data);
+        gi->data = picked;
+    }
+    b->n_smpl = n_smpl;
+    return 0;
+}
+
+static int8_t nt4_table[128] = { // 7-bit character code -> base code: A/a=0, C/c=1, G/g=2, T/t=3, X/x=-1, anything else=4; only indexes 0..127 exist
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
+};
+
+/* Fill gl[] with ten genotype values per sample, taken from PL and arranged
+ * in the fixed order AA,AC,AG,AT,CC,CG,CT,GG,GT,TT (gl must hold
+ * 10 * b->n_smpl bytes). A base that is neither REF nor a listed ALT uses the
+ * 'X' allele when ALT contains one; otherwise its genotypes are set to 255.
+ * Returns 0 on success and -1 when the record cannot be converted: REF or an
+ * ALT is not a single A/C/G/T (ALT may also be X), more than 4 alleles,
+ * no ALT, or no PL field. */
+int bcf_gl10(const bcf1_t *b, uint8_t *gl)
+{
+    int a[4], k, l, map[4], k1, j, i;
+    const bcf_ginfo_t *PL;
+    char *s;
+    unsigned char c;
+    if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
+    for (i = 0; i < b->n_gi; ++i)
+        if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+    if (i == b->n_gi) return -1; // no PL
+    PL = b->gi + i;
+    c = (unsigned char)b->ref[0];
+    if (c >= 128) return -1; // outside the 128-entry table (plain char may even be negative)
+    a[0] = nt4_table[c];
+    if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
+    a[1] = a[2] = a[3] = -2; // -1 has a special meaning
+    if (b->alt[0] == 0) return -1; // no alternate allele
+    map[0] = map[1] = map[2] = map[3] = -2;
+    map[a[0]] = 0;
+    for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
+        if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
+        c = (unsigned char)*s;
+        a[k+1] = c < 128? nt4_table[c] : 4;
+        if (a[k+1] > 3) return -1; // ALT is not A/C/G/T/X; code 4 would write past map[3]
+        if (a[k+1] >= 0) map[a[k+1]] = k+1;
+        else k1 = k + 1; // 'X': the catch-all allele
+        if (s[1] == 0) break; // the end of the ALT string
+    }
+    for (k = 0; k < 4; ++k)
+        if (map[k] < 0) map[k] = k1;
+    for (i = 0; i < b->n_smpl; ++i) {
+        const uint8_t *p = (const uint8_t*)PL->data + i * PL->len; // the PL for the i-th individual
+        uint8_t *g = gl + 10 * i;
+        for (k = j = 0; k < 4; ++k) {
+            for (l = k; l < 4; ++l) {
+                int t, x = map[k], y = map[l];
+                if (x > y) t = x, x = y, y = t; // make sure x is the smaller
+                // x < 0: base has no allele and there is no 'X' fallback; use 255
+                // (as bcf_gl10_indel does for missing genotypes) rather than reading p[-1]
+                g[j++] = x < 0? 255 : p[y * (y+1) / 2 + x];
+            }
+        }
+    }
+    return 0;
+}
+
+/* Fill gl[] with ten values per sample: for each pair lo<=hi of allele
+ * indexes 0..3 (row-major), the PL entry at hi*(hi+1)/2 + lo, or 255 when that
+ * offset lies beyond the sample's PL array.
+ * Returns 0, or -1 when the record has no ALT allele or no PL field. */
+int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
+{
+    const bcf_ginfo_t *PL = 0;
+    int f, smp;
+    if (b->alt[0] == 0) return -1; // no alternate allele
+    for (f = 0; f < b->n_gi; ++f) {
+        if (b->gi[f].fmt == bcf_str2int("PL", 2)) { PL = b->gi + f; break; }
+    }
+    if (PL == 0) return -1; // no PL
+    for (smp = 0; smp < b->n_smpl; ++smp) {
+        const uint8_t *p = PL->data + smp * PL->len; // the PL for this individual
+        uint8_t *g = gl + 10 * smp;
+        int lo, hi, out = 0;
+        for (lo = 0; lo < 4; ++lo) {
+            for (hi = lo; hi < 4; ++hi) { // lo never exceeds hi, so no reordering is needed
+                int off = hi * (hi+1) / 2 + lo;
+                g[out++] = off < PL->len? p[off] : 255;
+            }
+        }
+    }
+    return 0;
+}
diff --git a/samtools-0.1.19/bcftools/call1.c b/samtools-0.1.19/bcftools/call1.c
new file mode 100644
index 0000000..e6373d3
--- /dev/null
+++ b/samtools-0.1.19/bcftools/call1.c
@@ -0,0 +1,633 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <math.h>
+#include <zlib.h>
+#include <errno.h>
+#include "bcf.h"
+#include "prob1.h"
+#include "kstring.h"
+#include "time.h"
+
+#ifdef _WIN32
+#define srand48(x) srand(x)
+#define lrand48() rand()
+#endif
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define VC_NO_GENO 2 // -G: suppress all individual genotype information
+#define VC_BCFOUT 4 // -b (also set by -u): output BCF instead of VCF
+#define VC_CALL 8 // -c (also set by -v and -g): SNP calling
+#define VC_VARONLY 16 // -v
+#define VC_VCFIN 32 // -S: input is VCF
+#define VC_UNCOMP 64 // -u: uncompressed BCF output
+#define VC_KEEPALT 256 // -A: keep all possible alternate alleles at variant sites
+#define VC_ACGT_ONLY 512 // -N: skip sites where REF is not A/C/G/T
+#define VC_QCALL 1024 // -Q: output the QCALL likelihood format
+#define VC_CALL_GT 2048 // -g: call individual genotypes (see update_bcf1)
+#define VC_ADJLD 4096 // -L: calculate LD for adjacent sites
+#define VC_NO_INDEL 8192 // -I
+#define VC_ANNO_MAX 16384 // -M
+#define VC_FIX_PL 32768 // -F: PL generated by r921 or before (old ordering)
+#define VC_EM 0x10000 // -e
+#define VC_PAIRCALL 0x20000 // -T pair
+#define VC_QCNT 0x40000 // -Y
+#define VC_INDEL_ONLY 0x80000 // -w
+
+typedef struct { // options of "bcftools view", filled in by bcfview() from the command line
+ int flag, prior_type, n1, n_sub, *sublist, n_perm; // flag: OR of VC_* bits; prior_type: -P keyword (default -1); n1: -1 option (default -1); n_sub: number of -s samples; n_perm: -U
+ uint32_t *trio_aux; // result of bcf_trio_prep() for -T trioauto/trioxd/trioxs
+ char *prior_file, **subsam, *fn_dict; // prior_file: -P argument that is not a keyword; subsam: sample names from -s; fn_dict: -D argument
+ uint8_t *ploidy; // one byte per -s sample, copied from the byte stored after each name's NUL
+ double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt; // set by -t, -p, -i, -X, -d, -C and -m respectively
+ void *bed; // handle returned by bed_read() for -l
+} viewconf_t;
+
+void *bed_read(const char *fn);
+void bed_destroy(void *_h);
+int bed_overlap(const void *_h, const char *chr, int beg, int end);
+
+/* One-sided two-sample t-test on summary statistics.
+ * n1, n2 are the group sizes; a[0], a[1] are the sum and sum of squares of
+ * group 1, a[2], a[3] those of group 2. Returns 1 when either group is empty,
+ * fewer than 3 observations exist in total, or the mean of group 1 does not
+ * exceed that of group 2; otherwise .5 * kf_betai(v/2, 1/2, v/(v+t^2)) with
+ * v = n1 + n2 - 2. */
+static double ttest(int n1, int n2, int a[4])
+{
+    extern double kf_betai(double a, double b, double x);
+    double mean1, mean2, spread, stat, dof;
+    if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0;
+    mean1 = (double)a[0] / n1;
+    mean2 = (double)a[2] / n2;
+    if (mean1 <= mean2) return 1.;
+    spread = ((a[1] - n1 * mean1 * mean1) + (a[3] - n2 * mean2 * mean2)) / (n1 + n2 - 2) * (1./n1 + 1./n2);
+    stat = (mean1 - mean2) / sqrt(spread);
+    dof = n1 + n2 - 2;
+    if (stat < 0.) return 1.;
+    return .5 * kf_betai(.5*dof, .5, dof/(dof+stat*stat));
+}
+
+/* Derive the summary in *a from the 16 I16 values in anno[].
+ * a->d receives anno[0..3], a->depth their sum, a->is_tested whether both
+ * anno[0]+anno[1] and anno[2]+anno[3] are positive, and a->mq the rounded
+ * square root of (anno[9]+anno[11])/depth. a->p[0] is the two-sided Fisher
+ * exact P-value on anno[0..3]; a->p[1..3] are ttest() results on anno[4..7],
+ * anno[8..11] and anno[12..15]. Returns 0, or -1 when the depth is zero. */
+static int test16_core(int anno[16], anno16_t *a)
+{
+    extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+    double left, right;
+    int i;
+    a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+    memcpy(a->d, anno, 4 * sizeof(int));
+    a->depth = anno[0] + anno[1] + anno[2] + anno[3];
+    a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);
+    if (a->depth == 0) return -1;
+    // divide in double: integer division dropped the fraction before sqrt()
+    // (bcf_2qcall computes the same quantity with this cast)
+    a->mq = (int)(sqrt((double)(anno[9] + anno[11]) / a->depth) + .499);
+    kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]);
+    for (i = 1; i < 4; ++i)
+        a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i);
+    return 0;
+}
+
+/* Reset *a, parse the 16 integers after "I16=" in b->info and pass them to
+ * test16_core(). Returns test16_core()'s result, -1 when the tag is absent,
+ * and -2 when one of the 16 values is missing or strtol() reports
+ * EINVAL/ERANGE for a zero result. */
+int test16(bcf1_t *b, anno16_t *a)
+{
+    char *p, *end;
+    int i, anno[16];
+    a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+    a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.;
+    a->mq = a->depth = a->is_tested = 0;
+    if ((p = strstr(b->info, "I16=")) == 0) return -1;
+    p += 4;
+    for (i = 0; i < 16; ++i) {
+        errno = 0; anno[i] = strtol(p, &end, 10);
+        if (end == p) return -2; // nothing was converted: fewer than 16 values present
+        if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2;
+        p = end;
+        if (*p) ++p; // skip the one-character separator, but never step past the terminating NUL
+    }
+    return test16_core(anno, a);
+}
+
+/* Rebuild the string block of record b with the calling results appended to INFO.
+ *   pa, pr    calling state and per-site result; when pr is null only the EM
+ *             and trio annotations are added and 1 is returned
+ *   pref      a site is a variant when pr->p_ref < pref
+ *   flag      VC_* bits; VC_KEEPALT and VC_CALL_GT are consulted here
+ *   em        EM estimates; negative entries are not printed
+ *   cons_llr, cons_gt   trio results, printed when positive
+ * QUAL is set from the call, QS= and I16= are removed via rm_info(), ALT is
+ * shrunk unless VC_KEEPALT is set, and with VC_CALL_GT the GT and GQ fields
+ * are added for variant sites. Returns 1 for a variant site, 0 otherwise. */
+static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt)
+{
+    kstring_t s;
+    int has_I16, is_var;
+    double fq, r;
+    anno16_t a;
+
+    has_I16 = test16(b, &a) >= 0? 1 : 0;
+    //rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed!
+
+    // new string block: empty first field, REF, ALT, empty field, INFO, FORMAT, each NUL-terminated
+    memset(&s, 0, sizeof(kstring_t));
+    kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
+    kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
+    kputs(b->info, &s);
+    if (b->info[0]) kputc(';', &s);
+    { // print EM
+        if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]);
+        if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]);
+        if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
+        if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
+        if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]);
+    }
+    if (cons_llr > 0) {
+        ksprintf(&s, ";CLR=%d", cons_llr);
+        if (cons_gt > 0) // %c expects int: cast each byte extracted from the 64-bit value
+            ksprintf(&s, ";UGT=%c%c%c;CGT=%c%c%c",
+                     (int)(cons_gt&0xff), (int)(cons_gt>>8&0xff), (int)(cons_gt>>16&0xff),
+                     (int)(cons_gt>>32&0xff), (int)(cons_gt>>40&0xff), (int)(cons_gt>>48&0xff));
+    }
+    if (pr == 0) { // if pr is unset, return
+        kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s);
+        free(b->str);
+        b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+        bcf_sync(b);
+        return 1;
+    }
+
+    is_var = (pr->p_ref < pref);
+    r = is_var? pr->p_ref : pr->p_var;
+
+//    ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted!
+    ksprintf(&s, ";AC1=%d", pr->ac);
+    if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+    fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
+    if (fq < -999) fq = -999;
+    if (fq > 999) fq = 999;
+    ksprintf(&s, ";FQ=%.3g", fq);
+    if (pr->cmp[0] >= 0.) { // two sample groups
+        int i, q[3];
+        for (i = 1; i < 3; ++i) {
+            double x = pr->cmp[i] + pr->cmp[0]/2.;
+            q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
+            if (q[i] > 255) q[i] = 255;
+        }
+        if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank);
+        // ksprintf(&s, ";LRT3=%.3g", pr->lrt);
+        // arguments in format order: the P-value for %.3g first, then the two integers
+        ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", pr->p_chi2, q[1], q[2]);
+    }
+    if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
+    kputc('\0', &s);
+    rm_info(&s, "QS=");
+    rm_info(&s, "I16=");
+    kputs(b->fmt, &s); kputc('\0', &s);
+    free(b->str);
+    b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+    b->qual = r < 1e-100? 999 : -4.343 * log(r);
+    if (b->qual > 999) b->qual = 999;
+    bcf_sync(b);
+    if (!is_var) bcf_shrink_alt(b, 1);
+    else if (!(flag&VC_KEEPALT))
+        bcf_shrink_alt(b, pr->rank0 < 2? 2 : pr->rank0+1);
+    if (is_var && (flag&VC_CALL_GT)) { // call individual genotype
+        int i, x, old_n_gi = b->n_gi;
+        // overwrite the NUL that ends FORMAT so the two new tags extend it
+        s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
+        kputs(":GT:GQ", &s); kputc('\0', &s);
+        b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+        bcf_sync(b);
+        for (i = 0; i < b->n_smpl; ++i) {
+            x = bcf_p1_call_gt(pa, pr->f_exp, i);
+            // low two bits of x select the GT byte (0 -> 1<<3|1, 1 -> 1, otherwise 0); the rest is GQ
+            ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0;
+            ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2;
+        }
+    }
+    return is_var;
+}
+
+/* Load sample names. fn is first tried as a (possibly gzipped) file with one
+ * whitespace-separated name per line, optionally followed by a ploidy of 1 or
+ * 2; when it cannot be opened, fn itself is split at commas and used as the
+ * list of names. Every returned string carries one extra byte after its NUL
+ * terminator holding the ploidy (2 unless the file says 1).
+ * *_n receives the number of names; the array and strings are heap-allocated. */
+static char **read_samples(const char *fn, int *_n)
+{
+    gzFile fp;
+    kstream_t *ks;
+    kstring_t s;
+    int dret, n = 0, max = 0;
+    char **sam = 0;
+    *_n = 0;
+    s.l = s.m = 0; s.s = 0;
+    fp = gzopen(fn, "r");
+    if (fp == 0)
+    {
+        // not a readable file: treat the argument as a comma-separated list of names
+        const char *start = fn, *cur = fn;
+        while (*cur != '\0')
+        {
+            ++cur;
+            if ( *cur!=',' && *cur!='\0' ) continue;
+            sam = realloc(sam, sizeof(void*)*(n+1));
+            sam[n] = (char*) malloc(sizeof(char)*(cur-start+2));
+            memcpy(sam[n], start, cur-start);
+            sam[n][cur-start] = 0;
+            sam[n][cur-start+1] = 2; // assume diploid
+            start = cur+1;
+            n++;
+        }
+        *_n = n;
+        return sam;
+    }
+    ks = ks_init(fp);
+    for (;;) {
+        int l;
+        if (ks_getuntil(ks, 0, &s, &dret) < 0) break;
+        if (max == n) { // grow the pointer array: 4 slots first, then doubling
+            max = max? max<<1 : 4;
+            sam = realloc(sam, sizeof(void*)*max);
+        }
+        l = s.l;
+        sam[n] = malloc(s.l + 2);
+        strcpy(sam[n], s.s);
+        sam[n][l+1] = 2; // by default, diploid
+        if (dret != '\n') {
+            if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2
+                int x = (int)s.s[0] - '0';
+                if (x == 1 || x == 2) sam[n][l+1] = x;
+                else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__);
+            }
+            if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // discard the rest of the line
+        }
+        ++n;
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(s.s);
+    *_n = n;
+    return sam;
+}
+
+static void write_header(bcf_hdr_t *h)
+{
+ kstring_t str;
+ str.l = h->l_txt? h->l_txt - 1 : 0;
+ str.m = str.l + 1; str.s = h->txt;
+ if (!strstr(str.s, "##INFO=<ID=DP,"))
+ kputs("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=DP4,"))
+ kputs("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=MQ,"))
+ kputs("##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=FQ,"))
+ kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AF1,"))
+ kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AC1,"))
+ kputs("##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AN,"))
+ kputs("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=IS,"))
+ kputs("##INFO=<ID=IS,Number=2,Type=Float,Description=\"Maximum number of reads supporting an indel and fraction of indel reads\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AC,"))
+ kputs("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=G3,"))
+ kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=HWE,"))
+ kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=CLR,"))
+ kputs("##INFO=<ID=CLR,Number=1,Type=Integer,Description=\"Log ratio of genotype likelihoods with and without the constraint\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=UGT,"))
+ kputs("##INFO=<ID=UGT,Number=1,Type=String,Description=\"The most probable unconstrained genotype configuration in the trio\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=CGT,"))
+ kputs("##INFO=<ID=CGT,Number=1,Type=String,Description=\"The most probable constrained genotype configuration in the trio\">\n", &str);
+// if (!strstr(str.s, "##INFO=<ID=CI95,"))
+// kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PV4,"))
+ kputs("##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=INDEL,"))
+ kputs("##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PC2,"))
+ kputs("##INFO=<ID=PC2,Number=2,Type=Integer,Description=\"Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PCHI2,"))
+ kputs("##INFO=<ID=PCHI2,Number=1,Type=Float,Description=\"Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=QCHI2,"))
+ kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=RP,"))
+ kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=QBD,"))
+ kputs("##INFO=<ID=QBD,Number=1,Type=Float,Description=\"Quality by Depth: QUAL/#reads\">\n", &str);
+ //if (!strstr(str.s, "##INFO=<ID=RPS,"))
+ // kputs("##INFO=<ID=RPS,Number=3,Type=Float,Description=\"Read Position Stats: depth, average, stddev\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=RPB,"))
+ kputs("##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Read Position Bias\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=MDV,"))
+ kputs("##INFO=<ID=MDV,Number=1,Type=Integer,Description=\"Maximum number of high-quality nonRef reads in samples\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=VDB,"))
+ kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: this version may be broken.\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GT,"))
+ kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
+ kputs("##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GL,"))
+ kputs("##FORMAT=<ID=GL,Number=3,Type=Float,Description=\"Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=DP,"))
+ kputs("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"# high-quality bases\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=DV,"))
+ kputs("##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality non-reference bases\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=SP,"))
+ kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=PL,"))
+ kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str);
+ h->l_txt = str.l + 1; h->txt = str.s;
+}
+
+double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
+
+int bcfview(int argc, char *argv[])
+{
+ extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b);
+ extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x);
+ extern int bcf_fix_gt(bcf1_t *b);
+ extern int bcf_anno_max(bcf1_t *b);
+ extern int bcf_shuffle(bcf1_t *b, int seed);
+ extern uint32_t *bcf_trio_prep(int is_x, int is_son);
+ extern int bcf_trio_call(uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt);
+ extern int bcf_pair_call(const bcf1_t *b);
+ extern int bcf_min_diff(const bcf1_t *b);
+ extern int bcf_p1_get_M(bcf_p1aux_t *b);
+
+ extern gzFile bcf_p1_fp_lk;
+
+ bcf_t *bp, *bout = 0;
+ bcf1_t *b, *blast;
+ int c, *seeds = 0;
+ uint64_t n_processed = 0, qcnt[256];
+ viewconf_t vc;
+ bcf_p1aux_t *p1 = 0;
+ bcf_hdr_t *hin, *hout;
+ int tid, begin, end;
+ char moder[4], modew[4];
+
+ tid = begin = end = -1;
+ memset(&vc, 0, sizeof(viewconf_t));
+ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; vc.min_ma_lrt = -1;
+ memset(qcnt, 0, 8 * 256);
+ while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) {
+ switch (c) {
+ case '1': vc.n1 = atoi(optarg); break;
+ case 'l': vc.bed = bed_read(optarg); if (!vc.bed) { fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; } break;
+ case 'D': vc.fn_dict = strdup(optarg); break;
+ case 'F': vc.flag |= VC_FIX_PL; break;
+ case 'N': vc.flag |= VC_ACGT_ONLY; break;
+ case 'G': vc.flag |= VC_NO_GENO; break;
+ case 'A': vc.flag |= VC_KEEPALT; break;
+ case 'b': vc.flag |= VC_BCFOUT; break;
+ case 'S': vc.flag |= VC_VCFIN; break;
+ case 'c': vc.flag |= VC_CALL; break;
+ case 'e': vc.flag |= VC_EM; break;
+ case 'v': vc.flag |= VC_VARONLY | VC_CALL; break;
+ case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break;
+ case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
+ case 'I': vc.flag |= VC_NO_INDEL; break;
+ case 'w': vc.flag |= VC_INDEL_ONLY; break;
+ case 'M': vc.flag |= VC_ANNO_MAX; break;
+ case 'Y': vc.flag |= VC_QCNT; break;
+ case 'm': vc.min_ma_lrt = atof(optarg); break;
+ case 't': vc.theta = atof(optarg); break;
+ case 'p': vc.pref = atof(optarg); break;
+ case 'i': vc.indel_frac = atof(optarg); break;
+ case 'Q': vc.flag |= VC_QCALL; break;
+ case 'L': vc.flag |= VC_ADJLD; break;
+ case 'U': vc.n_perm = atoi(optarg); break;
+ case 'C': vc.min_lrt = atof(optarg); break;
+ case 'X': vc.min_perm_p = atof(optarg); break;
+ case 'd': vc.min_smpl_frac = atof(optarg); break;
+ case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break;
+ case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
+ vc.ploidy = calloc(vc.n_sub + 1, 1);
+ for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1];
+ tid = -1;
+ break;
+ case 'T':
+ if (strcmp(optarg, "trioauto") == 0) vc.trio_aux = bcf_trio_prep(0, 0);
+ else if (strcmp(optarg, "trioxd") == 0) vc.trio_aux = bcf_trio_prep(1, 0);
+ else if (strcmp(optarg, "trioxs") == 0) vc.trio_aux = bcf_trio_prep(1, 1);
+ else if (strcmp(optarg, "pair") == 0) vc.flag |= VC_PAIRCALL;
+ else {
+ fprintf(stderr, "[%s] Option '-T' can only take value trioauto, trioxd or trioxs.\n", __func__);
+ return 1;
+ }
+ break;
+ case 'P':
+ if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL;
+ else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2;
+ else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT;
+ else vc.prior_file = strdup(optarg);
+ break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bcftools view [options] <in.bcf> [reg]\n\n");
+ fprintf(stderr, "Input/output options:\n\n");
+ fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
+ fprintf(stderr, " -b output BCF instead of VCF\n");
+ fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
+ fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n");
+ fprintf(stderr, " -G suppress all individual genotype information\n");
+ fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n");
+ fprintf(stderr, " -L calculate LD for adjacent sites\n");
+ fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n");
+ fprintf(stderr, " -Q output the QCALL likelihood format\n");
+ fprintf(stderr, " -s FILE list of samples to use [all samples]\n");
+ fprintf(stderr, " -S input is VCF\n");
+ fprintf(stderr, " -u uncompressed BCF output (force -b)\n");
+ fprintf(stderr, "\nConsensus/variant calling options:\n\n");
+ fprintf(stderr, " -c SNP calling (force -e)\n");
+ fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n");
+ fprintf(stderr, " -e likelihood based analyses\n");
+ fprintf(stderr, " -g call genotypes at variant sites (force -c)\n");
+ fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
+ fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n");
+ fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref);
+ fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
+ fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
+ fprintf(stderr, " -T STR constrained calling; STR can be: pair, trioauto, trioxd and trioxs (see manual) [null]\n");
+ fprintf(stderr, " -v output potential variant sites only (force -c)\n");
+ fprintf(stderr, "\nContrast calling and association test options:\n\n");
+ fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
+ fprintf(stderr, " -C FLOAT posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", vc.min_lrt);
+ fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
+ fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p);
+ fprintf(stderr, "\n");
+ return 1;
+ }
+
+ if (vc.flag & VC_CALL) vc.flag |= VC_EM;
+ if ((vc.flag & VC_VCFIN) && (vc.flag & VC_BCFOUT) && vc.fn_dict == 0) {
+ fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__);
+ return 1;
+ }
+ if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here!
+ if (vc.n_perm > 0) {
+ seeds = malloc(vc.n_perm * sizeof(int));
+ srand48(time(0));
+ for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48();
+ }
+ b = calloc(1, sizeof(bcf1_t));
+ blast = calloc(1, sizeof(bcf1_t));
+ strcpy(moder, "r");
+ if (!(vc.flag & VC_VCFIN)) strcat(moder, "b");
+ strcpy(modew, "w");
+ if (vc.flag & VC_BCFOUT) strcat(modew, "b");
+ if (vc.flag & VC_UNCOMP) strcat(modew, "u");
+ bp = vcf_open(argv[optind], moder);
+ hin = hout = vcf_hdr_read(bp);
+ if (vc.fn_dict && (vc.flag & VC_VCFIN))
+ vcf_dictread(bp, hin, vc.fn_dict);
+ bout = vcf_open("-", modew);
+ if (!(vc.flag & VC_QCALL)) {
+ if (vc.n_sub) {
+ vc.sublist = calloc(vc.n_sub, sizeof(int));
+ hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist);
+ }
+ write_header(hout); // always print the header
+ vcf_hdr_write(bout, hout);
+ }
+ if (vc.flag & VC_CALL) {
+ p1 = bcf_p1_init(hout->n_smpl, vc.ploidy);
+ if (vc.prior_file) {
+ if (bcf_p1_read_prior(p1, vc.prior_file) < 0) {
+ fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__);
+ return 1;
+ }
+ } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta);
+ if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1
+ bcf_p1_set_n1(p1, vc.n1);
+ bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
+ }
+ if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac
+ }
+ if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
+ void *str2id = bcf_build_refhash(hout);
+ if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) {
+ bcf_idx_t *idx;
+ idx = bcf_idx_load(argv[optind]);
+ if (idx) {
+ uint64_t off;
+ off = bcf_idx_query(idx, tid, begin);
+ if (off == 0) {
+ fprintf(stderr, "[%s] no records in the query region.\n", __func__);
+ return 1; // FIXME: a lot of memory leaks...
+ }
+ bgzf_seek(bp->fp, off, SEEK_SET);
+ bcf_idx_destroy(idx);
+ }
+ }
+ }
+ if (bcf_p1_fp_lk && p1) {
+ int32_t M = bcf_p1_get_M(p1);
+ gzwrite(bcf_p1_fp_lk, &M, 4);
+ }
+ while (vcf_read(bp, hin, b) > 0) {
+ int is_indel, cons_llr = -1;
+ int64_t cons_gt = -1;
+ double em[10];
+ if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue;
+ if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) {
+ extern int bcf_smpl_covered(const bcf1_t *b);
+ int n = bcf_smpl_covered(b);
+ if ((double)n / b->n_smpl < vc.min_smpl_frac) continue;
+ }
+ if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b);
+ if (vc.flag & VC_FIX_PL) bcf_fix_pl(b);
+ is_indel = bcf_is_indel(b);
+ if ((vc.flag & VC_NO_INDEL) && is_indel) continue;
+ if ((vc.flag & VC_INDEL_ONLY) && !is_indel) continue;
+ if ((vc.flag & VC_ACGT_ONLY) && !is_indel) {
+ int x;
+ if (b->ref[0] == 0 || b->ref[1] != 0) continue;
+ x = toupper(b->ref[0]);
+ if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue;
+ }
+ if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue;
+ if (tid >= 0) {
+ int l = strlen(b->ref);
+ l = b->pos + (l > 0? l : 1);
+ if (b->tid != tid || b->pos >= end) break;
+ if (!(l > begin && end > b->pos)) continue;
+ }
+ ++n_processed;
+ if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference
+ int x = bcf_min_diff(b);
+ if (x > 255) x = 255;
+ if (x >= 0) ++qcnt[x];
+ }
+ if (vc.flag & VC_QCALL) { // output QCALL format; STOP here
+ bcf_2qcall(hout, b);
+ continue;
+ }
+ if (vc.trio_aux) // do trio calling
+ bcf_trio_call(vc.trio_aux, b, &cons_llr, &cons_gt);
+ else if (vc.flag & VC_PAIRCALL)
+ cons_llr = bcf_pair_call(b);
+ if (vc.flag & (VC_CALL|VC_ADJLD|VC_EM)) bcf_gl2pl(b);
+ if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0x1ff, em);
+ else {
+ int i;
+ for (i = 0; i < 9; ++i) em[i] = -1.;
+ }
+ if ( !(vc.flag&VC_KEEPALT) && (vc.flag&VC_CALL) && vc.min_ma_lrt>=0 )
+ {
+ bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions
+ int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY);
+ if ( gts<=1 && vc.flag & VC_VARONLY ) continue;
+ }
+ else if (vc.flag & VC_CALL) { // call variants
+ bcf_p1rst_t pr;
+ int calret;
+ gzwrite(bcf_p1_fp_lk, &b->tid, 4);
+ gzwrite(bcf_p1_fp_lk, &b->pos, 4);
+ gzwrite(bcf_p1_fp_lk, &em[0], sizeof(double));
+ calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr);
+ if (n_processed % 100000 == 0) {
+ fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed);
+ bcf_p1_dump_afs(p1);
+ }
+ if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue;
+ if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test
+ bcf_p1rst_t r;
+ int i, n = 0;
+ for (i = 0; i < vc.n_perm; ++i) {
+#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts
+ double x[10];
+ bcf_shuffle(b, seeds[i]);
+ bcf_em1(b, vc.n1, 1<<7, x);
+ if (x[7] < em[7]) ++n;
+#else
+ bcf_shuffle(b, seeds[i]);
+ bcf_p1_cal(b, 1, p1, &r);
+ if (pr.p_chi2 >= r.p_chi2) ++n;
+#endif
+ }
+ pr.perm_rank = n;
+ }
+ if (calret >= 0) update_bcf1(b, p1, &pr, vc.pref, vc.flag, em, cons_llr, cons_gt);
+ } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em, cons_llr, cons_gt);
+ if (vc.flag & VC_ADJLD) { // compute LD
+ double f[4], r2;
+ if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) {
+ kstring_t s;
+ s.m = s.l = 0; s.s = 0;
+ if (*b->info) kputc(';', &s);
+ ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]);
+ bcf_append_info(b, s.s, s.l);
+ free(s.s);
+ }
+ bcf_cpy(blast, b);
+ }
+ if (vc.flag & VC_ANNO_MAX) bcf_anno_max(b);
+ if (vc.flag & VC_NO_GENO) { // do not output GENO fields
+ b->n_gi = 0;
+ b->fmt[0] = '\0';
+ b->l_str = b->fmt - b->str + 1;
+ } else bcf_fix_gt(b);
+ vcf_write(bout, hout, b);
+ }
+
+ if (bcf_p1_fp_lk) gzclose(bcf_p1_fp_lk);
+ if (vc.prior_file) free(vc.prior_file);
+ if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1);
+ if (hin != hout) bcf_hdr_destroy(hout);
+ bcf_hdr_destroy(hin);
+ bcf_destroy(b); bcf_destroy(blast);
+ vcf_close(bp); vcf_close(bout);
+ if (vc.fn_dict) free(vc.fn_dict);
+ if (vc.ploidy) free(vc.ploidy);
+ if (vc.trio_aux) free(vc.trio_aux);
+ if (vc.n_sub) {
+ int i;
+ for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]);
+ free(vc.subsam); free(vc.sublist);
+ }
+ if (vc.bed) bed_destroy(vc.bed);
+ if (vc.flag & VC_QCNT)
+ for (c = 0; c < 256; ++c)
+ fprintf(stderr, "QT\t%d\t%lld\n", c, (long long)qcnt[c]);
+ if (seeds) free(seeds);
+ if (p1) bcf_p1_destroy(p1);
+ return 0;
+}
diff --git a/samtools-0.1.19/bcftools/em.c b/samtools-0.1.19/bcftools/em.c
new file mode 100644
index 0000000..b7dfe1a
--- /dev/null
+++ b/samtools-0.1.19/bcftools/em.c
@@ -0,0 +1,310 @@
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "bcf.h"
+#include "kmin.h"
+
+static double g_q2p[256];
+
+#define ITER_MAX 50
+#define ITER_TRY 10
+#define EPS 1e-5
+
+extern double kf_gammaq(double, double);
+
+/*
+ Generic routines
+ */
+ // Convert the PL field of a record into per-sample genotype likelihoods.
+ //
+ // For every sample the first three PL values are mapped through
+ // 10^(-PL/10) and stored in reverse order: p[0] <- PL[2], p[1] <- PL[1],
+ // p[2] <- PL[0].
+ //
+ // Returns a malloc()ed array of 3 * b->n_smpl doubles that the caller must
+ // free(), or 0 if the record carries no PL field or memory is exhausted.
+ // NOTE(review): three PL bytes are read per sample, so PL_len is presumably
+ // >= 3 for every record passed in -- confirm against callers.
+ static double *get_pdg3(const bcf1_t *b)
+ {
+     double *pdg;
+     const uint8_t *PL = 0;
+     int i, PL_len = 0;
+     // lazily fill the lookup table; g_q2p[0] becomes 1 once it is initialized
+     if (g_q2p[0] == 0.)
+         for (i = 0; i < 256; ++i)
+             g_q2p[i] = pow(10., -i / 10.);
+     // locate the PL field among the per-sample fields
+     for (i = 0; i < b->n_gi; ++i) {
+         if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+             PL = (const uint8_t*)b->gi[i].data;
+             PL_len = b->gi[i].len;
+             break;
+         }
+     }
+     if (i == b->n_gi) return 0; // no PL
+     // fill pdg
+     pdg = malloc(3 * b->n_smpl * sizeof(double));
+     if (pdg == 0) return 0; // out of memory; callers already treat 0 as failure
+     for (i = 0; i < b->n_smpl; ++i) {
+         const uint8_t *pi = PL + i * PL_len;
+         double *p = pdg + i * 3;
+         p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]];
+     }
+     return pdg;
+ }
+
+ // Crude starting value for the site allele frequency.
+ //
+ // pdg holds three likelihoods per sample.  A sample whose three values are
+ // all exactly 1 is skipped; every other sample votes for its most likely
+ // genotype (later genotypes win ties).  The result is
+ // (0.5 * votes[1] + votes[2]) / total votes, or -1.0 if no sample voted.
+ static double est_freq(int n, const double *pdg)
+ {
+     int votes[3] = { 0, 0, 0 };
+     int smpl, total;
+     const double *lk = pdg;
+     for (smpl = 0; smpl < n; ++smpl, lk += 3) {
+         int best = 1;
+         if (lk[0] == 1. && lk[1] == 1. && lk[2] == 1.) continue; // uninformative sample
+         if (lk[0] > lk[1]) best = 0;
+         if (!(lk[best] > lk[2])) best = 2;
+         ++votes[best];
+     }
+     total = votes[0] + votes[1] + votes[2];
+     if (total == 0) return -1.0;
+     return (.5 * votes[1] + votes[2]) / total;
+ }
+
+/*
+ Single-locus EM
+ */
+
+ // Closure handed to prob1() through its void* argument.
+ typedef struct {
+     int beg, end;      // half-open sample range [beg, end)
+     const double *pdg; // three likelihoods per sample
+ } minaux1_t;
+
+ // Negative log-likelihood of allele frequency f over samples [beg, end),
+ // with genotype priors (1-f)^2, 2f(1-f), f^2.  Values of f outside [0, 1]
+ // yield 1e300 so that a minimizer stays inside the valid interval.
+ static double prob1(double f, void *data)
+ {
+     const minaux1_t *aux = (const minaux1_t*)data;
+     double prior[3], prod = 1., neg_loglk = 0.;
+     int smpl;
+     if (f < 0 || f > 1) return 1e300;
+     prior[0] = (1.-f)*(1.-f); prior[1] = 2.*f*(1.-f); prior[2] = f*f;
+     for (smpl = aux->beg; smpl < aux->end; ++smpl) {
+         const double *lk = aux->pdg + smpl * 3;
+         prod *= lk[0] * prior[0] + lk[1] * prior[1] + lk[2] * prior[2];
+         if (prod < 1e-200) { // move the running product into the log sum before it underflows
+             neg_loglk -= log(prod);
+             prod = 1.;
+         }
+     }
+     return neg_loglk - log(prod);
+ }
+
+ // One EM update of the allele frequency over samples [beg, end).
+ //
+ // *f is read as the current estimate and overwritten with the new one; the
+ // absolute change between the two is returned.
+ static double freq_iter(double *f, const double *_pdg, int beg, int end)
+ {
+     const double prev = *f;
+     double prior[3], acc = 0., next;
+     int smpl;
+     prior[0] = (1.-prev)*(1.-prev); prior[1] = 2.*prev*(1.-prev); prior[2] = prev*prev;
+     for (smpl = beg; smpl < end; ++smpl) {
+         const double *lk = _pdg + smpl * 3;
+         // expected alt-allele count of this sample under the current estimate
+         acc += (lk[1] * prior[1] + 2. * lk[2] * prior[2])
+             / (lk[0] * prior[0] + lk[1] * prior[1] + lk[2] * prior[2]);
+     }
+     next = acc / ((end - beg) * 2);
+     *f = next;
+     return fabs(next - prev);
+ }
+
+ /* Maximum-likelihood allele frequency for samples [beg, end), starting
+  * from f0.  Up to ITER_TRY EM steps are taken; if none of them changes the
+  * estimate by less than EPS, the search is handed over to Brent's method
+  * (kmin_brent on prob1), because EM can converge very slowly when the
+  * signal in the data is weak.  The idea is credited to Rasmus Nielsen.
+  */
+ static double freqml(double f0, int beg, int end, const double *pdg)
+ {
+     double f = f0;
+     int iter = 0;
+     while (iter < ITER_TRY && !(freq_iter(&f, pdg, beg, end) < EPS)) ++iter;
+     if (iter == ITER_TRY) { // EM has not converged; switch to Brent's method
+         minaux1_t aux;
+         double start = (f0 == f)? .5*f0 : f0; // the two starting points must differ
+         aux.beg = beg; aux.end = end; aux.pdg = pdg;
+         kmin_brent(prob1, start, f, (void*)&aux, EPS, &f);
+     }
+     return f;
+ }
+
+ // One EM update of the three genotype frequencies over samples [beg, end).
+ //
+ // g is read as the current estimate and overwritten with the new one; the
+ // largest absolute change among the three components is returned.
+ static double g3_iter(double g[3], const double *_pdg, int beg, int end)
+ {
+     double next[3] = { 0., 0., 0. };
+     double d0, d1, d2, err;
+     int smpl, k;
+     for (smpl = beg; smpl < end; ++smpl) {
+         const double *lk = _pdg + smpl * 3;
+         double w[3], norm;
+         for (k = 0; k < 3; ++k) w[k] = lk[k] * g[k];
+         // dividing by the sample count here makes `next` an average of posteriors
+         norm = (w[0] + w[1] + w[2]) * (end - beg);
+         for (k = 0; k < 3; ++k) next[k] += w[k] / norm;
+     }
+     d0 = fabs(next[0] - g[0]);
+     d1 = fabs(next[1] - g[1]);
+     d2 = fabs(next[2] - g[2]);
+     err = d0 > d1? d0 : d1;
+     if (!(err > d2)) err = d2;
+     for (k = 0; k < 3; ++k) g[k] = next[k];
+     return err;
+ }
+
+ // Likelihood ratio of the two-group model against the pooled model.
+ //
+ // f3[0] holds the pooled genotype frequencies, f3[1] those of group 1
+ // (sample indexes below n1) and f3[2] those of group 2 (the remaining
+ // samples up to n).  The product of per-sample ratios is returned, not
+ // its logarithm.
+ static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
+ {
+     const double *pooled = f3[0];
+     const int upper = n1 > n? n1 : n; // indexes below n1 are always visited
+     double ratio = 1.;
+     int smpl;
+     for (smpl = 0; smpl < upper; ++smpl) {
+         const double *lk = pdg + smpl * 3;
+         const double *grp = smpl < n1? f3[1] : f3[2];
+         ratio *= (lk[0] * grp[0] + lk[1] * grp[1] + lk[2] * grp[2])
+             / (lk[0] * pooled[0] + lk[1] * pooled[1] + lk[2] * pooled[2]);
+     }
+     return ratio;
+ }
+
+ // Single-site EM estimates and tests computed from the PL field of b.
+ //
+ // n1   : number of leading samples that form group 1; values outside
+ //        [0, b->n_smpl] are replaced by 0, which disables the group analyses
+ // flag : bit mask selecting the optional analyses
+ //          0xf<<1 (bits 1-4): genotype frequencies and HWE test, x[1..4]
+ //          7<<5   (bits 5-7): per-group allele frequencies, x[5..6]
+ //          1<<7             : 1-degree test, x[7] (forces bits 5-7)
+ //          3<<8   (bits 8-9): 2-degree test, x[8] (also runs the HWE block)
+ // x    : output; once the PL data has been found every entry is reset to -1
+ //        and the requested ones are filled in:
+ //   x[0]: ref frequency
+ //   x[1..3]: alt-alt, alt-ref, ref-ref frequency
+ //   x[4]: HWE P-value
+ //   x[5..6]: group1 freq, group2 freq
+ //   x[7]: 1-degree P-value
+ //   x[8]: 2-degree P-value
+ //
+ // Returns 0 on success, -1 if the site has fewer than two alleles, has no
+ // PL field, or no sample is informative.
+ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
+ {
+     double *pdg;
+     int i, n;
+     if (b->n_alleles < 2) return -1; // one allele only
+     // initialization
+     if (n1 < 0 || n1 > b->n_smpl) n1 = 0;
+     if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required
+     if (flag & 0xf<<1) flag |= 0xf<<1;
+     n = b->n_smpl;
+     pdg = get_pdg3(b);
+     if (pdg == 0) return -1;
+     for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative
+     if ((x[0] = est_freq(n, pdg)) < 0.) {
+         free(pdg);
+         return -1; // no data
+     }
+     x[0] = freqml(x[0], 0, n, pdg);
+     if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE
+         double *g = x + 1, f3[3], r, tmp;
+         f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
+         f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
+         f3[2] = g[2] = x[0] * x[0];
+         for (i = 0; i < ITER_MAX; ++i)
+             if (g3_iter(g, pdg, 0, n) < EPS) break;
+         // Hardy-Weinberg equilibrium (HWE)
+         for (i = 0, r = 1.; i < n; ++i) {
+             double *p = pdg + i * 3;
+             r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]);
+         }
+         // clamp like the two tests below: rounding can push the ratio just
+         // under 1, and a negative statistic must not reach kf_gammaq()
+         tmp = log(r);
+         if (tmp < 0) tmp = 0;
+         x[4] = kf_gammaq(.5, tmp);
+     }
+     if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency
+         x[5] = freqml(x[0], 0, n1, pdg);
+         x[6] = freqml(x[0], n1, n, pdg);
+     }
+     if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
+         double f[3], f3[3][3], tmp;
+         f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
+         for (i = 0; i < 3; ++i)
+             f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
+         tmp = log(lk_ratio_test(n, n1, pdg, f3));
+         if (tmp < 0) tmp = 0;
+         x[7] = kf_gammaq(.5, tmp);
+     }
+     if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
+         double g[3][3], tmp;
+         for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
+         for (i = 0; i < ITER_MAX; ++i)
+             if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
+         for (i = 0; i < ITER_MAX; ++i)
+             if (g3_iter(g[2], pdg, n1, n) < EPS) break;
+         tmp = log(lk_ratio_test(n, n1, pdg, g));
+         if (tmp < 0) tmp = 0;
+         x[8] = kf_gammaq(1., tmp);
+     }
+     // free
+     free(pdg);
+     return 0;
+ }
+
+/*
+ Two-locus EM (LD)
+ */
+
+#define _G1(h, k) ((h>>1&1) + (k>>1&1))
+#define _G2(h, k) ((h&1) + (k&1))
+
+ // One EM update of the four two-site haplotype frequencies f[0..3].
+ //
+ // A haplotype index encodes the allele at the previous site in bit 1 and the
+ // allele at the current site in bit 0, so _G1/_G2 give the genotype (0..2) of
+ // a haplotype pair at the previous/current site.  pdg[0] and pdg[1] hold three
+ // likelihoods per sample for the previous and the current site.  f is
+ // updated in place; the function always returns 0.
+ static int pair_freq_iter(int n, double *pdg[2], double f[4])
+ {
+     double next[4] = { 0., 0., 0., 0. };
+     int smpl, a, c;
+     for (smpl = 0; smpl < n; ++smpl) {
+         const double *s0 = pdg[0] + smpl * 3;
+         const double *s1 = pdg[1] + smpl * 3;
+         double total = 0.;
+         // likelihood of the sample summed over all ordered haplotype pairs
+         for (a = 0; a < 4; ++a)
+             for (c = 0; c < 4; ++c)
+                 total += f[a] * f[c] * s0[_G1(a,c)] * s1[_G2(a,c)];
+         // posterior count of haplotype a, in either position of the pair
+         for (a = 0; a < 4; ++a) {
+             double w = 0.;
+             for (c = 0; c < 4; ++c)
+                 w += f[c] * (s0[_G1(c,a)] * s1[_G2(c,a)] + s0[_G1(a,c)] * s1[_G2(a,c)]);
+             next[a] += f[a] * w / total;
+         }
+     }
+     for (a = 0; a < 4; ++a) f[a] = next[a] / (2 * n);
+     return 0;
+ }
+
+ // Estimate two-site haplotype frequencies for records b0 and b1 by EM and
+ // return the resulting correlation r = sqrt(D^2 / (p0 * p1 * q0 * q1)).
+ //
+ // f receives the four haplotype frequencies; it is set to -1 when the
+ // alleles or PL data are insufficient.  The return value is -1 when the
+ // sample counts differ, either site has fewer than two alleles, PL data is
+ // missing, or r is not a number.
+ double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4])
+ {
+     double *pdg[2], prev[4], af[2], r;
+     int iter, k, n_smpl;
+     if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples
+     n_smpl = b0->n_smpl;
+     for (k = 0; k < 4; ++k) f[k] = -1.;
+     if (b0->n_alleles < 2 || b1->n_alleles < 2) return -1; // one allele only
+     pdg[0] = get_pdg3(b0);
+     pdg[1] = get_pdg3(b1);
+     if (pdg[0] == 0 || pdg[1] == 0) {
+         free(pdg[0]); free(pdg[1]);
+         return -1;
+     }
+     // start from independence between the two sites
+     af[0] = est_freq(n_smpl, pdg[0]);
+     af[1] = est_freq(n_smpl, pdg[1]);
+     f[0] = (1 - af[0]) * (1 - af[1]);
+     f[1] = (1 - af[0]) * af[1];
+     f[2] = af[0] * (1 - af[1]);
+     f[3] = af[0] * af[1];
+     // iterate until no frequency moves by EPS or more
+     for (iter = 0; iter < ITER_MAX; ++iter) {
+         double delta = 0;
+         for (k = 0; k < 4; ++k) prev[k] = f[k];
+         pair_freq_iter(n_smpl, pdg, f);
+         for (k = 0; k < 4; ++k) {
+             double d = fabs(f[k] - prev[k]);
+             if (d > delta) delta = d;
+         }
+         if (delta < EPS) break;
+     }
+     free(pdg[0]); free(pdg[1]);
+     { // calculate r from the haplotype frequencies
+         double p0 = f[0] + f[1], p1 = f[0] + f[2];
+         double q0 = 1 - p0, q1 = 1 - p1;
+         double D = f[0] * f[3] - f[1] * f[2];
+         r = sqrt(D * D / (p0 * p1 * q0 * q1));
+         if (isnan(r)) r = -1.;
+     }
+     return r;
+ }
diff --git a/samtools-0.1.19/bcftools/fet.c b/samtools-0.1.19/bcftools/fet.c
new file mode 100644
index 0000000..5812517
--- /dev/null
+++ b/samtools-0.1.19/bcftools/fet.c
@@ -0,0 +1,112 @@
+#include <math.h>
+#include <stdlib.h>
+
+/* This program is implemented with ideas from this web page:
+ *
+ * http://www.langsrud.com/fisher.htm
+ */
+
+ // log\binom{n}{k}
+ static double lbinom(int n, int k)
+ {
+     if (k == 0 || n == k) return 0;
+     return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+ }
+
+ // n11 n12 | n1_
+ // n21 n22 | n2_
+ //-----------+----
+ // n_1 n_2 | n
+
+ // hypergeometric distribution: probability of the table with the given n11
+ // when the margins n1_, n_1 and the total n are fixed
+ static double hypergeo(int n11, int n1_, int n_1, int n)
+ {
+     return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));
+ }
+
+ // state carried between calls of hypergeo_acc(): the last table and its
+ // probability
+ typedef struct {
+     int n11, n1_, n_1, n;
+     double p;
+ } hgacc_t;
+
+ // incremental version of hypergeometric distribution
+ //
+ // Called with non-zero margins it (re)initializes aux.  Called with all of
+ // n1_, n_1 and n equal to 0 it keeps the stored margins and only moves n11;
+ // when n11 is one step away from the stored value the probability is updated
+ // by a single multiplication.  Whenever n11 is a multiple of 11 (or the
+ // divisor of the update would be 0) the probability is recomputed from
+ // scratch, which also limits the accumulation of rounding error.
+ static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux)
+ {
+     if (n1_ || n_1 || n) {
+         aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
+     } else { // then only n11 changed; the rest fixed
+         if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
+             if (n11 == aux->n11 + 1) { // incremental
+                 aux->p *= (double)(aux->n1_ - aux->n11) / n11
+                     * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
+                 aux->n11 = n11;
+                 return aux->p;
+             }
+             if (n11 == aux->n11 - 1) { // incremental
+                 aux->p *= (double)aux->n11 / (aux->n1_ - n11)
+                     * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
+                 aux->n11 = n11;
+                 return aux->p;
+             }
+         }
+         aux->n11 = n11;
+     }
+     aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
+     return aux->p;
+ }
+
+ // Fisher's exact test on the 2x2 table (n11 n12 / n21 n22).
+ //
+ // Returns the probability of the observed table and stores the left-tail,
+ // right-tail and two-tail P-values in *_left, *_right and *two.  When the
+ // margins admit a single table, all four values are 1.
+ double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two)
+ {
+     int i, j, max, min;
+     double p, q, left, right;
+     hgacc_t aux;
+     int n1_, n_1, n;
+
+     n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
+     max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
+     min = n1_ + n_1 - n;
+     if (min < 0) min = 0; // min n11, for left tail
+     *two = *_left = *_right = 1.;
+     if (min == max) return 1.; // no need to do test
+     q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
+     // left tail; the bound on i keeps the walk inside [min, max] even if
+     // the running probability never reaches q because of rounding
+     p = hypergeo_acc(min, 0, 0, 0, &aux);
+     for (left = 0., i = min + 1; p < 0.99999999 * q && i <= max; ++i) // loop until underflow
+         left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
+     --i;
+     if (p < 1.00000001 * q) left += p;
+     else --i;
+     // right tail, bounded from below in the same way
+     p = hypergeo_acc(max, 0, 0, 0, &aux);
+     for (right = 0., j = max - 1; p < 0.99999999 * q && j >= min; --j) // loop until underflow
+         right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
+     ++j;
+     if (p < 1.00000001 * q) right += p;
+     else ++j;
+     // two-tail
+     *two = left + right;
+     if (*two > 1.) *two = 1.;
+     // adjust left and right
+     if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
+     else left = 1.0 - right + q;
+     *_left = left; *_right = right;
+     return q;
+ }
+
+#ifdef FET_MAIN
+#include <stdio.h>
+
+ // Stand-alone driver: reads "id n11 n12 n21 n22" records from stdin and
+ // prints each table followed by its probability and the left, right and
+ // two-tail P-values.  Reading stops at the first record that does not parse.
+ int main(int argc, char *argv[])
+ {
+     char id[1024];
+     int n11, n12, n21, n22;
+     double left, right, twotail, prob;
+
+     // the field width keeps an over-long identifier from overflowing id[]
+     while (scanf("%1023s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) {
+         prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail);
+         printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22,
+                 prob, left, right, twotail);
+     }
+     return 0;
+ }
+#endif
diff --git a/samtools-0.1.19/bcftools/index.c b/samtools-0.1.19/bcftools/index.c
new file mode 100644
index 0000000..a7db24f
--- /dev/null
+++ b/samtools-0.1.19/bcftools/index.c
@@ -0,0 +1,336 @@
+#include <assert.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#include "bam_endian.h"
+#include "kstring.h"
+#include "bcf.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+#define TAD_LIDX_SHIFT 13 // positions are shifted right by this to get a bin number, i.e. 2^13 positions per bin
+
+ typedef struct { // linear index of one reference sequence
+ int32_t n, m; // n: number of bins in use; m: number of bins allocated in offset
+ uint64_t *offset; // offset[i]: file offset recorded for bin i; 0 while the bin is unset
+ } bcf_lidx_t;
+
+ struct __bcf_idx_t { // index of a whole file
+ int32_t n; // number of entries in index2
+ bcf_lidx_t *index2; // one linear index per reference sequence
+ };
+
+/************
+ * indexing *
+ ************/
+
+ // Record `offset` for every bin overlapped by [_beg, _end) that has no
+ // offset yet, growing the offset array (zero-filled) when needed and
+ // raising index2->n to cover the last bin touched.
+ static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset)
+ {
+     const int first = _beg >> TAD_LIDX_SHIFT;
+     const int last = (_end - 1) >> TAD_LIDX_SHIFT;
+     int bin;
+     if (index2->m <= last) {
+         const int prev_m = index2->m;
+         index2->m = last + 1;
+         kroundup32(index2->m);
+         index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+         memset(index2->offset + prev_m, 0, 8 * (index2->m - prev_m));
+     }
+     // only the first record reaching a bin defines its offset
+     for (bin = first; bin <= last; ++bin)
+         if (index2->offset[bin] == 0) index2->offset[bin] = offset;
+     if (index2->n <= last) index2->n = last + 1;
+ }
+
+ // Build the linear index of an opened BCF stream.
+ //
+ // bp must be positioned at the first record and h must be its header.  Every
+ // record is read once; the file offset at which it starts is stored for
+ // each bin it overlaps.
+ //
+ // Returns a new index to be released with bcf_idx_destroy(), or 0 if
+ // memory runs out, a record names a reference outside the header, or the
+ // records of one reference are not sorted by position.
+ bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h)
+ {
+     bcf_idx_t *idx;
+     int32_t last_coor, last_tid;
+     uint64_t last_off;
+     BGZF *fp = bp->fp;
+     bcf1_t *b;
+     int i;
+
+     b = calloc(1, sizeof(bcf1_t));
+     idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
+     if (b == 0 || idx == 0) {
+         free(b); free(idx);
+         return 0;
+     }
+     idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t));
+     if (idx->index2 == 0 && h->n_ref > 0) {
+         free(b); free(idx);
+         return 0;
+     }
+     idx->n = h->n_ref;
+
+     last_tid = -1; last_coor = -1;
+     last_off = bgzf_tell(fp);
+     while (bcf_read(bp, h, b) > 0) {
+         int end, tmp;
+         if (b->tid < 0 || b->tid >= idx->n) { // would index past index2[]
+             fprintf(stderr, "[bcf_idx_core] reference ID out of range\n");
+             goto fail;
+         }
+         if (last_tid != b->tid) { // change of chromosomes
+             last_tid = b->tid;
+         } else if (last_coor > b->pos) {
+             fprintf(stderr, "[bcf_idx_core] the input is out of order\n");
+             goto fail;
+         }
+         tmp = strlen(b->ref);
+         end = b->pos + (tmp > 0? tmp : 1);
+         insert_offset2(&idx->index2[b->tid], b->pos, end, last_off);
+         last_off = bgzf_tell(fp);
+         last_coor = b->pos;
+     }
+     bcf_destroy(b);
+     return idx;
+
+ fail:
+     // release the partially built index, including every offset array
+     for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset);
+     free(idx->index2);
+     free(idx);
+     bcf_destroy(b);
+     return 0;
+ }
+
+ // Release an index together with all of its per-reference offset arrays.
+ // A null idx is accepted and ignored.
+ void bcf_idx_destroy(bcf_idx_t *idx)
+ {
+     int ref;
+     if (!idx) return;
+     ref = idx->n;
+     while (ref-- > 0) free(idx->index2[ref].offset);
+     free(idx->index2);
+     free(idx);
+ }
+
+/******************
+ * index file I/O *
+ ******************/
+
+ // Write an index to fp: the 4-byte magic "BCI\4", the number of references,
+ // then for each reference its bin count followed by that many 8-byte
+ // offsets.  On a big-endian host every integer is byte-swapped before it is
+ // written, and the offsets are swapped back afterwards so the in-memory
+ // index is left unchanged.
+ void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp)
+ {
+     const int32_t big_endian = bam_is_big_endian();
+     int32_t ref;
+     bgzf_write(fp, "BCI\4", 4);
+     if (!big_endian) {
+         bgzf_write(fp, &idx->n, 4);
+     } else {
+         uint32_t n_refs = idx->n;
+         bgzf_write(fp, bam_swap_endian_4p(&n_refs), 4);
+     }
+     for (ref = 0; ref < idx->n; ++ref) {
+         bcf_lidx_t *lidx = &idx->index2[ref];
+         // write linear index (index2)
+         if (!big_endian) {
+             bgzf_write(fp, &lidx->n, 4);
+             bgzf_write(fp, lidx->offset, 8 * lidx->n);
+         } else {
+             int n_bins = lidx->n, bin;
+             bgzf_write(fp, bam_swap_endian_4p(&n_bins), 4);
+             for (bin = 0; bin < lidx->n; ++bin)
+                 bam_swap_endian_8p(&lidx->offset[bin]);
+             bgzf_write(fp, lidx->offset, 8 * lidx->n);
+             for (bin = 0; bin < lidx->n; ++bin) // restore host byte order
+                 bam_swap_endian_8p(&lidx->offset[bin]);
+         }
+     }
+ }
+
+ // Read an index from an opened stream.
+ //
+ // The stream must start with the magic "BCI\4", followed by the number of
+ // references and, per reference, a bin count and that many 8-byte offsets.
+ // Integers are byte-swapped on big-endian hosts.
+ //
+ // Returns a new index to be released with bcf_idx_destroy(), or 0 if fp is
+ // null, the magic does not match, the data is truncated or holds a negative
+ // count, or memory runs out.
+ static bcf_idx_t *bcf_idx_load_core(BGZF *fp)
+ {
+     int i, ti_is_be;
+     char magic[4];
+     bcf_idx_t *idx;
+     ti_is_be = bam_is_big_endian();
+     if (fp == 0) {
+         fprintf(stderr, "[%s] fail to load index.\n", __func__);
+         return 0;
+     }
+     // a short read must not be compared: magic[] would be uninitialized
+     if (bgzf_read(fp, magic, 4) != 4 || strncmp(magic, "BCI\4", 4)) {
+         fprintf(stderr, "[%s] wrong magic number.\n", __func__);
+         return 0;
+     }
+     idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
+     if (idx == 0) return 0;
+     if (bgzf_read(fp, &idx->n, 4) != 4) {
+         fprintf(stderr, "[%s] truncated or corrupted index.\n", __func__);
+         free(idx);
+         return 0;
+     }
+     if (ti_is_be) bam_swap_endian_4p(&idx->n);
+     if (idx->n < 0) {
+         fprintf(stderr, "[%s] truncated or corrupted index.\n", __func__);
+         free(idx);
+         return 0;
+     }
+     idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t));
+     if (idx->index2 == 0 && idx->n > 0) {
+         free(idx);
+         return 0;
+     }
+     for (i = 0; i < idx->n; ++i) {
+         bcf_lidx_t *index2 = idx->index2 + i;
+         int64_t n_bytes;
+         int j;
+         if (bgzf_read(fp, &index2->n, 4) != 4) {
+             index2->n = 0;
+             goto fail;
+         }
+         if (ti_is_be) bam_swap_endian_4p(&index2->n);
+         if (index2->n < 0) {
+             index2->n = 0;
+             goto fail;
+         }
+         index2->m = index2->n;
+         index2->offset = (uint64_t*)calloc(index2->m, 8);
+         if (index2->offset == 0 && index2->n > 0) goto fail;
+         n_bytes = (int64_t)index2->n * 8;
+         if (bgzf_read(fp, index2->offset, n_bytes) != n_bytes) goto fail;
+         if (ti_is_be)
+             for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+     }
+     return idx;
+
+ fail:
+     // entries not reached yet are still zeroed, so destroying is safe
+     fprintf(stderr, "[%s] truncated or corrupted index.\n", __func__);
+     bcf_idx_destroy(idx);
+     return 0;
+ }
+
+ // Load an index from the local file fnidx.  Returns 0 if the file cannot
+ // be opened or bcf_idx_load_core() rejects its content.
+ bcf_idx_t *bcf_idx_load_local(const char *fnidx)
+ {
+     bcf_idx_t *idx;
+     BGZF *fp = bgzf_open(fnidx, "r");
+     if (fp == 0) return 0;
+     idx = bcf_idx_load_core(fp);
+     bgzf_close(fp);
+     return idx;
+ }
+
+#ifdef _USE_KNETFILE
+ // Copy the remote file at url (ftp:// or http:// only) into the working
+ // directory, under the name that follows the last '/' of the URL.  Other
+ // URLs are ignored.  Failures are reported on stderr; nothing is returned.
+ static void download_from_remote(const char *url)
+ {
+     const int buf_size = 1 * 1024 * 1024;
+     const char *fn;
+     FILE *fp;
+     uint8_t *buf;
+     knetFile *fp_remote;
+     int l;
+     if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
+     fn = strrchr(url, '/') + 1; // the scheme prefix guarantees a '/'; fn is the file name
+     fp_remote = knet_open(url, "r");
+     if (fp_remote == 0) {
+         fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+         return;
+     }
+     // the index is binary data, so do not let the C library translate it
+     if ((fp = fopen(fn, "wb")) == 0) {
+         fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+         knet_close(fp_remote);
+         return;
+     }
+     buf = (uint8_t*)calloc(buf_size, 1);
+     if (buf == 0) {
+         fprintf(stderr, "[download_from_remote] insufficient memory.\n");
+         fclose(fp);
+         knet_close(fp_remote);
+         return;
+     }
+     // stop on end of stream (0) and on a read error (negative), which must
+     // never be handed to fwrite() as a length
+     while ((l = knet_read(fp_remote, buf, buf_size)) > 0) {
+         if (fwrite(buf, 1, l, fp) != (size_t)l) {
+             fprintf(stderr, "[download_from_remote] fail to write to the local file.\n");
+             break;
+         }
+     }
+     free(buf);
+     fclose(fp);
+     knet_close(fp_remote);
+ }
+#else
+ // Built without knetfile support: remote files cannot be fetched.
+ static void download_from_remote(const char *url)
+ {
+     return;
+ }
+#endif
+
+ // Return the name of a local ".bci" index for fn, or 0 if none exists.
+ //
+ // For a local fn this is simply "<fn>.bci".  For an ftp:// or http:// name
+ // the index is looked up in the working directory under the last path
+ // component of the URL; if it is not there yet a download is attempted
+ // first.  The returned string is heap-allocated and owned by the caller.
+ static char *get_local_version(const char *fn)
+ {
+     struct stat st;
+     char *path = (char*)calloc(strlen(fn) + 5, 1);
+     strcpy(path, fn);
+     strcat(path, ".bci");
+     if (strstr(path, "ftp://") == path || strstr(path, "http://") == path) {
+         char *url = path;
+         char *slash = url + (int)strlen(url) - 1;
+         while (slash >= url && *slash != '/') --slash;
+         path = strdup(slash + 1); // file name part of the URL
+         if (stat(path, &st) == 0) { // already present locally
+             free(url);
+             return path;
+         }
+         fprintf(stderr, "[%s] downloading the index file...\n", __func__);
+         download_from_remote(url);
+         free(url);
+     }
+     if (stat(path, &st) != 0) {
+         free(path);
+         return 0;
+     }
+     return path;
+ }
+
+ // Load the index that belongs to the BCF file (or URL) fn.  Returns 0 if
+ // no local index file can be found or obtained, or if it cannot be read.
+ bcf_idx_t *bcf_idx_load(const char *fn)
+ {
+     bcf_idx_t *idx = 0;
+     char *local = get_local_version(fn);
+     if (local != 0) {
+         idx = bcf_idx_load_local(local);
+         free(local);
+     }
+     return idx;
+ }
+
+ // Build the index of the BCF file fn and write it to _fnidx, or to
+ // "<fn>.bci" when _fnidx is 0.
+ //
+ // Returns 0 on success and -1 if the input cannot be opened or read, the
+ // index cannot be built (e.g. unsorted input), or the index file cannot be
+ // created.
+ int bcf_idx_build2(const char *fn, const char *_fnidx)
+ {
+     char *fnidx;
+     BGZF *fpidx;
+     bcf_t *bp;
+     bcf_idx_t *idx;
+     bcf_hdr_t *h;
+     if ((bp = bcf_open(fn, "r")) == 0) {
+         fprintf(stderr, "[bcf_idx_build2] fail to open the BCF file.\n");
+         return -1;
+     }
+     h = bcf_hdr_read(bp);
+     if (h == 0) {
+         fprintf(stderr, "[bcf_idx_build2] fail to read the header.\n");
+         bcf_close(bp);
+         return -1;
+     }
+     idx = bcf_idx_core(bp, h);
+     bcf_hdr_destroy(h); // the index keeps no reference to the header
+     bcf_close(bp);
+     if (idx == 0) return -1; // bcf_idx_core() has already reported the reason
+     if (_fnidx == 0) {
+         fnidx = (char*)calloc(strlen(fn) + 5, 1);
+         if (fnidx) {
+             strcpy(fnidx, fn); strcat(fnidx, ".bci");
+         }
+     } else fnidx = strdup(_fnidx);
+     fpidx = fnidx? bgzf_open(fnidx, "w") : 0;
+     if (fpidx == 0) {
+         fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n");
+         free(fnidx);
+         bcf_idx_destroy(idx);
+         return -1;
+     }
+     bcf_idx_save(idx, fpidx);
+     bcf_idx_destroy(idx);
+     bgzf_close(fpidx);
+     free(fnidx);
+     return 0;
+ }
+
+ // Build the index of fn under its default name, "<fn>.bci".
+ // Returns whatever bcf_idx_build2() returns.
+ int bcf_idx_build(const char *fn)
+ {
+     const char *default_name = 0; // 0 selects "<fn>.bci"
+     return bcf_idx_build2(fn, default_name);
+ }
+
+/********************************************
+ * parse a region in the format chr:beg-end *
+ ********************************************/
+
+ /* Parse "chr", "chr:beg" or "chr:beg-end"; commas and white space are
+  * ignored.  The name is looked up through bcf_str2id(str2id, ...).
+  *
+  * On success *tid receives the reference ID, *begin the start converted
+  * to a 0-based coordinate and *end the end as given; a missing end (or a
+  * bare name) yields 1<<29.  Returns 0 on success and -1 if the name is
+  * unknown, begin is greater than end, or memory cannot be allocated.
+  */
+ int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end)
+ {
+     char *s, *p;
+     int i, l, k;
+     l = strlen(str);
+     s = (char*)malloc(l+1);
+     if (s == 0) return -1;
+     /* squeeze out "," and white space; the cast keeps isspace() defined
+      * for bytes with the high bit set */
+     for (i = k = 0; i != l; ++i)
+         if (str[i] != ',' && !isspace((unsigned char)str[i])) s[k++] = str[i];
+     s[k] = 0;
+     for (i = 0; i != k; ++i) if (s[i] == ':') break;
+     s[i] = 0; /* terminate the sequence name */
+     if ((*tid = bcf_str2id(str2id, s)) < 0) {
+         free(s);
+         return -1;
+     }
+     if (i == k) { /* dump the whole sequence */
+         *begin = 0; *end = 1<<29; free(s);
+         return 0;
+     }
+     for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+     *begin = atoi(p);
+     if (i < k) {
+         p = s + i + 1;
+         *end = atoi(p);
+     } else *end = 1<<29;
+     if (*begin > 0) --*begin;
+     free(s);
+     if (*begin > *end) return -1;
+     return 0;
+ }
+
+/*******************************
+ * retrieve a specified region *
+ *******************************/
+
+ // Return the file offset from which to start reading in order to reach
+ // records of reference tid at or after position beg.
+ //
+ // The first recorded offset at or after the bin holding beg is used; if
+ // there is none, the offset of the last bin of that reference is returned.
+ // 0 is returned when idx is null, tid is outside the index, or the
+ // reference has no bins at all.
+ uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg)
+ {
+     const bcf_lidx_t *lidx;
+     int i;
+     if (idx == 0 || tid < 0 || tid >= idx->n) return 0;
+     lidx = &idx->index2[tid];
+     if (lidx->n <= 0) return 0; // nothing indexed: offset[n-1] would be out of bounds
+     if (beg < 0) beg = 0;
+     for (i = beg>>TAD_LIDX_SHIFT; i < lidx->n && lidx->offset[i] == 0; ++i);
+     // ">=" rather than "==": the starting bin itself may lie beyond the last bin
+     return (i >= lidx->n)? lidx->offset[lidx->n-1] : lidx->offset[i];
+ }
+
+ // Entry point of "bcftools index <in.bcf>".  Returns 1 when the file
+ // argument is missing or the index cannot be built, 0 otherwise.
+ int bcf_main_index(int argc, char *argv[])
+ {
+     if (argc < 2) {
+         fprintf(stderr, "Usage: bcftools index <in.bcf>\n");
+         return 1;
+     }
+     // propagate a build failure instead of always reporting success
+     return bcf_idx_build(argv[1]) < 0? 1 : 0;
+ }
diff --git a/samtools-0.1.19/bcftools/kfunc.c b/samtools-0.1.19/bcftools/kfunc.c
new file mode 100644
index 0000000..a637b6c
--- /dev/null
+++ b/samtools-0.1.19/bcftools/kfunc.c
@@ -0,0 +1,162 @@
+#include <math.h>
+
+
+/* Log gamma function
+ * \log{\Gamma(z)}
+ * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+ */
+double kf_lgamma(double z)
+{
+    /* numerators of the series terms over (z+7), (z+6), ..., (z+1), in the
+     * order in which they are accumulated */
+    static const double numer[7] = {
+        0.1659470187408462e-06, 0.9934937113930748e-05, -0.1385710331296526,
+        12.50734324009056, -176.6150291498386, 771.3234287757674,
+        -1259.139216722289
+    };
+    double series = 0;
+    int i;
+    for (i = 0; i < 7; ++i)
+        series += numer[i] / (z + (7 - i));
+    series += 676.5203681218835 / z; /* the term over z itself */
+    series += 0.9999999999995183;
+    return log(series) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
+}
+
+/* complementary error function
+ * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
+ * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
+ */
+double kf_erfc(double x)
+{
+ const double p0 = 220.2068679123761;
+ const double p1 = 221.2135961699311;
+ const double p2 = 112.0792914978709;
+ const double p3 = 33.912866078383;
+ const double p4 = 6.37396220353165;
+ const double p5 = .7003830644436881;
+ const double p6 = .03526249659989109;
+ const double q0 = 440.4137358247522;
+ const double q1 = 793.8265125199484;
+ const double q2 = 637.3336333788311;
+ const double q3 = 296.5642487796737;
+ const double q4 = 86.78073220294608;
+ const double q5 = 16.06417757920695;
+ const double q6 = 1.755667163182642;
+ const double q7 = .08838834764831844;
+ double expntl, z, p;
+ z = fabs(x) * M_SQRT2;
+ if (z > 37.) return x > 0.? 0. : 2.;
+ expntl = exp(z * z * - .5);
+ if (z < 10. / M_SQRT2) // for small z
+ p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
+ / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
+ else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
+ return x > 0.? 2. * p : 2. * (1. - p);
+}
+
+/* The following computes regularized incomplete gamma functions.
+ * Formulas are taken from Wiki, with additional input from Numerical
+ * Recipes in C (for modified Lentz's algorithm) and AS245
+ * (http://lib.stat.cmu.edu/apstat/245).
+ *
+ * A good online calculator is available at:
+ *
+ * http://www.danielsoper.com/statcalc/calc23.aspx
+ *
+ * It calculates upper incomplete gamma function, which equals
+ * kf_gammaq(s,z)*tgamma(s).
+ */
+
+/* convergence threshold used to stop the series and continued-fraction loops below */
+#define KF_GAMMA_EPS 1e-14
+/* floor applied to the C and D terms of the continued-fraction loops below */
+#define KF_TINY 1e-290
+
+// regularized lower incomplete gamma function, by series expansion
+static double _kf_gammap(double s, double z)
+{
+    /* term holds z^k / ((s+1)...(s+k)); total is the running sum of terms */
+    double term = 1., total = 1.;
+    int k = 1;
+    while (k < 100) {
+        term *= z / (s + k);
+        total += term;
+        if (term / total < KF_GAMMA_EPS) break; /* converged */
+        ++k;
+    }
+    return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(total));
+}
+// regularized upper incomplete gamma function, by continued fraction
+static double _kf_gammaq(double s, double z)
+{
+    double frac = 1. + z - s; /* running value of the continued fraction */
+    double c_term = frac, d_term = 0.;
+    int j = 1;
+    // Modified Lentz's algorithm for computing continued fraction
+    // See Numerical Recipes in C, 2nd edition, section 5.2
+    while (j < 100) {
+        const double a_j = j * (s - j);
+        const double b_j = (j<<1) + 1 + z - s;
+        double delta;
+        d_term = b_j + a_j * d_term;
+        if (d_term < KF_TINY) d_term = KF_TINY;
+        c_term = b_j + a_j / c_term;
+        if (c_term < KF_TINY) c_term = KF_TINY;
+        d_term = 1. / d_term;
+        delta = c_term * d_term;
+        frac *= delta;
+        if (fabs(delta - 1.) < KF_GAMMA_EPS) break; /* converged */
+        ++j;
+    }
+    return exp(s * log(z) - z - kf_lgamma(s) - log(frac));
+}
+
+/* Regularized lower incomplete gamma function.
+ * Uses the series directly when z <= 1 or z < s, otherwise takes the
+ * complement of the continued-fraction result. */
+double kf_gammap(double s, double z)
+{
+    if (z <= 1. || z < s)
+        return _kf_gammap(s, z);
+    return 1. - _kf_gammaq(s, z);
+}
+
+/* Regularized upper incomplete gamma function.
+ * Takes the complement of the series result when z <= 1 or z < s, otherwise
+ * uses the continued fraction directly. */
+double kf_gammaq(double s, double z)
+{
+    if (z <= 1. || z < s)
+        return 1. - _kf_gammap(s, z);
+    return _kf_gammaq(s, z);
+}
+
+/* Regularized incomplete beta function. The method is taken from
+ * Numerical Recipes in C, 2nd edition, section 6.4. The following web
+ * page calculates the incomplete beta function, which equals
+ * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
+ *
+ * http://www.danielsoper.com/statcalc/calc36.aspx
+ */
+static double kf_betai_aux(double a, double b, double x)
+{
+    double c_term, d_term, frac;
+    int j;
+    /* the end points are returned directly */
+    if (x == 0.) return 0.;
+    if (x == 1.) return 1.;
+    frac = 1.; c_term = frac; d_term = 0.;
+    // Modified Lentz's algorithm for computing continued fraction
+    for (j = 1; j < 200; ++j) {
+        const int m = j>>1;
+        double coef, delta;
+        if (j&1) /* odd-numbered coefficient */
+            coef = -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1));
+        else     /* even-numbered coefficient */
+            coef = m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
+        d_term = 1. + coef * d_term;
+        if (d_term < KF_TINY) d_term = KF_TINY;
+        c_term = 1. + coef / c_term;
+        if (c_term < KF_TINY) c_term = KF_TINY;
+        d_term = 1. / d_term;
+        delta = c_term * d_term;
+        frac *= delta;
+        if (fabs(delta - 1.) < KF_GAMMA_EPS) break; /* converged */
+    }
+    return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / frac;
+}
+/* Regularized incomplete beta function.
+ * Below the switch point (a+1)/(a+b+2) the continued fraction is evaluated
+ * directly; otherwise it is evaluated with a and b swapped at 1-x and the
+ * complement is returned. */
+double kf_betai(double a, double b, double x)
+{
+    const double switch_point = (a + 1.) / (a + b + 2.);
+    if (x < switch_point)
+        return kf_betai_aux(a, b, x);
+    return 1. - kf_betai_aux(b, a, 1. - x);
+}
+
+/* Optional self-test driver, compiled only when KF_MAIN is defined.
+ * Prints a few values so they can be compared by eye with the C library
+ * (erfc) or with an external calculator. */
+#ifdef KF_MAIN
+#include <stdio.h>
+int main(int argc, char *argv[])
+{
+ double x = 5.5, y = 3;
+ double a, b;
+ // library erfc() next to kf_erfc() for the same argument
+ printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
+ // kf_gammaq(y, x) scaled by tgamma(y) gives the non-regularized upper incomplete gamma
+ printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
+ a = 2; b = 2; x = 0.5;
+ // dividing by gamma(a+b)/(gamma(a)*gamma(b)) removes the regularization of kf_betai()
+ printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
+ return 0;
+}
+#endif
diff --git a/samtools-0.1.19/bcftools/kmin.c b/samtools-0.1.19/bcftools/kmin.c
new file mode 100644
index 0000000..5b8193b
--- /dev/null
+++ b/samtools-0.1.19/bcftools/kmin.c
@@ -0,0 +1,209 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Hooke-Jeeves algorithm for nonlinear minimization
+
+ Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
+ the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
+ papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
+ 6(6):313-314). The original algorithm was designed by Hooke and
+ Jeeves (ACM 8:212-229). This program is further revised according to
+ Johnson's implementation at Netlib (opt/hooke.c).
+
+ Hooke-Jeeves algorithm is very simple and it works quite well on a
+ few examples. However, it might fail to converge due to its heuristic
+ nature. A possible improvement, as is suggested by Johnson, may be to
+ choose a small r at the beginning to quickly approach to the minimum
+ and a large r at later step to hit the minimum.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "kmin.h"
+
+/* Exploratory move: probe each coordinate of x1 by +dx[k], then by -dx[k],
+ * keeping a displacement only when it lowers the objective below fx1.
+ * dx[k] has its sign flipped whenever the opposite direction is tried.
+ * *n_calls is increased by the number of evaluations of func.
+ * Returns the (possibly lowered) objective value. */
+static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
+{
+    int dim, evals = *n_calls;
+    for (dim = 0; dim != n; ++dim) {
+        double trial;
+        x1[dim] += dx[dim];
+        trial = func(n, x1, data); ++evals;
+        if (trial < fx1) { /* forward step helped: keep it */
+            fx1 = trial;
+            continue;
+        }
+        /* search the opposite direction */
+        dx[dim] = 0.0 - dx[dim];
+        x1[dim] += dx[dim] + dx[dim];
+        trial = func(n, x1, data); ++evals;
+        if (trial < fx1) fx1 = trial;
+        else x1[dim] -= dx[dim]; /* back to the original x[k] */
+    }
+    *n_calls = evals;
+    return fx1; /* here: fx1=f(n,x1) */
+}
+
+/* Hooke-Jeeves direct search for a minimum of func.
+ *
+ *   func      - objective, called as func(n, point, data)
+ *   n         - number of variables
+ *   x         - in: starting point; out: best point accepted by the search
+ *   data      - passed through to func untouched
+ *   r         - step scale: initial steps are |x[k]|*r (r when x[k] is 0)
+ *               and are multiplied by r after every outer iteration
+ *   eps       - the search stops once the radius (starting at r and
+ *               multiplied by r each round) drops below eps
+ *   max_calls - soft cap on the number of evaluations of func
+ *
+ * Returns the objective value at the point left in x. If the two scratch
+ * arrays cannot be allocated, x is left untouched and func(n, x, data) is
+ * returned. */
+double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
+{
+    double fx, fx1, *x1, *dx, radius;
+    int k, n_calls = 0;
+    x1 = (double*)calloc(n, sizeof(double));
+    dx = (double*)calloc(n, sizeof(double));
+    if (x1 == 0 || dx == 0) { /* out of memory: no search is possible */
+        free(x1); free(dx);
+        return func(n, x, data);
+    }
+    for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
+        dx[k] = fabs(x[k]) * r;
+        if (dx[k] == 0) dx[k] = r;
+    }
+    radius = r;
+    fx1 = fx = func(n, x, data); ++n_calls;
+    for (;;) {
+        memcpy(x1, x, n * sizeof(double)); /* x1 = x */
+        fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
+        while (fx1 < fx) {
+            /* accept x1 as the new base point and extrapolate along x1 - x */
+            for (k = 0; k != n; ++k) {
+                double t = x[k];
+                dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]);
+                x[k] = x1[k];
+                x1[k] = x1[k] + x1[k] - t;
+            }
+            fx = fx1;
+            if (n_calls >= max_calls) break;
+            fx1 = func(n, x1, data); ++n_calls;
+            fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
+            if (fx1 >= fx) break;
+            /* stop when x1 is within half a step of x in every coordinate */
+            for (k = 0; k != n; ++k)
+                if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
+            if (k == n) break;
+        }
+        if (radius >= eps) {
+            if (n_calls >= max_calls) break;
+            radius *= r;
+            for (k = 0; k != n; ++k) dx[k] *= r;
+        } else break; /* converge */
+    }
+    free(x1); free(dx);
+    /* fx is the value at x; fx1 may belong to a trial point that was rejected */
+    return fx;
+}
+
+// I copied this function from somewhere several years ago and made some modifications of my own, but I have forgotten the source.
+/* One-dimensional minimization of func.
+ *
+ *   func - objective, called as func(point, data)
+ *   a, b - two starting abscissae; they are swapped if needed so that the
+ *          search walks downhill from a towards b
+ *   data - passed through to func untouched
+ *   tol  - relative tolerance; the working tolerance is tol*|b| + 1e-20
+ *   xmin - out: abscissa of the minimum found
+ *
+ * Phase 1 extends (a,b) downhill with golden-section and parabolic
+ * extrapolation until three points a, b, c bracket a minimum; this loop has
+ * no iteration cap. Phase 2 refines b with parabolic interpolation, falling
+ * back to golden-section steps, for at most 100 iterations.
+ * Returns the objective value at *xmin (the best point seen if the
+ * iteration limit is reached). */
+double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
+{
+ double bound, u, r, q, fu, tmp, fa, fb, fc, c;
+ const double gold1 = 1.6180339887;
+ const double gold2 = 0.3819660113;
+ const double tiny = 1e-20;
+ const int max_iter = 100;
+
+ double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
+ int iter;
+
+ fa = func(a, data); fb = func(b, data);
+ if (fb > fa) { // swap, such that f(a) > f(b)
+ tmp = a; a = b; b = tmp;
+ tmp = fa; fa = fb; fb = tmp;
+ }
+ c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
+ // phase 1: keep stepping downhill until f(b) is no larger than f(c)
+ while (fb > fc) {
+ bound = b + 100.0 * (c - b); // the farthest point where we want to go
+ r = (b - a) * (fb - fc);
+ q = (b - c) * (fb - fa);
+ if (fabs(q - r) < tiny) { // avoid 0 denominator
+ tmp = q > r? tiny : 0.0 - tiny;
+ } else tmp = q - r;
+ u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
+ if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
+ fu = func(u, data);
+ if (fu < fc) { // (b,u,c) bracket the minimum
+ a = b; b = u; fa = fb; fb = fu;
+ break;
+ } else if (fu > fb) { // (a,b,u) bracket the minimum
+ c = u; fc = fu;
+ break;
+ }
+ u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
+ } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
+ fu = func(u, data);
+ if (fu < fc) { // fb > fc > fu
+ b = c; c = u; u = c + gold1 * (c - b);
+ fb = fc; fc = fu; fu = func(u, data);
+ } else { // (b,c,u) bracket the minimum
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ break;
+ }
+ } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
+ u = bound; fu = func(u, data);
+ } else { // u goes the other way around, use golden section extrapolation
+ u = c + gold1 * (c - b); fu = func(u, data);
+ }
+ // shift the triple one step forward: (a,b,c) <- (b,c,u)
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ }
+ if (a > c) u = a, a = c, c = u; // swap
+
+ // now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
+ // b: best point so far; w, v: the two previous points kept for the parabola;
+ // d: current step; e: an earlier step, used to decide whether a parabolic step is acceptable
+ e = d = 0.0;
+ w = v = b; fv = fw = fb;
+ for (iter = 0; iter != max_iter; ++iter) {
+ mid = 0.5 * (a + c);
+ tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny);
+ if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
+ *xmin = b; return fb; // found
+ }
+ if (fabs(e) > tol1) {
+ // related to parabolic interpolation
+ r = (b - w) * (fb - fv);
+ q = (b - v) * (fb - fw);
+ p = (b - v) * q - (b - w) * r;
+ q = 2.0 * (q - r);
+ if (q > 0.0) p = 0.0 - p;
+ else q = 0.0 - q;
+ eold = e; e = d;
+ // reject the parabolic step if it is too large or leaves (a,c)
+ if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
+ d = gold2 * (e = (b >= mid ? a - b : c - b));
+ } else {
+ d = p / q; u = b + d; // actual parabolic interpolation happens here
+ if (u - a < tol2 || c - u < tol2)
+ d = (mid > b)? tol1 : 0.0 - tol1;
+ }
+ } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
+ // never evaluate closer than tol1 to b
+ u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1);
+ fu = func(u, data);
+ if (fu <= fb) { // u is the minimum point so far
+ if (u >= b) a = b;
+ else c = b;
+ v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
+ } else { // adjust (a,c) and (u,v,w)
+ if (u < b) a = u;
+ else c = u;
+ if (fu <= fw || w == b) {
+ v = w; w = u;
+ fv = fw; fw = fu;
+ } else if (fu <= fv || v == b || v == w) {
+ v = u; fv = fu;
+ }
+ }
+ }
+ // iteration limit reached: report the best point found
+ *xmin = b;
+ return fb;
+}
diff --git a/samtools-0.1.19/bcftools/kmin.h b/samtools-0.1.19/bcftools/kmin.h
new file mode 100644
index 0000000..6feba45
--- /dev/null
+++ b/samtools-0.1.19/bcftools/kmin.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2008, 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KMIN_H
+#define KMIN_H
+
+/* NOTE(review): presumably suggested values for the r, eps and max_calls
+ * arguments of kmin_hj() -- confirm against the callers. */
+#define KMIN_RADIUS 0.5
+#define KMIN_EPS 1e-7
+#define KMIN_MAXCALL 50000
+
+/* multi-variable objective: (number of variables, point, user data) -> value */
+typedef double (*kmin_f)(int, double*, void*);
+/* single-variable objective: (point, user data) -> value */
+typedef double (*kmin1_f)(double, void*);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /* Hooke-Jeeves search starting from x; x is updated in place */
+ double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls);
+ /* one-dimensional minimization starting from a and b; the abscissa found is stored in *xmin */
+ double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/bcftools/main.c b/samtools-0.1.19/bcftools/main.c
new file mode 100644
index 0000000..eda6217
--- /dev/null
+++ b/samtools-0.1.19/bcftools/main.c
@@ -0,0 +1,191 @@
+#include <string.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "knetfile.h"
+#include "bcf.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 0x10000)
+
+int bcfview(int argc, char *argv[]);
+int bcf_main_index(int argc, char *argv[]);
+
+#define BUF_SIZE 0x10000
+
+/* Concatenate n BCF files onto standard output.
+ *
+ * The header of the first file is written through bcf_hdr_write(); the
+ * headers of the remaining files are read and discarded. The rest of every
+ * input, up to 28 bytes before its end, is then copied verbatim in chunks of
+ * at most BUF_SIZE bytes. NOTE(review): the 28 bytes left out are presumably
+ * the trailing end-of-file block of the compressed stream -- confirm.
+ *
+ * The copy is only implemented when _USE_KNETFILE is defined; otherwise the
+ * function aborts.
+ *
+ * Returns 0 on success and 1 if the buffer cannot be allocated, an input
+ * cannot be stat'ed, or a read/write fails (processing stops at the first
+ * failure). */
+int bcf_cat(int n, char * const *fn)
+{
+    int i, ret = 0;
+    bcf_t *out;
+    uint8_t *buf;
+    buf = malloc(BUF_SIZE);
+    if (buf == 0) {
+        fprintf(stderr, "[%s] fail to allocate the copy buffer.\n", __func__);
+        return 1;
+    }
+    out = bcf_open("-", "w");
+    for (i = 0; i < n && ret == 0; ++i) {
+        bcf_t *in;
+        bcf_hdr_t *h;
+        off_t end;
+        struct stat s;
+        in = bcf_open(fn[i], "r");
+        h = bcf_hdr_read(in);
+        if (i == 0) bcf_hdr_write(out, h);
+        bcf_hdr_destroy(h);
+#ifdef _USE_KNETFILE
+        if (fstat(knet_fileno((knetFile*)in->fp->fp), &s) != 0) {
+            fprintf(stderr, "[%s] fail to stat '%s'.\n", __func__, fn[i]);
+            ret = 1;
+        } else {
+            end = s.st_size - 28;
+            while (knet_tell((knetFile*)in->fp->fp) < end) {
+                int size = knet_tell((knetFile*)in->fp->fp) + BUF_SIZE < end? BUF_SIZE : end - knet_tell((knetFile*)in->fp->fp);
+                off_t got = knet_read(in->fp->fp, buf, size);
+                /* a failed read would leave the file position unchanged and
+                 * spin forever; a short write would silently truncate */
+                if (got <= 0 || fwrite(buf, 1, got, out->fp->fp) != (size_t)got) {
+                    fprintf(stderr, "[%s] I/O error while copying '%s'.\n", __func__, fn[i]);
+                    ret = 1;
+                    break;
+                }
+            }
+        }
+#else
+        abort(); // FIXME: not implemented
+#endif
+        bcf_close(in);
+    }
+    bcf_close(out);
+    free(buf);
+    return ret;
+}
+
+extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
+
+/* Entry point of "bcftools ldpair <in.bcf> <in.list>".
+ *
+ * Every line of <in.list> names two sites as blank- or tab-separated fields
+ * "chr0 pos0 chr1 pos1" (1-based positions; "=" as chr1 means "same as
+ * chr0"). For each pair both records are located through the BCF index,
+ * bcf_pair_freq() is evaluated on them and one output line is printed with
+ * the two sites, the squared result and the four values it stored in f[].
+ * Lines that cannot be parsed are skipped silently; pairs for which a
+ * record is not found in the BCF are skipped with a warning.
+ *
+ * Returns 0 on success and 1 on a usage error, when the list file cannot be
+ * opened, or when no index is found for the BCF. */
+int bcf_main_ldpair(int argc, char *argv[])
+{
+    bcf_t *fp;
+    bcf_hdr_t *h;
+    bcf1_t *b0, *b1;
+    bcf_idx_t *idx;
+    kstring_t str;
+    void *str2id;
+    gzFile fplist;
+    kstream_t *ks;
+    int dret, lineno = 0;
+    if (argc < 3) {
+        fprintf(stderr, "Usage: bcftools ldpair <in.bcf> <in.list>\n");
+        return 1;
+    }
+    fplist = gzopen(argv[2], "rb");
+    if (fplist == 0) {
+        fprintf(stderr, "[%s] fail to open the list file '%s'.\n", __func__, argv[2]);
+        return 1;
+    }
+    ks = ks_init(fplist);
+    memset(&str, 0, sizeof(kstring_t));
+    fp = bcf_open(argv[1], "rb");
+    h = bcf_hdr_read(fp);
+    str2id = bcf_build_refhash(h);
+    idx = bcf_idx_load(argv[1]);
+    if (idx == 0) {
+        fprintf(stderr, "[%s] No bcf index is found. Abort!\n", __func__);
+        /* release what has been acquired so far */
+        bcf_str2id_destroy(str2id);
+        bcf_hdr_destroy(h);
+        bcf_close(fp);
+        ks_destroy(ks);
+        gzclose(fplist);
+        return 1;
+    }
+    b0 = calloc(1, sizeof(bcf1_t));
+    b1 = calloc(1, sizeof(bcf1_t));
+    while (ks_getuntil(ks, '\n', &str, &dret) >= 0) {
+        char *p, *q;
+        int k;
+        int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1;
+        ++lineno;
+        /* split the line in place; q marks the start of the current field */
+        for (p = q = str.s, k = 0; *p; ++p) {
+            if (*p == ' ' || *p == '\t') {
+                *p = '\0';
+                if (k == 0) tid0 = bcf_str2id(str2id, q);
+                else if (k == 1) pos0 = atoi(q) - 1;
+                else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0;
+                else if (k == 3) pos1 = atoi(q) - 1;
+                q = p + 1;
+                ++k;
+            }
+        }
+        if (k == 3) pos1 = atoi(q) - 1; /* fourth field ended by the end of line */
+        if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) {
+            uint64_t off;
+            double r, f[4];
+            int ret;
+            off = bcf_idx_query(idx, tid0, pos0);
+            bgzf_seek(fp->fp, off, SEEK_SET);
+            while ((ret = bcf_read(fp, h, b0)) >= 0 && b0->pos != pos0);
+            if (ret < 0) { /* ran out of records: b0 does not hold the requested site */
+                fprintf(stderr, "[%s] site %s:%d not found; line %d skipped.\n", __func__, h->ns[tid0], pos0+1, lineno);
+                continue;
+            }
+            off = bcf_idx_query(idx, tid1, pos1);
+            bgzf_seek(fp->fp, off, SEEK_SET);
+            while ((ret = bcf_read(fp, h, b1)) >= 0 && b1->pos != pos1);
+            if (ret < 0) {
+                fprintf(stderr, "[%s] site %s:%d not found; line %d skipped.\n", __func__, h->ns[tid1], pos1+1, lineno);
+                continue;
+            }
+            r = bcf_pair_freq(b0, b1, f);
+            r *= r;
+            printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1,
+                r, f[0], f[1], f[2], f[3]);
+        } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno);
+    }
+    bcf_destroy(b0); bcf_destroy(b1);
+    bcf_idx_destroy(idx);
+    bcf_str2id_destroy(str2id);
+    bcf_hdr_destroy(h);
+    bcf_close(fp);
+    free(str.s);
+    ks_destroy(ks);
+    gzclose(fplist);
+    return 0;
+}
+
+/* Entry point of "bcftools ld <in.bcf>".
+ *
+ * Loads every record of the BCF into memory, then prints the number of
+ * records followed by a lower-triangular matrix: one line per record giving
+ * "name:pos", the squared bcf_pair_freq() result against each earlier
+ * record, and a closing 1.000 for the diagonal.
+ *
+ * Returns 0 on success and 1 on a usage error or when the record array
+ * cannot be grown (nothing is printed in that case). */
+int bcf_main_ld(int argc, char *argv[])
+{
+    bcf_t *fp;
+    bcf_hdr_t *h;
+    bcf1_t **b, *b0;
+    int i, j, m, n, status = 0;
+    double f[4];
+    if (argc == 1) {
+        fprintf(stderr, "Usage: bcftools ld <in.bcf>\n");
+        return 1;
+    }
+    fp = bcf_open(argv[1], "rb");
+    h = bcf_hdr_read(fp);
+    // read the entire BCF
+    m = n = 0; b = 0;
+    b0 = calloc(1, sizeof(bcf1_t));
+    while (bcf_read(fp, h, b0) >= 0) {
+        if (m == n) {
+            bcf1_t **grown;
+            m = m? m<<1 : 16;
+            /* keep the old array reachable if the reallocation fails */
+            grown = realloc(b, sizeof(*b) * m);
+            if (grown == 0) {
+                fprintf(stderr, "[%s] out of memory after %d records.\n", __func__, n);
+                status = 1;
+                break;
+            }
+            b = grown;
+        }
+        b[n] = calloc(1, sizeof(bcf1_t));
+        bcf_cpy(b[n++], b0);
+    }
+    bcf_destroy(b0);
+    if (status == 0) {
+        // compute pair-wise r^2
+        printf("%d\n", n); // the number of loci
+        for (i = 0; i < n; ++i) {
+            printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1);
+            for (j = 0; j < i; ++j) {
+                double r = bcf_pair_freq(b[i], b[j], f);
+                printf("\t%.3f", r*r);
+            }
+            printf("\t1.000\n");
+        }
+    }
+    // free
+    for (i = 0; i < n; ++i) bcf_destroy(b[i]);
+    free(b);
+    bcf_hdr_destroy(h);
+    bcf_close(fp);
+    return status;
+}
+
+/* Program entry: dispatch argv[1] to the matching sub-command.
+ * Without arguments the help text is printed and 1 is returned; an unknown
+ * command is reported and also yields 1. Otherwise the sub-command's return
+ * value becomes the exit status. */
+int main(int argc, char *argv[])
+{
+    const char *cmd;
+    if (argc == 1) {
+        fprintf(stderr,
+            "\n"
+            "Program: bcftools (Tools for data in the VCF/BCF formats)\n"
+            "Version: %s\n\n"
+            "Usage: bcftools <command> <arguments>\n\n"
+            "Command: view print, extract, convert and call SNPs from BCF\n"
+            " index index BCF\n"
+            " cat concatenate BCFs\n"
+            " ld compute all-pair r^2\n"
+            " ldpair compute r^2 between requested pairs\n"
+            "\n",
+            BCF_VERSION);
+        return 1;
+    }
+    cmd = argv[1];
+    /* sub-commands see their own name as argv[0] ... */
+    if (!strcmp(cmd, "view")) return bcfview(argc-1, argv+1);
+    if (!strcmp(cmd, "index")) return bcf_main_index(argc-1, argv+1);
+    if (!strcmp(cmd, "ld")) return bcf_main_ld(argc-1, argv+1);
+    if (!strcmp(cmd, "ldpair")) return bcf_main_ldpair(argc-1, argv+1);
+    /* ... except cat, which receives only the list of file names */
+    if (!strcmp(cmd, "cat")) return bcf_cat(argc-2, argv+2);
+    fprintf(stderr, "[main] Unrecognized command.\n");
+    return 1;
+}
diff --git a/samtools-0.1.19/bcftools/mut.c b/samtools-0.1.19/bcftools/mut.c
new file mode 100644
index 0000000..15ef265
--- /dev/null
+++ b/samtools-0.1.19/bcftools/mut.c
@@ -0,0 +1,127 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include "bcf.h"
+
+/* number of 32-bit slots allocated for the table returned by bcf_trio_prep() */
+#define MAX_GENO 359
+
+/* indexed by a 4-bit allele mask: number of bits set, except that index 0 holds 4 */
+int8_t seq_bitcnt[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+/* indexed by a 4-bit allele mask: one-letter code (1 -> A, 2 -> C, 4 -> G, 8 -> T, 15 -> N, 0 -> X) */
+char *seq_nt16rev = "XACMGRSVTWYHKDBN";
+
+/* Enumerate the genotype combinations a trio may carry.
+ *
+ * The ten unordered genotypes over four alleles are numbered 0..9. The
+ * returned table (allocated with calloc; the caller owns it) holds the
+ * number of combinations in slot 0, followed by one entry per combination
+ * packed as mother<<16 | father<<8 | child.
+ *
+ *   is_x   - non-zero: only fathers whose genotype is a single allele
+ *   is_son - together with is_x: the child must be a single allele that is
+ *            also present in the mother; the father places no constraint
+ *
+ * In all other cases the child must share an allele with each parent and
+ * both of its alleles must be found among the parents' alleles. */
+uint32_t *bcf_trio_prep(int is_x, int is_son)
+{
+    int a1, a2, fa, mo, ch, n_gt = 0, n_out = 1, gt_mask[10];
+    uint32_t *tbl = calloc(MAX_GENO, 4);
+    /* allele-pair bit masks of the ten genotypes */
+    for (a1 = 0; a1 < 4; ++a1)
+        for (a2 = a1; a2 < 4; ++a2)
+            gt_mask[n_gt++] = (1<<a1) | (1<<a2);
+    for (fa = 0; fa < 10; ++fa) { // father
+        if (is_x && seq_bitcnt[gt_mask[fa]] != 1) continue;
+        for (mo = 0; mo < 10; ++mo) { // mother
+            for (ch = 0; ch < 10; ++ch) { // child
+                int ok;
+                if (is_x && is_son)
+                    ok = seq_bitcnt[gt_mask[ch]] == 1 && (gt_mask[mo]&gt_mask[ch]);
+                else
+                    ok = (gt_mask[fa]&gt_mask[ch]) && (gt_mask[mo]&gt_mask[ch])
+                        && ((gt_mask[fa]|gt_mask[mo])&gt_mask[ch]) == gt_mask[ch];
+                if (ok) tbl[n_out++] = mo<<16 | fa<<8 | ch;
+            }
+        }
+    }
+    tbl[0] = n_out - 1;
+    return tbl;
+}
+
+
+/* Compare the best trio genotype assignment allowed by `prep` with the best
+ * unconstrained one.
+ *
+ *   prep - table built by bcf_trio_prep(): prep[0] entries follow, each
+ *          packing three genotype indices in its three low bytes; byte 0
+ *          selects the genotype of sample 0, byte 1 of sample 1 and byte 2
+ *          of sample 2
+ *   b    - record holding exactly three samples and a PL field
+ *   llr  - out: constrained minimum minus unconstrained minimum of the
+ *          summed per-sample genotype likelihood values
+ *   gt   - out: constrained call in the upper 32 bits and unconstrained call
+ *          in the lower 32 bits, one character per sample per byte; -1 when
+ *          the first PL value of all three samples is 0
+ *
+ * Returns 0 on success; -1 if the record is not a trio, has no PL field,
+ * bcf_gl10()/bcf_gl10_indel() both fail, or `prep` is empty. */
+int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt)
+{
+    int i, j, k;
+    const bcf_ginfo_t *PL;
+    uint8_t gl10[10 * 3]; /* 10 values for each of the 3 samples; n_smpl == 3 is checked below */
+    int map[10];
+    const uint32_t pl_key = bcf_str2int("PL", 2);
+    if (b->n_smpl != 3) return -1; // not a trio
+    for (i = 0; i < b->n_gi; ++i)
+        if (b->gi[i].fmt == pl_key) break;
+    if (i == b->n_gi) return -1; // no PL
+    if (bcf_gl10(b, gl10) < 0) {
+        if (bcf_gl10_indel(b, gl10) < 0) return -1;
+    }
+    PL = b->gi + i;
+    /* character reported for each of the ten genotypes */
+    for (i = 0, k = 0; i < 4; ++i)
+        for (j = i; j < 4; ++j)
+            map[k++] = seq_nt16rev[1<<i|1<<j];
+    for (j = 0; j < 3; ++j) // check if ref hom is the most probable in all members
+        if (((uint8_t*)PL->data)[j * PL->len] != 0) break;
+    if (j < 3) { // we need to go through the complex procedure
+        uint8_t *g[3];
+        int minc = 1<<30, minc_j = -1, minf = 0, gtf = 0, gtc = 0;
+        g[0] = gl10;
+        g[1] = gl10 + 10;
+        g[2] = gl10 + 20;
+        for (j = 1; j <= (int)prep[0]; ++j) { // compute LK with constraint
+            int sum = g[0][prep[j]&0xff] + g[1][prep[j]>>8&0xff] + g[2][prep[j]>>16&0xff];
+            if (sum < minc) minc = sum, minc_j = j;
+        }
+        if (minc_j < 0) return -1; /* empty table: nothing to index prep[] with */
+        gtc |= map[prep[minc_j]&0xff]; gtc |= map[prep[minc_j]>>8&0xff]<<8; gtc |= map[prep[minc_j]>>16]<<16;
+        for (j = 0; j < 3; ++j) { // compute LK without constraint
+            int min = 1<<30, min_k = -1;
+            for (k = 0; k < 10; ++k)
+                if (g[j][k] < min) min = g[j][k], min_k = k;
+            gtf |= map[min_k]<<(j*8);
+            minf += min;
+        }
+        *llr = minc - minf; *gt = (int64_t)gtc<<32 | gtf;
+    } else *llr = 0, *gt = -1;
+    return 0;
+}
+
+/* For a record with exactly two samples, compare the best genotype shared
+ * by both samples with the best genotypes chosen independently.
+ *
+ * Returns -1 if the record does not hold two samples or has no PL field;
+ * 0 if the first PL value of both samples is 0; otherwise
+ * min over g of (PL0[g] + PL1[g]) minus (min PL0 + min PL1). */
+int bcf_pair_call(const bcf1_t *b)
+{
+    const bcf_ginfo_t *PL;
+    uint8_t *lk[2];
+    int field, smp, gt, shared_min, indep_sum;
+    if (b->n_smpl != 2) return -1; // not a pair
+    field = 0;
+    while (field < b->n_gi && b->gi[field].fmt != bcf_str2int("PL", 2)) ++field;
+    if (field == b->n_gi) return -1; // no PL
+    PL = b->gi + field;
+    /* find the first sample whose leading PL value is non-zero */
+    smp = 0;
+    while (smp < 2 && ((uint8_t*)PL->data)[smp * PL->len] == 0) ++smp;
+    if (smp >= 2) return 0;
+    lk[0] = PL->data;
+    lk[1] = (uint8_t*)PL->data + PL->len;
+    /* both samples forced to the same genotype */
+    shared_min = 1<<30;
+    for (gt = 0; gt < PL->len; ++gt) {
+        int both = lk[0][gt] + lk[1][gt];
+        if (both < shared_min) shared_min = both;
+    }
+    /* each sample free to take its own best genotype */
+    indep_sum = 0;
+    for (smp = 0; smp < 2; ++smp) {
+        int best = 1<<30;
+        for (gt = 0; gt < PL->len; ++gt)
+            if (lk[smp][gt] < best) best = lk[smp][gt];
+        indep_sum += best;
+    }
+    return shared_min - indep_sum;
+}
+
+/* Smallest gap, over all samples, between the lowest and the second lowest
+ * PL value of a sample.
+ *
+ * The PL field stores PL->len bytes per sample, one sample after another.
+ * Returns -1 when the record has no PL field. With no samples the initial
+ * value 1<<30 is returned. */
+int bcf_min_diff(const bcf1_t *b)
+{
+    int i, min = 1<<30;
+    const bcf_ginfo_t *PL;
+    for (i = 0; i < b->n_gi; ++i)
+        if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+    if (i == b->n_gi) return -1; // no PL
+    PL = b->gi + i;
+    for (i = 0; i < b->n_smpl; ++i) {
+        int m1, m2, j;
+        /* step to the i-th sample; without the offset every pass would look at sample 0 */
+        const uint8_t *p = (uint8_t*)PL->data + i * PL->len;
+        m1 = m2 = 1<<30;
+        for (j = 0; j < PL->len; ++j) {
+            if ((int)p[j] < m1) m2 = m1, m1 = p[j];
+            else if ((int)p[j] < m2) m2 = p[j];
+        }
+        min = min < m2 - m1? min : m2 - m1;
+    }
+    return min;
+}
diff --git a/samtools-0.1.19/bcftools/prob1.c b/samtools-0.1.19/bcftools/prob1.c
new file mode 100644
index 0000000..3539ee3
--- /dev/null
+++ b/samtools-0.1.19/bcftools/prob1.c
@@ -0,0 +1,988 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <limits.h>
+#include <zlib.h>
+#include "prob1.h"
+#include "kstring.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+/* NOTE(review): presumably the iteration cap and the convergence threshold
+ * of an EM loop defined later in this file -- confirm at the point of use. */
+#define MC_MAX_EM_ITER 16
+#define MC_EM_EPS 1e-5
+/* scaling factor handed to bcf_p1_indel_prior() whenever the prior is (re)initialised */
+#define MC_DEF_INDEL 0.15
+
+/* NOTE(review): optional gzip stream; its use is not visible in this part of the file */
+gzFile bcf_p1_fp_lk;
+
+/* Maps a byte to a nucleotide code: 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2,
+ * 'T'/'t' -> 3, and every other byte value -> 4. */
+unsigned char seq_nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+/* Working state shared by the bcf_p1_* functions; created by bcf_p1_init()
+ * and released by bcf_p1_destroy(). */
+struct __bcf_p1aux_t {
+ // n: number of samples; M: sum of the per-sample ploidy (2*n when no ploidy array is given);
+ // n1: -1 until bcf_p1_set_n1() stores a value
+ int n, M, n1, is_indel;
+ uint8_t *ploidy; // haploid or diploid ONLY; one entry per sample, 0 (NULL) when M equals 2*n
+ double *q2p, *pdg; // q2p[i] = 10^(-i/10) for i in 0..255; pdg -> P(D|g), 3 entries per sample
+ double *phi, *phi_indel; // priors with M+1 entries each; phi_indel is derived from phi by bcf_p1_indel_prior()
+ double *z, *zswap; // aux for afs
+ double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
+ double **hg; // hypergeometric distribution
+ double *lf; // log factorial: lf[i] = lgamma(i+1)
+ double t, t1, t2;
+ double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution
+ const uint8_t *PL; // point to PL
+ int PL_len;
+};
+
+/* Derive the indel prior from the current prior ma->phi.
+ * Entries 0..M-1 are the corresponding phi values scaled by x; entry M is
+ * set to 1 - phi[M]*x.
+ * NOTE(review): with this last entry the derived values do not generally
+ * sum to one (1 - x*(1 - phi[M]) would) -- confirm whether that is intended. */
+void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
+{
+    const double *base = ma->phi;
+    double *indel = ma->phi_indel;
+    int k = 0;
+    while (k < ma->M) {
+        indel[k] = base[k] * x;
+        ++k;
+    }
+    indel[ma->M] = 1. - base[ma->M] * x;
+}
+
+/* Fill phi[0..M] with one of three priors selected by `type`:
+ *   MC_PTYPE_COND2 - phi[i] = 2(i+1) / ((M+1)(M+2))
+ *   MC_PTYPE_FLAT  - phi[i] = 1 / (M+1)
+ *   anything else  - phi[i] = theta / (M-i) for i < M, and phi[M] takes the
+ *                    remaining mass 1 - sum
+ * theta is only used by the last form. */
+static void init_prior(int type, double theta, int M, double *phi)
+{
+    int i;
+    if (type == MC_PTYPE_COND2) {
+        for (i = 0; i <= M; ++i)
+            phi[i] = 2. * (i + 1) / (M + 1) / (M + 2);
+        return;
+    }
+    if (type == MC_PTYPE_FLAT) {
+        for (i = 0; i <= M; ++i)
+            phi[i] = 1. / (M + 1);
+        return;
+    }
+    {
+        double used = 0.;
+        for (i = 0; i < M; ++i) {
+            phi[i] = theta / (M - i);
+            used += phi[i];
+        }
+        phi[M] = 1. - used;
+    }
+}
+
+/* (Re)initialise the prior ma->phi over 0..M with init_prior(type, theta),
+ * then rebuild the indel prior from it using the default factor MC_DEF_INDEL. */
+void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta)
+{
+ init_prior(type, theta, ma->M, ma->phi);
+ bcf_p1_indel_prior(ma, MC_DEF_INDEL);
+}
+
+/* Initialise the two group priors phi1 (over 2*n1 chromosomes) and phi2
+ * (over 2*(n-n1) chromosomes). Does nothing unless 0 < n1 < M. */
+void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta)
+{
+    const int n1 = ma->n1;
+    if (n1 > 0 && n1 < ma->M) {
+        init_prior(type, theta, 2*n1, ma->phi1);
+        init_prior(type, theta, 2*(ma->n - n1), ma->phi2);
+    }
+}
+
+/* Replace the prior ma->phi with one read from a (possibly gzip-compressed)
+ * text file; "-" reads standard input.
+ *
+ * Only lines starting with "[afs] " are used. Each such line is read as
+ * M+1 pairs "k<sep>value"; the value of pair k is added to phi[M-k].
+ * After the whole file has been read phi is normalised to sum to one, the
+ * prior and two summary statistics are printed to stderr, and the indel
+ * prior is rebuilt with MC_DEF_INDEL.
+ *
+ * Returns 0 on success and -1 if the file cannot be opened or a number
+ * fails to convert (phi may be partially updated in the latter case). */
+int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn)
+{
+    gzFile fp;
+    kstring_t s;
+    kstream_t *ks;
+    long double sum;
+    int dret, k, ret = 0;
+    memset(&s, 0, sizeof(kstring_t));
+    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+    if (fp == 0) return -1;
+    ks = ks_init(fp);
+    memset(ma->phi, 0, sizeof(double) * (ma->M + 1));
+    while (ret == 0 && ks_getuntil(ks, '\n', &s, &dret) >= 0) {
+        if (strstr(s.s, "[afs] ") == s.s) {
+            char *p = s.s + 6;
+            for (k = 0; k <= ma->M; ++k) {
+                int x;
+                double y;
+                errno = 0; /* so that the tests below see this call's status only */
+                x = strtol(p, &p, 10);
+                if (x != k && (errno == EINVAL || errno == ERANGE)) { ret = -1; break; }
+                if (*p) ++p; /* skip the separator, but never the terminating NUL */
+                errno = 0;
+                y = strtod(p, &p);
+                if (y == 0. && (errno == EINVAL || errno == ERANGE)) { ret = -1; break; }
+                ma->phi[ma->M - k] += y;
+            }
+        }
+    }
+    /* released on the error path as well */
+    ks_destroy(ks);
+    gzclose(fp);
+    free(s.s);
+    if (ret < 0) return ret;
+    for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k];
+    fprintf(stderr, "[prior]");
+    for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum;
+    for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]);
+    fputc('\n', stderr);
+    for (sum = 0., k = 1; k < ma->M; ++k) sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1));
+    fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum);
+    for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M;
+    fprintf(stderr, "theta=%lf\n", (double)sum);
+    bcf_p1_indel_prior(ma, MC_DEF_INDEL);
+    return 0;
+}
+
+/* Allocate and initialise the working state for n samples.
+ *
+ *   ploidy - optional array of n per-sample ploidy values. It is copied;
+ *            the copy is dropped again when the values add up to 2*n, in
+ *            which case ma->ploidy stays 0.
+ *
+ * M becomes the sum of the ploidy values (2*n without an array). All
+ * per-chromosome arrays get M+1 entries, q2p is filled with 10^(-i/10),
+ * lf with lgamma(i+1), and the prior is set with
+ * bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3). n1 starts out as -1.
+ * The caller releases the result with bcf_p1_destroy(). */
+bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy)
+{
+    bcf_p1aux_t *ma = calloc(1, sizeof(bcf_p1aux_t));
+    int q, k;
+    ma->n1 = -1;
+    ma->n = n;
+    ma->M = 2 * n;
+    if (ploidy) {
+        ma->ploidy = malloc(n);
+        memcpy(ma->ploidy, ploidy, n);
+        ma->M = 0;
+        for (k = 0; k < n; ++k) ma->M += ploidy[k];
+        if (ma->M == 2 * n) { /* everything is diploid: no array needed */
+            free(ma->ploidy);
+            ma->ploidy = 0;
+        }
+    }
+    ma->q2p = calloc(256, sizeof(double));
+    ma->pdg = calloc(3 * ma->n, sizeof(double));
+    ma->phi = calloc(ma->M + 1, sizeof(double));
+    ma->phi_indel = calloc(ma->M + 1, sizeof(double));
+    ma->phi1 = calloc(ma->M + 1, sizeof(double));
+    ma->phi2 = calloc(ma->M + 1, sizeof(double));
+    ma->z = calloc(ma->M + 1, sizeof(double));
+    ma->zswap = calloc(ma->M + 1, sizeof(double));
+    ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large
+    ma->z2 = calloc(ma->M + 1, sizeof(double));
+    ma->afs = calloc(ma->M + 1, sizeof(double));
+    ma->afs1 = calloc(ma->M + 1, sizeof(double));
+    ma->lf = calloc(ma->M + 1, sizeof(double));
+    for (q = 0; q < 256; ++q)
+        ma->q2p[q] = pow(10., -q / 10.);
+    for (k = 0; k <= ma->M; ++k)
+        ma->lf[k] = lgamma(k + 1);
+    bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
+    return ma;
+}
+
+/* Accessor for the M field, which is not visible outside this file. */
+int bcf_p1_get_M(bcf_p1aux_t *b)
+{
+    return b->M;
+}
+
+/* Store n1 in the working state.
+ * Returns -1 without changing anything when n1 is 0 or not smaller than
+ * the number of samples, or (with a message on stderr) when M differs from
+ * 2*n, i.e. some sample is not diploid. Returns 0 on success. */
+int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
+{
+    const int all_diploid = (b->M == b->n * 2);
+    if (n1 == 0 || n1 >= b->n)
+        return -1;
+    if (!all_diploid) {
+        fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
+        return -1;
+    }
+    b->n1 = n1;
+    return 0;
+}
+
+/* Make the record `b` share the ploidy array held by `ma`.
+ * Only the pointer is copied, so the array stays owned by `ma`; it is 0
+ * when bcf_p1_init() kept no ploidy array. */
+void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma)
+{
+ // bcf_p1aux_t fields are not visible outside of prob1.c, hence this wrapper.
+ // Ideally, this should set ploidy per site to allow pseudo-autosomal regions
+ b->ploidy = ma->ploidy;
+}
+
+/* Release a state object created by bcf_p1_init(); a null pointer is
+ * accepted and ignored. The rows of hg are only released when hg is set
+ * and n1 is positive (2*n1+1 rows). */
+void bcf_p1_destroy(bcf_p1aux_t *ma)
+{
+    if (ma == 0) return;
+    if (ma->hg && ma->n1 > 0) {
+        int row;
+        for (row = 0; row <= 2*ma->n1; ++row)
+            free(ma->hg[row]);
+        free(ma->hg);
+    }
+    free(ma->lf);
+    free(ma->ploidy);
+    free(ma->q2p);
+    free(ma->pdg);
+    free(ma->phi);
+    free(ma->phi_indel);
+    free(ma->phi1);
+    free(ma->phi2);
+    free(ma->z);
+    free(ma->zswap);
+    free(ma->z1);
+    free(ma->z2);
+    free(ma->afs);
+    free(ma->afs1);
+    free(ma);
+}
+
+extern double kf_gammap(double s, double z);
+int test16(bcf1_t *b, anno16_t *a);
+
+// Wigginton 2005, PMID: 15789306
+// written by Jan Wigginton
+/* Exact test of Hardy-Weinberg equilibrium.
+ *
+ *   obs_hom1, obs_hom2 - observed counts of the two homozygous genotypes
+ *   obs_hets           - observed count of heterozygotes
+ *
+ * Builds the distribution of the number of heterozygotes given the observed
+ * allele counts, starting from a midpoint and extending it in both
+ * directions by recurrence, and returns the total probability of all
+ * outcomes that are no more likely than the observed one (capped at 1).
+ * Returns 1 when all three counts add up to zero. Negative counts trip an
+ * assertion. The process exits if the work array cannot be allocated. */
+double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets)
+{
+    if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1;
+
+    assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0);
+
+    int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1;
+    int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2;
+
+    int rare_copies = 2 * obs_homr + obs_hets;
+    int genotypes = obs_hets + obs_homc + obs_homr;
+
+    double *het_probs = (double*) calloc(rare_copies+1, sizeof(double));
+    if (het_probs == NULL) {
+        fprintf(stderr, "[%s] out of memory\n", __func__);
+        exit(1);
+    }
+
+    /* start at midpoint; the product is formed in 64 bits because it exceeds
+     * the range of int once there are a few tens of thousands of genotypes */
+    int mid = (int)((long long)rare_copies * (2LL * genotypes - rare_copies) / (2LL * genotypes));
+
+    /* check to ensure that midpoint and rare alleles have same parity */
+    if ((rare_copies & 1) ^ (mid & 1)) mid++;
+
+    int curr_hets = mid;
+    int curr_homr = (rare_copies - mid) / 2;
+    int curr_homc = genotypes - curr_hets - curr_homr;
+
+    het_probs[mid] = 1.0;
+    double sum = het_probs[mid];
+    for (curr_hets = mid; curr_hets > 1; curr_hets -= 2)
+    {
+        het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0));
+        sum += het_probs[curr_hets - 2];
+
+        /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */
+        curr_homr++;
+        curr_homc++;
+    }
+
+    curr_hets = mid;
+    curr_homr = (rare_copies - mid) / 2;
+    curr_homc = genotypes - curr_hets - curr_homr;
+    for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2)
+    {
+        het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0));
+        sum += het_probs[curr_hets + 2];
+
+        /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */
+        curr_homr--;
+        curr_homc--;
+    }
+    int i;
+    /* turn the relative weights into probabilities */
+    for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum;
+
+    /* p-value calculation for p_hwe */
+    double p_hwe = 0.0;
+    for (i = 0; i <= rare_copies; i++)
+    {
+        if (het_probs[i] > het_probs[obs_hets])
+            continue;
+        p_hwe += het_probs[i];
+    }
+
+    p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe;
+    free(het_probs);
+    return p_hwe;
+
+}
+
+
+static void _bcf1_set_ref(bcf1_t *b, int idp) // rewrite b as a reference-only call; idp is the index of the DP FORMAT field in b->gi, or -1
+{
+ kstring_t s;
+ int old_n_gi = b->n_gi; // the GT data is addressed as b->gi[old_n_gi] after the bcf_sync() below
+ s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; // wrap b->str; l_str-1 so that the append overwrites the final NUL
+ kputs(":GT", &s); kputc('\0', &s);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+
+ // Call GTs
+ int isample, an = 0; // an: allele number, summed over samples that get a call
+ for (isample = 0; isample < b->n_smpl; isample++)
+ {
+ if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) // DP is present and zero for this sample
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; // bit 7 set; vcf_read() stores the same byte for "./."
+ else
+ {
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = 0; // both allele indexes 0
+ an += b->ploidy ? b->ploidy[isample] : 2; // diploid unless a ploidy array is attached
+ }
+ }
+ bcf_fit_alt(b,1); // mask 1 = allele 0 only
+ b->qual = 999;
+
+ // Prepare BCF for output: ref, alt, filter, info, format
+ memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); // fresh buffer; starts with an empty string ahead of ref
+ kputs(b->ref, &s); kputc('\0', &s);
+ kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); // the second NUL leaves the filter string empty
+ {
+ ksprintf(&s, "AN=%d;", an);
+ kputs(b->info, &s);
+ anno16_t a;
+ int has_I16 = test16(b, &a) >= 0? 1 : 0;
+ if (has_I16 )
+ {
+ if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
+ ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ }
+ kputc('\0', &s);
+ rm_info(&s, "I16="); // the raw I16 and QS tags are dropped from the output INFO
+ rm_info(&s, "QS=");
+ }
+ kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str); // b->ref, b->alt, b->info and b->fmt were read above, before this free
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+}
+
+int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only) // picks the most likely set of 1-3 alleles, then writes GT:GQ, QUAL and INFO into b; returns the bitmask of alleles used in genotypes, 1 for a site without ALT, 0 if skipped, -1 if there is no PL
+{
+ int nals = 1; // allele 0 (REF) is always counted
+ char *p;
+ for (p=b->alt; *p; p++)
+ {
+ if ( *p=='X' || p[0]=='.' ) break; // an 'X' or '.' ends the ALT list and is not counted
+ if ( p[0]==',' ) nals++;
+ }
+ if ( b->alt[0] && !*p ) nals++; // the last ALT allele has no trailing comma
+
+ if ( nals>4 )
+ {
+ if ( *b->ref=='N' ) return 0; // silently skip such sites when the reference base is N
+ fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt);
+ exit(1);
+ }
+
+ // find PL, DV and DP FORMAT indexes
+ uint8_t *pl = NULL;
+ int i, npl = 0, idp = -1, idv = -1;
+ for (i = 0; i < b->n_gi; ++i)
+ {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2))
+ {
+ pl = (uint8_t*)b->gi[i].data;
+ npl = b->gi[i].len; // used below as the per-sample stride through pl
+ }
+ else if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i;
+ else if (b->gi[i].fmt == bcf_str2int("DV", 2)) idv=i;
+ }
+ if ( nals==1 ) // no ALT allele: reference-only site
+ {
+ if ( !var_only ) _bcf1_set_ref(b, idp);
+ return 1;
+ }
+ if ( !pl ) return -1;
+
+ assert(ma->q2p[0] == 1);
+
+ // Init P(D|G)
+ int npdg = nals*(nals+1)/2; // number of unordered genotypes for nals alleles
+ double *pdg,*_pdg;
+ _pdg = pdg = malloc(sizeof(double)*ma->n*npdg); // one row of npdg likelihoods per sample; freed after the GT loop
+ for (i=0; i<ma->n; i++)
+ {
+ int j;
+ double sum = 0;
+ for (j=0; j<npdg; j++)
+ {
+ //_pdg[j] = pow(10,-0.1*pl[j]);
+ _pdg[j] = ma->q2p[pl[j]]; // table lookup in place of the pow() above
+ sum += _pdg[j];
+ }
+ if ( sum )
+ for (j=0; j<npdg; j++) _pdg[j] /= sum; // normalise the row to sum to 1
+ _pdg += npdg;
+ pl += npl;
+ }
+
+ if ((p = strstr(b->info, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); }
+ double qsum[4]; // per-allele QS values; used below as allele weights
+ if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); }
+
+
+ // Calculate the most likely combination of alleles, remembering the most and second most likely set
+ int ia,ib,ic, max_als=0, max_als2=0; // max_als, max_als2: allele bitmasks of the best and second-best sets
+ double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3]; // natural-log likelihoods; INT_MIN stands in for -infinity
+ for (ia=0; ia<nals; ia++) // sets with a single allele
+ {
+ double lk_tot = 0;
+ int iaa = (ia+1)*(ia+2)/2-1; // index of the homozygous ia/ia genotype within a row
+ int isample;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ double *p = pdg + isample*npdg;
+ // assert( log(p[iaa]) <= 0 );
+ lk_tot += log(p[iaa]);
+ }
+ if ( ia==0 ) ref_lk = lk_tot;
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); // log(exp(lk_sum)+exp(lk_tot)), larger term factored out
+ }
+ lk_sums[0] = lk_sum; // running total after all one-allele sets
+ if ( nals>1 )
+ {
+ for (ia=0; ia<nals; ia++) // sets with two alleles
+ {
+ if ( qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( qsum[ib]==0 ) continue;
+ double lk_tot = 0;
+ double fa = qsum[ia]/(qsum[ia]+qsum[ib]);
+ double fb = qsum[ib]/(qsum[ia]+qsum[ib]);
+ double fab = 2*fa*fb; fa *= fa; fb *= fb; // fa, fb become the homozygote weights, fab the heterozygote weight
+ int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; // iab: index of the ia/ib genotype
+ for (isample=0; isample<ma->n; isample++)
+ {
+ double *p = pdg + isample*npdg;
+ //assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 );
+ if ( b->ploidy && b->ploidy[isample]==1 )
+ lk_tot += log(fa*p[iaa] + fb*p[ibb]); // ploidy 1: no heterozygote term
+ else
+ lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]);
+ }
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ }
+ }
+ lk_sums[1] = lk_sum; // running total after one- and two-allele sets
+ }
+ if ( nals>2 )
+ {
+ for (ia=0; ia<nals; ia++) // sets with three alleles
+ {
+ if ( qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( qsum[ib]==0 ) continue;
+ int ibb = (ib+1)*(ib+2)/2-1;
+ int iab = iaa - ia + ib;
+ for (ic=0; ic<ib; ic++)
+ {
+ if ( qsum[ic]==0 ) continue;
+ double lk_tot = 0;
+ double fa = qsum[ia]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ double fb = qsum[ib]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ double fc = qsum[ic]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ int isample, icc = (ic+1)*(ic+2)/2-1;
+ int iac = iaa - ia + ic, ibc = ibb - ib + ic;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ double *p = pdg + isample*npdg;
+ //assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 );
+ if ( b->ploidy && b->ploidy[isample]==1 )
+ lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc]);
+ else
+ lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]);
+ }
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib|1<<ic; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib|1<<ic; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ }
+ }
+ }
+ lk_sums[2] = lk_sum; // running total after one-, two- and three-allele sets
+ }
+
+ // Should we add another allele, does it increase the likelihood significantly?
+ int n1=0, n2=0; // number of alleles in the best and in the second-best set
+ for (i=0; i<nals; i++) if ( max_als&1<<i) n1++;
+ for (i=0; i<nals; i++) if ( max_als2&1<<i) n2++;
+ if ( n2<n1 && kf_gammap(1,2.0*(max_lk-max_lk2))<threshold ) // statistic: twice the log-likelihood difference
+ {
+ // the threshold not exceeded, use the second most likely set with fewer alleles
+ max_lk = max_lk2;
+ max_als = max_als2;
+ n1 = n2;
+ }
+ lk_sum = lk_sums[n1-1]; // total over the sets with at most n1 alleles; NOTE(review): assumes n1 >= 1, i.e. at least one set beat the INT_MIN start value
+
+ // Get the BCF record ready for GT and GQ
+ kstring_t s;
+ int old_n_gi = b->n_gi; // GT is addressed as gi[old_n_gi] and GQ as gi[old_n_gi+1] after the sync
+ s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; // l_str-1 so that the append overwrites the final NUL
+ kputs(":GT:GQ", &s); kputc('\0', &s);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+
+ // Call GTs
+ int isample, gts=0, ac[4] = {0,0,0,0}; // gts: bitmask of alleles seen in called genotypes; ac: per-allele counts
+ int nRR = 0, nAA = 0, nRA = 0, max_dv = 0;
+ for (isample = 0; isample < b->n_smpl; isample++)
+ {
+ int ploidy = b->ploidy ? b->ploidy[isample] : 2;
+ double *p = pdg + isample*npdg;
+ int ia, als = 0; // als packs the two allele indexes of the best genotype as x<<3 | y
+ double lk = 0, lk_s = 0; // lk: best weighted likelihood so far; lk_s: sum over all candidate genotypes
+ for (ia=0; ia<nals; ia++) // homozygous candidates
+ {
+ if ( !(max_als&1<<ia) ) continue; // only alleles of the selected set
+ int iaa = (ia+1)*(ia+2)/2-1;
+ double _lk = p[iaa]*qsum[ia]*qsum[ia];
+ if ( _lk > lk ) { lk = _lk; als = ia<<3 | ia; }
+ lk_s += _lk;
+ }
+ if ( ploidy==2 )
+ {
+ for (ia=0; ia<nals; ia++) // heterozygous candidates
+ {
+ if ( !(max_als&1<<ia) ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( !(max_als&1<<ib) ) continue;
+ int iab = iaa - ia + ib;
+ double _lk = 2*qsum[ia]*qsum[ib]*p[iab];
+ if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; }
+ lk_s += _lk;
+ }
+ }
+ }
+ lk = -log(1-lk/lk_s)/0.2302585; // phred scale: -10*log10(1 - share of the best genotype); 0.2302585 = ln(10)/10
+ int dp = 0;
+ if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 )
+ {
+ // no coverage
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7;
+ ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0;
+ continue;
+ }
+ if ( lk>99 ) lk = 99; // GQ is capped at 99
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = als;
+ ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk;
+
+ // For MDV annotation
+ int dv;
+ if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) )
+ {
+ if ( max_dv < dv ) max_dv = dv; // largest DV among samples with a non-reference genotype
+ }
+
+ // For HWE annotation; multiple ALT alleles treated as one
+ if ( !als ) nRR++;
+ else if ( !(als>>3&7) || !(als&7) ) nRA++;
+ else nAA++;
+
+ gts |= 1<<(als>>3&7) | 1<<(als&7);
+ ac[ als>>3&7 ]++;
+ ac[ als&7 ]++;
+ }
+ free(pdg);
+ bcf_fit_alt(b,max_als);
+
+ // The VCF spec is ambiguous about QUAL: is it the probability of anything else
+ // (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it
+ // QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter.
+ b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum)); // 4.343 = 10/ln(10); gts>1 means a non-reference allele was called
+ if ( b->qual>999 ) b->qual = 999;
+
+ // Prepare BCF for output: ref, alt, filter, info, format
+ memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s);
+ kputs(b->ref, &s); kputc('\0', &s);
+ kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
+ {
+ int an=0, nalts=0;
+ for (i=0; i<nals; i++)
+ {
+ an += ac[i];
+ if ( i>0 && ac[i] ) nalts++;
+ }
+ ksprintf(&s, "AN=%d;", an);
+ if ( nalts )
+ {
+ kputs("AC=", &s);
+ for (i=1; i<nals; i++)
+ {
+ if ( !(gts&1<<i) ) continue; // only ALT alleles that occur in a called genotype
+ nalts--;
+ ksprintf(&s,"%d", ac[i]);
+ if ( nalts>0 ) kputc(',', &s);
+ }
+ kputc(';', &s);
+ }
+ kputs(b->info, &s);
+ anno16_t a;
+ int has_I16 = test16(b, &a) >= 0? 1 : 0;
+ if (has_I16 )
+ {
+ if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
+ ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3])); // QUAL divided by the summed DP4 counts
+ if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv);
+ }
+ if ( nAA+nRA )
+ {
+ double hwe = calc_hwe(nAA, nRR, nRA);
+ ksprintf(&s, ";HWE=%e", hwe);
+ }
+ kputc('\0', &s);
+ rm_info(&s, "I16=");
+ rm_info(&s, "QS=");
+ }
+ kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str); // b->ref, b->alt, b->info and b->fmt were read above, before this free
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+
+ return gts;
+}
+
+static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
+{ // fills ma->pdg from ma->PL and returns the position of allele 0 once alleles are sorted by summed homozygous PL
+ int a, s, rank, n_al = b->n_alleles;
+ long *score = alloca(n_al * sizeof(long)); // per allele: (sum of homozygous PL)<<4 | allele index
+ memset(score, 0, sizeof(long) * n_al);
+ for (s = 0; s < ma->n; ++s) {
+ const uint8_t *pl = ma->PL + s * ma->PL_len;
+ double *out = ma->pdg + s * 3;
+ out[2] = ma->q2p[pl[0]]; out[1] = ma->q2p[pl[1]]; out[0] = ma->q2p[pl[2]]; // the first three PL values, stored in reverse order
+ for (a = 0; a < n_al; ++a)
+ score[a] += (int)pl[(a+1)*(a+2)/2-1];
+ }
+ for (a = 0; a < n_al; ++a) score[a] = score[a]<<4 | a;
+ for (a = 1; a < n_al; ++a) { // insertion sort, ascending
+ long key = score[a];
+ for (s = a; s > 0 && key < score[s-1]; --s) score[s] = score[s-1];
+ score[s] = key;
+ }
+ for (rank = n_al - 1; rank >= 0 && (score[rank]&0xf) != 0; ) --rank; // low 4 bits == 0 marks allele 0
+ return rank;
+}
+
+
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+{ // returns q<<2 | g: g is the most probable of the three genotype slots for sample k, q its phred-scaled confidence (at most 99)
+ const double *lik = ma->pdg + k * 3;
+ const int is_diploid = (ma->ploidy? ma->ploidy[k] : 2) == 2;
+ double prior[3], post[3], total = 0., best = -1., err;
+ int g, best_g = 0, qual;
+ if (is_diploid) {
+ prior[0] = (1.-f0)*(1.-f0); prior[1] = 2.*f0*(1.-f0); prior[2] = f0*f0;
+ } else { // any other ploidy: the middle slot gets zero weight
+ prior[0] = 1. - f0; prior[1] = 0; prior[2] = f0;
+ }
+ for (g = 0; g < 3; ++g) {
+ post[g] = lik[g] * prior[g];
+ total += post[g];
+ }
+ for (g = 0; g < 3; ++g) {
+ post[g] /= total;
+ if (post[g] > best) { best = post[g]; best_g = g; }
+ }
+ err = 1. - best < 1e-308? 1e-308 : 1. - best; // floor keeps log() finite
+ qual = (int)(-4.343 * log(err) + .499);
+ return (qual > 99? 99 : qual)<<2|best_g;
+}
+
+#define TINY 1e-20
+
+static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) // fills ma->z (one entry per allele count 0..M) and the log scaling term ma->t; samples before beg are skipped in the all-diploid branch
+{
+ double *z[2], *tmp, *pdg;
+ int _j, last_min, last_max;
+ assert(beg == 0 || ma->M == ma->n*2); // a non-zero beg is only accepted when M == 2n
+ z[0] = ma->z; // z[0]: values from the previous sample
+ z[1] = ma->zswap; // z[1]: values being built for the current sample
+ pdg = ma->pdg;
+ memset(z[0], 0, sizeof(double) * (ma->M + 1));
+ memset(z[1], 0, sizeof(double) * (ma->M + 1));
+ z[0][0] = 1.;
+ last_min = last_max = 0; // bounds of the index range currently treated as non-zero
+ ma->t = 0.;
+ if (ma->M == ma->n * 2) {
+ int M = 0; // M: chromosomes added so far, M0: the count before the current sample
+ for (_j = beg; _j < ma->n; ++_j) {
+ int k, j = _j - beg, _min = last_min, _max = last_max, M0;
+ double p[3], sum;
+ M0 = M; M += 2;
+ pdg = ma->pdg + _j * 3;
+ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; // the middle genotype is weighted by 2
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; // shrink the range past entries below TINY
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ _max += 2; // this sample can add up to two to the count
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; // k = 0 and k = 1 are handled apart because k-1 or k-2 would be negative
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.))); // the factor divided out in the next line is accumulated in log space
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.; // clear neighbouring entries left over from an earlier pass
+ if (_min >= 2) z[1][_min-2] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset
+ ma->t1 = ma->t;
+ memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp; // the new values become the input for the next sample
+ last_min = _min; last_max = _max;
+ }
+ //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary?
+ //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.;
+ } else { // this block is very similar to the block above; these two might be merged in future
+ int j, M = 0;
+ for (j = 0; j < ma->n; ++j) { // beg is not used here; the assert above guarantees it is 0
+ int k, M0, _min = last_min, _max = last_max;
+ double p[3], sum;
+ pdg = ma->pdg + j * 3;
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ M0 = M;
+ M += ma->ploidy[j];
+ if (ma->ploidy[j] == 1) {
+ p[0] = pdg[0]; p[1] = pdg[2]; // ploidy 1: only the two outer genotype slots are used
+ _max++;
+ if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k];
+ for (k = _min < 1? 1 : _min; k <= _max; ++k)
+ z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / M);
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = 0.;
+ } else if (ma->ploidy[j] == 2) { // same update as in the all-diploid branch
+ p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2];
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ }
+ if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); // the final values may sit in ma->zswap after the swaps
+ if (bcf_p1_fp_lk) // optional binary dump of ma->z when this file handle is set
+ gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1));
+}
+
+static void mc_cal_y(bcf_p1aux_t *ma)
+{ // computes ma->z; with two groups configured it also fills z1/t1 and z2/t2 and rescales z
+ int idx, n2 = ma->n - ma->n1;
+ long double scale;
+ // NB: ma->n1 is ineffective when there are haploid samples
+ if (ma->n1 <= 0 || ma->n1 >= ma->n || ma->M != ma->n * 2) { mc_cal_y_core(ma, 0); return; }
+ memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
+ memset(ma->z2, 0, sizeof(double) * (2 * n2 + 1));
+ ma->t1 = 0.; ma->t2 = 0.;
+ mc_cal_y_core(ma, ma->n1); // samples n1..n-1 only
+ ma->t2 = ma->t;
+ memcpy(ma->z2, ma->z, sizeof(double) * (2 * n2 + 1));
+ mc_cal_y_core(ma, 0); // all samples; this pass also stores z1 and t1
+ // rescale z
+ scale = expl(ma->t - (ma->t1 + ma->t2));
+ for (idx = 0; idx <= ma->M; ++idx) ma->z[idx] *= scale;
+}
+
+#define CONTRAST_TINY 1e-30
+
+extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test
+
+static inline double chi2_test(int a, int b, int c, int d)
+{ // chi-square test on the 2x2 table (a, b, c, d); returns kf_gammaq(.5, chi2/2), or 1 when any margin is zero
+ double x, z;
+ x = (double)(a+b) * (c+d) * (b+d) * (a+c); // product of the four margins
+ if (x == 0.) return 1;
+ z = (double)a * d - (double)b * c; // products taken in double: int a*d can overflow for large counts
+ return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x);
+}
+
+// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3])
+{ // adds the weight of cell (k1,k2) to x[0] (equal), x[1] (group 1 lower) or x[2] (group 1 higher); returns weight * chi2_test(), or -1 if the weight is below CONTRAST_TINY
+ const int grp1 = p1->n1, grp2 = p1->n - p1->n1;
+ const double w = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+ const double f1 = .5*k1/grp1, f2 = .5*k2/grp2;
+ if (w < CONTRAST_TINY) return -1;
+ // compare the two ratios k/(2n)
+ x[f1 < f2? 1 : f1 > f2? 2 : 0] += w;
+ return w * chi2_test(k1, k2, (grp1<<1) - k1, (grp2<<1) - k2);
+}
+
+static double contrast2(bcf_p1aux_t *p1, double ret[3]) // two-group contrast: ret[0..2] receive the accumulated weights for equal, group 1 lower and group 1 higher; returns the weighted sum of chi2_test() values, or 0 when a group is empty
+{
+ int k, k1, k2, k10, k20, n1, n2;
+ double sum;
+ // get n1 and n2
+ n1 = p1->n1; n2 = p1->n - p1->n1;
+ if (n1 <= 0 || n2 <= 0) return 0.;
+ if (p1->hg == 0) { // initialize the hypergeometric distribution
+ /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
+ to avoid precomputing this matrix, but it is slower and quite intricate. The following
+ computation in this block can be accelerated with a similar strategy, but perhaps this
+ is not a serious concern for now. */
+ double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
+ p1->hg = calloc(2*n1+1, sizeof(void*)); // (2*n1+1) rows of (2*n2+1) doubles; kept in p1 and reused by later calls
+ for (k1 = 0; k1 <= 2*n1; ++k1) {
+ p1->hg[k1] = calloc(2*n2+1, sizeof(double));
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
+ }
+ }
+ { // compute
+ long double suml = 0;
+ for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
+ sum = suml; // normalising constant passed to contrast2_aux()
+ }
+ { // get the max k1 and k2
+ double max;
+ int max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
+ double x = p1->phi1[k] * p1->z1[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k10 = max_k; // stays -1 when no product is positive
+ for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
+ double x = p1->phi2[k] * p1->z2[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k20 = max_k;
+ }
+ { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
+ double x[3], y;
+ long double z = 0., L[2]; // NOTE(review): L is assigned but never read in this function
+ x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0;
+ for (k1 = k10; k1 >= 0; --k1) { // walk outwards from (k10,k20); each inner scan stops at the first negligible cell
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
+ x[0] = x[1] = x[2] = 0;
+ for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { // the rows above k10
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
+ if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened
+ ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0;
+ for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) // fall back to the exhaustive scan over all cells
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y;
+ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why...
+ z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
+ }
+ return (double)z;
+ }
+}
+
+static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) // fills ma->afs1 with the normalised phi[k]*z[k] for this site and adds it to ma->afs; returns sum(k*afs1[k])/M, or -1 if a value is NaN or infinite
+{
+ int k;
+ long double sum = 0., sum2;
+ double *phi = ma->is_indel? ma->phi_indel : ma->phi; // indel sites use their own phi array
+ memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
+ mc_cal_y(ma); // computes ma->z
+ // compute AFS
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)phi[k] * ma->z[k];
+ for (k = 0; k <= ma->M; ++k) {
+ ma->afs1[k] = phi[k] * ma->z[k] / sum;
+ if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; // on this path the two output pointers are left unwritten
+ }
+ // compute folded variant probability
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; // folded: phi averaged over k and M-k
+ for (k = 1, sum2 = 0.; k < ma->M; ++k) // interior counts only
+ sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
+ *p_var_folded = sum2 / sum;
+ *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum; // k is the value left by the loop above (ma->M when M >= 1)
+ // the expected frequency
+ for (k = 0, sum = 0.; k <= ma->M; ++k) {
+ ma->afs[k] += ma->afs1[k]; // running total, printed and cleared by bcf_p1_dump_afs()
+ sum += k * ma->afs1[k];
+ }
+ return sum / ma->M;
+}
+
+int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst) // fills rst from the PL values of record b; returns 0, or -1 when b has no PL field or fewer than two alleles
+{
+ int i, k;
+ long double sum = 0.;
+ ma->is_indel = bcf_is_indel(b);
+ rst->perm_rank = -1;
+ // set PL and PL_len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ ma->PL = (uint8_t*)b->gi[i].data; // borrowed pointer into the record
+ ma->PL_len = b->gi[i].len;
+ break;
+ }
+ }
+ if (i == b->n_gi) return -1; // no PL
+ if (b->n_alleles < 2) return -1; // FIXME: find a better solution
+ //
+ rst->rank0 = cal_pdg(b, ma); // also fills ma->pdg
+ rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded); // NOTE(review): a -1 error return is stored as f_exp without a check
+ rst->p_ref = ma->afs1[ma->M]; // the top entry k == M; the remaining entries sum to p_var below
+ for (k = 0, sum = 0.; k < ma->M; ++k)
+ sum += ma->afs1[k];
+ rst->p_var = (double)sum;
+ { // compute the allele count
+ double max = -1;
+ rst->ac = -1;
+ for (k = 0; k <= ma->M; ++k)
+ if (max < ma->z[k]) max = ma->z[k], rst->ac = k;
+ rst->ac = ma->M - rst->ac; // reported as M minus the maximising index
+ }
+ // calculate f_flat and f_em
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)ma->z[k];
+ rst->f_flat = 0.;
+ for (k = 0; k <= ma->M; ++k) {
+ double p = ma->z[k] / sum;
+ rst->f_flat += k * p;
+ }
+ rst->f_flat /= ma->M; // mean of k/M under the normalised z alone
+ { // estimate equal-tail credible interval (95% level)
+ int l, h;
+ double p;
+ for (i = 0, p = 0.; i <= ma->M; ++i) // lower cut: stop before the cumulated mass exceeds 0.025
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ l = i;
+ for (i = ma->M, p = 0.; i >= 0; --i) // upper cut, scanning down from M
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ h = i;
+ rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
+ }
+ if (ma->n1 > 0) { // compute LRT
+ double max0, max1, max2;
+ for (k = 0, max0 = -1; k <= ma->M; ++k)
+ if (max0 < ma->z[k]) max0 = ma->z[k];
+ for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k)
+ if (max1 < ma->z1[k]) max1 = ma->z1[k];
+ for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k)
+ if (max2 < ma->z2[k]) max2 = ma->z2[k];
+ rst->lrt = log(max1 * max2 / max0);
+ rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt); // converted to a p-value; a negative statistic gives 1
+ } else rst->lrt = -1.0; // -1: not computed
+ rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; // -1: not computed
+ if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant
+ rst->p_chi2 = contrast2(ma, rst->cmp);
+ return 0;
+}
+
+void bcf_p1_dump_afs(bcf_p1aux_t *ma)
+{ // prints the accumulated ma->afs to stderr as "index:value" pairs in reversed index order, then zeroes it
+ int i = ma->M;
+ fprintf(stderr, "[afs]");
+ for (; i >= 0; --i)
+ fprintf(stderr, " %d:%.3lf", ma->M - i, ma->afs[i]);
+ fprintf(stderr, "\n");
+ memset(ma->afs, 0, sizeof(double) * (ma->M + 1));
+}
diff --git a/samtools-0.1.19/bcftools/prob1.h b/samtools-0.1.19/bcftools/prob1.h
new file mode 100644
index 0000000..6f93155
--- /dev/null
+++ b/samtools-0.1.19/bcftools/prob1.h
@@ -0,0 +1,49 @@
+#ifndef BCF_PROB1_H
+#define BCF_PROB1_H
+
+#include "bcf.h"
+
+struct __bcf_p1aux_t; // opaque here; the definition is not part of this header
+typedef struct __bcf_p1aux_t bcf_p1aux_t;
+
+typedef struct { // per-site results filled by bcf_p1_cal()
+ int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
+ int ac; // ML alternative allele count
+ double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
+ double cil, cih; // bounds of the 95% equal-tail credible interval
+ double cmp[3], p_chi2, lrt; // used by contrast2()
+} bcf_p1rst_t;
+
+typedef struct { // values filled by test16(); prob1.c prints them as the PV4, DP4 and MQ INFO tags
+ double p[4]; // printed as PV4 when is_tested is set
+ int mq, depth, is_tested, d[4]; // d[] is printed as DP4, mq as MQ
+} anno16_t;
+
+#define MC_PTYPE_FULL 1
+#define MC_PTYPE_COND2 2
+#define MC_PTYPE_FLAT 3
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy);
+ void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
+ void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
+ void bcf_p1_destroy(bcf_p1aux_t *ma);
+ void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma);
+ int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); // 0 on success, -1 without PL or with fewer than two alleles
+ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only); // rewrites b; returns an allele bitmask, 0 if skipped, -1 without PL
+ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); // returns quality<<2 | genotype index for sample k
+ void bcf_p1_dump_afs(bcf_p1aux_t *ma); // prints the accumulated AFS to stderr and clears it
+ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
+ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
+ void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
+
+ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/bcftools/vcf.c b/samtools-0.1.19/bcftools/vcf.c
new file mode 100644
index 0000000..e8526a3
--- /dev/null
+++ b/samtools-0.1.19/bcftools/vcf.c
@@ -0,0 +1,249 @@
+#include <zlib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "bcf.h"
+#include "kstring.h"
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 4096)
+
+typedef struct { // state of a text-VCF handle; stored in bcf_t.v when is_vcf is set
+ gzFile fp; // input stream, opened by vcf_open() in 'r' mode
+ FILE *fpout; // output stream, opened by vcf_open() in 'w' mode
+ kstream_t *ks; // buffered line reader over fp
+ void *refhash; // maps sequence names to numeric ids (bcf_str2id_*)
+ kstring_t line; // reusable buffer holding the current input or output line
+ int max_ref; // NOTE(review): no use is visible in the functions below
+} vcf_t;
+
+bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
+{ // reads the "##" meta lines and the "#" column line of a VCF stream into a new header; returns 0 if a data line comes first
+ kstring_t meta_txt, samples;
+ int delim;
+ vcf_t *vcf;
+ bcf_hdr_t *hdr;
+ if (!bp->is_vcf) return bcf_hdr_read(bp); // binary input is handled by bcf_hdr_read()
+ vcf = (vcf_t*)bp->v;
+ hdr = calloc(1, sizeof(bcf_hdr_t));
+ vcf->line.l = 0;
+ memset(&samples, 0, sizeof(kstring_t));
+ memset(&meta_txt, 0, sizeof(kstring_t));
+ while (ks_getuntil(vcf->ks, '\n', &vcf->line, &delim) >= 0) {
+ char *ln = vcf->line.s, *tok;
+ ks_tokaux_t aux;
+ int col;
+ if (vcf->line.l < 2) continue;
+ if (ln[0] != '#') { // a data line before any column line
+ free(meta_txt.s);
+ free(samples.s);
+ free(hdr);
+ return 0; // no sample line
+ }
+ if (ln[1] == '#') { // meta line: stored with its newline restored
+ kputsn(ln, vcf->line.l, &meta_txt); kputc('\n', &meta_txt);
+ continue;
+ }
+ // column line: every token from the 10th on is a sample name, stored NUL-terminated
+ for (tok = kstrtok(ln, "\t\n", &aux), col = 0; tok; tok = kstrtok(0, 0, &aux), ++col) {
+ if (col < 9) continue;
+ kputsn(tok, aux.p - tok, &samples);
+ kputc('\0', &samples);
+ }
+ break;
+ }
+ kputc('\0', &meta_txt);
+ hdr->name = 0;
+ hdr->sname = samples.s; hdr->l_smpl = samples.l;
+ hdr->txt = meta_txt.s; hdr->l_txt = meta_txt.l;
+ bcf_hdr_sync(hdr);
+ return hdr;
+}
+
+bcf_t *vcf_open(const char *fn, const char *mode)
+{ // opens fn ("-" for stdin/stdout) as text VCF, or as BCF when mode contains 'b'; returns 0 if allocation or opening the file fails
+ bcf_t *bp; vcf_t *v;
+ if (strchr(mode, 'b')) return bcf_open(fn, mode);
+ bp = calloc(1, sizeof(bcf_t)); v = calloc(1, sizeof(vcf_t));
+ if (bp == 0 || v == 0) { free(v); free(bp); return 0; }
+ bp->is_vcf = 1; bp->v = v;
+ v->refhash = bcf_str2id_init();
+ if (strchr(mode, 'r')) {
+ v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (v->fp == 0) { vcf_close(bp); return 0; } // never hand a NULL gzFile to ks_init(); vcf_close() frees the partly built handle
+ v->ks = ks_init(v->fp);
+ } else if (strchr(mode, 'w') && (v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout) == 0) {
+ vcf_close(bp); return 0; // output file could not be created
+ }
+ return bp;
+}
+
+int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
+{ // adds the first field of every line of file fn as a sequence name to the VCF handle and to h; returns 0, or -1 if bp is NULL or fn cannot be opened
+ vcf_t *v;
+ gzFile fp;
+ kstream_t *ks;
+ kstring_t s, rn;
+ int dret;
+ if (bp == 0) return -1;
+ if (!bp->is_vcf) return 0; // nothing to do for binary BCF
+ s.l = s.m = 0; s.s = 0;
+ rn.m = rn.l = h->l_nm; rn.s = h->name; // append to the name block already in the header
+ v = (vcf_t*)bp->v;
+ if ((fp = gzopen(fn, "r")) == 0) return -1; // report an unreadable file instead of passing NULL on to ks_init()
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+ bcf_str2id_add(v->refhash, strdup(s.s));
+ kputs(s.s, &rn); kputc('\0', &rn);
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // discard the rest of the line
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ h->l_nm = rn.l; h->name = rn.s;
+ bcf_hdr_sync(h);
+ free(s.s);
+ return 0;
+}
+
+int vcf_close(bcf_t *bp)
+{ // closes a handle from vcf_open(); returns -1 for a NULL handle, the bcf_close() result for BCF, else 0
+ vcf_t *vcf;
+ if (bp == 0)
+ return -1;
+ if (bp->is_vcf == 0)
+ return bcf_close(bp);
+ vcf = (vcf_t*)bp->v;
+ // the reader and the writer stream are each released only if they were opened
+ if (vcf->fp != 0) { ks_destroy(vcf->ks); gzclose(vcf->fp); }
+ if (vcf->fpout != 0) fclose(vcf->fpout);
+ free(vcf->line.s);
+ bcf_str2id_thorough_destroy(vcf->refhash);
+ free(vcf);
+ free(bp);
+ return 0;
+}
+
+int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
+{ // writes the meta text and the column line (with sample names) of a VCF; BCF handles go to bcf_hdr_write()
+ FILE *out;
+ int s;
+ if (!bp->is_vcf) return bcf_hdr_write(bp, h);
+ out = ((vcf_t*)bp->v)->fpout;
+ // a version line is emitted first unless the stored text already carries one
+ if (h->l_txt == 0 || (h->l_txt > 0 && strstr(h->txt, "##fileformat=") == 0))
+ fprintf(out, "##fileformat=VCFv4.1\n");
+ if (h->l_txt > 0)
+ fwrite(h->txt, 1, h->l_txt - 1, out);
+ fprintf(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+ for (s = 0; s < h->n_smpl; ++s)
+ fprintf(out, "\t%s", h->sns[s]);
+ fputc('\n', out);
+ return 0;
+}
+
+int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
+{ // formats record b into the handle's line buffer and writes it plus a newline; returns the line length + 1
+ extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
+ vcf_t *vcf; kstring_t *buf;
+ if (!bp->is_vcf) return bcf_write(bp, h, b);
+ vcf = (vcf_t*)bp->v; buf = &vcf->line;
+ bcf_fmt_core(h, b, buf);
+ fwrite(buf->s, 1, buf->l, vcf->fpout); fputc('\n', vcf->fpout);
+ return buf->l + 1;
+}
+
+int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) // parses the next text line into b; returns the line length + 1, or -1 when nothing more can be read; BCF handles go to bcf_read()
+{
+ int dret, k, i, sync = 0; // k: 0-based column index; sync: set when a new sequence name was added to the header
+ vcf_t *v = (vcf_t*)bp->v;
+ char *p, *q;
+ kstring_t str, rn;
+ ks_tokaux_t aux, a2;
+ if (!bp->is_vcf) return bcf_read(bp, h, b);
+ v->line.l = 0;
+ str.l = 0; str.m = b->m_str; str.s = b->str; // reuse the record's string buffer from the start
+ rn.l = rn.m = h->l_nm; rn.s = h->name;
+ if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
+ b->n_smpl = h->n_smpl;
+ for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
+ *(char*)aux.p = 0; // terminate the current column in place
+ if (k == 0) { // ref
+ int tid = bcf_str2id(v->refhash, p);
+ if (tid < 0) { // unseen sequence name: register it and append it to the header names
+ tid = bcf_str2id_add(v->refhash, strdup(p));
+ kputs(p, &rn); kputc('\0', &rn);
+ sync = 1;
+ }
+ b->tid = tid;
+ } else if (k == 1) { // pos
+ b->pos = atoi(p) - 1; // stored 0-based
+ } else if (k == 5) { // qual
+ b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0; // a value not starting with a digit (such as ".") becomes 0
+ } else if (k <= 8) { // variable length strings
+ kputs(p, &str); kputc('\0', &str);
+ b->l_str = str.l; b->m_str = str.m; b->str = str.s;
+ if (k == 8) bcf_sync(b); // FORMAT was the last string column; the gi[] arrays are used from here on
+ } else { // k >= 9: one column per sample, sample index k-9
+ if (strncmp(p, "./.", 3) == 0) { // missing genotype: zero every known field, with bit 7 set in GT
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+ ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
+ ((uint8_t*)b->gi[i].data)[k-9] = 0;
+ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
+ ((int32_t*)b->gi[i].data)[k-9] = 0;
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
+ ((uint16_t*)b->gi[i].data)[k-9] = 0;
+ } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ int y = b->n_alleles * (b->n_alleles + 1) / 2; // number of unordered genotypes
+ memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ int y = b->n_alleles * (b->n_alleles + 1) / 2;
+ memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4); // y floats of 4 bytes each
+ }
+ }
+ goto endblock;
+ }
+ for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { // i-th ':' sub-field belongs to the i-th FORMAT key
+ if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+ ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; // NOTE(review): reads q[0..2] as single-digit allele, separator, single-digit allele; bit 6 is set when the separator is not '/'
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
+ double _x = strtod(q, &q);
+ int x = (int)(_x + .499);
+ if (x > 255) x = 255; // stored in one byte
+ ((uint8_t*)b->gi[i].data)[k-9] = x;
+ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
+ int x = strtol(q, &q, 10);
+ if (x > 0xffff) x = 0xffff;
+ ((uint32_t*)b->gi[i].data)[k-9] = x;
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
+ int x = strtol(q, &q, 10);
+ if (x > 0xffff) x = 0xffff; // stored in 16 bits
+ ((uint16_t*)b->gi[i].data)[k-9] = x;
+ } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ int x, y, j;
+ uint8_t *data = (uint8_t*)b->gi[i].data;
+ y = b->n_alleles * (b->n_alleles + 1) / 2;
+ for (j = 0; j < y; ++j) {
+ x = strtol(q, &q, 10);
+ if (x > 255) x = 255;
+ data[(k-9) * y + j] = x;
+ ++q; // step over the ',' separator
+ }
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ int j, y;
+ float x, *data = (float*)b->gi[i].data;
+ y = b->n_alleles * (b->n_alleles + 1) / 2;
+ for (j = 0; j < y; ++j) {
+ x = strtod(q, &q);
+ data[(k-9) * y + j] = x > 0? -x/10. : x; // positive inputs are converted to -x/10, others stored unchanged
+ ++q;
+ }
+ }
+ }
+ endblock: i = i; // no-op statement so that the label has something to attach to
+ }
+ }
+ h->l_nm = rn.l; h->name = rn.s;
+ if (sync) bcf_hdr_sync(h);
+ return v->line.l + 1;
+}
diff --git a/samtools-0.1.19/bcftools/vcfutils.pl b/samtools-0.1.19/bcftools/vcfutils.pl
new file mode 100755
index 0000000..2b7ba0b
--- /dev/null
+++ b/samtools-0.1.19/bcftools/vcfutils.pl
@@ -0,0 +1,567 @@
+#!/usr/bin/perl -w
+
+# Author: lh3
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&main; # run the sub-command dispatcher (sub main, defined below)
+exit; # end the script once the dispatcher returns
+
+# Entry point: take the sub-command name from the first argument and run the
+# matching handler; prints the usage text when no argument is given.
+sub main {
+    &usage unless (@ARGV >= 1);
+    my $command = shift(@ARGV);
+    # Dispatch table; filter4vcf maps to the same handler as varFilter.
+    my %func = (
+        subsam      => \&subsam,
+        listsam     => \&listsam,
+        fillac      => \&fillac,
+        qstats      => \&qstats,
+        varFilter   => \&varFilter,
+        hapmap2vcf  => \&hapmap2vcf,
+        ucscsnp2vcf => \&ucscsnp2vcf,
+        filter4vcf  => \&varFilter,
+        ldstats     => \&ldstats,
+        gapstats    => \&gapstats,
+        splitchr    => \&splitchr,
+        vcf2fq      => \&vcf2fq,
+    );
+    my $handler = $func{$command};
+    die("Unknown command \"$command\".\n") unless (defined($handler));
+    # Ampersand call without parentheses: the handler shares the current @_, as before.
+    &$handler;
+}
+
+# Split every sequence of a .fai index (name in column 1, length in column 2)
+# into regions of about -l bases and print them as "name:start-end".
+sub splitchr {
+    my %opts = (l=>5000000);
+    getopts('l:', \%opts);
+    my $l = $opts{l};
+    die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
+    # A zero or non-numeric length used to abort later with a bare "Illegal division by zero".
+    die("[splitchr] -l must be a positive number\n") unless ($l > 0);
+    while (<>) {
+        my @t = split;
+        next if (@t < 2); # blank or truncated line: no length column to work with
+        for (my $i = 0; $i < $t[1];) {
+            # When fewer than 1.1 chunks remain, emit the whole remainder as the last region.
+            my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
+            print "$t[0]:".($i+1)."-$e\n";
+            $i = $e;
+        }
+    }
+}
+
+# Print the VCF given as first argument restricted to the samples named by the
+# remaining arguments. When no sample matches, the FORMAT column is dropped too.
+sub subsam {
+    die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
+    my ($fh, %h);
+    my $fn = shift(@ARGV);
+    my @col; # column indices of the selected samples
+    # Open without the shell and without 2-argument open, so that characters in
+    # the file name can never be read as a command or as an open mode.
+    if ($fn eq '-') {
+        $fh = \*STDIN; # 2-argument open treated "-" as standard input; keep that
+    } elsif ($fn =~ /\.gz$/) {
+        open($fh, '-|', 'gzip', '-dc', $fn) || die("[subsam] fail to run gzip on '$fn': $!\n");
+    } else {
+        open($fh, '<', $fn) || die("[subsam] fail to open '$fn': $!\n");
+    }
+    $h{$_} = 1 for (@ARGV);
+    while (<$fh>) {
+        if (/^##/) {
+            print;
+        } elsif (/^#/) {
+            my @t = split;
+            my @s = @t[0..8]; # all fixed fields + FORMAT
+            for (9 .. $#t) {
+                if ($h{$t[$_]}) {
+                    push(@s, $t[$_]);
+                    push(@col, $_);
+                }
+            }
+            pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
+            print join("\t", @s), "\n";
+        } else {
+            my @t = split;
+            if (@col == 0) {
+                print join("\t", @t[0..7]), "\n";
+            } else {
+                print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
+            }
+        }
+    }
+    close($fh);
+}
+
+# Print the sample names (columns 10 and up of the "#CHROM" header line), one
+# per line, then exit.
+sub listsam {
+    if (@ARGV == 0 && -t STDIN) {
+        die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/);
+    }
+    while (my $line = <>) {
+        # the column header starts with exactly one '#'
+        next unless ($line =~ /^#(?!#)/);
+        my @t = split(' ', $line);
+        my @samples = @t[9..$#t];
+        print join("\n", @samples), "\n";
+        exit;
+    }
+}
+
+# Recompute the AC (count per alternate allele) and AN (number of called
+# alleles) INFO entries from the GT value of every sample. Header lines and
+# records whose FORMAT has no GT entry are passed through.
+sub fillac {
+    die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
+    while (<>) {
+        if (/^#/) {
+            print;
+            next;
+        }
+        my @t = split;
+        # locate GT inside the colon-separated FORMAT column
+        my $s = -1;
+        my @fmt = defined($t[8])? split(":", $t[8]) : ();
+        for my $k (0 .. $#fmt) {
+            if ($fmt[$k] eq 'GT') { $s = $k; last; }
+        }
+        if ($s < 0) {
+            print join("\t", @t), "\n";
+            next;
+        }
+        my @c = (0, 0); # allele counts indexed by allele number
+        my $n = 0;
+        for my $k (9 .. $#t) {
+            next if ($t[$k] =~ /^0,0,0/); # kept from the original: such sample fields are not counted
+            # skip $s leading sub-fields, then read the two allele indices around '/' or '|'
+            if ($t[$k] =~ /^([^\s:]+:){$s}(\d+)[\/|](\d+)/) {
+                ++$c[$2]; ++$c[$3];
+                $n += 2;
+            }
+        }
+        # Values for several ALT alleles are comma-separated (a tab here would
+        # split the INFO column); alleles that were never seen count as 0.
+        my $AC = "AC=" . join(",", map { defined($_)? $_ : 0 } @c[1..$#c]) . ";AN=$n";
+        # Drop previous AC/AN entries as whole key=value items, plus a lone '.'.
+        my @kept = grep { $_ ne '' && $_ ne '.' && !/^A[CN]=/ } split(';', $t[7]);
+        $t[7] = join(';', @kept, $AC);
+        print join("\t", @t), "\n";
+    }
+}
+
+# Summarise adjacent-site LD from the NEIR= annotation: count the intervals
+# between consecutive records on one chromosome whose NEIR exceeds -t, and
+# report their number, fraction and total length.
+sub ldstats {
+    my %opts = (t=>0.9);
+    getopts('t:', \%opts);
+    die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
+    my $cutoff = $opts{t};
+    my ($last, $lastchr) = (0x7fffffff, '');
+    my ($x, $y, $n) = (0, 0, 0); # total length, intervals above the cutoff, records carrying NEIR
+    while (<>) {
+        if (/^([^#\s]+)\s(\d+)/) {
+            my ($chr, $pos) = ($1, $2);
+            if (/NEIR=([\d\.]+)/) {
+                ++$n;
+                ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
+            }
+            $last = $pos; $lastchr = $chr;
+        }
+    }
+    print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
+    # Without any NEIR record the ratio is undefined; report 0 instead of
+    # dying with "Illegal division by zero".
+    print "Fraction: ", ($n? $y/$n : 0), "\n";
+    print "Length: $x\n";
+}
+
+# Print cumulative SNP statistics stratified by QUAL. Sites are sorted by
+# decreasing QUAL and a row is printed each time roughly another -s fraction of
+# the sites has been consumed. -r names a reference list of positions; with -v
+# it is read as VCF and the first ALT allele must match as well.
+# Columns: QUAL, #sites, #SNPs, #transitions, #joint, ts/tv, #joint/#ref,
+# #joint/#SNPs, ts/tv within the current QUAL bin.
+sub qstats {
+    my %opts = (r=>'', s=>0.02, v=>undef);
+    getopts('r:s:v', \%opts);
+    die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
+Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
+    my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
+    my %h = ();
+    my $is_vcf = defined($opts{v})? 1 : 0;
+    if ($opts{r}) { # read the reference positions
+        my $fh;
+        open($fh, '<', $opts{r}) || die("[qstats] fail to open '$opts{r}': $!\n");
+        while (<$fh>) {
+            next if (/^#/);
+            if ($is_vcf) {
+                my @t = split;
+                $h{$t[0],$t[1]} = $t[4];
+            } else {
+                $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
+            }
+        }
+        close($fh);
+    }
+    my $hsize = scalar(keys %h);
+    my @a; # one [QUAL, is_snp, is_transition, in_reference] entry per kept site
+    while (<>) {
+        next if (/^#/);
+        my @t = split;
+        next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
+        $t[3] = uc($t[3]); $t[4] = uc($t[4]);
+        my @s = split(',', $t[4]);
+        $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
+        next if (length($s[0]) != 1);
+        my $hit;
+        if ($is_vcf) {
+            $hit = 0;
+            my $aa = $h{$t[0],$t[1]};
+            if (defined($aa)) {
+                my @aaa = split(",", $aa);
+                for (@aaa) {
+                    $hit = 1 if ($_ eq $s[0]);
+                }
+            }
+        } else {
+            $hit = defined($h{$t[0],$t[1]})? 1 : 0;
+        }
+        push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
+    }
+    # Test before the end marker is appended; afterwards the list is never empty
+    # and the check could not fire.
+    die("[qstats] No SNP data!\n") if (@a == 0);
+    push(@a, [-1, 0, 0, 0]); # end marker
+    @a = sort {$b->[0]<=>$a->[0]} @a;
+    my $next = $opts{s};
+    my $last = $a[0][0]; # QUAL of the previous entry (was initialised to the array ref itself)
+    my @c = (0, 0, 0, 0); # cumulative: sites, SNPs, transitions, reference hits
+    my @lc;               # SNP/transition totals at the previous printed row
+    $lc[1] = $lc[2] = 0;
+    for my $p (@a) {
+        if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
+            my @x;
+            $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
+            $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
+            # no SNP seen so far: report 0 rather than dividing by zero
+            $x[2] = sprintf("%.4f", $c[1]? $c[3] / $c[1] : 0);
+            my $d_snp = $c[1] - $lc[1];
+            my $d_ts = $c[2] - $lc[2];
+            $x[3] = sprintf("%.4f", $d_snp-$d_ts? $d_ts / ($d_snp-$d_ts) : 100);
+            print join("\t", $last, @c, @x), "\n";
+            $next = $c[0]/@a + $opts{s};
+            $lc[1] = $c[1]; $lc[2] = $c[2];
+        }
+        ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
+        $last = $p->[0];
+    }
+}
+
+sub varFilter { # filter VCF variant records read from <>; records are staged and later emitted through varFilter_aux
+ my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
+ getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); # -G and -S are accepted here but not listed in the usage text below
+ die(qq/
+Usage: vcfutils.pl varFilter [options] <in.vcf>
+
+Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
+ -d INT minimum read depth [$opts{d}]
+ -D INT maximum read depth [$opts{D}]
+ -a INT minimum number of alternate bases [$opts{a}]
+ -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
+ -W INT window size for filtering adjacent gaps [$opts{W}]
+ -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
+ -2 FLOAT min P-value for baseQ bias [$opts{2}]
+ -3 FLOAT min P-value for mapQ bias [$opts{3}]
+ -4 FLOAT min P-value for end distance bias [$opts{4}]
+ -e FLOAT min P-value for HWE (plus F<0) [$opts{e}]
+ -p print filtered variants
+
+Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
+\n/) if (@ARGV == 0 && -t STDIN);
+
+ # calculate the window size
+ my ($ol, $ow) = ($opts{W}, $opts{w});
+ my $max_dist = $ol > $ow? $ol : $ow; # larger of the two windows; decides how long a record stays staged
+ # the core loop
+ my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
+ while (<>) {
+ my @t = split;
+ if (/^#/) {
+ print; next;
+ }
+ next if ($t[4] eq '.'); # skip non-var sites
+ next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
+ # check if the site is a SNP
+ my $type = 1; # SNP
+ if (length($t[3]) > 1) {
+ $type = 2; # MNP
+ my @s = split(',', $t[4]);
+ for (@s) {
+ $type = 3 if (length != length($t[3])); # an ALT whose length differs from REF: treat as indel
+ }
+ } else {
+ my @s = split(',', $t[4]);
+ for (@s) {
+ $type = 3 if (length > 1); # single-base REF with a longer ALT: treat as indel
+ }
+ }
+ # clear the out-of-range elements
+ while (@staging) {
+ # Still on the same chromosome and the first element's window still affects this position?
+ last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
+ varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
+ }
+ my $flt = 0; # 0 keeps the record; a non-zero value selects the tag letter in varFilter_aux
+ # parse annotations
+ my ($dp, $mq, $dp_alt) = (-1, -1, -1); # -1 marks an annotation that was not found
+ if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
+ $dp = $1 + $2 + $3 + $4;
+ $dp_alt = $3 + $4;
+ }
+ if ($t[7] =~ /DP=(\d+)/i) { # when this pattern matches, its value replaces the DP4 sum
+ $dp = $1;
+ }
+ $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
+ # the depth and mapQ filter
+ if ($dp >= 0) {
+ if ($dp < $opts{d}) {
+ $flt = 2;
+ } elsif ($dp > $opts{D}) {
+ $flt = 3;
+ }
+ }
+ $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a}); # too few alternate reads (last two DP4 counts); overrides a depth flag
+ $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); # mapping quality below -Q, only if nothing failed yet
+ $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
+ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
+ $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); # thresholds from -G and -S
+ # HWE filter
+ if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
+ my $p = 2*$1 + $2;
+ my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
+ $flt = 9 if ($f < 0);
+ }
+
+ my $score = $t[5] * 100 + $dp_alt; # QUAL-based score compared when two nearby records compete
+ my $rlen = length($t[3]) - 1; # reference bases beyond the first; 0 for a single-base REF
+ if ($flt == 0) {
+ if ($type == 3) { # an indel
+ # filtering SNPs and MNPs
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); # skip indels, already-flagged records and records more than -w before this indel
+ $x->[1] = 5;
+ }
+ # check the staging list for indel filtering
+ for my $x (@staging) {
+ next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]); # only unflagged staged indels within -W
+ if ($x->[0]>>2 < $score) { # the lower-scoring of the two indels is the one flagged
+ $x->[1] = 6;
+ } else {
+ $flt = 6; last;
+ }
+ }
+ } else { # SNP or MNP
+ for my $x (@staging) {
+ next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); # look for a staged indel within -w
+ if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
+ && length($x->[7]) - length($x->[6]) == 1) {
+ $x->[1] = 5; # in this case the staged indel is flagged instead of the current record
+ } else { $flt = 5; }
+ last; # only the first staged indel in range is examined
+ }
+ # check MNP
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]); # only staged SNPs/MNPs whose reference span reaches this position
+ if ($x->[0]>>2 < $score) {
+ $x->[1] = 8;
+ } else {
+ $flt = 8; last;
+ }
+ }
+ }
+ }
+ push(@staging, [$score<<2|$type, $flt, $rlen, @t]); # element 0 packs the type in its low two bits and the score above them
+ }
+ # output the last few elements in the staging list
+ while (@staging) {
+ varFilter_aux(shift @staging, $opts{p});
+ }
+}
+
+# Emit one staged record: unflagged records go to STDOUT; flagged ones go to
+# STDERR, prefixed with a one-letter tag, only when printing was requested.
+sub varFilter_aux {
+    my ($first, $is_print) = @_;
+    my $flt = $first->[1];
+    # elements 0..2 are bookkeeping; the VCF columns start at index 3
+    my @fields = @{$first}[3 .. $#{$first}];
+    if ($flt == 0) {
+        print join("\t", @fields), "\n";
+        return;
+    }
+    return unless ($is_print);
+    my $tag = substr("UQdDaGgPMS", $flt, 1);
+    print STDERR join("\t", $tag, @fields), "\n";
+}
+
+# Histogram of indel lengths (ALT length minus REF length, or the length given
+# by a leading '+'/'-' in ALT). Prints one "C <length> <count>" line per
+# observed length, then a "3" line with the total, the count whose length is a
+# multiple of three, and their ratio.
+sub gapstats {
+    my (@c0, @c1);
+    $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
+    while (<>) {
+        next if (/^#/);
+        my @t = split;
+        next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
+        my @s = split(',', $t[4]);
+        for my $x (@s) {
+            my $l = length($x) - length($t[3]) + 5000; # bins are offset by 5000 so deletions get a valid index
+            if ($x =~ /^-/) {
+                $l = -(length($x) - 1) + 5000;
+            } elsif ($x =~ /^\+/) {
+                $l = length($x) - 1 + 5000;
+            }
+            # Lengths outside the table would wrap to a negative array index
+            # (corrupting another bin) or land in a bin that is never printed.
+            next if ($l < 0 || $l >= 10000);
+            $c0[$l] += 1 / @s;
+        }
+    }
+    for (my $i = 0; $i < 10000; ++$i) {
+        next if ($c0[$i] == 0);
+        $c1[0] += $c0[$i];
+        $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
+        printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
+    }
+    # no record at all: report a ratio of 0 instead of dividing by zero
+    printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[0]? $c1[1]/$c1[0] : 0);
+}
+
+# Convert a tab-separated UCSC SNP table dump into a sites-only VCF.
+# Columns are used by position: 1 chrom, 2 start, 4 name, 6 strand, 7 reference
+# allele, 9 observed alleles, 10-13 molType/class/valid/frequency
+# (NOTE(review): column meanings inferred from how they are used here — confirm
+# against the table schema).
+sub ucscsnp2vcf {
+    die("Usage: vcfutils.pl ucscsnp2vcf <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
+    print "##fileformat=VCFv4.0\n";
+    print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
+    while (<>) {
+        chomp; # otherwise a final column keeps its newline and can end up inside INFO
+        my @t = split("\t");
+        my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
+        my $pos = $t[2] + 1;
+        my @alt;
+        push(@alt, $t[7]);
+        if ($t[6] eq '-') { # minus strand: reverse-complement the observed alleles
+            $t[9] = reverse($t[9]);
+            $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
+        }
+        my @a = split("/", $t[9]);
+        for (@a) {
+            push(@alt, $_) if ($_ ne $alt[0]);
+        }
+        if ($indel) {
+            # anchor indels one base to the left with an 'N' placeholder base
+            --$pos;
+            for (0 .. $#alt) {
+                $alt[$_] =~ tr/-//d;
+                $alt[$_] = "N$alt[$_]";
+            }
+        }
+        my $ref = shift(@alt);
+        my $af = $t[13] > 0? ";AF=$t[13]" : '';
+        my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
+        my $info = "molType=$t[10];class=$t[11]$valid$af";
+        print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
+    }
+}
+
+# Convert HapMap genotypes (read from the remaining arguments or STDIN) to VCF,
+# taking chromosome, position and reference allele from a UCSC SNP table
+# (first argument, optionally gzip-compressed). Sites whose alleles do not
+# include the reference, or that show a third allele, are skipped.
+sub hapmap2vcf {
+    die("Usage: vcfutils.pl hapmap2vcf <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
+    my $fn = shift(@ARGV);
+    # parse UCSC SNP
+    warn("Parsing UCSC SNPs...\n");
+    my ($fh, %map);
+    # Open without the shell and without 2-argument open, so that characters in
+    # the file name can never be read as a command or as an open mode.
+    if ($fn eq '-') {
+        $fh = \*STDIN; # 2-argument open treated "-" as standard input; keep that
+    } elsif ($fn =~ /\.gz$/) {
+        open($fh, '-|', 'gzip', '-dc', $fn) || die("[hapmap2vcf] fail to run gzip on '$fn': $!\n");
+    } else {
+        open($fh, '<', $fn) || die("[hapmap2vcf] fail to open '$fn': $!\n");
+    }
+    while (<$fh>) {
+        my @t = split;
+        next if ($t[3] - $t[2] != 1); # not SNP
+        $map{$t[4]} = [@t[1,3,7]]; # name -> (chrom, end position, reference allele)
+    }
+    close($fh);
+    # write VCF
+    warn("Writing VCF...\n");
+    print "##fileformat=VCFv4.0\n";
+    while (<>) {
+        my @t = split;
+        if ($t[0] eq 'rs#') { # the first line
+            print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
+        } else {
+            next unless ($map{$t[0]});
+            next if (length($t[1]) != 3); # skip non-SNPs
+            my $snp = $map{$t[0]};
+            my $ref = $snp->[2];
+            my @u = split('/', $t[1]);
+            # order the two alleles so that the reference comes first
+            if ($u[1] eq $ref) {
+                $u[1] = $u[0]; $u[0] = $ref;
+            } elsif ($u[0] ne $ref) { next; }
+            my $alt = $u[1];
+            my %w; # base -> allele index
+            $w{$u[0]} = 0; $w{$u[1]} = 1;
+            my @s = (@$snp[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
+            my $is_tri = 0;
+            for (@t[11..$#t]) {
+                if ($_ eq 'NN') {
+                    push(@s, './.');
+                } else {
+                    my @gt = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
+                    if (!defined($gt[0]) || !defined($gt[1])) { # a base that is neither allele
+                        $is_tri = 1;
+                        last;
+                    }
+                    push(@s, "$gt[0]/$gt[1]");
+                }
+            }
+            next if ($is_tri);
+            print join("\t", @s), "\n";
+        }
+    }
+}
+
+# Build a FASTQ-style consensus from an all-site VCF, one record per
+# chromosome. Positions without a record become 'n'; a base is upper-case only
+# when MQ and DP pass -Q/-d/-D; bases within -l of an indel are lower-cased by
+# v2q_post_process. The quality character comes from the absolute FQ value.
+sub vcf2fq {
+    my %opts = (d=>3, D=>100000, Q=>10, l=>5);
+    getopts('d:D:Q:l:', \%opts);
+    die(qq/
+Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
+
+Options: -d INT minimum depth [$opts{d}]
+ -D INT maximum depth [$opts{D}]
+ -Q INT min RMS mapQ [$opts{Q}]
+ -l INT INDEL filtering window [$opts{l}]
+\n/) if (@ARGV == 0 && -t STDIN);
+
+    my ($last_chr, $seq, $qual, $last_pos, @gaps);
+    my $_Q = $opts{Q};
+    my $_d = $opts{d};
+    my $_D = $opts{D};
+
+    # IUPAC code for each heterozygous REF/ALT pair
+    my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
+               GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
+
+    $last_chr = '';
+    while (<>) {
+        next if (/^#/);
+        my @t = split;
+        if ($last_chr ne $t[0]) {
+            # compare against '' rather than testing truth, so a chromosome named "0" is flushed too
+            &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr ne '');
+            ($last_chr, $last_pos) = ($t[0], 0);
+            $seq = $qual = '';
+            @gaps = ();
+        }
+        die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
+        if ($t[1] - $last_pos > 1) { # pad positions that have no record
+            $seq .= 'n' x ($t[1] - $last_pos - 1);
+            $qual .= '!' x ($t[1] - $last_pos - 1);
+        }
+        if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
+            my ($ref, $alt) = ($t[3], $1);
+            my $b;
+            my $q = 0; # stays 0 when FQ is absent (previously undefined, with the same numeric effect)
+            $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
+            if ($q < 0) { # negative FQ: pick REF or ALT by AF1
+                my $af = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
+                $b = ($af < .5 || $alt eq '.')? $ref : $alt;
+                $q = -$q;
+            } else { # otherwise emit the ambiguity code of the REF/ALT pair
+                $b = $het{"$ref$alt"};
+                $b ||= 'N';
+            }
+            $b = lc($b);
+            $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
+            $q = int($q + 33 + .499);
+            $q = chr($q <= 126? $q : 126);
+            $seq .= $b;
+            $qual .= $q;
+        } elsif ($t[4] ne '.') { # an INDEL
+            push(@gaps, [$t[1], length($t[3])]);
+        }
+        $last_pos = $t[1];
+    }
+    # nothing was read: do not print an empty "@"/"+" record
+    &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr ne '');
+}
+
+# Lower-case the consensus within $l bases of every recorded gap, then print
+# the chromosome as an "@name" / sequence / "+" / quality record.
+# $seq and $qual are references to the strings; $gaps is a reference to a list
+# of [position, reference length] pairs.
+sub v2q_post_process {
+    my ($chr, $seq, $qual, $gaps, $l) = @_;
+    foreach my $gap (@$gaps) {
+        my ($pos, $span) = ($gap->[0], $gap->[1]);
+        my $beg = 0;
+        $beg = $pos - $l if ($pos > $l);
+        my $end = $pos + $span + $l;
+        if ($end > length($$seq)) {
+            $end = length($$seq);
+        }
+        my $len = $end - $beg;
+        substr($$seq, $beg, $len) = lc(substr($$seq, $beg, $len));
+    }
+    print "\@$chr\n";
+    &v2q_print_str($seq);
+    print "+\n";
+    &v2q_print_str($qual);
+}
+
+# Print the string behind reference $s wrapped at 60 characters per line.
+sub v2q_print_str {
+    my ($s) = @_;
+    my $total = length($$s);
+    my $off = 0;
+    while ($off < $total) {
+        print substr($$s, $off, 60), "\n";
+        $off += 60;
+    }
+}
+
+# Print the top-level help text and exit through die().
+# Fixes the "endting" typo and marks vcf2fq with the same (*) as varFilter,
+# since the note only explains (*) and vcf2fq reads the FQ/AF1 annotations.
+sub usage {
+    die(qq/
+Usage: vcfutils.pl <command> [<arguments>]\n
+Command: subsam get a subset of samples
+ listsam list the samples
+ fillac fill the allele count field
+ qstats SNP stats stratified by QUAL
+
+ hapmap2vcf convert the hapmap format to VCF
+ ucscsnp2vcf convert UCSC SNP SQL dump to VCF
+
+ varFilter filtering short variants (*)
+ vcf2fq VCF->fastq (*)
+
+Notes: Commands with description ending with (*) may need bcftools
+ specific annotations.
+\n/);
+}
diff --git a/samtools-0.1.19/bedcov.c b/samtools-0.1.19/bedcov.c
new file mode 100644
index 0000000..3e4b952
--- /dev/null
+++ b/samtools-0.1.19/bedcov.c
@@ -0,0 +1,127 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "kstring.h"
+#include "bgzf.h"
+#include "bam.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+typedef struct { // per-input state that main_bedcov hands to read_bam() through its void* argument
+ bamFile fp; // BAM handle the records are read from
+ bam_iter_t iter; // iterator passed to bam_iter_read(); replaced for every BED line
+ int min_mapQ; // records with core.qual below this get BAM_FUNMAP set in read_bam()
+} aux_t;
+
+/* Read callback given to bam_mplp_init(): fetch the next record through this
+ * input's iterator and flag records below the mapping-quality cutoff as
+ * unmapped. Returns whatever bam_iter_read() returned. */
+static int read_bam(void *data, bam1_t *b)
+{
+    aux_t *state = (aux_t*)data;
+    const int status = bam_iter_read(state->fp, state->iter, b);
+    const int mapq = (int)b->core.qual;
+    if (mapq < state->min_mapQ)
+        b->core.flag |= BAM_FUNMAP;
+    return status;
+}
+
+/* "samtools bedcov": for every BED line print the line followed by one extra
+ * column per BAM file holding the summed per-base pileup depth over
+ * [beg, end). Option -Q sets the minimum mapping quality.
+ * Returns 0 on success, 1 on a usage error and 2 when an input cannot be
+ * opened or read. */
+int main_bedcov(int argc, char *argv[])
+{
+    extern void bam_init_header_hash(bam_header_t*);
+    gzFile fp;
+    kstring_t str;
+    kstream_t *ks;
+    bam_index_t **idx;
+    bam_header_t *h = 0;
+    aux_t **aux;
+    int *n_plp, dret, i, n, c, min_mapQ = 0;
+    int64_t *cnt;
+    const bam_pileup1_t **plp;
+
+    while ((c = getopt(argc, argv, "Q:")) >= 0) {
+        switch (c) {
+        case 'Q': min_mapQ = atoi(optarg); break;
+        }
+    }
+    if (optind + 2 > argc) {
+        fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
+        return 1;
+    }
+    memset(&str, 0, sizeof(kstring_t));
+    n = argc - optind - 1; // number of BAM files
+    aux = calloc(n, sizeof(void*));
+    idx = calloc(n, sizeof(void*));
+    for (i = 0; i < n; ++i) {
+        aux[i] = calloc(1, sizeof(aux_t));
+        aux[i]->min_mapQ = min_mapQ;
+        aux[i]->fp = bam_open(argv[i+optind+1], "r");
+        idx[i] = bam_index_load(argv[i+optind+1]);
+        if (aux[i]->fp == 0 || idx[i] == 0) {
+            fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
+            return 2;
+        }
+        bgzf_set_cache_size(aux[i]->fp, 20);
+        if (i == 0) h = bam_header_read(aux[0]->fp); // only the first header is used for name lookup
+    }
+    if (h == 0) { // would otherwise be dereferenced by the calls below
+        fprintf(stderr, "ERROR: fail to read the header of '%s'\n", argv[optind+1]);
+        return 2;
+    }
+    bam_init_header_hash(h);
+    cnt = calloc(n, 8);
+
+    fp = gzopen(argv[optind], "rb");
+    if (fp == 0) { // a NULL stream would otherwise be handed to the line reader
+        fprintf(stderr, "ERROR: fail to open BED file '%s'\n", argv[optind]);
+        return 2;
+    }
+    ks = ks_init(fp);
+    n_plp = calloc(n, sizeof(int));
+    plp = calloc(n, sizeof(void*));
+    while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
+        char *p, *q;
+        int tid, beg, end, pos;
+        bam_mplp_t mplp;
+
+        // column 1: sequence name, terminated by a tab
+        for (p = q = str.s; *p && *p != '\t'; ++p);
+        if (*p != '\t') goto bed_error;
+        *p = 0; tid = bam_get_tid(h, q); *p = '\t';
+        if (tid < 0) goto bed_error;
+        // column 2: start
+        for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
+        if (*p != '\t') goto bed_error;
+        *p = 0; beg = atoi(q); *p = '\t';
+        // column 3: end, followed by a tab or the end of the line
+        for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
+        if (*p == '\t' || *p == 0) {
+            int saved = *p;
+            *p = 0; end = atoi(q); *p = saved;
+        } else goto bed_error;
+
+        for (i = 0; i < n; ++i) {
+            if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
+            aux[i]->iter = bam_iter_query(idx[i], tid, beg, end);
+        }
+        mplp = bam_mplp_init(n, read_bam, (void**)aux);
+        bam_mplp_set_maxcnt(mplp, 64000);
+        memset(cnt, 0, 8 * n);
+        while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
+            if (pos >= beg && pos < end)
+                for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
+        for (i = 0; i < n; ++i) {
+            kputc('\t', &str);
+            kputl(cnt[i], &str);
+        }
+        puts(str.s);
+        bam_mplp_destroy(mplp);
+        continue;
+
+bed_error:
+        fprintf(stderr, "Errors in BED line '%s'\n", str.s);
+    }
+    free(n_plp); free(plp);
+    ks_destroy(ks);
+    gzclose(fp);
+
+    free(cnt);
+    for (i = 0; i < n; ++i) {
+        if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
+        bam_index_destroy(idx[i]);
+        bam_close(aux[i]->fp);
+        free(aux[i]);
+    }
+    bam_header_destroy(h);
+    free(aux); free(idx);
+    free(str.s);
+    return 0;
+}
diff --git a/samtools-0.1.19/bedidx.c b/samtools-0.1.19/bedidx.c
new file mode 100644
index 0000000..ec75a10
--- /dev/null
+++ b/samtools-0.1.19/bedidx.c
@@ -0,0 +1,162 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#ifdef _WIN32
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint64_t)
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct { // the intervals of one sequence name plus their bin index
+ int n, m; // n: intervals stored in a; m: capacity of a while reading, then overwritten by bed_index() with the n_idx value from bed_index_core()
+ uint64_t *a; // intervals packed as (uint64_t)beg<<32 | end
+ int *idx; // array returned by bed_index_core(): an offset into a for each (position >> LIDX_SHIFT) bin, -1 where no interval was recorded
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+/* Build the bin index for n intervals packed as beg<<32|end (callers sort
+ * them first). Entry b of the result is the smallest interval offset that
+ * touches bin b, where a bin covers 1<<LIDX_SHIFT positions, or -1 if none.
+ * *n_idx receives the number of meaningful entries: highest end bin + 1.
+ * Returns a malloc'd array the caller frees, or NULL when n is 0 or memory
+ * runs out (in which case *n_idx is 0). */
+int *bed_index_core(int n, uint64_t *a, int *n_idx)
+{
+    int i, j, m = 0, *idx = 0;
+    *n_idx = 0;
+    for (i = 0; i < n; ++i) {
+        int beg = a[i]>>32 >> LIDX_SHIFT;
+        int end = ((uint32_t)a[i]) >> LIDX_SHIFT;
+        if (m < end + 1) { // grow to the next power of two and mark the new bins empty
+            int oldm = m, *tmp;
+            m = end + 1;
+            kroundup32(m);
+            tmp = realloc(idx, m * sizeof(int));
+            if (tmp == 0) { // keep the old pointer valid until it is released
+                free(idx);
+                *n_idx = 0;
+                return 0;
+            }
+            idx = tmp;
+            for (j = oldm; j < m; ++j) idx[j] = -1;
+        }
+        // the first interval to reach a bin owns it
+        for (j = beg; j <= end; ++j)
+            if (idx[j] < 0) idx[j] = i;
+        // Intervals are ordered by start, so a later one may end in a lower
+        // bin; keep the maximum instead of the most recent value.
+        if (*n_idx < end + 1) *n_idx = end + 1;
+    }
+    return idx;
+}
+
+/* Sort the interval list of every sequence in the hash and rebuild its bin
+ * index; the index length reported by bed_index_core() is stored in p->m. */
+void bed_index(void *_h)
+{
+    reghash_t *h = (reghash_t*)_h;
+    khint_t it;
+    for (it = 0; it < kh_end(h); ++it) {
+        bed_reglist_t *list;
+        if (!kh_exist(h, it)) continue;
+        list = &kh_val(h, it);
+        free(list->idx); // drop a previous index, if any
+        ks_introsort(uint64_t, list->n, list->a);
+        list->idx = bed_index_core(list->n, list->a, &list->m);
+    }
+}
+
+/* Return 1 if [beg, end) overlaps any interval in p, 0 otherwise.
+ * The bin index only supplies a starting offset for the linear scan. It is
+ * bounded by p->m, the index length stored by bed_index(); the previous bound
+ * p->n is the number of intervals and could index past the end of p->idx. */
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
+{
+    int i, min_off = 0;
+    if (p->n == 0) return 0;
+    if (p->idx && p->m > 0) {
+        int bin = beg>>LIDX_SHIFT;
+        if (bin >= p->m) bin = p->m - 1;
+        // walk back to the nearest bin that has an interval recorded
+        // TODO: this block can be improved, but speed should not matter too much here
+        for (i = bin; i >= 0; --i)
+            if (p->idx[i] >= 0) break;
+        min_off = i >= 0? p->idx[i] : 0;
+    }
+    for (i = min_off; i < p->n; ++i) {
+        if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+        if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+            return 1; // find the overlap; return
+    }
+    return 0;
+}
+
+/* Return 1 if [beg, end) on sequence chr overlaps an interval stored in the
+ * hash _h; 0 if not, if the sequence is unknown, or if _h is NULL. */
+int bed_overlap(const void *_h, const char *chr, int beg, int end)
+{
+    const reghash_t *h = (const reghash_t*)_h;
+    khint_t it;
+    if (h == 0) return 0;
+    it = kh_get(reg, h, chr);
+    return it != kh_end(h)? bed_overlap_core(&kh_val(h, it), beg, end) : 0;
+}
+
+/* Read a region list from fn ("-" for stdin; opened through zlib's gz API)
+ * into a hash keyed by sequence name and index it with bed_index().
+ * Lines are "chr beg end" or, with a single number, "chr pos" (stored as
+ * [pos-1, pos)). Returns the hash, to be released with bed_destroy(), or NULL
+ * if the file cannot be opened. */
+void *bed_read(const char *fn)
+{
+    reghash_t *h;
+    gzFile fp;
+    kstream_t *ks;
+    int dret;
+    kstring_t *str;
+    // open before allocating anything: creating the hash first leaked it when the open failed
+    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+    if (fp == 0) return 0;
+    h = kh_init(reg);
+    str = calloc(1, sizeof(kstring_t));
+    ks = ks_init(fp);
+    while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+        int beg = -1, end = -1;
+        bed_reglist_t *p;
+        khint_t k = kh_get(reg, h, str->s);
+        if (k == kh_end(h)) { // absent from the hash table
+            int ret;
+            char *s = strdup(str->s);
+            k = kh_put(reg, h, s, &ret);
+            memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+        }
+        p = &kh_val(h, k);
+        if (dret != '\n') { // if the lines has other characters
+            if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
+                beg = atoi(str->s); // begin
+                if (dret != '\n') {
+                    if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
+                        end = atoi(str->s); // end
+                        if (end < beg) end = -1;
+                    }
+                }
+            }
+        }
+        if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+        if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+        if (beg >= 0 && end > beg) {
+            if (p->n == p->m) { // grow the interval array geometrically
+                p->m = p->m? p->m<<1 : 4;
+                p->a = realloc(p->a, p->m * 8);
+            }
+            p->a[p->n++] = (uint64_t)beg<<32 | end;
+        }
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    bed_index(h);
+    return h;
+}
+
+/* Free a hash returned by bed_read(): every interval array, index and key
+ * string, then the table itself. A NULL handle is ignored, matching
+ * bed_overlap(), which also accepts NULL. */
+void bed_destroy(void *_h)
+{
+    reghash_t *h = (reghash_t*)_h;
+    khint_t k;
+    if (h == 0) return;
+    for (k = 0; k < kh_end(h); ++k) {
+        if (kh_exist(h, k)) {
+            free(kh_val(h, k).a);
+            free(kh_val(h, k).idx);
+            free((char*)kh_key(h, k)); // keys were strdup'ed in bed_read()
+        }
+    }
+    kh_destroy(reg, h);
+}
diff --git a/samtools-0.1.19/bgzf.c b/samtools-0.1.19/bgzf.c
new file mode 100644
index 0000000..880d5af
--- /dev/null
+++ b/samtools-0.1.19/bgzf.c
@@ -0,0 +1,694 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include "bgzf.h"
+
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+typedef knetFile *_bgzf_file_t;
+#define _bgzf_open(fn, mode) knet_open(fn, mode)
+#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
+#define _bgzf_close(fp) knet_close(fp)
+#define _bgzf_fileno(fp) ((fp)->fd)
+#define _bgzf_tell(fp) knet_tell(fp)
+#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
+#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
+#else // ~defined(_USE_KNETFILE)
+#if defined(_WIN32) || defined(_MSC_VER)
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else // ~defined(_WIN32)
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif // ~defined(_WIN32)
+typedef FILE *_bgzf_file_t;
+#define _bgzf_open(fn, mode) fopen(fn, mode)
+#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
+#define _bgzf_close(fp) fclose(fp)
+#define _bgzf_fileno(fp) fileno(fp)
+#define _bgzf_tell(fp) ftello(fp)
+#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
+#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
+#endif // ~define(_USE_KNETFILE)
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+
+/* BGZF/GZIP header (specialized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139|  8|  4|              0|  0|255|      6| 66| 67|      2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ One cell per byte. The wide cells are multi-byte fields: a 4-byte zero field,
+ the 2-byte value 6, the 2-byte value 2 and the 2-byte BLK_LEN, which
+ bgzf_compress() fills in at offset 16. The array below holds these 18 bytes
+ plus the string terminator.
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
+
+#ifdef BGZF_CACHE
+typedef struct { // one cached block; entries live in the int64-keyed hash declared just below
+ int size; // copied from fp->block_length when the block is cached
+ uint8_t *block; // heap copy of fp->uncompressed_block (BGZF_MAX_BLOCK_SIZE bytes)
+ int64_t end_offset; // block address plus the size given to cache_block(); the file is repositioned here on a cache hit
+} cache_t;
+#include "khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t)
+#endif
+
+/* Store a 16-bit value into buffer[0..1], least significant byte first. */
+static inline void packInt16(uint8_t *buffer, uint16_t value)
+{
+    buffer[0] = (uint8_t)(value & 0xff);
+    buffer[1] = (uint8_t)((value >> 8) & 0xff);
+}
+
+/* Read a 16-bit little-endian value from buffer[0..1]. */
+static inline int unpackInt16(const uint8_t *buffer)
+{
+    const int low = buffer[0];
+    const int high = buffer[1];
+    return (high << 8) | low;
+}
+
+/* Store a 32-bit value into buffer[0..3], least significant byte first. */
+static inline void packInt32(uint8_t *buffer, uint32_t value)
+{
+    int i;
+    for (i = 0; i < 4; ++i)
+        buffer[i] = (uint8_t)((value >> (8 * i)) & 0xff);
+}
+
+/* Allocate a BGZF handle for reading together with its two block buffers
+ * (and the block cache when BGZF_CACHE is defined).
+ * Returns NULL if an allocation fails; previously a failed allocation was
+ * dereferenced straight away. */
+static BGZF *bgzf_read_init()
+{
+    BGZF *fp;
+    fp = calloc(1, sizeof(BGZF));
+    if (fp == 0) return 0;
+    fp->is_write = 0;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (fp->uncompressed_block == 0 || fp->compressed_block == 0) {
+        free(fp->uncompressed_block);
+        free(fp->compressed_block);
+        free(fp);
+        return 0;
+    }
+#ifdef BGZF_CACHE
+    fp->cache = kh_init(cache);
+#endif
+    return fp;
+}
+
+/* Allocate a BGZF handle for writing together with its two block buffers.
+ * compress_level: 0..9; a negative value or one above 9 selects
+ * Z_DEFAULT_COMPRESSION.
+ * Returns NULL if an allocation fails; previously a failed allocation was
+ * dereferenced straight away. */
+static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
+{
+    BGZF *fp;
+    fp = calloc(1, sizeof(BGZF));
+    if (fp == 0) return 0;
+    fp->is_write = 1;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (fp->uncompressed_block == 0 || fp->compressed_block == 0) {
+        free(fp->uncompressed_block);
+        free(fp->compressed_block);
+        free(fp);
+        return 0;
+    }
+    fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+    if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
+    return fp;
+}
+// Derive the compression level from an open-mode string: 0 if it contains 'u',
+// otherwise the value of the first decimal digit, otherwise -1.
+static int mode2level(const char *__restrict mode)
+{
+    const char *c;
+    if (strchr(mode, 'u') != NULL)
+        return 0; // 'u' wins over any digit
+    for (c = mode; *c != '\0'; ++c) {
+        if (*c >= '0' && *c <= '9')
+            return *c - '0';
+    }
+    return -1;
+}
+
+/* Open path as a BGZF stream. A mode containing 'r'/'R' opens for reading;
+ * otherwise one containing 'w'/'W' opens for writing, with the compression
+ * level taken from the mode string by mode2level().
+ * Returns NULL if the file cannot be opened, the handle cannot be allocated,
+ * or the mode names neither direction. */
+BGZF *bgzf_open(const char *path, const char *mode)
+{
+    BGZF *fp = 0;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r') || strchr(mode, 'R')) {
+        _bgzf_file_t fpr;
+        // NOTE(review): mode left as "r" because _bgzf_open may expand to knet_open,
+        // whose mode handling is not visible here.
+        if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
+        fp = bgzf_read_init();
+        if (fp == 0) { _bgzf_close(fpr); return 0; } // do not leak the open file
+        fp->fp = fpr;
+    } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+        FILE *fpw;
+        // binary mode: the compressed stream must not go through newline translation
+        if ((fpw = fopen(path, "wb")) == 0) return 0;
+        fp = bgzf_write_init(mode2level(mode));
+        if (fp == 0) { fclose(fpw); return 0; }
+        fp->fp = fpw;
+    }
+    return fp;
+}
+
+/* Same as bgzf_open(), but wraps an already open file descriptor.
+ * Returns NULL if the descriptor cannot be wrapped, the handle cannot be
+ * allocated, or the mode names neither direction. */
+BGZF *bgzf_dopen(int fd, const char *mode)
+{
+    BGZF *fp = 0;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r') || strchr(mode, 'R')) {
+        _bgzf_file_t fpr;
+        // NOTE(review): mode left as "r" because _bgzf_dopen may expand to knet_dopen,
+        // whose mode handling is not visible here.
+        if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
+        fp = bgzf_read_init();
+        if (fp == 0) { _bgzf_close(fpr); return 0; } // do not leak the stream
+        fp->fp = fpr;
+    } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+        FILE *fpw;
+        // binary mode: the compressed stream must not go through newline translation
+        if ((fpw = fdopen(fd, "wb")) == 0) return 0;
+        fp = bgzf_write_init(mode2level(mode));
+        if (fp == 0) { fclose(fpw); return 0; }
+        fp->fp = fpw;
+    }
+    return fp;
+}
+
+/* Compress slen bytes from src into one complete BGZF block at _dst.
+ * On entry *dlen is the capacity of _dst; on success it becomes the total
+ * block length (header + deflate data + footer).
+ * Returns 0 on success, -1 if zlib reports an error or the data do not fit. */
+static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
+{
+    uint32_t crc;
+    z_stream zs;
+    uint8_t *dst = (uint8_t*)_dst;
+
+    // compress the body
+    // zlib requires zalloc, zfree and opaque to be set before the init call
+    zs.zalloc = NULL; zs.zfree = NULL; zs.opaque = NULL;
+    zs.next_in = src;
+    zs.avail_in = slen;
+    zs.next_out = dst + BLOCK_HEADER_LENGTH;
+    zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+    if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
+    if (deflate(&zs, Z_FINISH) != Z_STREAM_END) {
+        deflateEnd(&zs); // release the deflate state; it used to leak on this path
+        return -1;
+    }
+    if (deflateEnd(&zs) != Z_OK) return -1;
+    *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+    // write the header
+    memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+    packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
+    // write the footer: CRC32 of the input, then the input length
+    crc = crc32(crc32(0L, NULL, 0L), src, slen);
+    packInt32((uint8_t*)&dst[*dlen - 8], crc);
+    packInt32((uint8_t*)&dst[*dlen - 4], slen);
+    return 0;
+}
+
+// Compress the first block_length bytes of fp->uncompressed_block into
+// fp->compressed_block as one BGZF block and reset fp->block_offset.
+// Returns the compressed block size, or -1 with BGZF_ERR_ZLIB set in fp->errcode.
+static int deflate_block(BGZF *fp, int block_length)
+{
+    int out_len = BGZF_MAX_BLOCK_SIZE;
+    const int rc = bgzf_compress(fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level);
+    if (rc == 0) {
+        fp->block_offset = 0;
+        return out_len;
+    }
+    fp->errcode |= BGZF_ERR_ZLIB;
+    return -1;
+}
+
+// Inflate the block in fp->compressed_block into fp->uncompressed_block.
+// block_length is the length of the whole compressed block.
+// Returns the number of uncompressed bytes, or -1 with BGZF_ERR_ZLIB set in fp->errcode.
+static int inflate_block(BGZF* fp, int block_length)
+{
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.opaque = NULL; // zlib requires zalloc, zfree and opaque to be set before the init call
+    // NOTE(review): 18 equals BLOCK_HEADER_LENGTH; "- 16" leaves avail_in larger than
+    // block_length minus header and footer — presumably harmless because inflate() stops
+    // at the end of the deflate stream; confirm before tightening.
+    zs.next_in = fp->compressed_block + 18;
+    zs.avail_in = block_length - 16;
+    zs.next_out = fp->uncompressed_block;
+    zs.avail_out = BGZF_MAX_BLOCK_SIZE;
+
+    if (inflateInit2(&zs, -15) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
+        inflateEnd(&zs);
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflateEnd(&zs) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    return zs.total_out;
+}
+
+/* Return 1 if the 18 header bytes look like a BGZF block header: the three
+ * leading bytes 31/139/8, bit 2 of byte 3 set, a 16-bit 6 at offset 10, the
+ * bytes 'B','C' at offsets 12-13 and a 16-bit 2 at offset 14; 0 otherwise. */
+static int check_header(const uint8_t *header)
+{
+    if (header[0] != 31 || header[1] != 139 || header[2] != 8) return 0;
+    if ((header[3] & 4) == 0) return 0;
+    if (unpackInt16(&header[10]) != 6) return 0;
+    if (header[12] != 'B' || header[13] != 'C') return 0;
+    return unpackInt16(&header[14]) == 2;
+}
+
+#ifdef BGZF_CACHE
+/* Release every cached block and the cache table of a read handle; does
+ * nothing for handles opened for writing. */
+static void free_cache(BGZF *fp)
+{
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    khint_t it;
+    if (fp->is_write) return;
+    for (it = kh_begin(h); it < kh_end(h); ++it) {
+        if (!kh_exist(h, it)) continue;
+        free(kh_val(h, it).block);
+    }
+    kh_destroy(cache, h);
+}
+
+/* Look up block_address in the cache. On a hit, copy the cached block into
+ * fp->uncompressed_block, update the block bookkeeping, move the file
+ * position to the stored end offset and return the block size; return 0 on a
+ * miss. */
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    const khint_t it = kh_get(cache, h, block_address);
+    cache_t *entry;
+    if (it == kh_end(h))
+        return 0;
+    entry = &kh_val(h, it);
+    if (fp->block_length != 0)
+        fp->block_offset = 0;
+    fp->block_address = block_address;
+    fp->block_length = entry->size;
+    memcpy(fp->uncompressed_block, entry->block, BGZF_MAX_BLOCK_SIZE);
+    _bgzf_seek((_bgzf_file_t)fp->fp, entry->end_offset, SEEK_SET);
+    return entry->size;
+}
+
+// Store a copy of the current uncompressed block in the cache, keyed by fp->block_address.
+// size is the number of bytes the block occupied in the file; it is used to record
+// the file offset that follows the block. Does nothing when the cache is too small
+// for even one block; silently skips caching when memory cannot be allocated.
+static void cache_block(BGZF *fp, int size)
+{
+    int ret;
+    khint_t k;
+    cache_t *p;
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
+    if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) {
+        /* A better way would be to remove the oldest block in the
+         * cache, but here we remove a random one for simplicity. This
+         * should not have a big impact on performance. */
+        for (k = kh_begin(h); k < kh_end(h); ++k)
+            if (kh_exist(h, k)) break;
+        if (k < kh_end(h)) {
+            free(kh_val(h, k).block);
+            kh_del(cache, h, k);
+        }
+    }
+    k = kh_put(cache, h, fp->block_address, &ret);
+    if (ret == 0) return; // if this happens, a bug!
+    p = &kh_val(h, k);
+    p->size = fp->block_length;
+    p->end_offset = fp->block_address + size;
+    p->block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (p->block == NULL) {
+        // out of memory: drop the entry again instead of copying through a NULL pointer
+        kh_del(cache, h, k);
+        return;
+    }
+    memcpy(p->block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+}
+#else
+// No-op versions used when BGZF_CACHE is not defined: nothing is ever cached,
+// and load_block_from_cache() always reports a miss by returning 0.
+static void free_cache(BGZF *fp) {}
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
+static void cache_block(BGZF *fp, int size) {}
+#endif
+
+// Read and inflate the block at the current file position into fp->uncompressed_block.
+// Returns 0 on success (fp->block_length is set to 0 when no data could be read)
+// and -1 on error, with the cause OR-ed into fp->errcode.
+int bgzf_read_block(BGZF *fp)
+{
+    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+    int count, size = 0, block_length, remaining;
+    int64_t block_address;
+    block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
+    count = _bgzf_read(fp->fp, header, sizeof(header));
+    if (count == 0) { // no data read
+        fp->block_length = 0;
+        return 0;
+    }
+    if (count != sizeof(header) || !check_header(header)) {
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    size = count;
+    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+    if (block_length < BLOCK_HEADER_LENGTH) {
+        // a corrupt size field would make "remaining" negative and turn into a huge read length
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    compressed_block = (uint8_t*)fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+    if (count != remaining) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    size += count;
+    if ((count = inflate_block(fp, block_length)) < 0) return -1;
+    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+    fp->block_address = block_address;
+    fp->block_length = count;
+    cache_block(fp, size);
+    return 0;
+}
+
+ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
+{
+    // Copy up to length uncompressed bytes into data, loading further blocks as needed.
+    // Returns the number of bytes copied (short at end of file) or -1 when a block cannot be read.
+    uint8_t *dst = data;
+    ssize_t done = 0;
+    if (length <= 0) return 0;
+    assert(fp->is_write == 0);
+    while (done < length) {
+        int n, avail = fp->block_length - fp->block_offset;
+        if (avail <= 0) {
+            // current block exhausted: fetch the next one
+            if (bgzf_read_block(fp) != 0) return -1;
+            avail = fp->block_length - fp->block_offset;
+            if (avail <= 0) break;
+        }
+        n = length - done < avail? length - done : avail;
+        memcpy(dst, (uint8_t*)fp->uncompressed_block + fp->block_offset, n);
+        dst += n;
+        done += n;
+        fp->block_offset += n;
+    }
+    if (fp->block_offset == fp->block_length) {
+        // the block was consumed exactly: point at the start of the next block
+        fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+        fp->block_offset = fp->block_length = 0;
+    }
+    return done;
+}
+
+/***** BEGIN: multi-threading *****/
+
+// Per-worker state for multi-threaded compression.
+typedef struct {
+ BGZF *fp; // handle being written; its compress_level is read by the worker
+ struct mtaux_t *mt; // shared state of all workers
+ void *buf; // scratch buffer receiving the compressed output of one block
+ int i, errcode, toproc; // i: first block index handled by this worker; errcode: BGZF_ERR_* bits from the last round; toproc: set by mt_flush() to start a round
+} worker_t;
+
+// State shared between the master thread and the workers.
+typedef struct mtaux_t {
+ int n_threads, n_blks, curr, done; // n_blks: capacity of blk/len; curr: number of queued blocks; done: set by mt_destroy() to make workers exit
+ volatile int proc_cnt; // number of workers that finished the current round
+ void **blk; // queued block buffers, compressed in place
+ int *len; // length of each entry of blk
+ worker_t *w;
+ pthread_t *tid;
+ pthread_mutex_t lock; // protects toproc/done together with cv
+ pthread_cond_t cv;
+} mtaux_t;
+
+// Run one round for worker w: block until w->toproc or mt->done is set, then
+// compress blocks w->i, w->i + n_threads, ... below mt->curr in place.
+// Returns 1 when the worker should exit and 0 after a completed round.
+static int worker_aux(worker_t *w)
+{
+    int i, stop = 0;
+    // wait for condition: to process or all done
+    pthread_mutex_lock(&w->mt->lock);
+    while (!w->toproc && !w->mt->done)
+        pthread_cond_wait(&w->mt->cv, &w->mt->lock);
+    if (w->mt->done) stop = 1;
+    w->toproc = 0;
+    pthread_mutex_unlock(&w->mt->lock);
+    if (stop) return 1; // to quit the thread
+    w->errcode = 0;
+    for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {
+        int clen = BGZF_MAX_BLOCK_SIZE;
+        if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0)
+            w->errcode |= BGZF_ERR_ZLIB;
+        // the compressed bytes replace the uncompressed ones in the shared slot
+        memcpy(w->mt->blk[i], w->buf, clen);
+        w->mt->len[i] = clen;
+    }
+    // report completion; the previous counter value is not needed
+    __sync_fetch_and_add(&w->mt->proc_cnt, 1);
+    return 0;
+}
+
+static void *mt_worker(void *data)
+{
+    // thread entry point: keep running rounds until worker_aux() asks to stop
+    for (;;) {
+        if (worker_aux(data) != 0) break;
+    }
+    return 0;
+}
+
+// Enable multi-threaded compression on a handle opened for writing.
+// n_threads must be greater than 1 and n_sub_blks positive; n_threads * n_sub_blks
+// blocks are buffered before they are compressed. Returns 0 on success and -1 when
+// the handle is not writable, threading is already enabled, or an argument is out of range.
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+{
+    int i;
+    mtaux_t *mt;
+    pthread_attr_t attr;
+    // n_sub_blks <= 0 would give an empty queue and trip the assertion in mt_queue()
+    if (!fp->is_write || fp->mt || n_threads <= 1 || n_sub_blks <= 0) return -1;
+    mt = calloc(1, sizeof(mtaux_t));
+    mt->n_threads = n_threads;
+    mt->n_blks = n_threads * n_sub_blks;
+    mt->len = calloc(mt->n_blks, sizeof(int));
+    mt->blk = calloc(mt->n_blks, sizeof(void*));
+    for (i = 0; i < mt->n_blks; ++i)
+        mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
+    mt->tid = calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
+    mt->w = calloc(mt->n_threads, sizeof(worker_t));
+    for (i = 0; i < mt->n_threads; ++i) {
+        mt->w[i].i = i;
+        mt->w[i].mt = mt;
+        mt->w[i].fp = fp;
+        mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
+    }
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    pthread_mutex_init(&mt->lock, 0);
+    pthread_cond_init(&mt->cv, 0);
+    for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
+        pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]);
+    pthread_attr_destroy(&attr); // the attribute object is no longer needed once the threads exist
+    fp->mt = mt;
+    return 0;
+}
+
+// Stop the worker threads and free everything owned by mt, including mt itself.
+static void mt_destroy(mtaux_t *mt)
+{
+ int i;
+ // signal all workers to quit
+ pthread_mutex_lock(&mt->lock);
+ mt->done = 1; mt->proc_cnt = 0;
+ pthread_cond_broadcast(&mt->cv);
+ pthread_mutex_unlock(&mt->lock);
+ // wait for every created thread before freeing memory they may still touch
+ for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread
+ // free other data allocated on heap
+ for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+ for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+ free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+ pthread_cond_destroy(&mt->cv);
+ pthread_mutex_destroy(&mt->lock);
+ free(mt);
+}
+
+static void mt_queue(BGZF *fp)
+{
+    // Move the pending uncompressed bytes of fp into the next free queue slot.
+    mtaux_t *mt = (mtaux_t*)fp->mt;
+    int slot;
+    assert(mt->curr < mt->n_blks); // guaranteed by the caller
+    slot = mt->curr;
+    mt->len[slot] = fp->block_offset;
+    memcpy(mt->blk[slot], fp->uncompressed_block, fp->block_offset);
+    mt->curr = slot + 1;
+    fp->block_offset = 0;
+}
+
+// Compress every queued block with all workers and write the results to fp->fp in queue order.
+// Always returns 0; compression and write failures are only recorded in fp->errcode.
+static int mt_flush(BGZF *fp)
+{
+ int i;
+ mtaux_t *mt = (mtaux_t*)fp->mt;
+ if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
+ // signal all the workers to compress
+ pthread_mutex_lock(&mt->lock);
+ for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
+ mt->proc_cnt = 0;
+ pthread_cond_broadcast(&mt->cv);
+ pthread_mutex_unlock(&mt->lock);
+ // worker 0 is doing things here
+ worker_aux(&mt->w[0]);
+ // wait for all the threads to complete
+ // NOTE(review): this is a busy-wait on the volatile counter, not a condition-variable wait
+ while (mt->proc_cnt < mt->n_threads);
+ // dump data to disk
+ for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
+ for (i = 0; i < mt->curr; ++i)
+ if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i])
+ fp->errcode |= BGZF_ERR_IO;
+ // the queue is empty again
+ mt->curr = 0;
+ return 0;
+}
+
+static int mt_lazy_flush(BGZF *fp)
+{
+    // Queue pending data, but only compress and write once the queue is full.
+    // Returns -1 when nothing was flushed, otherwise the result of mt_flush().
+    mtaux_t *mt = (mtaux_t*)fp->mt;
+    if (fp->block_offset) mt_queue(fp);
+    if (mt->curr != mt->n_blks) return -1;
+    return mt_flush(fp);
+}
+
+static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length)
+{
+    // Append data to the uncompressed buffer, handing each full block to mt_lazy_flush().
+    const uint8_t *src = data;
+    ssize_t rest = length;
+    while (rest) {
+        int room = BGZF_BLOCK_SIZE - fp->block_offset;
+        int n = room < rest? room : rest;
+        memcpy(fp->uncompressed_block + fp->block_offset, src, n);
+        src += n;
+        rest -= n;
+        fp->block_offset += n;
+        if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp);
+    }
+    return length - rest;
+}
+
+/***** END: multi-threading *****/
+
+int bgzf_flush(BGZF *fp)
+{
+    // Compress and write whatever is pending in the uncompressed buffer.
+    // Returns 0 on success (and for read handles), -1 on compression or write failure.
+    if (!fp->is_write) return 0;
+    if (fp->mt) return mt_flush(fp);
+    while (fp->block_offset > 0) {
+        int clen = deflate_block(fp, fp->block_offset);
+        if (clen < 0) return -1;
+        if (fwrite(fp->compressed_block, 1, clen, fp->fp) != clen) {
+            fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+            return -1;
+        }
+        fp->block_address += clen;
+    }
+    return 0;
+}
+
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+    // Flush only when size more bytes would not fit into the current block;
+    // returns -1 when no flush was attempted.
+    if (fp->block_offset + size <= BGZF_BLOCK_SIZE) return -1;
+    if (fp->mt) return mt_lazy_flush(fp);
+    return bgzf_flush(fp);
+}
+
+ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
+{
+    // Buffer data into the current block, flushing each time the block fills up.
+    // Returns the number of bytes accepted; the loop stops early when a flush fails.
+    const uint8_t *src = data;
+    int total = 0;
+    assert(fp->is_write);
+    if (fp->mt) return mt_write(fp, data, length);
+    while (total < length) {
+        int room = BGZF_BLOCK_SIZE - fp->block_offset;
+        int n = room < length - total? room : length - total;
+        memcpy((uint8_t*)fp->uncompressed_block + fp->block_offset, src, n);
+        fp->block_offset += n;
+        src += n;
+        total += n;
+        if (fp->block_offset == BGZF_BLOCK_SIZE) {
+            if (bgzf_flush(fp)) break;
+        }
+    }
+    return total;
+}
+
+// Flush pending data (write mode also appends an empty block), close the
+// underlying file and free the handle. Returns 0 on success and -1 on error.
+// On error the handle is NOT freed, so callers may still read fp->errcode.
+int bgzf_close(BGZF* fp)
+{
+    int ret, block_length;
+    if (fp == 0) return -1;
+    if (fp->is_write) {
+        if (bgzf_flush(fp) != 0) return -1;
+        fp->compress_level = -1;
+        block_length = deflate_block(fp, 0); // write an empty block
+        if (block_length < 0) return -1; // errcode set by deflate_block(); never pass -1 to fwrite as a size
+        if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != (size_t)block_length) {
+            fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+            return -1;
+        }
+        if (fflush(fp->fp) != 0) {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        if (fp->mt) mt_destroy(fp->mt);
+    }
+    ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp);
+    if (ret != 0) return -1;
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free_cache(fp);
+    free(fp);
+    return 0;
+}
+
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+    // a null handle is silently ignored
+    if (fp == 0) return;
+    fp->cache_size = cache_size;
+}
+
+// Check whether the last 28 bytes of the file equal the expected end-of-file block.
+// The file position is restored before returning. Returns 1 when the marker
+// is present and 0 when it is absent or could not be read.
+int bgzf_check_EOF(BGZF *fp)
+{
+    static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
+    uint8_t buf[28];
+    off_t offset;
+    int n;
+    offset = _bgzf_tell((_bgzf_file_t)fp->fp);
+    if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
+    n = _bgzf_read(fp->fp, buf, 28);
+    _bgzf_seek(fp->fp, offset, SEEK_SET);
+    if (n != 28) return 0; // short read: buf is not fully initialised, so do not compare it
+    return (memcmp(magic, buf, 28) == 0)? 1 : 0;
+}
+
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    // pos packs a file offset (upper bits) and an offset inside the block (low 16 bits).
+    // Only SEEK_SET on read handles is supported. Returns 0 on success, -1 on error.
+    int64_t file_off;
+    int in_block;
+
+    if (fp->is_write || where != SEEK_SET) {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+    file_off = pos >> 16;
+    in_block = pos & 0xFFFF;
+    if (_bgzf_seek(fp->fp, file_off, SEEK_SET) < 0) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_address = file_off;
+    fp->block_offset = in_block;
+    fp->block_length = 0; // indicates current block has not been loaded
+    return 0;
+}
+
+int bgzf_is_bgzf(const char *fn)
+{
+    // Returns 1 when the first 16 bytes of fn equal g_magic, 0 otherwise or when fn cannot be read.
+    uint8_t buf[16];
+    int n;
+    _bgzf_file_t fp;
+    fp = _bgzf_open(fn, "r");
+    if (fp == 0) return 0;
+    n = _bgzf_read(fp, buf, 16);
+    _bgzf_close(fp);
+    if (n == 16 && memcmp(g_magic, buf, 16) == 0) return 1;
+    return 0;
+}
+
+int bgzf_getc(BGZF *fp)
+{
+    // Returns the next uncompressed byte, -1 at end of file, or -2 when a block cannot be read.
+    unsigned char *blk;
+    int ch;
+    if (fp->block_offset >= fp->block_length) {
+        if (bgzf_read_block(fp) != 0) return -2; /* error */
+        if (fp->block_length == 0) return -1; /* end-of-file */
+    }
+    blk = (unsigned char*)fp->uncompressed_block;
+    ch = blk[fp->block_offset];
+    ++fp->block_offset;
+    if (fp->block_offset != fp->block_length) return ch;
+    // block consumed: point at the start of the next one
+    fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    return ch;
+}
+
+// Round x up, in place, to a power of two: decrement, smear the highest set bit
+// into all lower bits, then increment. x is evaluated several times, so it must
+// be a plain lvalue without side effects.
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+// Read bytes up to (not including) the next delim into str, replacing its previous content.
+// Returns str->l, except that -1 (end of file) or -2 (block read error) is returned
+// when nothing had been stored before that condition occurred.
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+ int l, state = 0;
+ unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+ str->l = 0;
+ do {
+ // load the next block when the current one is exhausted
+ if (fp->block_offset >= fp->block_length) {
+ if (bgzf_read_block(fp) != 0) { state = -2; break; }
+ if (fp->block_length == 0) { state = -1; break; }
+ }
+ // scan for the delimiter inside the current block
+ for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+ if (l < fp->block_length) state = 1;
+ l -= fp->block_offset;
+ // grow str so that l more bytes plus a terminating NUL fit
+ // NOTE(review): the realloc result is not checked
+ if (str->l + l + 1 >= str->m) {
+ str->m = str->l + l + 2;
+ kroundup32(str->m);
+ str->s = (char*)realloc(str->s, str->m);
+ }
+ memcpy(str->s + str->l, buf + fp->block_offset, l);
+ str->l += l;
+ // the +1 skips the delimiter (or steps past the block end when none was found)
+ fp->block_offset += l + 1;
+ if (fp->block_offset >= fp->block_length) {
+ fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ }
+ } while (state == 0);
+ if (str->l == 0 && state < 0) return state;
+ str->s[str->l] = 0;
+ return str->l;
+}
diff --git a/samtools-0.1.19/bgzf.h b/samtools-0.1.19/bgzf.h
new file mode 100644
index 0000000..cb67681
--- /dev/null
+++ b/samtools-0.1.19/bgzf.h
@@ -0,0 +1,207 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/* The BGZF library was originally written by Bob Handsaker from the Broad
+ * Institute. It was later improved by the SAMtools developers. */
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <sys/types.h>
+
+#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
+#define BGZF_MAX_BLOCK_SIZE 0x10000
+
+#define BGZF_ERR_ZLIB 1
+#define BGZF_ERR_HEADER 2
+#define BGZF_ERR_IO 4
+#define BGZF_ERR_MISUSE 8
+
+// State of one BGZF stream.
+typedef struct {
+ // errcode: OR of BGZF_ERR_* flags; is_write: non-zero when opened for writing;
+ // compress_level: level handed to the compressor
+ int errcode:16, is_write:2, compress_level:14;
+ int cache_size; // see bgzf_set_cache_size()
+ int block_length, block_offset; // size of the loaded uncompressed block and current position inside it
+ int64_t block_address; // file offset of the current block
+ void *uncompressed_block, *compressed_block;
+ void *cache; // a pointer to a hash table
+ void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading
+ void *mt; // only used for multi-threading
+} BGZF;
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /******************
+ * Basic routines *
+ ******************/
+
+ /**
+ * Open an existing file descriptor for reading or writing.
+ *
+ * @param fd file descriptor
+ * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies
+ * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored.
+ * @return BGZF file handler; 0 on error
+ */
+ BGZF* bgzf_dopen(int fd, const char *mode);
+
+ #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
+
+ /**
+ * Open the specified file for reading or writing.
+ */
+ BGZF* bgzf_open(const char* path, const char *mode);
+
+ /**
+ * Close the BGZF and free all associated resources.
+ *
+ * @param fp BGZF file handler
+ * @return 0 on success and -1 on error
+ */
+ int bgzf_close(BGZF *fp);
+
+ /**
+ * Read up to _length_ bytes from the file storing into _data_.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to read into
+ * @param length size of data to read
+ * @return number of bytes actually read; 0 on end-of-file and -1 on error
+ */
+ ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length);
+
+ /**
+ * Write _length_ bytes from _data_ to the file.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to write
+ * @param length size of data to write
+ * @return number of bytes actually written; -1 on error
+ */
+ ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length);
+
+ /**
+ * Write the data in the buffer to the file.
+ */
+ int bgzf_flush(BGZF *fp);
+
+ /**
+ * Return a virtual file pointer to the current location in the file.
+ * No interpretation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ *
+ * The argument is parenthesized so that any pointer expression can be
+ * passed; it is evaluated twice, so it must not have side effects.
+ */
+ #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
+
+ /**
+ * Set the file to read from the location specified by _pos_.
+ *
+ * @param fp BGZF file handler
+ * @param pos virtual file offset returned by bgzf_tell()
+ * @param whence must be SEEK_SET
+ * @return 0 on success and -1 on error
+ */
+ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
+
+ /**
+ * Check if the BGZF end-of-file (EOF) marker is present
+ *
+ * @param fp BGZF file handler opened for reading
+ * @return 1 if EOF is present; 0 if not or on I/O error
+ */
+ int bgzf_check_EOF(BGZF *fp);
+
+ /**
+ * Check if a file is in the BGZF format
+ *
+ * @param fn file name
+ * @return 1 if _fn_ is BGZF; 0 if not or on I/O error
+ */
+ int bgzf_is_bgzf(const char *fn);
+
+ /*********************
+ * Advanced routines *
+ *********************/
+
+ /**
+ * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+ *
+ * @param fp BGZF file handler
+ * @param size size of cache in bytes; 0 to disable caching (default)
+ */
+ void bgzf_set_cache_size(BGZF *fp, int size);
+
+ /**
+ * Flush the file if the remaining buffer size is smaller than _size_
+ */
+ int bgzf_flush_try(BGZF *fp, ssize_t size);
+
+ /**
+ * Read one byte from a BGZF file. It is faster than bgzf_read()
+ * @param fp BGZF file handler
+ * @return byte read; -1 on end-of-file and -2 on error
+ */
+ int bgzf_getc(BGZF *fp);
+
+ /**
+ * Read one line from a BGZF file. It is faster than bgzf_getc()
+ *
+ * @param fp BGZF file handler
+ * @param delim delimiter
+ * @param str string to write to; must be initialized
+ * @return length of the string (0 for an empty line); -1 on end-of-file and -2 on error when nothing was read
+ */
+ int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
+
+ /**
+ * Read the next BGZF block.
+ */
+ int bgzf_read_block(BGZF *fp);
+
+ /**
+ * Enable multi-threading (only effective on writing)
+ *
+ * @param fp BGZF file handler; must be opened for writing
+ * @param n_threads #threads used for writing
+ * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
+ */
+ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/bgzip.c b/samtools-0.1.19/bgzip.c
new file mode 100644
index 0000000..ebcafa2
--- /dev/null
+++ b/samtools-0.1.19/bgzip.c
@@ -0,0 +1,206 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+static const int WINDOW_SIZE = 64 * 1024;
+
+static int bgzip_main_usage()
+{
+    // Print the command-line help to stderr; always returns 1 so callers can return it directly.
+    static const char *const text[] = {
+        "\n",
+        "Usage: bgzip [options] [file] ...\n\n",
+        "Options: -c write on standard output, keep original files unchanged\n",
+        " -d decompress\n",
+        " -f overwrite files without asking\n",
+        " -b INT decompress at virtual file pointer INT\n",
+        " -s INT decompress INT bytes in the uncompressed file\n",
+        " -h give this help\n",
+        "\n"
+    };
+    size_t k;
+    for (k = 0; k < sizeof(text) / sizeof(text[0]); ++k)
+        fputs(text[k], stderr);
+    return 1;
+}
+
+// Open fn for writing and return its file descriptor. Unless is_forced is set,
+// an existing file is only overwritten after the user confirms on stdin.
+// Exits the process when the user declines or the file cannot be opened.
+static int write_open(const char *fn, int is_forced)
+{
+    int fd = -1;
+    char c;
+    if (!is_forced) {
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
+            fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
+            // end of input or a read error counts as "no" instead of reading an uninitialised char
+            if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+                fprintf(stderr, "[bgzip] not overwritten\n");
+                exit(1);
+            }
+        }
+    }
+    if (fd < 0) {
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
+            fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
+            exit(1);
+        }
+    }
+    return fd;
+}
+
+// Report a BGZF failure and terminate the program.
+static void fail(BGZF* fp)
+{
+    // BGZF carries no error string, only the errcode bit mask of BGZF_ERR_* flags
+    fprintf(stderr, "Error: BGZF error code %d\n", fp->errcode);
+    exit(1);
+}
+
+// bgzip entry point: compress (default) or decompress (-d) a file or stdin.
+// The input file is removed only after the output was written completely.
+int main(int argc, char **argv)
+{
+    int c, compress, pstdout, is_forced;
+    BGZF *fp;
+    void *buffer;
+    long start, end, size;
+
+    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+    while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){
+        switch(c){
+        case 'h': return bgzip_main_usage();
+        case 'd': compress = 0; break;
+        case 'c': pstdout = 1; break;
+        case 'b': start = atol(optarg); break;
+        case 's': size = atol(optarg); break;
+        case 'f': is_forced = 1; break;
+        }
+    }
+    if (size >= 0) end = start + size;
+    if (end >= 0 && end < start) {
+        fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
+        return 1;
+    }
+    if (compress == 1) {
+        struct stat sbuf;
+        int f_src = fileno(stdin);
+        int f_dst = fileno(stdout);
+
+        if ( argc>optind )
+        {
+            if ( stat(argv[optind],&sbuf)<0 )
+            {
+                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                return 1;
+            }
+
+            if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
+                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                return 1;
+            }
+
+            if (pstdout)
+                f_dst = fileno(stdout);
+            else
+            {
+                char *name = malloc(strlen(argv[optind]) + 5);
+                if (name == NULL) {
+                    fprintf(stderr, "[bgzip] Out of memory\n");
+                    return 1;
+                }
+                strcpy(name, argv[optind]);
+                strcat(name, ".gz");
+                f_dst = write_open(name, is_forced);
+                free(name);
+                if (f_dst < 0) return 1;
+            }
+        }
+        else if (!pstdout && isatty(fileno((FILE *)stdout)) )
+            return bgzip_main_usage();
+
+        fp = bgzf_fdopen(f_dst, "w");
+        if (fp == NULL) {
+            fprintf(stderr, "[bgzip] Could not open the output stream\n");
+            return 1;
+        }
+        buffer = malloc(WINDOW_SIZE);
+        if (buffer == NULL) {
+            fprintf(stderr, "[bgzip] Out of memory\n");
+            return 1;
+        }
+        // bgzf_write() reports a failed flush through a short count, so compare against c
+        while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
+            if (bgzf_write(fp, buffer, c) != c) fail(fp);
+        if (c < 0) {
+            // the input was not read completely: keep the original file
+            fprintf(stderr, "[bgzip] Read error: %s\n", strerror(errno));
+            return 1;
+        }
+        // f_dst will be closed here
+        if (bgzf_close(fp) < 0) fail(fp);
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        free(buffer);
+        close(f_src);
+        return 0;
+    } else {
+        struct stat sbuf;
+        int f_dst;
+
+        if ( argc>optind )
+        {
+            char *name;
+            size_t len;
+            if ( stat(argv[optind],&sbuf)<0 )
+            {
+                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+                return 1;
+            }
+            len = strlen(argv[optind]);
+            // names shorter than the suffix cannot end in ".gz"; do not index before the string
+            if ( len < 3 || strcmp(argv[optind]+len-3,".gz") )
+            {
+                fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
+                return 1;
+            }
+            fp = bgzf_open(argv[optind], "r");
+            if (fp == NULL) {
+                fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
+                return 1;
+            }
+
+            if (pstdout) {
+                f_dst = fileno(stdout);
+            }
+            else {
+                name = strdup(argv[optind]);
+                if (name == NULL) {
+                    fprintf(stderr, "[bgzip] Out of memory\n");
+                    return 1;
+                }
+                name[len - 3] = '\0';
+                f_dst = write_open(name, is_forced);
+                free(name);
+            }
+        }
+        else if (!pstdout && isatty(fileno((FILE *)stdin)) )
+            return bgzip_main_usage();
+        else
+        {
+            f_dst = fileno(stdout);
+            fp = bgzf_fdopen(fileno(stdin), "r");
+            if (fp == NULL) {
+                fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
+                return 1;
+            }
+        }
+        buffer = malloc(WINDOW_SIZE);
+        if (buffer == NULL) {
+            fprintf(stderr, "[bgzip] Out of memory\n");
+            return 1;
+        }
+        if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
+        while (1) {
+            char *out;
+            int left;
+            if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
+            else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+            if (c == 0) break;
+            if (c < 0) fail(fp);
+            start += c;
+            // write() may accept fewer bytes than requested; loop until all are written
+            out = buffer;
+            left = c;
+            while (left > 0) {
+                ssize_t w = write(f_dst, out, left);
+                if (w < 0 && errno == EINTR) continue;
+                if (w <= 0) {
+                    // the output is incomplete: keep the original file
+                    fprintf(stderr, "[bgzip] Write error: %s\n", strerror(errno));
+                    return 1;
+                }
+                out += w;
+                left -= w;
+            }
+            if (end >= 0 && start >= end) break;
+        }
+        free(buffer);
+        if (bgzf_close(fp) < 0) fail(fp);
+        // argv[optind] is NULL when the input came from stdin
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        return 0;
+    }
+}
diff --git a/samtools-0.1.19/cut_target.c b/samtools-0.1.19/cut_target.c
new file mode 100644
index 0000000..26f434f
--- /dev/null
+++ b/samtools-0.1.19/cut_target.c
@@ -0,0 +1,193 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+#include "errmod.h"
+#include "faidx.h"
+
+#define ERR_DEP 0.83f
+
+// Scoring parameters used by process_cns():
+// e[state][c] is the score of observing class c (0, 1 or 2) in a state,
+// p[prev][curr] is added when moving from state prev to state curr.
+typedef struct {
+ int e[2][3], p[2][2];
+} score_param_t;
+
+/* Note that although the two matrices have 10 parameters in total, only 4
+ * (probably 3) are free. Changing the scoring matrices in a sort of symmetric
+ * way will not change the result. */
+static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
+
+// Working state shared by main_cut_target(), read_aln() and gencns().
+typedef struct {
+ int min_baseQ, tid, max_bases; // min_baseQ: bases below this quality are skipped; tid: target whose sequence is held in ref; max_bases: capacity of bases
+ uint16_t *bases; // scratch array of packed bases handed to errmod_cal()
+ bamFile fp; // input alignment file
+ bam_header_t *h;
+ char *ref; // sequence of target tid, loaded through fai
+ faidx_t *fai; // NULL unless a reference was loaded with -f
+ errmod_t *em;
+} ct_t;
+
+static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
+{
+    // Summarise the n pileup entries of one position into 16 bits:
+    // bits 10-15 a capped score difference between the two best bases, bits 8-9 the
+    // best base, bits 0-7 the number of bases used (capped at 255). Returns 0 when no base is usable.
+    int idx, pos, swap, packed, gap, n_used = 0, rank[4];
+    float lk[16];
+    if (n > g->max_bases) { // enlarge g->bases
+        g->max_bases = n;
+        kroundup32(g->max_bases);
+        g->bases = realloc(g->bases, g->max_bases * 2);
+    }
+    for (idx = 0; idx < n; ++idx) {
+        const bam_pileup1_t *p = &plp[idx];
+        uint8_t *seq;
+        int capped, baseQ, base;
+        if (p->is_refskip || p->is_del) continue;
+        baseQ = bam1_qual(p->b)[p->qpos];
+        if (baseQ < g->min_baseQ) continue;
+        seq = bam1_seq(p->b);
+        base = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+        if (base > 3) continue;
+        // use the smaller of base quality and core.qual, clamped to [4, 63]
+        capped = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
+        if (capped < 4) capped = 4;
+        else if (capped > 63) capped = 63;
+        g->bases[n_used++] = capped<<5 | bam1_strand(p->b)<<4 | base;
+    }
+    if (n_used == 0) return 0;
+    errmod_cal(g->em, n_used, 4, g->bases, lk);
+    // keep the base index in the two low bits so it survives the sort
+    for (idx = 0; idx < 4; ++idx)
+        rank[idx] = (int)(lk[idx<<2|idx] + .499) << 2 | idx;
+    // insertion sort
+    for (idx = 1; idx < 4; ++idx) {
+        for (pos = idx; pos > 0 && rank[pos] < rank[pos-1]; --pos) {
+            swap = rank[pos];
+            rank[pos] = rank[pos-1];
+            rank[pos-1] = swap;
+        }
+    }
+    gap = (rank[1]>>2) - (rank[0]>>2);
+    if (n_used >= 256) n_used = 255;
+    packed = (gap < 63? gap : 63) << 2 | (rank[0]&3);
+    return packed<<8|n_used;
+}
+
+// Segment the l per-position values in cns into two states with a dynamic program
+// scored by g_param, then print one tab-separated record to stdout for every run of
+// positions assigned the second state, named after h->target_name[tid].
+static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
+{
+ int i, f[2][2], *prev, *curr, *swap_tmp, s;
+ uint8_t *b; // backtrack array
+ b = calloc(l, 1);
+ f[0][0] = f[0][1] = 0;
+ prev = f[0]; curr = f[1];
+ // fill the backtrack matrix
+ for (i = 0; i < l; ++i) {
+ // observation class: 0 when cns[i] is zero, 1 when only its low byte is set, 2 otherwise
+ int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2;
+ int tmp0, tmp1;
+ // compute f[0]
+ tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0)
+ tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1)
+ // bit 0 of b[i] records the best predecessor of state 0
+ if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0;
+ else curr[0] = tmp1, b[i] = 1;
+ // compute f[1]
+ tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0)
+ tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1)
+ // bit 1 of b[i] records the best predecessor of state 1
+ if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1;
+ else curr[1] = tmp1, b[i] |= 1<<1;
+ // swap
+ swap_tmp = prev; prev = curr; curr = swap_tmp;
+ }
+ // backtrack
+ // bit 2 of b[i] receives the chosen state; s then becomes its recorded predecessor
+ // NOTE(review): the loop stops before i == 0, so b[0] never gets its state bit
+ s = prev[0] > prev[1]? 0 : 1;
+ for (i = l - 1; i > 0; --i) {
+ b[i] |= s<<2;
+ s = b[i]>>s&1;
+ }
+ // print
+ // s is the start of the open run, or -1 when no run is open
+ for (i = 0, s = -1; i <= l; ++i) {
+ if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) {
+ if (s >= 0) {
+ int j;
+ printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s);
+ for (j = s; j < i; ++j) {
+ int c = cns[j]>>8;
+ if (c == 0) putchar('N');
+ else putchar("ACGT"[c&3]);
+ }
+ putchar('\t');
+ for (j = s; j < i; ++j)
+ putchar(33 + (cns[j]>>8>>2));
+ putchar('\n');
+ }
+ //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
+ s = -1;
+ } else if ((b[i]>>2&3) && s < 0) s = i;
+ }
+ free(b);
+}
+
+static int read_aln(void *data, bam1_t *b)
+{
+    // Read the next alignment into b and return the result of bam_read1().
+    // When a reference index is loaded, records with a target id and without flag
+    // bit 4 are additionally passed through bam_prob_realn_core().
+    extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
+    ct_t *g = (ct_t*)data;
+    int len;
+    int ret = bam_read1(g->fp, b);
+    if (ret < 0 || !g->fai) return ret;
+    if (b->core.tid < 0 || (b->core.flag&4) != 0) return ret;
+    if (b->core.tid != g->tid) { // then load the sequence
+        free(g->ref);
+        g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+        g->tid = b->core.tid;
+    }
+    bam_prob_realn_core(b, g->ref, 1<<1|1);
+    return ret;
+}
+
+// Entry point of "samtools targetcut": build a per-position summary of the pileup
+// for every target and print the regions selected by process_cns().
+// Returns 0 on success and 1 on usage, input or memory errors.
+int main_cut_target(int argc, char *argv[])
+{
+    int c, tid, pos, n, lasttid = -1, l, max_l;
+    const bam_pileup1_t *p;
+    bam_plp_t plp;
+    uint16_t *cns;
+    ct_t g;
+
+    memset(&g, 0, sizeof(ct_t));
+    g.min_baseQ = 13; g.tid = -1;
+    while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) {
+        switch (c) {
+            case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
+            case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
+            case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
+            case '1': g_param.e[1][1] = atoi(optarg); break;
+            case '2': g_param.e[1][2] = atoi(optarg); break;
+            case 'f': g.fai = fai_load(optarg);
+                if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__);
+                break;
+        }
+    }
+    if (argc == optind) {
+        fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n");
+        return 1;
+    }
+    l = max_l = 0; cns = 0;
+    g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+    if (g.fp == 0) { // do not dereference a failed open
+        fprintf(stderr, "[%s] fail to open the input file.\n", __func__);
+        if (g.fai) fai_destroy(g.fai);
+        return 1;
+    }
+    g.h = bam_header_read(g.fp);
+    if (g.h == 0) {
+        fprintf(stderr, "[%s] fail to read the header.\n", __func__);
+        bam_close(g.fp);
+        if (g.fai) fai_destroy(g.fai);
+        return 1;
+    }
+    g.em = errmod_init(1 - ERR_DEP);
+    plp = bam_plp_init(read_aln, &g);
+    while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
+        if (tid < 0) break;
+        if (tid != lasttid) { // change of chromosome
+            if (cns) process_cns(g.h, lasttid, l, cns);
+            if (max_l < g.h->target_len[tid]) {
+                uint16_t *grown;
+                max_l = g.h->target_len[tid];
+                kroundup32(max_l);
+                grown = realloc(cns, max_l * sizeof(uint16_t));
+                if (grown == 0) {
+                    fprintf(stderr, "[%s] out of memory.\n", __func__);
+                    free(cns);
+                    return 1;
+                }
+                cns = grown;
+            }
+            l = g.h->target_len[tid];
+            memset(cns, 0, max_l * sizeof(uint16_t));
+            lasttid = tid;
+        }
+        cns[pos] = gencns(&g, n, p);
+    }
+    // flush the last target (a no-op when nothing was read, since l is then 0)
+    process_cns(g.h, lasttid, l, cns);
+    free(cns);
+    bam_header_destroy(g.h);
+    bam_plp_destroy(plp);
+    bam_close(g.fp);
+    if (g.fai) {
+        fai_destroy(g.fai); free(g.ref);
+    }
+    errmod_destroy(g.em);
+    free(g.bases);
+    return 0;
+}
diff --git a/samtools-0.1.19/errmod.c b/samtools-0.1.19/errmod.c
new file mode 100644
index 0000000..fba9a8d
--- /dev/null
+++ b/samtools-0.1.19/errmod.c
@@ -0,0 +1,130 @@
+#include <math.h>
+#include "errmod.h"
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint16_t)
+
+/*
+ * Lookup tables filled in by cal_coef():
+ *   fk   - 256 entries; fk[n] = (1-depcorr)^n * (1-eta) + eta, fk[0] = 1
+ *   beta - 256*256*64 entries, indexed as q<<16 | n<<8 | k
+ *   lhet - 256*256 entries, indexed as n<<8 | k
+ */
+typedef struct __errmod_coef_t {
+ double *fk, *beta, *lhet;
+} errmod_coef_t;
+
+/*
+ * Per-call accumulators used by errmod_cal(); each array is indexed by the
+ * low 4 bits of a base code.
+ *   fsum - sum of the fk weights of the bases seen
+ *   bsum - sum of fk weight * beta coefficient of the bases seen
+ *   c    - number of bases seen
+ */
+typedef struct {
+ double fsum[16], bsum[16];
+ uint32_t c[16];
+} call_aux_t;
+
+/*
+ * Build the lookup tables consumed by errmod_cal().
+ *
+ * depcorr, eta: parameters of the weight table, fk[n] = (1-depcorr)^n * (1-eta) + eta.
+ *
+ * Returns a newly allocated errmod_coef_t holding three calloc'ed tables
+ * (released by errmod_destroy()). None of the allocations is checked.
+ */
+static errmod_coef_t *cal_coef(double depcorr, double eta)
+{
+ int k, n, q;
+ long double sum, sum1;
+ double *lC;
+ errmod_coef_t *ec;
+
+ ec = calloc(1, sizeof(errmod_coef_t));
+ // initialize ->fk
+ ec->fk = (double*)calloc(256, sizeof(double));
+ ec->fk[0] = 1.0;
+ for (n = 1; n != 256; ++n)
+ ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
+ // initialize ->beta
+ ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
+ // lC[n<<8|k] = log of the binomial coefficient C(n,k); the k == 0 and n == 0
+ // entries are never written and keep the 0 that calloc() left there
+ lC = (double*)calloc(256 * 256, sizeof(double));
+ for (n = 1; n != 256; ++n) {
+ double lgn = lgamma(n+1);
+ for (k = 1; k <= n; ++k)
+ lC[n<<8|k] = lgn - lgamma(k+1) - lgamma(n-k+1);
+ }
+ // q is used as a phred-scaled quality: e = 10^(-q/10); the q == 0 slice of beta stays zero
+ for (q = 1; q != 64; ++q) {
+ double e = pow(10.0, -q/10.0);
+ double le = log(e);
+ double le1 = log(1.0 - e);
+ for (n = 1; n <= 255; ++n) {
+ double *beta = ec->beta + (q<<16|n<<8);
+ sum1 = sum = 0.0;
+ // walk k downwards: sum is the running total of the terms C(n,k) e^k (1-e)^(n-k)
+ // including the current k, sum1 the total before it (updated in the loop increment);
+ // at k == n sum1 is 0, so the stored value comes from logl(0)
+ for (k = n; k >= 0; --k, sum1 = sum) {
+ sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
+ beta[k] = -10. / M_LN10 * logl(sum1 / sum);
+ }
+ }
+ }
+ // initialize ->lhet
+ // lhet[n<<8|k] = lC[n<<8|k] - n*ln(2); entries with k > n use lC == 0
+ ec->lhet = (double*)calloc(256 * 256, sizeof(double));
+ for (n = 0; n < 256; ++n)
+ for (k = 0; k < 256; ++k)
+ ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
+ free(lC);
+ return ec;
+}
+
+/*
+ * Allocate an error model for the given dependency coefficient.
+ * The coefficient tables are built by cal_coef() with eta fixed at 0.03.
+ * The result is released with errmod_destroy().
+ */
+errmod_t *errmod_init(float depcorr)
+{
+    errmod_t *model = (errmod_t*)calloc(1, sizeof(errmod_t));
+
+    model->coef = cal_coef(depcorr, 0.03);
+    model->depcorr = depcorr;
+    return model;
+}
+
+/*
+ * Release an error model created by errmod_init(): the three coefficient
+ * tables, the table holder and the model itself. A null pointer is ignored.
+ */
+void errmod_destroy(errmod_t *em)
+{
+    if (em != 0) {
+        errmod_coef_t *tables = em->coef;
+
+        free(tables->fk);
+        free(tables->beta);
+        free(tables->lhet);
+        free(tables);
+        free(em);
+    }
+}
+/*
+ * Compute an m*m matrix of phred-scaled genotype likelihoods.
+ *
+ * em:    model created by errmod_init()
+ * n:     number of entries in bases[]; when n > 255 the array is shuffled
+ *        and only the first 255 entries are used
+ * m:     number of base codes considered; must not exceed 16, the size of
+ *        the per-base accumulators
+ * bases: each entry packs qual:6, strand:1, base:4 (quality in the bits
+ *        above bit 4); the array is reordered in place
+ * q:     output, m*m floats; q[j*m+j] is the homozygous value for base j,
+ *        q[j*m+k] == q[k*m+j] the heterozygous value for bases j and k
+ *
+ * Returns 0 on success, -1 when m is too large.
+ */
+int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
+{
+    call_aux_t aux;
+    int i, j, k, w[32];
+
+    if (m > 16) return -1; // aux.fsum/bsum/c only have 16 slots
+    memset(q, 0, m * m * sizeof(float));
+    if (n == 0) return 0;
+    // calculate aux.bsum and aux.fsum
+    if (n > 255) { // then sample 255 bases
+        ks_shuffle(uint16_t, n, bases);
+        n = 255;
+    }
+    ks_introsort(uint16_t, n, bases);
+    memset(w, 0, 32 * sizeof(int));
+    memset(&aux, 0, sizeof(call_aux_t));
+    for (j = n - 1; j >= 0; --j) { // walk the sorted array from its largest entry down
+        uint16_t b = bases[j];
+        int qual = b>>5 < 4? 4 : b>>5; // clamp the quality to [4,63]
+        if (qual > 63) qual = 63;
+        k = b&0x1f; // strand+base; w[] counts per strand+base, aux.* per base only
+        aux.fsum[k&0xf] += em->coef->fk[w[k]];
+        aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[qual<<16|n<<8|aux.c[k&0xf]];
+        ++aux.c[k&0xf];
+        ++w[k];
+    }
+    // generate likelihood
+    for (j = 0; j != m; ++j) {
+        float tmp1;
+        int tmp2;
+        // homozygous: sum the contributions of every base other than j
+        for (k = 0, tmp1 = 0.0, tmp2 = 0; k != m; ++k) {
+            if (k == j) continue;
+            tmp1 += aux.bsum[k]; tmp2 += aux.c[k];
+        }
+        if (tmp2) q[j*m+j] = tmp1;
+        // heterozygous: sum the contributions of every base other than j and k
+        for (k = j + 1; k < m; ++k) {
+            int cjk = aux.c[j] + aux.c[k];
+            for (i = 0, tmp2 = 0, tmp1 = 0.0; i < m; ++i) {
+                if (i == j || i == k) continue;
+                tmp1 += aux.bsum[i]; tmp2 += aux.c[i];
+            }
+            if (tmp2) {
+                q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
+            } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
+        }
+        for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
+    }
+    return 0;
+}
diff --git a/samtools-0.1.19/errmod.h b/samtools-0.1.19/errmod.h
new file mode 100644
index 0000000..32c07b6
--- /dev/null
+++ b/samtools-0.1.19/errmod.h
@@ -0,0 +1,24 @@
+#ifndef ERRMOD_H
+#define ERRMOD_H
+
+#include <stdint.h>
+
+/* Opaque here; the table layout is private to errmod.c. */
+struct __errmod_coef_t;
+
+/* Error model handle: the dependency coefficient passed to errmod_init() and the tables derived from it. */
+typedef struct {
+ double depcorr;
+ struct __errmod_coef_t *coef;
+} errmod_t;
+
+/* Allocate a model for the given dependency coefficient; release it with errmod_destroy(). */
+errmod_t *errmod_init(float depcorr);
+/* Free a model returned by errmod_init(); a null pointer is ignored. */
+void errmod_destroy(errmod_t *em);
+
+/*
+ n: number of bases
+ m: number of base codes considered; q must have room for m*m floats
+ bases[i]: qual:6, strand:1, base:4 (the array is reordered in place)
+ q[i*m+j]: phred-scaled likelihood of (i,j)
+ Returns 0 on success and -1 when m is rejected.
+ */
+int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
+
+#endif
diff --git a/samtools-0.1.19/examples/00README.txt b/samtools-0.1.19/examples/00README.txt
new file mode 100644
index 0000000..dbb276f
--- /dev/null
+++ b/samtools-0.1.19/examples/00README.txt
@@ -0,0 +1,23 @@
+File ex1.fa contains two sequences cut from the human genome
+build36. They were extracted with the command:
+
+ samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550
+
+Sequence names were changed manually for simplicity. File ex1.sam.gz
+contains MAQ alignments extracted with:
+
+ (samtools view NA18507_maq.bam 2:2044001-2045500;
+ samtools view NA18507_maq.bam 20:68001-69500)
+
+and processed with `samtools fixmate' to make it self-consistent as a
+standalone alignment.
+
+To try samtools, you may run the following commands:
+
+ samtools faidx ex1.fa # index the reference FASTA
+ samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM
+ samtools index ex1.bam # index BAM
+ samtools tview ex1.bam ex1.fa # view alignment
+ samtools pileup -cf ex1.fa ex1.bam # pileup and consensus
+ samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz
+
diff --git a/samtools-0.1.19/examples/Makefile b/samtools-0.1.19/examples/Makefile
new file mode 100644
index 0000000..309399f
--- /dev/null
+++ b/samtools-0.1.19/examples/Makefile
@@ -0,0 +1,50 @@
+all:../libbam.a ../samtools ../bcftools/bcftools \
+ ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz ex1.bcf calDepth
+ @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo;
+
+ex1.fa.fai:ex1.fa
+ ../samtools faidx ex1.fa
+ex1.bam:ex1.sam.gz ex1.fa.fai
+ ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam
+ex1.bam.bai:ex1.bam
+ ../samtools index ex1.bam
+ex1.pileup.gz:ex1.bam ex1.fa
+ ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz
+ex1.glf:ex1.bam ex1.fa
+ ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf
+ex1.glfview.gz:ex1.glf
+ ../samtools glfview ex1.glf | gzip > ex1.glfview.gz
+ex1a.bam:ex1.bam
+ ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"a";print}}' | ../samtools view -bS - > $@
+ex1b.bam:ex1.bam
+ ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"b";print}}' | ../samtools view -bS - > $@
+ex1f.rg:
+ (echo "@RG ID:ex1 LB:ex1 SM:ex1"; echo "@RG ID:ex1a LB:ex1 SM:ex1"; echo "@RG ID:ex1b LB:ex1b SM:ex1b") > $@
+ex1f.bam:ex1.bam ex1a.bam ex1b.bam ex1f.rg
+ ../samtools merge -rh ex1f.rg $@ ex1.bam ex1a.bam ex1b.bam
+ex1f-rmduppe.bam:ex1f.bam
+ ../samtools rmdup ex1f.bam $@
+ex1f-rmdupse.bam:ex1f.bam
+ ../samtools rmdup -S ex1f.bam $@
+
+ex1.bcf:ex1.bam ex1.fa.fai
+ ../samtools mpileup -gf ex1.fa ex1.bam > $@
+
+../bcftools/bcftools:
+ (cd ../bcftools; make bcftools)
+
+../samtools:
+ (cd ..; make samtools)
+
+../libbam.a:
+ (cd ..; make libbam.a)
+
+calDepth:../libbam.a calDepth.c
+ gcc -g -Wall -O2 -I.. calDepth.c -o $@ -L.. -lbam -lm -lz
+
+clean:
+ rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf
+
+# ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"'
+
+# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"'
diff --git a/samtools-0.1.19/examples/bam2bed.c b/samtools-0.1.19/examples/bam2bed.c
new file mode 100644
index 0000000..bb937d1
--- /dev/null
+++ b/samtools-0.1.19/examples/bam2bed.c
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include "sam.h"
+/*
+ * Print one alignment as a tab-separated line: reference name, start,
+ * end, read name, mapping quality and strand. data is the samfile_t the
+ * record came from. Records without a reference (tid < 0) are skipped.
+ * Always returns 0.
+ */
+static int fetch_func(const bam1_t *b, void *data)
+{
+    samfile_t *sam = (samfile_t*)data;
+    const bam1_core_t *core = &b->core;
+    uint32_t *ops = bam1_cigar(b);
+    int k, span = 0;
+
+    if (core->tid < 0) return 0;
+    // add up the lengths of the match, deletion and reference-skip operations
+    for (k = 0; k < core->n_cigar; ++k) {
+        switch (ops[k]&0xf) {
+        case BAM_CMATCH:
+        case BAM_CDEL:
+        case BAM_CREF_SKIP:
+            span += ops[k]>>4;
+            break;
+        default:
+            break;
+        }
+    }
+    printf("%s\t%d\t%d\t%s\t%d\t%c\n", sam->header->target_name[core->tid],
+           core->pos, core->pos + span, bam1_qname(b), core->qual, (core->flag&BAM_FREVERSE)? '-' : '+');
+    return 0;
+}
+/*
+ * bam2bed <in.bam> [region]
+ *
+ * Without a region every record of the file is printed through
+ * fetch_func(); with a region the BAM index is loaded and only the
+ * overlapping records are printed. Returns 0 on success, 1 on a usage
+ * error, an unreadable file, a missing index or an invalid region.
+ */
+int main(int argc, char *argv[])
+{
+    samfile_t *fp;
+    int status = 0;
+    if (argc == 1) {
+        fprintf(stderr, "Usage: bam2bed <in.bam> [region]\n");
+        return 1;
+    }
+    if ((fp = samopen(argv[1], "rb", 0)) == 0) {
+        fprintf(stderr, "bam2bed: Fail to open BAM file %s\n", argv[1]);
+        return 1;
+    }
+    if (argc == 2) { /* if a region is not specified */
+        bam1_t *b = bam_init1();
+        while (samread(fp, b) >= 0) fetch_func(b, fp);
+        bam_destroy1(b);
+    } else {
+        int ref, beg, end;
+        bam_index_t *idx;
+        if ((idx = bam_index_load(argv[1])) == 0) {
+            fprintf(stderr, "bam2bed: BAM indexing file is not available.\n");
+            samclose(fp); /* do not leak the open file on the error path */
+            return 1;
+        }
+        bam_parse_region(fp->header, argv[2], &ref, &beg, &end);
+        if (ref < 0) {
+            fprintf(stderr, "bam2bed: Invalid region %s\n", argv[2]);
+            status = 1; /* fall through to the shared cleanup below */
+        } else {
+            bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func);
+        }
+        bam_index_destroy(idx);
+    }
+    samclose(fp);
+    return status;
+}
diff --git a/samtools-0.1.19/examples/calDepth.c b/samtools-0.1.19/examples/calDepth.c
new file mode 100644
index 0000000..7a3239c
--- /dev/null
+++ b/samtools-0.1.19/examples/calDepth.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include "sam.h"
+
+// State shared with pileup_func(): positions with beg <= pos < end are
+// reported, and `in` supplies the header used to print reference names.
+typedef struct {
+ int beg, end;
+ samfile_t *in;
+} tmpstruct_t;
+
+// bam_fetch() callback: forward each fetched alignment to the pileup
+// buffer passed in through data. Always returns 0.
+static int fetch_func(const bam1_t *b, void *data)
+{
+    bam_plbuf_push(b, (bam_plbuf_t*)data);
+    return 0;
+}
+// bam_plbuf_init() callback: print "name<TAB>1-based position<TAB>depth"
+// for every position inside the [beg, end) window stored in data.
+// Always returns 0.
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+    tmpstruct_t *win = (tmpstruct_t*)data;
+    int at = (int)pos;
+
+    if (at < win->beg || at >= win->end) return 0; // outside the requested window
+    printf("%s\t%d\t%d\n", win->in->header->target_name[tid], pos + 1, n);
+    return 0;
+}
+
+/*
+ * calDepth <in.bam> [region]
+ *
+ * Prints the depth at every covered position, either over the whole file
+ * or, when a region is given, over the records fetched through the BAM
+ * index. Returns 0 on success, 1 on a usage error, an unreadable file, a
+ * missing index or an invalid region.
+ */
+int main(int argc, char *argv[])
+{
+    tmpstruct_t tmp;
+    if (argc == 1) {
+        fprintf(stderr, "Usage: calDepth <in.bam> [region]\n");
+        return 1;
+    }
+    tmp.beg = 0; tmp.end = 0x7fffffff;
+    tmp.in = samopen(argv[1], "rb", 0);
+    if (tmp.in == 0) {
+        fprintf(stderr, "Fail to open BAM file %s\n", argv[1]);
+        return 1;
+    }
+    if (argc == 2) { // if a region is not specified
+        sampileup(tmp.in, -1, pileup_func, &tmp);
+    } else {
+        int ref;
+        bam_index_t *idx;
+        bam_plbuf_t *buf;
+        idx = bam_index_load(argv[1]); // load BAM index
+        if (idx == 0) {
+            fprintf(stderr, "BAM indexing file is not available.\n");
+            samclose(tmp.in); // do not leak the open file on the error path
+            return 1;
+        }
+        bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region
+        if (ref < 0) {
+            fprintf(stderr, "Invalid region %s\n", argv[2]);
+            bam_index_destroy(idx);
+            samclose(tmp.in);
+            return 1;
+        }
+        buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup
+        bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func);
+        bam_plbuf_push(0, buf); // finalize pileup
+        bam_index_destroy(idx);
+        bam_plbuf_destroy(buf);
+    }
+    samclose(tmp.in);
+    return 0;
+}
diff --git a/samtools-0.1.19/examples/chk_indel.c b/samtools-0.1.19/examples/chk_indel.c
new file mode 100644
index 0000000..aaa77e0
--- /dev/null
+++ b/samtools-0.1.19/examples/chk_indel.c
@@ -0,0 +1,83 @@
+/* To compile, copy this file to the samtools source code directory and compile with:
+ gcc -g -O2 -Wall chk_indel_rg.c -o chk_indel_rg -Wall -I. -L. -lbam -lz
+*/
+
+#include <string.h>
+#include "bam.h"
+
+// Per-read-group counters; main() picks the slot as
+// (homopolymer run > L_THRES ? 2 : 0) | (deletion ? 1 : 0).
+typedef struct {
+ long cnt[4]; // short:ins, short:del, long:ins, long:del
+} rgcnt_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(rgcnt, rgcnt_t)
+
+#define MAX_LEN 127
+#define Q_THRES 10
+#define L_THRES 6 // short: <=L_THRES; otherwise long
+
+/*
+ * chk_indel_rg <in.bam>
+ *
+ * For every read with at least three CIGAR operations, mapping quality
+ * >= Q_THRES and an RG tag, counts its 1bp insertions and deletions per
+ * read group, split by whether the homopolymer run following the indel
+ * position is short (<= L_THRES) or long. Prints one line per read group:
+ * file name, RG, and the four counters.
+ *
+ * Returns 0 on success, 1 on a usage error or when the file cannot be opened.
+ */
+int main(int argc, char *argv[])
+{
+    bamFile fp;
+    bam1_t *b;
+    int i, x;
+    khash_t(rgcnt) *h;
+    khint_t k;
+
+    if (argc == 1) {
+        fprintf(stderr, "Usage: chk_indel_rg <in.bam>\n\n");
+        fprintf(stderr, "Output: filename, RG, #ins-in-short-homopolymer, #del-in-short, #ins-in-long, #del-in-long\n");
+        return 1;
+    }
+
+    fp = bam_open(argv[1], "r");
+    if (fp == 0) { // the header read below would dereference a null handle
+        fprintf(stderr, "chk_indel_rg: fail to open BAM file %s\n", argv[1]);
+        return 1;
+    }
+    h = kh_init(rgcnt);
+    bam_header_destroy(bam_header_read(fp)); // we do not need the header
+    b = bam_init1();
+
+    while (bam_read1(fp, b) >= 0) {
+        if (b->core.n_cigar >= 3 && b->core.qual >= Q_THRES) {
+            const uint8_t *seq;
+            const uint32_t *cigar = bam1_cigar(b);
+            char *rg;
+            for (i = 0; i < b->core.n_cigar; ++i) // check if there are 1bp indels
+                if (bam_cigar_oplen(cigar[i]) == 1 && (bam_cigar_op(cigar[i]) == BAM_CDEL || bam_cigar_op(cigar[i]) == BAM_CINS))
+                    break;
+            if (i == b->core.n_cigar) continue; // no 1bp ins or del
+            if ((rg = (char*)bam_aux_get(b, "RG")) == 0) continue; // no RG tag
+            seq = bam1_seq(b);
+            for (i = x = 0; i < b->core.n_cigar; ++i) { // x is the query offset at the start of operation i
+                int op = bam_cigar_op(cigar[i]);
+                if (bam_cigar_oplen(cigar[i]) == 1 && (op == BAM_CDEL || op == BAM_CINS)) {
+                    int c, j, hrun, which, absent;
+                    c = bam1_seqi(seq, x);
+                    for (j = x + 1, hrun = 0; j < b->core.l_qseq; ++j, ++hrun) // calculate the homopolymer run length
+                        if (bam1_seqi(seq, j) != c) break;
+                    k = kh_get(rgcnt, h, rg + 1); // rg + 1 skips the type byte in front of the tag value
+                    if (k == kh_end(h)) { // absent
+                        char *key = strdup(rg + 1); // freed when the table is printed
+                        k = kh_put(rgcnt, h, key, &absent);
+                        memset(&kh_val(h, k), 0, sizeof(rgcnt_t));
+                    }
+                    which = (hrun <= L_THRES? 0 : 1)<<1 | (op == BAM_CINS? 0 : 1);
+                    ++kh_val(h, k).cnt[which];
+                }
+                // advance by the whole operation, not by one base, when it consumes the query
+                if (bam_cigar_type(op)&1) x += bam_cigar_oplen(cigar[i]);
+            }
+        }
+    }
+
+    for (k = 0; k != kh_end(h); ++k) {
+        if (!kh_exist(h, k)) continue;
+        printf("%s\t%s", argv[1], kh_key(h, k));
+        for (i = 0; i < 4; ++i)
+            printf("\t%ld", kh_val(h, k).cnt[i]);
+        putchar('\n');
+        free((char*)kh_key(h, k));
+    }
+
+    bam_destroy1(b);
+    bam_close(fp);
+    kh_destroy(rgcnt, h);
+    return 0;
+}
diff --git a/samtools-0.1.19/examples/ex1.fa b/samtools-0.1.19/examples/ex1.fa
new file mode 100644
index 0000000..ef611b4
--- /dev/null
+++ b/samtools-0.1.19/examples/ex1.fa
@@ -0,0 +1,56 @@
+>seq1
+CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT
+GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC
+GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG
+TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC
+AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA
+CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC
+AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT
+CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA
+ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC
+AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC
+AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC
+ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC
+CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT
+TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT
+TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT
+GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT
+ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA
+ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG
+TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA
+CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG
+TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC
+TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC
+TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG
+TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG
+AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA
+TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC
+TCCCTCGTCTTCTTA
+>seq2
+TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG
+CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT
+TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT
+CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA
+AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT
+AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC
+ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG
+GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT
+CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT
+TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA
+AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA
+ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT
+TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA
+AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC
+TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA
+GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT
+AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA
+AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT
+AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT
+AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT
+ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT
+GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG
+CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA
+GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA
+AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA
+TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC
+CAGAAAAAAATATTTACAGTAACT
diff --git a/samtools-0.1.19/examples/ex1.sam.gz b/samtools-0.1.19/examples/ex1.sam.gz
new file mode 100644
index 0000000..44c07ee
Binary files /dev/null and b/samtools-0.1.19/examples/ex1.sam.gz differ
diff --git a/samtools-0.1.19/examples/toy.fa b/samtools-0.1.19/examples/toy.fa
new file mode 100644
index 0000000..afe990a
--- /dev/null
+++ b/samtools-0.1.19/examples/toy.fa
@@ -0,0 +1,4 @@
+>ref
+AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT
+>ref2
+aggttttataaaacaattaagtctacagagcaactacgcg
diff --git a/samtools-0.1.19/examples/toy.sam b/samtools-0.1.19/examples/toy.sam
new file mode 100644
index 0000000..33449b1
--- /dev/null
+++ b/samtools-0.1.19/examples/toy.sam
@@ -0,0 +1,14 @@
+@SQ SN:ref LN:45
+@SQ SN:ref2 LN:40
+r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112
+r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
+r003 0 ref 9 30 5H6M * 0 0 AGCTAA *
+r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
+r003 16 ref 29 30 6H5M * 0 0 TAGGC *
+r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
+x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
+x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
+x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
+x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
+x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
+x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
diff --git a/samtools-0.1.19/faidx.c b/samtools-0.1.19/faidx.c
new file mode 100644
index 0000000..51c82ac
--- /dev/null
+++ b/samtools-0.1.19/faidx.c
@@ -0,0 +1,437 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "faidx.h"
+#include "khash.h"
+
+/*
+ * Index record of one sequence, as filled in by fai_build_core():
+ *   line_len  - characters per sequence line, counting the line terminator
+ *   line_blen - printable (isgraph) characters per sequence line
+ *   len       - total number of printable characters in the sequence
+ *   offset    - file offset recorded right after the header line
+ */
+typedef struct {
+ int32_t line_len, line_blen;
+ int64_t len;
+ uint64_t offset;
+} faidx1_t;
+KHASH_MAP_INIT_STR(s, faidx1_t)
+
+#ifndef _NO_RAZF
+#include "razf.h"
+#else
+#ifdef _WIN32
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
+#define RAZF FILE
+#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
+#define razf_open(fn, mode) fopen(fn, mode)
+#define razf_close(fp) fclose(fp)
+#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define razf_tell(fp) ftello(fp)
+#endif
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+/*
+ * In-memory FASTA index.
+ *   rz   - handle of the FASTA file; set by fai_load(), closed by fai_destroy()
+ *   n, m - number of names stored / capacity of name[]
+ *   name - sequence names in insertion order; each is also the key stored in hash
+ *   hash - sequence name -> faidx1_t record
+ */
+struct __faidx_t {
+ RAZF *rz;
+ int n, m;
+ char **name;
+ khash_t(s) *hash;
+};
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Append one sequence record to the index.
+ *
+ * The name is duplicated; the copy is stored in idx->name[] and used as
+ * the hash key. idx->name[] doubles in capacity (starting at 16) when
+ * full. len is taken as int64_t so that the 64-bit length tracked by
+ * fai_build_core() is not truncated on the way into faidx1_t.len.
+ */
+static inline void fai_insert_index(faidx_t *idx, const char *name, int64_t len, int line_len, int line_blen, uint64_t offset)
+{
+    khint_t k;
+    int ret;
+    faidx1_t t;
+    if (idx->n == idx->m) {
+        idx->m = idx->m? idx->m<<1 : 16;
+        idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
+    }
+    idx->name[idx->n] = strdup(name);
+    k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
+    t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
+    kh_value(idx->hash, k) = t;
+    ++idx->n;
+}
+
+/*
+ * Scan a FASTA stream one byte at a time and build its index.
+ *
+ * state: 1 = just after a header line, 0 = inside a run of equally long
+ * lines, 2 = a line of different length was seen (only allowed as the
+ * last line of a sequence), 3 = another non-empty line followed it.
+ *
+ * Returns the new index, or 0 after printing a message when the last
+ * header has no sequence or the line lengths of a sequence are
+ * inconsistent. An input without any header yields an index with no
+ * entries; data found before the first header is not indexed.
+ */
+faidx_t *fai_build_core(RAZF *rz)
+{
+    char c, *name;
+    int l_name, m_name, ret;
+    int line_len, line_blen, state;
+    int l1, l2;
+    faidx_t *idx;
+    uint64_t offset;
+    int64_t len;
+
+    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
+    idx->hash = kh_init(s);
+    name = 0; l_name = m_name = 0;
+    len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
+    while (razf_read(rz, &c, 1)) {
+        if (c == '\n') { // an empty line
+            if (state == 1) {
+                offset = razf_tell(rz);
+                continue;
+            } else if ((state == 0 && len < 0) || state == 2) continue;
+        }
+        if (c == '>') { // fasta header
+            // name is still null when only header-less data has been read so far
+            if (len >= 0 && name)
+                fai_insert_index(idx, name, len, line_len, line_blen, offset);
+            l_name = 0;
+            while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace((unsigned char)c)) {
+                if (m_name < l_name + 2) {
+                    m_name = l_name + 2;
+                    kroundup32(m_name);
+                    name = (char*)realloc(name, m_name);
+                }
+                name[l_name++] = c;
+            }
+            if (name == 0) { // first header has an empty name: nothing was allocated yet
+                m_name = 2;
+                name = (char*)malloc(m_name);
+            }
+            name[l_name] = '\0';
+            if (ret == 0) {
+                fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); // skip the rest of the header line
+            state = 1; len = 0;
+            offset = razf_tell(rz);
+        } else {
+            if (state == 3) {
+                fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name? name : "");
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            if (state == 2) state = 3;
+            l1 = l2 = 0;
+            do { // l1 counts every character of the line, l2 only the printable ones
+                ++l1;
+                if (isgraph((unsigned char)c)) ++l2;
+            } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
+            if (state == 3 && l2) {
+                fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name? name : "");
+                free(name); fai_destroy(idx);
+                return 0;
+            }
+            ++l1; len += l2; // ++l1 accounts for the line terminator
+            if (state == 1) line_len = l1, line_blen = l2, state = 0;
+            else if (state == 0) {
+                if (l1 != line_len || l2 != line_blen) state = 2;
+            }
+        }
+    }
+    // flush the last sequence; there is none when no header was ever read
+    if (len >= 0 && name)
+        fai_insert_index(idx, name, len, line_len, line_blen, offset);
+    free(name);
+    return idx;
+}
+
+/*
+ * Write the index as text, one line per sequence in insertion order:
+ * name, length, offset, printable characters per line, characters per line.
+ */
+void fai_save(const faidx_t *fai, FILE *fp)
+{
+    int idx = 0;
+    while (idx < fai->n) {
+        const char *seq_name = fai->name[idx++];
+        khint_t it = kh_get(s, fai->hash, seq_name);
+        faidx1_t rec = kh_value(fai->hash, it);
+#ifdef _WIN32
+        fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", seq_name, (int)rec.len, (long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
+#else
+        fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", seq_name, (int)rec.len, (long long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
+#endif
+    }
+}
+
+/*
+ * Parse a text index (the format written by fai_save()) into a new
+ * faidx_t. The rz member is left null for the caller to fill in.
+ *
+ * Lines without a name, without fields after the name, or whose four
+ * numeric fields cannot all be parsed are skipped instead of being
+ * inserted with unset values.
+ */
+faidx_t *fai_read(FILE *fp)
+{
+    faidx_t *fai;
+    char *buf, *p;
+    int len, line_len, line_blen, n_field;
+#ifdef _WIN32
+    long offset;
+#else
+    long long offset;
+#endif
+    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
+    fai->hash = kh_init(s);
+    buf = (char*)calloc(0x10000, 1);
+    while (fgets(buf, 0x10000, fp)) {
+        for (p = buf; *p && isgraph((unsigned char)*p); ++p);
+        // p == buf: no name; *p == 0: nothing follows the name, and stepping
+        // over the terminator would parse what an earlier line left in buf
+        if (p == buf || *p == 0) continue;
+        *p = 0; ++p;
+#ifdef _WIN32
+        n_field = sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
+#else
+        n_field = sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
+#endif
+        if (n_field != 4) {
+            fprintf(stderr, "[fai_read] skip malformed index line for '%s'.\n", buf);
+            continue;
+        }
+        fai_insert_index(fai, buf, len, line_len, line_blen, offset);
+    }
+    free(buf);
+    return fai;
+}
+
+/*
+ * Free an index: the stored names, the name array, the hash table, the
+ * FASTA handle when one is open, and the struct itself.
+ * A null pointer is ignored.
+ */
+void fai_destroy(faidx_t *fai)
+{
+    int i;
+    if (fai == 0) return;
+    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
+    free(fai->name);
+    kh_destroy(s, fai->hash);
+    if (fai->rz) razf_close(fai->rz);
+    free(fai);
+}
+
+/*
+ * Build the index of FASTA file fn and write it to "<fn>.fai".
+ * Returns 0 on success; -1 when the FASTA file cannot be opened, when
+ * indexing fails, or when the index file cannot be created.
+ */
+int fai_build(const char *fn)
+{
+    char *str;
+    RAZF *rz;
+    FILE *fp;
+    faidx_t *fai;
+    str = (char*)calloc(strlen(fn) + 5, 1); // room for ".fai" and the terminator
+    sprintf(str, "%s.fai", fn);
+    rz = razf_open(fn, "r");
+    if (rz == 0) {
+        fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
+        free(str);
+        return -1;
+    }
+    fai = fai_build_core(rz);
+    razf_close(rz);
+    if (fai == 0) { // fai_build_core() has already reported the reason
+        free(str);
+        return -1;
+    }
+    fp = fopen(str, "wb");
+    if (fp == 0) {
+        fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
+        fai_destroy(fai); free(str);
+        return -1;
+    }
+    fai_save(fai, fp);
+    fclose(fp);
+    free(str);
+    fai_destroy(fai);
+    return 0;
+}
+
+#ifdef _USE_KNETFILE
+/*
+ * Open the file named by the last path component of fn in the working
+ * directory. If it is not there yet, download fn into the working
+ * directory first. Returns a stream opened for reading, or NULL when the
+ * remote file cannot be opened, the local copy cannot be created, or the
+ * transfer buffer cannot be allocated.
+ */
+FILE *download_and_open(const char *fn)
+{
+    const int buf_size = 1 * 1024 * 1024;
+    uint8_t *buf;
+    FILE *fp;
+    knetFile *fp_remote;
+    const char *url = fn;
+    const char *p;
+    int l = strlen(fn);
+    for (p = fn + l - 1; p >= fn; --p)
+        if (*p == '/') break;
+    fn = p + 1; // local name: everything after the last '/'
+
+    // First try to open a local copy
+    fp = fopen(fn, "r");
+    if (fp)
+        return fp;
+
+    // If failed, download from remote and open
+    fp_remote = knet_open(url, "rb");
+    if (fp_remote == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
+        return NULL;
+    }
+    if ((fp = fopen(fn, "wb")) == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
+        knet_close(fp_remote);
+        return NULL;
+    }
+    buf = (uint8_t*)calloc(buf_size, 1);
+    if (buf == 0) {
+        fclose(fp);
+        knet_close(fp_remote);
+        return NULL;
+    }
+    // stop on end of data and on a read error: a negative count must never reach fwrite()
+    while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+        fwrite(buf, 1, l, fp);
+    free(buf);
+    fclose(fp);
+    knet_close(fp_remote);
+
+    return fopen(fn, "r");
+}
+#endif
+
+/*
+ * Load the index "<fn>.fai" and open the FASTA file fn itself.
+ *
+ * With _USE_KNETFILE, an ftp:// or http:// name makes the index come from
+ * download_and_open(). Otherwise a missing index is built with
+ * fai_build() and opened again.
+ *
+ * Returns the index with its rz handle open, or 0 when the index or the
+ * FASTA file cannot be opened.
+ */
+faidx_t *fai_load(const char *fn)
+{
+    char *str;
+    FILE *fp;
+    faidx_t *fai;
+    str = (char*)calloc(strlen(fn) + 5, 1); // room for ".fai" and the terminator
+    sprintf(str, "%s.fai", fn);
+
+#ifdef _USE_KNETFILE
+    if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
+    {
+        fp = download_and_open(str);
+        if ( !fp )
+        {
+            fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
+            free(str);
+            return 0;
+        }
+    }
+    else
+#endif
+        fp = fopen(str, "rb");
+    if (fp == 0) {
+        fprintf(stderr, "[fai_load] build FASTA index.\n");
+        fai_build(fn);
+        fp = fopen(str, "rb");
+        if (fp == 0) {
+            fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
+            free(str);
+            return 0;
+        }
+    }
+
+    fai = fai_read(fp);
+    fclose(fp);
+
+    fai->rz = razf_open(fn, "rb");
+    free(str);
+    if (fai->rz == 0) {
+        fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
+        fai_destroy(fai); // do not leak the index that was just read
+        return 0;
+    }
+    return fai;
+}
+
+/*
+ * Fetch the sequence of a region.
+ *
+ * str: either a sequence name or "name:beg-end" with 1-based coordinates;
+ *      white space and the commas inside the numbers are ignored. When
+ *      the text after the last colon is not a valid interval, or the part
+ *      before it is not a known name, the whole string is tried as a name.
+ * len: receives the number of bases returned; 0 when the name is unknown.
+ *
+ * Returns a malloc'ed, NUL-terminated string the caller must free, or 0
+ * when the sequence name is not in the index. Coordinates are clamped to
+ * the sequence, and a start below 1 is taken as the first base.
+ */
+char *fai_fetch(const faidx_t *fai, const char *str, int *len)
+{
+    char *s, c;
+    int i, l, k, name_end;
+    khiter_t iter;
+    faidx1_t val;
+    khash_t(s) *h;
+    int beg, end;
+
+    beg = end = -1;
+    h = fai->hash;
+    name_end = l = strlen(str);
+    s = (char*)malloc(l+1);
+    // remove space
+    for (i = k = 0; i < l; ++i)
+        if (!isspace((unsigned char)str[i])) s[k++] = str[i];
+    s[k] = 0; l = k;
+    // determine the sequence name
+    for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+    if (i >= 0) name_end = i;
+    if (name_end < l) { // check if this is really the end
+        int n_hyphen = 0;
+        for (i = name_end + 1; i < l; ++i) {
+            if (s[i] == '-') ++n_hyphen;
+            else if (!isdigit((unsigned char)s[i]) && s[i] != ',') break;
+        }
+        if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
+        s[name_end] = 0;
+        iter = kh_get(s, h, s);
+        if (iter == kh_end(h)) { // cannot find the sequence name
+            iter = kh_get(s, h, str); // try str as the name
+            if (iter == kh_end(h)) {
+                *len = 0;
+                free(s); return 0;
+            } else s[name_end] = ':', name_end = l;
+        }
+    } else iter = kh_get(s, h, str);
+    if(iter == kh_end(h)) {
+        fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
+        *len = 0; // callers loop over *len even when the fetch fails
+        free(s);
+        return 0;
+    };
+    val = kh_value(h, iter);
+    // parse the interval
+    if (name_end < l) {
+        for (i = k = name_end + 1; i < l; ++i)
+            if (s[i] != ',') s[k++] = s[i];
+        s[k] = 0;
+        beg = atoi(s + name_end + 1);
+        for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+        end = i < k? atoi(s + i + 1) : val.len;
+        if (beg > 0) --beg; // 1-based to 0-based
+    } else beg = 0, end = val.len;
+    if (beg < 0) beg = 0; // "name:-N" parses as a negative start; never seek before the sequence
+    if (beg >= val.len) beg = val.len;
+    if (end >= val.len) end = val.len;
+    if (beg > end) beg = end;
+    free(s);
+
+    // now retrieve the sequence
+    l = 0;
+    s = (char*)malloc(end - beg + 2);
+    if (val.line_blen > 0) { // line_blen is a divisor below; skip the read for records without line data
+        razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
+        while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
+            if (isgraph((unsigned char)c)) s[l++] = c;
+    }
+    s[l] = '\0';
+    *len = l;
+    return s;
+}
+
+/*
+ * faidx <in.fasta> [<reg> [...]]
+ *
+ * With only a file name, build its index. With regions, print each one
+ * as a FASTA record wrapped at 60 columns; a region that cannot be
+ * fetched prints only its header line.
+ *
+ * Returns 0 on success, 1 on a usage error or when the index cannot be
+ * built or loaded.
+ */
+int faidx_main(int argc, char *argv[])
+{
+    int i;
+    faidx_t *fai;
+
+    if (argc == 1) {
+        fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
+        return 1;
+    }
+    if (argc == 2) return fai_build(argv[1]) == 0? 0 : 1;
+
+    fai = fai_load(argv[1]);
+    if (fai == 0) return 1;
+    for (i = 2; i != argc; ++i) {
+        int j, k, l = 0; // l stays 0 if fai_fetch() fails without setting it
+        char *s;
+        printf(">%s\n", argv[i]);
+        s = fai_fetch(fai, argv[i], &l);
+        if (s == 0) continue; // nothing to print or free for a failed fetch
+        for (j = 0; j < l; j += 60) {
+            for (k = 0; k < 60 && k < l - j; ++k)
+                putchar(s[j + k]);
+            putchar('\n');
+        }
+        free(s);
+    }
+    fai_destroy(fai);
+    return 0;
+}
+
+/* Report how many sequences the index holds. */
+int faidx_fetch_nseq(const faidx_t *fai)
+{
+    const int n_seq = fai->n;
+    return n_seq;
+}
+
+/*
+ * Fetch bases p_beg_i..p_end_i (0-based, both inclusive) of sequence c_name.
+ *
+ * Both coordinates are clamped to the sequence; when p_end_i < p_beg_i
+ * the start is moved down to the end. len receives the number of bases
+ * returned, or 0 when the name is unknown.
+ *
+ * Returns a malloc'ed, NUL-terminated string the caller must free; 0 when
+ * c_name is not in the index; an empty string for a sequence that has no
+ * bases or no line data.
+ */
+char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
+{
+    int l;
+    char c;
+    khiter_t iter;
+    faidx1_t val;
+    char *seq=NULL;
+
+    // Adjust position
+    iter = kh_get(s, fai->hash, c_name);
+    if(iter == kh_end(fai->hash)) {
+        *len = 0;
+        return 0;
+    }
+    val = kh_value(fai->hash, iter);
+    if (val.len <= 0 || val.line_blen <= 0) {
+        // no bases: clamping would give index -1, and line_blen is a divisor below
+        seq = (char*)calloc(1, 1);
+        *len = 0;
+        return seq;
+    }
+    if(p_end_i < p_beg_i) p_beg_i = p_end_i;
+    if(p_beg_i < 0) p_beg_i = 0;
+    else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
+    if(p_end_i < 0) p_end_i = 0;
+    else if(val.len <= p_end_i) p_end_i = val.len - 1;
+
+    // Now retrieve the sequence
+    l = 0;
+    seq = (char*)malloc(p_end_i - p_beg_i + 2);
+    razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
+    while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
+        if (isgraph((unsigned char)c)) seq[l++] = c;
+    seq[l] = '\0';
+    *len = l;
+    return seq;
+}
+
+#ifdef FAIDX_MAIN
+/* Stand-alone entry point, compiled only when FAIDX_MAIN is defined; forwards to faidx_main(). */
+int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
+#endif
diff --git a/samtools-0.1.19/faidx.h b/samtools-0.1.19/faidx.h
new file mode 100644
index 0000000..1fb1b1f
--- /dev/null
+++ b/samtools-0.1.19/faidx.h
@@ -0,0 +1,103 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#ifndef FAIDX_H
+#define FAIDX_H
+
+/*!
+ @header
+
+ Index FASTA files and extract subsequence.
+
+ @copyright The Wellcome Trust Sanger Institute.
+ */
+
+struct __faidx_t;
+typedef struct __faidx_t faidx_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*!
+ @abstract Build index for a FASTA or razip compressed FASTA file.
+ @param fn FASTA file name
+ @return 0 on success; or -1 on failure
+ @discussion File "fn.fai" will be generated.
+ */
+ int fai_build(const char *fn);
+
+ /*!
+ @abstract Destroy a faidx_t struct.
+ @param fai Pointer to the struct to be destroyed
+ */
+ void fai_destroy(faidx_t *fai);
+
+ /*!
+ @abstract Load index from "fn.fai".
+ @param fn File name of the FASTA file
+ */
+ faidx_t *fai_load(const char *fn);
+
+ /*!
+ @abstract Fetch the sequence in a region.
+ @param fai Pointer to the faidx_t struct
+ @param reg Region in the format "chr2:20,000-30,000"
+ @param len Length of the region
+ @return Pointer to the sequence; null on failure
+
+ @discussion The returned sequence is allocated by malloc family
+ and should be destroyed by end users by calling free() on it.
+ */
+ char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
+
+ /*!
+ @abstract Fetch the number of sequences.
+ @param fai Pointer to the faidx_t struct
+ @return The number of sequences
+ */
+ int faidx_fetch_nseq(const faidx_t *fai);
+
+ /*!
+ @abstract Fetch the sequence in a region.
+ @param fai Pointer to the faidx_t struct
+ @param c_name Region name
+ @param p_beg_i Beginning position number (zero-based)
+ @param p_end_i End position number (zero-based)
+ @param len Length of the region
+ @return Pointer to the sequence; null on failure
+
+ @discussion The returned sequence is allocated by malloc family
+ and should be destroyed by end users by calling free() on it.
+ */
+ char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/kaln.c b/samtools-0.1.19/kaln.c
new file mode 100644
index 0000000..9c0bbaa
--- /dev/null
+++ b/samtools-0.1.19/kaln.c
@@ -0,0 +1,486 @@
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3 at gmail.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "kaln.h"
+
+#define FROM_M 0
+#define FROM_I 1
+#define FROM_D 2
+
+typedef struct {
+ int i, j;
+ unsigned char ctype;
+} path_t;
+
+int aln_sm_blosum62[] = {
+/* A R N D C Q E G H I L K M F P S T W Y V * X */
+ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
+ -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
+ -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
+ -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
+ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
+ -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
+ -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
+ -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
+ -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
+ -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
+ -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+ -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
+ -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
+ -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
+ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
+ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
+ -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
+ -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
+ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
+ -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
+ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
+};
+
+int aln_sm_blast[] = {
+ 1, -3, -3, -3, -2,
+ -3, 1, -3, -3, -2,
+ -3, -3, 1, -3, -2,
+ -3, -3, -3, 1, -2,
+ -2, -2, -2, -2, -2
+};
+
+int aln_sm_qual[] = {
+ 0, -23, -23, -23, 0,
+ -23, 0, -23, -23, 0,
+ -23, -23, 0, -23, 0,
+ -23, -23, -23, 0, 0,
+ 0, 0, 0, 0, 0
+};
+
+ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
+
+ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
+
+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) /* run-length encode path[] into 32-bit words of the form (run length << 4 | ctype) */
+{
+ int i, n;
+ uint32_t *cigar;
+ unsigned char last_type;
+
+ if (path_len == 0 || path == 0) { /* empty input: report zero operations and return a null pointer */
+ *n_cigar = 0;
+ return 0;
+ }
+
+ last_type = path->ctype;
+ for (i = n = 1; i < path_len; ++i) { /* first pass: count the runs of identical ctype */
+ if (last_type != path[i].ctype) ++n;
+ last_type = path[i].ctype;
+ }
+ *n_cigar = n;
+ cigar = (uint32_t*)calloc(*n_cigar, 4); /* one 4-byte word per run; NOTE(review): the result is not checked for NULL */
+
+ cigar[0] = 1u << 4 | path[path_len-1].ctype; /* start from the LAST path entry: output order is the reverse of path[] */
+ last_type = path[path_len-1].ctype;
+ for (i = path_len - 2, n = 0; i >= 0; --i) { /* second pass: walk path[] backwards */
+ if (path[i].ctype == last_type) cigar[n] += 1u << 4; /* same type: add one to the length held above the low 4 bits */
+ else {
+ cigar[++n] = 1u << 4 | path[i].ctype; /* type changed: open a new run of length 1 */
+ last_type = path[i].ctype;
+ }
+ }
+
+ return cigar; /* allocated with calloc() above and not freed in this function */
+}
+
+/***************************/
+/* START OF common_align.c */
+/***************************/
+
+#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
+
+#define set_M(MM, cur, p, sc) \
+{ \
+ if ((p)->M >= (p)->I) { \
+ if ((p)->M >= (p)->D) { \
+ (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
+ } else { \
+ (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
+ } \
+ } else { \
+ if ((p)->I > (p)->D) { \
+ (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
+ } else { \
+ (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
+ } \
+ } \
+}
+#define set_I(II, cur, p) \
+{ \
+ if ((p)->M - gap_open > (p)->I) { \
+ (cur)->It = FROM_M; \
+ (II) = (p)->M - gap_open - gap_ext; \
+ } else { \
+ (cur)->It = FROM_I; \
+ (II) = (p)->I - gap_ext; \
+ } \
+}
+#define set_end_I(II, cur, p) \
+{ \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->I) { \
+ (cur)->It = FROM_M; \
+ (II) = (p)->M - gap_end_open - gap_end_ext; \
+ } else { \
+ (cur)->It = FROM_I; \
+ (II) = (p)->I - gap_end_ext; \
+ } \
+ } else set_I(II, cur, p); \
+}
+#define set_D(DD, cur, p) \
+{ \
+ if ((p)->M - gap_open > (p)->D) { \
+ (cur)->Dt = FROM_M; \
+ (DD) = (p)->M - gap_open - gap_ext; \
+ } else { \
+ (cur)->Dt = FROM_D; \
+ (DD) = (p)->D - gap_ext; \
+ } \
+}
+#define set_end_D(DD, cur, p) \
+{ \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->D) { \
+ (cur)->Dt = FROM_M; \
+ (DD) = (p)->M - gap_end_open - gap_end_ext; \
+ } else { \
+ (cur)->Dt = FROM_D; \
+ (DD) = (p)->D - gap_end_ext; \
+ } \
+ } else set_D(DD, cur, p); \
+}
+
+typedef struct {
+ uint8_t Mt:3, It:2, Dt:3;
+} dpcell_t;
+
+typedef struct {
+ int M, I, D;
+} dpscore_t;
+
+/***************************
+ * banded global alignment *
+ ***************************/
+uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
+{ /* Banded global DP: seq1 indexes columns (i), seq2 indexes rows (j). Stores the final cell's M score in *_score; if n_cigar is non-null, also backtraces and returns the array built by ka_path2cigar32(), otherwise returns 0. */
+ int i, j;
+ dpcell_t **dpcell, *q;
+ dpscore_t *curr, *last, *s;
+ int b1, b2, tmp_end;
+ int *mat, end, max = 0;
+ uint8_t type, ctype;
+ uint32_t *cigar = 0;
+
+ int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
+ int *score_matrix, N_MATRIX_ROW;
+
+ /* copy the alignment parameters into locals; the set_*() macros refer to these names directly */
+ gap_open = ap->gap_open;
+ gap_ext = ap->gap_ext;
+ gap_end_open = ap->gap_end_open;
+ gap_end_ext = ap->gap_end_ext;
+ b = ap->band_width;
+ score_matrix = ap->matrix;
+ N_MATRIX_ROW = ap->row;
+
+ if (n_cigar) *n_cigar = 0;
+ if (len1 == 0 || len2 == 0) return 0; /* NOTE(review): this early return leaves *_score unwritten */
+
+ /* calculate b1 and b2: band extents, widened by the length difference and clamped to the sequence lengths */
+ if (len1 > len2) {
+ b1 = len1 - len2 + b;
+ b2 = b;
+ } else {
+ b1 = b;
+ b2 = len2 - len1 + b;
+ }
+ if (b1 > len1) b1 = len1;
+ if (b2 > len2) b2 = len2;
+ --seq1; --seq2; /* make both sequences 1-based: seq1[1] and seq2[1] are the first residues */
+
+ /* allocate memory; NOTE(review): none of the malloc() results below are checked */
+ end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
+ dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
+ for (j = 0; j <= len2; ++j)
+ dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
+ for (j = b2 + 1; j <= len2; ++j)
+ dpcell[j] -= j - b2; /* rebase the row pointer so the row is indexed by absolute column i; reversed before free() below */
+ curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+ last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+
+ /* set first row */
+ SET_INF(*curr); curr->M = 0;
+ for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
+ SET_INF(*s);
+ set_end_D(s->D, dpcell[0] + i, s - 1);
+ }
+ s = curr; curr = last; last = s; /* swap row buffers: the row just filled becomes "last" */
+
+ /* core dynamic programming, part 1 */
+ tmp_end = (b2 < len2)? b2 : len2 - 1;
+ for (j = 1; j <= tmp_end; ++j) {
+ q = dpcell[j]; s = curr; SET_INF(*s);
+ set_end_I(s->I, q, last);
+ end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW; /* row of the score matrix selected by the current seq2 residue */
+ ++s; ++q;
+ for (i = 1; i != end; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+ set_I(s->I, q, last + i);
+ set_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_D(s->D, q, s - 1);
+ if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+ set_end_I(s->I, q, last + i);
+ } else s->I = MINOR_INF;
+ s = curr; curr = last; last = s;
+ }
+ /* last row for part 1, use set_end_D() instead of set_D() */
+ if (j == len2 && b2 != len2 - 1) {
+ q = dpcell[j]; s = curr; SET_INF(*s);
+ set_end_I(s->I, q, last);
+ end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ ++s; ++q;
+ for (i = 1; i != end; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+ set_I(s->I, q, last + i);
+ set_end_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_end_D(s->D, q, s - 1);
+ if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+ set_end_I(s->I, q, last + i);
+ } else s->I = MINOR_INF;
+ s = curr; curr = last; last = s;
+ ++j;
+ }
+
+ /* core dynamic programming, part 2 */
+ for (; j <= len2 - b2 + 1; ++j) {
+ SET_INF(curr[j - b2]);
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ end = j + b1 - 1;
+ for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_I(s->I, q, last + i);
+ set_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_D(s->D, q, s - 1);
+ s->I = MINOR_INF;
+ s = curr; curr = last; last = s;
+ }
+
+ /* core dynamic programming, part 3 */
+ for (; j < len2; ++j) {
+ SET_INF(curr[j - b2]);
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_I(s->I, q, last + i);
+ set_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+ set_end_I(s->I, q, last + i);
+ set_D(s->D, q, s - 1);
+ s = curr; curr = last; last = s;
+ }
+ /* last row */
+ if (j == len2) {
+ SET_INF(curr[j - b2]);
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_I(s->I, q, last + i);
+ set_end_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+ set_end_I(s->I, q, last + i);
+ set_end_D(s->D, q, s - 1);
+ s = curr; curr = last; last = s;
+ }
+
+ *_score = last[len1].M; /* only the M state of the final cell is reported, while the backtrace below starts from the best of M, I and D */
+ if (n_cigar) { /* backtrace */
+ path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
+ i = len1; j = len2;
+ q = dpcell[j] + i;
+ s = last + len1;
+ max = s->M; type = q->Mt; ctype = FROM_M;
+ if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
+ if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
+
+ p = path;
+ p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
+ ++p;
+ do {
+ switch (ctype) { /* step to the predecessor cell implied by the current state */
+ case FROM_M: --i; --j; break;
+ case FROM_I: --j; break;
+ case FROM_D: --i; break;
+ }
+ q = dpcell[j] + i;
+ ctype = type;
+ switch (type) { /* look up which state the predecessor itself came from */
+ case FROM_M: type = q->Mt; break;
+ case FROM_I: type = q->It; break;
+ case FROM_D: type = q->Dt; break;
+ }
+ p->ctype = ctype; p->i = i; p->j = j;
+ ++p;
+ } while (i || j); /* stop once the origin cell (0,0) has been reached */
+ cigar = ka_path2cigar32(path, p - path - 1, n_cigar); /* the last entry written (the origin cell) is excluded from the path length */
+ free(path);
+ }
+
+ /* free memory */
+ for (j = b2 + 1; j <= len2; ++j)
+ dpcell[j] += j - b2; /* undo the rebasing so free() receives the pointers malloc() returned */
+ for (j = 0; j <= len2; ++j)
+ free(dpcell[j]);
+ free(dpcell);
+ free(curr); free(last);
+
+ return cigar;
+}
+
+typedef struct {
+ int M, I, D;
+} score_aux_t;
+
+#define MINUS_INF -0x40000000
+
+// matrix: len2 rows and len1 columns
+int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) /* score-only banded global alignment: returns the largest of M, I and D in the final cell; no traceback is kept */
+{
+
+#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
+ int t1, t2; \
+ score_aux_t *_q; \
+ _q = _q0; \
+ _p->M = _q->M >= _q->I? _q->M : _q->I; \
+ _p->M = _p->M >= _q->D? _p->M : _q->D; \
+ _p->M += (_sc); \
+ ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
+ _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
+ }
+
+ int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
+ const uint8_t *seq1, *seq2;
+ score_aux_t *curr, *last, *swap;
+ bw = abs(len1 - len2) + ap->band_width; /* band half-width: configured width plus the length difference */
+ i = len1 > len2? len1 : len2;
+ if (bw > i + 1) bw = i + 1; /* clamp the band to the longer sequence length + 1 */
+ seq1 = _seq1 - 1; seq2 = _seq2 - 1; /* 1-based views: seq1[1] and seq2[1] are the first residues; NOTE(review): len1 == 0 or len2 == 0 is not guarded, and the last-row block then reads seq1[0]/seq2[0] */
+ curr = calloc(len1 + 2, sizeof(score_aux_t)); /* columns 0..len1 plus one extra cell for the MINUS_INF sentinel; NOTE(review): calloc results are not checked */
+ last = calloc(len1 + 2, sizeof(score_aux_t));
+ { // the zero-th row
+ int x, end = len1;
+ score_aux_t *p;
+ j = 0;
+ x = j + bw; end = len1 < x? len1 : x; // band end
+ p = curr;
+ p->M = 0; p->I = p->D = MINUS_INF;
+ for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
+ p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); /* leading gap along the zero-th row is charged with the end penalties edo/ede */
+ p->M = p->I = p->D = MINUS_INF; /* sentinel just past the band end */
+ swap = curr; curr = last; last = swap;
+ }
+ for (j = 1; j < len2; ++j) {
+ int x, beg = 0, end = len1, *scrow, col_end;
+ score_aux_t *p;
+ x = j - bw; beg = 0 > x? 0 : x; // band start
+ x = j + bw; end = len1 < x? len1 : x; // band end
+ if (beg == 0) { // from zero-th column
+ p = curr;
+ p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); /* leading gap down the zero-th column is charged with the end penalties eio/eie */
+ ++beg; // then beg = 1
+ }
+ scrow = scmat + seq2[j] * scmat_size; /* score-matrix row selected by the current seq2 residue */
+ if (end == len1) col_end = 1, --end; /* band reaches the last column: handle that column separately below */
+ else col_end = 0;
+ for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
+ if (col_end) {
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); /* last column: I uses the end penalties eio/eie */
+ ++p;
+ }
+ p->M = p->I = p->D = MINUS_INF; /* sentinel just past the last cell written in this row */
+// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+ swap = curr; curr = last; last = swap;
+ }
+ { // the last row
+ int x, beg = 0, *scrow;
+ score_aux_t *p;
+ j = len2;
+ x = j - bw; beg = 0 > x? 0 : x; // band start
+ if (beg == 0) { // from zero-th column
+ p = curr;
+ p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
+ ++beg; // then beg = 1
+ }
+ scrow = scmat + seq2[j] * scmat_size;
+ for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) /* no band end is computed here: the loop always runs up to column len1 */
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); /* last row: D uses the end penalties edo/ede */
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); /* final cell: end penalties for both I and D */
+// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+ }
+ ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; /* curr was not swapped after the last row, so it still holds that row */
+ ret = ret >= curr[len1].D? ret : curr[len1].D;
+ free(curr); free(last);
+ return ret;
+}
+
+#ifdef _MAIN
+int main(int argc, char *argv[]) /* self-test driver, compiled only when _MAIN is defined; argc and argv are unused */
+{
+// int len1 = 35, len2 = 35;
+// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
+// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
+ int len1 = 4, len2 = 4;
+ uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; /* sequences are given as numeric codes (octal escapes), not as letters */
+ uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
+ int sc;
+// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
+ sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); /* score the two 4-residue test sequences with the ka_param2_qual parameters */
+ printf("%d\n", sc);
+ return 0;
+}
+#endif
diff --git a/samtools-0.1.19/kaln.h b/samtools-0.1.19/kaln.h
new file mode 100644
index 0000000..1ece132
--- /dev/null
+++ b/samtools-0.1.19/kaln.h
@@ -0,0 +1,67 @@
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3 at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef LH3_KALN_H_
+#define LH3_KALN_H_
+
+#include <stdint.h>
+
+#define MINOR_INF -1073741823
+
+typedef struct { /* parameters read by ka_global_core() */
+ int gap_open; /* subtracted, together with gap_ext, when a gap is opened from the M state */
+ int gap_ext; /* subtracted for each step that extends a gap */
+ int gap_end_open; /* gap-open penalty used by the set_end_I/set_end_D macros in kaln.c */
+ int gap_end_ext; /* gap-extension penalty for the set_end_* macros; if negative they fall back to gap_open/gap_ext */
+
+ int *matrix; /* flat score matrix, indexed as matrix[a * row + b] */
+ int row; /* number of entries per matrix row */
+ int band_width; /* base band width; ka_global_core() widens it by the length difference */
+} ka_param_t;
+
+typedef struct { /* parameters read by ka_global_score() */
+ int iio, iie, ido, ide; /* penalties used for interior cells: I-state open/extend, then D-state open/extend */
+ int eio, eie, edo, ede; /* same order, used on the matrix edges: eio/eie for the zero-th and last column, edo/ede for the zero-th and last row */
+ int *matrix; /* flat score matrix, indexed as matrix[a * row + b] */
+ int row; /* number of entries per matrix row */
+ int band_width; /* base band width; ka_global_score() adds abs(len1 - len2) to it */
+} ka_param2_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap,
+ int *_score, int *n_cigar);
+ int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap);
+#ifdef __cplusplus
+}
+#endif
+
+extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */
+extern ka_param_t ka_param_qual; // only use this for global alignment!!!
+extern ka_param2_t ka_param2_qual; // only use this for global alignment!!!
+
+#endif
diff --git a/samtools-0.1.19/khash.h b/samtools-0.1.19/khash.h
new file mode 100644
index 0000000..a7e8056
--- /dev/null
+++ b/samtools-0.1.19/khash.h
@@ -0,0 +1,528 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ if (!ret) kh_del(32, h, k);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2011-02-14 (0.2.5):
+
+ * Allow to declare global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+
+ @copyright Heng Li
+ */
+
+#define AC_VERSION_KHASH_H "0.2.5"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler-specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_HASH_PRIME_SIZE 32
+static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
+{
+ 0ul, 3ul, 11ul, 23ul, 53ul,
+ 97ul, 193ul, 389ul, 769ul, 1543ul,
+ 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
+ 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
+ 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
+ 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
+ 3221225473ul, 4294967291ul
+};
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ extern kh_##name##_t *kh_init_##name(); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ SCOPE kh_##name##_t *kh_init_##name() { \
+ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ free(h->keys); free(h->flags); \
+ free(h->vals); \
+ free(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t inc, k, i, last; \
+ k = __hash_func(key); i = k % h->n_buckets; \
+ inc = 1 + k % (h->n_buckets - 1); last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+ else i += inc; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ khint_t t = __ac_HASH_PRIME_SIZE - 1; \
+ while (__ac_prime_list[t] > new_n_buckets) --t; \
+ new_n_buckets = __ac_prime_list[t+1]; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
+ else { \
+ new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
+ memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) \
+ h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ } \
+ } \
+ if (j) { \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { \
+ khint_t inc, k, i; \
+ k = __hash_func(key); \
+ i = k % new_n_buckets; \
+ inc = 1 + k % (new_n_buckets - 1); \
+ while (!__ac_isempty(new_flags, i)) { \
+ if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
+ else i += inc; \
+ } \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); \
+ } else { \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) \
+ h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ free(h->flags); \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { \
+ if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
+ else kh_resize_##name(h, h->n_buckets + 1); \
+ } \
+ { \
+ khint_t inc, k, i, site, last; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
+ if (__ac_isempty(h->flags, i)) x = i; \
+ else { \
+ inc = 1 + k % (h->n_buckets - 1); last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+ else i += inc; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static inline khint_t __ac_X31_hash_string(const char *s)
+{
+ khint_t h = *s; /* seed with the first character; an empty string therefore hashes to 0 */
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; /* (h << 5) - h is h * 31; add each following character up to the terminating NUL */
+ return h;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other necessary macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash set containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash set containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/samtools-0.1.19/klist.h b/samtools-0.1.19/klist.h
new file mode 100644
index 0000000..2f17016
--- /dev/null
+++ b/samtools-0.1.19/klist.h
@@ -0,0 +1,96 @@
+#ifndef _LH3_KLIST_H
+#define _LH3_KLIST_H
+
+#include <stdlib.h>
+
+#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) /* declares pool type kmp_<name>_t of kmptype_t objects plus its init/destroy/alloc/free functions */ \
+ typedef struct { \
+ size_t cnt, n, max; /* cnt: objects handed out and not yet returned; n: objects parked in buf; max: capacity of buf */ \
+ kmptype_t **buf; /* stack of returned objects waiting to be reused */ \
+ } kmp_##name##_t; \
+ static inline kmp_##name##_t *kmp_init_##name() { /* new, empty pool (all fields zero) */ \
+ return calloc(1, sizeof(kmp_##name##_t)); \
+ } \
+ static inline void kmp_destroy_##name(kmp_##name##_t *mp) { /* releases the pool and the objects parked in it; objects still handed out are not touched */ \
+ size_t k; \
+ for (k = 0; k < mp->n; ++k) { \
+ kmpfree_f(mp->buf[k]); free(mp->buf[k]); /* caller-supplied hook first (presumably frees what the object owns), then the object itself */ \
+ } \
+ free(mp->buf); free(mp); \
+ } \
+ static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { /* hands out a parked object if there is one, otherwise a zeroed new one */ \
+ ++mp->cnt; \
+ if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \
+ return mp->buf[--mp->n]; /* a reused object is returned as it was left, not cleared */ \
+ } \
+ static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { /* parks p for reuse; nothing is freed here */ \
+ --mp->cnt; \
+ if (mp->n == mp->max) { /* buf is full: start with 16 slots, then double */ \
+ mp->max = mp->max? mp->max<<1 : 16; \
+ mp->buf = realloc(mp->buf, sizeof(void*) * mp->max); /* NOTE(review): the result is not checked for NULL */ \
+ } \
+ mp->buf[mp->n++] = p; \
+ }
+
+#define kmempool_t(name) kmp_##name##_t /* pool type for <name> */
+#define kmp_init(name) kmp_init_##name() /* the wrappers below select the functions generated for <name> */
+#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
+#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
+#define kmp_free(name, mp, p) kmp_free_##name(mp, p)
+
+#define KLIST_INIT(name, kltype_t, kmpfree_t) /* declares a singly linked queue of kltype_t whose nodes come from a KMEMPOOL */ \
+ struct __kl1_##name { \
+ kltype_t data; \
+ struct __kl1_##name *next; \
+ }; \
+ typedef struct __kl1_##name kl1_##name; \
+ KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \
+ typedef struct { \
+ kl1_##name *head, *tail; /* tail is always an unused placeholder node; the list is empty when head == tail */ \
+ kmp_##name##_t *mp; /* node pool */ \
+ size_t size; /* number of stored elements (placeholder excluded) */ \
+ } kl_##name##_t; \
+ static inline kl_##name##_t *kl_init_##name() { /* empty list: one placeholder node serving as head and tail */ \
+ kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \
+ kl->mp = kmp_init(name); \
+ kl->head = kl->tail = kmp_alloc(name, kl->mp); \
+ kl->head->next = 0; \
+ return kl; \
+ } \
+ static inline void kl_destroy_##name(kl_##name##_t *kl) { /* returns every node to the pool, then destroys the pool and the list */ \
+ kl1_##name *p; \
+ for (p = kl->head; p != kl->tail; p = p->next) /* reading p->next after kmp_free() is fine: the node is only parked, not freed */ \
+ kmp_free(name, kl->mp, p); \
+ kmp_free(name, kl->mp, p); /* p == tail here: the placeholder */ \
+ kmp_destroy(name, kl->mp); \
+ free(kl); \
+ } \
+ static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { /* appends an element and returns a pointer to its data slot for the caller to fill */ \
+ kl1_##name *q, *p = kmp_alloc(name, kl->mp); \
+ q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; /* the old placeholder q becomes the element, p the new placeholder */ \
+ ++kl->size; \
+ return &q->data; \
+ } \
+ static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { /* removes the first element, copying it to *d when d is non-NULL; -1 if empty, else 0 */ \
+ kl1_##name *p; \
+ if (kl->head->next == 0) return -1; \
+ --kl->size; \
+ p = kl->head; kl->head = kl->head->next; \
+ if (d) *d = p->data; \
+ kmp_free(name, kl->mp, p); \
+ return 0; \
+ }
+
+#define kliter_t(name) kl1_##name /* node type, used as iterator */
+#define klist_t(name) kl_##name##_t /* list type */
+#define kl_val(iter) ((iter)->data)
+#define kl_next(iter) ((iter)->next)
+#define kl_begin(kl) ((kl)->head)
+#define kl_end(kl) ((kl)->tail) /* the placeholder node: iterate while iter != kl_end(kl) */
+
+#define kl_init(name) kl_init_##name() /* the wrappers below select the functions generated for <name> */
+#define kl_destroy(name, kl) kl_destroy_##name(kl)
+#define kl_pushp(name, kl) kl_pushp_##name(kl)
+#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
+
+#endif
diff --git a/samtools-0.1.19/knetfile.c b/samtools-0.1.19/knetfile.c
new file mode 100644
index 0000000..af09146
--- /dev/null
+++ b/samtools-0.1.19/knetfile.c
@@ -0,0 +1,632 @@
+/* The MIT License
+
+ Copyright (c) 2008 by Genome Research Ltd (GRL).
+ 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Probably I will not do socket programming in the next few years and
+ therefore I decide to heavily annotate this file, for Linux and
+ Windows as well. -ac */
+
+#include <time.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#include "knetfile.h"
+
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* Block for at most five seconds until fd becomes readable (is_read != 0)
+ * or writable (is_read == 0). Returns whatever select() returned. */
+static int socket_wait(int fd, int is_read)
+{
+    struct timeval timeout;
+    fd_set watched;
+    int status;
+    timeout.tv_sec = 5; // 5 seconds time out
+    timeout.tv_usec = 0;
+    FD_ZERO(&watched);
+    FD_SET(fd, &watched);
+    status = is_read? select(fd+1, &watched, 0, 0, &timeout)
+                    : select(fd+1, 0, &watched, 0, &timeout);
+#ifndef _WIN32
+    if (status == -1) perror("select");
+#else
+    if (status == SOCKET_ERROR)
+        fprintf(stderr, "select: %d\n", WSAGetLastError());
+    else if (status == 0)
+        fprintf(stderr, "select time-out\n");
+#endif
+    return status;
+}
+
+#ifndef _WIN32
+/* Resolve host:port with getaddrinfo() and return a connected TCP socket, or -1
+ * after reporting the failure on stderr. Not usable on Windows (no getaddrinfo()
+ * in winsock). Adapted from "Beej's Guide to Network Programming" (http://beej.us/guide/bgnet/). */
+static int socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); if (fd != -1) close(fd); if (res) freeaddrinfo(res); return -1; } while (0)
+
+    int on = 1, fd = -1, rc;
+    struct linger lng = { 0, 0 };
+    struct addrinfo hints, *res = 0;
+    memset(&hints, 0, sizeof(struct addrinfo));
+    hints.ai_family = AF_UNSPEC;
+    hints.ai_socktype = SOCK_STREAM;
+    /* getaddrinfo() reports failures through its return value, not errno, so
+     * it is decoded with gai_strerror() instead of going through perror(). */
+    if ((rc = getaddrinfo(host, port, &hints, &res)) != 0) { fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc)); return -1; }
+    if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+    /* The following two setsockopt() are used by ftplib
+     * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they are
+     * necessary. */
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+    if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+    if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); // the macro now also closes fd
+    freeaddrinfo(res);
+    return fd;
+}
+#else
+/* MinGW's printf has problem with "%lld", hence this decimal formatter:
+ * writes x into buf, NUL-terminated, and returns buf. */
+char *int64tostr(char *buf, int64_t x)
+{
+    char *lo = buf, *hi = buf;
+    do { // emit the digits, least significant first
+        *hi++ = '0' + x % 10;
+        x /= 10;
+    } while (x != 0);
+    *hi-- = 0;
+    while (lo < hi) { // reverse them in place
+        char tmp = *lo; *lo++ = *hi; *hi-- = tmp;
+    }
+    return buf;
+}
+
+int64_t strtoint64(const char *buf)
+{
+    int64_t value = 0; // base-10 accumulator; every byte up to the NUL is taken as a digit
+    const char *cur = buf;
+    while (*cur != '\0') value = value * 10 + ((int64_t) *cur++ - 48);
+    return value;
+}
+/* On Windows, Winsock has to be started with WSAStartup() (version 2.2 is requested here) before sockets are used; returns WSAStartup()'s result. */
+int knet_win32_init()
+{
+ WSADATA wsaData;
+ return WSAStartup(MAKEWORD(2, 2), &wsaData);
+}
+void knet_win32_destroy() // counterpart of knet_win32_init()
+{
+ WSACleanup();
+}
+/* Windows variant: returns a connected TCP socket, or -1 after reporting the
+ * failure. A slightly modified version of this function also works on Mac (and
+ * presumably Linux). However, it is not stable on my Mac: it sometimes works
+ * fine but sometimes does not. Therefore for non-Windows OS, I do not use this one. */
+static SOCKET socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) \
+    do { \
+        fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
+        if (fd != INVALID_SOCKET) closesocket(fd); return -1; \
+    } while (0)
+
+    int on = 1;
+    SOCKET fd = INVALID_SOCKET; // lets the error macro tell whether there is a socket to close
+    struct linger lng = { 0, 0 };
+    struct sockaddr_in server;
+    struct hostent *hp = 0;
+    // open socket
+    if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
+    if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+    // get host info: by name if it starts with a letter, otherwise as a dotted address
+    if (isalpha((unsigned char)host[0])) hp = gethostbyname(host);
+    else {
+        struct in_addr addr;
+        addr.s_addr = inet_addr(host);
+        hp = gethostbyaddr((char*)&addr, 4, AF_INET);
+    }
+    if (hp == 0) __err_connect("gethost");
+    // connect
+    server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
+    server.sin_family= AF_INET;
+    server.sin_port = htons(atoi(port));
+    if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
+    // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
+    return fd;
+}
+#endif
+
+static off_t my_netread(int fd, void *buf, off_t len)
+{
+    off_t rest = len, curr, l = 0;
+    /* Returns the number of bytes stored in buf; short on time-out, error or EOF.
+     * recv() and read() may deliver less than requested, hence the loop. */
+    while (rest > 0) {
+        if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
+        curr = netread(fd, (char*)buf + l, rest);
+        /* According to the glibc manual, section 13.2, a zero returned
+         * value indicates end-of-file (EOF). A negative value is an error
+         * and must end the loop too: adding it to l would move the write
+         * position backwards and corrupt the returned byte count. */
+        if (curr <= 0) break;
+        l += curr; rest -= curr;
+    }
+    return l;
+}
+
+/*************************
+ * FTP specific routines *
+ *************************/
+
+static int kftp_get_response(knetFile *ftp)
+{
+#ifndef _WIN32
+    unsigned char c;
+#else
+    char c;
+#endif
+    int n = 0;
+    /* Returns the code of the final reply line, 0 on time-out, -1 on a short reply or when out of memory. */
+    if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
+    while (netread(ftp->ctrl_fd, &c, 1) > 0) { // stops on EOF and on error; FIXME: this is *VERY BAD* for unbuffered I/O
+        if (n >= ftp->max_response) {
+            int cap = ftp->max_response? ftp->max_response<<1 : 256;
+            char *grown = realloc(ftp->response, cap);
+            if (grown == 0) return -1; // the old buffer stays owned by ftp
+            ftp->response = grown; ftp->max_response = cap;
+        }
+        ftp->response[n++] = c;
+        if (c == '\n') {
+            if (n >= 4 && isdigit((unsigned char)ftp->response[0]) && isdigit((unsigned char)ftp->response[1]) && isdigit((unsigned char)ftp->response[2])
+                && ftp->response[3] != '-') break; // three digits not followed by '-': last line of the reply
+            n = 0; // any other line: more lines follow, start over
+        }
+    }
+    if (n < 2) return -1;
+    ftp->response[n-2] = 0; // cut the last two bytes (the "\r\n" of a complete line)
+    return strtol(ftp->response, 0, 0);
+}
+
+static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
+{
+    int writable = socket_wait(ftp->ctrl_fd, 0) > 0; // is the control socket ready for writing?
+    if (writable) netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); // the result of the write is not inspected
+    return !writable? -1 : is_get? kftp_get_response(ftp) : 0; // reply code only when is_get is set
+}
+
+static int kftp_pasv_prep(knetFile *ftp)
+{
+    char *p;
+    int v[6];
+    /* Sends PASV and stores the address from the "(h1,h2,h3,h4,p1,p2)" reply in ->pasv_ip and ->pasv_port; 0 on success, -1 otherwise. */
+    if (kftp_send_cmd(ftp, "PASV\r\n", 1) <= 0 || ftp->response == 0) return -1; // failed exchange: ->response is stale or absent
+    for (p = ftp->response; *p && *p != '('; ++p);
+    if (*p != '(') return -1;
+    if (sscanf(p + 1, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) return -1; // else v[] would be read uninitialised
+    memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
+    ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
+    return 0;
+}
+
+
+static int kftp_pasv_connect(knetFile *ftp)
+{
+    /* Opens the data connection (->fd) to the address stored in ->pasv_ip / ->pasv_port; 0 on success, -1 otherwise. */
+    char host[80], port[10];
+    if (ftp->pasv_port != 0) {
+        sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
+        sprintf(port, "%d", ftp->pasv_port);
+        ftp->fd = socket_connect(host, port);
+        return ftp->fd == -1? -1 : 0;
+    }
+    fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
+    return -1;
+}
+
+int kftp_connect(knetFile *ftp)
+{ // opens the FTP control connection and logs in as "anonymous"; returns -1 only when the connection itself fails, else 0
+ ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
+ if (ftp->ctrl_fd == -1) return -1;
+ kftp_get_response(ftp); // read what the server sends before any command (presumably its greeting)
+ kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
+ kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
+ kftp_send_cmd(ftp, "TYPE I\r\n", 1); // binary ("image") transfer type
+ return 0; // NOTE(review): the reply codes of the three commands are not checked
+}
+
+int kftp_reconnect(knetFile *ftp)
+{ // drops both the control and the data connection, then logs in again; returns kftp_connect()'s result
+ if (ftp->ctrl_fd != -1) {
+ netclose(ftp->ctrl_fd);
+ ftp->ctrl_fd = -1;
+ }
+ netclose(ftp->fd); // NOTE(review): unlike ctrl_fd, closed without testing for -1 first
+ ftp->fd = -1;
+ return kftp_connect(ftp);
+}
+
+// parse "ftp://host/path" into a new handle: sets ->type, ->host, ->port, ->retr and ->size_cmd; returns 0 if fn is not such a URL
+knetFile *kftp_parse_url(const char *fn, const char *mode)
+{
+    knetFile *fp;
+    char *p;
+    int l;
+    if (strstr(fn, "ftp://") != fn) return 0;
+    for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+    if (*p != '/') return 0; // a path is mandatory
+    l = p - fn - 6; // length of the host part
+    fp = calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0; // out of memory
+    fp->type = KNF_TYPE_FTP;
+    /* Both descriptors start out as "not open". calloc() leaves ctrl_fd at 0, which
+     * knet_close() would close as if it were the control socket. */
+    fp->fd = fp->ctrl_fd = -1;
+    fp->port = strdup("21"); // numeric: the Windows socket_connect() does not recognize a port like "ftp"
+    fp->host = calloc(l + 1, 1);
+    if (strchr(mode, 'c')) fp->no_reconnect = 1;
+    strncpy(fp->host, fn + 6, l); // stays NUL-terminated: the buffer is l+1 zeroed bytes
+    fp->retr = calloc(strlen(p) + 8, 1); // "RETR " + path + "\r\n" + NUL
+    sprintf(fp->retr, "RETR %s\r\n", p);
+    fp->size_cmd = calloc(strlen(p) + 8, 1);
+    sprintf(fp->size_cmd, "SIZE %s\r\n", p);
+    return fp; // ->seek_offset and the remaining fields are zero from calloc()
+}
+// (re)open the FTP data connection so that ->fd delivers the file from ->offset on; returns 0 on success, -1 on failure
+int kftp_connect_file(knetFile *fp)
+{
+    int ret;
+    long long file_size;
+    if (fp->fd != -1) {
+        netclose(fp->fd); fp->fd = -1; // forget the descriptor: the early returns below must not leave a closed fd behind
+        if (fp->no_reconnect) kftp_get_response(fp);
+    }
+    if (kftp_pasv_prep(fp) != 0) return -1; // no usable PASV address: connecting would use a stale one
+    kftp_send_cmd(fp, fp->size_cmd, 1);
+    if (fp->response == 0) return -1; // nothing has ever been received on the control connection
+#ifndef _WIN32
+    if (sscanf(fp->response,"%*d %lld", &file_size) != 1) {
+        fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+        return -1;
+    }
+#else
+    const char *p = fp->response;
+    while (*p && *p != ' ') ++p; // skip the reply code without running past the terminator
+    while (*p && (*p < '0' || *p > '9')) ++p;
+    file_size = strtoint64(p);
+#endif
+    fp->file_size = file_size;
+    if (fp->offset>=0) { // ask the server to restart the transfer at ->offset
+        char tmp[32];
+#ifndef _WIN32
+        sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
+#else
+        strcpy(tmp, "REST ");
+        int64tostr(tmp + 5, fp->offset);
+        strcat(tmp, "\r\n");
+#endif
+        kftp_send_cmd(fp, tmp, 1);
+    }
+    kftp_send_cmd(fp, fp->retr, 0); // the reply to RETR is read only after the data connection is opened
+    kftp_pasv_connect(fp);
+    ret = kftp_get_response(fp);
+    if (ret != 150 || fp->fd == -1) { // transfer not started, or the data connection could not be opened
+        fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
+        if (fp->fd != -1) netclose(fp->fd);
+        fp->fd = -1;
+        return -1;
+    }
+    fp->is_ready = 1;
+    return 0;
+}
+
+
+/**************************
+ * HTTP specific routines *
+ **************************/
+
+knetFile *khttp_parse_url(const char *fn, const char *mode)
+{ // builds a handle for "http://host[:port][/path]", honouring $http_proxy; returns 0 if fn does not start with "http://"; mode is not used
+ knetFile *fp;
+ char *p, *proxy, *q;
+ int l;
+ if (strstr(fn, "http://") != fn) return 0;
+ // set ->http_host (the part between "http://" and the first '/')
+ for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+ l = p - fn - 7;
+ fp = calloc(1, sizeof(knetFile));
+ fp->http_host = calloc(l + 1, 1);
+ strncpy(fp->http_host, fn + 7, l);
+ fp->http_host[l] = 0;
+ for (q = fp->http_host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0; // cut an explicit port off ->http_host; q is left pointing at it
+ // get http_proxy
+ proxy = getenv("http_proxy");
+ // set ->host, ->port and ->path
+ if (proxy == 0) {
+ fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
+ fp->port = strdup(*q? q : "80");
+ fp->path = strdup(*p? p : "/");
+ } else { // connect to the proxy and request the complete URL from it
+ fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+ for (q = fp->host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0;
+ fp->port = strdup(*q? q : "80"); // NOTE(review): anything after the proxy port (e.g. a trailing '/') ends up in ->port
+ fp->path = strdup(fn);
+ }
+ fp->type = KNF_TYPE_HTTP;
+ fp->ctrl_fd = fp->fd = -1;
+ fp->seek_offset = 0;
+ return fp; // NOTE(review): none of the calloc()/strdup() results are checked
+}
+
+int khttp_connect_file(knetFile *fp)
+{
+    /* (Re)opens the HTTP connection positioned at ->offset. Returns 0 and sets ->is_ready, or -1 with ->fd == -1. */
+    enum { BUFSZ = 0x10000 }; // FIXME: I am lazy... But in principle, 64KB should be large enough.
+    int ret, l;
+    char *buf;
+    if (fp->fd != -1) netclose(fp->fd);
+    fp->fd = socket_connect(fp->host, fp->port);
+    if (fp->fd == -1) return -1; // socket_connect() has already reported why
+    if ((buf = calloc(BUFSZ, 1)) == 0) goto fail;
+    l = snprintf(buf, BUFSZ, "GET %s HTTP/1.0\r\nHost: %s\r\nRange: bytes=%lld-\r\n\r\n", fp->path, fp->http_host, (long long)fp->offset);
+    if (l < 0 || l >= BUFSZ) goto fail; // the request does not fit into buf
+    netwrite(fp->fd, buf, l);
+    l = 0;
+    while (l < BUFSZ - 1 && netread(fp->fd, buf + l, 1) > 0) { // read HTTP header, bounded by buf; FIXME: bad efficiency
+        if (buf[l] == '\n' && l >= 3)
+            if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+        ++l;
+    }
+    buf[l] = 0;
+    if (l < 14) goto fail; // prematured header
+    ret = strtol(buf + 8, 0, 0); // HTTP return code
+    if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
+        off_t rest = fp->offset;
+        while (rest) {
+            off_t got = my_netread(fp->fd, buf, rest < BUFSZ? rest : BUFSZ);
+            if (got <= 0) goto fail; // the stream ended before the requested offset
+            rest -= got;
+        }
+    } else if (ret != 206 && ret != 200) {
+        fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
+        goto fail;
+    }
+    free(buf);
+    fp->is_ready = 1;
+    return 0;
+fail:
+    free(buf); // buf is NULL here when calloc() itself failed, which free() accepts
+    netclose(fp->fd); fp->fd = -1;
+    return -1;
+}
+
+/********************
+ * Generic routines *
+ ********************/
+
+knetFile *knet_open(const char *fn, const char *mode)
+{ // opens fn, an "ftp://" URL, an "http://" URL or a local path, for reading; returns 0 on failure
+ knetFile *fp = 0;
+ if (mode[0] != 'r') {
+ fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
+ return 0;
+ }
+ if (strstr(fn, "ftp://") == fn) {
+ fp = kftp_parse_url(fn, mode);
+ if (fp == 0) return 0;
+ if (kftp_connect(fp) == -1) {
+ knet_close(fp);
+ return 0;
+ }
+ kftp_connect_file(fp); // a failure shows up as ->fd == -1 and is handled at the end
+ } else if (strstr(fn, "http://") == fn) {
+ fp = khttp_parse_url(fn, mode);
+ if (fp == 0) return 0;
+ khttp_connect_file(fp); // likewise
+ } else { // local file
+#ifdef _WIN32
+ /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
+ * be undefined on some systems, although it is defined on my
+ * Mac and the Linux I have tested on. */
+ int fd = open(fn, O_RDONLY | O_BINARY);
+#else
+ int fd = open(fn, O_RDONLY);
+#endif
+ if (fd == -1) {
+ perror("open");
+ return 0;
+ }
+ fp = (knetFile*)calloc(1, sizeof(knetFile));
+ fp->type = KNF_TYPE_LOCAL;
+ fp->fd = fd;
+ fp->ctrl_fd = -1; // so that knet_close() does not take 0 for an open control socket
+ }
+ if (fp && fp->fd == -1) { // the remote file could not be opened
+ knet_close(fp);
+ return 0;
+ }
+ return fp;
+}
+
+knetFile *knet_dopen(int fd, const char *mode)
+{ // wraps an already open local descriptor; mode is not used; returns 0 when out of memory
+    knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0;
+    fp->type = KNF_TYPE_LOCAL; fp->fd = fd;
+    fp->ctrl_fd = -1; // calloc() leaves 0 here, which knet_close() would close as if it were a control socket
+    return fp; }
+
+off_t knet_read(knetFile *fp, void *buf, off_t len)
+{ // reads up to len bytes into buf and advances ->offset; returns the byte count (0 if ->fd is -1), or -1 on a local read error
+ off_t l = 0;
+ if (fp->fd == -1) return 0;
+ if (fp->type == KNF_TYPE_FTP) {
+ if (fp->is_ready == 0) { // a seek is pending (or nothing is open yet): reopen the transfer at ->offset
+ if (!fp->no_reconnect) kftp_reconnect(fp);
+ kftp_connect_file(fp);
+ }
+ } else if (fp->type == KNF_TYPE_HTTP) {
+ if (fp->is_ready == 0)
+ khttp_connect_file(fp);
+ }
+ if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
+ off_t rest = len, curr;
+ while (rest) {
+ do {
+ curr = read(fp->fd, buf + l, rest);
+ } while (curr < 0 && EINTR == errno); // retry reads interrupted by a signal
+ if (curr < 0) return -1; // NOTE(review): bytes already read in this call are not added to ->offset
+ if (curr == 0) break;
+ l += curr; rest -= curr;
+ }
+ } else l = my_netread(fp->fd, buf, len);
+ fp->offset += l;
+ return l;
+}
+
+off_t knet_seek(knetFile *fp, int64_t off, int whence)
+{
+    /* Returns 0 on success and -1 on failure; note that this is not
+     * the resulting offset. For FTP and HTTP nothing is sent here:
+     * ->offset is updated and ->is_ready cleared, and knet_read()
+     * reopens the stream at the new position. */
+    if (whence == SEEK_SET && off == fp->offset) return 0; // already there
+    switch (fp->type) {
+    case KNF_TYPE_LOCAL: {
+        /* lseek() yields the offset after seeking (or -1), unlike
+         * fseek(), which yields zero on success. */
+        off_t pos = lseek(fp->fd, off, whence);
+        // Be silent on failure: it is OK for knet_seek to fail when
+        // the file is streamed.
+        if (pos == -1) return -1;
+        fp->offset = pos;
+        return 0;
+    }
+    case KNF_TYPE_FTP:
+        switch (whence) {
+        case SEEK_CUR: fp->offset += off; break;
+        case SEEK_SET: fp->offset = off; break;
+        case SEEK_END: fp->offset = fp->file_size+off; break;
+        default: break; // unknown whence: the offset is left as it is
+        }
+        fp->is_ready = 0;
+        return 0;
+    case KNF_TYPE_HTTP:
+        if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
+            fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
+            errno = ESPIPE;
+            return -1;
+        }
+        if (whence == SEEK_SET) fp->offset = off;
+        else if (whence == SEEK_CUR) fp->offset += off;
+        fp->is_ready = 0;
+        return 0;
+    default:
+        break;
+    }
+    errno = EINVAL; // unknown handle type
+    fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
+    return -1;
+}
+
+int knet_close(knetFile *fp)
+{
+    if (fp == 0) return 0; // closing nothing is not an error; the function always returns 0
+    if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
+    if (fp->fd != -1) {
+        /* On Linux/Mac, netclose() is an alias of close(), but on
+         * Windows, it is an alias of closesocket(). */
+        if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
+        else netclose(fp->fd);
+    }
+    free(fp->host); free(fp->port);
+    free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific; size_cmd used to be leaked
+    free(fp->path); free(fp->http_host); // HTTP specific
+    free(fp);
+    return 0;
+}
+
+#ifdef KNETFILE_MAIN
+int main(void)
+{ // manual test driver, built only with KNETFILE_MAIN; the scenario is chosen by editing "type" below
+ char *buf;
+ knetFile *fp;
+ int type = 4, l;
+#ifdef _WIN32
+ knet_win32_init();
+#endif
+ buf = calloc(0x100000, 1);
+ if (type == 0) { // local file
+ fp = knet_open("knetfile.c", "r");
+ knet_seek(fp, 1000, SEEK_SET);
+ } else if (type == 1) { // NCBI FTP, large file
+ fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
+ knet_seek(fp, 2500000000ll, SEEK_SET);
+ l = knet_read(fp, buf, 255);
+ } else if (type == 2) {
+ fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
+ knet_seek(fp, 1000, SEEK_SET);
+ } else if (type == 3) {
+ fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
+ knet_seek(fp, 1000, SEEK_SET);
+ } else if (type == 4) {
+ fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
+ knet_read(fp, buf, 10000);
+ knet_seek(fp, 20000, SEEK_SET);
+ knet_seek(fp, 10000, SEEK_SET);
+ l = knet_read(fp, buf+10000, 10000000) + 10000; // NOTE(review): asks for up to 10000000 bytes although buf holds 0x100000
+ }
+ if (type != 4 && type != 1) { // text cases: print the first 255 bytes read
+ knet_read(fp, buf, 255);
+ buf[255] = 0;
+ printf("%s\n", buf);
+ } else write(fileno(stdout), buf, l);
+ knet_close(fp); // NOTE(review): fp is never checked for NULL after knet_open()
+ free(buf);
+ return 0;
+}
+#endif
diff --git a/samtools-0.1.19/knetfile.h b/samtools-0.1.19/knetfile.h
new file mode 100644
index 0000000..0a0e66f
--- /dev/null
+++ b/samtools-0.1.19/knetfile.h
@@ -0,0 +1,75 @@
+#ifndef KNETFILE_H
+#define KNETFILE_H
+
+#include <stdint.h>
+#include <fcntl.h>
+
+#ifndef _WIN32
+#define netread(fd, ptr, len) read(fd, ptr, len)
+#define netwrite(fd, ptr, len) write(fd, ptr, len)
+#define netclose(fd) close(fd)
+#else
+#include <winsock2.h>
+#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
+#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
+#define netclose(fd) closesocket(fd)
+#endif
+
+// FIXME: currently I/O is unbuffered
+
+#define KNF_TYPE_LOCAL 1
+#define KNF_TYPE_FTP 2
+#define KNF_TYPE_HTTP 3
+
+typedef struct knetFile_s {
+ int type, fd; // type: one of KNF_TYPE_*; fd: descriptor the data is read from, -1 when not open
+ int64_t offset; // current position; see knet_tell()
+ char *host, *port; // server to connect to (FTP and HTTP)
+
+ // the following are mainly for FTP; ctrl_fd is also tested by knet_close() for every type, and is_ready is used by HTTP too
+ int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
+ char *response, *retr, *size_cmd; // last control reply, and the prepared RETR and SIZE commands
+ int64_t seek_offset; // for lazy seek
+ int64_t file_size; // size reported by the FTP SIZE command
+
+ // the following are for HTTP only
+ char *path, *http_host; // requested path (the full URL behind a proxy) and the value of the Host: header
+} knetFile;
+
+#define knet_tell(fp) ((fp)->offset)
+#define knet_fileno(fp) ((fp)->fd)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+ int knet_win32_init();
+ void knet_win32_destroy();
+#endif
+
+ knetFile *knet_open(const char *fn, const char *mode);
+
+ /*
+ This only works with local files.
+ */
+ knetFile *knet_dopen(int fd, const char *mode);
+
+ /*
+ If ->is_ready==0, this routine updates ->fd; otherwise, it simply
+ reads from ->fd.
+ */
+ off_t knet_read(knetFile *fp, void *buf, off_t len);
+
+ /*
+ This routine only sets ->offset and ->is_ready=0. It does not
+ communicate with the FTP server.
+ */
+ off_t knet_seek(knetFile *fp, int64_t off, int whence);
+ int knet_close(knetFile *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/kprobaln.c b/samtools-0.1.19/kprobaln.c
new file mode 100644
index 0000000..04e526a
--- /dev/null
+++ b/samtools-0.1.19/kprobaln.c
@@ -0,0 +1,280 @@
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008-2010, by Heng Li <lh3lh3 at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "kprobaln.h"
+
+/*****************************************
+ * Probabilistic banded glocal alignment *
+ *****************************************/
+
+#define EI .25
+#define EM .33333333333
+
+static float g_qual2prob[256];
+
+#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
+
+kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
+kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
+
+/*
+ The topology of the profile HMM:
+
+ /\ /\ /\ /\
+ I[1] I[k-1] I[k] I[L]
+ ^ \ \ ^ \ ^ \ \ ^
+ | \ \ | \ | \ \ |
+ M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
+ \ \/ \/ \/ /
+ \ /\ /\ /\ /
+ -> D[k-1] -> D[k] ->
+
+ M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
+
+ On input, _ref is the reference sequence and _query is the query
+ sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
+ ambiguous residue. iqual is the base quality. c sets the gap open
+ probability, gap extension probability and band width.
+
+ On output, state and q are arrays of length l_query. The higher 30
+ bits give the reference position the query base is matched to and the
+ lower two bits can be 0 (an alignment match) or 1 (an
+ insertion). q[i] gives the phred scaled posterior probability of
+ state[i] being wrong.
+ */
+int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
+ const kpa_par_t *c, int *state, uint8_t *q)
+{ // returns Pr, the rounded -4.343*ln() (i.e. -10*log10) of the forward likelihood; state[] and q[] are filled only when both are non-NULL
+ double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
+ float *qual, *_qual;
+ const uint8_t *ref, *query;
+ int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
+
+ if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents segfault
+
+ /*** initialization ***/
+ is_backward = state && q? 1 : 0;
+ ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
+ bw = l_ref > l_query? l_ref : l_query;
+ if (bw > c->bw) bw = c->bw;
+ if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
+ bw2 = bw * 2 + 1;
+ // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
+ f = calloc(l_query+1, sizeof(void*));
+ if (is_backward) b = calloc(l_query+1, sizeof(void*));
+ for (i = 0; i <= l_query; ++i) { // FIXME: this would lead to a segfault for l_query==0 (now rejected by the early return above)
+ f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
+ if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
+ }
+ s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
+ // initialize qual
+ _qual = calloc(l_query, sizeof(float));
+ if (g_qual2prob[0] == 0) // the table is filled on the first call; entry 0 becomes 1.0 afterwards
+ for (i = 0; i < 256; ++i)
+ g_qual2prob[i] = pow(10, -i/10.);
+ for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30]; // quality 30 is assumed when iqual is NULL
+ qual = _qual - 1;
+ // initialize transition probability
+ sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
+ m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
+ m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
+ m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
+ bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
+ /*** forward ***/
+ // f[0]
+ set_u(k, bw, 0, 0);
+ f[0][k] = s[0] = 1.;
+ { // f[1]
+ double *fi = f[1], sum;
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u;
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+ set_u(u, bw, 1, k);
+ fi[u+0] = e * bM; fi[u+1] = EI * bI;
+ sum += fi[u] + fi[u+1];
+ }
+ // rescale
+ s[1] = sum;
+ set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
+ for (k = _beg; k <= _end; ++k) fi[k] /= sum;
+ }
+ // f[2..l_query]
+ for (i = 2; i <= l_query; ++i) {
+ double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
+ int beg = 1, end = l_ref, x, _beg, _end;
+ uint8_t qyi = query[i];
+ x = i - bw; beg = beg > x? beg : x; // band start
+ x = i + bw; end = end < x? end : x; // band end
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u, v11, v01, v10;
+ double e;
+ e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
+ set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
+ fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
+ fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
+ fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
+ sum += fi[u] + fi[u+1] + fi[u+2];
+// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
+ }
+ // rescale
+ s[i] = sum;
+ set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+ for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
+ }
+ { // f[l_query+1]
+ double sum;
+ for (k = 1, sum = 0.; k <= l_ref; ++k) {
+ int u;
+ set_u(u, bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue; // outside the band
+ sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
+ }
+ s[l_query+1] = sum; // the last scaling factor
+ }
+ { // compute likelihood
+ double p = 1., Pr1 = 0.;
+ for (i = 0; i <= l_query + 1; ++i) {
+ p *= s[i];
+ if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; // fold the product into the log sum before it underflows
+ }
+ Pr1 += -4.343 * log(p * l_ref * l_query);
+ Pr = (int)(Pr1 + .499);
+ if (!is_backward) { // skip backward and MAP
+ for (i = 0; i <= l_query; ++i) free(f[i]);
+ free(f); free(s); free(_qual);
+ return Pr;
+ }
+ }
+ /*** backward ***/
+ // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
+ for (k = 1; k <= l_ref; ++k) {
+ int u;
+ double *bi = b[l_query];
+ set_u(u, bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
+ }
+ // b[l_query-1..1]
+ for (i = l_query - 1; i >= 1; --i) {
+ int beg = 1, end = l_ref, x, _beg, _end;
+ double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
+ uint8_t qyi1 = query[i+1];
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = end; k >= beg; --k) {
+ int u, v11, v01, v10;
+ double e;
+ set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
+ e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
+ bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been folded into e.
+ bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
+ bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
+// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
+ }
+ // rescale
+ set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+ for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
+ }
+ { // b[0]
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
+ double sum = 0.;
+ for (k = end; k >= beg; --k) {
+ int u;
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+ set_u(u, bw, 1, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
+ }
+ set_u(k, bw, 0, 0);
+ pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
+ }
+ is_diff = fabs(pb - 1.) > 1e-7? 1 : 0; // is_diff is not read again in this function
+ /*** MAP ***/
+ for (i = 1; i <= l_query; ++i) {
+ double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
+ int beg = 1, end = l_ref, x, max_k = -1;
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = beg; k <= end; ++k) {
+ int u;
+ double z;
+ set_u(u, bw, i, k);
+ z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
+ z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
+ }
+ max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
+ if (state) state[i-1] = max_k;
+ if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; // NOTE(review): k == 100 is stored as 100 while larger values become 99; max == 1 gives log(0) - confirm both are intended
+#ifdef _MAIN
+ fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
+ "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
+#endif
+ }
+ /*** free ***/
+ for (i = 0; i <= l_query; ++i) {
+ free(f[i]); free(b[i]);
+ }
+ free(f); free(b); free(s); free(_qual);
+ return Pr;
+}
+
+#ifdef _MAIN
+#include <unistd.h>
+int main(int argc, char *argv[])
+{ // test driver, built only with _MAIN: prints the kpa_glocal() score of <query> against <ref> on stderr
+ uint8_t conv[256], *iqual, *ref, *query;
+ int c, l_ref, l_query, i, q = 30, b = 10, P;
+ while ((c = getopt(argc, argv, "b:q:")) >= 0) {
+ switch (c) {
+ case 'b': b = atoi(optarg); break; // band width
+ case 'q': q = atoi(optarg); break; // base quality given to every query base
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
+ return 1;
+ }
+ memset(conv, 4, 256); // anything but A/C/G/T maps to 4 (ambiguous)
+ conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
+ conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
+ ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
+ l_ref = strlen((char*)ref); l_query = strlen((char*)query);
+ for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]]; // encode in place, overwriting argv
+ for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
+ iqual = malloc(l_query);
+ memset(iqual, q, l_query);
+ kpa_par_def.bw = b;
+ P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0); // NOTE(review): -b was stored in kpa_par_def but kpa_par_alt is passed - confirm which one is meant
+ fprintf(stderr, "%d\n", P);
+ free(iqual);
+ return 0;
+}
+#endif
diff --git a/samtools-0.1.19/kprobaln.h b/samtools-0.1.19/kprobaln.h
new file mode 100644
index 0000000..0357dcc
--- /dev/null
+++ b/samtools-0.1.19/kprobaln.h
@@ -0,0 +1,49 @@
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3 at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef LH3_KPROBALN_H_
+#define LH3_KPROBALN_H_
+
+#include <stdint.h>
+
+/* Parameter set handed to kpa_glocal(). */
+typedef struct {
+ float d, e; /* presumably the gap open (d) and gap extension (e) probabilities, like cd/ce in misc/HmmGlocal.java -- TODO confirm against kprobaln.c */
+ int bw; /* band width of the alignment */
+} kpa_par_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /* Probabilistic alignment of _query (length l_query) against _ref (length
+ * l_ref). iqual holds one quality value per query base and c the alignment
+ * parameters. state and q are optional outputs and may be NULL. When given,
+ * each receives one entry per query position (presumably l_query entries --
+ * confirm against kprobaln.c):
+ * state[i] = (reference index << 2) | state type in the low two bits,
+ * q[i] = Phred-scaled posterior of that assignment.
+ * The int return value is computed in kprobaln.c. */
+ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
+ const kpa_par_t *c, int *state, uint8_t *q);
+
+#ifdef __cplusplus
+}
+#endif
+
+extern kpa_par_t kpa_par_def, kpa_par_alt;
+
+#endif
diff --git a/samtools-0.1.19/kseq.h b/samtools-0.1.19/kseq.h
new file mode 100644
index 0000000..a5cec7c
--- /dev/null
+++ b/samtools-0.1.19/kseq.h
@@ -0,0 +1,235 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
+
+/* Declares kstream_t, a buffered reader on top of a handle of type type_t.
+ * buf[begin..end) are the bytes read from f but not yet consumed; is_eof is
+ * set once a read returned fewer bytes than the buffer size. */
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ unsigned char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
+
+/* ks_eof: nonzero once the source is exhausted and every buffered byte has been consumed.
+ * ks_rewind: clears the buffer indices and the EOF flag; the underlying handle f is not touched. */
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+/* Defines ks_init() and ks_destroy() for kstream_t.
+ * ks_init(f) allocates a stream with a buffer of __bufsize bytes on top of
+ * handle f and returns it, or NULL if memory cannot be allocated.
+ * ks_destroy(ks) frees the stream and its buffer (NULL is accepted); the
+ * handle f itself is not closed. */
+#define __KS_BASIC(type_t, __bufsize) \
+    static inline kstream_t *ks_init(type_t f) \
+    { \
+        kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+        if (ks == 0) return 0; \
+        ks->f = f; \
+        ks->buf = (unsigned char*)malloc(__bufsize); \
+        if (ks->buf == 0) { /* do not hand out a stream without a buffer */ \
+            free(ks); \
+            return 0; \
+        } \
+        return ks; \
+    } \
+    static inline void ks_destroy(kstream_t *ks) \
+    { \
+        if (ks) { \
+            free(ks->buf); \
+            free(ks); \
+        } \
+    }
+
+/* Defines ks_getc(): returns the next byte of the stream as a non-negative
+ * int, refilling the buffer through __read() when it runs empty, or -1 when
+ * no more data can be obtained (end of file or read error). */
+#define __KS_GETC(__read, __bufsize) \
+    static inline int ks_getc(kstream_t *ks) \
+    { \
+        if (ks->is_eof && ks->begin >= ks->end) return -1; \
+        if (ks->begin >= ks->end) { \
+            ks->begin = 0; \
+            ks->end = __read(ks->f, ks->buf, __bufsize); \
+            if (ks->end < __bufsize) ks->is_eof = 1; \
+            if (ks->end <= 0) { /* 0: end of file; negative: read error, the buffer holds no valid byte */ \
+                ks->end = 0; \
+                return -1; \
+            } \
+        } \
+        return (int)ks->buf[ks->begin++]; \
+    }
+
+/* Growable string shared with kstring.h (hence the KSTRING_T guard):
+ * l is the number of bytes in use, m the allocated size of s. */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+/* Round x up, in place, to the next power of two (x is unchanged if it already
+ * is one). x must be an lvalue; the bit smearing only covers the low 32 bits. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/* Defines ks_getuntil2() and its non-appending wrapper ks_getuntil().
+ *
+ * ks_getuntil2(ks, delimiter, str, dret, append) copies bytes from the stream
+ * into str up to, but not including, the next delimiter and consumes that
+ * delimiter. delimiter is KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE or a literal
+ * character code greater than KS_SEP_MAX. When append is zero str is emptied
+ * first, otherwise the bytes are added after its current content.
+ * If dret is not NULL, *dret receives the delimiter byte that ended the
+ * field, or 0 when the input ended first. With KS_SEP_LINE a trailing '\r'
+ * is dropped (only when the string is longer than one byte).
+ * Returns -1 if the stream was already exhausted on entry, otherwise the
+ * resulting length of str (a size_t narrowed to int). str is always
+ * NUL-terminated on a non-negative return. */
+#define __KS_GETUNTIL(__read, __bufsize) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+ if (dret) *dret = 0; \
+ str->l = append? str->l : 0; \
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
+ for (;;) { \
+ int i; \
+ if (ks->begin >= ks->end) { /* buffer drained: refill or stop */ \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) break; \
+ } else break; \
+ } \
+ if (delimiter == KS_SEP_LINE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == '\n') break; \
+ } else if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* make room for the chunk plus the terminator */ \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; /* skip the delimiter, if one was found */ \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (str->s == 0) { /* nothing was ever stored: hand back an empty string */ \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+/* Instantiates the whole stream layer (kstream_t, ks_init, ks_destroy,
+ * ks_getc, ks_getuntil2, ks_getuntil) for handle type type_t, using
+ * __read(handle, buffer, size) to fetch data and a buffer of __bufsize bytes. */
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+/* Reset a kseq_t parser: forget the pending header character and clear the
+ * stream's buffer indices and EOF flag. The underlying handle is not repositioned here. */
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+/* Defines kseq_init() and kseq_destroy() with linkage SCOPE.
+ * kseq_init(fd) returns a zero-initialised parser reading from fd, or NULL
+ * if memory cannot be allocated.
+ * kseq_destroy(ks) frees the parser, its four strings and its stream (NULL is
+ * accepted); the handle fd itself is not closed. */
+#define __KSEQ_BASIC(SCOPE, type_t) \
+    SCOPE kseq_t *kseq_init(type_t fd) \
+    { \
+        kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+        if (s == 0) return 0; \
+        s->f = ks_init(fd); \
+        if (s->f == 0) { /* no stream: do not return a half-built parser */ \
+            free(s); \
+            return 0; \
+        } \
+        return s; \
+    } \
+    SCOPE void kseq_destroy(kseq_t *ks) \
+    { \
+        if (!ks) return; \
+        free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+        ks_destroy(ks->f); \
+        free(ks); \
+    }
+
+/* Defines kseq_read(): parse the next FASTA or FASTQ record into seq.
+ *
+ * A record starts at a line beginning with '>' or '@'. The name is the text
+ * up to the first whitespace, the comment is the rest of that line. The
+ * following lines are concatenated into seq->seq until a '>', '@' or '+' is
+ * met at the start of a line; after '+' the quality string is read until it
+ * is at least as long as the sequence.
+ *
+ * Return value:
+ >=0 length of the sequence (normal)
+ -1 end-of-file
+ -2 truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
+
+/* Declares kseq_t, the FASTA/FASTQ parser state: the four strings of the
+ * current record, the header character ('>' or '@') already consumed for the
+ * next record (0 if none), and the input stream. */
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+/* KSEQ_INIT2 instantiates the stream layer (with a 16384-byte buffer) and the
+ * parser (kseq_t, kseq_init, kseq_destroy, kseq_read) for handle type type_t
+ * and read function __read; SCOPE is the linkage given to the kseq_*
+ * functions. */
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 16384) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+/* Same as KSEQ_INIT2 with file-local (static) kseq_* functions. */
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+/* Declares the types and the prototypes of the kseq_* functions only, for
+ * translation units that use a parser instantiated elsewhere. */
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/samtools-0.1.19/ksort.h b/samtools-0.1.19/ksort.h
new file mode 100644
index 0000000..aa0bb93
--- /dev/null
+++ b/samtools-0.1.19/ksort.h
@@ -0,0 +1,285 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+/*
+ 2012-12-11 (0.1.4):
+
+ * Defined __ks_insertsort_##name as static to compile with C99.
+
+ 2008-11-16 (0.1.4):
+
+ * Fixed a bug in introsort() that happens in rare cases.
+
+ 2008-11-05 (0.1.3):
+
+ * Fixed a bug in introsort() for complex comparisons.
+
+ * Fixed a bug in mergesort(). The previous version is not stable.
+
+ 2008-09-15 (0.1.2):
+
+ * Accelerated introsort. On my Mac (not on another Linux machine),
+ my implementation is as fast as std::sort on random input.
+
+ * Added combsort and in introsort, switch to combsort if the
+ recursion is too deep.
+
+ 2008-09-13 (0.1.1):
+
+ * Added k-small algorithm
+
+ 2008-09-05 (0.1.0):
+
+ * Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+/* One pending sub-range on the explicit stack of ks_introsort_*():
+ * left/right point to its first and last element, depth is the remaining
+ * partitioning budget saved for it. */
+typedef struct {
+ void *left, *right;
+ int depth;
+} ks_isort_stack_t;
+
+/* Swap two lvalues of type type_t. Expands to a braced block, not an expression. */
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+/*
+ * KSORT_INIT(name, type_t, __sort_lt) instantiates a family of functions
+ * ks_<algorithm>_<name>() working on arrays of type_t ordered by the strict
+ * "less than" predicate __sort_lt(a, b).
+ *
+ * ks_mergesort_<name>(n, array, temp): bottom-up merge sort of array[0..n).
+ * temp is an optional scratch buffer of n elements; when it is NULL one is
+ * malloc'ed and freed internally. On ties the element from the left run is
+ * taken first, so equal elements keep their input order.
+ */
+#define KSORT_INIT(name, type_t, __sort_lt) \
+ void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
+ { \
+ type_t *a2[2], *a, *b; \
+ int curr, shift; \
+ \
+ a2[0] = array; \
+ a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
+ for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { /* each pass merges runs of length 1<<shift from a into b */ \
+ a = a2[curr]; b = a2[1-curr]; \
+ if (shift == 0) { /* first pass: order adjacent pairs */ \
+ type_t *p = b, *i, *eb = a + n; \
+ for (i = a; i < eb; i += 2) { \
+ if (i == eb - 1) *p++ = *i; \
+ else { \
+ if (__sort_lt(*(i+1), *i)) { \
+ *p++ = *(i+1); *p++ = *i; \
+ } else { \
+ *p++ = *i; *p++ = *(i+1); \
+ } \
+ } \
+ } \
+ } else { \
+ size_t i, step = 1ul<<shift; \
+ for (i = 0; i < n; i += step<<1) { \
+ type_t *p, *j, *k, *ea, *eb; \
+ if (n < i + step) { /* only a (partial) left run remains */ \
+ ea = a + n; eb = a; \
+ } else { \
+ ea = a + i + step; \
+ eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+ } \
+ j = a + i; k = a + i + step; p = b + i; \
+ while (j < ea && k < eb) { \
+ if (__sort_lt(*k, *j)) *p++ = *k++; \
+ else *p++ = *j++; \
+ } \
+ while (j < ea) *p++ = *j++; \
+ while (k < eb) *p++ = *k++; \
+ } \
+ } \
+ curr = 1 - curr; \
+ } \
+ if (curr == 1) { /* the sorted data ended up in the scratch buffer: copy it back */ \
+ type_t *p = a2[0], *i = a2[1], *eb = array + n; \
+ for (; p < eb; ++i) *p++ = *i; \
+ } \
+ if (temp == 0) free(a2[1]); \
+ } \
+ /* Sift the element at index i down the binary heap l[0..n), whose largest element (by __sort_lt) sits at the root. */ \
+ void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
+ { \
+     type_t saved = l[i]; \
+     size_t child = i; \
+     for (;;) { \
+         child = (child << 1) + 1; /* left child of the current slot */ \
+         if (child >= n) break; \
+         if (child != n - 1 && __sort_lt(l[child], l[child+1])) ++child; /* pick the larger child */ \
+         if (__sort_lt(l[child], saved)) break; /* both children are smaller: slot found */ \
+         l[i] = l[child]; i = child; \
+     } \
+     l[i] = saved; \
+ } \
+ /* Turn l[0..lsize) into a heap by sifting down every node that has a child, from the last such node back to the root. */ \
+ void ks_heapmake_##name(size_t lsize, type_t l[]) \
+ { \
+     size_t i = lsize >> 1; \
+     while (i-- > 0) \
+         ks_heapadjust_##name(i, lsize, l); \
+ } \
+ /* Sort l[0..lsize) in place by repeatedly moving the heap root to the end. The heap is not built here: */ \
+ /* the caller must have run ks_heapmake_##name() on l first. Arrays of fewer than two elements are left untouched. */ \
+ void ks_heapsort_##name(size_t lsize, type_t l[]) \
+ { \
+     size_t i; \
+     if (lsize < 2) return; /* lsize - 1 would wrap around for an empty array */ \
+     for (i = lsize - 1; i > 0; --i) { \
+         type_t tmp; \
+         tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
+     } \
+ } \
+ /* Insertion sort of the half-open range [s, t). */ \
+ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
+ { \
+     type_t *cur, *pos, held; \
+     for (cur = s + 1; cur < t; ++cur) { \
+         pos = cur; \
+         while (pos > s && __sort_lt(*pos, *(pos-1))) { /* bubble the new element towards the front */ \
+             held = *pos; *pos = *(pos-1); *(pos-1) = held; \
+             --pos; \
+         } \
+     } \
+ } \
+ /* Comb sort of a[0..n): compare-and-swap passes with a gap that shrinks by shrink_factor each round */ \
+ /* (gaps of 9 and 10 are replaced by 11), repeated until a pass makes no swap and the gap is at most 2. */ \
+ /* If the final gap is not 1, an insertion sort pass finishes the job. */ \
+ void ks_combsort_##name(size_t n, type_t a[]) \
+ { \
+ const double shrink_factor = 1.2473309501039786540366528676643; \
+ int do_swap; \
+ size_t gap = n; \
+ type_t tmp, *i, *j; \
+ do { \
+ if (gap > 2) { \
+ gap = (size_t)(gap / shrink_factor); \
+ if (gap == 9 || gap == 10) gap = 11; \
+ } \
+ do_swap = 0; \
+ for (i = a; i < a + n - gap; ++i) { \
+ j = i + gap; \
+ if (__sort_lt(*j, *i)) { \
+ tmp = *i; *i = *j; *j = tmp; \
+ do_swap = 1; \
+ } \
+ } \
+ } while (do_swap || gap > 2); \
+ if (gap != 1) __ks_insertsort_##name(a, a + n); \
+ } \
+ /* Introsort of a[0..n): quicksort driven by an explicit stack, with a pivot chosen among the first, */ \
+ /* middle and last element. A range whose partitioning budget d runs out is handed to comb sort; */ \
+ /* sub-ranges of at most 16 elements are not partitioned further and are finished by one final */ \
+ /* insertion sort over the whole array. */ \
+ void ks_introsort_##name(size_t n, type_t a[]) \
+ { \
+ int d; \
+ ks_isort_stack_t *top, *stack; \
+ type_t rp, swap_tmp; \
+ type_t *s, *t, *i, *j, *k; \
+ \
+ if (n < 1) return; \
+ else if (n == 2) { \
+ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+ return; \
+ } \
+ for (d = 2; 1ul<<d < n; ++d); /* d = smallest value >= 2 with 2^d >= n */ \
+ stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+ top = stack; s = a; t = a + (n-1); d <<= 1; \
+ while (1) { \
+ if (s < t) { \
+ if (--d == 0) { /* budget exhausted: fall back to comb sort for this range */ \
+ ks_combsort_##name(t - s + 1, s); \
+ t = s; \
+ continue; \
+ } \
+ i = s; j = t; k = i + ((j-i)>>1) + 1; \
+ if (__sort_lt(*k, *i)) { \
+ if (__sort_lt(*k, *j)) k = j; \
+ } else k = __sort_lt(*j, *i)? i : j; \
+ rp = *k; \
+ if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } /* park the pivot at the right end */ \
+ for (;;) { \
+ do ++i; while (__sort_lt(*i, rp)); \
+ do --j; while (i <= j && __sort_lt(rp, *j)); \
+ if (j <= i) break; \
+ swap_tmp = *i; *i = *j; *j = swap_tmp; \
+ } \
+ swap_tmp = *i; *i = *t; *t = swap_tmp; /* move the pivot to its final position */ \
+ if (i-s > t-i) { /* push the larger side, continue with the smaller one */ \
+ if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+ s = t-i > 16? i+1 : t; \
+ } else { \
+ if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+ t = i-s > 16? i-1 : s; \
+ } \
+ } else { \
+ if (top == stack) { /* nothing left to partition */ \
+ free(stack); \
+ __ks_insertsort_##name(a, a+n); \
+ return; \
+ } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+ } \
+ } \
+ } \
+ /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+ /* 0 <= kk < n */ \
+ /* Returns the element that would be at index kk if arr[0..n) were sorted. */ \
+ /* The array is partially reordered in the process. */ \
+ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
+ { \
+ type_t *low, *high, *k, *ll, *hh, *mid; \
+ low = arr; high = arr + n - 1; k = arr + kk; \
+ for (;;) { \
+ if (high <= low) return *k; \
+ if (high == low + 1) { \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ return *k; \
+ } \
+ mid = low + (high - low) / 2; \
+ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
+ KSORT_SWAP(type_t, *mid, *(low+1)); \
+ ll = low + 1; hh = high; \
+ for (;;) { /* partition around the pivot stored in *low */ \
+ do ++ll; while (__sort_lt(*ll, *low)); \
+ do --hh; while (__sort_lt(*low, *hh)); \
+ if (hh < ll) break; \
+ KSORT_SWAP(type_t, *ll, *hh); \
+ } \
+ KSORT_SWAP(type_t, *low, *hh); \
+ if (hh <= k) low = ll; /* keep only the side that contains position kk */ \
+ if (hh >= k) high = hh - 1; \
+ } \
+ } \
+ /* Shuffle a[0..n) in place: for i = n down to 2, swap a[i-1] with a[j] for a random j in [0, i) drawn with drand48(). */ \
+ /* The indices are size_t so that the element count is not truncated for very large arrays. */ \
+ void ks_shuffle_##name(size_t n, type_t a[]) \
+ { \
+     size_t i, j; \
+     for (i = n; i > 1; --i) { \
+         type_t tmp; \
+         j = (size_t)(drand48() * i); \
+         tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
+     } \
+ }
+
+/* Convenience wrappers: ks_<algorithm>(name, ...) calls the function
+ * ks_<algorithm>_<name>() generated by KSORT_INIT(name, ...). */
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
+
+/* Ready-made "less than" predicates: operator < and strcmp() order. */
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+/* KSORT_INIT_GENERIC(type_t) uses the type name itself as the function
+ * suffix, so type_t must be a single identifier. KSORT_INIT_STR generates
+ * the ks_*_str() functions for arrays of C strings. */
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
diff --git a/samtools-0.1.19/kstring.c b/samtools-0.1.19/kstring.c
new file mode 100644
index 0000000..b8ff45c
--- /dev/null
+++ b/samtools-0.1.19/kstring.c
@@ -0,0 +1,212 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include "kstring.h"
+
+/*
+ * Append printf-style formatted text to s, enlarging its buffer when needed.
+ * Returns the number of characters appended. If formatting or memory
+ * allocation fails a negative value is returned and s->l is left unchanged.
+ */
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+    va_list ap;
+    int l;
+    va_start(ap, fmt);
+    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
+    va_end(ap);
+    if (l < 0) return l; // error reported by vsnprintf()
+    if ((size_t)l + 1 > s->m - s->l) { // the output was truncated: enlarge the buffer and format again
+        size_t new_m = s->l + l + 2;
+        char *tmp;
+        kroundup32(new_m);
+        tmp = (char*)realloc(s->s, new_m);
+        if (tmp == 0) {
+            if (s->m > s->l) s->s[s->l] = 0; // drop the truncated output written by the first attempt
+            return -1;
+        }
+        s->s = tmp; s->m = new_m;
+        va_start(ap, fmt);
+        l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+        va_end(ap); // every va_start() is matched by exactly one va_end()
+        if (l < 0) return l;
+    }
+    s->l += l;
+    return l;
+}
+
+/*
+ * Tokenizer in the spirit of strtok_r() that does not modify str.
+ * Returns the start of the next token, or NULL when there are no more
+ * tokens; the token ends at aux->p. Passing a non-NULL str starts a new
+ * scan. Passing a non-NULL sep (re)defines the separators: a one-character
+ * sep is matched directly, a longer one through a 256-bit lookup table.
+ * Bytes are handled as unsigned char so that values >= 0x80 neither index
+ * the table out of bounds nor yield a negative aux->sep.
+ */
+char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
+{
+    const unsigned char *p, *start;
+    if (sep) { // set up the table
+        if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
+        aux->finished = 0;
+        if (sep[0] && sep[1]) { // sep[1] is only read when sep is not empty
+            aux->sep = -1;
+            aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+            for (p = (const unsigned char*)sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
+        } else aux->sep = (unsigned char)sep[0];
+    }
+    if (aux->finished) return 0;
+    else if (str) aux->p = str - 1, aux->finished = 0;
+    if (aux->sep < 0) {
+        for (p = start = (const unsigned char*)aux->p + 1; *p; ++p)
+            if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
+    } else {
+        for (p = start = (const unsigned char*)aux->p + 1; *p; ++p)
+            if (*p == aux->sep) break;
+    }
+    aux->p = (const char*)p; // end of token
+    if (*p == 0) aux->finished = 1; // no more tokens
+    return (char*)start;
+}
+
+/*
+ * Split the NUL-terminated string s into fields and return their number.
+ *
+ * With delimiter == 0, fields are separated by runs of whitespace; otherwise
+ * by the given character. Empty fields are skipped in both modes.
+ *
+ * When _offsets is not NULL, s is modified in place (the byte after each
+ * field is set to NUL) and the start offset of every field is stored in
+ * *_offsets, an int array grown with realloc(); *_max is its capacity. Both
+ * are updated on return and the caller frees *_offsets. If the array cannot
+ * be grown, -1 is returned with *_offsets / *_max still describing the
+ * entries stored so far.
+ *
+ * When _offsets is NULL the fields are only counted, s is left untouched and
+ * _max is ignored.
+ */
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+    int i, n, max, last_char, last_start, *offsets, l;
+    n = 0;
+    max = _offsets? *_max : 0;
+    offsets = _offsets? *_offsets : 0;
+    l = strlen(s);
+
+#define __ksplit_aux do { \
+        if (_offsets) { \
+            s[i] = 0; \
+            if (n == max) { \
+                int new_max = max? max<<1 : 2; \
+                int *tmp = (int*)realloc(offsets, sizeof(int) * new_max); \
+                if (tmp == 0) { *_max = max; *_offsets = offsets; return -1; } \
+                offsets = tmp; max = new_max; \
+            } \
+            offsets[n++] = last_start; \
+        } else ++n; \
+    } while (0)
+
+    for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+        if (delimiter == 0) {
+            /* <ctype.h> functions require a value representable as unsigned char */
+            if (isspace((unsigned char)s[i]) || s[i] == 0) {
+                if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
+            } else {
+                if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+            }
+        } else {
+            if (s[i] == delimiter || s[i] == 0) {
+                if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+            } else {
+                if (last_char == delimiter || last_char == 0) last_start = i;
+            }
+        }
+        last_char = s[i];
+    }
+    if (_offsets) { *_max = max; *_offsets = offsets; }
+    return n;
+}
+
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+typedef unsigned char ubyte_t;
+
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+/*
+ * Build the Boyer-Moore shift tables for the pattern pat of length m (m >= 0).
+ * The result is one malloc'ed array of m + 256 ints: the first m entries are
+ * the good-suffix shifts (bmGs), the last 256 the bad-character shifts (bmBc).
+ * The caller frees it. Returns NULL if memory cannot be allocated. For an
+ * empty pattern only the bad-character part exists.
+ */
+static int *ksBM_prep(const ubyte_t *pat, int m)
+{
+    int i, *suff, *prep, *bmGs, *bmBc;
+    prep = (int*)calloc(m + 256, sizeof(int));
+    if (prep == 0) return 0;
+    bmGs = prep; bmBc = prep + m;
+    { // preBmBc()
+        for (i = 0; i < 256; ++i) bmBc[i] = m;
+        for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
+    }
+    if (m < 1) return prep; // empty pattern: suff[m - 1] below would be out of bounds
+    suff = (int*)calloc(m, sizeof(int));
+    if (suff == 0) {
+        free(prep);
+        return 0;
+    }
+    { // suffixes()
+        int f = 0, g;
+        suff[m - 1] = m;
+        g = m - 1;
+        for (i = m - 2; i >= 0; --i) {
+            if (i > g && suff[i + m - 1 - f] < i - g)
+                suff[i] = suff[i + m - 1 - f];
+            else {
+                if (i < g) g = i;
+                f = i;
+                while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
+                suff[i] = f - g;
+            }
+        }
+    }
+    { // preBmGs()
+        int j = 0;
+        for (i = 0; i < m; ++i) bmGs[i] = m;
+        for (i = m - 1; i >= 0; --i)
+            if (suff[i] == i + 1)
+                for (; j < m - 1 - i; ++j)
+                    if (bmGs[j] == m)
+                        bmGs[j] = m - 1 - i;
+        for (i = 0; i <= m - 2; ++i)
+            bmGs[m - 1 - suff[i]] = m - 1 - i;
+    }
+    free(suff);
+    return prep;
+}
+
+/*
+ * Boyer-Moore search for the m-byte pattern _pat in the n-byte buffer _str.
+ * Returns a pointer to the first occurrence, or NULL if there is none.
+ *
+ * _prep lets the caller reuse the preprocessed pattern across calls:
+ *   - _prep == NULL: the tables are built and freed inside this call;
+ *   - *_prep == NULL: the tables are built and stored in *_prep, which the
+ *     caller must free();
+ *   - otherwise *_prep is used as is.
+ */
+void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
+{
+    int i, j, *prep = 0, *bmGs, *bmBc;
+    const ubyte_t *str, *pat, *hit = 0;
+    str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
+    prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
+    if (_prep && *_prep == 0) *_prep = prep;
+    bmGs = prep; bmBc = prep + m;
+    j = 0;
+    while (j <= n - m) {
+        for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
+        if (i < 0) { // the whole pattern matched at offset j
+            hit = str + j;
+            break;
+        } else {
+            int max = bmBc[str[i+j]] - m + 1 + i;
+            if (max < bmGs[i]) max = bmGs[i];
+            j += max;
+        }
+    }
+    if (_prep == 0) free(prep); // private tables are released on every path, match or not
+    return (void*)hit;
+}
+
+/* strstr()-like search built on kmemmem(); _prep has the same meaning as for kmemmem(). */
+char *kstrstr(const char *str, const char *pat, int **_prep)
+{
+    int str_len = strlen(str);
+    int pat_len = strlen(pat);
+    void *hit = kmemmem(str, str_len, pat, pat_len, _prep);
+    return (char*)hit;
+}
+
+/* Like kstrstr(), but the haystack length n is supplied by the caller instead of being measured. */
+char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
+{
+    int pat_len = strlen(pat);
+    void *hit = kmemmem(str, n, pat, pat_len, _prep);
+    return (char*)hit;
+}
+
+/***********************
+ * The main() function *
+ ***********************/
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+/* Self-test driver (KSTRING_MAIN only): exercises ksprintf(), ksplit(), kstrtok() and kstrstr() and prints the results. */
+int main()
+{
+    kstring_t *ks = (kstring_t*)calloc(1, sizeof(kstring_t));
+    ks_tokaux_t tok;
+    int *offs, n_fields, k;
+    char *token;
+    // test ksprintf()
+    ksprintf(ks, " abcdefg: %d ", 100);
+    printf("'%s'\n", ks->s);
+    // test ksplit()
+    offs = ksplit(ks, 0, &n_fields);
+    k = 0;
+    while (k < n_fields) {
+        printf("field[%d] = '%s'\n", k, ks->s + offs[k]);
+        ++k;
+    }
+    // test kstrtok()
+    ks->l = 0;
+    token = kstrtok("ab:cde:fg/hij::k", ":/", &tok);
+    while (token) {
+        kputsn(token, tok.p - token, ks);
+        kputc('\n', ks);
+        token = kstrtok(0, 0, &tok);
+    }
+    printf("%s", ks->s);
+    // free
+    free(ks->s); free(ks); free(offs);
+
+    {
+        static char *text = "abcdefgcdgcagtcakcdcd";
+        static char *needle = "cd";
+        char *hit, *cursor = text;
+        int *prep = 0;
+        for (;;) {
+            hit = kstrstr(cursor, needle, &prep);
+            if (hit == 0) break;
+            printf("match: %s\n", hit);
+            cursor = hit + prep[0]; // skip ahead by the first good-suffix shift
+        }
+        free(prep);
+    }
+    return 0;
+}
+#endif
diff --git a/samtools-0.1.19/kstring.h b/samtools-0.1.19/kstring.h
new file mode 100644
index 0000000..abd8236
--- /dev/null
+++ b/samtools-0.1.19/kstring.h
@@ -0,0 +1,169 @@
+/* The MIT License
+
+ Copyright (c) by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+/* Round x up, in place, to the next power of two (x is unchanged if it already
+ * is one). x must be an lvalue; the bit smearing only covers the low 32 bits. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/* Growable string (also declared in kseq.h, hence the KSTRING_T guard):
+ * l is the number of bytes in use, m the allocated size of s. */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+/* Iteration state of kstrtok(). */
+typedef struct {
+ uint64_t tab[4]; // 256-bit set of separator characters, used when sep < 0
+ int sep, finished; // sep: the single separator, or -1 when tab is in use; finished: set once the end of the string was reached
+ const char *p; // end of the current token
+} ks_tokaux_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /* Append printf-style formatted text to s; returns the number of characters appended. */
+ int ksprintf(kstring_t *s, const char *fmt, ...);
+ /* Split s in place into fields (delimiter == 0: whitespace) and store their
+ * start offsets in the realloc'ed array *_offsets of capacity *_max;
+ * returns the number of fields. */
+ int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+ /* Boyer-Moore substring search. _prep may be NULL; otherwise *_prep caches
+ * the preprocessed pattern between calls (start with *_prep == NULL and
+ * free() it when done). kstrnstr() and kmemmem() take explicit lengths. */
+ char *kstrstr(const char *str, const char *pat, int **_prep);
+ char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
+ void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
+
+ /* kstrtok() is similar to strtok_r() except that str is not
+ * modified and both str and sep can be NULL. For efficiency, it is
+ * actually recommended to set both to NULL in the subsequent calls
+ * if sep is not changed. The returned token is not NUL-terminated:
+ * it ends at aux->p. */
+ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
+
+#ifdef __cplusplus
+}
+#endif
+
+/* Make sure the buffer of s has room for at least size bytes; the new capacity is rounded up with kroundup32(). */
+static inline void ks_resize(kstring_t *s, size_t size)
+{
+    if (size <= s->m) return; /* already large enough */
+    s->m = size;
+    kroundup32(s->m);
+    s->s = (char*)realloc(s->s, s->m);
+}
+
+/* Append the l bytes at p to s and keep s NUL-terminated.
+ * Returns l, or -1 (s unchanged) if the buffer cannot be enlarged. */
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{
+    if (s->l + l + 1 >= s->m) {
+        size_t new_m = s->l + l + 2;
+        char *tmp;
+        kroundup32(new_m);
+        tmp = (char*)realloc(s->s, new_m);
+        if (tmp == 0) return -1; /* the old buffer is still valid and still owned by s */
+        s->s = tmp; s->m = new_m;
+    }
+    memcpy(s->s + s->l, p, l);
+    s->l += l;
+    s->s[s->l] = 0;
+    return l;
+}
+
+/* Append the NUL-terminated string p to s; returns what kputsn() returns. */
+static inline int kputs(const char *p, kstring_t *s)
+{
+    int len = strlen(p);
+    return kputsn(p, len, s);
+}
+
+/* Append the character c to s and keep s NUL-terminated.
+ * Returns c, or -1 (s unchanged) if the buffer cannot be enlarged. */
+static inline int kputc(int c, kstring_t *s)
+{
+    if (s->l + 1 >= s->m) {
+        size_t new_m = s->l + 2;
+        char *tmp;
+        kroundup32(new_m);
+        tmp = (char*)realloc(s->s, new_m);
+        if (tmp == 0) return -1; /* the old buffer is still valid and still owned by s */
+        s->s = tmp; s->m = new_m;
+    }
+    s->s[s->l++] = c;
+    s->s[s->l] = 0;
+    return c;
+}
+
+/* Append the decimal representation of the int c to s.
+ * Returns 0 (or the result of kputc() when c == 0), or -1 with s unchanged
+ * if the buffer cannot be enlarged. */
+static inline int kputw(int c, kstring_t *s)
+{
+    char buf[16];
+    int l, x;
+    if (c == 0) return kputc('0', s);
+    /* digits are produced least significant first; negative values are divided as they are, so INT_MIN is safe */
+    if(c < 0) for (l = 0, x = c; x < 0; x /= 10) buf[l++] = '0' - (x%10);
+    else for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+    if (c < 0) buf[l++] = '-';
+    if (s->l + l + 1 >= s->m) {
+        size_t new_m = s->l + l + 2;
+        char *tmp;
+        kroundup32(new_m);
+        tmp = (char*)realloc(s->s, new_m);
+        if (tmp == 0) return -1; /* the old buffer is still valid and still owned by s */
+        s->s = tmp; s->m = new_m;
+    }
+    for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+    s->s[s->l] = 0;
+    return 0;
+}
+
+/* Append the decimal representation of the unsigned c to s.
+ * Returns 0 (or the result of kputc() when c == 0), or -1 with s unchanged
+ * if the buffer cannot be enlarged. */
+static inline int kputuw(unsigned c, kstring_t *s)
+{
+    char buf[16];
+    int l, i;
+    unsigned x;
+    if (c == 0) return kputc('0', s);
+    for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* least significant digit first */
+    if (s->l + l + 1 >= s->m) {
+        size_t new_m = s->l + l + 2;
+        char *tmp;
+        kroundup32(new_m);
+        tmp = (char*)realloc(s->s, new_m);
+        if (tmp == 0) return -1; /* the old buffer is still valid and still owned by s */
+        s->s = tmp; s->m = new_m;
+    }
+    for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+    s->s[s->l] = 0;
+    return 0;
+}
+
+/* Append the decimal representation of the long c to s.
+ * Returns 0 (or the result of kputc() when c == 0), or -1 with s unchanged
+ * if the buffer cannot be enlarged. */
+static inline int kputl(long c, kstring_t *s)
+{
+    char buf[32];
+    int l;
+    unsigned long x;
+    if (c == 0) return kputc('0', s);
+    /* take the magnitude in unsigned arithmetic: -c overflows for LONG_MIN */
+    x = c < 0? 0ul - (unsigned long)c : (unsigned long)c;
+    for (l = 0; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* least significant digit first */
+    if (c < 0) buf[l++] = '-';
+    if (s->l + l + 1 >= s->m) {
+        size_t new_m = s->l + l + 2;
+        char *tmp;
+        kroundup32(new_m);
+        tmp = (char*)realloc(s->s, new_m);
+        if (tmp == 0) return -1; /* the old buffer is still valid and still owned by s */
+        s->s = tmp; s->m = new_m;
+    }
+    while (l > 0) s->s[s->l++] = buf[--l];
+    s->s[s->l] = 0;
+    return 0;
+}
+
+/* Split s->s in place with ksplit_core() into a fresh offsets array.
+ * Stores the number of fields in *n and returns the array, which the caller frees. */
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+    int capacity = 0;
+    int *offsets = 0;
+    int n_fields = ksplit_core(s->s, delimiter, &capacity, &offsets);
+    *n = n_fields;
+    return offsets;
+}
+
+#endif
diff --git a/samtools-0.1.19/misc/HmmGlocal.java b/samtools-0.1.19/misc/HmmGlocal.java
new file mode 100644
index 0000000..9e93b13
--- /dev/null
+++ b/samtools-0.1.19/misc/HmmGlocal.java
@@ -0,0 +1,178 @@
+import java.io.*;
+import java.lang.*;
+
+/**
+ * Banded forward-backward alignment of a query against a reference under a three-state HMM.
+ * Inside each band cell the states are indexed 0, 1 and 2: state 0 carries the
+ * match/mismatch emission, state 1 a flat .25 emission and state 2 no emission.
+ */
+public class HmmGlocal
+{
+ private double[] qual2prob; // qual2prob[q] = 10^(-q/10) for q in 0..255
+ private double cd, ce; // gap open probability [1e-3], gap extension probability [0.1]
+ private int cb; // band width [7]
+
+ /**
+  * @param d gap open probability
+  * @param e gap extension probability
+  * @param b band width
+  */
+ public HmmGlocal(final double d, final double e, final int b) {
+ cd = d; ce = e; cb = b;
+ qual2prob = new double[256];
+ for (int i = 0; i < 256; ++i)
+ qual2prob[i] = Math.pow(10, -i/10.);
+ }
+ // Column offset of state 0 for reference position k in row i of a band of half-width b (three slots per cell).
+ private static int set_u(final int b, final int i, final int k) {
+ int x = i - b;
+ x = x > 0? x : 0;
+ return (k + 1 - x) * 3;
+ }
+ /**
+  * Runs the forward and backward passes and, for every query base, picks the cell/state
+  * (states 0 and 1 only) with the largest posterior.
+  *
+  * @param _ref   reference bases; values above 3 are treated as ambiguous (emission 1)
+  * @param _query query bases, same coding as _ref
+  * @param _iqual per-base qualities, read as unsigned bytes (0..255)
+  * @param state  if non-null, state[i] receives (k-1)&lt;&lt;2 | s for query base i, where k is the
+  *               1-based reference position and s the chosen state (stays -1 if none had positive posterior)
+  * @param q      if non-null, q[i] receives the rounded value -4.343*ln(1-posterior), stored as 99 when above 100
+  * @return always 0
+  */
+ public int hmm_glocal(final byte[] _ref, final byte[] _query, final byte[] _iqual, int[] state, byte[] q) {
+ int i, k;
+ /*** initialization ***/
+ // change coordinates
+ int l_ref = _ref.length;
+ byte[] ref = new byte[l_ref+1];
+ for (i = 0; i < l_ref; ++i) ref[i+1] = _ref[i]; // FIXME: this is silly...
+ int l_query = _query.length;
+ byte[] query = new byte[l_query+1];
+ double[] qual = new double[l_query+1];
+ for (i = 0; i < l_query; ++i) {
+ query[i+1] = _query[i];
+ // Java bytes are signed: mask so that qualities >= 128 index 128..255 instead of throwing
+ qual[i+1] = qual2prob[_iqual[i] & 0xff];
+ }
+ // set band width
+ int bw2, bw = l_ref > l_query? l_ref : l_query;
+ if (bw > cb) bw = cb;
+ if (bw < Math.abs(l_ref - l_query)) bw = Math.abs(l_ref - l_query);
+ bw2 = bw * 2 + 1;
+ // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
+ double[][] f = new double[l_query+1][bw2*3 + 6];
+ double[][] b = new double[l_query+1][bw2*3 + 6];
+ double[] s = new double[l_query+2];
+ // initialize transition probabilities
+ double sM, sI, bM, bI;
+ sM = sI = 1. / (2 * l_query + 2);
+ bM = (1 - cd) / l_query; bI = cd / l_query; // (bM+bI)*l_query==1
+ double[] m = new double[9]; // 3x3 transition matrix, m[from*3+to]
+ m[0*3+0] = (1 - cd - cd) * (1 - sM); m[0*3+1] = m[0*3+2] = cd * (1 - sM);
+ m[1*3+0] = (1 - ce) * (1 - sI); m[1*3+1] = ce * (1 - sI); m[1*3+2] = 0.;
+ m[2*3+0] = 1 - ce; m[2*3+1] = 0.; m[2*3+2] = ce;
+ /*** forward ***/
+ // f[0]
+ f[0][set_u(bw, 0, 0)] = s[0] = 1.;
+ { // f[1]
+ double[] fi = f[1];
+ double sum;
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u;
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
+ u = set_u(bw, 1, k);
+ fi[u+0] = e * bM; fi[u+1] = .25 * bI;
+ sum += fi[u] + fi[u+1];
+ }
+ // rescale
+ s[1] = sum;
+ _beg = set_u(bw, 1, beg); _end = set_u(bw, 1, end); _end += 2;
+ for (k = _beg; k <= _end; ++k) fi[k] /= sum;
+ }
+ // f[2..l_query]
+ for (i = 2; i <= l_query; ++i) {
+ double[] fi = f[i], fi1 = f[i-1];
+ double sum, qli = qual[i];
+ int beg = 1, end = l_ref, x, _beg, _end;
+ byte qyi = query[i];
+ x = i - bw; beg = beg > x? beg : x; // band start
+ x = i + bw; end = end < x? end : x; // band end
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u, v11, v01, v10;
+ double e;
+ e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli / 3.;
+ u = set_u(bw, i, k); v11 = set_u(bw, i-1, k-1); v10 = set_u(bw, i-1, k); v01 = set_u(bw, i, k-1);
+ fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
+ fi[u+1] = .25 * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
+ fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
+ sum += fi[u] + fi[u+1] + fi[u+2];
+ //System.out.println("("+i+","+k+";"+u+"): "+fi[u]+","+fi[u+1]+","+fi[u+2]);
+ }
+ // rescale
+ s[i] = sum;
+ _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
+ for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
+ }
+ { // f[l_query+1]
+ double sum;
+ for (k = 1, sum = 0.; k <= l_ref; ++k) {
+ int u = set_u(bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue; // outside the band of the last row
+ sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
+ }
+ s[l_query+1] = sum; // the last scaling factor
+ }
+ /*** backward ***/
+ // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
+ for (k = 1; k <= l_ref; ++k) {
+ int u = set_u(bw, l_query, k);
+ double[] bi = b[l_query];
+ if (u < 3 || u >= bw2*3+3) continue;
+ bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
+ }
+ // b[l_query-1..1]
+ for (i = l_query - 1; i >= 1; --i) {
+ int beg = 1, end = l_ref, x, _beg, _end;
+ double[] bi = b[i], bi1 = b[i+1];
+ double y = (i > 1)? 1. : 0., qli1 = qual[i+1];
+ byte qyi1 = query[i+1];
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = end; k >= beg; --k) {
+ int u, v11, v01, v10;
+ double e;
+ u = set_u(bw, i, k); v11 = set_u(bw, i+1, k+1); v10 = set_u(bw, i+1, k); v01 = set_u(bw, i, k+1);
+ e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 / 3.) * bi1[v11];
+ bi[u+0] = e * m[0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been folded into e.
+ bi[u+1] = e * m[3] + .25 * m[4] * bi1[v10+1];
+ bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
+ }
+ // rescale
+ _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
+ for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
+ }
+ double pb;
+ { // b[0]
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
+ double sum = 0.;
+ for (k = end; k >= beg; --k) {
+ int u = set_u(bw, 1, k);
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
+ if (u < 3 || u >= bw2*3+3) continue;
+ sum += e * b[1][u+0] * bM + .25 * b[1][u+1] * bI;
+ }
+ pb = b[0][set_u(bw, 0, 0)] = sum / s[0]; // if everything works as is expected, pb == 1.0
+ }
+ int is_diff = Math.abs(pb - 1.) > 1e-7? 1 : 0; // consistency indicator; computed but not used further
+ /*** MAP ***/
+ for (i = 1; i <= l_query; ++i) {
+ double sum = 0., max = 0.;
+ double[] fi = f[i], bi = b[i];
+ int beg = 1, end = l_ref, x, max_k = -1;
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = beg; k <= end; ++k) {
+ int u = set_u(bw, i, k);
+ double z;
+ sum += (z = fi[u+0] * bi[u+0]); if (z > max) { max = z; max_k = (k-1)<<2 | 0; }
+ sum += (z = fi[u+1] * bi[u+1]); if (z > max) { max = z; max_k = (k-1)<<2 | 1; }
+ }
+ max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
+ if (state != null) state[i-1] = max_k;
+ if (q != null) {
+ k = (int)(-4.343 * Math.log(1. - max) + .499);
+ q[i-1] = (byte)(k > 100? 99 : k);
+ }
+ //System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
+ }
+ return 0;
+ }
+
+ // Small smoke run on a fixed 5-base reference and 4-base query; results are discarded.
+ public static void main(String[] args) {
+ byte[] ref = {'\0', '\1', '\3', '\3', '\1'};
+ byte[] query = {'\0', '\3', '\3', '\1'};
+ byte[] qual = new byte[4];
+ qual[0] = qual[1] = qual[2] = qual[3] = (byte)20;
+ HmmGlocal hg = new HmmGlocal(1e-3, 0.1, 7);
+ hg.hmm_glocal(ref, query, qual, null, null);
+ }
+}
\ No newline at end of file
diff --git a/samtools-0.1.19/misc/Makefile b/samtools-0.1.19/misc/Makefile
new file mode 100644
index 0000000..d36e7ac
--- /dev/null
+++ b/samtools-0.1.19/misc/Makefile
@@ -0,0 +1,69 @@
+CC= gcc
+CXX= g++
+CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
+CXXFLAGS= $(CFLAGS)
+DFLAGS= -D_FILE_OFFSET_BITS=64
+OBJS=
+PROG= md5sum-lite md5fa maq2sam-short maq2sam-long ace2sam wgsim bamcheck
+INCLUDES= -I..
+SUBDIRS= .
+
+.SUFFIXES:.c .o
+
+.c.o: # generic suffix rule: compile any .c into .o with CFLAGS, DFLAGS and INCLUDES
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG) # build every program listed in PROG
+
+lib-recur all-recur clean-recur cleanlocal-recur install-recur: # strip "-recur" from the goal name and run that target in each SUBDIRS entry
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+lib: # no prerequisites and no recipe in this directory
+
+bamcheck:bamcheck.o # links against libbam, searched for in the parent directory (-L..)
+ $(CC) $(CFLAGS) -o $@ bamcheck.o -L.. -lm -lbam -lpthread -lz
+
+bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h
+ $(CC) $(CFLAGS) -c -I.. -o $@ bamcheck.c
+
+ace2sam:ace2sam.o
+ $(CC) $(CFLAGS) -o $@ ace2sam.o -lz
+
+wgsim:wgsim.o
+ $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz
+
+md5fa:md5.o md5fa.o md5.h ../kseq.h # md5.o has no explicit rule here, so it comes from the .c.o suffix rule
+ $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
+
+md5sum-lite:md5sum-lite.o
+ $(CC) $(CFLAGS) -o $@ md5sum-lite.o
+
+md5sum-lite.o:md5.c md5.h # md5.c compiled a second time, with -DMD5SUM_MAIN, into a separate object
+ $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c
+
+maq2sam-short:maq2sam.c
+ $(CC) $(CFLAGS) -o $@ maq2sam.c -lz
+
+maq2sam-long:maq2sam.c # same source as maq2sam-short, built with -DMAQ_LONGREADS
+ $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz
+
+md5fa.o:md5.h md5fa.c
+ $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c
+
+wgsim.o:wgsim.c ../kseq.h
+ $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c
+
+ace2sam.o:ace2sam.c ../kstring.h ../kseq.h
+ $(CC) $(CFLAGS) -c -I.. -o $@ ace2sam.c
+
+cleanlocal: # remove objects, built programs and editor/profiler leftovers from this directory
+ rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a
+
+clean:cleanlocal-recur # runs cleanlocal through the *-recur rule
diff --git a/samtools-0.1.19/misc/ace2sam.c b/samtools-0.1.19/misc/ace2sam.c
new file mode 100644
index 0000000..325133d
--- /dev/null
+++ b/samtools-0.1.19/misc/ace2sam.c
@@ -0,0 +1,249 @@
+/* The MIT License
+
+ Copyright (c) 2011 Heng Li <lh3 at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+#include "kstring.h"
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define N_TMPSTR 5
+#define LINE_LEN 60
+
+// append one packed CIGAR value _v to the array _c: when the count _n has reached the capacity _m, the capacity is doubled (or set to 4 if it was 0) and _c is realloc()ed; the realloc() result is not checked and the arguments are expanded unparenthesized
+#define write_cigar(_c, _n, _m, _v) do { \
+ if (_n == _m) { \
+ _m = _m? _m<<1 : 4; \
+ _c = realloc(_c, _m * sizeof(unsigned)); \
+ } \
+ _c[_n++] = (_v); \
+ } while (0)
+
+// a fatal error
+static void fatal(const char *msg)
+{
+ fprintf(stderr, "E %s\n", msg);
+ exit(1);
+}
+// replace the content of dst with a copy of src from which every '*' pad character has been removed
+static void remove_pads(const kstring_t *src, kstring_t *dst)
+{
+    int rd, wr = 0;
+    dst->l = 0;
+    kputsn(src->s, src->l, dst);
+    // compact in place: wr trails rd and only advances past non-pad characters
+    for (rd = 0; rd < dst->l; ++rd) {
+        const char ch = dst->s[rd];
+        if (ch == '*') continue;
+        dst->s[wr++] = ch;
+    }
+    dst->s[wr] = 0;
+    dst->l = wr;
+}
+
+// Convert an ACE assembly file to SAM: SAM records go to stdout, while the header and the
+// contig sequences go to stderr behind "H " / "S " prefixes. Options: -p padded output, -c also
+// write the consensus as a dummy read. Returns 1 after printing usage, 0 otherwise.
+int main(int argc, char *argv[])
+{
+ gzFile fp;
+ kstream_t *ks;
+ kstring_t s, t[N_TMPSTR];
+ int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
+ long m_cigar = 0, n_cigar = 0;
+ unsigned *af, *cigar = 0;
+
+ while ((c = getopt(argc, argv, "pc")) >= 0) {
+ switch (c) {
+ case 'p': is_padded = 1; break;
+ case 'c': write_cns = 1; break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n");
+ fprintf(stderr, "Options: -p output padded SAM\n");
+ fprintf(stderr, " -c write the contig sequence in SAM\n\n");
+ fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
+ fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n");
+ fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
+ fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n");
+ return 1;
+ }
+
+ s.l = s.m = 0; s.s = 0;
+ af_n = af_max = af_i = 0; af = 0;
+ for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
+ // the input name is the first non-option argument (argv[optind]), not argv[1], which may be an option
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) fatal("fail to open the input file");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+ if (strcmp(s.s, "CO") == 0) { // contig sequence
+ kstring_t *cns;
+ t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
+ af_n = af_i = 0; // reset the af array
+ ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
+ ks_getuntil(ks, '\n', &s, &dret); // read the whole line
+ while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
+ remove_pads(&t[1], &t[2]); // construct the unpadded sequence
+ // compute the array for mapping padded positions to unpadded positions
+ p2u = realloc(p2u, t[1].m * sizeof(int));
+ for (i = k = 0; i < t[1].l; ++i) {
+ p2u[i] = k;
+ if (t[1].s[i] != '*') ++k;
+ }
+ // write out the SAM header and contig sequences
+ fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
+ cns = &t[is_padded?1:2];
+ fprintf(stderr, "S >%s\n", t[0].s);
+ for (i = 0; i < cns->l; i += LINE_LEN) {
+ fputs("S ", stderr);
+ for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
+ fputc(cns->s[i + k], stderr);
+ fputc('\n', stderr);
+ }
+
+#define __padded2cigar(sp) do { \
+ int i, l_M = 0, l_D = 0; \
+ for (i = 0; i < sp.l; ++i) { \
+ if (sp.s[i] == '*') { \
+ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
+ ++l_D; l_M = 0; \
+ } else { \
+ if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
+ ++l_M; l_D = 0; \
+ } \
+ } \
+ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
+ else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
+ } while (0)
+
+ if (write_cns) { // write the consensus SAM line (dummy read)
+ n_cigar = 0;
+ if (is_padded) __padded2cigar(t[1]);
+ else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
+ kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
+ for (i = 0; i < n_cigar; ++i) {
+ kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
+ }
+ kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
+ }
+ } else if (strcmp(s.s, "BQ") == 0) { // contig quality
+ if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'");
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line
+ if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*"
+ for (i = 0; i < t[2].l; ++i) { // read the consensus quality
+ int q;
+ // stop on truncated input: carrying on could never complete this loop once the stream is exhausted
+ if (ks_getuntil(ks, 0, &s, &dret) < 0) fatal("truncated contig quality");
+ if (s.l) {
+ q = atoi(s.s) + 33;
+ if (q > 126) q = 126;
+ if (write_cns) kputc(q, &t[4]);
+ } else --i;
+ }
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
+ if (write_cns) puts(t[4].s);
+ t[4].l = 0; // reset unconditionally, as before
+ } else if (strcmp(s.s, "AF") == 0) { // padded read position
+ int reversed, neg, pos;
+ if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
+ if (write_cns) {
+ if (t[4].l) puts(t[4].s);
+ t[4].l = 0;
+ }
+ ks_getuntil(ks, 0, &s, &dret); // read name
+ ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand
+ ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position
+ if (af_n == af_max) { // double the af array
+ af_max = af_max? af_max<<1 : 4;
+ af = realloc(af, af_max * sizeof(unsigned));
+ }
+ af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information
+ } else if (strcmp(s.s, "RD") == 0) { // read sequence
+ if (af_i >= af_n) fatal("more 'RD' records than 'AF'");
+ t[2].l = t[3].l = t[4].l = 0;
+ ks_getuntil(ks, 0, &t[4], &dret); // QNAME
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line
+ while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence
+ } else if (strcmp(s.s, "QA") == 0) { // clipping
+ if (af_i >= af_n) fatal("more 'QA' records than 'AF'");
+ int beg, end, pos, op;
+ ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
+ ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
+ ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ // compute 1-based POS
+ pos = af[af_i]>>2; // retrieve the position information
+ if (af[af_i]>>1&1) pos = -pos;
+ pos += beg; // now pos is the true padded position
+ // generate CIGAR
+ remove_pads(&t[2], &t[3]); // backup the unpadded read sequence
+ n_cigar = 0;
+ if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
+ if (is_padded) {
+ __padded2cigar(t[2]);
+ if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR
+ if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR
+ } else {
+ // generate flattened CIGAR string
+ for (i = beg, k = pos - 1; i < end; ++i, ++k)
+ t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 2 : 6);
+ // generate the proper CIGAR
+ for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) {
+ if (op != t[2].s[i]) {
+ write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
+ op = t[2].s[i]; k = 1;
+ } else ++k;
+ }
+ write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
+ // remove unnecessary "P" and possibly merge adjacent operations
+ for (i = 2; i < n_cigar; ++i) {
+ if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) {
+ cigar[i-1] = 0;
+ if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations
+ cigar[i] += cigar[i-2], cigar[i-2] = 0;
+ }
+ }
+ for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations
+ if (cigar[i]) cigar[k++] = cigar[i];
+ n_cigar = k;
+ }
+ if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4);
+ // write the SAM line for the read
+ kputc('\t', &t[4]); // QNAME has already been written
+ kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG
+ kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME
+ kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS
+ kputs("\t60\t", &t[4]); // MAPQ
+ for (i = 0; i < n_cigar; ++i) { // CIGAR
+ kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
+ }
+ kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN
+ kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ
+ kputs("\t*", &t[4]); // QUAL
+ puts(t[4].s); // print to stdout
+ ++af_i;
+ } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(af); free(s.s); free(cigar); free(p2u);
+ for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);
+ return 0;
+}
diff --git a/samtools-0.1.19/misc/bamcheck.c b/samtools-0.1.19/misc/bamcheck.c
new file mode 100644
index 0000000..352db21
--- /dev/null
+++ b/samtools-0.1.19/misc/bamcheck.c
@@ -0,0 +1,1521 @@
+/*
+ Author: petr.danecek at sanger
+ gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
+
+ Assumptions, approximations and other issues:
+ - GC-depth graph does not split reads, the starting position determines which bin is incremented.
+ There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
+ - coverage distribution ignores softclips and deletions
+ - some stats require sorted BAMs
+ - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
+ - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
+ - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
+ counts, no splicing is done, no indels or soft clips are considered, even small overlap is
+ good enough to include the read in the stats.
+
+*/
+
+#define BAMCHECK_VERSION "2012-09-04"
+
+#define _ISOC99_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+#include <getopt.h>
+#include <errno.h>
+#include <assert.h>
+#include "faidx.h"
+#include "khash.h"
+#include "sam.h"
+#include "sam_header.h"
+#include "razf.h"
+
+#define BWA_MIN_RDLEN 35
+#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
+#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
+#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
+#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
+#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
+#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
+#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
+
+typedef struct // per-sequence index record, the value type of the kh_faidx hash
+{
+ int32_t line_len, line_blen; // read_ref_seq() skips line_len file bytes for every line_blen bases -- presumably bytes per line vs. bases per line
+ int64_t len; // sequence length; read_ref_seq() rejects positions >= len
+ uint64_t offset; // file offset of the sequence start, as passed to razf_seek()
+}
+faidx1_t;
+KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) // string-keyed hash: sequence name -> faidx1_t
+KHASH_MAP_INIT_STR(kh_bam_tid, int)
+KHASH_MAP_INIT_STR(kh_rg, const char *)
+struct __faidx_t { // NOTE(review): presumably must match the private definition behind faidx_t in faidx.c -- confirm
+ RAZF *rz; // reader used by read_ref_seq() via razf_seek()/razf_read()
+ int n, m;
+ char **name;
+ khash_t(kh_faidx) *hash; // looked up by chromosome name in read_ref_seq()
+};
+
+typedef struct // one GC-depth bin
+{
+ float gc;
+ uint32_t depth;
+}
+gc_depth_t;
+
+// For coverage distribution, a simple pileup
+typedef struct
+{
+ int64_t pos; // reference coordinate that corresponds to buffer[start]
+ int size, start; // number of slots in buffer, and the index of the slot holding pos
+ int *buffer; // per-position counters, incremented by round_buffer_insert_read()
+}
+round_buffer_t;
+
+typedef struct { uint32_t from, to; } pos_t;
+typedef struct
+{
+ int npos,mpos,cpos;
+ pos_t *pos;
+}
+regions_t;
+
+typedef struct // parameters, histograms, counters and working buffers shared by the functions of this file
+{
+ // Parameters
+ int trim_qual; // bwa trim quality
+
+ // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
+ // insert size histogram holder
+ int nquals; // The number of quality bins
+ int nbases; // The maximum sequence length the allocated array can hold
+ int nisize; // The maximum insert size that the allocated array can hold
+ int ngc; // The size of gc_1st and gc_2nd
+ int nindels; // The maximum indel length for indel distribution
+
+ // Arrays for the histogram data
+ uint64_t *quals_1st, *quals_2nd;
+ uint64_t *gc_1st, *gc_2nd;
+ uint64_t *isize_inward, *isize_outward, *isize_other;
+ uint64_t *acgt_cycles;
+ uint64_t *read_lengths;
+ uint64_t *insertions, *deletions; // indexed by indel length-1 in count_indels()
+ uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; // indexed by read cycle; count_indels() requires index < nbases
+
+ // The extremes encountered
+ int max_len; // Maximum read length
+ int max_qual; // Maximum quality
+ float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
+ int is_sorted;
+
+ // Summary numbers
+ uint64_t total_len;
+ uint64_t total_len_dup;
+ uint64_t nreads_1st;
+ uint64_t nreads_2nd;
+ uint64_t nreads_filtered;
+ uint64_t nreads_dup;
+ uint64_t nreads_unmapped;
+ uint64_t nreads_unpaired;
+ uint64_t nreads_paired;
+ uint64_t nreads_anomalous;
+ uint64_t nreads_mq0;
+ uint64_t nbases_mapped;
+ uint64_t nbases_mapped_cigar;
+ uint64_t nbases_trimmed; // bwa trimmed bases
+ uint64_t nmismatches;
+ uint64_t nreads_QCfailed, nreads_secondary;
+
+ // GC-depth related data
+ uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
+ gc_depth_t *gcd; // The GC-depth bins holder
+ int gcd_bin_size; // The size of GC-depth bin
+ uint32_t gcd_ref_size; // The approximate size of the genome
+ int32_t tid, gcd_pos; // Position of the current bin
+ int32_t pos; // Position of the last read
+
+ // Coverage distribution related data
+ int ncov; // The number of coverage bins
+ uint64_t *cov; // The coverage frequencies
+ int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
+ round_buffer_t cov_rbuf; // Pileup round buffer
+
+ // Mismatches by read cycle
+ uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against; read_ref_seq() stores A=1, C=2, G=4, T=8 and 0 for anything else
+ int mrseq_buf; // The size of the buffer
+ int32_t rseq_pos; // The coordinate of the first base in the buffer
+ int32_t nrseq_buf; // The used part of the buffer
+ uint64_t *mpc_buf; // Mismatches per cycle; indexed cycle*nquals + (quality+1), with column 0 counting N read bases
+
+ // Filters
+ int filter_readlen;
+
+ // Target regions
+ int nregions, reg_from,reg_to;
+ regions_t *regions;
+
+ // Auxiliary data
+ int flag_require, flag_filter;
+ double sum_qual; // For calculating average quality value
+ samfile_t *sam;
+ khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
+ faidx_t *fai; // Reference sequence for GC-depth graph
+ int argc; // Command line arguments to be printed on the output
+ char **argv;
+}
+stats_t;
+
+void error(const char *format, ...);
+void bam_init_header_hash(bam_header_t *header);
+int is_in_regions(bam1_t *bam_line, stats_t *stats);
+
+
+// Coverage distribution methods
+// Map a depth to its coverage-histogram bin: bin 0 collects depth < min, bin n-1 collects
+// depth > max, everything else lands in 1 + (depth - min) / step.
+// Declared static: a plain C99/C11 `inline` definition provides no external definition, so any
+// call the compiler chooses not to inline would fail to link.
+static inline int coverage_idx(int min, int max, int n, int step, int depth)
+{
+    if ( depth < min )
+        return 0;
+
+    if ( depth > max )
+        return n-1;
+
+    return 1 + (depth - min) / step;
+}
+
+// Translate reference coordinate pos into a slot index of a circular buffer of `size` slots in
+// which slot `offset` holds coordinate refpos.
+// Declared static: a plain C99/C11 `inline` definition provides no external definition, so any
+// call the compiler chooses not to inline would fail to link.
+static inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
+{
+    return (offset + (pos-refpos) % size) % size;
+}
+
+// Advance the pileup window to reference coordinate pos (-1 requests a final flush): the
+// non-zero counters of the slots from cov_rbuf.start up to the slot of pos-1 are added to the
+// coverage histogram and cleared, then the window start is moved.
+void round_buffer_flush(stats_t *stats, int64_t pos)
+{
+    int ibuf,idp;
+
+    if ( pos==stats->cov_rbuf.pos )
+        return;
+
+    int64_t new_pos = pos;
+    if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
+    {
+        // Flush the whole buffer, but in sequential order,
+        pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
+    }
+
+    // int64_t is not `long` on every platform, hence the explicit long long / %lld
+    if ( pos < stats->cov_rbuf.pos )
+        error("Expected coordinates in ascending order, got %lld after %lld\n", (long long)pos,(long long)stats->cov_rbuf.pos);
+
+    int ifrom = stats->cov_rbuf.start;
+    int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
+    if ( ifrom>ito )
+    {
+        // the range wraps around: handle the tail of the array first, then restart at slot 0
+        for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
+        {
+            if ( !stats->cov_rbuf.buffer[ibuf] )
+                continue;
+            idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+            stats->cov[idp]++;
+            stats->cov_rbuf.buffer[ibuf] = 0;
+        }
+        ifrom = 0;
+    }
+    for (ibuf=ifrom; ibuf<=ito; ibuf++)
+    {
+        if ( !stats->cov_rbuf.buffer[ibuf] )
+            continue;
+        idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+        stats->cov[idp]++;
+        stats->cov_rbuf.buffer[ibuf] = 0;
+    }
+    stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
+    stats->cov_rbuf.pos = new_pos;
+}
+
+// Add one to the counter of every buffered position in the inclusive range from..to.
+// Reports an error when the range does not fit in the buffer or starts before rbuf->pos.
+void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
+{
+    // the int64_t arguments are passed as long long so that they match their conversion specifiers
+    if ( to-from >= rbuf->size )
+        error("The read length too big (%lld), please increase the buffer length (currently %d)\n", (long long)(to-from+1),rbuf->size);
+    if ( from < rbuf->pos )
+        error("The reads are not sorted (%lld comes after %lld).\n", (long long)from,(long long)rbuf->pos);
+
+    int ifrom,ito,ibuf;
+    ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
+    ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
+    if ( ifrom>ito )
+    {
+        // the range wraps around the end of the array
+        for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
+            rbuf->buffer[ibuf]++;
+        ifrom = 0;
+    }
+    for (ibuf=ifrom; ibuf<=ito; ibuf++)
+        rbuf->buffer[ibuf]++;
+}
+
+// Number of bases BWA-style quality trimming would remove from a read of length len. The
+// qualities are walked from the end of quals (from its start when reverse is set) while the
+// running sum of (trim_qual - quality) stays non-negative; the offset at which that sum peaked
+// is returned. Reads shorter than BWA_MIN_RDLEN are never trimmed.
+int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
+{
+    int best_len = 0, best_score = 0, score = 0;
+    int ntrim, limit;
+
+    if ( len<BWA_MIN_RDLEN ) return 0;
+
+    // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
+    // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
+    limit = len - BWA_MIN_RDLEN + 1;
+
+    for (ntrim=0; ntrim<limit; ntrim++)
+    {
+        const int qidx = reverse ? ntrim : len-1-ntrim;
+        score += trim_qual - quals[qidx];
+        if ( score<0 ) return best_len;
+        if ( score<=best_score ) continue;
+        best_score = score;
+        // Storing ntrim+1 would be the correct way, but bwa clips one base less for some reason
+        best_len = ntrim;
+    }
+    return best_len;
+}
+
+
+// Update the indel statistics from the CIGAR of one alignment: the per-cycle insertion and
+// deletion counters (first-read and other-read arrays are kept apart) and the indel length
+// histograms. For reverse-strand reads the cycle is counted from the other end of the read.
+void count_indels(stats_t *stats,bam1_t *bam_line)
+{
+    int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
+    int is_1st = IS_READ1(bam_line) ? 1 : 0;
+    int icig;
+    int icycle = 0;
+    int read_len = bam_line->core.l_qseq;
+    for (icig=0; icig<bam_line->core.n_cigar; icig++)
+    {
+        // Conversion from uint32_t to MIDNSHP
+        //  0123456
+        //  MIDNSHP
+        int cig  = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
+        int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
+
+        if ( cig==1 )
+        {
+            int idx = is_fwd ? icycle : read_len-icycle-ncig;
+            if ( idx<0 )
+                error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
+            if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
+            if ( is_1st )
+                stats->ins_cycles_1st[idx]++;
+            else
+                stats->ins_cycles_2nd[idx]++;
+            icycle += ncig;
+            // ncig>0: a zero-length operation would otherwise index insertions[-1]
+            if ( ncig>0 && ncig<=stats->nindels )
+                stats->insertions[ncig-1]++;
+            continue;
+        }
+        if ( cig==2 )
+        {
+            int idx = is_fwd ? icycle-1 : read_len-icycle-1;
+            if ( idx<0 ) continue;  // discard meaningless deletions
+            if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
+            if ( is_1st )
+                stats->del_cycles_1st[idx]++;
+            else
+                stats->del_cycles_2nd[idx]++;
+            // ncig>0: a zero-length operation would otherwise index deletions[-1]
+            if ( ncig>0 && ncig<=stats->nindels )
+                stats->deletions[ncig-1]++;
+            continue;
+        }
+        // every operation other than N (3) and H (5) advances the read cycle
+        if ( cig!=3 && cig!=5 )
+            icycle += ncig;
+    }
+}
+
+void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) // walk the CIGAR of one alignment and count, per read cycle, N read bases and read/reference mismatches into stats->mpc_buf
+{
+ int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
+ int icig,iread=0,icycle=0; // iread: index into the stored read sequence; icycle: cycle counter that also advances over hard clips
+ int iref = bam_line->core.pos - stats->rseq_pos; // index into rseq_buf of the alignment start
+ int read_len = bam_line->core.l_qseq;
+ uint8_t *read = bam1_seq(bam_line);
+ uint8_t *quals = bam1_qual(bam_line);
+ uint64_t *mpc_buf = stats->mpc_buf;
+ for (icig=0; icig<bam_line->core.n_cigar; icig++)
+ {
+ // Conversion from uint32_t to MIDNSHP
+ // 0123456
+ // MIDNSHP
+ int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
+ int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
+ if ( cig==1 ) // insertion: consumes read bases only
+ {
+ iread += ncig;
+ icycle += ncig;
+ continue;
+ }
+ if ( cig==2 ) // deletion: consumes reference bases only
+ {
+ iref += ncig;
+ continue;
+ }
+ if ( cig==4 )
+ {
+ icycle += ncig;
+ // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
+ // iref += ncig;
+ iread += ncig;
+ continue;
+ }
+ if ( cig==5 ) // hard clip: advances the cycle only
+ {
+ icycle += ncig;
+ continue;
+ }
+ // Ignore H and N CIGARs. The latter are inserted e.g. by TopHat and often require very large
+ // chunk of refseq in memory. Not very frequent and not noticeable in the stats.
+ if ( cig==3 || cig==5 ) continue; // cig==5 was already handled above, so only N (3) reaches this; NOTE(review): iref is not advanced over N -- confirm this is intended
+ if ( cig!=0 )
+ error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
+
+ if ( ncig+iref > stats->nrseq_buf )
+ error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
+
+ int im;
+ for (im=0; im<ncig; im++)
+ {
+ uint8_t cread = bam1_seqi(read,iread);
+ uint8_t cref = stats->rseq_buf[iref];
+
+ // ---------------15
+ // =ACMGRSVTWYHKDBN
+ if ( cread==15 ) // read base is N: counted in column 0 of this cycle's row
+ {
+ int idx = is_fwd ? icycle : read_len-icycle-1;
+ if ( idx>stats->max_len )
+ error("mpc: %d>%d\n",idx,stats->max_len);
+ idx = idx*stats->nquals;
+ if ( idx>=stats->nquals*stats->nbases )
+ error("FIXME: mpc_buf overflow\n");
+ mpc_buf[idx]++;
+ }
+ else if ( cref && cread && cref!=cread ) // both codes non-zero and different: a mismatch
+ {
+ uint8_t qual = quals[iread] + 1; // +1 because column 0 is used for N bases
+ if ( qual>=stats->nquals )
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
+
+ int idx = is_fwd ? icycle : read_len-icycle-1;
+ if ( idx>stats->max_len )
+ error("mpc: %d>%d\n",idx,stats->max_len);
+
+ idx = idx*stats->nquals + qual;
+ if ( idx>=stats->nquals*stats->nbases )
+ error("FIXME: mpc_buf overflow\n");
+ mpc_buf[idx]++;
+ }
+
+ iref++;
+ iread++;
+ icycle++;
+ }
+ }
+}
+
+// Load up to stats->mrseq_buf reference bases of sequence tid, starting at 0-based position pos,
+// into stats->rseq_buf. Bases are stored as A=1, C=2, G=4, T=8 and 0 for anything else; the part
+// of the buffer beyond the end of the sequence is zero-filled. Records the loaded window in
+// stats->nrseq_buf, stats->rseq_pos and stats->tid.
+void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
+{
+    khash_t(kh_faidx) *h;
+    khiter_t iter;
+    faidx1_t val;
+    char *chr, c;
+    faidx_t *fai = stats->fai;
+
+    h = fai->hash;
+    chr = stats->sam->header->target_name[tid];
+
+    // ID of the sequence name
+    iter = kh_get(kh_faidx, h, chr);
+    if (iter == kh_end(h))
+        error("No such reference sequence [%s]?\n", chr);
+    val = kh_value(h, iter);
+
+    // Check the boundaries (val.len is an int64_t, so it is printed as long long)
+    if (pos >= val.len)
+        error("Was the bam file mapped with the reference sequence supplied?"
+              " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %lld).\n", chr,pos,(long long)val.len);
+    int size = stats->mrseq_buf;
+    // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
+    if (size+pos > val.len) size = val.len-pos;
+
+    // Position the razf reader
+    razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
+
+    uint8_t *ptr = stats->rseq_buf;
+    int nread = 0;
+    while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
+    {
+        // <ctype.h> functions require an unsigned char value; a negative plain char is undefined behaviour
+        if ( !isgraph((unsigned char)c) )
+            continue;
+
+        // Conversion between uint8_t coding and ACGT
+        //      -12-4---8-------
+        //      =ACMGRSVTWYHKDBN
+        if ( c=='A' || c=='a' )
+            *ptr = 1;
+        else if ( c=='C' || c=='c' )
+            *ptr = 2;
+        else if ( c=='G' || c=='g' )
+            *ptr = 4;
+        else if ( c=='T' || c=='t' )
+            *ptr = 8;
+        else
+            *ptr = 0;
+        ptr++;
+        nread++;
+    }
+    if ( nread < stats->mrseq_buf )
+    {
+        memset(ptr,0, stats->mrseq_buf - nread);
+        nread = stats->mrseq_buf;
+    }
+    stats->nrseq_buf = nread;
+    stats->rseq_pos = pos;
+    stats->tid = tid;
+}
+
+// Return the GC fraction of the buffered reference window [pos, pos+len).
+//
+// Only bytes coded as A (1), C (2), G (4) or T (8) are counted; the result is
+// (#C + #G) / (#A + #C + #G + #T), or 0 when the window contains none of them.
+// The window must lie inside the currently loaded reference buffer (asserted).
+float fai_gc_content(stats_t *stats, int pos, int len)
+{
+    int first = pos - stats->rseq_pos;
+    int last  = first + len;
+    assert( first>=0 && last<=stats->nrseq_buf );
+
+    uint32_t n_gc = 0, n_acgt = 0;
+    int k;
+    for (k=first; k<last; k++)
+    {
+        uint32_t code = stats->rseq_buf[k];
+        switch (code)
+        {
+            case 2:     // C
+            case 4:     // G
+                n_gc++;
+                n_acgt++;
+                break;
+            case 1:     // A
+            case 8:     // T
+                n_acgt++;
+                break;
+            default:    // anything else is ignored
+                break;
+        }
+    }
+    if ( !n_acgt )
+        return 0;
+    return (float)n_gc/n_acgt;
+}
+
+// Make sure the reference sequence buffer holds at least
+// max(10*stats->nbases, stats->gcd_bin_size) bytes. The buffer only ever grows.
+// Exits via error() if the allocation fails.
+void realloc_rseq_buffer(stats_t *stats)
+{
+    int n = stats->nbases*10;
+    if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
+    if ( stats->mrseq_buf<n )
+    {
+        // Keep the old pointer until the reallocation is known to have succeeded
+        void *tmp = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
+        if ( !tmp )
+            error("Could not realloc the reference sequence buffer (%d bytes)\n", n);
+        stats->rseq_buf = tmp;
+        stats->mrseq_buf = n;
+    }
+}
+
+// Grow the GC-depth bin array so that it can hold the number of bins derived
+// from the reference size, the bin size and the read length `seq_len`, then make
+// sure the reference sequence buffer is large enough as well.
+//
+// Exits via error() when the read does not fit into one bin, when the computed
+// number of bins does not exceed the bins already in use, or when the
+// allocation fails.
+void realloc_gcd_buffer(stats_t *stats, int seq_len)
+{
+    if ( stats->gcd_bin_size <= seq_len )
+        error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
+
+    // Number of bins needed to cover the reference
+    int nwanted = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
+    if ( stats->igcd >= nwanted )
+        error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
+
+    if ( stats->ngcd < nwanted )
+    {
+        stats->gcd = realloc(stats->gcd, nwanted*sizeof(gc_depth_t));
+        if ( !stats->gcd )
+            error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,nwanted);
+        // Newly added bins start out empty
+        memset(stats->gcd + stats->ngcd, 0, (nwanted-stats->ngcd)*sizeof(gc_depth_t));
+        stats->ngcd = nwanted;
+    }
+
+    realloc_rseq_buffer(stats);
+}
+
+// Grow a buffer of uint64_t-sized counters from old_n to new_n elements and zero
+// the newly added tail. `mult` is a prefix printed in front of the byte count in
+// the failure message. Exits via error() if the allocation fails.
+static void *grow_counter_buffer(void *buf, size_t old_n, size_t new_n, int seq_len, const char *mult)
+{
+    size_t nbytes = new_n*sizeof(uint64_t);
+    void *tmp = realloc(buf, nbytes);
+    if ( !tmp )
+        error("Could not realloc buffers, the sequence too long: %d (%s%zu)\n", seq_len, mult, nbytes);
+    memset((char*)tmp + old_n*sizeof(uint64_t), 0, (new_n-old_n)*sizeof(uint64_t));
+    return tmp;
+}
+
+// Enlarge all per-cycle buffers so that reads of length `seq_len` fit.
+//
+// The new number of cycles is 2*(1 + seq_len - nbases) + nbases. Every counter
+// array is extended and its new part zeroed, stats->nbases is updated, the
+// coverage ring buffer is re-created (unrolled so that it starts at index 0)
+// and finally the reference sequence buffer is resized. Exits via error() when
+// an allocation fails.
+void realloc_buffers(stats_t *stats, int seq_len)
+{
+    int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
+    size_t old_nb = stats->nbases, new_nb = n, nq = stats->nquals;
+
+    // Cycle x quality matrices
+    stats->quals_1st = grow_counter_buffer(stats->quals_1st, old_nb*nq, new_nb*nq, seq_len, "");
+    stats->quals_2nd = grow_counter_buffer(stats->quals_2nd, old_nb*nq, new_nb*nq, seq_len, "2x");
+    if ( stats->mpc_buf )
+        stats->mpc_buf = grow_counter_buffer(stats->mpc_buf, old_nb*nq, new_nb*nq, seq_len, "");
+
+    // Four counters (A,C,G,T) per cycle
+    stats->acgt_cycles = grow_counter_buffer(stats->acgt_cycles, old_nb*4, new_nb*4, seq_len, "");
+
+    // One counter per length
+    stats->read_lengths = grow_counter_buffer(stats->read_lengths, old_nb, new_nb, seq_len, "");
+    stats->insertions   = grow_counter_buffer(stats->insertions,   old_nb, new_nb, seq_len, "");
+    stats->deletions    = grow_counter_buffer(stats->deletions,    old_nb, new_nb, seq_len, "");
+
+    // Indels per cycle: these arrays carry one extra element
+    stats->ins_cycles_1st = grow_counter_buffer(stats->ins_cycles_1st, old_nb+1, new_nb+1, seq_len, "");
+    stats->ins_cycles_2nd = grow_counter_buffer(stats->ins_cycles_2nd, old_nb+1, new_nb+1, seq_len, "");
+    stats->del_cycles_1st = grow_counter_buffer(stats->del_cycles_1st, old_nb+1, new_nb+1, seq_len, "");
+    stats->del_cycles_2nd = grow_counter_buffer(stats->del_cycles_2nd, old_nb+1, new_nb+1, seq_len, "");
+
+    stats->nbases = n;
+
+    // Realloc the coverage distribution buffer. Never shrink it, so that the
+    // existing content always fits into the new buffer.
+    int new_size = seq_len*5;
+    if ( new_size < stats->cov_rbuf.size ) new_size = stats->cov_rbuf.size;
+    int *rbuffer = calloc(new_size, sizeof(int));
+    if ( !rbuffer )
+        error("Could not allocate the coverage distribution buffer, the sequence too long: %d\n", seq_len);
+    if ( stats->cov_rbuf.buffer )
+    {
+        // Unroll the ring: first the part from `start` to the end, then the
+        // wrapped-around part from the beginning up to `start`. memcpy counts
+        // bytes, hence the multiplication by the element size.
+        int ntail = stats->cov_rbuf.size-stats->cov_rbuf.start;
+        memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,ntail*sizeof(int));
+        if ( stats->cov_rbuf.start>0 )
+            memcpy(rbuffer+ntail,stats->cov_rbuf.buffer,stats->cov_rbuf.start*sizeof(int));
+    }
+    stats->cov_rbuf.start = 0;
+    free(stats->cov_rbuf.buffer);
+    stats->cov_rbuf.buffer = rbuffer;
+    stats->cov_rbuf.size = new_size;
+
+    realloc_rseq_buffer(stats);
+}
+
+// Accumulate the statistics of one alignment record `bam_line` into `stats`.
+//
+// The record is skipped when it fails the read-group, required-flag,
+// filtering-flag, target-region or read-length filters, or when it has no
+// sequence. Otherwise the function updates, in this order: read-length
+// histogram, per-cycle ACGT counts, GC histogram, per-cycle quality histogram,
+// mapping/pairing counters and insert-size histograms, mismatch count (NM tag),
+// bases mapped according to the cigar, and - while the input is still
+// considered sorted - the GC-depth bins and the coverage ring buffer.
+// Per-cycle buffers are enlarged on demand via realloc_buffers().
+void collect_stats(bam1_t *bam_line, stats_t *stats)
+{
+ // When a read-group filter is active, keep only reads whose RG tag is in the hash
+ if ( stats->rg_hash )
+ {
+ const uint8_t *rg = bam_aux_get(bam_line, "RG");
+ if ( !rg ) return;
+ khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
+ if ( k == kh_end(stats->rg_hash) ) return;
+ }
+ // All bits of flag_require must be set in the read's flag
+ if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ // None of the bits of flag_filter may be set
+ if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ if ( !is_in_regions(bam_line,stats) )
+ return;
+ // filter_readlen==-1 means that no read-length filter is applied
+ if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
+ return;
+
+ if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
+ if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
+
+ int seq_len = bam_line->core.l_qseq;
+ if ( !seq_len ) return;
+
+ // Grow the per-cycle buffers so that index seq_len is valid
+ if ( seq_len >= stats->nbases )
+ realloc_buffers(stats,seq_len);
+ if ( stats->max_len<seq_len )
+ stats->max_len = seq_len;
+
+ stats->read_lengths[seq_len]++;
+
+ // Count GC and ACGT per cycle
+ uint8_t base, *seq = bam1_seq(bam_line);
+ int gc_count = 0;
+ int i;
+ int reverse = IS_REVERSE(bam_line);
+ for (i=0; i<seq_len; i++)
+ {
+ // Conversion from uint8_t coding to ACGT
+ // -12-4---8-------
+ // =ACMGRSVTWYHKDBN
+ // 01 2 3
+ base = bam1_seqi(seq,i);
+ base /= 2;
+ if ( base==1 || base==2 ) gc_count++;
+ else if ( base>2 ) base=3;
+ // For reverse-strand reads the cycle index is counted from the other end
+ if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
+ error("FIXME: acgt_cycles\n");
+ stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
+ }
+ // Range of GC-histogram bins [gc_idx_min, gc_idx_max) this read contributes to
+ int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
+ int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
+ if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
+
+ // Determine which array (1st or 2nd read) will these stats go to,
+ // trim low quality bases from end the same way BWA does,
+ // fill GC histogram
+ uint64_t *quals;
+ uint8_t *bam_quals = bam1_qual(bam_line);
+ if ( bam_line->core.flag&BAM_FREAD2 )
+ {
+ quals = stats->quals_2nd;
+ stats->nreads_2nd++;
+ for (i=gc_idx_min; i<gc_idx_max; i++)
+ stats->gc_2nd[i]++;
+ }
+ else
+ {
+ quals = stats->quals_1st;
+ stats->nreads_1st++;
+ for (i=gc_idx_min; i<gc_idx_max; i++)
+ stats->gc_1st[i]++;
+ }
+ if ( stats->trim_qual>0 )
+ stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
+
+ // Quality histogram and average quality
+ for (i=0; i<seq_len; i++)
+ {
+ uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
+ if ( qual>=stats->nquals )
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
+ if ( qual>stats->max_qual )
+ stats->max_qual = qual;
+
+ quals[ i*stats->nquals+qual ]++;
+ stats->sum_qual += qual;
+ }
+
+ // Look at the flags and increment appropriate counters (mapped, paired, etc)
+ if ( IS_UNMAPPED(bam_line) )
+ stats->nreads_unmapped++;
+ else
+ {
+ if ( !bam_line->core.qual )
+ stats->nreads_mq0++;
+
+ count_indels(stats,bam_line);
+
+ if ( !IS_PAIRED(bam_line) )
+ stats->nreads_unpaired++;
+ else
+ {
+ stats->nreads_paired++;
+
+ if ( bam_line->core.tid!=bam_line->core.mtid )
+ stats->nreads_anomalous++;
+
+ // The insert size is tricky, because for long inserts the libraries are
+ // prepared differently and the pairs point in other direction. BWA does
+ // not set the paired flag for them. Similar thing is true also for 454
+ // reads. Mates mapped to different chromosomes have isize==0.
+ int32_t isize = bam_line->core.isize;
+ if ( isize<0 ) isize = -isize;
+ // Insert sizes beyond the histogram range are collected in the last bin
+ if ( isize >= stats->nisize )
+ isize = stats->nisize-1;
+ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
+ {
+ // pos_fst: signed distance from this read to its mate;
+ // is_fst, is_fwd, is_mfwd: +1/-1 indicators for first-in-pair,
+ // forward strand and mate on forward strand
+ int pos_fst = bam_line->core.mpos - bam_line->core.pos;
+ int is_fst = IS_READ1(bam_line) ? 1 : -1;
+ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
+ int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+
+ // Both mates on the same strand
+ if ( is_fwd*is_mfwd>0 )
+ stats->isize_other[isize]++;
+ else if ( is_fst*pos_fst>0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize_inward[isize]++;
+ else
+ stats->isize_outward[isize]++;
+ }
+ else if ( is_fst*pos_fst<0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize_outward[isize]++;
+ else
+ stats->isize_inward[isize]++;
+ }
+ }
+ }
+
+ // Number of mismatches
+ uint8_t *nm = bam_aux_get(bam_line,"NM");
+ if (nm)
+ stats->nmismatches += bam_aux2i(nm);
+
+ // Number of mapped bases from cigar
+ // Conversion from uint32_t to MIDNSHP
+ // 012-4--
+ // MIDNSHP
+ if ( bam_line->core.n_cigar == 0)
+ error("FIXME: mapped read with no cigar?\n");
+ // readlen: the sequence length plus the total length of deletions (cigar op 2)
+ int readlen=seq_len;
+ if ( stats->regions )
+ {
+ // Count only on-target bases
+ int iref = bam_line->core.pos + 1;
+ for (i=0; i<bam_line->core.n_cigar; i++)
+ {
+ int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
+ int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ if ( cig==2 ) readlen += ncig;
+ else if ( cig==0 )
+ {
+ // Clip the match to the current region [reg_from, reg_to]
+ if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
+ else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
+ if ( ncig<0 ) ncig = 0;
+ stats->nbases_mapped_cigar += ncig;
+ iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ }
+ else if ( cig==1 )
+ {
+ // NOTE(review): iref is advanced for insertions here as well - confirm this is intended
+ iref += ncig;
+ if ( iref>=stats->reg_from && iref<=stats->reg_to )
+ stats->nbases_mapped_cigar += ncig;
+ }
+ }
+ }
+ else
+ {
+ // Count the whole read
+ for (i=0; i<bam_line->core.n_cigar; i++)
+ {
+ if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
+ stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
+ readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ }
+ }
+ stats->nbases_mapped += seq_len;
+
+ // A read starting before the previous one on the same chromosome means unsorted input
+ if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
+ stats->is_sorted = 0;
+ stats->pos = bam_line->core.pos;
+
+ if ( stats->is_sorted )
+ {
+ // First read or a new chromosome: flush the whole coverage buffer
+ if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
+ round_buffer_flush(stats,-1);
+
+ // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
+ // are not split, which results in up to seq_len-1 overlaps. The default bin size is
+ // 20kbp, so the effect is negligible.
+ if ( stats->fai )
+ {
+ // inc_ref: reload the reference window; inc_gcd: start a new GC-depth bin
+ int inc_ref = 0, inc_gcd = 0;
+ // First pass or new chromosome
+ if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
+ // Read goes beyond the end of the rseq buffer
+ else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
+ // Read overlaps the next gcd bin
+ else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
+ {
+ inc_gcd = 1;
+ if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
+ }
+ if ( inc_gcd )
+ {
+ stats->igcd++;
+ if ( stats->igcd >= stats->ngcd )
+ realloc_gcd_buffer(stats, readlen);
+ if ( inc_ref )
+ read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+ stats->gcd_pos = bam_line->core.pos;
+ stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
+ }
+
+ count_mismatches_per_cycle(stats,bam_line);
+ }
+ // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
+ else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
+ {
+ // First pass or a new chromosome
+ stats->tid = bam_line->core.tid;
+ stats->gcd_pos = bam_line->core.pos;
+ stats->igcd++;
+ if ( stats->igcd >= stats->ngcd )
+ realloc_gcd_buffer(stats, readlen);
+ }
+ stats->gcd[ stats->igcd ].depth++;
+ // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
+ if ( !stats->fai )
+ stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
+
+ // Coverage distribution graph
+ round_buffer_flush(stats,bam_line->core.pos);
+ round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
+ }
+ }
+
+ // Totals over all reads that passed the filters, mapped or not
+ stats->total_len += seq_len;
+ if ( IS_DUP(bam_line) )
+ {
+ stats->total_len_dup += seq_len;
+ stats->nreads_dup++;
+ }
+}
+
+// qsort() comparator for gc_depth_t records: ascending by GC content first,
+// ties broken by ascending depth.
+static int gcd_cmp(const void *a, const void *b)
+{
+    const gc_depth_t *lhs = (const gc_depth_t *)a;
+    const gc_depth_t *rhs = (const gc_depth_t *)b;
+
+    int by_gc = (lhs->gc > rhs->gc) - (lhs->gc < rhs->gc);
+    if ( by_gc )
+        return by_gc;
+    return (lhs->depth > rhs->depth) - (lhs->depth < rhs->depth);
+}
+
+// Return the p-th percentile (p in percent) of the `depth` values of the N
+// records starting at `gcd`. The records are read in array order, so the caller
+// is expected to pass them sorted by depth.
+//
+// The fractional rank p*(N+1)/100 is split into its integer part k and the
+// remainder d; the result is interpolated linearly between the k-th and the
+// (k+1)-th value. Ranks below 1 return the first value, ranks of N or more the
+// last one.
+float gcd_percentile(gc_depth_t *gcd, int N, int p)
+{
+    float n,d;
+    int k;
+
+    // Compute the rank in floating point; with integer division the remainder
+    // would always be zero and no interpolation would ever take place.
+    n = (float)p*(N+1)/100;
+    k = n;
+    if ( k<=0 )
+        return gcd[0].depth;
+    if ( k>=N )
+        return gcd[N-1].depth;
+
+    d = n - k;
+    // Take the difference in floating point so that it is computed the same way
+    // whatever integer type `depth` has
+    return gcd[k-1].depth + d*((float)gcd[k].depth - (float)gcd[k-1].depth);
+}
+
+// Print all collected statistics to stdout in the bamcheck text format.
+//
+// The sections are, in order: SN (summary numbers), FFQ/LFQ (quality per cycle
+// for first/last fragments), MPC (mismatches per cycle, only when mpc_buf is
+// set), GCF/GCL (GC content), GCC (ACGT per cycle), IS (insert sizes), RL (read
+// lengths), ID (indel lengths), IC (indels per cycle), COV (coverage) and GCD
+// (GC-depth percentiles).
+//
+// The function modifies `stats` while printing: the insert-size histograms are
+// halved, max_len and max_qual may be incremented by one, and the gcd array is
+// rescaled to percent and sorted.
+void output_stats(stats_t *stats)
+{
+ // Calculate average insert size and standard deviation (from the main bulk data only)
+ int isize, ibulk=0;
+ uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
+ for (isize=0; isize<stats->nisize; isize++)
+ {
+ // Each pair was counted twice
+ stats->isize_inward[isize] *= 0.5;
+ stats->isize_outward[isize] *= 0.5;
+ stats->isize_other[isize] *= 0.5;
+
+ nisize_inward += stats->isize_inward[isize];
+ nisize_outward += stats->isize_outward[isize];
+ nisize_other += stats->isize_other[isize];
+ nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
+ }
+
+ // Accumulate pairs by increasing insert size and stop as soon as the fraction
+ // isize_main_bulk of all pairs has been exceeded; ibulk is one past the last
+ // insert size included and nisize becomes the number of pairs in the bulk
+ double bulk=0, avg_isize=0, sd_isize=0;
+ for (isize=0; isize<stats->nisize; isize++)
+ {
+ bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
+ avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
+
+ if ( bulk/nisize > stats->isize_main_bulk )
+ {
+ ibulk = isize+1;
+ nisize = bulk;
+ break;
+ }
+ }
+ avg_isize /= nisize ? nisize : 1;
+ for (isize=1; isize<ibulk; isize++)
+ sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
+ sd_isize = sqrt(sd_isize);
+
+
+ // Header: program version and the command line used
+ printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
+ printf("# The command line was: %s",stats->argv[0]);
+ int i;
+ for (i=1; i<stats->argc; i++)
+ printf(" %s",stats->argv[i]);
+ printf("\n");
+ printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
+ printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
+ printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
+ printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
+ printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
+ printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
+ printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
+ printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
+ printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
+ printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
+ printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
+ printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
+ printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
+ printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
+ printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
+ printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
+ printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
+ printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
+ printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
+ printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
+ printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
+ printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
+ // NOTE(review): no guard here for nbases_mapped_cigar being zero
+ printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
+ float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
+ printf("SN\taverage length:\t%.0f\n", avg_read_length);
+ printf("SN\tmaximum length:\t%d\n", stats->max_len);
+ printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
+ printf("SN\tinsert size average:\t%.1f\n", avg_isize);
+ printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
+ printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
+ printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
+ printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
+ printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
+
+ // From here on max_len and max_qual serve as loop bounds for the per-cycle
+ // tables; both are extended by one where the buffers allow it
+ int ibase,iqual;
+ if ( stats->max_len<stats->nbases ) stats->max_len++;
+ if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
+ printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
+ printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+ for (ibase=0; ibase<stats->max_len; ibase++)
+ {
+ printf("FFQ\t%d",ibase+1);
+ for (iqual=0; iqual<=stats->max_qual; iqual++)
+ {
+ printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
+ }
+ printf("\n");
+ }
+ printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
+ printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+ for (ibase=0; ibase<stats->max_len; ibase++)
+ {
+ printf("LFQ\t%d",ibase+1);
+ for (iqual=0; iqual<=stats->max_qual; iqual++)
+ {
+ printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
+ }
+ printf("\n");
+ }
+ // The MPC table is printed only when the mismatches-per-cycle buffer exists
+ if ( stats->mpc_buf )
+ {
+ printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
+ printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
+ printf("# is the number of N's and the rest is the number of mismatches\n");
+ for (ibase=0; ibase<stats->max_len; ibase++)
+ {
+ printf("MPC\t%d",ibase+1);
+ for (iqual=0; iqual<=stats->max_qual; iqual++)
+ {
+ printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
+ }
+ printf("\n");
+ }
+ }
+ // GC histograms: consecutive bins with the same count are merged into one
+ // output row, positioned at the midpoint of the run
+ printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
+ int ibase_prev = 0;
+ for (ibase=0; ibase<stats->ngc; ibase++)
+ {
+ if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
+ printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
+ ibase_prev = ibase;
+ }
+ printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
+ ibase_prev = 0;
+ for (ibase=0; ibase<stats->ngc; ibase++)
+ {
+ if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
+ printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
+ ibase_prev = ibase;
+ }
+ printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
+ for (ibase=0; ibase<stats->max_len; ibase++)
+ {
+ uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
+ uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
+ // Cycles without any counted base are not printed
+ if ( ! sum ) continue;
+ printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
+ }
+ // Only the insert sizes belonging to the main bulk are printed
+ printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
+ for (isize=0; isize<ibulk; isize++)
+ printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
+ (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
+
+ printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
+ int ilen;
+ for (ilen=0; ilen<stats->max_len; ilen++)
+ {
+ if ( stats->read_lengths[ilen]>0 )
+ printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
+ }
+
+ // Index ilen is printed as length ilen+1
+ printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
+ for (ilen=0; ilen<stats->nindels; ilen++)
+ {
+ if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
+ printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
+ }
+
+ printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
+ for (ilen=0; ilen<=stats->nbases; ilen++)
+ {
+ // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
+ // the index of the cycle of the first inserted base (also 1-based)
+ if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
+ printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
+ }
+
+ // Coverage histogram: cov[0] holds depths below cov_min, cov[ncov-1] the
+ // depths above the last regular bin; the bins in between are cov_step wide
+ printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
+ if ( stats->cov[0] )
+ printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
+ int icov;
+ for (icov=1; icov<stats->ncov-1; icov++)
+ if ( stats->cov[icov] )
+ printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
+ if ( stats->cov[stats->ncov-1] )
+ printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
+
+ // Calculate average GC content, then sort by GC and depth
+ printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
+ uint32_t igcd;
+ // With a reference, gc holds a fraction; without one, it holds the sum of
+ // per-read GC fractions and is divided by the number of reads in the bin.
+ // NOTE(review): this loop and the ones below cover bins [0, stats->igcd),
+ // while qsort() is given stats->igcd+1 bins - confirm which bound is intended
+ for (igcd=0; igcd<stats->igcd; igcd++)
+ {
+ if ( stats->fai )
+ stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
+ else
+ if ( stats->gcd[igcd].depth )
+ stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
+ }
+ qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
+ igcd = 0;
+ while ( igcd < stats->igcd )
+ {
+ // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
+ uint32_t nbins=0, itmp=igcd;
+ float gc = stats->gcd[igcd].gc;
+ // Count the run of consecutive bins with (nearly) the same GC value
+ while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
+ {
+ nbins++;
+ itmp++;
+ }
+ // Depth percentiles are multiplied by avg_read_length/gcd_bin_size before printing
+ printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
+ gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
+ gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
+ );
+ igcd += nbins;
+ }
+}
+
+// Read one line from `fp` into the growable buffer *line of capacity *n.
+//
+// The terminating newline is not stored and the result is always
+// NUL-terminated. The buffer is (re)allocated in steps of 255 bytes as needed;
+// the caller owns it and must free() it. If *n is 0 or *line is NULL, the buffer
+// is treated as not yet allocated.
+//
+// Returns the number of characters stored. Returns (size_t)-1 when nothing was
+// stored - that is at end of file and also for an empty line - or, with errno
+// set to EINVAL, when an argument is NULL. Exits via error() if the buffer
+// cannot be enlarged.
+size_t mygetline(char **line, size_t *n, FILE *fp)
+{
+    if (line == NULL || n == NULL || fp == NULL)
+    {
+        errno = EINVAL;
+        return -1;
+    }
+    if (*n==0 || !*line)
+    {
+        *line = NULL;
+        *n = 0;
+    }
+
+    size_t nread=0;
+    int c;
+    for (;;)
+    {
+        c = getc(fp);
+        int at_end = ( c==EOF || c=='\n' );
+
+        // Make room for the next character, or for the terminating NUL at the end
+        size_t needed = at_end ? nread+1 : nread+2;
+        if ( needed > *n )
+        {
+            char *tmp = realloc(*line, sizeof(char)*(*n + 255));
+            if ( !tmp )
+                error("Could not allocate memory for a line buffer\n");
+            *line = tmp;
+            *n += 255;
+        }
+        if ( at_end ) break;
+        (*line)[nread++] = c;
+    }
+    (*line)[nread] = 0;
+    return nread>0 ? nread : -1;
+}
+
+// Read the target regions from `file` into stats->regions.
+//
+// Each non-comment line has the form "<chromosome> <from> <to>", separated by
+// whitespace; lines starting with '#' are skipped. Regions are stored per
+// chromosome, indexed by the chromosome's tid in the BAM header. Chromosomes
+// not present in the BAM header are skipped with a single warning.
+//
+// Exits via error() when the file cannot be opened or parsed, when a region
+// starts before the first region read for the same chromosome, when memory
+// runs out, or when no region could be mapped to a BAM sequence.
+void init_regions(stats_t *stats, char *file)
+{
+    khiter_t iter;
+    khash_t(kh_bam_tid) *header_hash;
+
+    bam_init_header_hash(stats->sam->header);
+    header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
+
+    FILE *fp = fopen(file,"r");
+    if ( !fp ) error("%s: %s\n",file,strerror(errno));
+
+    char *line = NULL;
+    size_t len = 0;
+    ssize_t nread;
+    int warned = 0;
+    int prev_tid=-1, prev_pos=-1;
+    while ((nread = mygetline(&line, &len, fp)) != -1)
+    {
+        if ( line[0] == '#' ) continue;
+
+        // Cut the line after the chromosome name
+        int i = 0;
+        while ( i<nread && !isspace((unsigned char)line[i]) ) i++;
+        if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
+        line[i] = 0;
+
+        // Look the name up first; the value may be read only for a valid iterator
+        iter = kh_get(kh_bam_tid, header_hash, line);
+        if ( iter == kh_end(header_hash) )
+        {
+            if ( !warned )
+                fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
+            warned = 1;
+            continue;
+        }
+        int tid = kh_val(header_hash, iter);
+
+        if ( tid >= stats->nregions )
+        {
+            // Grow in steps of 100 until index tid is valid
+            int new_nregions = stats->nregions;
+            while ( tid >= new_nregions ) new_nregions += 100;
+
+            void *tmp = realloc(stats->regions,sizeof(regions_t)*new_nregions);
+            if ( !tmp ) error("Could not allocate memory for %d target sequences\n", new_nregions);
+            stats->regions = tmp;
+
+            int j;
+            for (j=stats->nregions; j<new_nregions; j++)
+            {
+                stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
+                stats->regions[j].pos = NULL;
+            }
+            stats->nregions = new_nregions;
+        }
+        int npos = stats->regions[tid].npos;
+        if ( npos >= stats->regions[tid].mpos )
+        {
+            void *tmp = realloc(stats->regions[tid].pos,sizeof(pos_t)*(stats->regions[tid].mpos+1000));
+            if ( !tmp ) error("Could not allocate memory for the regions of %s\n", line);
+            stats->regions[tid].pos = tmp;
+            stats->regions[tid].mpos += 1000;
+        }
+
+        // The coordinates follow the chromosome name
+        if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 )
+            error("Could not parse the region [%s]\n", line+i+1);
+        if ( prev_tid==-1 || prev_tid!=tid )
+        {
+            prev_tid = tid;
+            prev_pos = stats->regions[tid].pos[npos].from;
+        }
+        if ( prev_pos>stats->regions[tid].pos[npos].from )
+            error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
+        stats->regions[tid].npos++;
+    }
+    free(line);
+    if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
+    fclose(fp);
+}
+
+// Release the per-chromosome position lists and the regions array itself.
+void destroy_regions(stats_t *stats)
+{
+    int ireg = 0;
+    while ( ireg < stats->nregions )
+    {
+        // Only chromosomes with allocated capacity own a position list
+        if ( stats->regions[ireg].mpos )
+            free(stats->regions[ireg].pos);
+        ireg++;
+    }
+    // free() accepts NULL, so no regions at all is fine too
+    free(stats->regions);
+}
+
+// Callback adapter: forwards the alignment record to collect_stats(), with
+// `data` interpreted as the stats_t to update. Always returns 1.
+static int fetch_read(const bam1_t *bam_line, void *data)
+{
+    stats_t *stats = (stats_t*)data;
+    bam1_t *record = (bam1_t*)bam_line;
+
+    collect_stats(record, stats);
+    return 1;
+}
+
+// Rewind the current-region cursor (cpos) of every chromosome to its first region.
+void reset_regions(stats_t *stats)
+{
+    int ireg = stats->nregions;
+    while ( ireg-- > 0 )
+        stats->regions[ireg].cpos = 0;
+}
+
+// Decide whether the read overlaps one of the target regions.
+//
+// Returns 1 when no regions are set or when the read overlaps a region of its
+// chromosome; in the latter case the chromosome's cursor (cpos) is moved to that
+// region and stats->reg_from / stats->reg_to are set to its bounds. Returns 0
+// otherwise. Regions ending at or before the read start are skipped for good,
+// which is why the input has to be sorted (error() is called if it is not).
+int is_in_regions(bam1_t *bam_line, stats_t *stats)
+{
+    if ( !stats->regions ) return 1;
+
+    if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
+    if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
+
+    regions_t *reg = &stats->regions[bam_line->core.tid];
+    if ( reg->cpos==reg->npos ) return 0;    // done for this chr
+
+    // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
+    // even small overlap is enough to include the read in the stats.
+    int idx;
+    for (idx=reg->cpos; idx<reg->npos; idx++)
+    {
+        // First region that ends after the read start
+        if ( reg->pos[idx].to > bam_line->core.pos ) break;
+    }
+    if ( idx>=reg->npos )
+    {
+        // All regions of this chromosome lie before the read
+        reg->cpos = reg->npos;
+        return 0;
+    }
+    // The read ends before the region starts
+    if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[idx].from ) return 0;
+
+    reg->cpos = idx;
+    stats->reg_from = reg->pos[idx].from;
+    stats->reg_to = reg->pos[idx].to;
+    return 1;
+}
+
+// Build stats->rg_hash, the set of read groups selected by `id`.
+//
+// Every @RG header line whose ID or SM value equals `id` is added to the hash,
+// keyed by the read-group ID with the SM value stored as the hash value. A
+// message is printed to stderr when an ID occurs more than once. Exits via
+// error() when nothing matches.
+void init_group_id(stats_t *stats, char *id)
+{
+    // Parse the header text on first use
+    if ( !stats->sam->header->dict )
+        stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
+
+    stats->rg_hash = kh_init(kh_rg);
+
+    const char *key, *val;
+    int nmatched = 0;
+    void *iter = stats->sam->header->dict;
+    while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
+    {
+        int id_matches = !strcmp(id,key);
+        int sm_matches = val && !strcmp(id,val);
+        if ( !id_matches && !sm_matches )
+            continue;
+
+        khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
+        if ( k != kh_end(stats->rg_hash) )
+            fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
+
+        int ret;
+        k = kh_put(kh_rg, stats->rg_hash, key, &ret);
+        kh_value(stats->rg_hash, k) = val;
+        nmatched++;
+    }
+    if ( nmatched==0 )
+        error("The sample or read group \"%s\" not present.\n", id);
+}
+
+
+// Report a fatal condition and terminate the program with exit(-1).
+//
+// With a non-NULL printf-style `format`, the formatted message is written to
+// stderr. With a NULL `format`, the version and usage text are written to
+// stdout instead. The function does not return.
+void error(const char *format, ...)
+{
+    if ( format )
+    {
+        va_list args;
+        va_start(args, format);
+        vfprintf(stderr, format, args);
+        va_end(args);
+        exit(-1);
+    }
+
+    // Usage text, one entry per output line
+    static const char *const usage[] =
+    {
+        "About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n",
+        "Usage: bamcheck [OPTIONS] file.bam\n",
+        " bamcheck [OPTIONS] file.bam chr:from-to\n",
+        "Options:\n",
+        " -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n",
+        " -d, --remove-dups Exlude from statistics reads marked as duplicates\n",
+        " -f, --required-flag <int> Required flag, 0 for unset [0]\n",
+        " -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n",
+        " --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n",
+        " -h, --help This help message\n",
+        " -i, --insert-size <int> Maximum insert size [8000]\n",
+        " -I, --id <string> Include only listed read group or sample name\n",
+        " -l, --read-length <int> Include in the statistics only reads with the given read length []\n",
+        " -m, --most-inserts <float> Report only the main part of inserts [0.99]\n",
+        " -q, --trim-quality <int> The BWA trimming parameter [0]\n",
+        " -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n",
+        " -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n",
+        " -s, --sam Input is SAM\n",
+        "\n",
+    };
+
+    printf("Version: %s\n", BAMCHECK_VERSION);
+    size_t iline;
+    for (iline=0; iline<sizeof(usage)/sizeof(usage[0]); iline++)
+        fputs(usage[iline], stdout);
+    exit(-1);
+}
+
+/*
+ * bamcheck entry point.
+ *
+ * Parses the command line, opens the input (BAM by default, SAM with -s,
+ * "-" when no file name is given and stdin is not a terminal), allocates
+ * the per-statistic arrays, then either
+ *   - fetches reads from each region given after the file name (this
+ *     path loads the BAM index), or
+ *   - streams through the whole input,
+ * and finally prints the collected statistics and releases all memory.
+ *
+ * Returns 0 on success; every failure path goes through error(), which
+ * terminates the process.
+ */
+int main(int argc, char *argv[])
+{
+    char *targets = NULL;
+    char *bam_fname = NULL;
+    char *group_id = NULL;
+    samfile_t *sam = NULL;
+    char in_mode[5];
+
+    stats_t *stats = calloc(1,sizeof(stats_t));
+    if ( !stats )
+        error("Could not allocate memory for the statistics structure.\n");
+    stats->ngc = 200;
+    stats->nquals = 256;
+    stats->nbases = 300;
+    stats->nisize = 8000;
+    stats->max_len = 30;
+    stats->max_qual = 40;
+    stats->isize_main_bulk = 0.99; // There are always outliers at the far end
+    stats->gcd_bin_size = 20e3;
+    stats->gcd_ref_size = 4.2e9;
+    stats->rseq_pos = -1;
+    stats->tid = stats->gcd_pos = -1;
+    stats->igcd = 0;
+    stats->is_sorted = 1;
+    stats->cov_min = 1;
+    stats->cov_max = 1000;
+    stats->cov_step = 1;
+    stats->argc = argc;
+    stats->argv = argv;
+    stats->filter_readlen = -1;
+    stats->nindels = stats->nbases;
+
+    strcpy(in_mode, "rb");
+
+    // The second field is has_arg: it must be 1 for every option whose
+    // handler below reads optarg, otherwise the long form leaves optarg NULL.
+    static struct option loptions[] =
+    {
+        {"help",0,0,'h'},
+        {"remove-dups",0,0,'d'},
+        {"sam",0,0,'s'},
+        {"ref-seq",1,0,'r'},
+        {"coverage",1,0,'c'},
+        {"read-length",1,0,'l'},
+        {"insert-size",1,0,'i'},
+        {"most-inserts",1,0,'m'},
+        {"trim-quality",1,0,'q'},
+        {"target-regions",1,0,'t'},    // was 0: --target-regions silently ignored its file
+        {"required-flag",1,0,'f'},
+        {"filtering-flag",1,0,'F'},    // was 0: --filtering-flag passed NULL to strtol
+        {"id",1,0,'I'},
+        {"GC-depth",1,0,1},
+        {0,0,0,0}
+    };
+    int opt;
+    while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
+    {
+        switch (opt)
+        {
+            case 'f': stats->flag_require=strtol(optarg,0,0); break;
+            case 'F': stats->flag_filter=strtol(optarg,0,0); break;
+            case 'd': stats->flag_filter|=BAM_FDUP; break;
+            case 's': strcpy(in_mode, "r"); break;
+            case 'r': stats->fai = fai_load(optarg);
+                      if (stats->fai==0)
+                          error("Could not load faidx: %s\n", optarg);
+                      break;
+            case 1 : {
+                        float flen,fbin;
+                        if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
+                            error("Unable to parse --GC-depth %s\n", optarg);
+                        stats->gcd_bin_size = fbin;
+                        stats->gcd_ref_size = flen;
+                     }
+                     break;
+            case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
+                          error("Unable to parse -c %s\n", optarg);
+                      break;
+            case 'l': stats->filter_readlen = atoi(optarg); break;
+            case 'i': stats->nisize = atoi(optarg); break;
+            case 'm': stats->isize_main_bulk = atof(optarg); break;
+            case 'q': stats->trim_qual = atoi(optarg); break;
+            case 't': targets = optarg; break;
+            case 'I': group_id = optarg; break;
+            case '?':
+            case 'h': error(NULL);    // error() does not return
+            default: error("Unknown argument: %s\n", optarg);
+        }
+    }
+    if ( optind<argc )
+        bam_fname = argv[optind++];
+
+    if ( !bam_fname )
+    {
+        // No file name: read from stdin unless it is an interactive terminal
+        if ( isatty(fileno((FILE *)stdin)) )
+            error(NULL);
+        bam_fname = "-";
+    }
+
+    // Init structures
+    // .. coverage bins and round buffer
+    if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
+    {
+        stats->cov_step = stats->cov_max - stats->cov_min;
+        if ( stats->cov_step <= 0 )
+            stats->cov_step = 1;
+    }
+    stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
+    stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
+    stats->cov = calloc(sizeof(uint64_t),stats->ncov);
+    stats->cov_rbuf.size = stats->nbases*5;
+    stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
+    // .. bam
+    if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
+        error("Failed to open: %s\n", bam_fname);
+    stats->sam = sam;
+    if ( group_id ) init_group_id(stats, group_id);
+    bam1_t *bam_line = bam_init1();
+    // .. arrays
+    stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+    stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+    stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
+    stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
+    stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
+    stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
+    stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
+    stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
+    stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
+    stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
+    stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
+    stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
+    stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
+    stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+    stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+    stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+    stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+    realloc_rseq_buffer(stats);
+    if ( targets )
+        init_regions(stats, targets);
+
+    // Collect statistics
+    if ( optind<argc )
+    {
+        // Collect stats in selected regions only
+        bam_index_t *bam_idx = bam_index_load(bam_fname);
+        if (bam_idx == 0)
+            error("Random alignment retrieval only works for indexed BAM files.\n");
+
+        int i;
+        for (i=optind; i<argc; i++)
+        {
+            int tid, beg, end;
+            bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
+            if ( tid < 0 ) continue;    // region not resolved to a reference: skip it
+            reset_regions(stats);
+            bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
+        }
+        bam_index_destroy(bam_idx);
+    }
+    else
+    {
+        // Stream through the entire BAM ignoring off-target regions if -t is given
+        while (samread(sam,bam_line) >= 0)
+            collect_stats(bam_line,stats);
+    }
+    round_buffer_flush(stats,-1);
+
+    output_stats(stats);
+
+    bam_destroy1(bam_line);
+    samclose(stats->sam);
+    if (stats->fai) fai_destroy(stats->fai);
+    free(stats->cov_rbuf.buffer); free(stats->cov);
+    free(stats->quals_1st); free(stats->quals_2nd);
+    free(stats->gc_1st); free(stats->gc_2nd);
+    free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
+    free(stats->gcd);
+    free(stats->rseq_buf);
+    free(stats->mpc_buf);
+    free(stats->acgt_cycles);
+    free(stats->read_lengths);
+    free(stats->insertions);
+    free(stats->deletions);
+    free(stats->ins_cycles_1st);
+    free(stats->ins_cycles_2nd);
+    free(stats->del_cycles_1st);
+    free(stats->del_cycles_2nd);
+    destroy_regions(stats);
+    // The hash lives inside *stats, so it has to be destroyed before the
+    // structure itself is freed (the original read stats->rg_hash after free).
+    if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
+    free(stats);
+
+    return 0;
+}
+
+
+
diff --git a/samtools-0.1.19/misc/blast2sam.pl b/samtools-0.1.19/misc/blast2sam.pl
new file mode 100755
index 0000000..084f018
--- /dev/null
+++ b/samtools-0.1.19/misc/blast2sam.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&blast2sam;
+
+# Convert a BLAST text report (read from the files in @ARGV or from STDIN)
+# into SAM records on STDOUT.
+#
+# The report is scanned line by line; each elsif branch below recognises one
+# kind of report line and updates the record being built in @sam (indexed by
+# SAM column). A record is flushed through blast_print_sam() when the next
+# "Query=" or "Score = ... Expect" line is met, and once more after the last
+# input line.
+#
+# Option -s: also store the aligned query sequence (gaps removed) in column 9.
+sub blast2sam {
+ my %opts = ();
+ getopts('s', \%opts);
+ die("Usage: blast2sam.pl <in.blastn>\n") if (-t STDIN && @ARGV == 0);
+ my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq);
+ $show_seq = defined($opts{s});
+ # Columns that never change: 0 (name, filled later), 4 = 255, 6..8 = '*',0,0, 10 = '*'
+ @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*');
+ while (<>) {
+ # Flush the pending alignment before a new query or a new alignment block starts.
+ # NOTE(review): the flush is gated on @cigar being non-empty, but aln2cm() only
+ # pushes when the operation changes and the leading "H" is pushed only when the
+ # query start is > 1; a gapless alignment starting at query base 1 appears to be
+ # skipped here -- confirm against real BLAST output.
+ if (@cigar && (/^Query=/ || /Score =.*bits.*Expect/)) { # print
+ &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend);
+ @cigar = ();
+ }
+ if (/^Query= (\S+)/) {
+ $sam[0] = $1;
+ } elsif (/\((\S+)\s+letters\)/) {
+ # query length; thousands separators are stripped
+ $qlen = $1; $qlen =~ s/,//g;
+ } elsif (/^>(\S+)/) {
+ # NOTE(review): column 2 is overwritten as soon as the next subject header is
+ # read, which happens before the previous alignment is flushed -- verify.
+ $sam[2] = $1;
+ } elsif (/Length = (\d+)/) {
+ $slen = $1;
+ } elsif (/Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/) { # the start of an alignment block
+ # bit score rounded to an integer; an e-value written as "e-NN" gets a leading 1
+ my ($as, $ev) = (int($1 + .499), $3);
+ $ev = "1$ev" if ($ev =~ /^e/);
+ @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev");
+ @cigar = (); $qbeg = 0;
+ # @cmaux: [0] current operation index into "MDI", [1] its run length
+ @cmaux = (0, 0, 0, '');
+ } elsif (/Strand = (\S+) \/ (\S+)/) {
+ $sam[1] |= 0x10 if ($2 eq 'Minus');
+ } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) {
+ $q = $2;
+ # First Query line of the block: remember the start and hard-clip what precedes it
+ unless ($qbeg) {
+ $qbeg = $1;
+ push(@cigar, ($1-1) . "H") if ($1 > 1);
+ }
+ $qend = $3;
+ if ($show_seq) {
+ my $x = $q;
+ $x =~ s/-//g; $sam[9] .= $x;
+ }
+ } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) {
+ $s = $2;
+ # Position: for reverse-strand hits keep the last end coordinate seen,
+ # otherwise keep the first start coordinate seen.
+ if ($sam[1] & 0x10) {
+ $sam[3] = $3;
+ } else {
+ $sam[3] = $1 unless ($sam[3]);
+ }
+ &aln2cm(\@cigar, \$q, \$s, \@cmaux);
+ }
+ }
+ # Final flush; unlike the in-loop flush this one is unconditional.
+ &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend);
+}
+
+# Finish the CIGAR of the current alignment and print it as one SAM line.
+#
+#   $rec  - array ref of SAM columns (column 5 and possibly 9 are rewritten)
+#   $ops  - array ref of CIGAR pieces collected so far (modified in place)
+#   $run  - array ref: [0] index of the open operation in "MDI", [1] its length
+#   $tail - number of query bases after the alignment; emitted as a hard clip
+#
+# For reverse-strand records (flag 0x10) the CIGAR pieces are reversed and
+# the stored sequence is reverse-complemented.
+sub blast_print_sam {
+    my ($rec, $ops, $run, $tail) = @_;
+
+    # close the run that is still open
+    my $open_op = substr("MDI", $run->[0], 1);
+    push(@$ops, $run->[1] . $open_op);
+    if ($tail) {
+        push(@$ops, $tail . 'H');
+    }
+
+    if ($rec->[1] & 0x10) {
+        @$ops = reverse(@$ops);
+        $rec->[9] = scalar reverse($rec->[9]);
+        $rec->[9] =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/;
+    }
+
+    unless ($rec->[9]) {
+        $rec->[9] = '*';
+    }
+    $rec->[5] = join('', @$ops);
+    print join("\t", @$rec), "\n";
+}
+
+# Extend the run-length encoded CIGAR with one pair of aligned strings.
+#
+#   $ops  - array ref receiving completed CIGAR pieces
+#   $qref - ref to the aligned query string ('-' marks a gap)
+#   $sref - ref to the aligned subject string ('-' marks a gap)
+#   $run  - array ref: [0] index of the open operation in "MDI", [1] its length
+#
+# Per column the operation index is 2 when the query has a gap, 1 when the
+# subject has a gap and 0 otherwise. A piece is pushed only when the
+# operation changes; the last run stays open in $run.
+sub aln2cm {
+    my ($ops, $qref, $sref, $run) = @_;
+    my $ncol = length($$qref);
+    foreach my $col (0 .. $ncol - 1) {
+        # the subject is only inspected when the query has no gap here
+        my $op = (substr($$qref, $col, 1) eq '-') ? 2
+               : (substr($$sref, $col, 1) eq '-') ? 1
+               :                                    0;
+        if ($run->[0] != $op) {
+            push(@$ops, $run->[1] . substr("MDI", $run->[0], 1));
+            $run->[0] = $op;
+            $run->[1] = 1;
+        } else {
+            ++$run->[1];
+        }
+    }
+}
diff --git a/samtools-0.1.19/misc/bowtie2sam.pl b/samtools-0.1.19/misc/bowtie2sam.pl
new file mode 100755
index 0000000..5dff88d
--- /dev/null
+++ b/samtools-0.1.19/misc/bowtie2sam.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.1
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&bowtie2sam;
+exit;
+
+# Convert Bowtie alignments (files in @ARGV or STDIN) to SAM on STDOUT.
+#
+# Consecutive lines with the same read name are treated as multiple hits of
+# one read. All hits are staged, the one with the fewest mismatches is
+# printed, and its mapping quality (column 4) is lowered when the runner-up
+# is equally good (-> 0) or only one mismatch worse (-> at most 15).
+#
+# Fixes over the original:
+#   - the last read group now gets the same mapQ adjustment as every other
+#     group (it used to be printed unadjusted);
+#   - nothing is printed for empty input (it used to print an empty line);
+#   - the unused %opts hash is gone.
+sub bowtie2sam {
+    die("Usage: bowtie2sam.pl <aln.bowtie>\n") if (@ARGV == 0 && -t STDIN);
+    # core loop
+    my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k);
+    $last = '';
+
+    # Adjust the mapQ of the best staged hit and print it.
+    my $flush = sub {
+        if ($best_s == $subbest_s) {
+            $staging[$best_k][4] = 0;
+        } elsif ($subbest_s - $best_s == 1) {
+            $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15);
+        }
+        print join("\t", @{$staging[$best_k]}), "\n";
+    };
+
+    while (<>) {
+        my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches
+        if ($name eq $last) {
+            # I do not know whether the multiple hits are ordered on the
+            # number of mismatches. I assume they are not and so I have to
+            # keep all these multiple hits in memory.
+            @{$staging[$k]} = @s;
+            if ($best_s > $nm) {
+                $subbest_s = $best_s;
+                $best_s = $nm;
+                $best_k = $k;
+            } elsif ($subbest_s > $nm) {
+                $subbest_s = $nm;
+            }
+            ++$k;
+        } else {
+            $flush->() if ($last ne '');
+            # start a new group; 1000 stands for "no second-best hit yet"
+            $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0;
+            @{$staging[0]} = @s;
+            $last = $name;
+        }
+    }
+    $flush->() if ($last ne '');
+}
+
+# Turn one Bowtie output line into SAM columns.
+#
+#   $line - the raw tab-separated Bowtie line
+#   $s    - array ref that is emptied and filled with the SAM columns
+#           0..10 followed by the NM, X<n> and MD tags
+#
+# Returns (read name as written by Bowtie, number of mismatches). The
+# mismatch count is the number of fields beyond the first seven. The stored
+# name in column 0 has a trailing "/1" or "/2" removed; the returned one
+# does not.
+sub bowtie2sam_aux {
+    my ($line, $s) = @_;
+    chomp($line);
+    my @col = split("\t", $line);
+    my $full_name = $col[0];
+    @$s = ();
+
+    # read name without the mate suffix
+    ($s->[0] = $full_name) =~ s/\/[12]$//g;
+    # flag: only the reverse-strand bit is known here
+    $s->[1] = 0;
+    $s->[1] |= 0x10 if ($col[1] eq '-');
+    # reference name and 1-based position
+    $s->[2] = $col[2];
+    $s->[3] = $col[3] + 1;
+    # mapQ
+    $s->[4] = ($col[6] == 0) ? 25 : 0;
+    # read & quality, then an all-match CIGAR of the read length
+    $s->[9] = $col[4];
+    $s->[10] = $col[5];
+    $s->[5] = length($s->[9]) . "M";
+    # mate columns are unknown
+    $s->[6] = '*';
+    $s->[7] = 0;
+    $s->[8] = 0;
+
+    # aux tags
+    my $nm = @col - 7;
+    push(@$s, "NM:i:" . $nm);
+    push(@$s, "X$nm:i:" . ($col[6]+1));
+
+    my $md = '';
+    if ($col[7]) {
+        $_ = $col[7];
+        my $next = 0;    # first read offset not yet covered by $md
+        while (/(\d+):[ACGTN]>([ACGTN])/gi) {
+            my ($offset, $base) = ($1, $2);
+            $md .= (int($offset) - $next) . $base;
+            $next = $offset + 1;
+        }
+        $md .= length($s->[9]) - $next;
+    } else {
+        $md = length($s->[9]);
+    }
+    push(@$s, "MD:Z:$md");
+
+    return ($full_name, $nm);
+}
diff --git a/samtools-0.1.19/misc/export2sam.pl b/samtools-0.1.19/misc/export2sam.pl
new file mode 100755
index 0000000..ec6dacf
--- /dev/null
+++ b/samtools-0.1.19/misc/export2sam.pl
@@ -0,0 +1,545 @@
+#!/usr/bin/env perl
+#
+#
+# export2sam.pl converts GERALD export files to SAM format.
+#
+#
+#
+########## License:
+#
+# The MIT License
+#
+# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
+# Modified SAMtools work copyright (c) 2010 Illumina, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+#
+#
+#
+########## ChangeLog:
+#
+# Version: 2.3.1 (18MAR2011)
+#
+# - Restore file '-' as stdin input.
+#
+# Version: 2.3.0 (24JAN2011)
+#
+# - Add support for export reserved chromosome name "CONTROL",
+# which is translated to optional field "XC:Z:CONTROL".
+# - Check for ".gz" file extension on export files and open
+# these as gzip pipes when the extension is found.
+#
+# Version: 2.2.0 (16NOV2010)
+#
+# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y
+# - For export records with reserved chromosome name identifiers
+# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"
+# to the SAM record, so that these cases can be distinguished
+# from other unmatched reads.
+#
+# Version: 2.1.0 (21SEP2010)
+#
+# - Additional export record error checking.
+# - Convert export records with chromosome value of "RM" to unmapped
+# SAM records.
+#
+# Version: 2.0.0 (15FEB2010)
+#
+# Script updated by Illumina in conjunction with CASAVA 1.7.0
+# release.
+#
+# Major changes are as follows:
+# - The CIGAR string has been updated to include all gaps from
+# ELANDv2 alignments.
+# - The ELAND single read alignment score is always stored in the
+# optional "SM" field and the ELAND paired read alignment score
+# is stored in the optional "AS" field when it exists.
+# - The MAPQ value is set to the higher of the two alignment scores,
+# but no greater than 254, i.e. min(254,max(SM,AS))
+# - The SAM "proper pair" bit (0x0002) is now set for read pairs
+# meeting ELAND's expected orientation and insert size criteria.
+# - The default quality score translation is set for export files
+# which contain Phred+64 quality values. An option,
+# "--qlogodds", has been added to translate quality values from
+# the Solexa+64 format used in export files prior to Pipeline
+# 1.3
+# - The export match descriptor is now reverse-complemented when
+# necessary such that it always corresponds to the forward
+# strand of the reference, to be consistent with other
+# information in the SAM record. It is now written to the
+# optional 'XD' field (rather than 'MD') to acknowledge its
+# minor differences from the samtools match descriptor (see
+# additional detail below).
+# - An option, "--nofilter", has been added to include reads which
+# have failed primary analysis quality filtration. Such reads
+# will have the corresponding SAM flag bit (0x0200) set.
+# - Labels in the export 'contig' field are preserved by setting
+# RNAME to "$export_chromosome/$export_contig" when the contig
+# label exists.
+#
+#
+# Contact: lh3
+# Version: 0.1.2 (03JAN2009)
+#
+#
+#
+########## Known Conversion Limitations:
+#
+# - Export records for reads that map to a position < 1 (allowed
+# in export format), are converted to unmapped reads in the SAM
+# record.
+# - Export records contain the reserved chromosome names: "NM",
+# "QC","RM" and "CONTROL". "NM" indicates that the aligner could
+# not map the read to the reference sequence set. "QC" means that
+# the aligner did not attempt to map the read due to some
+# technical limitation. "RM" means that the read mapped to a set
+# of 'contaminant' sequences specified in GERALD's RNA-seq
+# workflow. "CONTROL" means that the read is a control. All of
+# these alignment types are collapsed to the single unmapped
+# alignment state in the SAM record, but the optional SAM "XC"
+# field is used to record the original reserved chromosome name of
+# the read for all but the "NM" case.
+# - The export match descriptor is slightly different than the
+# samtools match descriptor. For this reason it is stored in the
+# optional SAM field 'XD' (and not 'MD'). Note that the export
+# match descriptor differs from the samtools version in two
+# respects: (1) indels are explicitly closed with the '$'
+# character and (2) insertions must be enumerated in the match
+# descriptor. For example a 35-base read with a two-base insertion
+# is described as: 20^2$14
+#
+#
+#
+
+my $version = "2.3.1";
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+use File::Spec;
+use List::Util qw(min max);
+
+
+# Zero-based indexes of the tab-separated fields of one export record, as
+# used on the split line in export2sam_aux(). EXPORT_SIZE is the minimum
+# number of fields a record must have; indexes 17..20 are not named here.
+use constant {
+ EXPORT_MACHINE => 0,
+ EXPORT_RUNNO => 1,
+ EXPORT_LANE => 2,
+ EXPORT_TILE => 3,
+ EXPORT_X => 4,
+ EXPORT_Y => 5,
+ EXPORT_INDEX => 6,
+ EXPORT_READNO => 7,
+ EXPORT_READ => 8,
+ EXPORT_QUAL => 9,
+ EXPORT_CHROM => 10,
+ EXPORT_CONTIG => 11,
+ EXPORT_POS => 12,
+ EXPORT_STRAND => 13,
+ EXPORT_MD => 14,
+ EXPORT_SEMAP => 15,
+ EXPORT_PEMAP => 16,
+ EXPORT_PASSFILT => 21,
+ EXPORT_SIZE => 22,
+};
+
+
+# Zero-based indexes of the eleven leading columns of a SAM record as it is
+# built in an array; optional tags are pushed after index SAM_QUAL.
+use constant {
+ SAM_QNAME => 0,
+ SAM_FLAG => 1,
+ SAM_RNAME => 2,
+ SAM_POS => 3,
+ SAM_MAPQ => 4,
+ SAM_CIGAR => 5,
+ SAM_MRNM => 6,
+ SAM_MPOS => 7,
+ SAM_ISIZE => 8,
+ SAM_SEQ => 9,
+ SAM_QUAL => 10,
+};
+
+
+# function prototypes for Richard's code
+sub match_desc_to_cigar($);
+sub match_desc_frag_length($);
+sub reverse_compl_match_descriptor($);
+sub write_header($;$;$);
+
+
+&export2sam;
+exit;
+
+
+
+
+# Driver: parse the command line, open the read1 (and optional read2)
+# export file, and print a SAM header followed by one SAM line per export
+# record that passes the filter.
+#
+# Files ending in ".gz" are read through "gzip -dc"; the file name '-'
+# means stdin. In paired mode the two files are read in lock step, the
+# read names must agree line by line, and mate columns (MRNM, MPOS, ISIZE)
+# and mate flag bits are filled from the other read. Any inconsistency
+# ends the program through die().
+#
+# Fix over the archived text: "join(" ", at ARGV)" (a syntax error, the
+# sigil was replaced by " at ") is restored to "@ARGV" in both places.
+sub export2sam {
+
+    my $cmdline = $0 . " " . join(" ", @ARGV);
+    my $arg_count = scalar @ARGV;
+    my $progname = (File::Spec->splitpath($0))[2];
+
+    my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
+    my $is_nofilter = 0;
+    my $read1file;
+    my $read2file;
+    my $print_version = 0;
+    my $help = 0;
+
+    my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
+                             "nofilter" => \$is_nofilter,
+                             "read1=s" => \$read1file,
+                             "read2=s" => \$read2file,
+                             "version" => \$print_version,
+                             "help" => \$help );
+
+    my $usage = <<END;
+
+$progname converts GERALD export files to SAM format.
+
+Usage: $progname --read1=FILENAME [ options ] | --version | --help
+
+ --read1=FILENAME read1 export file or '-' for stdin (mandatory)
+ (file may be gzipped with ".gz" extension)
+ --read2=FILENAME read2 export file or '-' for stdin
+ (file may be gzipped with ".gz" extension)
+ --nofilter include reads that failed the basecaller
+ purity filter
+ --qlogodds assume export file(s) use logodds quality values
+ as reported by OLB (Pipeline) prior to v1.3
+ (default: phred quality values)
+
+END
+
+    my $version_msg = <<END;
+
+$progname version: $version
+
+END
+
+    if((not $result) or $help or ($arg_count==0)) {
+        die($usage);
+    }
+
+    # GetOptions removes what it recognises; anything left is an error
+    if(@ARGV) {
+        print STDERR "\nERROR: Unrecognized arguments: " . join(" ", @ARGV) . "\n\n";
+        die($usage);
+    }
+
+    if($print_version) {
+        die($version_msg);
+    }
+
+    if(not defined($read1file)) {
+        print STDERR "\nERROR: read1 export file must be specified\n\n";
+        die($usage);
+    }
+
+    unless((-f $read1file) or ($read1file eq '-')) {
+        die("\nERROR: Can't find read1 export file: '$read1file'\n\n");
+    }
+
+    if (defined $read2file) {
+        unless((-f $read2file) or ($read2file eq '-')) {
+            die("\nERROR: Can't find read2 export file: '$read2file'\n\n");
+        }
+        if($read1file eq $read2file) {
+            die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n");
+        }
+    }
+
+    my ($fh1, $fh2, $is_paired);
+
+    # Two-argument open is used on purpose: it makes '-' mean stdin and
+    # lets the "gzip -dc FILE |" form open a pipe.
+    my $read1cmd="$read1file";
+    $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/);
+    open($fh1, $read1cmd)
+        or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n");
+    $is_paired = defined $read2file;
+    if ($is_paired) {
+        my $read2cmd="$read2file";
+        $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/);
+        open($fh2, $read2cmd)
+            or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n");
+    }
+    # quality value conversion table, indexed by the raw character code of
+    # the export quality; undef entries mark codes that cannot be converted
+    my @conv_table;
+    if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
+        for (-64..64) {
+            $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
+        }
+    } else { # convert from phred+64 quality values (pipeline v1.3+):
+        for (-64..-1) {
+            $conv_table[$_+64] = undef;
+        }
+        for (0..64) {
+            $conv_table[$_+64] = int(33 + $_);
+        }
+    }
+    # write the header
+    print write_header( $progname, $version, $cmdline );
+    # core loop
+    my $export_line_count = 0;
+    while (<$fh1>) {
+        $export_line_count++;
+        my (@s1, @s2);
+        &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
+        if ($is_paired) {
+            my $read2line = <$fh2>;
+            if(not $read2line){
+                die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
+            }
+            &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
+
+            if (@s1 && @s2) { # then set mate coordinate
+                if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
+                    die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
+                }
+
+                my $isize = 0;
+                if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
+                    my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
+                    my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
+                    $isize = $x2 - $x1;
+                }
+
+                # fill each read's mate columns from the other read
+                foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
+                    my ($sa,$sb,$is) = @{$_};
+                    if ($sb->[SAM_RNAME] ne '*') {
+                        $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
+                        $sa->[SAM_MPOS] = $sb->[SAM_POS];
+                        $sa->[SAM_ISIZE] = $is;
+                        $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
+                    } else {
+                        $sa->[SAM_FLAG] |= 0x8;
+                    }
+                }
+            }
+        }
+        print join("\t", @s1), "\n" if (@s1);
+        print join("\t", @s2), "\n" if (@s2 && $is_paired);
+    }
+    close($fh1);
+    if($is_paired) {
+        # any line left in read2 means the files are out of step
+        while(my $read2line = <$fh2>){
+            $export_line_count++;
+            die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
+        }
+        close($fh2);
+    }
+}
+
+# Convert one export record into SAM columns.
+#
+#   $line        - raw export line
+#   $line_no     - its line number (for error messages)
+#   $s           - array ref; emptied, then filled with the SAM columns and tags
+#   $ct          - array ref: quality conversion table indexed by character code
+#   $is_paired   - true when a read2 file is being processed as well
+#   $read_no     - 1 or 2: which export file the line came from
+#   $is_nofilter - true to keep reads that failed the filter (flag 0x200 is set)
+#
+# @$s is left empty when the read failed the filter and $is_nofilter is
+# false. Malformed records end the program through die().
+#
+# Fix over the archived text: "pack('C*', at convqual)" (a syntax error, the
+# sigil was replaced by " at ") is restored to "@convqual".
+sub export2sam_aux {
+    my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
+    chomp($line);
+    my @t = split("\t", $line);
+    if(scalar(@t) < EXPORT_SIZE) {
+        my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n";
+        $msg.="\t...erroneous export record:\n" . $line . "\n\n";
+        die($msg);
+    }
+    @$s = ();
+    my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
+    return if(not ($isPassFilt or $is_nofilter));
+    # read name; int() drops leading zeros from the numeric parts
+    my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");
+    $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),
+                           int($t[EXPORT_X]), int($t[EXPORT_Y]));
+    # initial flag (will be updated later)
+    $s->[SAM_FLAG] = 0;
+    if($is_paired) {
+        if($t[EXPORT_READNO] != $read_no){
+            die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
+        }
+        # paired bit plus 0x40 for read 1 or 0x80 for read 2
+        $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
+    }
+    $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
+
+    # read & quality
+    my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
+    if ($is_export_rev) { # then reverse the sequence and quality
+        $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
+        $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
+        $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
+    } else {
+        $s->[SAM_SEQ] = $t[EXPORT_READ];
+        $s->[SAM_QUAL] = $t[EXPORT_QUAL];
+    }
+    my @convqual = ();
+    foreach (unpack('C*', $s->[SAM_QUAL])){
+        my $val=$ct->[$_];
+        if(not defined $val){
+            my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
+            if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
+            die($msg . "\n");
+        }
+        push @convqual,$val;
+    }
+
+    $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
+
+
+    # coor
+    my $has_coor = 0;
+    $s->[SAM_RNAME] = "*";
+    if (($t[EXPORT_CHROM] eq 'NM') or
+        ($t[EXPORT_CHROM] eq 'QC') or
+        ($t[EXPORT_CHROM] eq 'RM') or
+        ($t[EXPORT_CHROM] eq 'CONTROL')) {
+        $s->[SAM_FLAG] |= 0x4; # unmapped
+        push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');
+    } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
+        $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
+        push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
+    } elsif ($t[EXPORT_POS] < 1) {
+        $s->[SAM_FLAG] |= 0x4; # unmapped
+    } else {
+        $s->[SAM_RNAME] = $t[EXPORT_CHROM];
+        $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
+        $has_coor = 1;
+    }
+    $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
+
+    my $matchDesc = '';
+    $s->[SAM_CIGAR] = "*";
+    if($has_coor){
+        $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
+
+        if($matchDesc =~ /\^/){
+            # construct CIGAR string using Richard's function
+            $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
+        } else {
+            $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
+        }
+    }
+
+    $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
+    if($has_coor){
+        my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
+        my $pemap = 0;
+        if($is_paired) {
+            $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
+
+            # set `proper pair' bit if non-blank, non-zero PE alignment score:
+            $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
+        }
+        $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
+    } else {
+        $s->[SAM_MAPQ] = 0;
+    }
+    # mate coordinate
+    $s->[SAM_MRNM] = '*';
+    $s->[SAM_MPOS] = 0;
+    $s->[SAM_ISIZE] = 0;
+    # aux
+    push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
+    if($has_coor){
+        # The export match descriptor differs slightly from the samtools match descriptor.
+        # In order for the converted SAM files to be as compliant as possible,
+        # we put the export match descriptor in optional field 'XD' rather than 'MD':
+        push(@$s, "XD:Z:$matchDesc");
+        push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
+        push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
+    }
+}
+
+
+
+#
+# the following code is taken from Richard Shaw's sorted2sam.pl file
+#
+# Reverse-complement an export match descriptor.
+#
+# The string is reversed character by character, bases are complemented
+# and the indel delimiters '^' and '$' are swapped. Reversing the string
+# also reverses every run of digits, so each run is reversed once more to
+# restore the original numbers.
+sub reverse_compl_match_descriptor($)
+{
+    my ($desc) = @_;
+
+    my $flipped = scalar reverse($desc);
+    $flipped =~ tr/ACGT\^\$/TGCA\$\^/;
+
+    # put the digits of each number back in reading order
+    $flipped =~ s/(\d+)/scalar reverse($1)/ge;
+
+    return $flipped;
+}
+
+
+
+# Build a CIGAR string from an export match descriptor.
+#
+# The descriptor is cut at every "^...$" group:
+#   ^<bases>$   -> deletion of that many bases   (<n>D)
+#   ^<number>$  -> insertion of <number> bases   (<number>I)
+#   other text  -> match of the length reported by match_desc_frag_length()
+# Empty pieces (and a piece that is just "0") are skipped.
+sub match_desc_to_cigar($)
+{
+    my ($desc) = @_;
+    my @ops;
+
+    foreach my $piece (split(/(\^.*?\$)/, $desc)) {
+        next unless ($piece);
+
+        if ($piece =~ /^\^([ACGTN]+)\$$/) {
+            push @ops, length($1) . 'D';
+        } elsif ($piece =~ /^\^(\d+)\$$/) {
+            push @ops, $1 . 'I';
+        } else {
+            push @ops, match_desc_frag_length($piece) . 'M';
+        }
+    }
+
+    return join('', @ops);
+}
+
+
+#------------------------------------------------------------------------------
+
+# Number of read bases covered by an indel-free match descriptor fragment.
+#
+# The fragment is cut into runs of the letters ACGTN and the text between
+# them. A piece containing digits contributes the first number found in
+# it; any other piece contributes its length.
+sub match_desc_frag_length($)
+{
+    my ($frag) = @_;
+    my $total = 0;
+
+    for my $piece (grep { $_ ne '' } split(/([ACGTN]+)/, $frag)) {
+        if ($piece =~ /(\d+)/) {
+            $total += $1;
+        } else {
+            $total += length($piece);
+        }
+    }
+
+    return $total;
+}
+
+
+# argument holds the command line
+# Return the SAM header text: a single @PG line carrying the program name
+# (ID), its version (VN) and the command line (CL).
+sub write_header($;$;$)
+{
+    my ($progname, $version, $cl) = @_;
+    return "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
+}
diff --git a/samtools-0.1.19/misc/interpolate_sam.pl b/samtools-0.1.19/misc/interpolate_sam.pl
new file mode 100755
index 0000000..6cd6831
--- /dev/null
+++ b/samtools-0.1.19/misc/interpolate_sam.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+use strict;
+
+###Builds interpolated pileup from SAM file
+##@description counts bases between paired ends and piles up single end reads.
+##@output, uses a #header for the RNAME and then the number of reads per base
+##@author sm8 at sanger.ac.uk, Stephen B. Montgomery
+
+##@caveats
+##Requires RNAME to have format as per example
+## chromosome:NCBI36:18:1:76117153:1
+## supercontig::NT_113883:1:137703:1
+## clone::AC138827.3:1:149397:1
+##Expects simple CIGAR characters, M, I and D
+##Expects SAM file to be sorted.
+##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77)
+
+##Verify and read in SAM file
+## Script body: reads the SAM file named by the first argument and prints,
+## per reference sequence, a "#<RNAME>" line followed by one line per base
+## holding the number of reads/inserts open at that base, then a blank line.
+## NOTE(review): no check for '@' header lines is visible here; presumably
+## the input is expected to contain alignment lines only -- confirm.
+my $sam_file = $ARGV[0];
+if(!defined($sam_file)) { die("No sam file defined on arg 1"); }
+unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); }
+open(SAM, $sam_file) || die("Cannot open sam file");
+
+##Globals
+my $current_location = ""; ##Current RNAME being processed
+my $current_size = 0; ##Size of sequence region being processed
+my $current_position = 1; ##Current base being processed
+my $open = 0; ##Number of open reads (PE reads that have not been closed)
+my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the
+ ##contained value from those open and deletes the indexed position from the hash
+
+while (my $line = <SAM>) {
+ my @tokens = split /\t/, $line;
+ ##tokens used below: [1] flag, [2] RNAME, [3] position, [5] CIGAR
+
+ if ($current_location ne $tokens[2]) { ##Start a new sequence region
+ for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region
+ if (defined($close{$i})) {
+ $open = $open - $close{$i};
+ delete $close{$i};
+ }
+ print $open . "\n";
+ }
+ if ($current_location ne "") {
+ print "\n";
+ }
+
+ ##Initiate a new sequence region
+ ##The region size is the fifth ':'-separated part of RNAME (see caveats above)
+ my @location_tokens = split /:/, $tokens[2];
+ $current_position = 1;
+ $current_location = $tokens[2];
+ $current_size = $location_tokens[4];
+ $open = 0;
+ %close = ();
+ print "#" . $tokens[2] . "\n";
+
+ ##Print pileup to just before the first read (will be 0)
+ ##The loop variable is a new lexical that shadows the global only inside the loop
+ for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) {
+ print $open . "\n";
+ }
+ $current_position = $tokens[3];
+
+ } else { ##Sequence region already open
+ if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position
+ ##cycle through to catch up to the current position
+ for (my $i = $current_position; $i < $tokens[3]; $i++) {
+ if (defined($close{$i})) {
+ $open = $open - $close{$i};
+ delete $close{$i};
+ }
+ print $open . "\n";
+ }
+ $current_position = $tokens[3];
+ }
+ }
+ $open++; ##Increment the number of open reads
+
+ ##A closing position is the first base after the read: start + M + D
+ if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition
+ ##undo the increment above: this read closes the insert opened by its mate
+ $open--;
+ my $parsed_cig = &parseCigar($tokens[5]);
+ my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
+ if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
+ $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
+ } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition
+ my $parsed_cig = &parseCigar($tokens[5]);
+ my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
+ if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
+ $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
+ } else {
+ #do nothing
+ }
+}
+for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region
+ if (defined($close{$i})) {
+ $open = $open - $close{$i};
+ delete $close{$i};
+ }
+ print $open . "\n";
+}
+print "\n";
+close(SAM);
+exit(0);
+
+##reads and tokenizes simple cigarline
+##Tokenizes a simple CIGAR string and sums the lengths per operation.
+##Returns a hash ref with the totals under 'M', 'I' and 'D' and, under
+##'events', an array ref of { t => operation, n => length } in input order.
+##Any operation other than M, D or I is counted as M (its own key is still
+##created with the value 0). The sub is declared with an empty prototype,
+##so it has to be called as &parseCigar(...).
+sub parseCigar() {
+    my $cigar = shift;
+    # a tab after every operation letter turns the string into a token list
+    $cigar =~ s/([0-9]*[A-Z]{1})/$1\t/g;
+
+    my %totals = ('M' => 0, 'I' => 0, 'D' => 0);
+    my @events;
+    foreach my $piece (split /\t/, $cigar) {
+        next unless ($piece =~ /([0-9]+)([A-Z]{1})/g);
+        my ($count, $op) = ($1, $2);
+        $totals{$op} = 0 unless defined($totals{$op});
+        $op = "M" unless ($op eq "M" || $op eq "D" || $op eq "I");
+        $totals{$op} += $count;
+        push @events, { "t" => $op, "n" => $count };
+    }
+
+    $totals{'events'} = \@events;
+    return \%totals;
+}
diff --git a/samtools-0.1.19/misc/maq2sam.c b/samtools-0.1.19/misc/maq2sam.c
new file mode 100644
index 0000000..2bfbe2a
--- /dev/null
+++ b/samtools-0.1.19/misc/maq2sam.c
@@ -0,0 +1,173 @@
+#include <string.h>
+#include <zlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define PACKAGE_VERSION "r439"
+
+//#define MAQ_LONGREADS
+
+#ifdef MAQ_LONGREADS
+# define MAX_READLEN 128
+#else
+# define MAX_READLEN 64
+#endif
+
+#define MAX_NAMELEN 36
+#define MAQMAP_FORMAT_OLD 0
+#define MAQMAP_FORMAT_NEW -1
+
+#define PAIRFLAG_FF 0x01
+#define PAIRFLAG_FR 0x02
+#define PAIRFLAG_RF 0x04
+#define PAIRFLAG_RR 0x08
+#define PAIRFLAG_PAIRED 0x10
+#define PAIRFLAG_DIFFCHR 0x20
+#define PAIRFLAG_NOMATCH 0x40
+#define PAIRFLAG_SW 0x80
+
+typedef struct
+{
+ uint8_t seq[MAX_READLEN]; /* per base: top 2 bits index "ACGT", low 6 bits are the quality, 0 means N; the last base is the single-end mapping quality. */
+ uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual; /* size: read length; flag: PAIRFLAG_* bits */
+ uint32_t seqid, pos; /* seqid: index into ref_name[]; pos: (0-based coordinate << 1) | reverse-strand bit */
+ int dist; /* printed by maq2tam_core as the SAM insert-size column */
+ char name[MAX_NAMELEN]; /* read name */
+} maqmap1_t;
+
+typedef struct
+{
+ int format, n_ref; /* format marker (MAQMAP_FORMAT_*) and number of reference sequences */
+ char **ref_name; /* n_ref heap-allocated reference names */
+ uint64_t n_mapped_reads; /* read count stored in the header */
+ maqmap1_t *mapped_reads; /* freed by maq_delete_maqmap(); never allocated in this file */
+} maqmap_t;
+
+maqmap_t *maq_new_maqmap() /* Allocate a zero-filled maqmap_t tagged with the current format marker. */
+{
+ maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t));
+ mm->format = MAQMAP_FORMAT_NEW; /* NOTE(review): the calloc() result is dereferenced without a NULL check */
+ return mm;
+}
+void maq_delete_maqmap(maqmap_t *mm) /* Free a map and everything it owns; a null pointer is accepted. */
+{
+ int i;
+ if (mm == 0) return;
+ for (i = 0; i < mm->n_ref; ++i)
+ free(mm->ref_name[i]); /* each name was allocated separately */
+ free(mm->ref_name);
+ free(mm->mapped_reads);
+ free(mm);
+}
+maqmap_t *maqmap_read_header(gzFile fp) /* Read the map header (format tag, reference names, read count); exits with status 3 if it is obsolete, truncated or corrupted. */
+{
+ maqmap_t *mm = maq_new_maqmap();
+ int k, len;
+ if (gzread(fp, &mm->format, sizeof(int)) != sizeof(int)) goto bad_header; /* also catches a null/failed stream (gzread returns -1) */
+ if (mm->format > 0) {
+ fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n");
+ exit(3);
+ }
+ assert(mm->format == MAQMAP_FORMAT_NEW);
+ if (gzread(fp, &mm->n_ref, sizeof(int)) != sizeof(int) || mm->n_ref < 0) goto bad_header;
+ mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*));
+ for (k = 0; k != mm->n_ref; ++k) {
+ if (mm->ref_name == 0 || gzread(fp, &len, sizeof(int)) != sizeof(int) || len <= 0) goto bad_header; /* len comes from the file: never allocate a non-positive size */
+ mm->ref_name[k] = (char*)malloc(len * sizeof(char));
+ if (mm->ref_name[k] == 0 || gzread(fp, mm->ref_name[k], len) != len) goto bad_header;
+ mm->ref_name[k][len-1] = '\0'; /* names are later printed with %s, so force termination */
+ }
+ /* read number of mapped reads */
+ if (gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t)) == sizeof(uint64_t)) return mm;
+bad_header:
+ fprintf(stderr, "** Fail to read the map header: truncated or corrupted file, or out of memory.\n");
+ exit(3);
+}
+
+void maq2tam_core(gzFile fp, const char *rg) /* Convert every alignment record of a maq .map stream into one SAM line on stdout; rg, when non-null, is emitted as an RG:Z: tag. */
+{
+ maqmap_t *mm;
+ maqmap1_t mm1, *m1;
+ int ret;
+ m1 = &mm1;
+ mm = maqmap_read_header(fp);
+ while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) {
+ int j, flag = m1->flag? 1 : 0, se_mapq = m1->seq[MAX_READLEN-1];
+ m1->name[MAX_NAMELEN-1] = '\0'; /* the name comes straight from the file: keep strlen() and %s inside the buffer */
+ if (mm->n_ref <= 0 || m1->seqid >= (uint32_t)mm->n_ref || m1->size > MAX_READLEN) { fprintf(stderr, "Corrupted record for read '%s'. Abort!\n", m1->name); ret = 0; break; } /* would index past ref_name[] or seq[] */
+ if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2;
+ if (m1->flag == 192) flag |= 4;
+ if (m1->flag == 64) flag |= 8;
+ if (m1->pos&1) flag |= 0x10;
+ if ((flag&1) && m1->dist != 0) {
+ int c;
+ if (m1->dist > 0) {
+ if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0;
+ else if (m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1;
+ else c = m1->pos&1;
+ } else {
+ if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0;
+ else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1;
+ else c = m1->pos&1;
+ }
+ if (c) flag |= 0x20;
+ }
+ if (m1->flag) {
+ int l = strlen(m1->name);
+ if (l >= 2 && m1->name[l-2] == '/') { /* l >= 2: do not read before the start of a short name */
+ flag |= (m1->name[l-1] == '1')? 0x40 : 0x80;
+ m1->name[l-2] = '\0';
+ }
+ }
+ printf("%s\t%d\t", m1->name, flag);
+ printf("%s\t%d\t", mm->ref_name[m1->seqid], (m1->pos>>1)+1);
+ if (m1->flag == 130) {
+ int c = (int8_t)m1->seq[MAX_READLEN-1];
+ printf("%d\t", m1->alt_qual);
+ if (c == 0) printf("%dM\t", m1->size);
+ else {
+ if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c);
+ else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual);
+ }
+ se_mapq = 0; // zero SE mapQ for reads aligned by SW
+ } else {
+ if (flag&4) printf("0\t*\t");
+ else printf("%d\t%dM\t", m1->map_qual, m1->size);
+ }
+ printf("*\t0\t%d\t", m1->dist);
+ for (j = 0; j != m1->size; ++j) {
+ if (m1->seq[j] == 0) putchar('N');
+ else putchar("ACGT"[m1->seq[j]>>6&3]);
+ }
+ putchar('\t');
+ for (j = 0; j != m1->size; ++j)
+ putchar((m1->seq[j]&0x3f) + 33);
+ putchar('\t');
+ if (rg) printf("RG:Z:%s\t", rg);
+ if (flag&4) { // unmapped
+ printf("MF:i:%d\n", m1->flag);
+ } else {
+ printf("MF:i:%d\t", m1->flag);
+ if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq);
+ printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]);
+ }
+ }
+ if (ret > 0) fprintf(stderr, "Truncated! Continue anyway.\n"); /* a partial trailing record was read */
+ maq_delete_maqmap(mm);
+}
+
+int main(int argc, char *argv[]) /* maq2sam <in.map> [<readGroup>]: "-" reads the map from stdin; returns 1 on usage or open failure. */
+{
+ gzFile fp;
+ if (argc == 1) {
+ fprintf(stderr, "Version: %s\nUsage: maq2sam <in.map> [<readGroup>]\n", PACKAGE_VERSION);
+ return 1;
+ }
+ fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) { fprintf(stderr, "maq2sam: fail to open '%s'\n", argv[1]); return 1; } /* previously an unreadable input produced no output and exit status 0 */
+ maq2tam_core(fp, argc > 2? argv[2] : 0);
+ gzclose(fp);
+ return 0;
+}
diff --git a/samtools-0.1.19/misc/md5.c b/samtools-0.1.19/misc/md5.c
new file mode 100644
index 0000000..55ae181
--- /dev/null
+++ b/samtools-0.1.19/misc/md5.c
@@ -0,0 +1,296 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ */
+
+/* Brutally hacked by John Walker back from ANSI C to K&R (no
+ prototypes) to maintain the tradition that Netfone will compile
+ with Sun's original "cc". */
+
+#include <string.h>
+#include "md5.h"
+
+#ifndef HIGHFIRST
+#define byteReverse(buf, len) /* Nothing */
+#else
+/*
+ * Note: this code is harmless on little-endian machines.
+ */
+void byteReverse(buf, longs) /* Rewrite `longs` 4-byte groups in place: each is read as a little-endian value and stored as a native uint32_t. */
+ unsigned char *buf; unsigned longs;
+{
+ uint32_t t;
+ do { /* do/while: at least one word is always converted, so longs must be >= 1 */
+ t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
+ ((unsigned) buf[1] << 8 | buf[0]);
+ *(uint32_t *) buf = t;
+ buf += 4;
+ } while (--longs);
+}
+#endif
+
+void MD5Transform(uint32_t buf[4], uint32_t in[16]);
+
+
+/*
+ * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(ctx) /* Reset ctx so that a new digest can be accumulated. */
+ struct MD5Context *ctx;
+{
+ ctx->buf[0] = 0x67452301; /* the four initial state words */
+ ctx->buf[1] = 0xefcdab89;
+ ctx->buf[2] = 0x98badcfe;
+ ctx->buf[3] = 0x10325476;
+
+ ctx->bits[0] = 0; /* 64-bit count of processed bits, low word first */
+ ctx->bits[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(ctx, buf, len) /* Feed len bytes from buf into the digest held in ctx. */
+ struct MD5Context *ctx; unsigned char *buf; unsigned len;
+{
+ uint32_t t;
+
+ /* Update bitcount */
+
+ t = ctx->bits[0];
+ if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
+ ctx->bits[1]++; /* Carry from low to high */
+ ctx->bits[1] += len >> 29; /* the bits of len*8 that do not fit in the low word */
+
+ t = (t >> 3) & 0x3f; /* Bytes already buffered in ctx->in */
+
+ /* Handle any leading odd-sized chunks */
+
+ if (t) {
+ unsigned char *p = (unsigned char *) ctx->in + t;
+
+ t = 64 - t; /* free space left in the partially filled block */
+ if (len < t) {
+ memcpy(p, buf, len); /* still not a full block: just buffer the data */
+ return;
+ }
+ memcpy(p, buf, t);
+ byteReverse(ctx->in, 16);
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+ buf += t;
+ len -= t;
+ }
+ /* Process data in 64-byte chunks */
+
+ while (len >= 64) {
+ memcpy(ctx->in, buf, 64);
+ byteReverse(ctx->in, 16);
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+ buf += 64;
+ len -= 64;
+ }
+
+ /* Handle any remaining bytes of data. */
+
+ memcpy(ctx->in, buf, len); /* fewer than 64 bytes remain; they wait for the next call */
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void MD5Final(digest, ctx) /* Pad the message, append its bit length, write the 16-byte digest and wipe the whole context. */
+ unsigned char digest[16]; struct MD5Context *ctx;
+{
+ unsigned count;
+ unsigned char *p;
+
+ /* Compute number of bytes mod 64 */
+ count = (ctx->bits[0] >> 3) & 0x3F;
+
+ /* Set the first char of padding to 0x80. This is safe since there is
+ always at least one byte free */
+ p = ctx->in + count;
+ *p++ = 0x80;
+
+ /* Bytes of padding needed to make 64 bytes */
+ count = 64 - 1 - count;
+
+ /* Pad out to 56 mod 64 */
+ if (count < 8) {
+ /* Two lots of padding: Pad the first block to 64 bytes */
+ memset(p, 0, count);
+ byteReverse(ctx->in, 16);
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+
+ /* Now fill the next block with 56 bytes */
+ memset(ctx->in, 0, 56);
+ } else {
+ /* Pad block to 56 bytes */
+ memset(p, 0, count - 8);
+ }
+ byteReverse(ctx->in, 14);
+
+ /* Append length in bits and transform */
+ ((uint32_t *) ctx->in)[14] = ctx->bits[0];
+ ((uint32_t *) ctx->in)[15] = ctx->bits[1];
+
+ MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+ byteReverse((unsigned char *) ctx->buf, 4);
+ memcpy(digest, ctx->buf, 16);
+ memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive; sizeof(ctx) would clear only pointer-size bytes */
+}
+
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, data, s) \
+ ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x )
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+void MD5Transform(buf, in) /* Mix one 16-word block `in` into the 4-word state `buf` (four groups of 16 MD5STEP applications). */
+ uint32_t buf[4]; uint32_t in[16];
+{
+ register uint32_t a, b, c, d;
+
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); /* first group: F1, words taken in order */
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); /* second group: F2 */
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); /* third group: F3 */
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); /* fourth group: F4 */
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ buf[0] += a; /* fold the mixed values back into the running state */
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+
+/* lh3: the following code is added by me */
+
+#ifdef MD5SUM_MAIN
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define HEX_STR "0123456789abcdef"
+
+static void md5_one(const char *fn) /* Print the MD5 digest of file fn ("-" means stdin) followed by its name; exits with status 1 if fn cannot be opened. */
+{
+ unsigned char buf[4096], digest[16];
+ MD5_CTX md5;
+ int l;
+ FILE *fp;
+
+ fp = strcmp(fn, "-")? fopen(fn, "rb") : stdin; /* "rb": a digest must see the raw bytes, text mode may translate them */
+ if (fp == 0) {
+ fprintf(stderr, "md5sum: %s: No such file or directory\n", fn);
+ exit(1);
+ }
+ MD5Init(&md5);
+ while ((l = fread(buf, 1, 4096, fp)) > 0)
+ MD5Update(&md5, buf, l);
+ MD5Final(digest, &md5);
+ if (fp != stdin) fclose(fp);
+ for (l = 0; l < 16; ++l)
+ printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); /* two lowercase hex digits per byte */
+ printf(" %s\n", fn);
+}
+int main(int argc, char *argv[]) /* Digest every file named on the command line, or stdin when none is given. */
+{
+ int i;
+ if (argc == 1) md5_one("-");
+ else for (i = 1; i < argc; ++i) md5_one(argv[i]);
+ return 0;
+}
+#endif
diff --git a/samtools-0.1.19/misc/md5.h b/samtools-0.1.19/misc/md5.h
new file mode 100644
index 0000000..44121e4
--- /dev/null
+++ b/samtools-0.1.19/misc/md5.h
@@ -0,0 +1,57 @@
+/*
+ This file is adapted from a program in this page:
+
+ http://www.fourmilab.ch/md5/
+
+ The original source code does not work on 64-bit machines due to the
+ wrong typedef "uint32". I also added prototypes.
+
+ -lh3
+ */
+
+#ifndef MD5_H
+#define MD5_H
+
+/* The following tests optimise behaviour on little-endian
+ machines, where there is no need to reverse the byte order
+ of 32 bit words in the MD5 computation. By default,
+ HIGHFIRST is defined, which indicates we're running on a
+ big-endian (most significant byte first) machine, on which
+ the byteReverse function in md5.c must be invoked. However,
+ byteReverse is coded in such a way that it is an identity
+ function when run on a little-endian machine, so calling it
+ on such a platform causes no harm apart from wasting time.
+ If the platform is known to be little-endian, we speed
+ things up by undefining HIGHFIRST, which defines
+ byteReverse as a null macro. Doing things in this manner
+ ensures we work on new platforms regardless of their byte
+ order. */
+
+#define HIGHFIRST
+
+#if __LITTLE_ENDIAN__ != 0
+#undef HIGHFIRST
+#endif
+
+#include <stdint.h>
+
+struct MD5Context {
+ uint32_t buf[4]; /* running digest state; copied out as the 16-byte digest by MD5Final */
+ uint32_t bits[2]; /* number of message bits processed so far, low word first */
+ unsigned char in[64]; /* input block being filled before it is transformed */
+};
+
+void MD5Init(struct MD5Context *ctx);
+void MD5Update(struct MD5Context *ctx, unsigned char *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *ctx);
+
+/*
+ * This is needed to make RSAREF happy on some MS-DOS compilers.
+ */
+typedef struct MD5Context MD5_CTX;
+
+/* Define CHECK_HARDWARE_PROPERTIES to have main.c verify
+ byte order and uint32_t settings. */
+#define CHECK_HARDWARE_PROPERTIES
+
+#endif /* !MD5_H */
diff --git a/samtools-0.1.19/misc/md5fa.c b/samtools-0.1.19/misc/md5fa.c
new file mode 100644
index 0000000..7a165bf
--- /dev/null
+++ b/samtools-0.1.19/misc/md5fa.c
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <zlib.h>
+#include "md5.h"
+#include "kseq.h"
+
+#define HEX_STR "0123456789abcdef"
+
+KSEQ_INIT(gzFile, gzread)
+
+static void md5_one(const char *fn) /* Print an MD5 per sequence of FASTA/FASTQ file fn ("-" = stdin), then an order-dependent and an order-independent digest of the whole file. */
+{
+ MD5_CTX md5_one, md5_all;
+ int l, i, k;
+ gzFile fp;
+ kseq_t *seq;
+ unsigned char unordered[16] = {0}, digest[16]; /* unordered: XOR of all per-sequence digests */
+
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) {
+ fprintf(stderr, "md5fa: %s: No such file or directory\n", fn);
+ exit(1);
+ }
+
+ MD5Init(&md5_all);
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ for (i = k = 0; i < seq->seq.l; ++i) { /* keep letters only, upper-cased, compacting in place */
+ if (islower((unsigned char)seq->seq.s[i])) seq->seq.s[k++] = toupper((unsigned char)seq->seq.s[i]); /* cast: ctype functions are undefined for negative char values */
+ else if (isupper((unsigned char)seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i];
+ }
+ MD5Init(&md5_one);
+ MD5Update(&md5_one, (unsigned char*)seq->seq.s, k);
+ MD5Final(digest, &md5_one);
+ for (l = 0; l < 16; ++l) {
+ printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
+ unordered[l] ^= digest[l];
+ }
+ printf(" %s %s\n", fn, seq->name.s);
+ MD5Update(&md5_all, (unsigned char*)seq->seq.s, k);
+ }
+ MD5Final(digest, &md5_all);
+ kseq_destroy(seq);
+ gzclose(fp); /* the stream was previously left open for every file processed */
+ for (l = 0; l < 16; ++l)
+ printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
+ printf(" %s >ordered\n", fn);
+ for (l = 0; l < 16; ++l)
+ printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]);
+ printf(" %s >unordered\n", fn);
+}
+
+int main(int argc, char *argv[]) /* Process every file named on the command line, or stdin when none is given. */
+{
+ int i;
+ if (argc == 1) md5_one("-");
+ else for (i = 1; i < argc; ++i) md5_one(argv[i]);
+ return 0;
+}
diff --git a/samtools-0.1.19/misc/novo2sam.pl b/samtools-0.1.19/misc/novo2sam.pl
new file mode 100755
index 0000000..8b53c9e
--- /dev/null
+++ b/samtools-0.1.19/misc/novo2sam.pl
@@ -0,0 +1,281 @@
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.3
+
+#Modified by Zayed Albertyn(zayed.albertyn at gmail.com) & Colin Hercus(colin at novocraft.com)
+
+#use strict;
+#use warnings;
+use Data::Dumper;
+use Getopt::Std;
+
+&novo2sam; # run the converter; called with & and no parentheses, so the current @_ is passed through
+exit;
+
+sub mating { # Cross-fill the mate fields (columns 6..8 and the mate flag bits) of two SAM records passed as array refs
+ my ($s1, $s2) = @_;
+ my $isize = 0;
+ if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
+ my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; # reverse strand: position plus read length
+ my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
+ $isize = $x2 - $x1;
+ }
+ # update mate coordinate
+ if ($s2->[2] ne '*') {
+ @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
+ $s1->[1] |= 0x20 if ($s2->[1] & 0x10); # mate is on the reverse strand
+ } else {
+ $s1->[1] |= 0x8; # mate has no reference name
+ }
+ if ($s1->[2] ne '*') {
+ @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
+ $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
+ } else {
+ $s2->[1] |= 0x8;
+ }
+}
+
+sub novo2sam { # Main loop: convert each input line and pair up consecutive records that carry the same read name
+ my %opts = ();
+ getopts("p", \%opts);
+ die("Usage: novo2sam.pl [-p] <aln.novo>\n") if (@ARGV == 0);
+ my $is_paired = defined($opts{p});
+ # core loop
+ my @s1 = ();
+ my @s2 = ();
+ my ($s_last, $s_curr) = (\@s1, \@s2); # two buffers that are swapped instead of copied
+ while (<>) {
+ next if (/^#/);
+ next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/); # skip lines ending in QC, NM or "R <number>"
+ &novo2sam_aux($_, $s_curr, $is_paired);
+ if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { # same read name: the two records are mates
+ &mating($s_last, $s_curr);
+ print join("\t", @$s_last), "\n";
+ print join("\t", @$s_curr), "\n";
+ @$s_last = (); @$s_curr = ();
+ } else {
+ print join("\t", @$s_last), "\n" if (@$s_last != 0); # the previous record had no mate
+ my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
+ }
+ }
+ print join("\t", @$s_last), "\n" if (@$s_last != 0); # flush the last pending record
+}
+
+sub novo2sam_aux { # Fill @$s with the SAM fields of one input line; @$s is left empty unless column 4 is 'U'
+ my ($line, $s, $is_paired) = @_;
+
+ chomp($line);
+ my @t = split(/\s+/, $line);
+ my @variations = @t[13 .. $#t]; # every column from index 13 on describes one difference to the reference
+ @$s = ();
+ return if ($t[4] ne 'U');
+ my $len = length($t[2]);
+ # read name
+ $s->[0] = substr($t[0], 1);
+ $s->[0] =~ s/\/[12]$//g;
+ # initial flag (will be updated later)
+ $s->[1] = 0;
+ $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7);
+ $s->[1] |= 2 if ($t[10] eq '.');
+ # read & quality
+ if ($t[9] eq 'R') {
+ $s->[9] = reverse($t[2]);
+ $s->[10] = reverse($t[3]);
+ $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
+ } else {
+ $s->[9] = $t[2]; $s->[10] = $t[3];
+ }
+ # cigar
+ my $cigarstring ="";
+ if (scalar @variations ==0 ) {
+ $s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment
+ } else {
+ #convert to correct CIGAR
+ my $tmpstr = join" ",@variations ; # was "at variations": the list archive had rewritten the @ sigil
+ if ( $tmpstr=~ /\+|\-/ ) {
+ $cigarstring = cigar_method($line,\@variations,$len);
+ $s->[5]=$cigarstring;
+ } else {
+ $s->[5]=$len. "M";
+ }
+}
+
+# coor
+ $s->[2] = substr($t[7], 1); $s->[3] = $t[8];
+ $s->[1] |= 0x10 if ($t[9] eq 'R');
+ # mapQ
+ $s->[4] = $t[5] > $t[6]? $t[5] : $t[6];
+ # mate coordinate
+ $s->[6] = '*'; $s->[7] = $s->[8] = 0;
+ # aux
+ push(@$s, "NM:i:".(@t-13));
+ my $md = '';
+ $md = mdtag($md,$line,\@variations,$len);
+ push(@$s, "MD:Z:$md");
+
+}
+
+sub mdtag { # Build the MD tag string from the variation list (array ref) and the read length; $oldmd and $line are only used by the commented-out debug prints
+ my $oldmd = shift;
+ my $line = shift;
+ my $ref =shift;
+ my $rdlen = shift;
+ my @variations = @$ref;
+ my $string="";
+ my $mdtag="";
+ my $t=1; # next position on the reference side
+ my $q=1; # next position on the read side
+ my $deleteflag=0; # 1 while consecutive deleted bases are being appended
+ my $len =0;
+ foreach $string (@variations) {
+ my ($indeltype,$insert) = indeltype($string);
+ if ($indeltype eq "+") {
+ $len = length ($insert);
+ $q+=$len;
+ next;
+ }
+ my ($pos) = $string =~ /^(\d+)/; # list-context match; "my $x = ... if COND" has undefined behaviour
+ $len = $pos - $t;
+ if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) {
+ $mdtag.=$len;
+ }
+ $t+=$len;
+ $q+=$len;
+ if ($indeltype eq ">") {
+ $mdtag.=$insert;
+ $deleteflag=0;
+ $t+=1;
+ $q+=1;
+ }
+ if ($indeltype eq "-") {
+ my (undef, $deletedbase) = $string =~ /(\d+)\-([A-Za-z]+)/; # second capture only, again without "my ... if"
+ if ($deleteflag == 0 ) {
+ $mdtag.="^";
+ }
+ $mdtag.=$deletedbase;
+ $deleteflag=1;
+ $t+=1;
+ }
+ }
+ $len = $rdlen - $q + 1; # matching bases left after the last variation
+ if ($len > 0) {
+ $mdtag.="$len";
+ }
+# print "In:$line\n";
+# print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n";
+
+ return $mdtag;
+}
+
+sub indeltype { # Classify one variation string; returns (type, letters) with type ">", "-" or "+", or undef when nothing matches
+ my $string = shift;
+ my $insert="";
+ my $indeltype;
+ if ($string =~ /([A-Za-z]+)\>/) {
+ $indeltype=">";
+ $insert=$1; # the letters in front of ">"
+ } elsif ($string =~ /\-/) {
+ $indeltype="-"; # $insert stays empty for this type
+ } elsif ($string =~ /\+([A-Za-z]+)/) {
+ $indeltype="+";
+ $insert=$1; # the letters after "+"
+ }
+ return ($indeltype,$insert);
+
+}
+
+
+sub cigar_method { # Build a CIGAR string from the indel entries of the variation list (array ref) and the read length; $line is only used by the commented-out debug prints
+ my $line = shift;
+ my $ref =shift;
+ my $rdlen = shift;
+ my @variations = @$ref;
+ my $string="";
+ my $type="";
+ my $t =1; # next position on the reference side
+ my $q=1; # next position on the read side
+ my $indeltype="";
+ my $cigar= "";
+ my $insert = "";
+ my $len=0;
+ my @cig=();
+ foreach $string (@variations) {
+ next if $string =~ />/; # substitutions do not change the CIGAR
+ my ($pos) = $string =~ /^(\d+)/; # list-context match; "my $x = ... if COND" has undefined behaviour
+
+ if ($string =~ /\+([A-Za-z]+)/) {
+ $indeltype="+";
+ $insert = $1;
+ }elsif ($string =~ /\-([A-Za-z]+)/) {
+ $indeltype="-";
+ $insert = $1;
+ }
+#print "$pos $indeltype $insert $t $q\n";
+ $len = $pos - $t;
+ if ( $len > 0) {
+ $cigar.=$len."M";
+ push(@cig,$len."M");
+ }
+ $t+=$len;
+ $q+=$len;
+
+ if ($indeltype eq "-") {
+ $cigar.="D"; # one bare letter per deleted base; newcigar() merges runs into counts
+ push(@cig,"D");
+ $t++;
+ }
+ if ($indeltype eq "+") {
+ $len = length ($insert);
+ if ($len == 1) {
+ $cigar.="I";
+ push(@cig,"I");
+ }
+ if ($len > 1) {
+ $cigar.=$len."I";
+ push(@cig,$len."I")
+ }
+ $q+=$len;
+ }
+ $insert="";
+ }
+ $len= $rdlen - $q + 1; # matching bases left after the last indel
+ if ($len > 0) {
+ $cigar.=$len."M";
+ push(@cig,$len."M");
+ }
+
+ $cigar = newcigar($cigar,'D');
+ $cigar = newcigar($cigar,'I');
+
+ #print "$line\n";
+ #print "c CIGAR:\t$cigar\n\n";
+ return $cigar;
+
+}
+
+
+
+sub newcigar { # Replace every run of bare $char letters in $cigar by "<run length>$char"
+ my $cigar = shift;
+ my $char = shift;
+ my $new = "";
+ my $copy = $cigar;
+#print "$cigar\n";
+ $copy =~ s/^($char+)/$1;/g; # fence off a run at the very start
+#print "$copy\n";
+ $copy =~ s/([^0-9$char])($char+)/$1;$2;/g; # fence off runs that do not follow a digit, i.e. have no count yet
+#print "$copy\n";
+ my @parts = split(/;/,$copy);
+ my $el="";
+ foreach $el (@parts) {
+#print "$el\n";
+ if ($el =~ /^$char+$/) {
+ $new.=length($el).$char; # a pure run becomes count + letter
+ }else {
+ $new.=$el;
+ }
+
+ }
+ return $new;
+}
diff --git a/samtools-0.1.19/misc/plot-bamcheck b/samtools-0.1.19/misc/plot-bamcheck
new file mode 100755
index 0000000..1792c6f
--- /dev/null
+++ b/samtools-0.1.19/misc/plot-bamcheck
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+#
+# Author: petr.danecek at sanger
+#
+
+use strict;
+use warnings;
+use Carp;
+
+my $opts = parse_params(); # exits by itself when only reference stats were requested
+parse_bamcheck($opts); # fills $$opts{dat}, which the plot_* calls below receive via $opts
+plot_qualities($opts);
+plot_acgt_cycles($opts);
+plot_gc($opts);
+plot_gc_depth($opts);
+plot_isize($opts);
+plot_coverage($opts);
+plot_mismatches_per_cycle($opts);
+plot_indel_dist($opts);
+plot_indel_cycles($opts);
+
+exit;
+
+#--------------------------------
+
+sub error # With arguments: die with the message and a stack trace. Without: die with the usage text.
+{
+ my (@msg) = @_;
+ if ( scalar @msg ) { confess @msg; } # confess never returns
+ die
+ "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n",
+ " plot-bamcheck -p outdir/ file.bam.bc\n",
+ "Options:\n",
+ " -k, --keep-files Do not remove temporary files.\n",
+ " -p, --prefix <path> The output files prefix, add a slash to create new directory.\n",
+ " -r, --ref-stats <file.fa.gc> Optional reference stats file with expected GC content (created with -s).\n",
+ " -s, --do-ref-stats <file.fa> Calculate reference sequence GC for later use with -r\n",
+ " -t, --targets <file.tab> Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n",
+ " -h, -?, --help This help message.\n",
+ "\n";
+}
+
+
+sub parse_params # Parse @ARGV into an options hash ref; runs do_ref_stats() and exits when -s is given
+{
+ $0 =~ s{^.+/}{}; # keep only the script's base name
+ my $opts = { args=>join(' ',$0,@ARGV) }; # was "at ARGV": the list archive had rewritten the @ sigil
+ while (defined(my $arg=shift(@ARGV)))
+ {
+ if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; }
+ if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; }
+ if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; }
+ if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; }
+ if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; }
+ if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
+ if ( -e $arg ) { $$opts{bamcheck}=$arg; next; } # any existing path is taken as the bamcheck file
+ error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
+ }
+ if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; }
+ if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") }
+ if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") }
+ if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } # a trailing slash means "directory": create it
+ elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } # otherwise make sure the prefix ends with a dash
+ return $opts;
+}
+
+
+# Creates GC stats for either the whole reference or only on target regions for exome QC
+sub do_ref_stats # Print a normalized histogram of per-line GC counts for the fasta file $$opts{do_ref_stats}
+{
+ my ($opts) = @_;
+
+
+ my %targets = (); # chr => flat list (from1,to1,from2,to2,...)
+ if ( exists($$opts{targets}) )
+ {
+ my ($prev_chr,$prev_pos);
+ open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!");
+ while (my $line=<$fh>)
+ {
+ if ( $line=~/^#/ ) { next; }
+ my ($chr,$from,$to) = split(/\s+/,$line);
+ chomp($to);
+ push @{$targets{$chr}}, $from,$to;
+ if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from }
+ if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); }
+ $prev_pos = $from;
+ }
+ close($fh);
+ }
+
+ my $_len = 60; # for now do only standard fasta's with 60 bases per line
+ my %gc_counts = (); # number of G/C bases in a line => number of such lines
+ my ($skip_chr,$pos,$ireg,$regions);
+ open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!");
+ while (my $line=<$fh>)
+ {
+ if ( $line=~/^>/ )
+ {
+ if ( !scalar %targets ) { next; }
+
+ if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); }
+ if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } # no target on this sequence: ignore its lines
+ undef $skip_chr;
+ $pos = 0;
+ $ireg = 0;
+ $regions = $targets{$1};
+ }
+ if ( defined $skip_chr ) { next; }
+
+ # Only $_len sized lines are considered and no chopping for target regions.
+ chomp($line);
+ my $len = length($line);
+ if ( $len ne $_len ) { next; }
+
+ if ( scalar %targets )
+ {
+ while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } # skip regions that end at or before this line
+ $pos += $len;
+ if ( $ireg==@$regions ) { next; }
+ if ( $pos < $$regions[$ireg] ) { next; } # the line ends before the next region starts
+ }
+
+ my $gc_count = 0;
+ for (my $i=0; $i<$len; $i++)
+ {
+ my $base = substr($line,$i,1);
+ if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; }
+ }
+ $gc_counts{$gc_count}++;
+ }
+
+ print "# Generated by $$opts{args}\n";
+ print "# The columns are: GC content bin, normalized frequency\n";
+ my $max;
+ for my $count (values %gc_counts)
+ {
+ if ( !defined $max or $count>$max ) { $max=$count; }
+ }
+ for my $gc (sort {$a<=>$b} keys %gc_counts)
+ {
+ if ( $gc==0 ) { next; } # lines without any G or C are not reported
+ printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max; # GC percentage, frequency relative to the largest bin
+ }
+}
+
+sub plot # Run gnuplot on the given script file; dies through error() on a non-zero exit status
+{
+ my ($cmdfile) = @_;
+ my $cmd = "gnuplot $cmdfile"; # NOTE(review): the file name is interpolated into a shell command unquoted
+ system($cmd);
+ if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); }
+}
+
+
+sub parse_bamcheck # Load the bamcheck file $$opts{bamcheck} into $$opts{dat}
+{
+ my ($opts) = @_;
+ open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!");
+ my $line = <$fh>;
+ if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); }
+ $$opts{dat}{version} = $1;
+ while ($line=<$fh>)
+ {
+ if ( $line=~/^#/ ) { next; }
+ my @items = split(/\t/,$line);
+ chomp($items[-1]);
+ if ( $items[0] eq 'SN' )
+ {
+ $$opts{dat}{$items[1]} = splice(@items,2); # scalar context: keeps the last of the remaining columns
+ next;
+ }
+ push @{$$opts{dat}{$items[0]}}, [splice(@items,1)]; # other sections: one array ref per line, keyed by the first column
+ }
+ close($fh);
+
+ # Check sanity
+ if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} )
+ {
+ error("Sanity check failed: no sequences found by bamcheck??\n");
+ }
+}
+
+sub older_than # Return 1 if the bamcheck version date is earlier than $version ("year-month-day"), else 0
+{
+ my ($opts,$version) = @_;
+ my ($year,$month,$day) = split(/-/,$version);
+ $version = $$opts{dat}{version};
+ if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; } # no "(Y-M-D)" suffix: treated as older
+ if ( $1<$year ) { return 1; }
+ elsif ( $1>$year ) { return 0; }
+ if ( $2<$month ) { return 1; }
+ elsif ( $2>$month ) { return 0; }
+ if ( $3<$day ) { return 1; }
+ return 0; # same day or later
+}
+
+sub get_defaults # Open the gnuplot script that belongs to $img_fname and return a hash ref with its handle and common settings
+{
+ my ($opts,$img_fname,%args) = @_;
+
+ if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); }
+
+ # Determine the gnuplot script file name
+ my $gp_file = $img_fname;
+ $gp_file =~ s{\.[^.]+$}{.gp};
+ if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; }
+
+ # Determine the default title:
+ # 5446_6/5446_6.bam.bc.gp -> 5446_6
+ # test.aaa.png -> test.aaa
+ if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); }
+ my $title = $1; # taken from the bamcheck file name, not from $img_fname
+
+ my $dir = $gp_file;
+ $dir =~ s{/[^/]+$}{};
+ if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } # only when the script path has a directory part
+
+ my $wh = exists($args{wh}) ? $args{wh} : '600,400'; # image size "width,height"
+
+ open(my $fh,'>',$gp_file) or error("$gp_file: $!");
+ return {
+ title => $title,
+ gp => $gp_file,
+ img => $img_fname,
+ fh => $fh,
+ terminal => qq[set terminal png size $wh truecolor],
+ grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"',
+ };
+}
+
+sub percentile # Return the index of the histogram bin that holds the $p-th percentile; @vals are the per-bin counts
+{
+ my ($p,@vals) = @_; # was "at vals": the list archive had rewritten the @ sigil
+ my $N = 0;
+ for my $val (@vals) { $N += $val; }
+ my $n = $p*($N+1)/100.;
+ my $k = int($n); # rank of the wanted observation
+ my $d = $n-$k;
+ if ( $k<=0 ) { return 0; }
+ if ( $k>=$N ) { return scalar @vals-1; }
+ my $cnt = 0; # running total of the counts seen so far
+ for (my $i=0; $i<@vals; $i++)
+ {
+ $cnt += $vals[$i];
+ if ( $cnt>=$k ) { return $i; }
+ }
+ error("FIXME: this should not happen [percentile]\n");
+}
+
+# Writes four gnuplot scripts describing base qualities and runs each through plot():
+#   quals.png     .. mean quality per cycle, forward and reverse reads in one plot
+#   quals2.png    .. mean, median and 25-75th percentile per cycle
+#   quals3.png    .. quality distribution, one curve per cycle
+#   quals-hm.png  .. heatmap of counts by cycle and quality
+# Reads $$opts{dat}{FFQ} (and $$opts{dat}{LFQ} for the reverse reads); each row is
+# used as [cycle, count for column 1, count for column 2, ...]. Returns without
+# doing anything when FFQ is missing or empty.
+sub plot_qualities
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; }
+
+    my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? @{$$opts{dat}{FFQ}[0]} : 50;
+    my $is_paired = $$opts{dat}{'is paired:'};
+
+    # Average quality per cycle, forward and reverse reads in one plot
+    my $args = get_defaults($opts,"$$opts{prefix}quals.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set ylabel "Average Quality"
+ set xlabel "Cycle"
+ set yrange [0:$yrange]
+ set title "$$args{title}"
+ plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[
+ ];
+    # Per-cycle data lines collected here are reused by the second plot.
+    my (@fp75,@fp50,@fmean);
+    my (@lp75,@lp50,@lmean);
+    # Largest single count seen, with the column and cycle where it occurred.
+    my ($fmax,$fmax_qual,$fmax_cycle);
+    my ($lmax,$lmax_qual,$lmax_cycle);
+    for my $cycle (@{$$opts{dat}{FFQ}})
+    {
+        my $sum=0; my $n=0;
+        for (my $iqual=1; $iqual<@$cycle; $iqual++)
+        {
+            $sum += $$cycle[$iqual]*$iqual;
+            $n += $$cycle[$iqual];
+            if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; }
+        }
+        my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
+        my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
+        my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
+        if ( !$n ) { next; }
+        push @fp75, "$$cycle[0]\t$p25\t$p75\n";
+        push @fp50, "$$cycle[0]\t$p50\n";
+        push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
+        # The line is already formatted: print it, do not use it as a printf format.
+        print $fh $fmean[-1];
+    }
+    print $fh "end\n";
+    if ( $is_paired )
+    {
+        for my $cycle (@{$$opts{dat}{LFQ}})
+        {
+            my $sum=0; my $n=0;
+            for (my $iqual=1; $iqual<@$cycle; $iqual++)
+            {
+                $sum += $$cycle[$iqual]*$iqual;
+                $n += $$cycle[$iqual];
+                if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; }
+            }
+            my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
+            my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
+            my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
+            if ( !$n ) { next; }
+            push @lp75, "$$cycle[0]\t$p25\t$p75\n";
+            push @lp50, "$$cycle[0]\t$p50\n";
+            push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
+            print $fh $lmean[-1];
+        }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+
+
+
+    # Mean, median and quality percentiles per cycle, forward and reverse reads in separate plots
+    $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500');
+    $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set multiplot
+ set rmargin 0
+ set lmargin 0
+ set tmargin 0
+ set bmargin 0
+ set origin 0.1,0.1
+ set size 0.4,0.8
+ set yrange [0:$yrange]
+ set ylabel "Quality"
+ set xlabel "Cycle (fwd reads)"
+ plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean'
+ ];
+    print $fh join('',@fp75),"end\n";
+    print $fh join('',@fp50),"end\n";
+    print $fh join('',@fmean),"end\n";
+    if ( $is_paired )
+    {
+        print $fh qq[
+ set origin 0.55,0.1
+ set size 0.4,0.8
+ unset ytics
+ set y2tics mirror
+ set yrange [0:$yrange]
+ unset ylabel
+ set xlabel "Cycle (rev reads)"
+ set label "$$args{title}" at screen 0.5,0.95 center
+ plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean'
+ ];
+        print $fh join('',@lp75),"end\n";
+        print $fh join('',@lp50),"end\n";
+        print $fh join('',@lmean),"end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+
+
+
+    # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve
+    $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600');
+    $fh = $$args{fh};
+    my $nquals = @{$$opts{dat}{FFQ}[0]}-1;
+    my $ncycles = @{$$opts{dat}{FFQ}};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set multiplot
+ set rmargin 0
+ set lmargin 0
+ set tmargin 0
+ set bmargin 0
+ set origin 0.15,0.52
+ set size 0.8,0.4
+ set title "$$args{title}"
+ set ylabel "Frequency (fwd reads)"
+ set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax
+ unset xlabel
+ set xrange [0:$nquals]
+ set format x ""
+ ];
+    # One inline data block ('-') per cycle.
+    my @plots;
+    for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] }
+    print $fh "plot ", join(",", @plots), "\n";
+    for my $cycle (@{$$opts{dat}{FFQ}})
+    {
+        # NOTE(review): stops one column short of the heatmap loops below, which run
+        # to the end of the row - confirm that skipping the last column is intended.
+        for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; }
+        print $fh "end\n";
+    }
+    if ( $is_paired )
+    {
+        print $fh qq[
+ set origin 0.15,0.1
+ set size 0.8,0.4
+ unset title
+ unset format
+ set xtics
+ set xlabel "Quality"
+ unset label
+ set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax
+ set ylabel "Frequency (rev reads)"
+ ];
+        print $fh "plot ", join(",", @plots), "\n";
+        for my $cycle (@{$$opts{dat}{LFQ}})
+        {
+            for (my $iqual=1; $iqual<$nquals; $iqual++)
+            {
+                print $fh "$iqual\t$$cycle[$iqual]\n";
+            }
+            print $fh "end\n";
+        }
+    }
+    close($fh);
+    plot($$args{gp});
+
+
+    # Heatmap of qualities
+    # NOTE(review): the reverse-read panel below is written even when the data
+    # is not paired - confirm whether that is intended.
+    $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500');
+    $fh = $$args{fh};
+    my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax;
+    # Label every cycle whose number is a multiple of ten.
+    my @ytics;
+    for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
+    my $ytics = join(',', @ytics);
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ unset key
+ unset colorbox
+ set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1)
+ set cbrange [0:$max]
+ set yrange [0:$ncycles]
+ set xrange [0:$nquals]
+ set view map
+ set multiplot
+ set rmargin 0
+ set lmargin 0
+ set tmargin 0
+ set bmargin 0
+ set origin 0,0.46
+ set size 0.95,0.6
+ set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles
+ set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black"
+ set ylabel "Cycle (fwd reads)" offset character -1,0
+ unset ytics
+ set ytics ($ytics)
+ unset xtics
+ set title "$$args{title}"
+ splot '-' matrix with image
+ ];
+    for my $cycle (@{$$opts{dat}{FFQ}})
+    {
+        for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
+        print $fh "\n";
+    }
+    print $fh "end\nend\n";
+    @ytics = ();
+    for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
+    $ytics = join(',', @ytics);
+    print $fh qq[
+ set origin 0,0.03
+ set size 0.95,0.6
+ set ylabel "Cycle (rev reads)" offset character -1,0
+ set xlabel "Base Quality"
+ unset title
+ unset ytics
+ set ytics ($ytics)
+ set xrange [0:$nquals]
+ set xtics
+ set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812
+ set cblabel "Number of bases"
+ splot '-' matrix with image
+ ];
+    for my $cycle (@{$$opts{dat}{LFQ}})
+    {
+        for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
+        print $fh "\n";
+    }
+    print $fh "end\nend\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+# Plots the per-cycle base composition: one line each for A, C, G and T, taken
+# from columns 1-4 of every $$opts{dat}{GCC} row. The x value written is the
+# row's first column plus one. Does nothing when GCC is missing or empty.
+sub plot_acgt_cycles
+{
+    my ($opts) = @_;
+
+    return if !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}};
+
+    my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set style line 1 linecolor rgb "green"
+ set style line 2 linecolor rgb "red"
+ set style line 3 linecolor rgb "black"
+ set style line 4 linecolor rgb "blue"
+ set style increment user
+ set ylabel "Base content [%]"
+ set xlabel "Read Cycle"
+ set yrange [0:100]
+ set title "$$args{title}"
+ plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T'
+ ];
+    # One inline data block per base, in the order the plot command names them.
+    my $rows = $$opts{dat}{GCC};
+    foreach my $column (1,2,3,4)
+    {
+        foreach my $row (@$rows)
+        {
+            my $x = $$row[0]+1;
+            print $fh $x,"\t",$$row[$column],"\n";
+        }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+}
+
+
+# Plots the GC-content distribution of first fragments ($$opts{dat}{GCF}) and,
+# for paired data, last fragments ($$opts{dat}{GCL}). Each curve is normalised
+# by its own maximum. When $$opts{ref_stats} is set, that file is copied in
+# verbatim as the data of an additional 'Reference' curve.
+# Returns without plotting when GCF is missing or empty, like the other plot_* subs.
+sub plot_gc
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{GCF}) or !@{$$opts{dat}{GCF}} ) { return; }
+
+    my $is_paired = $$opts{dat}{'is paired:'};
+    my $args = get_defaults($opts,"$$opts{prefix}gc-content.png");
+    my $fh = $$args{fh};
+    # Highest count of each distribution and the GC value at which it occurs.
+    my ($gcl_max,$gcf_max,$lmax,$fmax);
+    for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } }
+    for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } }
+    # The label goes to the peak of whichever distribution is higher; GCL may be absent.
+    my $gcmax = $is_paired && defined $gcl_max && $gcl_max > $gcf_max ? $lmax : $fmax;
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set title "$$args{title}"
+ set ylabel "Normalized Frequency"
+ set xlabel "GC Content [%]"
+ set yrange [0:1.1]
+ set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0
+ plot ]
+ . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '')
+ . q['-' smooth csplines with lines lc 1 title 'First fragments' ]
+ . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '')
+ . q[
+ ];
+    if ( exists($$opts{ref_stats}) )
+    {
+        open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!");
+        while (my $line=<$ref>) { print $fh $line }
+        close($ref);
+        print $fh "end\n";
+    }
+    # A zero or undefined maximum would otherwise die with a division by zero.
+    for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$gcf_max ? $$cycle[1]/$gcf_max : 0; }
+    print $fh "end\n";
+    if ( $is_paired )
+    {
+        for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$gcl_max ? $$cycle[1]/$gcl_max : 0; }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+}
+
+
+# Plots mapped depth against the percentile of mapped sequence ordered by GC
+# content, from the rows of $$opts{dat}{GCD}. Column 1 of a row is used as x,
+# columns 2 and 6 as the 10-90th percentile band, columns 3 and 5 as the
+# 25-75th percentile band and column 4 as the median.
+# Does nothing when GCD is missing or empty.
+sub plot_gc_depth
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; }
+
+    # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics.
+    my @tics = ( {gc=>30},{gc=>40},{gc=>50} );
+    for my $gc (@{$$opts{dat}{GCD}})
+    {
+        for my $tic (@tics)
+        {
+            # Keep the row whose first column is closest to the wanted GC value.
+            my $diff = abs($$gc[0]-$$tic{gc});
+            if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; }
+        }
+    }
+
+    my @x2tics;
+    for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; }
+    my $x2tics = join(',',@x2tics);
+
+    my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500');
+    my $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set ylabel "Mapped depth"
+ set xlabel "Percentile of mapped sequence ordered by GC content"
+ set x2label "GC Content [%]"
+ set title "$$args{title}"
+ set x2tics ($x2tics)
+ set xtics nomirror
+ set xrange [0.1:99.9]
+
+ plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\
+ '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\
+ '-' using 1:2 with lines lc rgb "#0084ff" t 'Median'
+ ];
+    for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n";
+    for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n";
+    for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+# Plots the insert-size histogram for paired data from $$opts{dat}{IS}: column 0
+# of each row is the x value, columns 1-4 are the four curves named in the plot
+# command (all pairs, inward, outward, other). The x value with the highest
+# column-1 count is labelled. Skipped for unpaired data or when IS is absent or empty.
+sub plot_isize
+{
+    my ($opts) = @_;
+
+    return if !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}};
+
+    my ($isize_max,$isize_cnt);
+    foreach my $row (@{$$opts{dat}{IS}})
+    {
+        # Skip rows that do not beat the current peak (the first row always wins).
+        next if defined $isize_max and !($isize_cnt<$$row[1]);
+        $isize_cnt = $$row[1];
+        $isize_max = $$row[0];
+    }
+
+    my $args = get_defaults($opts,"$$opts{prefix}insert-size.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set rmargin 5
+ set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt
+ set ylabel "Number of pairs"
+ set xlabel "Insert Size"
+ set title "$$args{title}"
+ plot \\
+ '-' with lines lc rgb 'black' title 'All pairs', \\
+ '-' with lines title 'Inward', \\
+ '-' with lines title 'Outward', \\
+ '-' with lines title 'Other'
+ ];
+    # One inline data block per curve, in the order of the plot command.
+    foreach my $column (1..4)
+    {
+        foreach my $row (@{$$opts{dat}{IS}})
+        {
+            print $fh "$$row[0]\t$$row[$column]\n";
+        }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+}
+
+
+# Plots the coverage histogram from $$opts{dat}{COV}: column 1 of each row is the
+# x value and column 2 the count. The x axis is cut at the row holding the 99.8th
+# percentile of the counts, and rows with a zero count are not written.
+# Does nothing when COV is missing or empty.
+sub plot_coverage
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; }
+
+    my @vals;
+    for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; }
+    # percentile() returns a row index, which is then mapped to that row's x value.
+    my $i = percentile(99.8,@vals);
+    my $p99 = $$opts{dat}{COV}[$i][1];
+
+    my $args = get_defaults($opts,"$$opts{prefix}coverage.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set ylabel "Number of mapped bases"
+ set xlabel "Coverage"
+ set style fill solid border -1
+ set title "$$args{title}"
+ set xrange [:$p99]
+ plot '-' with lines notitle
+ ];
+    for my $cov (@{$$opts{dat}{COV}})
+    {
+        if ( $$cov[2]==0 ) { next; }
+        print $fh "$$cov[1]\t$$cov[2]\n";
+    }
+    print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+# Plots the number of mismatches per read cycle from $$opts{dat}{MPC}, stacked by
+# base-quality bin. Input older than 2012-02-06 is handed to
+# plot_mismatches_per_cycle_old() instead. Uses lines for more than 100 cycles
+# and a row-stacked histogram otherwise. Does nothing when MPC is missing or empty.
+# NOTE(review): column 1 of a row is plotted as the N's series and columns 2..
+# as per-quality counts - confirm the layout against the code that fills MPC.
+sub plot_mismatches_per_cycle
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; }
+    # The old-format plot writes the same output files; returning here keeps this
+    # sub from overwriting it with a plot that misreads the old column layout.
+    if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); return; }
+
+    my $ncycles = @{$$opts{dat}{MPC}};
+    my ($style,$with);
+    if ( $ncycles>100 ) { $style = ''; $with = 'w l'; }
+    else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; }
+
+    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set style line 1 linecolor rgb "#e40000"
+ set style line 2 linecolor rgb "#ff9f00"
+ set style line 3 linecolor rgb "#eeee00"
+ set style line 4 linecolor rgb "#4ebd68"
+ set style line 5 linecolor rgb "#0061ff"
+ set style increment user
+ set key left top
+ $style
+ set ylabel "Number of mismatches"
+ set xlabel "Read Cycle"
+ set style fill solid border -1
+ set title "$$args{title}"
+ set xrange [-1:$ncycles]
+ plot '-' $with ti 'Base Quality>30', \\
+ '-' $with ti '30>=Q>20', \\
+ '-' $with ti '20>=Q>10', \\
+ '-' $with ti '10>=Q', \\
+ '-' $with ti "N's"
+ ];
+    # Column ranges of the four quality bins, in the order of the plot command.
+    # The top bin starts at 32 so that column 31 is counted once, not in two series.
+    for my $bin ([32,undef],[22,31],[12,21],[2,11])
+    {
+        for my $cycle (@{$$opts{dat}{MPC}})
+        {
+            my ($from,$to) = @$bin;
+            # Open-ended bins and short rows stop at the last column present.
+            if ( !defined $to or $to>$#$cycle ) { $to = $#$cycle; }
+            # Start from 0 so an empty range prints "0" rather than an empty line.
+            my $sum = 0;
+            for my $idx ($from..$to) { $sum += $$cycle[$idx]; }
+            print $fh "$sum\n";
+        }
+        print $fh "end\n";
+    }
+    for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; }
+    print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+# Plots the indel length distribution from $$opts{dat}{ID}: column 0 of each row
+# is the x value, column 1 feeds the 'Insertions' curve and column 2 the
+# 'Deletions' curve, both on a log y axis; their ratio is drawn on the second y axis.
+# Does nothing when ID is missing or empty.
+sub plot_indel_dist
+{
+ my ($opts) = @_;
+
+ if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; }
+
+ my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png");
+ my $fh = $$args{fh};
+ print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set style line 1 linetype 1 linecolor rgb "red"
+ set style line 2 linetype 2 linecolor rgb "black"
+ set style line 3 linetype 3 linecolor rgb "green"
+ set style increment user
+ set ylabel "Indel count [log]"
+ set xlabel "Indel length"
+ set y2label "Insertions/Deletions ratio"
+ set log y
+ set y2tics nomirror
+ set ytics nomirror
+ set title "$$args{title}"
+ plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio"
+ ];
+ # Three inline data blocks, one per curve of the plot command above.
+ for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
+ for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
+ # The ratio is written as 0 when column 2 is zero, to avoid dividing by zero.
+ for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n";
+ close($fh);
+ plot($$args{gp});
+}
+
+# Plots indel counts per read cycle from $$opts{dat}{IC}: column 0 of each row is
+# the x value and columns 1-4 are the four curves named in the plot command.
+# Does nothing when IC is missing or empty.
+sub plot_indel_cycles
+{
+    my ($opts) = @_;
+
+    return if !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}};
+
+    my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+ $$args{terminal}
+ set output "$$args{img}"
+ $$args{grid}
+ set style line 1 linetype 1 linecolor rgb "red"
+ set style line 2 linetype 2 linecolor rgb "black"
+ set style line 3 linetype 3 linecolor rgb "green"
+ set style line 4 linetype 4 linecolor rgb "blue"
+ set style increment user
+ set ylabel "Indel count"
+ set xlabel "Read Cycle"
+ set title "$$args{title}"
+ plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)'
+ ];
+    # One inline data block per curve, in the order of the plot command.
+    foreach my $column (1..4)
+    {
+        foreach my $row (@{$$opts{dat}{IC}})
+        {
+            print $fh "$$row[0]\t$$row[$column]\n";
+        }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+}
+
+
+
+
+
+
+
+# Returns 1 when, for every tag in @tags, the file $$opts{bamcheck} holds at
+# least two lines starting with that tag, and 0 otherwise. An unreadable file
+# counts as having no lines.
+# The lines are counted in Perl rather than through a shell pipeline, so paths
+# and tags with shell metacharacters or spaces are handled safely.
+sub has_values
+{
+    my ($opts,@tags) = @_;
+    for my $tag (@tags)
+    {
+        my $nlines = 0;
+        if ( open(my $fh,'<',$$opts{bamcheck}) )
+        {
+            while (my $line=<$fh>)
+            {
+                # Literal prefix match: the tag is not treated as a pattern.
+                if ( index($line,$tag)==0 ) { $nlines++; }
+            }
+            close($fh);
+        }
+        if ( $nlines<2 ) { return 0; }
+    }
+    return 1;
+}
+
+# Mismatches-per-cycle plot for old-format input. Gnuplot reads the MPC lines
+# straight from $$opts{bamcheck} through grep/cut and sums column groups itself.
+# Here the columns are grouped into three bins: 2-17, 18-32 and 33 onwards.
+sub plot_mismatches_per_cycle_old
+{
+    my ($opts) = @_;
+
+    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
+    # Column count is taken from the MPC line whose second field is 1; the cycle
+    # count is the number of MPC lines. Both are then decremented by one.
+    my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`;
+    my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`;
+    chomp($nquals);
+    chomp($ncycles);
+    $nquals--;
+    $ncycles--;
+    # Build gnuplot column sums such as "$2+$3+...".
+    my @gr0_15 = (2..17);
+    my @gr16_30 = (18..32);
+    my @gr31_n = (33..$nquals);
+    my $gr0_15 = '$'. join('+$',@gr0_15);
+    my $gr16_30 = '$'. join('+$',@gr16_30);
+    my $gr31_n = '$'. join('+$',@gr31_n);
+
+    # get_defaults() has already opened $$args{gp} for writing; use that handle
+    # instead of opening the same file a second time.
+    my $fh = $$args{fh};
+    print $fh q[
+ set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
+ set output "] . $$args{img} . q["
+
+ set key left top
+ set style data histogram
+ set style histogram rowstacked
+
+ set grid back lc rgb "#aaaaaa"
+ set ylabel "Number of mismatches"
+ set xlabel "Read Cycle"
+ set style fill solid border -1
+ set title "] . $$args{title} . qq["
+ set xrange [-1:$ncycles]
+
+ plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q'
+ ];
+    close($fh);
+
+    plot($$args{gp});
+}
+
+
diff --git a/samtools-0.1.19/misc/psl2sam.pl b/samtools-0.1.19/misc/psl2sam.pl
new file mode 100755
index 0000000..a96a6de
--- /dev/null
+++ b/samtools-0.1.19/misc/psl2sam.pl
@@ -0,0 +1,65 @@
+#!/usr/bin/perl -w
+
+# Author: lh3
+
+# This script calculates a score using the BLAST scoring
+# system. However, I am not sure how to count gap opens and gap
+# extensions. It seems to me that column 5-8 are not what I am
+# after. This script counts gaps from the last three columns. It does
+# not generate reference skip (N) in the CIGAR as it is not easy to
+# directly tell which gaps correspond to introns.
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+# Scoring parameters: -a and -b weight columns 0 and 1 of each record, -q and
+# -r weight the gap-open and gap-extension counts (see $score below).
+my %opts = (a=>1, b=>3, q=>5, r=>2);
+getopts('a:b:q:r:', \%opts);
+die("Usage: psl2sam.pl [-a $opts{a}] [-b $opts{b}] [-q $opts{q}] [-r $opts{r}] <in.psl>\n") if (@ARGV == 0 && -t STDIN);
+
+my @stack;
+my $last = '';
+my ($a, $b, $q, $r) = ($opts{a}, $opts{b}, $opts{q}, $opts{r});
+while (<>) {
+ # Only lines starting with a digit are records; anything else is skipped.
+ next unless (/^\d/);
+ # NOTE(review): the column meanings used below (8 strand, 9 query name, 10 query
+ # size, 11/12 query start/end, 13 target name, 15 target start, 17 block count,
+ # 18-20 block sizes and starts) presumably follow the PSL format - verify
+ # against the PSL specification.
+ my @t = split;
+ my @s;
+ my $cigar = '';
+ # For '-' strand records, mirror the two query coordinates within the query size.
+ if ($t[8] eq '-') {
+ my $tmp = $t[11];
+ $t[11] = $t[10] - $t[12];
+ $t[12] = $t[10] - $tmp;
+ }
+ # Output fields 0-4: name, flag (16 unless the strand is '+'), target, 1-based position, 0.
+ @s[0..4] = ($t[9], (($t[8] eq '+')? 0 : 16), $t[13], $t[15]+1, 0);
+ @s[6..10] = ('*', 0, 0, '*', '*');
+ $cigar .= $t[11].'H' if ($t[11]); # 5'-end clipping
+ my @x = split(',', $t[18]);
+ my @y = split(',', $t[19]);
+ my @z = split(',', $t[20]);
+ # $y0/$z0 remember where the current run of matches started.
+ my ($y0, $z0) = ($y[0], $z[0]);
+ my ($gap_open, $gap_ext) = (0, 0, 0);
+ for (1 .. $t[17]-1) {
+ # Gap before block $_ on each side: distance between block starts minus the
+ # size of the previous block.
+ my $ly = $y[$_] - $y[$_-1] - $x[$_-1];
+ my $lz = $z[$_] - $z[$_-1] - $x[$_-1];
+ if ($ly < $lz) { # del: the reference gap is longer
+ ++$gap_open;
+ $gap_ext += $lz - $ly;
+ $cigar .= ($y[$_] - $y0) . 'M';
+ $cigar .= ($lz - $ly) . 'D';
+ ($y0, $z0) = ($y[$_], $z[$_]);
+ } elsif ($lz < $ly) { # ins: the query gap is longer
+ ++$gap_open;
+ $gap_ext += $ly - $lz;
+ $cigar .= ($z[$_] - $z0) . 'M';
+ $cigar .= ($ly - $lz) . 'I';
+ ($y0, $z0) = ($y[$_], $z[$_]);
+ }
+ # Equal gaps on both sides add nothing: the match run simply continues.
+ }
+ # Final match run, from the last gap to the end coordinate in column 12.
+ $cigar .= ($t[12] - $y0) . 'M';
+ $cigar .= ($t[10] - $t[12]).'H' if ($t[10] != $t[12]); # 3'-end clipping
+ $s[5] = $cigar;
+ # Score from the weighted columns 0 and 1 and the gap counts; never negative.
+ my $score = $a * $t[0] - $b * $t[1] - $q * $gap_open - $r * $gap_ext;
+ $score = 0 if ($score < 0);
+ $s[11] = "AS:i:$score";
+ print join("\t", @s), "\n";
+}
diff --git a/samtools-0.1.19/misc/r2plot.lua b/samtools-0.1.19/misc/r2plot.lua
new file mode 100755
index 0000000..0a1b9f1
--- /dev/null
+++ b/samtools-0.1.19/misc/r2plot.lua
@@ -0,0 +1,83 @@
+#!/usr/bin/env luajit
+
+-- Splits the string on the Lua pattern `sep` (runs of whitespace by default) and
+-- returns the pieces as an array. When `n` is given, splitting stops once `n`
+-- pieces have been collected and the unsplit remainder is appended as one more.
+-- A separator at the very end of the string does not produce a trailing empty piece.
+function string:split(sep, n)
+ local a, start = {}, 1;
+ sep = sep or "%s+";
+ repeat
+ local b, e = self:find(sep, start);
+ -- No further separator: the rest of the string is the last piece.
+ if b == nil then
+ table.insert(a, self:sub(start));
+ break
+ end
+ a[#a+1] = self:sub(start, b - 1);
+ start = e + 1;
+ -- Piece limit reached: keep whatever remains unsplit.
+ if n and #a == n then
+ table.insert(a, self:sub(start));
+ break
+ end
+ until start > #self;
+ return a;
+end
+
+-- Opens `fn` and returns a file handle. `mode` defaults to 'r'.
+--   nil       -> standard input
+--   '-'       -> standard input for reading, standard output otherwise
+--   '*.gz'    -> pipe through gzip (decompress when reading, compress when writing)
+--   '*.bz2'   -> pipe through bzip2 (decompress when reading, compress when writing)
+--   otherwise -> plain io.open(fn, mode)
+function io.xopen(fn, mode)
+	mode = mode or 'r';
+	if fn == nil then return io.stdin;
+	elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout;
+	elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w');
+	-- The compressor for .bz2 output is bzip2; the original invoked 'bgzip2'.
+	elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bzip2 > ' .. fn, 'w');
+	else return io.open(fn, mode) end
+end
+
+-- Helpers that emit PostScript prologue definitions.
+local eps = {};
+
+-- Writes a fixed set of PostScript procedure definitions (C, L, LX, LY, LS, RS, B)
+-- to `fp`, which defaults to standard output.
+function eps.func(fp)
+ fp = fp or io.stdout
+ fp:write("/C { dup 255 and 255 div exch dup -8 bitshift 255 and 255 div 3 1 roll -16 bitshift 255 and 255 div 3 1 roll setrgbcolor } bind def\n")
+ fp:write("/L { 4 2 roll moveto lineto } bind def\n")
+ fp:write("/LX { dup 4 -1 roll exch moveto lineto } bind def\n")
+ fp:write("/LY { dup 4 -1 roll moveto exch lineto } bind def\n")
+ fp:write("/LS { 3 1 roll moveto show } bind def\n")
+ fp:write("/RS { dup stringwidth pop 4 -1 roll exch sub 3 -1 roll moveto show } bind def\n")
+ fp:write("/B { 4 copy 3 1 roll exch 6 2 roll 8 -2 roll moveto lineto lineto lineto closepath } bind def\n")
+end
+
+-- Selects font `ft` at `size` on `fp` (standard output by default) and defines
+-- FS as the size and FS4 as a quarter of it.
+function eps.font(ft, size, fp)
+ fp = fp or io.stdout
+ fp:write(string.format('/FS %d def\n', size));
+ fp:write('/FS4 FS 4 div def\n');
+ fp:write('/' .. ft .. ' findfont FS scalefont setfont\n');
+end
+
+-- Size in points of one cell of the plot.
+local scale = 8;
+
+if #arg == 0 then
+ print("Usage: r2plot.lua <in.txt>");
+ os.exit(1)
+end
+
+-- The first input line holds a number n that sizes the bounding box; every
+-- following line is a tab-separated row: a label followed by numeric values.
+local fp = io.xopen(arg[1]);
+local n = tonumber(fp:read());
+
+print('%!PS-Adobe-3.0 EPSF-3.0');
+print('%%' .. string.format('BoundingBox: -%d -%d %.3f %.3f\n', 10*scale, scale, (n+1)*scale, (n+1)*scale));
+print(string.format('%.3f setlinewidth', scale));
+print(string.format('/plot { setgray moveto 0 %d rlineto } def', scale));
+print(string.format('/plothalf { setgray moveto 0 %.2f rlineto } def', scale/2));
+eps.func();
+eps.font('Helvetica', scale-1);
+
+local i = 1;
+for l in fp:lines() do
+ local t = l:split('\t');
+ -- Row label, placed with the RS procedure.
+ print(string.format("%d %d FS4 add (%s) RS", (i-1)*scale-2, (i-1)*scale, t[1]));
+ for j = 2, #t do
+ -- Values of 0.01 or less are left blank; larger values are drawn darker.
+ if tonumber(t[j]) > 0.01 then
+ print(string.format('%.2f %.2f %.2f plot stroke', (i-1+.5)*scale, (j-2)*scale, 1.-t[j]));
+ end
+ end
+ i = i + 1;
+end
+-- Gray-scale legend: 21 half-height cells covering values 0 to 1 in steps of 0.05.
+for j = 1, 21 do
+ print(string.format('%.2f %.2f %.2f plothalf stroke', -8*scale, (j-1) * scale/2, 1.-(j-1)/20));
+end
+print('showpage');
diff --git a/samtools-0.1.19/misc/sam2vcf.pl b/samtools-0.1.19/misc/sam2vcf.pl
new file mode 100755
index 0000000..afaf91e
--- /dev/null
+++ b/samtools-0.1.19/misc/sam2vcf.pl
@@ -0,0 +1,270 @@
+#!/usr/bin/perl -w
+#
+# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
+#
+# Contact: pd3 at sanger
+# Version: 2010-04-23
+
+use strict;
+use warnings;
+use Carp;
+
+# Entry point: parse the command line, then convert the pileup to VCF.
+my $opts = parse_params();
+do_pileup_to_vcf($opts);
+
+exit;
+
+#---------------
+
+# With arguments: croaks with them as the message.
+# Without arguments: dies with the usage text.
+sub error
+{
+ my (@msg) = @_;
+ if ( scalar @msg ) { croak(@msg); }
+ die
+ "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
+ "Options:\n",
+ " -h, -?, --help This help message.\n",
+ " -i, --indels-only Ignore SNPs.\n",
+ " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n",
+ " -R, --keep-ref Print reference alleles as well.\n",
+ " -s, --snps-only Ignore indels.\n",
+ " -t, --column-title <string> The column title.\n",
+ "\n";
+}
+
+
+# Parses @ARGV (consuming it) and returns a hash reference with the keys
+# fh_in / fh_out (STDIN / STDOUT) plus, when the matching option was given,
+# keep_ref, refseq, title, snps_only and indels_only.
+# Calls error() for -h/-?/--help, for an unknown argument and for an option
+# that is missing its value.
+sub parse_params
+{
+    my %opts = ();
+
+    $opts{fh_in} = *STDIN;
+    $opts{fh_out} = *STDOUT;
+
+    # Test for definedness: an argument such as "0" or "" is false but must not
+    # silently end the parsing.
+    while (defined(my $arg=shift(@ARGV)))
+    {
+        if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; }
+        if ( $arg eq '-r' || $arg eq '--refseq' )
+        {
+            $opts{refseq} = shift(@ARGV);
+            if ( !defined $opts{refseq} ) { error("Missing value for $arg. Run -h for help.\n"); }
+            next;
+        }
+        if ( $arg eq '-t' || $arg eq '--column-title' )
+        {
+            $opts{title} = shift(@ARGV);
+            if ( !defined $opts{title} ) { error("Missing value for $arg. Run -h for help.\n"); }
+            next;
+        }
+        if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; }
+        if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; }
+        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
+
+        error("Unknown parameter \"$arg\". Run -h for help.\n");
+    }
+    return \%opts;
+}
+
+# Translates a consensus base into (ALT, genotype) relative to the reference base.
+#   plain base equal to $ref        -> ('.', '0/0')
+#   plain base different from $ref  -> ($base, '1/1')
+#   two-base code containing $ref   -> (the other base, '0/1')
+#   two-base code without $ref      -> ("X,Y", '1/2')
+# Anything other than A, C, G, T or one of the codes K M S R W Y goes to error().
+sub iupac_to_gtype
+{
+    my ($ref,$base) = @_;
+    my %ambiguity = (
+        K => [qw(G T)],
+        M => [qw(A C)],
+        S => [qw(C G)],
+        R => [qw(A G)],
+        W => [qw(A T)],
+        Y => [qw(C T)],
+    );
+
+    if ( exists($ambiguity{$base}) )
+    {
+        my ($first,$second) = @{$ambiguity{$base}};
+        return ($second,'0/1') if $first eq $ref;
+        return ($first,'0/1') if $second eq $ref;
+        return ("$first,$second",'1/2');
+    }
+
+    unless ( $base eq 'A' || $base eq 'C' || $base eq 'G' || $base eq 'T' )
+    {
+        error("FIXME: what is this [$base]?\n");
+    }
+    return $ref eq $base ? ('.','0/0') : ($base,'1/1');
+}
+
+
+# Converts one pileup indel allele into its short form:
+#   "-SEQ" -> "D<length of SEQ>"
+#   "+SEQ" -> "I<SEQ>"
+#   "*"    -> undef
+# Any other input goes to error().
+sub parse_indel
+{
+    my ($cons) = @_;
+
+    # Capture everything after the leading sign.
+    if ( $cons =~ /^-(.*)\z/s ) { return 'D' . length($1); }
+    if ( $cons =~ /^\+(.*)\z/s ) { return 'I' . $1; }
+    if ( $cons eq '*' ) { return undef; }
+
+    error("FIXME: could not parse [$cons]\n");
+}
+
+
+# An example of the pileup format:
+# 1 3000011 C C 32 0 98 1 ^~, A
+# 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0
+# 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0
+# 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0
+#
+# Reads tab-separated pileup lines from $$opts{fh_in} and writes a VCFv3.3 header
+# followed by one record per reported site to $$opts{fh_out}.
+# Options read from $opts: snps_only, indels_only, keep_ref, title (sample column
+# name, 'data' by default) and refseq (FASTA file, needed to look up the
+# reference base of an indel line).
+# Calls error() on lines with fewer than eight columns, on indel lines when no
+# reference is available and on indels that cannot be parsed.
+sub do_pileup_to_vcf
+{
+ my ($opts) = @_;
+
+ my $fh_in = $$opts{fh_in};
+ my $fh_out = $$opts{fh_out};
+ # The previous line's position and base: an indel line at the same position
+ # reuses that base instead of querying the reference.
+ my ($prev_chr,$prev_pos,$prev_ref);
+ my $refseq;
+ my $ignore_indels = $$opts{snps_only} ? 1 : 0;
+ my $ignore_snps = $$opts{indels_only} ? 1 : 0;
+ my $keep_ref = $$opts{keep_ref} ? 1 : 0;
+ my $title = exists($$opts{title}) ? $$opts{title} : 'data';
+
+ print $fh_out
+ qq[##fileformat=VCFv3.3\n],
+ qq[##INFO=DP,1,Integer,"Total Depth"\n],
+ qq[##FORMAT=GT,1,String,"Genotype"\n],
+ qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n],
+ qq[##FORMAT=DP,1,Integer,"Read Depth"\n],
+ qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n]
+ ;
+
+ while (my $line=<$fh_in>)
+ {
+ chomp($line);
+ my (@items) = split(/\t/,$line);
+ if ( scalar @items<8 )
+ {
+ error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n");
+ }
+ my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items;
+ $ref = uc($ref);
+ $cons = uc($cons);
+
+ my ($alt,$gt);
+ if ( $ref eq '*' )
+ {
+ # An indel is involved.
+ if ( $ignore_indels )
+ {
+ $prev_ref = $ref;
+ $prev_pos = $pos;
+ $prev_chr = $chr;
+ next;
+ }
+
+ # The indel line carries '*' instead of a base: take the base from the
+ # previous line when it was at the same position, otherwise from the FASTA.
+ if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos)
+ {
+ if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
+ # The Fasta reader is created on first use only.
+ if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); }
+ $ref = $refseq->get_base($chr,$pos);
+ $ref = uc($ref);
+ }
+ else { $ref = $prev_ref; }
+
+ # One of the alleles can be a reference and it can come in arbitrary order. In some
+ # cases */* can be encountered. In such a case, look in the additional columns.
+ my ($al1,$al2) = split(m{/},$cons);
+ if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; }
+ my $alt1 = parse_indel($al1);
+ my $alt2 = parse_indel($al2);
+ if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); }
+ # parse_indel() returns undef for the reference allele ('*').
+ if ( !$alt1 )
+ {
+ $alt=$alt2;
+ $gt='0/1';
+ }
+ elsif ( !$alt2 )
+ {
+ $alt=$alt1;
+ $gt='0/1';
+ }
+ elsif ( $alt1 eq $alt2 )
+ {
+ $alt="$alt1";
+ $gt='1/1';
+ }
+ else
+ {
+ $alt="$alt1,$alt2";
+ $gt='1/2';
+ }
+ }
+ else
+ {
+ # Skip SNP lines when only indels are wanted, and lines where the consensus
+ # equals the reference unless reference alleles were requested.
+ if ( $ignore_snps || (!$keep_ref && $ref eq $cons) )
+ {
+ $prev_ref = $ref;
+ $prev_pos = $pos;
+ $prev_chr = $chr;
+ next;
+ }
+
+ # SNP
+ ($alt,$gt) = iupac_to_gtype($ref,$cons);
+ }
+
+ print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";
+
+ $prev_ref = $ref;
+ $prev_pos = $pos;
+ $prev_chr = $chr;
+ }
+}
+
+
+#------------- Fasta --------------------
+#
+# Uses samtools to get a requested base from a fasta file. For efficiency, preloads
+# a chunk to memory. The size of the cached sequence can be controlled by the 'size'
+# parameter.
+#
+package Fasta;
+
+use strict;
+use warnings;
+use Carp;
+
+# Constructor. Arguments are key/value pairs:
+#   file .. FASTA file name (required; throws when missing)
+#   size .. number of bases to cache per chunk (10,000,000 by default)
+# The cached region (chr, from, to) starts out empty.
+sub Fasta::new
+{
+    my ($class,@args) = @_;
+    my $self = {@args};
+    # Bless before validating so that throw() can be called as a method.
+    bless $self, ref($class) || $class;
+    if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); }
+    $$self{chr} = undef;
+    $$self{from} = undef;
+    $$self{to} = undef;
+    if ( !$$self{size} ) { $$self{size}=10_000_000; }
+    return $self;
+}
+
+# Loads the region $chr:$pos-($pos+size) with 'samtools faidx' and caches it:
+# chr, from and to are taken from the header line samtools prints, and chunk
+# holds the sequence with line breaks removed.
+# Throws when the command fails, prints nothing, or prints an unexpected header.
+sub read_chunk
+{
+    my ($self,$chr,$pos) = @_;
+    my $to = $pos + $$self{size};
+    my $cmd = "samtools faidx $$self{file} $chr:$pos-$to";
+    my @out = `$cmd`;
+    if ( $? ) { $self->throw("$cmd: $!"); }
+    my $line = shift(@out);
+    if ( !defined $line ) { $self->throw("No output from: $cmd"); }
+    # \Q..\E: sequence names may contain regex metacharacters such as '.' or '|'.
+    if ( !($line=~/^>\Q$chr\E:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
+    $$self{chr} = $chr;
+    $$self{from} = $1;
+    $$self{to} = $2;
+    my $chunk = '';
+    while ($line=shift(@out))
+    {
+        chomp($line);
+        $chunk .= $line;
+    }
+    $$self{chunk} = $chunk;
+    return;
+}
+
+# Returns the single base at $chr:$pos. The cached chunk is used when it covers
+# the position; otherwise a new chunk starting at $pos is loaded first.
+sub get_base
+{
+    my ($self,$chr,$pos) = @_;
+
+    my $in_cache = $$self{chr}
+        && $chr eq $$self{chr}
+        && !($pos<$$self{from})
+        && !($pos>$$self{to});
+    $self->read_chunk($chr,$pos) unless $in_cache;
+
+    # Offset of the position within the cached chunk.
+    return substr($$self{chunk}, $pos - $$self{from}, 1);
+}
+
+# Raises an error with the given message, reported from the caller's
+# perspective (croak).
+sub throw
+{
+    my ($self,@msg) = @_;
+    croak(@msg);
+}
diff --git a/samtools-0.1.19/misc/samtools.pl b/samtools-0.1.19/misc/samtools.pl
new file mode 100755
index 0000000..d03c1c7
--- /dev/null
+++ b/samtools-0.1.19/misc/samtools.pl
@@ -0,0 +1,528 @@
+#!/usr/bin/perl -w
+
+# Author: lh3
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+my $version = '0.3.3';
+&usage if (@ARGV < 1);
+
+my $command = shift(@ARGV);
+my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, plp2vcf=>\&plp2vcf,
+ unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq);
+
+die("Unknown command \"$command\".\n") if (!defined($func{$command}));
+&{$func{$command}};
+exit(0);
+
+#
+# showALEN
+#
+
+sub showALEN {
+ die(qq/Usage: samtools.pl showALEN <in.sam>\n/) if (@ARGV == 0 && -t STDIN);
+ while (<>) {
+ my @t = split;
+ next if (/^\@/ || @t < 11);
+ my $l = 0;
+ $_ = $t[5];
+ s/(\d+)[MI]/$l+=$1/eg;
+ print join("\t", @t[0..5]), "\t$l\t", join("\t", @t[6..$#t]), "\n";
+ }
+}
+
+#
+# varFilter
+#
+
+#
+# Filtration code:
+#
+# d low depth
+# D high depth
+# W too many SNPs in a window (SNP only)
+# G close to a high-quality indel (SNP only)
+# Q low RMS mapping quality (SNP only)
+# g close to another indel with higher quality (indel only)
+# s low SNP quality (SNP only)
+# i low indel quality (indel only)
+
+sub varFilter {
+ my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, S=>'', i=>'');
+ getopts('pq:d:D:l:Q:w:W:N:G:S:i:', \%opts);
+ die(qq/
+Usage: samtools.pl varFilter [options] <in.cns-pileup>
+
+Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
+ -q INT minimum RMS mapping quality for gaps [$opts{q}]
+ -d INT minimum read depth [$opts{d}]
+ -D INT maximum read depth [$opts{D}]
+ -S INT minimum SNP quality [$opts{S}]
+ -i INT minimum indel quality [$opts{i}]
+
+ -G INT min indel score for nearby SNP filtering [$opts{G}]
+ -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
+
+ -W INT window size for filtering dense SNPs [$opts{W}]
+ -N INT max number of SNPs in a window [$opts{N}]
+
+ -l INT window size for filtering adjacent gaps [$opts{l}]
+
+ -p print filtered variants
+\n/) if (@ARGV == 0 && -t STDIN);
+
+ # calculate the window size
+ my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
+ my $max_dist = $ol > $ow? $ol : $ow;
+ $max_dist = $oW if ($max_dist < $oW);
+ # the core loop
+ my @staging; # (indel_filtering_score, flt_tag)
+ while (<>) {
+ my @t = split;
+ next if (uc($t[2]) eq uc($t[3]) || $t[3] eq '*/*'); # skip non-var sites
+ # clear the out-of-range elements
+ while (@staging) {
+ # Still on the same chromosome and the first element's window still affects this position?
+ last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
+ varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
+ }
+ my ($flt, $score) = (0, -1);
+ # first a simple filter
+ if ($t[7] < $opts{d}) {
+ $flt = 2;
+ } elsif ($t[7] > $opts{D}) {
+ $flt = 3;
+ }
+ if ($t[2] eq '*') { # an indel
+ if ($opts{i} && $opts{i}>$t[5]) { $flt = 8; }
+ }
+ elsif ($opts{S} && $opts{S}>$t[5]) { $flt = 7; } # SNP
+
+ # site dependent filters
+ my $len=0;
+ if ($flt == 0) {
+ if ($t[2] eq '*') { # an indel
+ # If deletion, remember the length of the deletion
+ my ($a,$b) = split(m{/},$t[3]);
+ my $alen = length($a) - 1;
+ my $blen = length($b) - 1;
+ if ( $alen>$blen )
+ {
+ if ( substr($a,0,1) eq '-' ) { $len=$alen; }
+ }
+ elsif ( substr($b,0,1) eq '-' ) { $len=$blen; }
+
+ $flt = 1 if ($t[6] < $opts{q});
+ # filtering SNPs
+ if ($t[5] >= $opts{G}) {
+ for my $x (@staging) {
+ # Is it a SNP and is it outside the SNP filter window?
+ next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
+ $x->[1] = 5 if ($x->[1] == 0);
+ }
+ }
+ # calculate the filtering score (different from indel quality)
+ $score = $t[5];
+ $score += $opts{s} * $t[10] if ($t[8] ne '*');
+ $score += $opts{s} * $t[11] if ($t[9] ne '*');
+ # check the staging list for indel filtering
+ for my $x (@staging) {
+ # Is it a SNP and is it outside the gap filter window
+ next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
+ if ($x->[0] < $score) {
+ $x->[1] = 6;
+ } else {
+ $flt = 6; last;
+ }
+ }
+ } else { # a SNP
+ $flt = 1 if ($t[6] < $opts{Q});
+ # check adjacent SNPs
+ my $k = 1;
+ for my $x (@staging) {
+ ++$k if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
+ }
+ # filtering is necessary
+ if ($k > $opts{N}) {
+ $flt = 4;
+ for my $x (@staging) {
+ $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
+ }
+ } else { # then check gap filter
+ for my $x (@staging) {
+ next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
+ if ($x->[0] >= $opts{G}) {
+ $flt = 5; last;
+ }
+ }
+ }
+ }
+ }
+ push(@staging, [$score, $flt, $len, @t]);
+ }
+ # output the last few elements in the staging list
+ while (@staging) {
+ varFilter_aux(shift @staging, $opts{p});
+ }
+}
+
+sub varFilter_aux {
+ # $first = [score, filter_code, length, input fields...]; filter code 0 means the site passed.
+ my ($first, $is_print) = @_;
+ my ($code, @fields) = ($first->[1], @$first[3 .. $#$first]);
+ if ($code == 0) { print join("\t", @fields), "\n"; }
+ # filtered sites go to STDERR on request, prefixed with the one-letter filter tag
+ elsif ($is_print) { print STDERR join("\t", substr("UQdDWGgsiX", $code, 1), @fields), "\n"; }
+}
+
+#
+# pileup2fq
+#
+
+sub pileup2fq {
+ my %opts = (d=>3, D=>255, Q=>25, G=>25, l=>10);
+ getopts('d:D:Q:G:l:', \%opts);
+ die(qq/
+Usage: samtools.pl pileup2fq [options] <in.cns-pileup>
+
+Options: -d INT minimum depth [$opts{d}]
+ -D INT maximum depth [$opts{D}]
+ -Q INT min RMS mapQ [$opts{Q}]
+ -G INT minimum indel score [$opts{G}]
+ -l INT indel filter winsize [$opts{l}]\n
+/) if (@ARGV == 0 && -t STDIN);
+
+ my ($last_chr, $seq, $qual, @gaps, $last_pos);
+ my $_Q = $opts{Q};
+ my $_d = $opts{d};
+ my $_D = $opts{D};
+
+ $last_chr = '';
+ while (<>) {
+ my @t = split;
+ if ($last_chr ne $t[0]) {
+ &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
+ $last_chr = $t[0];
+ $last_pos = 0;
+ $seq = ''; $qual = '';
+ @gaps = ();
+ }
+ if ($t[1] - $last_pos != 1) {
+ $seq .= 'n' x ($t[1] - $last_pos - 1);
+ $qual .= '!' x ($t[1] - $last_pos - 1);
+ }
+ if ($t[2] eq '*') {
+ push(@gaps, $t[1]) if ($t[5] >= $opts{G});
+ } else {
+ $seq .= ($t[6] >= $_Q && $t[7] >= $_d && $t[7] <= $_D)? uc($t[3]) : lc($t[3]);
+ my $q = $t[4] + 33;
+ $q = 126 if ($q > 126);
+ $qual .= chr($q);
+ }
+ $last_pos = $t[1];
+ }
+ &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
+}
+
+sub p2q_post_process {
+ my ($chr, $seq, $qual, $gaps, $l) = @_;
+ &p2q_filter_gaps($seq, $gaps, $l);
+ print "\@$chr\n"; &p2q_print_str($seq);
+ print "+\n"; &p2q_print_str($qual);
+}
+
+sub p2q_filter_gaps {
+ my ($seq, $gaps, $l) = @_;
+ for my $g (@$gaps) {
+ my $x = $g > $l? $g - $l : 0;
+ substr($$seq, $x, $l + $l) = lc(substr($$seq, $x, $l + $l));
+ }
+}
+
+sub p2q_print_str {
+ # Print the string referenced by $s, wrapped at 60 characters per line.
+ my ($s) = @_;
+ my $end = length($$s);
+ my $off = 0;
+ while ($off < $end) { print substr($$s, $off, 60), "\n"; $off += 60; }
+}
+
+#
+# sam2fq
+#
+
+sub sam2fq {
+ # Distribute SAM records round-robin over $opts{n} gzip-compressed files <prefix>.NNN.fq.gz.
+ my %opts = (n=>20, p=>'');
+ getopts('n:p:', \%opts);
+ die("Usage: samtools.pl sam2fq [-n 20] [-p <prefix>] <inp.sam>\n") if (@ARGV == 0 && -t STDIN);
+ if ($opts{p} && $opts{n} > 1) {
+ my $pre = $opts{p};
+ my @fh;
+ for (0 .. $opts{n}-1) {
+ my $cmd = sprintf("| gzip > %s.%.3d.fq.gz", $pre, $_); # prefix passed as %s so a '%' in it cannot corrupt the format
+ open($fh[$_], $cmd) || die("Cannot run '$cmd': $!\n");
+ }
+ my $i = 0;
+ while (<>) {
+ next if (/^@/);
+ chomp;
+ my @t = split("\t");
+ next if ($t[9] eq '*');
+ my ($name, $seq, $qual);
+ if ($t[1] & 16) { # reverse strand
+ $seq = reverse($t[9]);
+ $qual = reverse($t[10]);
+ $seq =~ tr/ACGTacgt/TGCAtgca/;
+ } else {
+ ($seq, $qual) = @t[9,10];
+ }
+ $name = $t[0];
+ $name .= "/1" if ($t[1] & 0x40);
+ $name .= "/2" if ($t[1] & 0x80);
+ print {$fh[$i]} "\@$name\n$seq\n";
+ print {$fh[$i]} "+\n$qual\n" if ($qual ne '*'); # no quality block when the record has none
+ $i = 0 if (++$i == $opts{n});
+ }
+ close($fh[$_]) for (0 .. $opts{n}-1);
+ } else {
+ die("To be implemented.\n");
+ }
+}
+
+#
+# sra2hdr
+#
+
+# This subroutine does not use an XML parser. It requires that the SRA
+# XML files are properly formatted.
+sub sra2hdr {
+ my %opts = ();
+ getopts('', \%opts);
+ die("Usage: samtools.pl sra2hdr <SRA.prefix>\n") if (@ARGV == 0);
+ my $pre = $ARGV[0];
+ my $fh;
+ # read sample
+ my $sample = 'UNKNOWN';
+ open($fh, "$pre.sample.xml") || die;
+ while (<$fh>) {
+ $sample = $1 if (/<SAMPLE.*alias="([^"]+)"/i);
+ }
+ close($fh);
+ # read experiment
+ my (%exp2lib, $exp);
+ open($fh, "$pre.experiment.xml") || die;
+ while (<$fh>) {
+ if (/<EXPERIMENT.*accession="([^\s"]+)"/i) {
+ $exp = $1;
+ } elsif (/<LIBRARY_NAME>\s*(\S+)\s*<\/LIBRARY_NAME>/i) {
+ $exp2lib{$exp} = $1;
+ }
+ }
+ close($fh);
+ # read run
+ my ($run, @fn);
+ open($fh, "$pre.run.xml") || die;
+ while (<$fh>) {
+ if (/<RUN.*accession="([^\s"]+)"/i) {
+ $run = $1; @fn = ();
+ } elsif (/<EXPERIMENT_REF.*accession="([^\s"]+)"/i) {
+ print "\@RG\tID:$run\tSM:$sample\tLB:$exp2lib{$1}\n";
+ } elsif (/<FILE.*filename="([^\s"]+)"/i) {
+ push(@fn, $1);
+ } elsif (/<\/RUN>/i) {
+ if (@fn == 1) {
+ print STDERR "$fn[0]\t$run\n";
+ } else {
+ for (0 .. $#fn) {
+ print STDERR "$fn[$_]\t$run", "_", $_+1, "\n";
+ }
+ }
+ }
+ }
+ close($fh);
+}
+
+#
+# unique
+#
+
+sub unique {
+ my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3);
+ getopts('Qf:q:r:a:b:m', \%opts);
+ die("Usage: samtools.pl unique [-f $opts{f}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
+ my $last = '';
+ my $recal_Q = !defined($opts{Q});
+ my $multi_only = defined($opts{m});
+ my @a;
+ while (<>) {
+ my $score = -1;
+ print $_ if (/^\@/);
+ $score = $1 if (/AS:i:(\d+)/);
+ my @t = split("\t");
+ next if (@t < 11);
+ if ($score < 0) { # AS tag is unavailable
+ my $cigar = $t[5];
+ my ($mm, $go, $ge) = (0, 0, 0);
+ $cigar =~ s/(\d+)[ID]/++$go,$ge+=$1/eg;
+ $cigar = $t[5];
+ $cigar =~ s/(\d+)M/$mm+=$1/eg;
+ $score = $mm * $opts{a} - $go * $opts{q} - $ge * $opts{r}; # no mismatches...
+ }
+ $score = 1 if ($score < 1);
+ if ($t[0] ne $last) {
+ &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a);
+ $last = $t[0];
+ }
+ push(@a, [$score, \@t]);
+ }
+ &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a);
+}
+
+sub unique_aux {
+ # Print the best-scoring record among the hits in @$a, optionally overwriting its fifth field
+ # with a quality scaled from the gap between the best and second-best score; then empty @$a.
+ my ($a, $fac, $is_recal, $multi_only) = @_;
+ my ($max, $max2, $max_i) = (0, 0, -1);
+ for my $i (0 .. $#$a) {
+ my $s = $a->[$i][0];
+ if ($s > $max) {
+ ($max2, $max, $max_i) = ($max, $s, $i);
+ } elsif ($s > $max2) {
+ $max2 = $s;
+ }
+ }
+ if ($is_recal && (!$multi_only || @$a > 1)) {
+ my $q = int($fac * ($max - $max2) / $max + .499);
+ $a->[$max_i][1][4] = $q > 250? 250 : $q; # capped at 250
+ }
+ print join("\t", @{$a->[$max_i][1]});
+ @$a = ();
+}
+
+#
+# uniqcmp: compare two SAM files
+#
+
+sub uniqcmp {
+ my %opts = (q=>10, s=>100);
+ getopts('pq:s:', \%opts);
+ die("Usage: samtools.pl uniqcmp <in1.sam> <in2.sam>\n") if (@ARGV < 2);
+ my ($fh, %a);
+ warn("[uniqcmp] read the first file...\n");
+ &uniqcmp_aux($ARGV[0], \%a, 0);
+ warn("[uniqcmp] read the second file...\n");
+ &uniqcmp_aux($ARGV[1], \%a, 1);
+ warn("[uniqcmp] stats...\n");
+ my @cnt;
+ $cnt[$_] = 0 for (0..9);
+ for my $x (keys %a) {
+ my $p = $a{$x};
+ my $z;
+ if (defined($p->[0]) && defined($p->[1])) {
+ $z = ($p->[0][0] == $p->[1][0] && $p->[0][1] eq $p->[1][1] && abs($p->[0][2] - $p->[1][2]) < $opts{s})? 0 : 1;
+ if ($p->[0][3] >= $opts{q} && $p->[1][3] >= $opts{q}) {
+ ++$cnt[$z*3+0];
+ } elsif ($p->[0][3] >= $opts{q}) {
+ ++$cnt[$z*3+1];
+ } elsif ($p->[1][3] >= $opts{q}) {
+ ++$cnt[$z*3+2];
+ }
+ print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t",
+ $p->[0][5]-$p->[1][5], "\n" if ($z && defined($opts{p}) && ($p->[0][3] >= $opts{q} || $p->[1][3] >= $opts{q}));
+ } elsif (defined($p->[0])) {
+ ++$cnt[$p->[0][3]>=$opts{q}? 6 : 7];
+ print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t*\t0\t*\t",
+ $p->[0][5], "\n" if (defined($opts{p}) && $p->[0][3] >= $opts{q});
+ } else {
+ print STDERR "$x\t*\t0\t*\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t",
+ -$p->[1][5], "\n" if (defined($opts{p}) && $p->[1][3] >= $opts{q});
+ ++$cnt[$p->[1][3]>=$opts{q}? 8 : 9];
+ }
+ }
+ print "Consistent (high, high): $cnt[0]\n";
+ print "Consistent (high, low ): $cnt[1]\n";
+ print "Consistent (low , high): $cnt[2]\n";
+ print "Inconsistent (high, high): $cnt[3]\n";
+ print "Inconsistent (high, low ): $cnt[4]\n";
+ print "Inconsistent (low , high): $cnt[5]\n";
+ print "Second missing (high): $cnt[6]\n";
+ print "Second missing (low ): $cnt[7]\n";
+ print "First missing (high): $cnt[8]\n";
+ print "First missing (low ): $cnt[9]\n";
+}
+
+sub uniqcmp_aux {
+ my ($fn, $a, $which) = @_;
+ my $fh;
+ $fn = "samtools view $fn |" if ($fn =~ /\.bam/);
+ open($fh, $fn) || die;
+ while (<$fh>) {
+ my @t = split;
+ next if (@t < 11);
+# my $l = ($t[5] =~ /^(\d+)S/)? $1 : 0;
+ my $l = 0;
+ my ($x, $nm) = (0, 0);
+ $nm = $1 if (/NM:i:(\d+)/);
+ $_ = $t[5];
+ s/(\d+)[MI]/$x+=$1/eg;
+ @{$a->{$t[0]}[$which]} = (($t[1]&0x10)? 1 : 0, $t[2], $t[3]-$l, $t[4], "$x:$nm", $x - 4 * $nm);
+ }
+ close($fh);
+}
+
+sub plp2vcf {
+ while (<>) {
+ my @t = split;
+ next if ($t[3] eq '*/*');
+ if ($t[2] eq '*') { # indel
+ my @s = split("/", $t[3]);
+ my (@a, @b);
+ my ($ref, $alt);
+ for (@s) {
+ next if ($_ eq '*');
+ if (/^-/) {
+ push(@a, 'N'.substr($_, 1));
+ push(@b, 'N');
+ } elsif (/^\+/) {
+ push(@a, 'N');
+ push(@b, 'N'.substr($_, 1));
+ }
+ }
+ if ($a[0] && $a[1]) {
+ if (length($a[0]) < length($a[1])) {
+ $ref = $a[1];
+ $alt = ($b[0] . ('N' x (length($a[1]) - length($a[0])))) . ",$b[1]";
+ } elsif (length($a[0]) > length($a[1])) {
+ $ref = $a[0];
+ $alt = ($b[1] . ('N' x (length($a[0]) - length($a[1])))) . ",$b[0]";
+ } else {
+ $ref = $a[0];
+ $alt = ($b[0] eq $b[1])? $b[0] : "$b[0],$b[1]";
+ }
+ } else {
+ $ref = $a[0]; $alt = $b[0];
+ }
+ print join("\t", @t[0,1], '.', $ref, $alt, $t[5], '.', '.'), "\n";
+ } else { # SNP
+ }
+ }
+}
+
+#
+# Usage
+#
+
+sub usage {
+ die(qq/
+Program: samtools.pl (helper script for SAMtools)
+Version: $version
+Contact: Heng Li <lh3\@sanger.ac.uk>\n
+Usage: samtools.pl <command> [<arguments>]\n
+Command: varFilter filtering SNPs and short indels
+ pileup2fq generate fastq from `pileup -c'
+ showALEN print alignment length (ALEN) following CIGAR
+\n/);
+}
diff --git a/samtools-0.1.19/misc/soap2sam.pl b/samtools-0.1.19/misc/soap2sam.pl
new file mode 100755
index 0000000..b37135e
--- /dev/null
+++ b/samtools-0.1.19/misc/soap2sam.pl
@@ -0,0 +1,109 @@
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.1
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&soap2sam;
+exit;
+
+sub mating {
+ my ($s1, $s2) = @_;
+ my $isize = 0;
+ if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
+ my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3];
+ my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
+ $isize = $x2 - $x1;
+ }
+ # update mate coordinate
+ if ($s2->[2] ne '*') {
+ @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
+ $s1->[1] |= 0x20 if ($s2->[1] & 0x10);
+ } else {
+ $s1->[1] |= 0x8;
+ }
+ if ($s1->[2] ne '*') {
+ @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
+ $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
+ } else {
+ $s2->[1] |= 0x8;
+ }
+}
+
+sub soap2sam {
+ my %opts = ();
+ getopts("p", \%opts);
+ die("Usage: soap2sam.pl [-p] <aln.soap>\n") if (@ARGV == 0 && -t STDIN);
+ my $is_paired = defined($opts{p});
+ # core loop
+ my @s1 = ();
+ my @s2 = ();
+ my ($s_last, $s_curr) = (\@s1, \@s2);
+ while (<>) {
+ s/[\177-\377]|[\000-\010]|[\012-\040]//g;
+ next if (&soap2sam_aux($_, $s_curr, $is_paired) < 0);
+ if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
+ &mating($s_last, $s_curr);
+ print join("\t", @$s_last), "\n";
+ print join("\t", @$s_curr), "\n";
+ @$s_last = (); @$s_curr = ();
+ } else {
+ print join("\t", @$s_last), "\n" if (@$s_last != 0);
+ my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
+ }
+ }
+ print join("\t", @$s_last), "\n" if (@$s_last != 0);
+}
+
+sub soap2sam_aux {
+ my ($line, $s, $is_paired) = @_;
+ chomp($line);
+ my @t = split(/\s+/, $line);
+ return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]);
+ @$s = ();
+ # fix SOAP-2.1.x bugs
+ @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/);
+ # read name
+ $s->[0] = $t[0];
+ $s->[0] =~ s/\/[12]$//g;
+ # initial flag (will be updated later)
+ $s->[1] = 0;
+ $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7);
+ $s->[1] |= 2 if ($is_paired);
+ # read & quality
+ $s->[9] = $t[1];
+ $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2];
+ # cigar
+ $s->[5] = length($s->[9]) . "M";
+ # coor
+ $s->[2] = $t[7]; $s->[3] = $t[8];
+ $s->[1] |= 0x10 if ($t[6] eq '-');
+ # mapQ
+ $s->[4] = $t[3] == 1? 30 : 0;
+ # mate coordinate
+ $s->[6] = '*'; $s->[7] = $s->[8] = 0;
+ # aux
+ push(@$s, "NM:i:$t[9]");
+ my $md = '';
+ if ($t[9]) {
+ my @x;
+ for (10 .. $#t) {
+ push(@x, sprintf("%.3d,$1", $2)) if ($t[$_] =~ /^([ACGT])->(\d+)/i);
+ }
+ @x = sort(@x);
+ my $a = 0;
+ for (@x) {
+ my ($y, $z) = split(",");
+ $md .= (int($y)-$a) . $z;
+ $a += $y - $a + 1;
+ }
+ $md .= length($t[1]) - $a;
+ } else {
+ $md = length($t[1]);
+ }
+ push(@$s, "MD:Z:$md");
+ return 0;
+}
diff --git a/samtools-0.1.19/misc/varfilter.py b/samtools-0.1.19/misc/varfilter.py
new file mode 100755
index 0000000..03ce395
--- /dev/null
+++ b/samtools-0.1.19/misc/varfilter.py
@@ -0,0 +1,205 @@
+#!/software/bin/python
+
+# Author: lh3, converted to python and modified to add -C option by Aylwyn Scally
+#
+# About:
+# varfilter.py is a port of Heng's samtools.pl varFilter script into
+# python, with an additional -C INT option. This option sets a minimum
+# consensus score, above which the script will output a pileup line
+# wherever it _could have_ called a variant, even if none is actually
+# called (i.e. hom-ref positions). This is important if you want to
+# subsequently merge the calls with those for another individual to get a
+# synoptic view of calls at each site. Without this option, and in all
+# other respects, it behaves like samtools.pl varFilter.
+#
+# Aylwyn Scally as6 at sanger.ac.uk
+
+
+# Filtration code:
+#
+# C low CNS quality (hom-ref only)
+# d low depth
+# D high depth
+# W too many SNPs in a window (SNP only)
+# G close to a high-quality indel (SNP only)
+# Q low RMS mapping quality (SNP only)
+# g close to another indel with higher quality (indel only)
+# s low SNP quality (SNP only)
+# i low indel quality (indel only)
+
+
+import sys
+import getopt
+
+def usage():
+ print '''usage: varfilter.py [options] [cns-pileup]
+
+Options: -Q INT minimum RMS mapping quality for SNPs
+ -q INT minimum RMS mapping quality for gaps
+ -d INT minimum read depth
+ -D INT maximum read depth
+ -S INT minimum SNP quality
+ -i INT minimum indel quality
+ -C INT minimum consensus quality for hom-ref sites
+
+ -G INT min indel score for nearby SNP filtering
+ -w INT SNP within INT bp around a gap to be filtered
+
+ -W INT window size for filtering dense SNPs
+ -N INT max number of SNPs in a window
+
+ -l INT window size for filtering adjacent gaps
+
+ -p print filtered variants'''
+
+def varFilter_aux(first, is_print):
+ try:
+ if first[1] == 0:
+ sys.stdout.write("\t".join(first[4:]) + "\n")
+ elif is_print:
+ sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n")
+ except IOError:
+ sys.exit()
+
+mindepth = 3
+maxdepth = 100
+gapgapwin = 30
+minsnpmapq = 25
+mingapmapq = 10
+minindelscore = 25
+scorefactor = 100
+snpgapwin = 10
+densesnpwin = 10
+densesnps = 2
+printfilt = False
+minsnpq = 0
+minindelq = 0
+mincnsq = 0
+
+try:
+ options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:', [])
+except getopt.GetoptError:
+ usage()
+ sys.exit(2)
+for (oflag, oarg) in options:
+ if oflag == '-d': mindepth = int(oarg)
+ if oflag == '-D': maxdepth = int(oarg)
+ if oflag == '-l': gapgapwin = int(oarg)
+ if oflag == '-Q': minsnpmapq = int(oarg)
+ if oflag == '-q': mingapmapq = int(oarg)
+ if oflag == '-G': minindelscore = int(oarg)
+ if oflag == '-s': scorefactor = int(oarg)
+ if oflag == '-w': snpgapwin = int(oarg)
+ if oflag == '-W': densesnpwin = int(oarg)
+ if oflag == '-C': mincnsq = int(oarg)
+ if oflag == '-N': densesnps = int(oarg)
+ if oflag == '-p': printfilt = True
+ if oflag == '-S': minsnpq = int(oarg)
+ if oflag == '-i': minindelq = int(oarg)
+
+if len(args) < 1:
+ inp = sys.stdin
+else:
+ inp = open(args[0])
+
+# calculate the window size
+max_dist = max(gapgapwin, snpgapwin, densesnpwin)
+
+staging = []
+for t in (line.strip().split() for line in inp):
+ (flt, score) = (0, -1)
+ # non-var sites
+ if t[3] == '*/*':
+ continue
+ is_snp = t[2].upper() != t[3].upper()
+ if not (is_snp or mincnsq):
+ continue
+ # clear the out-of-range elements
+ while staging:
+ # Still on the same chromosome and the first element's window still affects this position?
+ if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]):
+ break
+ varFilter_aux(staging.pop(0), printfilt)
+
+ # first a simple filter
+ if int(t[7]) < mindepth:
+ flt = 2
+ elif int(t[7]) > maxdepth:
+ flt = 3
+ if t[2] == '*': # an indel
+ if minindelq and minindelq > int(t[5]):
+ flt = 8
+ elif is_snp:
+ if minsnpq and minsnpq> int(t[5]):
+ flt = 7
+ else:
+ if mincnsq and mincnsq > int(t[4]):
+ flt = 9
+
+ # site dependent filters
+ dlen = 0
+ if flt == 0:
+ if t[2] == '*': # an indel
+ # If deletion, remember the length of the deletion
+ (a,b) = t[3].split('/')
+ alen = len(a) - 1
+ blen = len(b) - 1
+ if alen>blen:
+ if a[0] == '-': dlen=alen
+ elif b[0] == '-': dlen=blen
+
+ if int(t[6]) < mingapmapq:
+ flt = 1
+ # filtering SNPs
+ if int(t[5]) >= minindelscore:
+ for x in (y for y in staging if y[3]):
+ # Is it a SNP and is it outside the SNP filter window?
+ if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
+ continue
+ if x[1] == 0:
+ x[1] = 5
+
+ # calculate the filtering score (different from indel quality)
+ score = int(t[5])
+ if t[8] != '*':
+ score += scorefactor * int(t[10])
+ if t[9] != '*':
+ score += scorefactor * int(t[11])
+ # check the staging list for indel filtering
+ for x in (y for y in staging if y[3]):
+ # Is it a SNP and is it outside the gap filter window
+ if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]):
+ continue
+ if x[0] < score:
+ x[1] = 6
+ else:
+ flt = 6
+ break
+ else: # a SNP or hom-ref
+ if int(t[6]) < minsnpmapq:
+ flt = 1
+ # check adjacent SNPs
+ k = 1
+ for x in (y for y in staging if y[3]):
+ if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5):
+ k += 1
+
+ # filtering is necessary
+ if k > densesnps:
+ flt = 4
+ for x in (y for y in staging if y[3]):
+ if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0:
+ x[1] = 4
+ else: # then check gap filter
+ for x in (y for y in staging if y[3]):
+ if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
+ continue
+ if x[0] >= minindelscore:
+ flt = 5
+ break
+
+ staging.append([score, flt, dlen, is_snp] + t)
+
+# output the last few elements in the staging list
+while staging:
+ varFilter_aux(staging.pop(0), printfilt)
diff --git a/samtools-0.1.19/misc/vcfutils.lua b/samtools-0.1.19/misc/vcfutils.lua
new file mode 100755
index 0000000..51d374e
--- /dev/null
+++ b/samtools-0.1.19/misc/vcfutils.lua
@@ -0,0 +1,694 @@
+#!/usr/bin/env luajit
+
+-----------------------------------
+-- BEGIN: routines from klib.lua --
+-----------------------------------
+
+-- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt()
+--[[ Example:
+ for o, a in os.getopt(arg, 'a:b') do
+ print(o, a)
+ end
+]]--
+function os.getopt(args, ostr)
+ local arg, place = nil, 0; -- place: index of the next character to scan in args[1]; 0 = start a new word
+ return function () -- iterator: yields option, argument; consumed words are removed from args
+ if place == 0 then -- update scanning pointer
+ place = 1
+ if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
+ if #args[1] >= 2 then
+ place = place + 1
+ if args[1]:sub(2, 2) == '-' then -- found "--"
+ table.remove(args, 1);
+ place = 0
+ return nil;
+ end
+ end
+ end
+ local optopt = place <= #args[1] and args[1]:sub(place, place) or nil
+ place = place + 1;
+ local oli = optopt and ostr:find(optopt, 1, true) or nil -- plain find: the option letter is not a Lua pattern
+ if optopt == ':' or oli == nil then -- unknown option
+ if optopt == '-' then return nil end
+ if place > #args[1] then
+ table.remove(args, 1);
+ place = 0;
+ end
+ return '?';
+ end
+ oli = oli + 1;
+ if ostr:sub(oli, oli) ~= ':' then -- do not need argument
+ arg = nil;
+ if place > #args[1] then
+ table.remove(args, 1);
+ place = 0;
+ end
+ else -- need an argument
+ if place <= #args[1] then -- no white space
+ arg = args[1]:sub(place);
+ else
+ table.remove(args, 1);
+ if #args == 0 then -- an option requiring argument is the last one
+ place = 0;
+ if ostr:sub(1, 1) == ':' then return ':' end
+ return '?';
+ else arg = args[1] end
+ end
+ table.remove(args, 1);
+ place = 0;
+ end
+ return optopt, arg;
+ end
+end
+
+-- Description: string split
+function string:split(sep, n)
+ local a, start = {}, 1;
+ sep = sep or "%s+";
+ repeat
+ local b, e = self:find(sep, start);
+ if b == nil then
+ table.insert(a, self:sub(start));
+ break
+ end
+ a[#a+1] = self:sub(start, b - 1);
+ start = e + 1;
+ if n and #a == n then
+ table.insert(a, self:sub(start));
+ break
+ end
+ until start > #self;
+ return a;
+end
+
+-- Description: smart file open
+function io.xopen(fn, mode)
+ mode = mode or 'r'; -- default to reading
+ if fn == nil then return io.stdin;
+ elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout; -- '-' selects the standard stream for the mode
+ elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w');
+ elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bzip2 > ' .. fn, 'w'); -- writer was 'bgzip2', which does not exist
+ else return io.open(fn, mode) end
+end
+
+-- Description: log gamma function
+-- Required by: math.lbinom()
+-- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+function math.lgamma(z)
+ local x;
+ x = 0.1659470187408462e-06 / (z+7);
+ x = x + 0.9934937113930748e-05 / (z+6);
+ x = x - 0.1385710331296526 / (z+5);
+ x = x + 12.50734324009056 / (z+4);
+ x = x - 176.6150291498386 / (z+3);
+ x = x + 771.3234287757674 / (z+2);
+ x = x - 1259.139216722289 / (z+1);
+ x = x + 676.5203681218835 / z;
+ x = x + 0.9999999999995183;
+ return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5);
+end
+
+-- Description: regularized incomplete gamma function
+-- Dependent on: math.lgamma()
+--[[
+ Formulas are taken from Wiki, with additional input from Numerical
+ Recipes in C (for modified Lentz's algorithm) and AS245
+ (http://lib.stat.cmu.edu/apstat/245).
+
+ A good online calculator is available at:
+
+ http://www.danielsoper.com/statcalc/calc23.aspx
+
+ It calculates upper incomplete gamma function, which equals
+ math.igamma(s,z,true)*math.exp(math.lgamma(s))
+]]--
+function math.igamma(s, z, complement)
+
+ local function _kf_gammap(s, z)
+ local sum, x = 1, 1;
+ for k = 1, 100 do
+ x = x * z / (s + k);
+ sum = sum + x;
+ if x / sum < 1e-14 then break end
+ end
+ return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum));
+ end
+
+ local function _kf_gammaq(s, z)
+ local C, D, f, TINY;
+ f = 1. + z - s; C = f; D = 0.; TINY = 1e-290;
+ -- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2
+ for j = 1, 100 do
+ local d;
+ local a, b = j * (s - j), j*2 + 1 + z - s;
+ D = b + a * D;
+ if D < TINY then D = TINY end
+ C = b + a / C;
+ if C < TINY then C = TINY end
+ D = 1. / D;
+ d = C * D;
+ f = f * d;
+ if math.abs(d - 1) < 1e-14 then break end
+ end
+ return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f));
+ end
+
+ if complement then
+ return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z);
+ else
+ return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z));
+ end
+end
+
+function math.brent(func, a, b, tol)
+ -- Minimize the one-argument function func: bracket a minimum starting from a and b, then
+ -- refine it with Brent's algorithm (tolerance tol * |x|). Returns f(x_min), x_min.
+ local gold1, gold2, tiny, max_iter = 1.6180339887, 0.3819660113, 1e-20, 100
+ local tmp, u, fu, eold -- scratch values; declared local so they no longer leak as globals
+ local fa, fb = func(a), func(b) -- dropped the stray second argument `data`, an undefined global
+ if fb > fa then -- swap, such that f(a) > f(b)
+ a, b, fa, fb = b, a, fb, fa
+ end
+ local c = b + gold1 * (b - a)
+ local fc = func(c) -- golden section extrapolation
+ while fb > fc do
+ local bound = b + 100.0 * (c - b) -- the farthest point where we want to go
+ local r = (b - a) * (fb - fc)
+ local q = (b - c) * (fb - fa)
+ if math.abs(q - r) < tiny then -- avoid 0 denominator
+ tmp = q > r and tiny or 0.0 - tiny
+ else tmp = q - r end
+ u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp) -- u is the parabolic extrapolation point
+ if (b > u and u > c) or (b < u and u < c) then -- u lies between b and c
+ fu = func(u)
+ if fu < fc then -- (b,u,c) bracket the minimum
+ a, b, fa, fb = b, u, fb, fu
+ break
+ elseif fu > fb then -- (a,b,u) bracket the minimum
+ c, fc = u, fu
+ break
+ end
+ u = c + gold1 * (c - b)
+ fu = func(u) -- golden section extrapolation
+ elseif (c > u and u > bound) or (c < u and u < bound) then -- u lies between c and bound
+ fu = func(u)
+ if fu < fc then -- fb > fc > fu
+ b, c = c, u; u = c + gold1 * (c - b) -- sequential: the new u must be built from the shifted b and c
+ fb, fc, fu = fc, fu, func(u)
+ else -- (b,c,u) bracket the minimum
+ a, b, c = b, c, u
+ fa, fb, fc = fb, fc, fu
+ break
+ end
+ elseif (u > bound and bound > c) or (u < bound and bound < c) then -- u goes beyond the bound
+ u = bound
+ fu = func(u)
+ else -- u goes the other way around, use golden section extrapolation
+ u = c + gold1 * (c - b)
+ fu = func(u)
+ end
+ a, b, c = b, c, u
+ fa, fb, fc = fb, fc, fu
+ end
+ if a > c then a, c = c, a end -- swap
+
+ -- now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
+ local e, d = 0, 0
+ local w, v, fw, fv = b, b, fb, fb
+ for iter = 1, max_iter do
+ local mid = 0.5 * (a + c)
+ local tol1 = tol * math.abs(b) + tiny
+ local tol2 = 2.0 * tol1
+ if math.abs(b - mid) <= tol2 - 0.5 * (c - a) then return fb, b end -- found
+ if math.abs(e) > tol1 then
+ -- related to parabolic interpolation
+ local r = (b - w) * (fb - fv)
+ local q = (b - v) * (fb - fw)
+ local p = (b - v) * q - (b - w) * r
+ q = 2.0 * (q - r)
+ if q > 0.0 then p = 0.0 - p
+ else q = 0.0 - q end
+ eold, e = e, d
+ if math.abs(p) >= math.abs(0.5 * q * eold) or p <= q * (a - b) or p >= q * (c - b) then
+ e = b >= mid and a - b or c - b
+ d = gold2 * e
+ else
+ d = p / q; u = b + d -- actual parabolic interpolation happens here; u must use the new d
+ if u - a < tol2 or c - u < tol2 then
+ d = mid > b and tol1 or 0.0 - tol1
+ end
+ end
+ else -- golden section interpolation
+ e = b >= mid and a - b or c - b -- was `min`, an undefined global
+ d = gold2 * e
+ end
+ u = math.abs(d) >= tol1 and b + d or b + (d > 0.0 and tol1 or -tol1); -- was `fabs`, which Lua does not define
+ fu = func(u)
+ if fu <= fb then -- u is the minimum point so far
+ if u >= b then a = b
+ else c = b end
+ v, w, b = w, b, u
+ fv, fw, fb = fw, fb, fu
+ else -- adjust (a,c) and (u,v,w)
+ if u < b then a = u
+ else c = u end
+ if fu <= fw or w == b then
+ v, w = w, u
+ fv, fw = fw, fu
+ elseif fu <= fv or v == b or v == w then
+ v, fv = u, fu;
+ end
+ end
+ end
+ return fb, b
+end
+
+matrix = {}
+
+-- Description: chi^2 test for contingency tables
+-- Dependent on: math.igamma()
+function matrix.chi2(a)
+ if #a == 2 and #a[1] == 2 then -- 2x2 table
+ local x, z
+ x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2])
+ if x == 0 then return 0, 1, false end
+ z = a[1][1] * a[2][2] - a[1][2] * a[2][1]
+ z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x
+ return z, math.igamma(.5, .5 * z, true), true
+ else -- generic table
+ local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0
+ for i = 1, n do rs[i] = 0 end
+ for j = 1, m do cs[j] = 0 end
+ for i = 1, n do -- compute column sum and row sum
+ for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end
+ end
+ for i = 1, n do N = N + rs[i] end
+ for i = 1, n do -- compute the chi^2 statistics
+ for j = 1, m do
+ local E = rs[i] * cs[j] / N;
+ z = z + (a[i][j] - E) * (a[i][j] - E) / E
+ end
+ end
+ return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true;
+ end
+end
+
+---------------------------------
+-- END: routines from klib.lua --
+---------------------------------
+
+
+--------------------------
+-- BEGIN: misc routines --
+--------------------------
+
+-- precompute an array for PL->probability conversion: q2p[i] = 10^(-i/10)
+-- @param m maximum PL; the returned table is indexed 0..m
+function algo_init_q2p(m)
+ local q2p = {}
+ for i = 0, m do
+ q2p[i] = 10 ^ (-i / 10) -- '^' rather than math.pow(), which newer Lua versions no longer provide by default
+ end
+ return q2p
+end
+
+-- r^2 (squared correlation) between two biallelic loci, from haplotype frequencies
+-- @param f 4 haplotype frequencies; f[] is 0-indexed.
+-- @return r^2, or 0 when either locus has a marginal frequency of exactly 0 or 1
+function algo_r2(f)
+ local pa, pb, ld = f[0] + f[1], f[0] + f[2], f[0] * f[3] - f[1] * f[2]
+ if pa == 0 or pb == 0 or 1 - pa == 0 or 1 - pb == 0 then return 0 end -- denominator would be zero
+ return ld * ld / (pa * pb * (1 - pa) * (1 - pb))
+end
+
+-- parse a VCF line (t: array of its TAB-separated fields) to get PL as probabilities and, optionally, GT
+-- @param q2p is computed by algo_init_q2p(); @param parse_GT defaults to true; @return { t[1], t[2], ht, gt, pl }
+function text_parse_pl(t, q2p, parse_GT)
+ if parse_GT == nil then parse_GT = true end -- default only when omitted; an explicit true/false is honoured
+ local ht, gt, pl = {}, {}, {} -- ht: 2 alleles per sample (-1 if missing); gt and pl: 3 values per sample
+ local s, j0 = t[9]:split(':'), 0 -- j0: 1-based position of PL within FORMAT, 0 if absent
+ for j = 1, #s do
+ if s[j] == 'PL' then j0 = j break end
+ end
+ local has_GT = (s[1] == 'GT' and parse_GT) and true or false
+ for i = 10, #t do -- one iteration per sample column
+ if j0 > 0 then
+ local s = t[i]:split(':')
+ local a, b = 1, s[j0]:find(',') -- a..b-1 spans the current comma-separated PL value
+ pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))]
+ a, b = b + 1, s[j0]:find(',', b + 1)
+ pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))]
+ a, b = b + 1, s[j0]:find(',', b + 1)
+ pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, (b and b - 1) or nil))] -- third value runs to the end unless more values follow
+ end
+ if has_GT then
+ if t[i]:sub(1, 1) ~= '.' then
+ local g = tonumber(t[i]:sub(1, 1)) + tonumber(t[i]:sub(3, 3)); -- sum of the two single-digit allele codes
+ gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6
+ gt[#gt - 2 + g] = 1 -- the called genotype gets weight 1, the other two keep 1e-6
+ ht[#ht+1] = tonumber(t[i]:sub(1, 1)); ht[#ht+1] = tonumber(t[i]:sub(3, 3));
+ else -- missing genotype: flat weights, alleles marked -1
+ gt[#gt+1] = 1; gt[#gt+1] = 1; gt[#gt+1] = 1
+ ht[#ht+1] = -1; ht[#ht+1] = -1;
+ end
+ end
+-- print(t[i], pl[#pl-2], pl[#pl-1], pl[#pl], gt[#gt-2], gt[#gt-1], gt[#gt])
+ end
+ if #pl == 0 then pl = nil end
+ local x = has_GT and { t[1], t[2], ht, gt, pl } or { t[1], t[2], nil, nil, pl }
+ return x
+end
+
+-- Infer 2-locus haplotype frequencies by iterative re-estimation (at most 100 rounds, or until the largest change is below eps)
+-- @param pdg genotype likelihoods P(D|g) generated by text_parse_pl(). pdg[] is 1-indexed.
+-- @param eps precision [1e-5]
+-- @return 2-locus haplotype frequencies, 0-indexed array
+function algo_hapfreq2(pdg, eps)
+ eps = eps or 1e-5
+ local n, f = #pdg[1] / 3, {[0]=0.25, 0.25, 0.25, 0.25} -- n: number of samples (3 likelihoods each); f starts uniform
+ for iter = 1, 100 do
+ local F = {[0]=0, 0, 0, 0} -- re-estimated frequencies, accumulated over samples
+ for i = 0, n - 1 do
+ local p1, p2 = {[0]=pdg[1][i*3+1], pdg[1][i*3+2], pdg[1][i*3+3]}, {[0]=pdg[2][i*3+1], pdg[2][i*3+2], pdg[2][i*3+3]} -- 0-indexed likelihood triples of sample i at locus 1 and locus 2
+ local u = { [0]= -- u[k]: unnormalized weight of haplotype k for this sample under the current f
+ f[0] * (f[0] * p1[0] * p2[0] + f[1] * p1[0] * p2[1] + f[2] * p1[1] * p2[0] + f[3] * p1[1] * p2[1]),
+ f[1] * (f[0] * p1[0] * p2[1] + f[1] * p1[0] * p2[2] + f[2] * p1[1] * p2[1] + f[3] * p1[1] * p2[2]),
+ f[2] * (f[0] * p1[1] * p2[0] + f[1] * p1[1] * p2[1] + f[2] * p1[2] * p2[0] + f[3] * p1[2] * p2[1]),
+ f[3] * (f[0] * p1[1] * p2[1] + f[1] * p1[1] * p2[2] + f[2] * p1[2] * p2[1] + f[3] * p1[2] * p2[2])
+ }
+ local s = u[0] + u[1] + u[2] + u[3]
+ s = 1 / (s * n) -- normalizes the sample's weights to sum to 1 and averages over the n samples
+ F[0] = F[0] + u[0] * s
+ F[1] = F[1] + u[1] * s
+ F[2] = F[2] + u[2] * s
+ F[3] = F[3] + u[3] * s
+ end
+ local e = 0 -- largest absolute change of any frequency in this round
+ for k = 0, 3 do
+ e = math.abs(f[k] - F[k]) > e and math.abs(f[k] - F[k]) or e
+ end
+ for k = 0, 3 do f[k] = F[k] end
+ if e < eps then break end
+-- print(f[0], f[1], f[2], f[3])
+ end
+ return f
+end
+
+------------------------
+-- END: misc routines --
+------------------------
+
+
+---------------------
+-- BEGIN: commands --
+---------------------
+
+-- CMD vcf2bgl: convert PL tagged VCF to Beagle input (biallelic SNPs only; three likelihood columns per sample) --
+function cmd_vcf2bgl()
+ if #arg == 0 then
+ print("\nUsage: vcf2bgl.lua <in.vcf>")
+ print("\nNB: This command finds PL by matching /(\\d+),(\\d+),(\\d+)/.\n");
+ os.exit(1)
+ end
+
+ local lookup = {} -- lookup[PL] = 10^(-PL/10), pre-formatted with 4 decimals
+ for i = 0, 10000 do lookup[i] = string.format("%.4f", 10 ^ (-i/10)) end -- '^' rather than math.pow(), which newer Lua versions no longer provide by default
+
+ local fp = io.xopen(arg[1])
+ for l in fp:lines() do
+ if l:sub(1, 2) == '##' then -- meta lines; do nothing
+ elseif l:sub(1, 1) == '#' then -- sample lines
+ local t, s = l:split('\t'), {}
+ for i = 10, #t do s[#s+1] = t[i]; s[#s+1] = t[i]; s[#s+1] = t[i] end -- each sample name is repeated for its three columns
+ print('marker', 'alleleA', 'alleleB', table.concat(s, '\t'))
+ else -- data line
+ local t = l:split('\t');
+ if t[5] ~= '.' and t[5]:find(",") == nil and #t[5] == 1 and #t[4] == 1 then -- biallelic SNP
+ local x, z = -1, {};
+ if t[9]:find('PL') then
+ for i = 10, #t do
+ local AA, Aa, aa = t[i]:match('(%d+),(%d+),(%d+)') -- first run of three comma-separated integers in the sample field
+ AA = tonumber(AA); Aa = tonumber(Aa); aa = tonumber(aa);
+ if AA ~= nil then
+ z[#z+1] = lookup[AA]; z[#z+1] = lookup[Aa]; z[#z+1] = lookup[aa];
+ else z[#z+1] = 1; z[#z+1] = 1; z[#z+1] = 1; end -- no PL found: flat likelihoods
+ end
+ print(t[1]..':'..t[2], t[4], t[5], table.concat(z, '\t'))
+ elseif t[9]:find('GL') then
+ print('Error: not implemented')
+ os.exit(1)
+ end
+ end
+ end
+ end
+ fp:close()
+end
+
+-- CMD bgl2vcf: convert Beagle output (a .phased file plus the matching .gprobs file) to VCF with phased GT
+function cmd_bgl2vcf()
+ if #arg < 2 then
+ print('Usage: bgl2vcf.lua <in.phased> <in.gprobs>')
+ os.exit(1)
+ end
+
+ local fpp = io.xopen(arg[1]);
+ local fpg = io.xopen(arg[2]);
+ for lg in fpg:lines() do -- the two files are consumed in lock-step: one line of each per iteration
+ local tp, tg, a = fpp:read():split('%s'), lg:split('%s', 4), {} -- tp: fields of the .phased line; tg: leading fields of the .gprobs line
+ if tp[1] == 'I' then -- 'I' line: every other column from the 3rd is printed as a sample name
+ for i = 3, #tp, 2 do a[#a+1] = tp[i] end
+ print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', table.concat(a, '\t'))
+ else
+ local chr, pos = tg[1]:match('(%S+):(%d+)$') -- marker is expected in the form chr:pos
+ a = {chr, pos, '.', tg[2], tg[3], 30, '.', '.', 'GT'} -- tg[2], tg[3] are written as REF and ALT; QUAL is fixed at 30
+ for i = 3, #tp, 2 do -- two consecutive columns per sample
+ a[#a+1] = ((tp[i] == tg[2] and 0) or 1) .. '|' .. ((tp[i+1] == tg[2] and 0) or 1) -- 0 if the allele equals tg[2], else 1
+ end
+ print(table.concat(a, '\t'))
+ end
+ end
+ fpg:close(); fpp:close();
+end
+
+-- CMD freq: count alleles in each population (per site by default; allele frequency spectrum with -s)
+function cmd_freq()
+ -- parse the command line
+ local site_only = true; -- print site allele frequency or not
+ for c in os.getopt(arg, 's') do
+ if c == 's' then site_only = false end
+ end
+ if #arg == 0 then
+ print("\nUsage: vcfutils.lua freq [-s] <in.vcf> [samples.txt]\n")
+ print("NB: 1) This command only considers biallelic variants.")
+ print(" 2) Apply '-s' to get the allele frequency spectrum.")
+ print(" 3) 'samples.txt' is TAB-delimited with each line consisting of sample and population.")
+ print("")
+ os.exit(1)
+ end
+
+ -- read the sample-population pairs
+ local pop, sample = {}, {} -- pop[population] = list of samples; sample[name] = population
+ if #arg > 1 then
+ local fp = io.xopen(arg[2]);
+ for l in fp:lines() do
+ local s, p = l:match("^(%S+)%s+(%S+)"); -- sample, population pair
+ sample[s] = p; -- FIXME: check duplications
+ if pop[p] then table.insert(pop[p], s)
+ else pop[p] = {s} end
+ end
+ fp:close();
+ end
+ pop['NA'] = {} -- catch-all population for samples without an assignment
+
+ -- parse VCF
+ local fp = (#arg >= 1 and io.xopen(arg[1])) or io.stdin; -- open <in.vcf> even when samples.txt is omitted
+ local col, cnt = {}, {}; -- col[pop] = VCF column indices; cnt[pop][ac] = number of sites with allele count ac
+ for k in pairs(pop) do
+ col[k], cnt[k] = {}, {[0]=0};
+ end
+ for l in fp:lines() do
+ if l:sub(1, 2) == '##' then -- meta lines; do nothing
+ elseif l:sub(1, 1) == '#' then -- the sample line
+ local t, del_NA = l:split('\t'), true;
+ for i = 10, #t do
+ local k = sample[t[i]]
+ if k == nil then
+ k, del_NA = 'NA', false
+ table.insert(pop[k], t[i])
+ end
+ table.insert(col[k], i);
+ table.insert(cnt[k], 0); -- two more spectrum bins per sample, so #cnt[k] == 2 * number of samples
+ table.insert(cnt[k], 0);
+ end
+ if del_NA then pop['NA'], col['NA'], cnt['NA'] = nil, nil, nil end
+ else -- data lines
+ local t = l:split('\t');
+ if t[5] ~= '.' and t[5]:find(",") == nil then -- biallelic
+ if site_only == true then io.write(t[1], '\t', t[2], '\t', t[4], '\t', t[5]) end
+ for k, v in pairs(col) do
+ local ac, an = 0, 0; -- ac: sum of allele codes; an: number of called alleles
+ for i = 1, #v do
+ local a1, a2 = t[v[i]]:match("^(%d).(%d)");
+ if a1 ~= nil then ac, an = ac + a1 + a2, an + 2 end
+ end
+ if site_only == true then io.write('\t', k, ':', an, ':', ac) end
+ if an == #cnt[k] then cnt[k][ac] = cnt[k][ac] + 1 end -- only fully-called sites enter the spectrum
+ end
+ if site_only == true then io.write('\n') end
+ end
+ end
+ end
+ fp:close();
+
+ -- print
+ if site_only == false then
+ for k, v in pairs(cnt) do
+ io.write(k .. "\t" .. #v);
+ for i = 0, #v do io.write("\t" .. v[i]) end
+ io.write('\n');
+ end
+ end
+end
+
+function cmd_vcf2chi2() -- CMD vcf2chi2: per biallelic site, chi^2 test of allele counts between two sample groups
+ if #arg < 3 then
+ print("Usage: vcfutils.lua vcf2chi2 <in.vcf> <group1.list> <group2.list>");
+ os.exit(1)
+ end
+
+ local g = {}; -- g[sample name] = 1 or 2 (group membership)
+
+ -- read the list of groups
+ local fp = io.xopen(arg[2]);
+ for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 1 end -- FIXME: check duplicate
+ fp:close()
+ fp = io.xopen(arg[3]);
+ for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 2 end
+ fp:close()
+
+ -- process VCF
+ fp = io.xopen(arg[1])
+ local h = {{}, {}} -- h[group] = list of VCF column indices
+ for l in fp:lines() do
+ if l:sub(1, 2) == '##' then print(l) -- meta lines; echoed unchanged
+ elseif l:sub(1, 1) == '#' then -- sample lines
+ local t = l:split('\t');
+ for i = 10, #t do
+ if g[t[i]] == 1 then table.insert(h[1], i)
+ elseif g[t[i]] == 2 then table.insert(h[2], i) end
+ end
+ while #t > 8 do table.remove(t) end -- keep the first 8 columns only (no FORMAT/sample columns in the output)
+ print(table.concat(t, "\t"))
+ else -- data line
+ local t = l:split('\t');
+ if t[5] ~= '.' and t[5]:find(",") == nil then -- biallelic
+ local a = {{0, 0}, {0, 0}} -- a[group] = { count of allele 0, count of allele 1 }
+ for i = 1, 2 do
+ for _, k in pairs(h[i]) do
+ if t[k]:find("^0.0") then a[i][1] = a[i][1] + 2
+ elseif t[k]:find("^1.1") then a[i][2] = a[i][2] + 2
+ elseif t[k]:find("^0.1") or t[k]:find("^1.0") then
+ a[i][1], a[i][2] = a[i][1] + 1, a[i][2] + 1
+ end
+ end
+ end
+ local chi2, p, succ = matrix.chi2(a);
+ while #t > 8 do table.remove(t) end
+ --print(a[1][1], a[1][2], a[2][1], a[2][2], chi2, p);
+ if succ then print(table.concat(t, "\t") .. ";PCHI2=" .. string.format("%.3g", p) -- appended to the 8th (INFO) column
+ .. string.format(';AF1=%.4g;AF2=%.4g,%.4g', (a[1][2]+a[2][2]) / (a[1][1]+a[1][2]+a[2][1]+a[2][2]),
+ a[1][2]/(a[1][1]+a[1][2]), a[2][2]/(a[2][1]+a[2][2])))
+ else print(table.concat(t, "\t")) end -- test failed (a zero marginal): site printed without annotations
+ end
+ end
+ end
+ fp:close()
+end
+
+-- CMD r2: compute r^2 between each biallelic site and up to w preceding ones (from PL, unphased GT with -g, or phased GT with -h)
+function cmd_r2()
+ local w, is_ht, is_gt = 1, false, false
+ for o, a in os.getopt(arg, 'w:hg') do
+ if o == 'w' then w = tonumber(a)
+ elseif o == 'h' then is_ht, is_gt = true, true
+ elseif o == 'g' then is_gt = true
+ end
+ end
+ if #arg == 0 then
+ print("Usage: vcfutils.lua r2 [-hg] [-w 1] <in.vcf>")
+ os.exit(1)
+ end
+ local stack, fp, q2p = {}, io.xopen(arg[1]), algo_init_q2p(1023) -- stack: the last (at most w) parsed biallelic sites
+ for l in fp:lines() do
+ if l:sub(1, 1) ~= '#' then
+ local t = l:split('\t')
+ local x = text_parse_pl(t, q2p) -- x = { chrom, pos, ht, gt, pl }
+ if #t[5] == 1 and t[5] ~= '.' then -- biallelic
+ local r2 = {}
+ for k = 1, w do
+ if is_gt == false then -- use PL
+ if stack[k] then
+ local pdg = { stack[k][5], x[5] }
+ r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
+ else r2[#r2+1] = 0 end
+ elseif is_ht == false then -- use unphased GT
+ if stack[k] then
+ local pdg = { stack[k][4], x[4] }
+ r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
+ else r2[#r2+1] = 0 end
+ else -- use phased GT
+ if stack[k] then
+ local f, ht = { [0]=0, 0, 0, 0 }, { stack[k][3], x[3] } -- f: haplotype counts, index = allele at site 1 * 2 + allele at site 2
+ for i = 1, #ht[1] do
+ local h1, h2 = ht[1][i], ht[2][i]
+ if h1 >= 0 and h2 >= 0 then f[h1 * 2 + h2] = f[h1 * 2 + h2] + 1 end -- skip haplotypes missing (-1) at either site
+ end
+ local sum = f[0] + f[1] + f[2] + f[3]
+ if sum > 0 then for q = 0, 3 do f[q] = f[q] / sum end end -- nothing counted: leave f at 0 so algo_r2() returns 0
+ r2[#r2+1] = algo_r2(f)
+ else r2[#r2+1] = 0 end
+ end
+ end
+ for k = 1, #r2 do
+ r2[k] = string.format('%.3f', r2[k])
+ end
+ print(x[1], x[2], table.concat(r2, '\t'))
+ if #stack == w then table.remove(stack, 1) end -- drop the oldest site once the window is full
+ stack[#stack+1] = x
+ end
+ end
+ end
+ fp:close()
+end
+
+-------------------
+-- END: commands --
+-------------------
+
+
+-------------------
+-- MAIN FUNCTION --
+-------------------
+
+if #arg == 0 then -- no command given: print the command summary and fail
+ print("\nUsage: vcfutils.lua <command> <arguments>\n")
+ print("Command: freq count biallelic alleles in each population")
+ print(" r2 compute r^2")
+ print(" vcf2chi2 compute 1-degree chi-square between two groups of samples")
+ print(" vcf2bgl convert PL annotated VCF to Beagle input")
+ print(" bgl2vcf convert Beagle output to VCF")
+ print("")
+ os.exit(1)
+end
+
+local cmd = arg[1]
+table.remove(arg, 1) -- shift the command name off so each cmd_*() sees only its own arguments
+if cmd == 'vcf2bgl' then cmd_vcf2bgl()
+elseif cmd == 'bgl2vcf' then cmd_bgl2vcf()
+elseif cmd == 'freq' then cmd_freq()
+elseif cmd == 'r2' then cmd_r2()
+elseif cmd == 'vcf2chi2' then cmd_vcf2chi2()
+else
+ print('ERROR: unknown command "' .. cmd .. '"')
+ os.exit(1)
+end
diff --git a/samtools-0.1.19/misc/wgsim.c b/samtools-0.1.19/misc/wgsim.c
new file mode 100644
index 0000000..b9c513c
--- /dev/null
+++ b/samtools-0.1.19/misc/wgsim.c
@@ -0,0 +1,419 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+ 2011 Heng Li <lh3 at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* This program is separated from maq's read simulator with Colin
+ * Hercus' modification to allow longer indels. */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include <zlib.h>
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+#define PACKAGE_VERSION "0.3.0"
+
+const uint8_t nst_nt4_table[256] = { // byte -> base code: A/a=0, C/c=1, G/g=2, T/t=3, '-'=5, anything else=4
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 0x40: 'A', 'C', 'G'
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x50: 'T'
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 0x60: 'a', 'c', 'g'
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x70: 't'
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+/* Simple normal random number generator, copied from genran.c: polar (Marsaglia)
+ * method; every accepted pair gives two deviates and the second is cached. */
+double ran_normal()
+{
+ static int have_spare = 0;
+ static double spare;
+ double x, y, r2, scale;
+ if (have_spare) { // hand out the value saved by the previous call
+ have_spare = 0;
+ return spare;
+ }
+ for (;;) { // rejection-sample a point inside the unit circle, excluding the origin
+ x = 2.0 * drand48() - 1.0;
+ y = 2.0 * drand48() - 1.0;
+ r2 = x * x + y * y;
+ if (r2 < 1.0 && r2 != 0.0) break;
+ }
+ scale = sqrt(-2.0 * log(r2) / r2);
+ spare = x * scale;
+ have_spare = 1;
+ return y * scale;
+}
+
+/* wgsim */
+
+enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; // mutation tag in the top 4 bits of a mut_t; an insertion of n bases is stored as n<<12
+typedef unsigned short mut_t; // one reference position: tag | (inserted bases << 4) | base code
+static mut_t mutmsk = (mut_t)0xf000; // selects the mutation tag
+
+typedef struct {
+ int l, m; /* length and maximum buffer size */
+ mut_t *s; /* sequence */
+} mutseq_t;
+
+static double ERR_RATE = 0.02; // per-base sequencing error probability (option -e)
+static double MUT_RATE = 0.001; // per-base mutation probability (option -r)
+static double INDEL_FRAC = 0.15; // fraction of mutations that are indels (option -R)
+static double INDEL_EXTEND = 0.3; // probability that an indel is extended by one more base (option -X)
+static double MAX_N_RATIO = 0.1; // a read pair is re-drawn when a read's fraction of ambiguous bases exceeds this
+// Build two mutated haplotypes of ks in hap1/hap2 (both identical when is_hap); each ->s is calloc'ed here and is freed by the caller.
+void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
+{
+ int i, deleting = 0; // deleting: bit 0 / bit 1 set while a deletion is being extended on hap1 / hap2
+ mutseq_t *ret[2];
+
+ ret[0] = hap1; ret[1] = hap2;
+ ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
+ ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
+ ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
+ ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
+ for (i = 0; i != ks->seq.l; ++i) {
+ int c;
+ c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(uint8_t)ks->seq.s[i]]; // index by unsigned byte: a signed char >= 0x80 would read before the table
+ if (deleting) {
+ if (drand48() < INDEL_EXTEND) { // extend the running deletion over this base
+ if (deleting & 1) ret[0]->s[i] |= DELETE;
+ if (deleting & 2) ret[1]->s[i] |= DELETE;
+ continue;
+ } else deleting = 0;
+ }
+ if (c < 4 && drand48() < MUT_RATE) { // mutation
+ if (drand48() >= INDEL_FRAC) { // substitution
+ double r = drand48();
+ c = (c + (int)(r * 3.0 + 1)) & 3; // one of the three other bases
+ if (is_hap || drand48() < 0.333333) { // hom
+ ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
+ } else { // het
+ ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
+ }
+ } else { // indel
+ if (drand48() < 0.5) { // deletion
+ if (is_hap || drand48() < 0.333333) { // hom-del
+ ret[0]->s[i] = ret[1]->s[i] = DELETE;
+ deleting = 3;
+ } else { // het-del
+ deleting = drand48()<0.5?1:2;
+ ret[deleting-1]->s[i] = DELETE;
+ }
+ } else { // insertion
+ int num_ins = 0, ins = 0; // up to 4 inserted bases, packed 2 bits each
+ do {
+ num_ins++;
+ ins = (ins << 2) | (int)(drand48() * 4.0);
+ } while (num_ins < 4 && drand48() < INDEL_EXTEND);
+
+ if (is_hap || drand48() < 0.333333) { // hom-ins
+ ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
+ } else { // het-ins
+ ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
+ }
+ }
+ }
+ }
+ }
+}
+void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) // print one stdout line per mutated position: name, 1-based pos, ref, alt, '-' (hom) or '+' (het)
+{
+ int i;
+ for (i = 0; i != ks->seq.l; ++i) {
+ int c[3]; // c[0]: reference base code; c[1], c[2]: encoded position on hap1, hap2
+ c[0] = nst_nt4_table[(uint8_t)ks->seq.s[i]]; // index by unsigned byte: a signed char >= 0x80 would read before the table
+ c[1] = hap1->s[i]; c[2] = hap2->s[i];
+ if (c[0] >= 4) continue;
+ if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
+ printf("%s\t%d\t", name, i+1);
+ if (c[1] == c[2]) { // hom
+ if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+ printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+ } else if ((c[1]&mutmsk) == DELETE) { // del
+ printf("%c\t-\t-\n", "ACGTN"[c[0]]);
+ } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
+ printf("-\t");
+ int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; // n inserted bases, 2 bits each, lowest bits first
+ while (n > 0) {
+ putchar("ACGTN"[ins & 0x3]);
+ ins >>= 2;
+ n--;
+ }
+ printf("\t-\n");
+ } else assert(0);
+ } else { // het
+ if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+ printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
+ } else if ((c[1]&mutmsk) == DELETE) {
+ printf("%c\t-\t+\n", "ACGTN"[c[0]]);
+ } else if ((c[2]&mutmsk) == DELETE) {
+ printf("%c\t-\t+\n", "ACGTN"[c[0]]);
+ } else if ((c[1] & mutmsk) != NOCHANGE && ((c[1] & mutmsk) >> 12) <= 4) { // ins1; an unmutated hap1 (tag 0) must fall through to ins2
+ printf("-\t");
+ int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
+ while (n > 0) {
+ putchar("ACGTN"[ins & 0x3]);
+ ins >>= 2;
+ n--;
+ }
+ printf("\t+\n");
+ } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
+ printf("-\t");
+ int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
+ while (n > 0) {
+ putchar("ACGTN"[ins & 0x3]);
+ ins >>= 2;
+ n--;
+ }
+ printf("\t+\n");
+ } else assert(0);
+ }
+ }
+ }
+}
+// Simulate N read pairs from the FASTA file fn (shared out by sequence length) and write them as FASTQ to fpout1/fpout2; mutations go to stdout.
+void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
+{
+ kseq_t *ks;
+ mutseq_t rseq[2];
+ gzFile fp_fa;
+ uint64_t tot_len, ii;
+ int i, l, n_ref;
+ char *qstr;
+ int size[2], Q, max_size;
+ uint8_t *tmp_seq[2];
+ mut_t *target;
+
+ l = size_l > size_r? size_l : size_r;
+ qstr = (char*)calloc(l+1, 1);
+ tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
+ tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
+ size[0] = size_l; size[1] = size_r;
+ max_size = size_l > size_r? size_l : size_r;
+
+ Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; // constant quality character derived from the error rate
+
+ fp_fa = gzopen(fn, "r"); // first pass: total reference length
+ ks = kseq_init(fp_fa);
+ tot_len = n_ref = 0;
+ fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
+ while ((l = kseq_read(ks)) >= 0) {
+ tot_len += l;
+ ++n_ref;
+ }
+ fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (unsigned long long)tot_len);
+ kseq_destroy(ks);
+ gzclose(fp_fa);
+
+ fp_fa = gzopen(fn, "r"); // second pass: simulate reads sequence by sequence
+ ks = kseq_init(fp_fa);
+ while ((l = kseq_read(ks)) >= 0) {
+ uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
+ if (l < dist + 3 * std_dev || l < max_size) { // fragments are at least max_size long: on a shorter sequence the position loop below would never terminate
+ fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev > max_size? dist + 3 * std_dev : max_size);
+ continue;
+ }
+
+ // generate mutations and print them out
+ wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
+ wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
+
+ for (ii = 0; ii != n_pairs; ++ii) { // the core loop
+ double ran;
+ int d, pos, s[2], is_flip = 0;
+ int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
+ FILE *fpo[2];
+
+ do { // avoid boundary failure
+ ran = ran_normal();
+ ran = ran * std_dev + dist;
+ d = (int)(ran + 0.5);
+ d = d > max_size? d : max_size;
+ pos = (int)((l - d + 1) * drand48());
+ } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
+
+ // flip or not
+ if (drand48() < 0.5) {
+ fpo[0] = fpout1; fpo[1] = fpout2;
+ s[0] = size[0]; s[1] = size[1];
+ } else {
+ fpo[1] = fpout1; fpo[0] = fpout2;
+ s[1] = size[0]; s[0] = size[1];
+ is_flip = 1;
+ }
+
+ // generate the read sequences
+ target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
+ n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
+
+#define __gen_read(x, start, iter) do { \
+ for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \
+ int c = target[i], mut_type = c & mutmsk; \
+ if (ext_coor[x] < 0) { \
+ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
+ ext_coor[x] = i; \
+ } \
+ if (mut_type == DELETE) ++n_indel[x]; \
+ else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
+ tmp_seq[x][k++] = c & 0xf; \
+ if (mut_type == SUBSTITUTE) ++n_sub[x]; \
+ } else { \
+ int n, ins; \
+ ++n_indel[x]; \
+ tmp_seq[x][k++] = c & 0xf; \
+ for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
+ tmp_seq[x][k++] = ins & 0x3; \
+ } \
+ } \
+ if (k != s[x]) ext_coor[x] = -10; \
+ } while (0)
+
+ __gen_read(0, pos, ++i);
+ __gen_read(1, pos + d - 1, --i);
+ for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
+ if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
+ --ii;
+ continue;
+ }
+
+ // generate sequencing errors
+ for (j = 0; j < 2; ++j) {
+ int n_n = 0;
+ for (i = 0; i < s[j]; ++i) {
+ int c = tmp_seq[j][i];
+ if (c >= 4) { // actually c should be never larger than 4 if everything is correct
+ c = 4;
+ ++n_n;
+ } else if (drand48() < ERR_RATE) {
+ // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
+ c = (c + 1) & 3; // recurrent sequencing errors
+ ++n_err[j];
+ }
+ tmp_seq[j][i] = c;
+ }
+ if ((double)n_n / s[j] > MAX_N_RATIO) break;
+ }
+ if (j < 2) { // too many ambiguous bases on one of the reads
+ --ii;
+ continue;
+ }
+
+ // print
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < s[j]; ++i) qstr[i] = Q;
+ qstr[i] = 0;
+ fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, (unsigned)(ext_coor[0]+1), (unsigned)(ext_coor[1]+1),
+ n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+ (unsigned long long)ii, j==0? is_flip+1 : 2-is_flip);
+ for (i = 0; i < s[j]; ++i)
+ fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
+ fprintf(fpo[j], "\n+\n%s\n", qstr);
+ }
+ }
+ free(rseq[0].s); free(rseq[1].s);
+ }
+ kseq_destroy(ks);
+ gzclose(fp_fa);
+ free(qstr);
+ free(tmp_seq[0]); free(tmp_seq[1]);
+}
+// Print the command-line help to stderr; always returns 1 so that main() can return it directly.
+static int simu_usage()
+{
+ FILE *out = stderr;
+ fputs("\n" "Program: wgsim (short read simulator)\n", out);
+ fprintf(out, "Version: %s\n", PACKAGE_VERSION);
+ fputs("Contact: Heng Li <lh3 at sanger.ac.uk>\n\n", out);
+ fputs("Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n", out);
+ fprintf(out, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
+ fputs(" -d INT outer distance between the two ends [500]\n", out);
+ fputs(" -s INT standard deviation [50]\n", out);
+ fputs(" -N INT number of read pairs [1000000]\n", out);
+ fputs(" -1 INT length of the first read [70]\n", out);
+ fputs(" -2 INT length of the second read [70]\n", out);
+ fprintf(out, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
+ fprintf(out, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
+ fprintf(out, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
+ fputs(" -S INT seed for random generator [-1]\n", out);
+ fputs(" -h haplotype mode\n" "\n", out);
+ return 1;
+}
+// Entry point: parse the options, open the two FASTQ outputs, seed drand48() and run the simulation.
+int main(int argc, char *argv[])
+{
+ int64_t N;
+ int dist, std_dev, c, size_l, size_r, is_hap = 0;
+ FILE *fpout1, *fpout2;
+ int seed = -1;
+
+ N = 1000000; dist = 500; std_dev = 50;
+ size_l = size_r = 70;
+ while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) {
+ switch (c) {
+ case 'd': dist = atoi(optarg); break;
+ case 's': std_dev = atoi(optarg); break;
+ case 'N': N = atoll(optarg); break; // N is 64-bit; atoi() cannot represent pair counts beyond INT_MAX
+ case '1': size_l = atoi(optarg); break;
+ case '2': size_r = atoi(optarg); break;
+ case 'e': ERR_RATE = atof(optarg); break;
+ case 'r': MUT_RATE = atof(optarg); break;
+ case 'R': INDEL_FRAC = atof(optarg); break;
+ case 'X': INDEL_EXTEND = atof(optarg); break;
+ case 'S': seed = atoi(optarg); break;
+ case 'h': is_hap = 1; break;
+ }
+ }
+ if (argc - optind < 3) return simu_usage(); // need <in.ref.fa> <out.read1.fq> <out.read2.fq>
+ fpout1 = fopen(argv[optind+1], "w");
+ fpout2 = fopen(argv[optind+2], "w");
+ if (!fpout1 || !fpout2) {
+ fprintf(stderr, "[wgsim] file open error\n");
+ return 1;
+ }
+ srand48(seed > 0? seed : time(0)); // a non-positive seed (the default -1) means: seed from the clock
+ wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
+
+ fclose(fpout1); fclose(fpout2);
+ return 0;
+}
diff --git a/samtools-0.1.19/misc/wgsim_eval.pl b/samtools-0.1.19/misc/wgsim_eval.pl
new file mode 100755
index 0000000..f919a06
--- /dev/null
+++ b/samtools-0.1.19/misc/wgsim_eval.pl
@@ -0,0 +1,91 @@
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.5
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&wgsim_eval; # run the evaluation, then stop
+exit;
+
+sub wgsim_eval { # compare each SAM alignment with the true coordinates embedded in the wgsim read name and tabulate errors by mapping quality
+ my %opts = (g=>5);
+ getopts('pcag:', \%opts);
+ die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
+ my (@c0, @c1, %fnfp); # @c0/@c1: alignments / wrong alignments per mapQ bin of 10; %fnfp: the same two counts per exact mapQ
+ my ($max_q, $flag) = (0, 0);
+ my $gap = $opts{g}; # tolerated distance between true and reported coordinate
+ $flag |= 1 if (defined $opts{p});
+ $flag |= 2 if (defined $opts{c});
+ while (<>) {
+ next if (/^\@/);
+ my @t = split("\t");
+ next if (@t < 11);
+ my $line = $_;
+ my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]);
+ $max_q = $q if ($q > $max_q);
+ # right coordinate
+ $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; # add the length of every reference-consuming CIGAR operation
+ --$rght;
+ # correct for soft clipping
+ my ($left0, $rght0) = ($left, $rght);
+ $left -= $1 if (/^(\d+)[SH]/);
+ $rght += $1 if (/(\d+)[SH]$/);
+ $left0 -= $1 if (/(\d+)[SH]$/);
+ $rght0 += $1 if (/^(\d+)[SH]/);
+ # skip unmapped reads
+ next if (($t[1]&0x4) || $chr eq '*');
+ # parse read name and check
+ if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) { # $1: sequence name, $2 and $3: the two coordinates written by wgsim
+ if ($1 ne $chr) { # different chr
+ $is_correct = 0;
+ } else {
+ if ($flag & 2) {
+ if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward
+ $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
+ } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse
+ $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap);
+ } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward
+ $is_correct = 0 if (abs($3 - $left) > $gap && abs($3 - $left0) > $gap);
+ } else { # R3, reverse
+ $is_correct = 0 if (abs($2 - $rght) > $gap && abs($2 - $rght0) > $gap); # both tests use $2, like every other branch uses a single coordinate
+ }
+ } else {
+ if ($t[1] & 0x10) { # reverse
+ $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads
+ } else {
+ $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
+ }
+ }
+ }
+ } else {
+ warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n");
+ next;
+ }
+ ++$c0[$q];
+ ++$c1[$q] unless ($is_correct);
+ @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]});
+ ++$fnfp{$t[4]}[0];
+ ++$fnfp{$t[4]}[1] unless ($is_correct);
+ print STDERR $line if (($flag&1) && !$is_correct && $q > 0); # -p: echo wrong alignments with mapQ >= 10
+ }
+ # print
+ my ($cc0, $cc1) = (0, 0); # cumulative totals, from the highest quality downwards
+ if (!defined($opts{a})) {
+ for (my $i = $max_q; $i >= 0; --$i) {
+ $c0[$i] = 0 unless (defined $c0[$i]);
+ $c1[$i] = 0 unless (defined $c1[$i]);
+ $cc0 += $c0[$i]; $cc1 += $c1[$i];
+ printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0);
+ }
+ } else {
+ for (reverse(sort {$a<=>$b} (keys %fnfp))) {
+ next if ($_ == 0);
+ $cc0 += $fnfp{$_}[0];
+ $cc1 += $fnfp{$_}[1];
+ print join("\t", $_, $cc0, $cc1), "\n";
+ }
+ }
+}
diff --git a/samtools-0.1.19/misc/zoom2sam.pl b/samtools-0.1.19/misc/zoom2sam.pl
new file mode 100755
index 0000000..5306bfa
--- /dev/null
+++ b/samtools-0.1.19/misc/zoom2sam.pl
@@ -0,0 +1,97 @@
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.0
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&zoom2sam; # run the conversion, then stop
+exit;
+# Cross-link two records (array refs in the layout built by zoom2sam_aux) that carry the same read name:
+# each record gets its mate's reference, position and the signed insert size in fields 6..8,
+# flag 0x20 when the mate is on the reverse strand, or flag 0x8 when the mate has no reference ('*').
+# The insert size stays 0 unless both records are placed on the same reference.
+# Both records are modified in place; nothing is returned.
+sub mating {
+ my ($s1, $s2) = @_;
+ my $isize = 0;
+ if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # both ends on the same reference
+ my @end = map { ($_->[1] & 0x10)? $_->[3] + length($_->[9]) : $_->[3] } ($s1, $s2);
+ $isize = $end[1] - $end[0];
+ }
+ # update mate coordinate
+ for my $pair ([$s1, $s2, $isize], [$s2, $s1, -$isize]) {
+ my ($rec, $mate, $tlen) = @$pair;
+ if ($mate->[2] eq '*') { # mate unmapped
+ $rec->[1] |= 0x8;
+ next;
+ }
+ @$rec[6..8] = (($mate->[2] eq $rec->[2])? "=" : $mate->[2], $mate->[3], $tlen);
+ $rec->[1] |= 0x20 if ($mate->[1] & 0x10);
+ }
+}
+
+sub zoom2sam { # main routine: convert alignment lines to SAM records on stdout, pairing consecutive records that share a read name
+ my %opts = ();
+ getopts("p", \%opts);
+ die("Usage: zoom2sam.pl [-p] <readLen> <aln.zoom>
+Warnings: This script only supports the default Illumina outputs.\n") if (@ARGV < 2);
+ my $is_paired = defined($opts{p});
+ my $len = shift(@ARGV); # read length, used to build the CIGAR; the remaining argument is read through <>
+ # core loop
+ my @s1 = ();
+ my @s2 = ();
+ my ($s_last, $s_curr) = (\@s1, \@s2); # two record buffers whose roles are swapped instead of copying
+ while (<>) {
+ &zoom2sam_aux($_, $s_curr, $is_paired, $len);
+ if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { # same read name as the pending record: emit both as mates
+ &mating($s_last, $s_curr);
+ print join("\t", @$s_last), "\n";
+ print join("\t", @$s_curr), "\n";
+ @$s_last = (); @$s_curr = ();
+ } else { # different name: flush the pending record and keep the current one pending
+ print join("\t", @$s_last), "\n" if (@$s_last != 0);
+ my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
+ }
+ }
+ print join("\t", @$s_last), "\n" if (@$s_last != 0); # flush the final pending record
+}
+
+sub zoom2sam_aux { # parse one TAB-separated alignment line into the record @$s (11 fields plus an NM tag)
+ my ($line, $s, $is_paired, $len) = @_;
+ chomp($line);
+ my @t = split("\t", $line); # used below: $t[0] read name, $t[1] location ending in ':<pos>', $t[2] strand, $t[3] value written as NM
+ @$s = ();
+ # read name
+ $s->[0] = $t[0];
+ # initial flag (will be updated later)
+ $s->[1] = 0;
+ $s->[1] |= 1 | 1<<6 if ($s->[0] =~ /_F$/); # name ends in _F: paired (0x1) and first read (0x40)
+ $s->[1] |= 1 | 1<<7 if ($s->[0] =~ /_R$/); # name ends in _R: paired (0x1) and second read (0x80)
+ $s->[1] |= 2 if ($is_paired);
+ # read & quality
+ $s->[9] = "*"; $s->[10] = "*";
+ # cigar
+ $s->[5] = $len . "M";
+ # coor
+ my @s = split(/\s+/, $t[1]);
+ $s->[2] = $s[0]; # reference name: first whitespace-separated token of the location field
+ $t[1] =~ /:(\d+)$/;
+ $s->[3] = $1 + 1; # the trailing number plus one is used as the position
+ if ($s->[0] =~ /_[FR]$/) {
+ my $u = ($s->[0] =~ /_F$/)? 1 : 0;
+ my $w = ($t[2] eq '+')? 1 : 0;
+ $s->[1] |= 0x10 if ($u ^ $w); # reverse flag for _F on '-' or _R on '+'
+ $s->[0] =~ s/_[FR]$//; # strip the suffix so that both mates share one name
+ } else {
+ $s->[1] |= 0x10 if ($t[2] eq '-');
+ }
+ # mapQ
+ $s->[4] = 30; # fixed value
+ # mate coordinate
+ $s->[6] = '*'; $s->[7] = $s->[8] = 0;
+ # aux
+ push(@$s, "NM:i:$t[3]");
+}
diff --git a/samtools-0.1.19/padding.c b/samtools-0.1.19/padding.c
new file mode 100644
index 0000000..a8da562
--- /dev/null
+++ b/samtools-0.1.19/padding.c
@@ -0,0 +1,479 @@
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include "kstring.h"
+#include "sam_header.h"
+#include "sam.h"
+#include "bam.h"
+#include "faidx.h"
+
+bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/
+
+/* Overwrite the CIGAR stored in b->data (right after the query name) with the
+ * n operations in cigar. When the operation count changes, the data that
+ * follows the CIGAR is shifted, the buffer is enlarged if required, and
+ * data_len / n_cigar are updated. */
+static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
+{
+    int old_n = b->core.n_cigar;
+    if (n != old_n) {
+        int delta = (n - old_n) * 4;                    /* size change in bytes */
+        int tail_off = b->core.l_qname + old_n * 4;     /* first byte after the old CIGAR */
+        int wanted = b->data_len + delta;
+        if (wanted > b->m_data) {
+            b->m_data = wanted;
+            kroundup32(b->m_data);
+            b->data = (uint8_t*)realloc(b->data, b->m_data);
+        }
+        /* move everything behind the CIGAR to its new offset */
+        memmove(b->data + b->core.l_qname + n * 4, b->data + tail_off, b->data_len - tail_off);
+        b->data_len += delta;
+        b->core.n_cigar = n;
+    }
+    memcpy(b->data + b->core.l_qname, cigar, n * 4);
+}
+
+/* Append CIGAR element _v to the growable uint32_t array _c.
+ * _n is the number of elements in use and _m the allocated capacity; both
+ * must be modifiable lvalues and are updated in place. Capacity starts at 4
+ * and doubles whenever the array is full. Every argument is parenthesised so
+ * that expression arguments expand as intended. */
+#define write_cigar(_c, _n, _m, _v) do { \
+        if ((_n) == (_m)) { \
+            (_m) = (_m)? (_m)<<1 : 4; \
+            (_c) = (uint32_t*)realloc((_c), (_m) * 4); \
+        } \
+        (_c)[(_n)++] = (_v); \
+    } while (0)
+
+/* Expand the aligned part of read b into s, one base code (as returned by
+ * bam1_seqi) per byte. M/=/X operations copy bases from SEQ, D operations
+ * emit 0 (a pad), soft clips are skipped in SEQ and hard clips are ignored.
+ * On return s->l equals the padded alignment length. Any other operation is
+ * reported on stderr and trips an assertion. */
+static void unpad_seq(bam1_t *b, kstring_t *s)
+{
+    int k, j, i;
+    int length = 0;
+    uint32_t *cigar = bam1_cigar(b);
+    uint8_t *seq = bam1_seq(b);
+    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
+    // We need the padded length after alignment from the CIGAR (excluding
+    // soft clips S, but including pads from CIGAR D operations)
+    for (k = 0; k < b->core.n_cigar; ++k) {
+        int op = bam_cigar_op(cigar[k]);
+        int ol = bam_cigar_oplen(cigar[k]);
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL)
+            length += ol;
+    }
+    ks_resize(s, length);
+    /* j walks through SEQ, s->l through the output */
+    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
+        int op = bam_cigar_op(cigar[k]);
+        int ol = bam_cigar_oplen(cigar[k]);
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
+        } else if (op == BAM_CSOFT_CLIP) {
+            j += ol;
+        } else if (op == BAM_CHARD_CLIP) {
+            /* do nothing */
+        } else if (op == BAM_CDEL) {
+            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
+        } else {
+            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b));
+            /* The original assert(-1) is always true and therefore never fired. */
+            assert(0);
+        }
+    }
+    assert(length == s->l);
+}
+
+/* Load sequence ref_name from the FASTA index into seq as base codes.
+ * Gap characters ('-' or '*') are stored as 0 so the result can be compared
+ * with the output of unpad_seq(); every other character is translated through
+ * bam_nt16_table. Returns 0 on success, or -1 (after printing a message) when
+ * the FASTA length differs from ref_len or an unexpected character is found. */
+int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
+{
+    char base;
+    char *fai_ref = 0;
+    int fai_ref_len = 0, k;
+
+    fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
+    if (fai_ref_len != ref_len) {
+        fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
+        free(fai_ref);
+        return -1;
+    }
+    ks_resize(seq, ref_len);
+    seq->l = 0;
+    for (k = 0; k < ref_len; ++k) {
+        base = fai_ref[k];
+        if (base == '-' || base == '*') {
+            // Map gaps to null to match unpad_seq function
+            seq->s[seq->l++] = 0;
+        } else {
+            /* Index with an unsigned value: plain char may be signed, and a
+             * byte >= 0x80 would otherwise become a negative array index. */
+            int i = bam_nt16_table[(unsigned char)base];
+            if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
+                fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
+                free(fai_ref);
+                return -1;
+            }
+            seq->s[seq->l++] = i;
+        }
+    }
+    assert(ref_len == seq->l);
+    free(fai_ref);
+    return 0;
+}
+
+/* Count the non-gap bases of FASTA sequence ref_name.
+ * '-' and '*' are counted as gaps; every other character must translate
+ * through bam_nt16_table to a value other than 0 or 16. Returns the number of
+ * bases, or -1 (after printing a message) when the FASTA length differs from
+ * padded_len or an unexpected character is found. */
+int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len)
+{
+    char base;
+    char *fai_ref = 0;
+    int fai_ref_len = 0, k;
+    int bases=0, gaps=0;
+
+    fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
+    if (fai_ref_len != padded_len) {
+        fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len);
+        free(fai_ref);
+        return -1;
+    }
+    for (k = 0; k < padded_len; ++k) {
+        base = fai_ref[k];
+        if (base == '-' || base == '*') {
+            gaps += 1;
+        } else {
+            /* Index with an unsigned value: plain char may be signed, and a
+             * byte >= 0x80 would otherwise become a negative array index. */
+            int i = bam_nt16_table[(unsigned char)base];
+            if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
+                fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
+                free(fai_ref);
+                return -1;
+            }
+            bases += 1;
+        }
+    }
+    free(fai_ref);
+    assert (padded_len == bases + gaps);
+    return bases;
+}
+
+/* Rebuild posmap so that posmap[i] is the unpadded coordinate of padded
+ * reference position i, i.e. the number of non-zero (non-gap) entries in
+ * ref.s[0..i-1]. The array is resized to ref.m ints and returned.
+ * Declared static: a plain C99 `inline` definition provides no external
+ * definition, so a call the compiler does not inline could fail to link. */
+static inline int * update_posmap(int *posmap, kstring_t ref)
+{
+    int i, k;
+    posmap = realloc(posmap, ref.m * sizeof(int));
+    for (i = k = 0; i < ref.l; ++i) {
+        posmap[i] = k;
+        if (ref.s[i]) ++k;   /* only real bases advance the unpadded coordinate */
+    }
+    return posmap;
+}
+
+/* Convert every alignment read from `in` from padded to unpadded reference
+ * coordinates and write it to `out`.
+ *
+ * A record at position 0 whose name equals its reference name is treated as
+ * the embedded (padded) reference: it is remembered in r, checked against the
+ * FASTA when fai is given, and rewritten with a single M operation. Every
+ * other record with a CIGAR gets a new CIGAR derived base by base from the
+ * padded read and the padded reference, and its position and mate position
+ * are translated through posmap. Records without a CIGAR are written as is.
+ *
+ * Returns 0 on success, 1 when the input is truncated and -1 on any other
+ * error (a message is printed to stderr). All buffers allocated here are
+ * released on every exit path. */
+int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
+{
+    bam_header_t *h = 0;
+    bam1_t *b = 0;
+    kstring_t r, q;      /* r: padded reference, q: padded read / per-base operators */
+    int r_tid = -1;      /* target id of the reference currently held in r */
+    uint32_t *cigar2 = 0;
+    int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
+
+    b = bam_init1();
+    r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
+    int read_ret;
+    h = in->header;
+    while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in'
+        uint32_t *cigar = bam1_cigar(b);
+        n2 = 0;
+        if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
+            /* Embedded reference record */
+            r_tid = b->core.tid;
+            unpad_seq(b, &r);
+            if (h->target_len[r_tid] != r.l) {
+                fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (long)r.l);
+                ret = -1;
+                goto depad_cleanup;
+            }
+            if (fai) {
+                // Check the embedded reference matches the FASTA file
+                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
+                    fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
+                    ret = -1;
+                    goto depad_cleanup;
+                }
+                assert(r.l == q.l);
+                int i;
+                for (i = 0; i < r.l; ++i) {
+                    if (r.s[i] != q.s[i]) {
+                        // Show gaps as ASCII 45
+                        fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
+                            h->target_name[b->core.tid], i+1,
+                            r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45,
+                            q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45);
+                        ret = -1;
+                        goto depad_cleanup;
+                    }
+                }
+            }
+            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
+            replace_cigar(b, n2, cigar2);
+            posmap = update_posmap(posmap, r);
+        } else if (b->core.n_cigar > 0) {
+            int i, k, op;
+            if (b->core.tid < 0) {
+                fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b));
+                ret = -1;
+                goto depad_cleanup;
+            } else if (b->core.tid == r_tid) {
+                ; // good case, reference available
+            } else if (fai) {
+                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
+                    fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+                    ret = -1;
+                    goto depad_cleanup;
+                }
+                posmap = update_posmap(posmap, r);
+                r_tid = b->core.tid;
+            } else {
+                fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
+                ret = -1;
+                goto depad_cleanup;
+            }
+            unpad_seq(b, &q);
+            /* Copy leading clips unchanged */
+            if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
+                write_cigar(cigar2, n2, m2, cigar[0]);
+            } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
+                write_cigar(cigar2, n2, m2, cigar[0]);
+                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
+                    write_cigar(cigar2, n2, m2, cigar[1]);
+                }
+            }
+            /* Determine CIGAR operator for each base in the aligned read */
+            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
+                q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
+            /* Include any pads if starts with an insert */
+            if (q.s[0] == BAM_CINS) {
+                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
+                if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
+            }
+            /* Count consecutive CIGAR operators to turn into a CIGAR string */
+            for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
+                if (op != q.s[i]) {
+                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
+                    op = q.s[i]; k = 1;
+                } else ++k;
+            }
+            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
+            /* Copy trailing clips unchanged */
+            if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
+                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
+            } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
+                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
+                    write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
+                }
+                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
+            }
+            /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
+            int pre_op, post_op;
+            for (i = 2; i < n2; ++i)
+                if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
+                    pre_op = bam_cigar_op(cigar2[i-2]);
+                    post_op = bam_cigar_op(cigar2[i]);
+                    /* Note don't need to check for X/= as code above will use M only */
+                    if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
+                        /* This is a redundant P operator */
+                        cigar2[i-1] = 0; // i.e. 0M
+                        /* If had same operator either side, combine them in post_op */
+                        if (pre_op == post_op) {
+                            /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
+                            cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
+                            cigar2[i-2] = 0; // i.e. 0M
+                        }
+                    }
+                }
+            /* Remove the zero'd operators (0M) */
+            for (i = k = 0; i < n2; ++i)
+                if (cigar2[i]) cigar2[k++] = cigar2[i];
+            n2 = k;
+            replace_cigar(b, n2, cigar2);
+            b->core.pos = posmap[b->core.pos];
+            if (b->core.mtid < 0 || b->core.mpos < 0) {
+                /* Nice case, no mate to worry about*/
+                /* TODO - Warning if FLAG says mate should be mapped? */
+                /* Clean up funny input where mate position is given but mate reference is missing: */
+                b->core.mtid = -1;
+                b->core.mpos = -1;
+            } else if (b->core.mtid == b->core.tid) {
+                /* Nice case, same reference */
+                b->core.mpos = posmap[b->core.mpos];
+            } else {
+                /* Nasty case, Must load alternative posmap */
+                if (!fai) {
+                    fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
+                    ret = -1;
+                    goto depad_cleanup;
+                }
+                /* Temporarily load the other reference sequence */
+                if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
+                    fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
+                    ret = -1;
+                    goto depad_cleanup;
+                }
+                posmap = update_posmap(posmap, r);
+                b->core.mpos = posmap[b->core.mpos];
+                /* Restore the reference and posmap*/
+                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
+                    fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+                    ret = -1;
+                    goto depad_cleanup;
+                }
+                posmap = update_posmap(posmap, r);
+            }
+        }
+        samwrite(out, b);
+    }
+    if (read_ret < -1) {
+        fprintf(stderr, "[depad] truncated file.\n");
+        ret = 1;
+    }
+depad_cleanup:
+    /* Single exit point so that error paths release the same buffers;
+     * cigar2 was previously never freed. */
+    free(r.s); free(q.s); free(posmap); free(cigar2);
+    bam_destroy1(b);
+    return ret;
+}
+
+/* Return a copy of header `old` adjusted for depadded output.
+ * Each target length is replaced by the unpadded length computed from the
+ * FASTA via get_unpadded_len(); when that fails a message is printed and the
+ * padded length is kept. All @SQ lines are removed from the header text
+ * (the existing comments note that samtools regenerates minimal ones); every
+ * other '@' line is copied unchanged. The caller owns the returned header. */
+bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
+{
+    int i = 0, unpadded_len = 0;
+    bam_header_t *header = 0 ;
+
+    header = bam_header_dup(old);
+    for (i = 0; i < old->n_targets; ++i) {
+        unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
+        if (unpadded_len < 0) {
+            fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
+        } else {
+            header->target_len[i] = unpadded_len;
+        }
+    }
+    /* The caller only warns about a header without text, so there may be
+     * nothing to filter; do not dereference a null text pointer. */
+    if (old->text == 0 || header->text == 0) return header;
+    /* Duplicating the header allocated new buffer for header string */
+    /* After modifying the @SQ lines it will only get smaller, since */
+    /* the LN entries will be the same or shorter, and we'll remove */
+    /* any MD entries (MD5 checksums). */
+    assert(strlen(old->text) == strlen(header->text));
+    assert (0==strcmp(old->text, header->text));
+    const char *text = old->text;
+    char *out = header->text;   /* write cursor; reuses the allocated buffer */
+    while (text[0]=='@') {
+        const char *end = strchr(text, '\n');
+        /* A final line without a newline is taken up to the terminator */
+        size_t line_len = end? (size_t)(end - text) + 1 : strlen(text);
+        if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
+            /* TODO - edit the @SQ line here to remove MD and fix LN. */
+            /* For now just remove the @SQ line, and samtools will */
+            /* automatically generate a minimal replacement with LN. */
+            /* However, that discards any other tags like AS, SP, UR. */
+        } else {
+            /* Copy this line to the new header */
+            memcpy(out, text, line_len);
+            out += line_len;
+        }
+        text += line_len;
+    }
+    *out = '\0';
+    assert (text[0]=='\0');
+    size_t new_len = (size_t)(out - header->text);
+    /* Check we didn't overflow the buffer */
+    assert (new_len <= strlen(old->text));
+    if (new_len < header->l_text) {
+        /* Shrink the buffer to fit; keep the larger one if malloc fails */
+        char *shrunk = malloc(new_len + 1);
+        if (shrunk) {
+            memcpy(shrunk, header->text, new_len + 1);
+            free(header->text);
+            header->text = shrunk;
+        }
+        header->l_text = new_len;
+    }
+    return header;
+}
+
+static int usage(int is_long_help);
+
+/* Entry point of `samtools depad`: parses the options, opens the input and
+ * output, optionally loads the reference FASTA (-T) to correct the header
+ * lengths, and runs bam_pad2unpad(). Returns 0 on success and non-zero on
+ * failure or when only the usage text was printed. */
+int main_pad2unpad(int argc, char *argv[])
+{
+    samfile_t *in = 0, *out = 0;
+    bam_header_t *h = 0;
+    faidx_t *fai = 0;
+    int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0;
+    char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
+    int ret=0;
+
+    /* parse command-line options */
+    strcpy(in_mode, "r"); strcpy(out_mode, "w");
+    while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) {
+        switch (c) {
+        case 'S': is_bamin = 0; break;
+        case 's': assert(compress_level == -1); is_bamout = 0; break;
+        case 'o': fn_out = strdup(optarg); break;
+        case 'u': assert(is_bamout == 1); compress_level = 0; break;
+        case '1': assert(is_bamout == 1); compress_level = 1; break;
+        case 'T': fn_ref = strdup(optarg); break;
+        case '?': is_long_help = 1; break;
+        default: ret = usage(is_long_help); goto depad_end;
+        }
+    }
+    if (argc == optind) {
+        ret = usage(is_long_help);
+        goto depad_end;
+    }
+
+    if (is_bamin) strcat(in_mode, "b");
+    if (is_bamout) strcat(out_mode, "b");
+    strcat(out_mode, "h");
+    if (compress_level >= 0) {
+        char tmp[2];
+        tmp[0] = compress_level + '0'; tmp[1] = '\0';
+        strcat(out_mode, tmp);
+    }
+
+    // Load FASTA reference (also needed for SAM -> BAM if missing header)
+    if (fn_ref) {
+        fn_list = samfaipath(fn_ref);
+        fai = fai_load(fn_ref);
+        if (fai == 0) {
+            /* fix_header() and bam_pad2unpad() would dereference a null index */
+            fprintf(stderr, "[depad] failed to load reference \"%s\".\n", fn_ref);
+            ret = 1;
+            goto depad_end;
+        }
+    }
+    // open file handlers
+    if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+        fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
+        ret = 1;
+        goto depad_end;
+    }
+    if (in->header == 0) {
+        fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+        ret = 1;
+        goto depad_end;
+    }
+    if (in->header->text == 0 || in->header->l_text == 0) {
+        fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]);
+        assert (0 == in->header->l_text);
+        assert (0 == in->header->text);
+    }
+    if (fn_ref) {
+        h = fix_header(in->header, fai);
+    } else {
+        fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
+        h = in->header;
+    }
+    if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) {
+        fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
+        ret = 1;
+        goto depad_end;
+    }
+
+    // Do the depad
+    ret = bam_pad2unpad(in, out, fai);
+
+depad_end:
+    // close files, free and return
+    if (fai) fai_destroy(fai);
+    /* `in` is null when opening the input failed; h is then null as well */
+    if (in && h != in->header) bam_header_destroy(h);
+    if (in) samclose(in);
+    if (out) samclose(out);
+    free(fn_list); free(fn_out); free(fn_ref);
+    return ret;
+}
+
+/* Print the `samtools depad` help text to stderr; the explanatory notes are
+ * appended when is_long_help is non-zero. Always returns 1. */
+static int usage(int is_long_help)
+{
+    fputs("\n"
+          "Usage: samtools depad <in.bam>\n\n"
+          "Options: -s output is SAM (default is BAM)\n"
+          " -S input is SAM (default is BAM)\n"
+          " -u uncompressed BAM output (can't use with -s)\n"
+          " -1 fast compression BAM output (can't use with -s)\n"
+          " -T FILE reference sequence file [null]\n"
+          " -o FILE output file name [stdout]\n"
+          " -? longer help\n"
+          "\n", stderr);
+    if (is_long_help) {
+        fputs("Notes:\n"
+              "\n"
+              " 1. Requires embedded reference sequences (before the reads for that reference),\n"
+              " with the future aim to also support a FASTA padded reference sequence file.\n"
+              "\n"
+              " 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n"
+              "\n", stderr);
+    }
+    return 1;
+}
diff --git a/samtools-0.1.19/phase.c b/samtools-0.1.19/phase.c
new file mode 100644
index 0000000..ef4eff9
--- /dev/null
+++ b/samtools-0.1.19/phase.c
@@ -0,0 +1,687 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <math.h>
+#include <zlib.h>
+#include "bam.h"
+#include "errmod.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MAX_VARS 256
+#define FLIP_PENALTY 2
+#define FLIP_THRES 4
+#define MASK_THRES 3
+
+#define FLAG_FIX_CHIMERA 0x1
+#define FLAG_LIST_EXCL 0x4
+#define FLAG_DROP_AMBI 0x8
+
+typedef struct { // state shared by the functions of one `samtools phase` run
+ // configurations, initialized in the main function
+ int flag, k, min_baseQ, min_varLOD, max_depth; // FLAG_* bits; -k block length; -Q min base quality; -q min het phred-LOD; -D max read depth
+ // other global variables
+ int vpos_shift; // advanced by phase() after each block and added to the site indices it prints
+ bamFile fp; // input BAM, read by readaln()
+ char *pre; // -b prefix of the output BAMs; readaln() queues alignments only when it is set
+ bamFile out[3]; // opened in main_phase as <pre>.0.bam, <pre>.1.bam and <pre>.chimera.bam
+ // alignment queue
+ int n, m; // used and allocated number of entries in b[]
+ bam1_t **b; // duplicated alignments waiting to be written and freed by dump_aln()
+} phaseg_t;
+
+typedef struct { // one fragment; stored as the value type of the khash map named 64 (nseq_t)
+ int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! Per-site allele: 0 = ambiguous, 1 or 2 = one of the two alleles
+ int vpos, beg, end; // vpos: index of the first covered site within the current block; beg/end are not assigned in the code shown here (phase() prints beg+1 as YS)
+ uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1; // vlen: number of entries used in seq[]; single is set by clean_seqs()/count_all(), the other flags by fragphase()
+ uint32_t in:16, out:16; // in-phase and out-phase
+} frag_t, *frag_p;
+
+#define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+KHASH_MAP_INIT_INT64(64, frag_t)
+
+typedef khash_t(64) nseq_t;
+
+#include "ksort.h"
+KSORT_INIT(rseq, frag_p, rseq_lt)
+
+static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+/* 64-bit multiplicative string hash: starts from the first character and,
+ * for every following character, multiplies the hash by 31 and adds the
+ * character. The empty string hashes to 0. */
+static inline uint64_t X31_hash_string(const char *s)
+{
+    const char *p = s;
+    uint64_t acc = *p;
+    if (acc == 0) return acc;
+    while (*++p) acc = acc * 31 + *p;
+    return acc;
+}
+
+/* Add one observation of the local haplotype seq[0..l-1] to cnt[].
+ * seq[i] is 0 for an ambiguous site, otherwise allele code + 1. Nothing is
+ * counted when the last site is ambiguous or when at most one site is known.
+ * Every way of filling the ambiguous sites with 0/1 is enumerated and the
+ * bucket of the resulting l-bit number (first site = most significant bit)
+ * is incremented. */
+static void count1(int l, const uint8_t *seq, int *cnt)
+{
+    int pos, n_unknown = 0;
+    uint32_t fill, n_fill;
+
+    if (seq[l-1] == 0) return;              // last base ambiguous: skip
+    for (pos = 0; pos < l; ++pos)
+        n_unknown += (seq[pos] == 0);
+    if (l - n_unknown <= 1) return;         // only one SNP
+    n_fill = 1u<<n_unknown;
+    for (fill = 0; fill < n_fill; ++fill) {
+        uint32_t hap = 0;
+        int used = 0;                       // ambiguous sites consumed so far
+        for (pos = 0; pos < l; ++pos) {
+            int bit;
+            if (seq[pos] != 0) {
+                bit = seq[pos] - 1;
+            } else {
+                bit = fill>>used&1;
+                ++used;
+            }
+            hap = hap<<1 | bit;
+        }
+        ++cnt[hap];
+    }
+}
+
+/* Build one table of local-haplotype counts per site of the block.
+ * For every usable fragment (first site inside the block, not a singleton)
+ * and every site j >= 1 it covers, the window of l sites ending at j is
+ * extracted (sites before the fragment start are 0) and passed to count1()
+ * together with the table of site f->vpos + j. Fragments with vlen == 1 are
+ * flagged single. Returns vpos arrays of 1<<l ints. */
+static int **count_all(int l, int vpos, nseq_t *hash)
+{
+    khint_t it;
+    int site, **tables;
+    uint8_t *window = calloc(l, 1);
+
+    tables = calloc(vpos, sizeof(void*));
+    for (site = 0; site < vpos; ++site)
+        tables[site] = calloc(1<<l, sizeof(int));
+    for (it = 0; it < kh_end(hash); ++it) {
+        frag_t *f;
+        int last, col;
+        if (!kh_exist(hash, it)) continue;
+        f = &kh_val(hash, it);
+        if (f->vpos >= vpos || f->single) continue; // out of region; or singleton
+        if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right
+            f->single = 1;
+            continue;
+        }
+        for (last = 1; last < f->vlen; ++last) {
+            for (col = 0; col < l; ++col) {
+                int back = l - 1 - col; // distance of this column from the window's last site
+                window[col] = last < back? 0 : f->seq[last - back];
+            }
+            count1(l, window, tables[f->vpos + last]);
+        }
+    }
+    free(window);
+    return tables;
+}
+
+/* Phasing by dynamic programming.
+ * w[i] holds 1<<l weights for site i, indexed by local haplotype. A state is
+ * the lexicographically smaller of a haplotype and its complement (l-1 bits).
+ * The best-scoring chain of states is found and backtracked into a newly
+ * allocated array of vpos values (0 or 1), one per site. */
+static int8_t *dynaprog(int l, int vpos, int **w)
+{
+    const uint32_t n_state = 1u<<(l-1), mask = (1u<<l) - 1;
+    int *score[2], *best_row, best, i;
+    int8_t **back, *path;
+    uint32_t x, best_x;
+    int swapped;
+
+    score[0] = calloc(n_state, sizeof(int));
+    score[1] = calloc(n_state, sizeof(int));
+    back = calloc(vpos, sizeof(void*));
+    // fill the backtrack matrix; rows alternate between score[0] and score[1]
+    for (i = 0; i < vpos; ++i) {
+        const int *prev = score[i & 1];
+        int *curr = score[(i + 1) & 1];
+        const int *wi = w[i];
+        int8_t *bi = calloc(n_state, 1);
+        back[i] = bi;
+        for (x = 0; x < n_state; ++x) {
+            /* xc is the complement of x; x>>1 and xc>>1 are the two
+             * possible predecessor states of x */
+            uint32_t xc = ~x&mask;
+            int via_x = prev[x>>1] + wi[x] + wi[xc];
+            int via_xc = prev[xc>>1] + wi[x] + wi[xc];
+            if (via_x > via_xc) {
+                bi[x] = 0;
+                curr[x] = via_x;
+            } else {
+                bi[x] = 1;
+                curr[x] = via_xc;
+            }
+        }
+    }
+    // backtrack from the best final state
+    best_row = score[vpos & 1];
+    path = calloc(vpos, 1);
+    best = 0; best_x = 0;
+    for (x = 0; x < n_state; ++x) {
+        if (best_row[x] > best) {
+            best = best_row[x];
+            best_x = x;
+        }
+    }
+    swapped = 0;
+    x = best_x;
+    for (i = vpos - 1; i >= 0; --i) {
+        int8_t took_complement = back[i][x];
+        path[i] = swapped? (~x&1) : (x&1);
+        if (took_complement) {
+            swapped = !swapped;
+            x = (~x&mask)>>1;
+        } else {
+            x = x>>1;
+        }
+    }
+    // free
+    for (i = 0; i < vpos; ++i) free(back[i]);
+    free(score[0]); free(score[1]); free(back);
+    return path;
+}
+
+/* phase each fragment against the haplotype path.
+ * For every fragment in hash whose first site lies inside the block
+ * (f->vpos < vpos) this sets f->phase, f->in, f->out, f->phased, f->ambig and
+ * f->flip. When flip is non-zero, fragments with at least three sites on each
+ * side are searched for a break point and may have the alleles of their head
+ * or tail swapped in place. Returns a calloc'ed array of vpos 64-bit values,
+ * each packing four 16-bit counters (see "update pcnt[]"); phase() frees it. */
+static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
+{
+ khint_t k;
+ uint64_t *pcnt;
+ uint32_t *left, *rght, max; // left/rght: scratch arrays of max entries, grown on demand
+ left = rght = 0; max = 0;
+ pcnt = calloc(vpos, 8);
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ int i, c[2];
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos >= vpos) continue;
+ // get the phase: c[0] counts sites equal to path, c[1] sites that differ; ambiguous sites are skipped
+ c[0] = c[1] = 0;
+ for (i = 0; i < f->vlen; ++i) {
+ if (f->seq[i] == 0) continue;
+ ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
+ }
+ f->phase = c[0] > c[1]? 0 : 1;
+ f->in = c[f->phase]; f->out = c[1 - f->phase];
+ f->phased = f->in == f->out? 0 : 1;
+ f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0;
+ // fix chimera (only when requested and both haplotypes have at least 3 supporting sites)
+ f->flip = 0;
+ if (flip && c[0] >= 3 && c[1] >= 3) {
+ int sum[2], m, mi, md;
+ if (f->vlen > max) { // enlarge the array
+ max = f->vlen;
+ kroundup32(max);
+ left = realloc(left, max * 4);
+ rght = realloc(rght, max * 4);
+ }
+ for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
+ if (f->seq[i]) {
+ int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; // allele expressed relative to the assigned phase
+ ++sum[c == path[f->vpos + i]? 0 : 1];
+ }
+ left[i] = sum[1]<<16 | sum[0]; // low 16 bits: agreeing sites in [0,i]; high 16 bits: disagreeing
+ }
+ for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
+ if (f->seq[i]) {
+ int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ ++sum[c == path[f->vpos + i]? 0 : 1];
+ }
+ rght[i] = sum[1]<<16 | sum[0]; // same packing, counted over [i,vlen-1]
+ }
+ // find the best flip point: a break after site i, scored with FLIP_PENALTY per remaining mismatch
+ for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
+ int a[2];
+ a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
+ a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
+ if (a[0] > a[1]) {
+ if (a[0] > m) m = a[0], md = 0, mi = i;
+ } else {
+ if (a[1] > m) m = a[1], md = 1, mi = i;
+ }
+ }
+ if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
+ f->flip = 1;
+ if (md == 0) { // flip the tail
+ for (i = mi + 1; i < f->vlen; ++i)
+ if (f->seq[i] == 1) f->seq[i] = 2;
+ else if (f->seq[i] == 2) f->seq[i] = 1;
+ } else { // flip the head
+ for (i = 0; i <= mi; ++i)
+ if (f->seq[i] == 1) f->seq[i] = 2;
+ else if (f->seq[i] == 2) f->seq[i] = 1;
+ }
+ }
+ }
+ // update pcnt[]: bits 0-15 phase-0 agree, 16-31 phase-0 disagree, 32-47 phase-1 agree, 48-63 phase-1 disagree
+ if (!f->single) {
+ for (i = 0; i < f->vlen; ++i) {
+ int c;
+ if (f->seq[i] == 0) continue;
+ c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ if (c == path[f->vpos + i]) {
+ if (f->phase == 0) ++pcnt[f->vpos + i];
+ else pcnt[f->vpos + i] += 1ull<<32;
+ } else {
+ if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
+ else pcnt[f->vpos + i] += 1ull<<48;
+ }
+ }
+ }
+ }
+ }
+ free(left); free(rght);
+ return pcnt;
+}
+
+static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
+{ /* Scan the per-site counters in pcnt[0..vpos-1] with a running score that
+ * is clamped at zero and collect the stretches whose peak score reaches
+ * MASK_THRES. Each stretch is returned as (first site)<<32 | (site of the
+ * peak); the number of stretches is stored in *_n. The returned array is
+ * grown with realloc and is NULL when nothing was found. */
+ int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0;
+ uint64_t *list = 0;
+ for (i = 0; i < vpos; ++i) {
+ uint64_t x = pcnt[i];
+ int c[4], pre = score, s;
+ c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff; // unpack the four 16-bit counters
+ s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1); // negative only when c[1] and c[3] are both zero
+ if (c[3] > c[2]) s += c[3] - c[2];
+ if (c[1] > c[0]) s += c[1] - c[0];
+ score += s;
+ if (score < 0) score = 0;
+ if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
+ if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) { // a stretch ended (or input ended) and its peak is high enough
+ if (n == m) {
+ m = m? m<<1 : 4;
+ list = realloc(list, m * 8);
+ }
+ list[n++] = (uint64_t)beg<<32 | max_i;
+ i = max_i; // reset i to max_i, so the scan resumes right after the peak
+ score = 0;
+ } else if (score > max) max = score, max_i = i;
+ if (score == 0) max = 0;
+ }
+ *_n = n;
+ return list;
+}
+
+/* Trim leading and trailing ambiguous (0) sites from every fragment that
+ * starts inside the block. A fragment left without any site is deleted from
+ * the hash; otherwise its seq, vpos, vlen and single flag are updated.
+ * Returns 1 if the hash holds at least one fragment starting at or beyond
+ * vpos, else 0. */
+static int clean_seqs(int vpos, nseq_t *hash)
+{
+    khint_t it;
+    int has_later = 0;
+    for (it = 0; it < kh_end(hash); ++it) {
+        frag_t *f;
+        int first, past_last;
+        if (!kh_exist(hash, it)) continue;
+        f = &kh_val(hash, it);
+        if (f->vpos >= vpos) {
+            has_later = 1;
+            continue;
+        }
+        /* first known site */
+        first = 0;
+        while (first < f->vlen && f->seq[first] == 0) ++first;
+        /* one past the last known site */
+        past_last = f->vlen;
+        while (past_last > 0 && f->seq[past_last - 1] == 0) --past_last;
+        if (past_last - first <= 0) {
+            kh_del(64, hash, it);
+            continue;
+        }
+        if (first != 0) memmove(f->seq, f->seq + first, past_last - first);
+        f->vpos += first;
+        f->vlen = past_last - first;
+        f->single = f->vlen == 1? 1 : 0;
+    }
+    return has_later;
+}
+
+/* Write and release the queued alignments, in queue order, until one is
+ * found that ends beyond min_pos; the remaining ones stay queued.
+ * The output file is chosen per read: 0/1 for the two haplotypes (swapped as
+ * a whole with probability 1/2), 2 for flipped reads and for ambiguous reads
+ * when FLAG_DROP_AMBI is set; all other reads go to 0 or 1 at random.
+ * Reads that are phased and not flipped get the aux tag ZP:A:Y. */
+static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+{
+    int done, is_flip, drop_ambi;
+    drop_ambi = g->flag & FLAG_DROP_AMBI;
+    is_flip = (drand48() < 0.5);
+    for (done = 0; done < g->n; ++done) {
+        bam1_t *aln = g->b[done];
+        uint64_t key = X31_hash_string(bam1_qname(aln));
+        int end = bam_calend(&aln->core, bam1_cigar(aln));
+        int which;
+        khint_t it;
+        if (end > min_pos) break;
+        it = kh_get(64, hash, key);
+        if (it == kh_end(hash)) {
+            which = 3;
+        } else {
+            frag_t *f = &kh_val(hash, it);
+            if (f->ambig) {
+                which = drop_ambi? 2 : 3;
+            } else if (!f->phased) {
+                which = 3;
+            } else if (f->flip) {
+                which = 2;
+            } else { // phased and not flipped
+                char tag = 'Y';
+                which = f->phase;
+                bam_aux_append(aln, "ZP", 'A', 1, (uint8_t*)&tag);
+            }
+            if (which < 2 && is_flip) which = 1 - which; // increase the randomness
+        }
+        if (which == 3) which = (drand48() < 0.5);
+        bam_write1(g->out[which], aln);
+        bam_destroy1(aln);
+        g->b[done] = 0;
+    }
+    /* close the gap left by the written alignments */
+    memmove(g->b, g->b + done, (g->n - done) * sizeof(void*));
+    g->n -= done;
+}
+
+static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
+{ /* Phase one block of vpos sites and print the result to stdout.
+ * cns[i] packs the site: bits 32 and up hold the position (printed + 1),
+ * the low 16 bits and bits 16-31 each hold an allele in their two lowest
+ * bits. Output lines: PS (block range), FL (masked regions), M0/M1/M2 (one
+ * per site), EV (one per multi-site fragment) and a closing "//".
+ * Afterwards g->vpos_shift is advanced and dump_aln() is called.
+ * Returns 0 when vpos is 0, otherwise the number of sites processed. */
+ int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
+ khint_t k;
+ frag_t **seqs;
+ int8_t *path, *sitemask;
+ uint64_t *pcnt, *regmask;
+
+ if (vpos == 0) return 0;
+ i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
+ min_pos = i? cns[vpos]>>32 : 0x7fffffff; // alignments ending at or before min_pos are written by dump_aln()
+ if (vpos == 1) { // a single site: nothing to phase, each fragment takes its own allele
+ printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
+ printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
+ "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos) continue;
+ f->flip = 0;
+ if (f->seq[0] == 0) f->phased = 0;
+ else f->phased = 1, f->phase = f->seq[0] - 1;
+ }
+ }
+ dump_aln(g, min_pos, hash);
+ ++g->vpos_shift;
+ return 1;
+ }
+ { // phase
+ int **cnt;
+ uint64_t *mask;
+ printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
+ sitemask = calloc(vpos, 1);
+ cnt = count_all(g->k, vpos, hash);
+ path = dynaprog(g->k, vpos, cnt);
+ for (i = 0; i < vpos; ++i) free(cnt[i]);
+ free(cnt);
+ pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
+ mask = genmask(vpos, pcnt, &n_masked);
+ regmask = calloc(n_masked, 8);
+ for (i = 0; i < n_masked; ++i) { // translate site indices to positions and mark the masked sites
+ regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32;
+ for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
+ sitemask[j] = 1;
+ }
+ free(mask);
+ if (g->flag & FLAG_FIX_CHIMERA) { // recompute the per-site counts with chimera fixing enabled
+ free(pcnt);
+ pcnt = fragphase(vpos, path, hash, 1);
+ }
+ }
+ for (i = 0; i < n_masked; ++i)
+ printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
+ for (i = 0; i < vpos; ++i) {
+ uint64_t x = pcnt[i];
+ int8_t c[2];
+ c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); // 4 selects 'X' in "ACGTX"
+ c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
+ printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
+ i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
+ }
+ free(path); free(pcnt); free(regmask); free(sitemask);
+ seqs = calloc(n_seqs, sizeof(void*)); // n_seqs was read before clean_seqs(), so it is an upper bound
+ for (k = 0, i = 0; k < kh_end(hash); ++k)
+ if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
+ seqs[i++] = &kh_val(hash, k);
+ n_seqs = i;
+ ks_introsort_rseq(n_seqs, seqs); // order fragments by vpos (rseq_lt)
+ for (i = 0; i < n_seqs; ++i) {
+ frag_t *f = seqs[i];
+ printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
+ for (j = 0; j < f->vlen; ++j) {
+ uint32_t c = cns[f->vpos + j];
+ if (f->seq[j] == 0) putchar('N');
+ else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
+ }
+ printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
+ }
+ free(seqs);
+ printf("//\n");
+ fflush(stdout);
+ g->vpos_shift += vpos;
+ dump_aln(g, min_pos, hash);
+ return vpos;
+}
+
+/* Drop the fragments that start before site vpos and renumber the remaining
+ * ones so that site vpos becomes site 0. */
+static void update_vpos(int vpos, nseq_t *hash)
+{
+    khint_t it;
+    for (it = 0; it < kh_end(hash); ++it) {
+        frag_t *f;
+        if (!kh_exist(hash, it)) continue;
+        f = &kh_val(hash, it);
+        if (f->vpos >= vpos) {
+            f->vpos -= vpos;
+        } else {
+            kh_del(64, hash, it); // TODO: if frag_t::seq is allocated dynamically, free it
+        }
+    }
+}
+
+static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement; for now a placeholder that returns hash unchanged
+{
+ return hash;
+}
+
+/* Read callback handed to bam_plp_init(): reads the next alignment from
+ * g->fp into b and returns the result of bam_read1(). When output BAMs were
+ * requested (g->pre set) and the read is not unmapped, secondary, QC-failed
+ * or a duplicate, a copy of it is appended to the queue g->b. */
+static int readaln(void *data, bam1_t *b)
+{
+    phaseg_t *g = (phaseg_t*)data;
+    const int skip_mask = BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP;
+    int ret = bam_read1(g->fp, b);
+
+    if (ret < 0) return ret;
+    if (!g->pre || (b->core.flag & skip_mask)) return ret;
+    if (g->n == g->m) {
+        /* queue full: start with 16 slots, then double */
+        g->m = g->m? g->m<<1 : 16;
+        g->b = realloc(g->b, g->m * sizeof(void*));
+    }
+    g->b[g->n++] = bam_dup1(b);
+    return ret;
+}
+
+/* Load a list of sites from fn ("-" for stdin, may be gzip-compressed).
+ * Each line supplies a sequence name and a position in its first two
+ * fields; lines whose name is not found in header h are ignored. Every site
+ * is stored as tid<<32 | (position - 1). When the file cannot be opened a
+ * message is printed and an empty set is returned. The caller owns the set. */
+static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
+{
+    gzFile fp;
+    kstream_t *ks;
+    int ret, dret;
+    kstring_t str = { 0, 0, 0 };
+    khash_t(set64) *hash;
+
+    hash = kh_init(set64);
+    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+    if (fp == 0) {
+        /* gzopen/gzdopen return NULL on failure; do not read from it */
+        fprintf(stderr, "[loadpos] failed to open \"%s\" for reading.\n", fn);
+        return hash;
+    }
+    ks = ks_init(fp);
+    while (ks_getuntil(ks, 0, &str, &dret) >= 0) {
+        int tid = bam_get_tid(h, str.s);
+        if (tid >= 0 && dret != '\n') {
+            if (ks_getuntil(ks, 0, &str, &dret) >= 0) {
+                /* Keep the position in the low 32 bits only: without the
+                 * cast a value of -1 sign-extends and overwrites the tid. */
+                uint64_t x = (uint64_t)tid<<32 | (uint32_t)(atoi(str.s) - 1);
+                kh_put(set64, hash, x, &ret);
+            } else break;
+        }
+        /* skip the rest of the line */
+        if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+        if (dret < 0) break;
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str.s);
+    return hash;
+}
+
+/* Pick the genotype with the smallest value in q[] (indexed a1<<2|a2, only
+ * a1 <= a2 is examined). Returns 0 for a homozygous call. For a heterozygous
+ * call returns bit 18 set, a1 in bits 16-17, a2 in bits 0-1, and the rounded
+ * difference between the second-smallest and smallest value shifted left by 2. */
+static int gl2cns(float q[16])
+{
+    float best = 1e30, second = 1e30;
+    int best_ij = -1;
+    int a1, a2;
+    for (a1 = 0; a1 < 4; ++a1) {
+        for (a2 = a1; a2 < 4; ++a2) {
+            int ij = a1<<2|a2;
+            if (q[ij] < best) {
+                best_ij = ij;
+                second = best;
+                best = q[ij];
+            } else if (q[ij] < second) {
+                second = q[ij];
+            }
+        }
+    }
+    a1 = best_ij>>2&3;
+    a2 = best_ij&3;
+    if (a1 == a2) return 0;
+    return 1<<18 | a1<<16 | a2 | (int)(second - best + .499) << 2;
+}
+
+/*
+ * Entry point of "samtools phase".
+ *
+ * Parses the options shown in the usage text, opens the input BAM (or
+ * stdin for "-"), optionally loads a site list (-l) and opens the three
+ * output BAMs <prefix>.0.bam, <prefix>.1.bam and <prefix>.chimera.bam (-b).
+ * It then walks the pileup: at each site with usable bases it computes
+ * genotype likelihoods with errmod_cal(), derives a consensus with
+ * gl2cns(), and for accepted sites records the allele seen by every read in
+ * the `seqs` hash, keyed by a hash of the read name. phase() is called when
+ * a site shares no read with the stored set, on a change of chromosome and
+ * once after the last site.
+ *
+ * Returns 0 on success, 1 when no input is given or a file cannot be
+ * opened / its header cannot be read.
+ */
+int main_phase(int argc, char *argv[])
+{
+	extern void bam_init_header_hash(bam_header_t *header);
+	int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
+	const bam_pileup1_t *plp;
+	bam_plp_t iter;
+	bam_header_t *h;
+	nseq_t *seqs;
+	uint64_t *cns = 0;
+	phaseg_t g;
+	char *fn_list = 0;
+	khash_t(set64) *set = 0;
+	errmod_t *em;
+	uint16_t *bases;
+
+	memset(&g, 0, sizeof(phaseg_t));
+	g.flag = FLAG_FIX_CHIMERA;
+	g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
+	while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) {
+		switch (c) {
+			case 'D': g.max_depth = atoi(optarg); break;
+			case 'q': g.min_varLOD = atoi(optarg); break;
+			case 'Q': g.min_baseQ = atoi(optarg); break;
+			case 'k': g.k = atoi(optarg); break;
+			case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
+			case 'e': g.flag |= FLAG_LIST_EXCL; break;
+			case 'A': g.flag |= FLAG_DROP_AMBI; break;
+			case 'b': g.pre = strdup(optarg); break;
+			case 'l': fn_list = strdup(optarg); break;
+		}
+	}
+	if (argc == optind) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage: samtools phase [options] <in.bam>\n\n");
+		fprintf(stderr, "Options: -k INT block length [%d]\n", g.k);
+		fprintf(stderr, " -b STR prefix of BAMs to output [null]\n");
+		fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD);
+		fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ);
+		fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth);
+//		fprintf(stderr, " -l FILE list of sites to phase [null]\n");
+		fprintf(stderr, " -F do not attempt to fix chimeras\n");
+		fprintf(stderr, " -A drop reads with ambiguous phase\n");
+//		fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
+		fprintf(stderr, "\n");
+		return 1;
+	}
+	g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+	if (g.fp == 0) { // do not hand a null handle to bam_header_read()
+		fprintf(stderr, "[main_phase] fail to open '%s' for reading\n", argv[optind]);
+		free(fn_list); free(g.pre);
+		return 1;
+	}
+	h = bam_header_read(g.fp);
+	if (h == 0) {
+		fprintf(stderr, "[main_phase] fail to read the header of '%s'\n", argv[optind]);
+		bam_close(g.fp);
+		free(fn_list); free(g.pre);
+		return 1;
+	}
+	if (fn_list) { // read the list of sites to phase
+		bam_init_header_hash(h);
+		set = loadpos(fn_list, h);
+		free(fn_list);
+	} else g.flag &= ~FLAG_LIST_EXCL;
+	if (g.pre) { // open BAMs to write
+		char *s = malloc(strlen(g.pre) + 20);
+		strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
+		strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
+		strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
+		free(s);
+		for (c = 0; c <= 2; ++c)
+			if (g.out[c] == 0) break;
+		if (c <= 2) { // at least one output could not be created
+			fprintf(stderr, "[main_phase] fail to open the output BAMs with prefix '%s'\n", g.pre);
+			for (c = 0; c <= 2; ++c)
+				if (g.out[c]) bam_close(g.out[c]);
+			bam_header_destroy(h);
+			bam_close(g.fp);
+			if (set) kh_destroy(set64, set);
+			free(g.pre);
+			return 1;
+		}
+		for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
+	}
+
+	iter = bam_plp_init(readaln, &g);
+	g.vpos_shift = 0;
+	seqs = kh_init(64);
+	em = errmod_init(1. - 0.83);
+	bases = calloc(g.max_depth, 2);
+	printf("CC\n");
+	printf("CC\tDescriptions:\nCC\n");
+	printf("CC\t CC comments\n");
+	printf("CC\t PS start of a phase set\n");
+	printf("CC\t FL filtered region\n");
+	printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n");
+	printf("CC\t EV supporting reads; SAM format\n");
+	printf("CC\t // end of a phase set\nCC\n");
+	printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
+	printf("CC\t PS chr phaseSetStart phaseSetEnd\n");
+	printf("CC\t FL chr filterStart filterEnd\n");
+	printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n");
+	printf("CC\nCC\n");
+	fflush(stdout);
+	while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
+		int i, k, c, tmp, dophase = 1, in_set = 0;
+		float q[16];
+		if (tid < 0) break;
+		if (tid != lasttid) { // change of chromosome
+			g.vpos_shift = 0;
+			if (lasttid >= 0) {
+				seqs = shrink_hash(seqs);
+				phase(&g, h->target_name[lasttid], vpos, cns, seqs);
+				update_vpos(0x7fffffff, seqs);
+			}
+			lasttid = tid;
+			vpos = 0;
+		}
+		if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
+		if (n > g.max_depth) continue; // do not proceed if the depth is too high
+		// fill the bases array and check if there is a variant
+		for (i = k = 0; i < n; ++i) {
+			const bam_pileup1_t *p = plp + i;
+			uint8_t *seq;
+			int q, baseQ, b;
+			if (p->is_del || p->is_refskip) continue;
+			baseQ = bam1_qual(p->b)[p->qpos];
+			if (baseQ < g.min_baseQ) continue;
+			seq = bam1_seq(p->b);
+			b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+			if (b > 3) continue;
+			// effective quality: min(base quality, mapping quality) clamped to [4,63]
+			q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
+			if (q < 4) q = 4;
+			if (q > 63) q = 63;
+			bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
+		}
+		if (k == 0) continue;
+		errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
+		c = gl2cns(q); // get the consensus
+		// tell if to proceed
+		if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
+		if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
+		// add the variant
+		if (vpos == max_vpos) {
+			max_vpos = max_vpos? max_vpos<<1 : 128;
+			cns = realloc(cns, max_vpos * 8);
+		}
+		cns[vpos] = (uint64_t)pos<<32 | c;
+		for (i = 0; i < n; ++i) {
+			const bam_pileup1_t *p = plp + i;
+			uint64_t key;
+			khint_t k;
+			uint8_t *seq = bam1_seq(p->b);
+			frag_t *f;
+			if (p->is_del || p->is_refskip) continue;
+			if (p->b->core.qual == 0) continue;
+			// get the base code: 1 or 2 for the two consensus alleles, 0 otherwise
+			c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
+			if (c == (cns[vpos]&3)) c = 1;
+			else if (c == (cns[vpos]>>16&3)) c = 2;
+			else c = 0;
+			// write to seqs
+			key = X31_hash_string(bam1_qname(p->b));
+			k = kh_put(64, seqs, key, &tmp);
+			f = &kh_val(seqs, k);
+			if (tmp == 0) { // present in the hash table
+				if (vpos - f->vpos + 1 < MAX_VARS) {
+					f->vlen = vpos - f->vpos + 1;
+					f->seq[f->vlen-1] = c;
+					f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+				}
+				dophase = 0;
+			} else { // absent
+				memset(f->seq, 0, MAX_VARS);
+				f->beg = p->b->core.pos;
+				f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+				f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0;
+			}
+		}
+		if (dophase) { // no read at this site was already stored: close the current set
+			seqs = shrink_hash(seqs);
+			phase(&g, h->target_name[tid], vpos, cns, seqs);
+			update_vpos(vpos, seqs);
+			cns[0] = cns[vpos];
+			vpos = 0;
+		}
+		++vpos;
+	}
+	if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs);
+	bam_header_destroy(h);
+	bam_plp_destroy(iter);
+	bam_close(g.fp);
+	kh_destroy(64, seqs);
+	kh_destroy(set64, set);
+	free(cns);
+	errmod_destroy(em);
+	free(bases);
+	if (g.pre) {
+		for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
+		free(g.pre); free(g.b);
+	}
+	return 0;
+}
diff --git a/samtools-0.1.19/razf.c b/samtools-0.1.19/razf.c
new file mode 100644
index 0000000..e7499f9
--- /dev/null
+++ b/samtools-0.1.19/razf.c
@@ -0,0 +1,853 @@
+/*
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue at gmail.com>, Heng Li <lh3 at sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NO_RAZF
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "razf.h"
+
+
+/*
+ * Stand-in header structure defined only when ZLIB_VERNUM is below 0x1221.
+ * NOTE(review): the field list looks like a copy of zlib's own gz_header so
+ * that code referring to gz_header still compiles - confirm against razf.h.
+ * The #warning below states that RAZF writing is disabled in this case.
+ */
+#if ZLIB_VERNUM < 0x1221
+struct _gz_header_s {
+ int text;
+ uLong time;
+ int xflags;
+ int os;
+ Bytef *extra;
+ uInt extra_len;
+ uInt extra_max;
+ Bytef *name;
+ uInt name_max;
+ Bytef *comment;
+ uInt comm_max;
+ int hcrc;
+ int done;
+};
+#warning "zlib < 1.2.2.1; RAZF writing is disabled."
+#endif
+
+/* value passed as the memLevel argument of deflateInit2() in razf_open_w() */
+#define DEF_MEM_LEVEL 8
+
+/* Reverse the byte order of a 32-bit value. */
+static inline uint32_t byte_swap_4(uint32_t v){
+	const uint32_t b0 = v & 0xFFU;
+	const uint32_t b1 = (v >> 8) & 0xFFU;
+	const uint32_t b2 = (v >> 16) & 0xFFU;
+	const uint32_t b3 = (v >> 24) & 0xFFU;
+	return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+}
+
+/* Reverse the byte order of a 64-bit value. */
+static inline uint64_t byte_swap_8(uint64_t v){
+	uint64_t swapped = 0;
+	int k;
+	// peel bytes off the low end of v and push them onto the low end of the result
+	for(k = 0; k < 8; k++){
+		swapped = (swapped << 8) | (v & 0xFFU);
+		v >>= 8;
+	}
+	return swapped;
+}
+
+/* Return 1 when the first byte in memory of the int 0x01 is not 0x01, else 0. */
+static inline int is_big_endian(){
+	const int probe = 0x01;
+	const char *first = (const char*)&probe;
+	return *first != 0x01;
+}
+
+#ifndef _RZ_READONLY
+/*
+ * Append one entry to the block index of a file being written.
+ *
+ * `out` is stored split in two parts: the first entry of every group of
+ * RZ_BIN_SIZE entries records `out` in bin_offsets, and each entry records
+ * its distance from that value in cell_offsets. `in` is not used.
+ * Both arrays grow (cap * 1.5 + 2) when full; realloc results are not
+ * checked.
+ */
+static void add_zindex(RAZF *rz, int64_t in, int64_t out){
+	ZBlockIndex *idx = rz->index;
+	if(idx->size == idx->cap){
+		idx->cap = idx->cap * 1.5 + 2;
+		idx->cell_offsets = realloc(idx->cell_offsets, sizeof(int) * idx->cap);
+		idx->bin_offsets = realloc(idx->bin_offsets, sizeof(int64_t) * (idx->cap/RZ_BIN_SIZE + 1));
+	}
+	if(idx->size % RZ_BIN_SIZE == 0){
+		// first cell of a bin: remember the absolute offset
+		idx->bin_offsets[idx->size / RZ_BIN_SIZE] = out;
+	}
+	idx->cell_offsets[idx->size] = out - idx->bin_offsets[idx->size / RZ_BIN_SIZE];
+	idx->size ++;
+}
+
+/*
+ * Write the block index to fd: the entry count, then the bin offsets, then
+ * the cell offsets. On a host where is_big_endian() is false every value is
+ * byte-swapped first; the swap of the two arrays is done in place, so the
+ * in-memory index is left swapped afterwards. write() results are ignored.
+ */
+static void save_zindex(RAZF *rz, int fd){
+	ZBlockIndex *idx = rz->index;
+	const int big_endian = is_big_endian();
+	int32_t k, n_bins, swapped_size;
+	if(big_endian){
+		write(fd, &idx->size, sizeof(int));
+	} else {
+		swapped_size = byte_swap_4((uint32_t)idx->size);
+		write(fd, &swapped_size, sizeof(uint32_t));
+	}
+	n_bins = idx->size / RZ_BIN_SIZE + 1;
+	if(!big_endian){
+		for(k = 0; k < n_bins; k++)
+			idx->bin_offsets[k] = byte_swap_8((uint64_t)idx->bin_offsets[k]);
+		for(k = 0; k < idx->size; k++)
+			idx->cell_offsets[k] = byte_swap_4((uint32_t)idx->cell_offsets[k]);
+	}
+	write(fd, idx->bin_offsets, sizeof(int64_t) * n_bins);
+	write(fd, idx->cell_offsets, sizeof(int32_t) * idx->size);
+}
+#endif
+
+/*
+ * Read the block index from the current position of the input, the
+ * counterpart of save_zindex(). Does nothing unless rz->load_index is set.
+ * The index structure is allocated if rz->index is NULL. The entry count is
+ * taken from the file as is and used to size both arrays; allocation and
+ * read results are not checked. Values are byte-swapped when
+ * is_big_endian() is false.
+ */
+#ifdef _USE_KNETFILE
+static void load_zindex(RAZF *rz, knetFile *fp){
+#else
+static void load_zindex(RAZF *rz, int fd){
+#endif
+	ZBlockIndex *idx;
+	int32_t k, n_bins;
+	int big_endian;
+	if(!rz->load_index) return;
+	if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
+	idx = rz->index;
+	big_endian = is_big_endian();
+#ifdef _USE_KNETFILE
+	knet_read(fp, &idx->size, sizeof(int));
+#else
+	read(fd, &idx->size, sizeof(int));
+#endif
+	if(!big_endian) idx->size = byte_swap_4((uint32_t)idx->size);
+	idx->cap = idx->size;
+	n_bins = idx->size / RZ_BIN_SIZE + 1;
+	idx->bin_offsets = malloc(sizeof(int64_t) * n_bins);
+#ifdef _USE_KNETFILE
+	knet_read(fp, idx->bin_offsets, sizeof(int64_t) * n_bins);
+#else
+	read(fd, idx->bin_offsets, sizeof(int64_t) * n_bins);
+#endif
+	idx->cell_offsets = malloc(sizeof(int) * idx->size);
+#ifdef _USE_KNETFILE
+	knet_read(fp, idx->cell_offsets, sizeof(int) * idx->size);
+#else
+	read(fd, idx->cell_offsets, sizeof(int) * idx->size);
+#endif
+	if(big_endian) return;
+	for(k = 0; k < n_bins; k++)
+		idx->bin_offsets[k] = byte_swap_8((uint64_t)idx->bin_offsets[k]);
+	for(k = 0; k < idx->size; k++)
+		idx->cell_offsets[k] = byte_swap_4((uint32_t)idx->cell_offsets[k]);
+}
+
+#ifdef _RZ_READONLY
+/* Read-only build: report that writing is unavailable and return NULL. */
+static RAZF* razf_open_w(int fd)
+{
+	(void)fd;
+	fputs("[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n", stderr);
+	return NULL;
+}
+#else
+/*
+ * Create a RAZF handle that writes to the descriptor fd.
+ *
+ * Allocates the handle, its zlib stream, both RZ_BUFFER_SIZE buffers and an
+ * empty block index, starts a gzip-wrapped deflate stream and attaches a
+ * gzip header whose 7-byte extra field is "RAZF", one obsolete byte and
+ * RZ_BLOCK_SIZE as a big-endian 16-bit value. Allocation and zlib results
+ * are not checked.
+ */
+static RAZF* razf_open_w(int fd){
+	RAZF *rz;
+	z_stream *zs;
+	gz_header *hd;
+#ifdef _WIN32
+	setmode(fd, O_BINARY);
+#endif
+	rz = calloc(1, sizeof(RAZF));
+	rz->mode = 'w';
+#ifdef _USE_KNETFILE
+	rz->x.fpw = fd;
+#else
+	rz->filedes = fd;
+#endif
+	rz->stream = zs = calloc(sizeof(z_stream), 1);
+	rz->inbuf = malloc(RZ_BUFFER_SIZE);
+	rz->outbuf = malloc(RZ_BUFFER_SIZE);
+	rz->index = calloc(sizeof(ZBlockIndex), 1);
+	deflateInit2(zs, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	zs->avail_out = RZ_BUFFER_SIZE;
+	zs->next_out = rz->outbuf;
+	rz->header = hd = calloc(sizeof(gz_header), 1);
+	hd->os = 0x03; //Unix
+	hd->text = 0;
+	hd->time = 0;
+	hd->extra = malloc(7);
+	memcpy(hd->extra, "RAZF", 4); // four magic bytes, no terminator
+	hd->extra[4] = 1; // obsolete field
+	// block size = RZ_BLOCK_SIZE, Big-Endian
+	hd->extra[5] = RZ_BLOCK_SIZE >> 8;
+	hd->extra[6] = RZ_BLOCK_SIZE & 0xFF;
+	hd->extra_len = 7;
+	hd->name = hd->comment = 0;
+	hd->hcrc = 0;
+	deflateSetHeader(zs, hd);
+	rz->block_pos = rz->block_off = 0;
+	return rz;
+}
+
+/*
+ * Feed `size` bytes at `data` to deflate (Z_NO_FLUSH). Whenever the output
+ * buffer fills up it is written to the file and reused. rz->out grows by
+ * the compressed bytes produced; rz->in and rz->block_off grow by the input
+ * consumed. deflate() and write() results are ignored.
+ */
+static void _razf_write(RAZF* rz, const void *data, int size){
+	z_stream *zs = rz->stream;
+	int before;
+	zs->avail_in = size;
+	zs->next_in = (void*)data;
+	for(;;){
+		before = zs->avail_out;
+		deflate(zs, Z_NO_FLUSH);
+		rz->out += before - zs->avail_out;
+		if(zs->avail_out != 0) break; // room left: all input was taken
+#ifdef _USE_KNETFILE
+		write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#else
+		write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#endif
+		zs->avail_out = RZ_BUFFER_SIZE;
+		zs->next_out = rz->outbuf;
+		if(zs->avail_in == 0) break;
+	}
+	rz->in += size - zs->avail_in;
+	rz->block_off += size - zs->avail_in;
+}
+
+/*
+ * Close the current block: compress whatever sits in the input buffer,
+ * write the pending output, then run deflate with Z_FULL_FLUSH until a call
+ * leaves room in the output buffer. Afterwards block_pos is set to the
+ * compressed offset rz->out and block_off to 0.
+ */
+static void razf_flush(RAZF *rz){
+	z_stream *zs = rz->stream;
+	uint32_t before;
+	if(rz->buf_len){
+		_razf_write(rz, rz->inbuf, rz->buf_len);
+		rz->buf_len = 0;
+		rz->buf_off = 0;
+	}
+	if(zs->avail_out){
+#ifdef _USE_KNETFILE
+		write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#else
+		write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#endif
+		zs->avail_out = RZ_BUFFER_SIZE;
+		zs->next_out = rz->outbuf;
+	}
+	for(;;){
+		before = zs->avail_out;
+		deflate(zs, Z_FULL_FLUSH);
+		rz->out += before - zs->avail_out;
+		if(zs->avail_out != 0) break; // the remainder stays buffered for a later write
+#ifdef _USE_KNETFILE
+		write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#else
+		write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#endif
+		zs->avail_out = RZ_BUFFER_SIZE;
+		zs->next_out = rz->outbuf;
+	}
+	rz->block_pos = rz->out;
+	rz->block_off = 0;
+}
+
+/*
+ * Finish the deflate stream: compress the buffered input, then call
+ * deflate with Z_FINISH and write the output buffer after every call that
+ * left something in it, stopping at the first call that produces nothing.
+ */
+static void razf_end_flush(RAZF *rz){
+	z_stream *zs = rz->stream;
+	uint32_t before;
+	if(rz->buf_len){
+		_razf_write(rz, rz->inbuf, rz->buf_len);
+		rz->buf_len = 0;
+		rz->buf_off = 0;
+	}
+	for(;;){
+		before = zs->avail_out;
+		deflate(zs, Z_FINISH);
+		rz->out += before - zs->avail_out;
+		if(zs->avail_out >= RZ_BUFFER_SIZE) break; // buffer empty: nothing more to write
+#ifdef _USE_KNETFILE
+		write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#else
+		write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+#endif
+		zs->avail_out = RZ_BUFFER_SIZE;
+		zs->next_out = rz->outbuf;
+	}
+}
+
+/*
+ * Append `size` bytes at `data` to the input buffer, handing the buffer to
+ * _razf_write() each time it reaches RZ_BUFFER_SIZE. Bytes that do not fill
+ * the buffer stay pending in rz->inbuf (rz->buf_len counts them).
+ *
+ * The source is walked through a char pointer because arithmetic on
+ * `void *` is a compiler extension, and the copies use memcpy.
+ */
+static void _razf_buffered_write(RAZF *rz, const void *data, int size){
+	const char *src = data;
+	int n;
+	while(1){
+		if(rz->buf_len == RZ_BUFFER_SIZE){
+			_razf_write(rz, rz->inbuf, rz->buf_len);
+			rz->buf_len = 0;
+		}
+		if(size + rz->buf_len < RZ_BUFFER_SIZE){
+			// everything left fits; a non-positive size copies nothing
+			if(size > 0) memcpy((char*)rz->inbuf + rz->buf_len, src, size);
+			rz->buf_len += size;
+			return;
+		} else {
+			n = RZ_BUFFER_SIZE - rz->buf_len;
+			if(n > 0) memcpy((char*)rz->inbuf + rz->buf_len, src, n);
+			size -= n;
+			src += n;
+			rz->buf_len += n;
+		}
+	}
+}
+
+/*
+ * Write `size` bytes at `data` to a RAZF file opened for writing.
+ *
+ * The input is cut at multiples of RZ_BLOCK_SIZE of uncompressed data: each
+ * time a boundary is reached the block is flushed with razf_flush() and an
+ * index entry is added. Always returns `size`.
+ *
+ * The source is advanced through a char pointer because arithmetic on
+ * `void *` is a compiler extension.
+ */
+int razf_write(RAZF* rz, const void *data, int size){
+	const char *src = data;
+	int ori_size, n;
+	int64_t next_block;
+	ori_size = size;
+	next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+	while(rz->in + rz->buf_len + size >= next_block){
+		n = next_block - rz->in - rz->buf_len; // bytes needed to complete the block
+		_razf_buffered_write(rz, src, n);
+		src += n;
+		size -= n;
+		razf_flush(rz);
+		add_zindex(rz, rz->in, rz->out);
+		next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+	}
+	_razf_buffered_write(rz, src, size);
+	return ori_size;
+}
+#endif
+
+/* gzip flag byte */
+#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
+#define COMMENT 0x10 /* bit 4 set: file comment present */
+#define RESERVED 0xE0 /* bits 5..7: reserved */
+
+/*
+ * Parse a gzip member header held in data[0..size).
+ *
+ * On success returns the header length in bytes and stores the offset and
+ * length of the extra field in *extra_off / *extra_len (length 0 when the
+ * header has none). Returns 0 when the buffer does not start with a deflate
+ * gzip header, uses reserved flag bits, or is too short to hold the fixed
+ * 10-byte part, the extra field or the header CRC. An unterminated name or
+ * comment is accepted and consumes the rest of the buffer.
+ */
+static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){
+	int method, flags, n, len;
+	if(size < 2) return 0;
+	if(data[0] != 0x1f || data[1] != 0x8b) return 0;
+	if(size < 4) return 0;
+	method = data[2];
+	flags = data[3];
+	if(method != Z_DEFLATED || (flags & RESERVED)) return 0;
+	n = 4 + 6; // Skip 6 bytes
+	// the fixed part must be complete, otherwise the returned length would
+	// point past the end of the buffer
+	if(size < n) return 0;
+	*extra_off = n + 2;
+	*extra_len = 0;
+	if(flags & EXTRA_FIELD){
+		if(size < n + 2) return 0;
+		len = ((int)data[n + 1] << 8) | data[n]; // little-endian 16-bit length
+		n += 2;
+		*extra_off = n;
+		if(len > size - n) return 0;
+		n += len;
+		*extra_len = n - (*extra_off);
+	}
+	if(flags & ORIG_NAME) while(n < size && data[n++]);
+	if(flags & COMMENT) while(n < size && data[n++]);
+	if(flags & HEAD_CRC){
+		if(n + 2 > size) return 0;
+		n += 2;
+	}
+	return n;
+}
+
+/*
+ * Create a RAZF handle for reading and classify the input.
+ *
+ * The first RZ_BUFFER_SIZE bytes are read and parsed as a gzip header:
+ *  - no gzip header (or inflate cannot be initialised): FILE_TYPE_PLAIN,
+ *    the bytes read so far are served from the output buffer;
+ *  - gzip header without a matching "RAZF" extra field or with a different
+ *    block size: FILE_TYPE_GZ;
+ *  - otherwise FILE_TYPE_RZ. The last 16 bytes are then read as two
+ *    big-endian 64-bit values (src_end, end); if the input can be
+ *    positioned at `end` the block index is loaded from there, else the
+ *    handle is marked unseekable.
+ *
+ * A failed initial read is treated as an empty input so that the negative
+ * count is never used as a length. Other read/seek results and allocations
+ * are not checked.
+ */
+#ifdef _USE_KNETFILE
+static RAZF* razf_open_r(knetFile *fp, int _load_index){
+#else
+static RAZF* razf_open_r(int fd, int _load_index){
+#endif
+	RAZF *rz;
+	int ext_off, ext_len;
+	int n, is_be, ret;
+	int64_t end;
+	unsigned char c[] = "RAZF";
+	rz = calloc(1, sizeof(RAZF));
+	rz->mode = 'r';
+#ifdef _USE_KNETFILE
+	rz->x.fpr = fp;
+#else
+#ifdef _WIN32
+	setmode(fd, O_BINARY);
+#endif
+	rz->filedes = fd;
+#endif
+	rz->stream = calloc(sizeof(z_stream), 1);
+	rz->inbuf = malloc(RZ_BUFFER_SIZE);
+	rz->outbuf = malloc(RZ_BUFFER_SIZE);
+	rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL;
+#ifdef _USE_KNETFILE
+	n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
+#else
+	n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+#endif
+	if(n < 0) n = 0; // read error: continue as if nothing was available
+	ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len);
+	if(ret == 0){
+		PLAIN_FILE:
+		rz->in = n;
+		rz->file_type = FILE_TYPE_PLAIN;
+		memcpy(rz->outbuf, rz->inbuf, n);
+		rz->buf_len = n;
+		free(rz->stream);
+		rz->stream = NULL;
+		return rz;
+	}
+	rz->header_size = ret;
+	ret = inflateInit2(rz->stream, -WINDOW_BITS);
+	if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
+	rz->stream->avail_in = n - rz->header_size;
+	rz->stream->next_in = rz->inbuf + rz->header_size;
+	rz->stream->avail_out = RZ_BUFFER_SIZE;
+	rz->stream->next_out = rz->outbuf;
+	rz->file_type = FILE_TYPE_GZ;
+	rz->in = rz->header_size;
+	rz->block_pos = rz->header_size;
+	rz->next_block_pos = rz->header_size;
+	rz->block_off = 0;
+	if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz;
+	if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){
+		fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
+		return rz;
+	}
+	rz->load_index = _load_index;
+	rz->file_type = FILE_TYPE_RZ;
+#ifdef _USE_KNETFILE
+	if(knet_seek(fp, -16, SEEK_END) == -1){
+#else
+	if(lseek(fd, -16, SEEK_END) == -1){
+#endif
+		UNSEEKABLE:
+		rz->seekable = 0;
+		rz->index = NULL;
+		rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
+	} else {
+		is_be = is_big_endian();
+		rz->seekable = 1;
+		// trailer, first value: stored into src_end
+#ifdef _USE_KNETFILE
+		knet_read(fp, &end, sizeof(int64_t));
+#else
+		read(fd, &end, sizeof(int64_t));
+#endif
+		if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end);
+		else rz->src_end = end;
+
+		// trailer, second value: stored into end
+#ifdef _USE_KNETFILE
+		knet_read(fp, &end, sizeof(int64_t));
+#else
+		read(fd, &end, sizeof(int64_t));
+#endif
+		if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
+		else rz->end = end;
+		if(n > rz->end){
+			// the first read went past `end`: drop the excess from the inflate input
+			rz->stream->avail_in -= n - rz->end;
+			n = rz->end;
+		}
+		if(rz->end > rz->src_end){
+#ifdef _USE_KNETFILE
+			knet_seek(fp, rz->in, SEEK_SET);
+#else
+			lseek(fd, rz->in, SEEK_SET);
+#endif
+			goto UNSEEKABLE;
+		}
+#ifdef _USE_KNETFILE
+		knet_seek(fp, rz->end, SEEK_SET);
+		if(knet_tell(fp) != rz->end){
+			knet_seek(fp, rz->in, SEEK_SET);
+#else
+		if(lseek(fd, rz->end, SEEK_SET) != rz->end){
+			lseek(fd, rz->in, SEEK_SET);
+#endif
+			goto UNSEEKABLE;
+		}
+#ifdef _USE_KNETFILE
+		load_zindex(rz, fp);
+		knet_seek(fp, n, SEEK_SET);
+#else
+		load_zindex(rz, fd);
+		lseek(fd, n, SEEK_SET);
+#endif
+	}
+	return rz;
+}
+
+/*
+ * razf_dopen / razf_dopen2: wrap an already open descriptor.
+ *
+ * A mode containing 'r' is tested first, then 'w'; any other mode yields
+ * NULL. Without _USE_KNETFILE, razf_dopen reads with the block index
+ * requested and razf_dopen2 without it. With _USE_KNETFILE, reading from a
+ * descriptor is not implemented: razf_dopen prints a notice for 'r' and
+ * razf_dopen2 always prints a notice; both return NULL in those cases.
+ */
+#ifdef _USE_KNETFILE
+RAZF* razf_dopen(int fd, const char *mode){
+	if(strstr(mode, "r") != NULL){
+		fprintf(stderr,"[razf_dopen] implement me\n");
+		return NULL;
+	}
+	if(strstr(mode, "w") != NULL) return razf_open_w(fd);
+	return NULL;
+}
+
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+	fprintf(stderr,"[razf_dopen2] implement me\n");
+	return NULL;
+}
+#else
+RAZF* razf_dopen(int fd, const char *mode){
+	if(strstr(mode, "r") != NULL) return razf_open_r(fd, 1);
+	if(strstr(mode, "w") != NULL) return razf_open_w(fd);
+	return NULL;
+}
+
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+	if(strstr(mode, "r") != NULL) return razf_open_r(fd, 0);
+	if(strstr(mode, "w") != NULL) return razf_open_w(fd);
+	return NULL;
+}
+#endif
+
+/*
+ * Open `filename` and wrap it in a RAZF handle.
+ *
+ * A mode containing 'r' opens for reading (through knet_open() when
+ * _USE_KNETFILE is defined, open() otherwise) and passes _load_index on to
+ * razf_open_r(); otherwise a mode containing 'w' creates/truncates the file
+ * and calls razf_open_w(). Returns NULL when the file cannot be opened or
+ * the mode has neither letter.
+ *
+ * The knetfile handle has its own variable and its own null check; the
+ * `fd < 0` test is applied to the integer descriptor only.
+ */
+static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
+	int fd;
+	RAZF *rz;
+	if(strstr(mode, "r")){
+#ifdef _USE_KNETFILE
+		knetFile *fp = knet_open(filename, "r");
+		if (fp == 0) {
+			fprintf(stderr, "[_razf_open] fail to open %s\n", filename);
+			return NULL;
+		}
+		rz = razf_open_r(fp, _load_index);
+#else
+#ifdef _WIN32
+		fd = open(filename, O_RDONLY | O_BINARY);
+#else
+		fd = open(filename, O_RDONLY);
+#endif
+		if(fd < 0) return NULL;
+		rz = razf_open_r(fd, _load_index);
+#endif
+	} else if(strstr(mode, "w")){
+#ifdef _WIN32
+		fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
+#else
+		fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
+#endif
+		if(fd < 0) return NULL;
+		rz = razf_open_w(fd);
+	} else return NULL;
+	return rz;
+}
+
+/* Open `filename` in `mode`, asking for the block index to be loaded. */
+RAZF* razf_open(const char *filename, const char *mode){
+	const int load_index = 1;
+	return _razf_open(filename, mode, load_index);
+}
+
+/* Open `filename` in `mode` without loading the block index. */
+RAZF* razf_open2(const char *filename, const char *mode){
+	const int load_index = 0;
+	return _razf_open(filename, mode, load_index);
+}
+
+/*
+ * Report the uncompressed (*u_size) and compressed (*c_size) sizes.
+ *
+ * Returns 1 when the sizes were stored, 0 otherwise. Only handles in mode
+ * 'r' or 'R' are served. For a plain file both sizes are the file length,
+ * found by seeking to the end once (the position is restored). For an RZ
+ * file the values come from src_end/end and 0 is returned when the two are
+ * equal. Plain gzip input and unknown types return 0.
+ */
+int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){
+	int64_t here;
+	if(!(rz->mode == 'r' || rz->mode == 'R')) return 0;
+	if(rz->file_type == FILE_TYPE_RZ){
+		if(rz->src_end == rz->end) return 0;
+		*u_size = rz->src_end;
+		*c_size = rz->end;
+		return 1;
+	}
+	if(rz->file_type != FILE_TYPE_PLAIN) return 0;
+	if(rz->end == 0x7fffffffffffffffLL){
+		// length not known yet: measure it
+#ifdef _USE_KNETFILE
+		if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0;
+		here = knet_tell(rz->x.fpr);
+		knet_seek(rz->x.fpr, 0, SEEK_END);
+		rz->end = knet_tell(rz->x.fpr);
+		knet_seek(rz->x.fpr, here, SEEK_SET);
+#else
+		if((here = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0;
+		rz->end = lseek(rz->filedes, 0, SEEK_END);
+		lseek(rz->filedes, here, SEEK_SET);
+#endif
+	}
+	*u_size = *c_size = rz->end;
+	return 1;
+}
+
+/*
+ * Produce up to `size` bytes of output into `data`.
+ *
+ * Plain files are read directly. Compressed input is inflated with Z_BLOCK,
+ * refilling the input buffer from the file as needed but never reading past
+ * rz->end. The call stops early at a deflate block boundary that is not the
+ * last block (buf_flush is set and next_block_pos records the position), at
+ * the end of the stream (z_eof) or on an inflate error (z_err).
+ *
+ * Returns the number of bytes stored, 0 once z_eof or z_err is set. A
+ * failed read (negative count) is handled like end of file, so it is never
+ * returned as a length nor stored into the unsigned avail_in.
+ */
+static int _razf_read(RAZF* rz, void *data, int size){
+	int ret, tin;
+	int64_t want;
+	if(rz->z_eof || rz->z_err) return 0;
+	if (rz->file_type == FILE_TYPE_PLAIN) {
+#ifdef _USE_KNETFILE
+		ret = knet_read(rz->x.fpr, data, size);
+#else
+		ret = read(rz->filedes, data, size);
+#endif
+		if (ret <= 0) { // end of file, or a read error
+			rz->z_eof = 1;
+			ret = 0;
+		}
+		return ret;
+	}
+	rz->stream->avail_out = size;
+	rz->stream->next_out = data;
+	while(rz->stream->avail_out){
+		if(rz->stream->avail_in == 0){
+			if(rz->in >= rz->end){ rz->z_eof = 1; break; }
+			want = rz->end - rz->in;
+			if(want > RZ_BUFFER_SIZE) want = RZ_BUFFER_SIZE;
+#ifdef _USE_KNETFILE
+			ret = knet_read(rz->x.fpr, rz->inbuf, want);
+#else
+			ret = read(rz->filedes, rz->inbuf, want);
+#endif
+			if(ret <= 0){ // nothing more to inflate
+				rz->z_eof = 1;
+				break;
+			}
+			rz->stream->avail_in = ret;
+			rz->stream->next_in = rz->inbuf;
+		}
+		tin = rz->stream->avail_in;
+		ret = inflate(rz->stream, Z_BLOCK);
+		rz->in += tin - rz->stream->avail_in;
+		if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
+			fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? rz->stream->msg : "", __FILE__, __LINE__);
+			rz->z_err = 1;
+			break;
+		}
+		if(ret == Z_STREAM_END){
+			rz->z_eof = 1;
+			break;
+		}
+		// data_type bit 128: inflate stopped at a block end; bit 64: inside the last block
+		if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){
+			rz->buf_flush = 1;
+			rz->next_block_pos = rz->in;
+			break;
+		}
+	}
+	return size - rz->stream->avail_out;
+}
+
+/*
+ * Read up to `size` uncompressed bytes into `data`.
+ *
+ * Bytes are served from the output buffer, which is refilled through
+ * _razf_read(). block_off follows the bytes delivered; when a buffer that
+ * ended on a block boundary (buf_flush) has been used up, block_pos moves
+ * to next_block_pos and block_off restarts at 0. rz->out grows by the
+ * number of bytes delivered, which is also the return value.
+ *
+ * The loop ends when the request is satisfied, when a refill returns
+ * nothing after end of file or after an inflate error (so a corrupt stream
+ * cannot spin forever), or when a refill reports a negative count.
+ */
+int razf_read(RAZF *rz, void *data, int size){
+	char *dst = data; // void* cannot be advanced portably
+	int ori_size;
+	ori_size = size;
+	while(size > 0){
+		if(rz->buf_len){
+			if(size < rz->buf_len){
+				memcpy(dst, (char*)rz->outbuf + rz->buf_off, size);
+				rz->buf_off += size;
+				rz->buf_len -= size;
+				dst += size;
+				rz->block_off += size;
+				size = 0;
+				break;
+			} else {
+				memcpy(dst, (char*)rz->outbuf + rz->buf_off, rz->buf_len);
+				dst += rz->buf_len;
+				size -= rz->buf_len;
+				rz->block_off += rz->buf_len;
+				rz->buf_off = 0;
+				rz->buf_len = 0;
+				if(rz->buf_flush){
+					rz->block_pos = rz->next_block_pos;
+					rz->block_off = 0;
+					rz->buf_flush = 0;
+				}
+			}
+		} else if(rz->buf_flush){
+			rz->block_pos = rz->next_block_pos;
+			rz->block_off = 0;
+			rz->buf_flush = 0;
+		}
+		if(rz->buf_flush) continue;
+		rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+		if(rz->buf_len < 0){ // failed read: do not use the count as a length
+			rz->buf_len = 0;
+			break;
+		}
+		if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break;
+	}
+	rz->out += ori_size - size;
+	return ori_size - size;
+}
+
+/*
+ * Discard up to `size` uncompressed bytes, keeping the same bookkeeping as
+ * razf_read(): block_off follows the bytes consumed, block_pos/block_off
+ * move to the next block once a buffer ending on a block boundary has been
+ * used up, and rz->out grows by the number of bytes skipped, which is
+ * returned.
+ *
+ * Changes from the earlier code:
+ *  - block_off is advanced before buf_len is cleared (it used to be
+ *    advanced by the already zeroed length);
+ *  - data still buffered when end of file or an error is flagged is
+ *    consumed instead of being left behind;
+ *  - a negative count from the refill ends the skip.
+ */
+int razf_skip(RAZF* rz, int size){
+	int ori_size;
+	ori_size = size;
+	while(size > 0){
+		if(rz->buf_len){
+			if(size < rz->buf_len){
+				rz->buf_off += size;
+				rz->buf_len -= size;
+				rz->block_off += size;
+				size = 0;
+				break;
+			} else {
+				size -= rz->buf_len;
+				rz->block_off += rz->buf_len;
+				rz->buf_off = 0;
+				rz->buf_len = 0;
+				if(rz->buf_flush){
+					rz->block_pos = rz->next_block_pos;
+					rz->block_off = 0;
+					rz->buf_flush = 0;
+				}
+			}
+		} else if(rz->buf_flush){
+			rz->block_pos = rz->next_block_pos;
+			rz->block_off = 0;
+			rz->buf_flush = 0;
+		}
+		if(rz->buf_flush) continue;
+		rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+		if(rz->buf_len < 0){ // failed read
+			rz->buf_len = 0;
+			break;
+		}
+		if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break;
+	}
+	rz->out += ori_size - size;
+	return ori_size - size;
+}
+
+/*
+ * Restart decompression at compressed offset `in`, declaring that the
+ * uncompressed position there is `out`. The file is repositioned, the
+ * block bookkeeping and EOF/error flags are cleared, the inflate state is
+ * reset and both buffers are emptied.
+ */
+static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){
+#ifdef _USE_KNETFILE
+	knet_seek(rz->x.fpr, in, SEEK_SET);
+#else
+	lseek(rz->filedes, in, SEEK_SET);
+#endif
+	// positions
+	rz->in = in;
+	rz->out = out;
+	rz->block_pos = in;
+	rz->next_block_pos = in;
+	rz->block_off = 0;
+	// flags
+	rz->buf_flush = 0;
+	rz->z_eof = 0;
+	rz->z_err = 0;
+	// decoder and buffers
+	inflateReset(rz->stream);
+	rz->stream->avail_in = 0;
+	rz->buf_len = 0;
+	rz->buf_off = 0;
+}
+
+/*
+ * Move to `block_offset` bytes past the block starting at compressed
+ * offset `block_start`.
+ *
+ * Plain files are simply repositioned at block_start + block_offset and
+ * that position is returned. Otherwise, if the target lies in the current
+ * block at or after the current offset, only the difference is skipped;
+ * in every other case decompression restarts at block_start (0 is
+ * replaced by the header size) before skipping. Returns rz->block_off.
+ */
+int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){
+	int64_t where;
+	rz->z_eof = 0;
+	if(rz->file_type == FILE_TYPE_PLAIN){
+		rz->buf_off = rz->buf_len = 0;
+		where = block_start + block_offset;
+#ifdef _USE_KNETFILE
+		knet_seek(rz->x.fpr, where, SEEK_SET);
+		where = knet_tell(rz->x.fpr);
+#else
+		where = lseek(rz->filedes, where, SEEK_SET);
+#endif
+		rz->out = rz->in = where;
+		return where;
+	}
+	if(block_start == rz->block_pos && block_offset >= rz->block_off){
+		// same block, moving forward: no need to reset inflate
+		block_offset -= rz->block_off;
+	} else {
+		if(block_start == 0) block_start = rz->header_size; // automatically correct a zero block_start
+		_razf_reset_read(rz, block_start, 0);
+	}
+	if(block_offset) razf_skip(rz, block_offset);
+	return rz->block_off;
+}
+
+/*
+ * Seek to uncompressed position `pos`, interpreted relative to the start
+ * (SEEK_SET), the current position rz->out (SEEK_CUR) or src_end
+ * (SEEK_END). Returns the resulting uncompressed position.
+ *
+ *  - plain file: the descriptor is repositioned;
+ *  - plain gzip: only forward movement is possible (by skipping); a
+ *    backward request leaves the position unchanged;
+ *  - RZ file: the block index gives the compressed offset of the block
+ *    boundary preceding `pos`; decompression restarts there unless plain
+ *    skipping from the current position gets there too. Requests beyond
+ *    src_end leave the position unchanged.
+ *
+ * When no index is available (file not seekable, or opened without the
+ * index) a backward request that would need an index lookup now leaves the
+ * position unchanged, as for plain gzip, instead of reading through a null
+ * index pointer.
+ */
+int64_t razf_seek(RAZF* rz, int64_t pos, int where){
+	int64_t idx;
+	int64_t seek_pos, new_out;
+	rz->z_eof = 0;
+	if (where == SEEK_CUR) pos += rz->out;
+	else if (where == SEEK_END) pos += rz->src_end;
+	if(rz->file_type == FILE_TYPE_PLAIN){
+#ifdef _USE_KNETFILE
+		knet_seek(rz->x.fpr, pos, SEEK_SET);
+		seek_pos = knet_tell(rz->x.fpr);
+#else
+		seek_pos = lseek(rz->filedes, pos, SEEK_SET);
+#endif
+		rz->buf_off = rz->buf_len = 0;
+		rz->out = rz->in = seek_pos;
+		return seek_pos;
+	} else if(rz->file_type == FILE_TYPE_GZ){
+		if(pos >= rz->out) goto SKIP;
+		return rz->out;
+	}
+	if(pos == rz->out) return pos;
+	if(pos > rz->src_end) return rz->out;
+	if(!rz->seekable || !rz->load_index){
+		if(pos >= rz->out) goto SKIP;
+	}
+	idx = pos / RZ_BLOCK_SIZE - 1; // index entry of the boundary before pos; -1 means the first block
+	if(idx >= 0 && rz->index == NULL) return rz->out; // no index to consult
+	seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+	new_out = (idx + 1) * RZ_BLOCK_SIZE;
+	if(pos > rz->out && new_out <= rz->out) goto SKIP; // already inside the target block, before pos
+	_razf_reset_read(rz, seek_pos, new_out);
+	SKIP:
+	razf_skip(rz, (int)(pos - rz->out));
+	return rz->out;
+}
+
+/*
+ * Return the current position as a packed offset: block_pos shifted left
+ * by 16 bits, combined with the low 16 bits of block_off. razf_seek2()
+ * takes the same layout apart.
+ */
+uint64_t razf_tell2(RAZF *rz)
+{
+	const uint64_t block = (uint64_t)rz->block_pos << 16;
+	return block | (rz->block_off&0xffff);
+}
+
+/*
+ * Seek to a packed offset: the bits above the low 16 give the block start,
+ * the low 16 bits the offset inside the block. Only SEEK_SET is supported;
+ * any other `where` returns -1. Otherwise returns what razf_jump() returns.
+ */
+int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
+{
+	int64_t block_start;
+	int block_offset;
+	if (where == SEEK_SET) {
+		block_start = voffset>>16;
+		block_offset = voffset&0xffff;
+		return razf_jump(rz, block_start, block_offset);
+	}
+	return -1;
+}
+
+/*
+ * Close a RAZF handle and release everything it owns.
+ *
+ * In write mode (when writing is compiled in) the deflate stream is
+ * finished, the block index is saved and two big-endian 64-bit values,
+ * rz->in then rz->out, are appended. In read mode the inflate state is
+ * released if there is one. Buffers, header, index and stream are freed
+ * and the underlying file is closed.
+ */
+void razf_close(RAZF *rz){
+	if(rz->mode == 'w'){
+#ifndef _RZ_READONLY
+#ifdef _USE_KNETFILE
+		const int out_fd = rz->x.fpw;
+#else
+		const int out_fd = rz->filedes;
+#endif
+		int64_t tail[2];
+		razf_end_flush(rz);
+		deflateEnd(rz->stream);
+		save_zindex(rz, out_fd);
+		tail[0] = rz->in;
+		tail[1] = rz->out;
+		if(!is_big_endian()){
+			tail[0] = (int64_t)byte_swap_8((uint64_t)tail[0]);
+			tail[1] = (int64_t)byte_swap_8((uint64_t)tail[1]);
+		}
+		write(out_fd, &tail[0], sizeof(int64_t));
+		write(out_fd, &tail[1], sizeof(int64_t));
+#endif
+	} else if(rz->mode == 'r'){
+		if(rz->stream) inflateEnd(rz->stream);
+	}
+	free(rz->inbuf);
+	free(rz->outbuf);
+	if(rz->header){
+		free(rz->header->extra);
+		free(rz->header->name);
+		free(rz->header->comment);
+		free(rz->header);
+	}
+	if(rz->index){
+		free(rz->index->bin_offsets);
+		free(rz->index->cell_offsets);
+		free(rz->index);
+	}
+	free(rz->stream);
+#ifdef _USE_KNETFILE
+	if (rz->mode == 'r')
+		knet_close(rz->x.fpr);
+	if (rz->mode == 'w')
+		close(rz->x.fpw);
+#else
+	close(rz->filedes);
+#endif
+	free(rz);
+}
+
+#endif
diff --git a/samtools-0.1.19/razf.h b/samtools-0.1.19/razf.h
new file mode 100644
index 0000000..60a0c96
--- /dev/null
+++ b/samtools-0.1.19/razf.h
@@ -0,0 +1,134 @@
+ /*-
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue at gmail.com>, Heng Li <lh3 at sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef __RAZF_RJ_H
+#define __RAZF_RJ_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+#if ZLIB_VERNUM < 0x1221
+#define _RZ_READONLY
+struct _gz_header_s;
+typedef struct _gz_header_s _gz_header;
+#define gz_header _gz_header
+#endif
+
+#define WINDOW_BITS 15
+
+#ifndef RZ_BLOCK_SIZE
+#define RZ_BLOCK_SIZE (1<<WINDOW_BITS)
+#endif
+
+#ifndef RZ_BUFFER_SIZE
+#define RZ_BUFFER_SIZE 4096
+#endif
+
+#ifndef RZ_COMPRESS_LEVEL
+#define RZ_COMPRESS_LEVEL 6
+#endif
+
+#define RZ_BIN_SIZE ((1LLU << 32) / RZ_BLOCK_SIZE)
+
+/* Block index kept alongside a RAZF stream; both arrays are heap-allocated and
+ * released by razf_close().
+ * NOTE(review): the per-field notes suggest `i` is a block number -- confirm
+ * against the index builder in razf.c. */
+typedef struct {
+ uint32_t *cell_offsets; // indexed by i
+ int64_t *bin_offsets; // indexed by i / BIN_SIZE
+ int size; // presumably the number of entries in use -- TODO confirm
+ int cap; // presumably the allocated capacity -- TODO confirm
+} ZBlockIndex;
+/* When storing index, output bytes in Big-Endian everywhere */
+
+#define FILE_TYPE_RZ 1
+#define FILE_TYPE_PLAIN 2
+#define FILE_TYPE_GZ 3
+
+/* State of one open RAZF (random-access compressed) file handle. */
+typedef struct RandomAccessZFile {
+ char mode; /* 'w' : write mode; 'r' : read mode */
+ int file_type;
+ /* One of the FILE_TYPE_* constants. razf_read also accepts a plain file as input; in that case it works as a buffered fread. */
+#ifdef _USE_KNETFILE
+ union {
+ knetFile *fpr;
+ int fpw;
+ } x;
+#else
+ int filedes; /* the file descriptor */
+#endif
+ z_stream *stream;
+ ZBlockIndex *index;
+ int64_t in, out, end, src_end;
+ /* in: total number of bytes in; out: total number of bytes out; */
+ /* end: the end of all data blocks, which is also the start of the index; src_end: the true end position in the uncompressed file */
+ int buf_flush; // the buffer should be flushed; suspend inflate until the buffer is empty
+ int64_t block_pos, block_off, next_block_pos;
+ /* block_pos: the start position of the current block in the compressed file */
+ /* block_off: how many bytes have been read from the current block */
+ void *inbuf, *outbuf;
+ int header_size;
+ gz_header *header;
+ /* header is used to transfer inflate_state->mode from HEAD to TYPE after calling inflateReset */
+ int buf_off, buf_len;
+ int z_err, z_eof;
+ int seekable;
+ /* Indicates whether the source is seekable */
+ int load_index;
+ /* set has_index to 0 in mode 'w', then index will be discarded */
+ /* NOTE(review): no field named has_index is visible here; the note above presumably refers to load_index -- confirm */
+} RAZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ RAZF* razf_dopen(int data_fd, const char *mode);
+ RAZF *razf_open(const char *fn, const char *mode);
+ int razf_write(RAZF* rz, const void *data, int size);
+ int razf_read(RAZF* rz, void *data, int size);
+ int64_t razf_seek(RAZF* rz, int64_t pos, int where);
+ void razf_close(RAZF* rz);
+
+#define razf_tell(rz) ((rz)->out)
+
+ RAZF* razf_open2(const char *filename, const char *mode);
+ RAZF* razf_dopen2(int fd, const char *mode);
+ uint64_t razf_tell2(RAZF *rz);
+ int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/razip.c b/samtools-0.1.19/razip.c
new file mode 100644
index 0000000..825e732
--- /dev/null
+++ b/samtools-0.1.19/razip.c
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include "razf.h"
+
+#define WINDOW_SIZE 4096
+
+/* Print the razip command-line synopsis on stdout.  Always returns 0 so that
+ * callers can `return razf_main_usage();` directly. */
+static int razf_main_usage()
+{
+    static const char *const usage_lines[] = {
+        "\n",
+        "Usage: razip [options] [file] ...\n\n",
+        "Options: -c write on standard output, keep original files unchanged\n",
+        " -d decompress\n",
+        " -l list compressed file contents\n",
+        " -b INT decompress at INT position in the uncompressed file\n",
+        " -s INT decompress INT bytes in the uncompressed file\n",
+        " -h give this help\n",
+        "\n",
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(usage_lines) / sizeof(usage_lines[0]); ++i)
+        fputs(usage_lines[i], stdout);
+    return 0;
+}
+
+/*
+ * Open `fn` for writing and return its file descriptor.
+ *
+ * Unless `is_forced` is non-zero the file is first created exclusively; if it
+ * already exists the user is asked on stdin whether to overwrite it.  Any
+ * answer other than 'y'/'Y' -- including end-of-file or a read error on stdin
+ * -- terminates the process with exit status 1.  The process also exits with
+ * status 1 when the file cannot be opened at all, so the function never
+ * returns a negative descriptor.
+ */
+static int write_open(const char *fn, int is_forced)
+{
+    int fd = -1;
+    char c = 'n';
+    if (!is_forced) {
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
+            printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+            /* A failed read leaves `c` untouched, so treat it as "no". */
+            if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+                printf("razip: not overwritten\n");
+                exit(1);
+            }
+        }
+    }
+    if (fd < 0) {
+        /* Forced, confirmed, or the exclusive create failed for another reason. */
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
+            fprintf(stderr, "razip: %s: Fail to write\n", fn);
+            exit(1);
+        }
+    }
+    return fd;
+}
+
+/*
+ * razip entry point.
+ *
+ * Options: -c write to stdout, -d decompress, -l list, -f overwrite without
+ * asking, -b INT start offset, -s INT byte count, -h help.
+ * Without -d/-l the input is compressed to "<file>.rz" (or stdout with -c);
+ * with -d "<file>.rz" is expanded to "<file>" (or stdout with -c).  When the
+ * output goes to a file the input file is removed after a successful run.
+ * Returns 0 on success and 1 on any error.
+ */
+int main(int argc, char **argv)
+{
+    int c, compress, pstdout, is_forced;
+    RAZF *rz;
+    void *buffer;
+    long start, end, size;
+
+    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+    while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){
+        switch(c){
+            case 'h': return razf_main_usage();
+            case 'd': compress = 0; break;
+            case 'c': pstdout = 1; break;
+            case 'l': compress = 2; break;
+            case 'b': start = atol(optarg); break;
+            case 's': size = atol(optarg); break;
+            case 'f': is_forced = 1; break;
+        }
+    }
+    if (size >= 0) end = start + size;
+    if(end >= 0 && end < start){
+        fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+        return 1;
+    }
+    if(compress == 1){
+        int f_src, f_dst = -1;
+        if(argc > optind){
+            if((f_src = open(argv[optind], O_RDONLY)) < 0){
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if(pstdout){
+                f_dst = fileno(stdout);
+            } else {
+                /* Room for the input name, ".rz" and the terminating NUL. */
+                size_t name_len = strlen(argv[optind]);
+                char *name = malloc(name_len + 4);
+                if (name == NULL) {
+                    fprintf(stderr, "razip: out of memory\n");
+                    return 1;
+                }
+                memcpy(name, argv[optind], name_len);
+                memcpy(name + name_len, ".rz", 4);
+                f_dst = write_open(name, is_forced);
+                free(name);
+                if (f_dst < 0) return 1;
+            }
+        } else if(pstdout){
+            f_src = fileno(stdin);
+            f_dst = fileno(stdout);
+        } else return razf_main_usage();
+        rz = razf_dopen(f_dst, "w");
+        buffer = malloc(WINDOW_SIZE);
+        if (rz == NULL || buffer == NULL) {
+            fprintf(stderr, "razip: cannot set up the compressor\n");
+            if (rz) razf_close(rz);
+            free(buffer);
+            return 1;
+        }
+        while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
+        razf_close(rz); // f_dst will be closed here
+        free(buffer);
+        close(f_src);
+        if (c < 0) {
+            /* The input was not read completely: keep the original file. */
+            fprintf(stderr, "razip: read error, input kept\n");
+            return 1;
+        }
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        return 0;
+    } else {
+        if(argc <= optind) return razf_main_usage();
+        if(compress == 2){
+            rz = razf_open(argv[optind], "r");
+            if (rz == NULL) {
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if(rz->file_type == FILE_TYPE_RZ) {
+                printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name");
+                printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end,
+                       argv[optind]);
+            } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]);
+        } else {
+            int f_dst;
+            char *name = NULL;
+            if (!pstdout) {
+                /* The output name is the input name minus a trailing ".rz". */
+                size_t len = strlen(argv[optind]);
+                if (len < 3 || strcmp(argv[optind] + len - 3, ".rz") != 0) {
+                    printf("razip: %s: unknown suffix -- ignored\n", argv[optind]);
+                    return 1;
+                }
+                name = strdup(argv[optind]);
+                if (name == NULL) {
+                    fprintf(stderr, "razip: out of memory\n");
+                    return 1;
+                }
+                name[len - 3] = '\0';
+            }
+            /* Open the input before creating or truncating the output. */
+            rz = razf_open(argv[optind], "r");
+            if (rz == NULL) {
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                free(name);
+                return 1;
+            }
+            if (name != NULL) {
+                f_dst = write_open(name, is_forced);
+                free(name);
+            } else f_dst = fileno(stdout);
+            buffer = malloc(WINDOW_SIZE);
+            if (buffer == NULL) {
+                fprintf(stderr, "razip: out of memory\n");
+                razf_close(rz);
+                return 1;
+            }
+            razf_seek(rz, start, SEEK_SET);
+            while(1){
+                if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE);
+                else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+                if(c <= 0) break;
+                start += c;
+                write(f_dst, buffer, c);
+                if(end >= 0 && start >= end) break;
+            }
+            free(buffer);
+            if (!pstdout) {
+                close(f_dst);
+                unlink(argv[optind]);
+            }
+        }
+        razf_close(rz);
+        return 0;
+    }
+}
+
diff --git a/samtools-0.1.19/sam.c b/samtools-0.1.19/sam.c
new file mode 100644
index 0000000..fa11df6
--- /dev/null
+++ b/samtools-0.1.19/sam.c
@@ -0,0 +1,186 @@
+#include <string.h>
+#include <unistd.h>
+#include "faidx.h"
+#include "sam.h"
+
+#define TYPE_BAM 1
+#define TYPE_READ 2
+
+/*
+ * Return a newly allocated copy of header `h0`.  The text and the target
+ * name/length arrays are duplicated; the hash, dict and rg2lib members of the
+ * copy are reset to 0 instead of being shared with the original.
+ */
+bam_header_t *bam_header_dup(const bam_header_t *h0)
+{
+    bam_header_t *dup = bam_header_init();
+    int idx = 0;
+
+    *dup = *h0;
+    dup->rg2lib = 0;
+    dup->dict = dup->rg2lib;
+    dup->hash = dup->dict;
+
+    dup->text = (char*)calloc(dup->l_text + 1, 1);
+    memcpy(dup->text, h0->text, dup->l_text);
+
+    dup->target_len = (uint32_t*)calloc(dup->n_targets, 4);
+    dup->target_name = (char**)calloc(dup->n_targets, sizeof(void*));
+    while (idx < dup->n_targets) {
+        dup->target_len[idx] = h0->target_len[idx];
+        dup->target_name[idx] = strdup(h0->target_name[idx]);
+        ++idx;
+    }
+    return dup;
+}
+/*
+ * Append `len` bytes of `text` to header->text and keep it NUL-terminated.
+ * The buffer is grown when the rounded-up (kroundup32) size needed after the
+ * append exceeds the rounded-up current size.  A NULL `text` is a no-op.
+ */
+static void append_header_text(bam_header_t *header, char* text, int len)
+{
+    int cap_now, cap_needed;
+
+    if (text == 0) return;
+
+    cap_now = header->l_text + 1;
+    cap_needed = header->l_text + len + 1; // 1 byte null
+    kroundup32(cap_now);
+    kroundup32(cap_needed);
+    if (cap_now < cap_needed)
+        header->text = (char*)realloc(header->text, cap_needed);
+
+    // strcpy() cannot be used here: exactly `len` bytes must be written.
+    strncpy(header->text + header->l_text, text, len);
+    header->l_text += len;
+    header->text[header->l_text] = 0;
+}
+
+/*
+ * Enable multi-threaded BGZF on a handle via bgzf_mt().
+ * Only handles whose type has the BAM bit set and the READ bit clear are
+ * accepted; anything else returns -1.  Returns 0 on success.
+ */
+int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
+{
+    if ((fp->type & (TYPE_BAM | TYPE_READ)) == TYPE_BAM) {
+        bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
+        return 0;
+    }
+    return -1;
+}
+
+/*
+ * Open a SAM/BAM file for reading or writing.
+ *
+ * fn    file name; "-" means stdin (reading) or stdout (writing).
+ * mode  contains 'r' or 'w', optionally 'b' (BAM), 'u' or a digit
+ *       (compression level), 'h' (emit the SAM header), 'x'/'X' (flag format).
+ * aux   writing: the bam_header_t to copy (required);
+ *       reading text without @SQ lines: name of the reference list file.
+ *
+ * Returns a new handle, or 0 on failure.  On failure everything acquired so
+ * far (header copy, text header, SAM reader) is released.  A mode with neither
+ * 'r' nor 'w', or a NULL aux when writing, is treated as a failure.
+ */
+samfile_t *samopen(const char *fn, const char *mode, const void *aux)
+{
+    samfile_t *fp;
+    fp = (samfile_t*)calloc(1, sizeof(samfile_t));
+    if (fp == 0) return 0;
+    if (strchr(mode, 'r')) { // read
+        fp->type |= TYPE_READ;
+        if (strchr(mode, 'b')) { // binary
+            fp->type |= TYPE_BAM;
+            fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+            if (fp->x.bam == 0) goto open_err_ret;
+            fp->header = bam_header_read(fp->x.bam);
+        } else { // text
+            fp->x.tamr = sam_open(fn);
+            if (fp->x.tamr == 0) goto open_err_ret;
+            fp->header = sam_header_read(fp->x.tamr);
+            if (fp->header->n_targets == 0) { // no @SQ fields
+                if (aux) { // check if aux is present
+                    bam_header_t *textheader = fp->header;
+                    fp->header = sam_header_read2((const char*)aux);
+                    if (fp->header == 0) {
+                        // release what the error label cannot see any more
+                        bam_header_destroy(textheader);
+                        sam_close(fp->x.tamr);
+                        goto open_err_ret;
+                    }
+                    append_header_text(fp->header, textheader->text, textheader->l_text);
+                    bam_header_destroy(textheader);
+                }
+                if (fp->header->n_targets == 0 && bam_verbose >= 1)
+                    fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
+            } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
+        }
+    } else if (strchr(mode, 'w')) { // write
+        if (aux == 0) goto open_err_ret; // a header is mandatory for writing
+        fp->header = bam_header_dup((const bam_header_t*)aux);
+        if (strchr(mode, 'b')) { // binary
+            char bmode[3];
+            int i, compress_level = -1;
+            for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
+            if (mode[i]) compress_level = mode[i] - '0';
+            if (strchr(mode, 'u')) compress_level = 0;
+            bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
+            fp->type |= TYPE_BAM;
+            fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
+            if (fp->x.bam == 0) goto open_err_ret;
+            bam_header_write(fp->x.bam, fp->header);
+        } else { // text
+            // open file
+            fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
+            if (fp->x.tamw == 0) goto open_err_ret;
+            if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
+            else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
+            else fp->type |= BAM_OFDEC<<2;
+            // write header
+            if (strchr(mode, 'h')) {
+                int i;
+                bam_header_t *alt;
+                // parse the header text; `alt` only borrows the text buffer
+                alt = bam_header_init();
+                alt->l_text = fp->header->l_text; alt->text = fp->header->text;
+                sam_header_parse(alt);
+                alt->l_text = 0; alt->text = 0;
+                // check if there are @SQ lines in the header
+                fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
+                if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
+                    if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
+                        fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
+                } else { // then dump ->target_{name,len}
+                    for (i = 0; i < fp->header->n_targets; ++i)
+                        fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
+                }
+                bam_header_destroy(alt);
+            }
+        }
+    } else goto open_err_ret; // neither 'r' nor 'w': nothing was opened
+    return fp;
+
+open_err_ret:
+    if (fp->header) bam_header_destroy(fp->header);
+    free(fp);
+    return 0;
+}
+
+/*
+ * Close a handle returned by samopen() and free it.  The header is destroyed
+ * if present, then the underlying BAM file, SAM reader or SAM output stream
+ * is closed according to the type bits.  A NULL handle is ignored.
+ */
+void samclose(samfile_t *fp)
+{
+    if (fp != 0) {
+        if (fp->header) bam_header_destroy(fp->header);
+
+        if (fp->type & TYPE_BAM) {
+            bam_close(fp->x.bam);
+        } else if (fp->type & TYPE_READ) {
+            sam_close(fp->x.tamr);
+        } else {
+            fclose(fp->x.tamw);
+        }
+        free(fp);
+    }
+}
+
+/*
+ * Read one alignment into `b`.  Returns -1 when `fp` is NULL or not open for
+ * reading; otherwise the result of bam_read1() (BAM) or sam_read1() (SAM).
+ */
+int samread(samfile_t *fp, bam1_t *b)
+{
+    if (fp != 0 && (fp->type & TYPE_READ)) {
+        if (fp->type & TYPE_BAM)
+            return bam_read1(fp->x.bam, b);
+        return sam_read1(fp->x.tamr, fp->header, b);
+    }
+    return -1; // not open for reading
+}
+
+/*
+ * Write one alignment.  Returns -1 when `fp` is NULL or open for reading.
+ * For BAM the result of bam_write1() is returned; for SAM the record is
+ * formatted with the flag format stored in type bits 2-3, written followed by
+ * a newline, and the number of characters written (text plus newline) is
+ * returned.
+ */
+int samwrite(samfile_t *fp, const bam1_t *b)
+{
+    char *line;
+    int n_chars;
+
+    if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
+    if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b);
+
+    line = bam_format1_core(fp->header, b, fp->type>>2&3);
+    n_chars = strlen(line);
+    fputs(line, fp->x.tamw);
+    fputc('\n', fp->x.tamw);
+    free(line);
+    return n_chars + 1;
+}
+
+/*
+ * Run a pileup over every alignment readable from `fp`: each record is pushed
+ * into a pileup buffer configured with `mask`, and a final NULL push is made
+ * once reading stops.  `func` is called with `func_data` by the buffer.
+ * Always returns 0.
+ */
+int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
+{
+    bam1_t *rec = bam_init1();
+    bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+
+    bam_plbuf_set_mask(plbuf, mask);
+    for (;;) {
+        if (samread(fp, rec) < 0) break;
+        bam_plbuf_push(rec, plbuf);
+    }
+    bam_plbuf_push(0, plbuf);
+
+    bam_plbuf_destroy(plbuf);
+    bam_destroy1(rec);
+    return 0;
+}
+
+/*
+ * Return a newly allocated "<fn_ref>.fai" path (the caller frees it).
+ * If that index is not readable and the reference itself is readable, the
+ * index is built with fai_build(); a failed build returns 0.  When the
+ * reference is unreadable a message is printed and the path is still
+ * returned.  A NULL `fn_ref` returns 0.
+ */
+char *samfaipath(const char *fn_ref)
+{
+    char *fai_path;
+
+    if (fn_ref == 0) return 0;
+
+    fai_path = calloc(strlen(fn_ref) + 5, 1);
+    strcpy(fai_path, fn_ref);
+    strcat(fai_path, ".fai");
+
+    if (access(fai_path, R_OK) != -1) return fai_path; // index already readable
+
+    if (access(fn_ref, R_OK) == -1) {
+        fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
+        return fai_path;
+    }
+
+    if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
+    if (fai_build(fn_ref) == -1) {
+        fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
+        free(fai_path);
+        return 0;
+    }
+    return fai_path;
+}
diff --git a/samtools-0.1.19/sam.h b/samtools-0.1.19/sam.h
new file mode 100644
index 0000000..0495501
--- /dev/null
+++ b/samtools-0.1.19/sam.h
@@ -0,0 +1,99 @@
+#ifndef BAM_SAM_H
+#define BAM_SAM_H
+
+#include "bam.h"
+
+/*!
+ @header
+
+ This file provides higher level of I/O routines and unifies the APIs
+ for SAM and BAM formats. These APIs are more convenient and
+ recommended.
+
+ @copyright Genome Research Ltd.
+ */
+
+/*! @typedef
+ @abstract SAM/BAM file handler
+ @field type type of the handler; bit 1 for BAM, bit 2 for reading and bits 3-4 for the flag format used in SAM output
+ @field bam BAM file handler; valid if (type&1) == 1
+ @field tamr SAM file handler for reading; valid if (type&3) == 2
+ @field tamw SAM file handler for writing; valid if (type&3) == 0
+ @field header header struct
+ */
+typedef struct {
+ int type;
+ union {
+ tamFile tamr;
+ bamFile bam;
+ FILE *tamw;
+ } x;
+ bam_header_t *header;
+} samfile_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*!
+ @abstract Open a SAM/BAM file
+
+ @param fn SAM/BAM file name; "-" is recognized as stdin (for
+ reading) or stdout (for writing).
+
+ @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading,
+ 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output,
+ 'h' for outputing header in SAM, 'x' for HEX flag and 'X' for
+ string flag. If 'b' present, it must immediately follow 'r' or
+ 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX",
+ "rb", "wb" and "wbu" exclusively.
+
+ @param aux auxiliary data; if mode[0]=='w', aux points to
+ bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM
+ are absent, aux points the file name of the list of the reference;
+ aux is not used otherwise. If @SQ header lines are present in SAM,
+ aux is not used, either.
+
+ @return SAM/BAM file handler
+ */
+ samfile_t *samopen(const char *fn, const char *mode, const void *aux);
+
+ /*!
+ @abstract Close a SAM/BAM handler
+ @param fp file handler to be closed
+ */
+ void samclose(samfile_t *fp);
+
+ /*!
+ @abstract Read one alignment
+ @param fp file handler
+ @param b alignment
+ @return bytes read
+ */
+ int samread(samfile_t *fp, bam1_t *b);
+
+ /*!
+ @abstract Write one alignment
+ @param fp file handler
+ @param b alignment
+ @return bytes written
+ */
+ int samwrite(samfile_t *fp, const bam1_t *b);
+
+ /*!
+ @abstract Get the pileup for a whole alignment file
+ @param fp file handler
+ @param mask mask transferred to bam_plbuf_set_mask()
+ @param func user defined function called in the pileup process
+ @param data user provided data for func()
+ */
+ int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data);
+
+ char *samfaipath(const char *fn_ref);
+ int samthreads(samfile_t *fp, int n_threads, int n_sub_blks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/sam_header.c b/samtools-0.1.19/sam_header.c
new file mode 100644
index 0000000..88b6a1c
--- /dev/null
+++ b/samtools-0.1.19/sam_header.c
@@ -0,0 +1,810 @@
+#include "sam_header.h"
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, const char *)
+
+// Singly linked list node. A parsed header is a list of HeaderLine objects
+// (HeaderDict); each HeaderLine holds a list of HeaderTag objects.
+struct _HeaderList
+{
+ struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
+ struct _HeaderList *next; // next node, NULL at the end of the list
+ void *data; // payload; the list does not own or free it
+};
+typedef struct _HeaderList list_t;
+typedef list_t HeaderDict;
+
+// One tag of a header line: a two-character key (not NUL-terminated) and a
+// heap-allocated, NUL-terminated value string.
+typedef struct
+{
+ char key[2];
+ char *value;
+}
+HeaderTag;
+
+// One header line: the two-character record type that follows '@' (not
+// NUL-terminated) and the list of its HeaderTag objects.
+typedef struct
+{
+ char type[2];
+ list_t *tags;
+}
+HeaderLine;
+
+// NULL-terminated tag tables. The o_*, r_* and u_* arrays are collected, per
+// record type, in optional_tags, required_tags and unique_tags below; those
+// three arrays are indexed by the position of the record type in types[]
+// (as returned by tag_exists(type, types)). A NULL entry means the record
+// type has no tags of that class.
+const char *o_hd_tags[] = {"SO","GO",NULL};
+const char *r_hd_tags[] = {"VN",NULL};
+
+const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
+const char *r_sq_tags[] = {"SN","LN",NULL};
+const char *u_sq_tags[] = {"SN",NULL};
+
+const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
+const char *r_rg_tags[] = {"ID",NULL};
+const char *u_rg_tags[] = {"ID",NULL};
+
+const char *o_pg_tags[] = {"VN","CL",NULL};
+const char *r_pg_tags[] = {"ID",NULL};
+
+const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
+const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
+const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
+const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
+
+
+// printf-style diagnostic helper: formats the message and writes it to stderr.
+static void debug(const char *format, ...)
+{
+    va_list args;
+
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+}
+
+#if 0
+// Replaced by list_append_to_end
+// Disabled code: pushes a new node holding `data` in front of `root` and
+// returns the new node. It does not set the `last` member.
+static list_t *list_prepend(list_t *root, void *data)
+{
+ list_t *l = malloc(sizeof(list_t));
+ l->next = root;
+ l->data = data;
+ return l;
+}
+#endif
+
+// Append `data` in constant time using the tail pointer cached in root->last.
+// Only valid for lists built exclusively with this function: the other list_*
+// routines do not keep root->last up to date.
+// Returns the root of the list (the new node when `root` is NULL).
+static list_t *list_append_to_end(list_t *root, void *data)
+{
+    list_t *node = malloc(sizeof(list_t));
+
+    node->data = data;
+    node->next = NULL;
+    node->last = node;
+
+    if ( root )
+    {
+        root->last->next = node;
+        root->last = node;
+        return root;
+    }
+    return node;
+}
+
+// Append `data` by walking to the end of the list (linear time).
+// Does not touch the `last` member. Returns the root of the list, which is the
+// new node when `root` is NULL.
+static list_t *list_append(list_t *root, void *data)
+{
+    list_t *tail;
+    list_t *node = malloc(sizeof(list_t));
+
+    node->data = data;
+    node->next = NULL;
+
+    if ( !root )
+        return node;
+
+    for (tail = root; tail->next; tail = tail->next)
+        ;
+    tail->next = node;
+    return root;
+}
+
+// Free every node of the list. The `data` payloads are not freed.
+static void list_free(list_t *root)
+{
+    list_t *next;
+
+    for ( ; root; root = next)
+    {
+        next = root->next;
+        free(root);
+    }
+}
+
+
+
+// Find the two-character tag "XY" in a NULL-terminated array of tag strings.
+// Returns its index, or -1 when it is absent or `tags` is NULL.
+static int tag_exists(const char *tag, const char **tags)
+{
+    int idx;
+
+    if ( tags )
+    {
+        for (idx=0; tags[idx]; idx++)
+        {
+            const char *cand = tags[idx];
+            if ( cand[0]==tag[0] && cand[1]==tag[1] ) return idx;
+        }
+    }
+    return -1;
+}
+
+
+
+// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
+// or NULL if everything has been read (or memory ran out). The lineptr should be freed by
+// the caller. The line terminator ("\n", "\r\n" or a lone "\r") is stripped.
+//
+//  lineptr .. in/out: buffer holding the current line, (re)allocated as needed
+//  n       .. in/out: capacity of *lineptr in bytes
+//  text    .. where to start reading; must be NUL-terminated
+static const char *nextline(char **lineptr, size_t *n, const char *text)
+{
+    size_t len;
+    const char *to = text;
+
+    if ( !*to ) return NULL;
+
+    while ( *to && *to!='\n' && *to!='\r' ) to++;
+    len = to - text + 1;    // characters on the line plus the terminating NUL
+
+    // Advance the pointer for the next call. A lone '\r' is consumed as well,
+    // otherwise the next call would start on it and return an empty line forever.
+    if ( *to=='\r' ) to++;
+    if ( *to=='\n' ) to++;
+
+    if ( !*lineptr || *n<len )
+    {
+        // realloc(NULL,..) acts as malloc; keep the old buffer on failure so the caller can free it
+        char *grown = realloc(*lineptr, len);
+        if ( !grown ) {
+            debug("[nextline] Insufficient memory!\n");
+            return 0;
+        }
+        *lineptr = grown;
+        *n = len;
+    }
+
+    memcpy(*lineptr,text,len-1);
+    (*lineptr)[len-1] = 0;
+
+    return to;
+}
+
+// name points to "XY", value_from points to the first character of the value string and
+// value_to points to the last character of the value string. When value_to lies before
+// value_from the value is empty. Returns a newly allocated tag; the caller owns it and
+// its value string.
+static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
+{
+    HeaderTag *tag = malloc(sizeof(HeaderTag));
+    int len = value_to-value_from+1;
+
+    // An inverted range would otherwise give a negative length and a write to value[-1]
+    if ( len<0 ) len = 0;
+
+    tag->key[0] = name[0];
+    tag->key[1] = name[1];
+    tag->value = malloc(len+1);
+    memcpy(tag->value,value_from,len);  // copy the value only, never the byte after it
+    tag->value[len] = 0;
+    return tag;
+}
+
+// Return the first tag of `hline` whose two-character key equals `key`, or NULL.
+static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
+{
+    list_t *node;
+
+    for (node = hline->tags; node; node = node->next)
+    {
+        HeaderTag *cand = node->data;
+        if ( cand->key[0]!=key[0] ) continue;
+        if ( cand->key[1]!=key[1] ) continue;
+        return cand;
+    }
+    return NULL;
+}
+
+
+// Return codes:
+// -1 .. the record type is not one of the known types
+// 0 .. different types or unique tags differ or conflicting tags, cannot be merged
+// 1 .. all tags identical -> no need to merge, drop one
+// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
+// 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
+static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
+{
+    HeaderTag *t1, *t2;
+    int itype, itag, missing=0;
+
+    if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
+        return 0;
+
+    itype = tag_exists(hline1->type,types);
+    if ( itype==-1 ) {
+        debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
+        return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
+    }
+
+    if ( unique_tags[itype] )
+    {
+        t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
+        t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
+        if ( !t1 || !t2 ) // this should never happen, the unique tags are required
+            return 2;
+
+        if ( strcmp(t1->value,t2->value) )
+            return 0; // the unique tags differ, cannot be merged
+    }
+    if ( !required_tags[itype] && !optional_tags[itype] )
+    {
+        // Free-text line: its single tag holds the whole text. A line with empty
+        // text has no tag at all, so guard before dereferencing.
+        if ( !hline1->tags || !hline2->tags )
+            return ( !hline1->tags && !hline2->tags ) ? 1 : 0;
+        t1 = hline1->tags->data;
+        t2 = hline2->tags->data;
+        if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
+        return 0;
+    }
+
+    for (itag=0; required_tags[itype] && required_tags[itype][itag]; itag++)
+    {
+        t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
+        t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
+        if ( !t1 && !t2 )
+            return 2; // this should never happen
+        if ( !t1 || !t2 )
+            missing = 1; // there is some tag missing in one of the hlines
+        else if ( strcmp(t1->value,t2->value) )
+        {
+            // with a matching unique tag a conflict is an error, otherwise the lines just differ
+            return unique_tags[itype] ? 2 : 0;
+        }
+    }
+    for (itag=0; optional_tags[itype] && optional_tags[itype][itag]; itag++)
+    {
+        t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
+        t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
+        if ( !t1 && !t2 )
+            continue; // optional tag absent from both lines
+        if ( !t1 || !t2 )
+            missing = 1; // there is some tag missing in one of the hlines
+        else if ( strcmp(t1->value,t2->value) )
+        {
+            return unique_tags[itype] ? 2 : 0;
+        }
+    }
+    if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged
+    return 1;
+}
+
+
+// Deep-copy a header line: the type and every tag (key and value string) are
+// duplicated in their original order. The caller owns the result.
+static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
+{
+    const list_t *node;
+    HeaderLine *out = malloc(sizeof(HeaderLine));
+
+    out->type[0] = hline->type[0];
+    out->type[1] = hline->type[1];
+    out->tags = NULL;
+
+    for (node = hline->tags; node; node = node->next)
+    {
+        const HeaderTag *src = node->data;
+        HeaderTag *copy = malloc(sizeof(HeaderTag));
+
+        copy->key[0] = src->key[0];
+        copy->key[1] = src->key[1];
+        copy->value = strdup(src->value);
+        out->tags = list_append(out->tags, copy);
+    }
+    return out;
+}
+
+// Copy into `out_hline` every tag of `tmpl_hline` that it does not have yet.
+// Tags already present in `out_hline` keep their value.
+// Returns 0 when the two lines are of different types (nothing is changed), 1 otherwise.
+static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
+{
+    const list_t *node;
+
+    if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
+        return 0;
+
+    for (node = tmpl_hline->tags; node; node = node->next)
+    {
+        HeaderTag *src = node->data;
+        HeaderTag *copy;
+
+        if ( header_line_has_tag(out_hline, src->key) )
+            continue;
+
+        copy = malloc(sizeof(HeaderTag));
+        copy->key[0] = src->key[0];
+        copy->key[1] = src->key[1];
+        copy->value = strdup(src->value);
+        out_hline->tags = list_append(out_hline->tags,copy);
+    }
+    return 1;
+}
+
+
+static void sam_header_line_free(HeaderLine *hline);    // defined further down in this file
+
+// Parse one header line of the form "@XY<TAB>KK:value<TAB>KK:value..." into a
+// newly allocated HeaderLine. For record types that have neither required nor
+// optional tags (CO) everything after the first tab, tabs included, is stored as
+// one tag with the placeholder key of two spaces. Record types not listed in
+// types[] are parsed like ordinary tagged lines.
+// Returns NULL, after printing a diagnostic, when the line is malformed; nothing
+// is leaked in that case.
+static HeaderLine *sam_header_line_parse(const char *headerLine)
+{
+    HeaderLine *hline;
+    HeaderTag *tag;
+    const char *from, *to;
+    int itype, free_text;
+    from = headerLine;
+
+    if ( *from != '@' ) {
+        debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+        return 0;
+    }
+    to = ++from;
+
+    while (*to && *to!='\t') to++;
+    if ( to-from != 2 ) {
+        debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
+        return 0;
+    }
+
+    hline = malloc(sizeof(HeaderLine));
+    hline->type[0] = from[0];
+    hline->type[1] = from[1];
+    hline->tags = NULL;
+
+    // itype is -1 for an unknown record type: never index the tag tables with it
+    itype = tag_exists(hline->type, types);
+    free_text = itype>=0 && !required_tags[itype] && !optional_tags[itype];
+
+    from = to;
+    while (*to && *to=='\t') to++;
+    if ( to-from != 1 ) {
+        debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+        free(hline);
+        return 0;
+    }
+    from = to;
+    while (*from)
+    {
+        while (*to && *to!='\t') to++;
+
+        if ( free_text )
+        {
+            // CO is a special case, it can contain anything, including tabs
+            if ( *to ) { to++; continue; }
+            // two spaces: the placeholder key the writers test for
+            tag = new_tag("  ",from,to-1);
+        }
+        else
+        {
+            // a tag needs at least its two-character key and a separator
+            if ( to-from < 3 ) {
+                debug("[sam_header_line_parse] malformed tag on line [%s]\n", headerLine);
+                sam_header_line_free(hline);
+                return 0;
+            }
+            tag = new_tag(from,from+3,to-1);
+        }
+
+        if ( header_line_has_tag(hline,tag->key) )
+            debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
+        hline->tags = list_append(hline->tags, tag);
+
+        from = to;
+        while (*to && *to=='\t') to++;
+        if ( *to && to-from != 1 ) {
+            debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+            sam_header_line_free(hline);
+            return 0;
+        }
+
+        from = to;
+    }
+    return hline;
+}
+
+
+// Must be of an existing type and all required tags must be present.
+// Tags that are neither required nor optional for the type are reported with a
+// diagnostic unless their key contains a lower-case letter (user-defined tags);
+// they do not make the line invalid, for forward compatibility with new attributes.
+// Returns 1 when the line is valid, 0 otherwise.
+static int sam_header_line_validate(HeaderLine *hline)
+{
+    list_t *tags;
+    HeaderTag *tag;
+    int itype, itag;
+
+    // Is the type correct?
+    itype = tag_exists(hline->type, types);
+    if ( itype==-1 )
+    {
+        debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
+        return 0;
+    }
+
+    // Has all required tags?
+    for (itag=0; required_tags[itype] && required_tags[itype][itag]; itag++)
+    {
+        if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
+        {
+            debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
+                hline->type[0],hline->type[1]);
+            return 0;
+        }
+    }
+
+    // Types without any tag table carry free text under a placeholder key: nothing to recognise
+    if ( !required_tags[itype] && !optional_tags[itype] )
+        return 1;
+
+    // Are all tags recognised? tag_exists returns -1 (not 0) when the tag is absent.
+    for (tags=hline->tags; tags; tags=tags->next)
+    {
+        tag = tags->data;
+        if ( tag_exists(tag->key,required_tags[itype])!=-1 || tag_exists(tag->key,optional_tags[itype])!=-1 )
+            continue;
+
+        // Lower case tags are user-defined values.
+        if ( islower((unsigned char)tag->key[0]) || islower((unsigned char)tag->key[1]) )
+            continue;
+
+        // Neither is lower case, but tag was not recognized.
+        debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
+    }
+
+    return 1;
+}
+
+
+// Write one header line to `fp`: "@XY", then for every tag a tab followed by
+// "KK:value" (or just the value when the key is the two-space placeholder),
+// then a newline.
+static void print_header_line(FILE *fp, HeaderLine *hline)
+{
+    const list_t *node;
+
+    fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
+    for (node = hline->tags; node; node = node->next)
+    {
+        const HeaderTag *tag = node->data;
+        int has_key = tag->key[0]!=' ' || tag->key[1]!=' ';
+
+        fputc('\t', fp);
+        if ( has_key )
+            fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
+        fputs(tag->value, fp);
+    }
+    fputc('\n', fp);
+}
+
+
+// Release a header line: every tag and its value, the list nodes, then the line itself.
+static void sam_header_line_free(HeaderLine *hline)
+{
+    list_t *node;
+
+    for (node = hline->tags; node; node = node->next)
+    {
+        HeaderTag *tag = node->data;
+        free(tag->value);
+        free(tag);
+    }
+    list_free(hline->tags);
+    free(hline);
+}
+
+// Release a whole parsed header: every line it holds, then the list nodes.
+// A NULL header is accepted.
+void sam_header_free(void *_header)
+{
+    HeaderDict *header = (HeaderDict*)_header;
+    list_t *node;
+
+    for (node = header; node; node = node->next)
+        sam_header_line_free(node->data);
+    list_free(header);
+}
+
+// Deep-copy a parsed header, line by line and in order.
+// Returns NULL for an empty (NULL) header.
+HeaderDict *sam_header_clone(const HeaderDict *dict)
+{
+    HeaderDict *copy = NULL;
+    const list_t *node;
+
+    for (node = dict; node; node = node->next)
+    {
+        HeaderLine *src_line = node->data;
+        copy = list_append(copy, sam_header_line_clone(src_line));
+    }
+    return copy;
+}
+
+// Serialise a parsed header back to text, one "@XY\tKK:value..." line per header
+// line, each terminated by '\n'.
+// Returns a newly allocated NUL-terminated string; the caller frees it.
+char *sam_header_write(const void *_header)
+{
+    const HeaderDict *header = (const HeaderDict*)_header;
+    char *out = NULL;
+    int len=0, nout=0;
+    const list_t *hlines;
+    const list_t *tags;
+
+    // Calculate the exact length of the string to allocate
+    for (hlines = header; hlines; hlines = hlines->next)
+    {
+        const HeaderLine *hline = hlines->data;
+
+        len += 4;   // @XY and \n
+        for (tags = hline->tags; tags; tags = tags->next)
+        {
+            const HeaderTag *tag = tags->data;
+            len += strlen(tag->value) + 1;  // \t
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                len += 3;   // XY: (the value itself is already counted above)
+        }
+    }
+
+    out = malloc(len+1);
+    for (hlines = header; hlines; hlines = hlines->next)
+    {
+        const HeaderLine *hline = hlines->data;
+
+        nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
+        for (tags = hline->tags; tags; tags = tags->next)
+        {
+            const HeaderTag *tag = tags->data;
+            nout += sprintf(out+nout,"\t");
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
+            nout += sprintf(out+nout,"%s", tag->value);
+        }
+        nout += sprintf(out+nout,"\n");
+    }
+    out[nout] = 0;  // also terminates the result for an empty header
+    return out;
+}
+
+// Parse a complete header text into a list of header lines.
+// Returns the list (NULL for NULL or empty text). If any line fails to parse,
+// everything parsed so far is released and NULL is returned.
+void *sam_header_parse2(const char *headerText)
+{
+    list_t *parsed = NULL;
+    const char *cursor = headerText;
+    char *linebuf = NULL;
+    size_t linecap = 0;
+    int tovalidate = 0;
+
+    if ( !headerText )
+        return 0;
+
+    for (;;)
+    {
+        HeaderLine *hline;
+
+        cursor = nextline(&linebuf, &linecap, cursor);
+        if ( !cursor ) break;
+
+        hline = sam_header_line_parse(linebuf);
+        if ( !hline || (tovalidate && !sam_header_line_validate(hline)) )
+        {
+            if (hline) sam_header_line_free(hline);
+            sam_header_free(parsed);
+            if ( linebuf ) free(linebuf);
+            return NULL;
+        }
+        // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
+        parsed = list_append_to_end(parsed, hline);
+    }
+    if ( linebuf ) free(linebuf);
+
+    return parsed;
+}
+
+// Builds a string hash table from the header lines of the given type: for
+// each such line carrying both key_tag and value_tag, the key tag's value is
+// mapped to the value tag's value. The stored strings alias the header, they
+// are not copied. An empty (never NULL) table is returned for a NULL header.
+// A repeated key is reported through debug() and then overwritten.
+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
+{
+    const HeaderDict *dict = (const HeaderDict*)_dict;
+    const list_t *node;
+    khash_t(str) *tbl = kh_init(str);
+    khiter_t it;
+    int ret;
+
+    if (_dict == 0) return tbl; // return an empty (not null) hash table
+    for (node = dict; node; node = node->next)
+    {
+        HeaderLine *hline = node->data;
+        HeaderTag *key, *value;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        if ( !key || !value )
+            continue;
+
+        it = kh_get(str, tbl, key->value);
+        if ( it != kh_end(tbl) )
+            debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
+        it = kh_put(str, tbl, key->value, &ret);
+        kh_value(tbl, it) = value->value;
+    }
+    return tbl;
+}
+
+// Collects the value of tag key_tag from every header line of type `type`.
+//
+// Returns a malloc'ed array holding *_n pointers. The pointers alias strings
+// owned by the header, so the caller frees only the array itself. Returns
+// NULL with *_n == 0 when no line matches or when memory runs out.
+char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
+{
+    const HeaderDict *dict = (const HeaderDict*)_dict;
+    const list_t *l;
+    int max = 0, n = 0;
+    char **ret = 0;
+
+    *_n = 0;
+    for (l = dict; l; l = l->next)
+    {
+        HeaderLine *hline = l->data;
+        HeaderTag *key;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        if ( !key )
+            continue;
+
+        if (n == max) {
+            // Grow through a temporary so the old array is neither lost nor
+            // dereferenced as NULL when realloc fails.
+            char **grown;
+            max = max? max<<1 : 4;
+            grown = realloc(ret, max * sizeof(char*));
+            if ( !grown ) {
+                free(ret);
+                return NULL;
+            }
+            ret = grown;
+        }
+        ret[n++] = key->value;
+    }
+    *_n = n;
+    return ret;
+}
+
+// Iterator-style lookup: starting at `iter`, finds the next header line of
+// type `type` that carries BOTH key_tag and value_tag, stores pointers to the
+// two tag values in *_key and *_value (they alias the header) and returns the
+// list node following the match, to be passed back in as `iter`.
+//
+// Returns NULL when `iter` is NULL or no further line matches; *_key and
+// *_value are left untouched in that case.
+// NOTE(review): when the match is the last node, the return value is NULL as
+// well even though *_key/*_value were filled in, so a caller looping on the
+// return value alone will not see that last pair.
+void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)
+{
+    list_t *l;
+
+    for (l = iter; l; l = l->next)
+    {
+        HeaderLine *hline = l->data;
+        HeaderTag *key, *value;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        // Both tags are dereferenced below, so a line missing either one has
+        // to be skipped (the test used to be "&&", which let a line with only
+        // one of the two tags through to a NULL dereference).
+        if ( !key || !value )
+            continue;
+
+        *_key = key->value;
+        *_value = value->value;
+        return l->next;
+    }
+    return NULL;
+}
+
+// Looks `key` up in a table built by sam_header2tbl(); returns the mapped
+// string, or 0 when the key is absent.
+const char *sam_tbl_get(void *h, const char *key)
+{
+    khash_t(str) *tbl = (khash_t(str)*)h;
+    khint_t slot = kh_get(str, tbl, key);
+
+    if ( slot == kh_end(tbl) )
+        return 0;
+    return kh_val(tbl, slot);
+}
+
+// Number of entries in the table; a NULL table counts as empty.
+int sam_tbl_size(void *h)
+{
+    khash_t(str) *tbl = (khash_t(str)*)h;
+
+    if ( !h )
+        return 0;
+    return kh_size(tbl);
+}
+
+// Releases the hash table itself. The keys and values are not freed here;
+// sam_header2tbl() stores pointers into the header rather than copies.
+void sam_tbl_destroy(void *h)
+{
+    khash_t(str) *table = (khash_t(str)*)h;
+
+    kh_destroy(str, table);
+}
+
+// Merges n (>= 2) parsed headers into one newly allocated header.
+//
+// The result starts as a clone of dicts[0]. Each line of every further
+// header is compared against the lines collected so far with
+// sam_header_compare_lines():
+//   0     - unrelated to that output line, keep looking;
+//   2     - conflict: both lines are printed to stderr and the merge fails;
+//   3     - the output line is extended with sam_header_line_merge_with();
+//   other - the line is treated as already present.
+// A line unrelated to every output line is cloned and appended.
+//
+// Returns the merged header, or NULL when n < 2 or on a conflict. The
+// partially built result is released before a conflict is reported.
+void *sam_header_merge(int n, const void **_dicts)
+{
+    const HeaderDict **dicts = (const HeaderDict**)_dicts;
+    HeaderDict *out_dict;
+    int idict, status;
+
+    if ( n<2 ) return NULL;
+
+    out_dict = sam_header_clone(dicts[0]);
+
+    for (idict=1; idict<n; idict++)
+    {
+        const list_t *tmpl_hlines;
+
+        for (tmpl_hlines = dicts[idict]; tmpl_hlines; tmpl_hlines = tmpl_hlines->next)
+        {
+            list_t *out_hlines;
+            int inserted = 0;
+
+            for (out_hlines = out_dict; out_hlines; out_hlines = out_hlines->next)
+            {
+                status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
+                if ( status==0 )
+                    continue;
+
+                if ( status==2 )
+                {
+                    print_header_line(stderr,tmpl_hlines->data);
+                    print_header_line(stderr,out_hlines->data);
+                    debug("Conflicting lines, cannot merge the headers.\n");
+                    // Do not leak the clone built so far.
+                    sam_header_free(out_dict);
+                    return 0;
+                }
+                if ( status==3 )
+                    sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
+
+                inserted = 1;
+                break;
+            }
+            if ( !inserted )
+                out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
+        }
+    }
+
+    return out_dict;
+}
+
+// Builds a flat row-major table with one row per header line of type `type`
+// and one column per entry of the NULL-terminated `tags` array. A cell holds
+// a pointer to the tag's value inside the header, or NULL when the line has
+// no such tag. *n receives the number of rows. Returns NULL (and *n == 0)
+// for a NULL header or when no line matches.
+char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n)
+{
+    char **table = NULL;
+    int nrows = 0, ncols = 0, col;
+    list_t *node = (list_t *)dict;
+
+    *n = 0;
+    if ( !node ) return NULL;
+
+    while ( tags[ncols] ) ncols++;
+
+    for (; node; node = node->next)
+    {
+        HeaderLine *hline = node->data;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        table = (char**) realloc(table, sizeof(char*)*(nrows+1)*ncols);
+        for (col=0; col<ncols; col++)
+        {
+            HeaderTag *tag = header_line_has_tag(hline, tags[col]);
+            table[nrows*ncols+col] = tag ? tag->value : NULL;
+        }
+        nrows++;
+    }
+    *n = nrows;
+    return table;
+}
+
diff --git a/samtools-0.1.19/sam_header.h b/samtools-0.1.19/sam_header.h
new file mode 100644
index 0000000..4b0cb03
--- /dev/null
+++ b/samtools-0.1.19/sam_header.h
@@ -0,0 +1,48 @@
+#ifndef __SAM_HEADER_H__
+#define __SAM_HEADER_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // Parse header text into an opaque header object; returns NULL for NULL
+ // input or when a line cannot be parsed.
+ void *sam_header_parse2(const char *headerText);
+ // Merge n (at least 2) parsed headers into a new one; returns NULL when
+ // n < 2 or when two lines conflict.
+ void *sam_header_merge(int n, const void **dicts);
+ void sam_header_free(void *header);
+ char *sam_header_write(const void *headerDict); // returns a newly allocated string
+
+ /*
+ // Usage example
+ const char *key, *val;
+ void *iter = sam_header_parse2(bam->header->text);
+ while ( iter = sam_header2key_val(iter, "RG","ID","SM", &key,&val) ) printf("%s\t%s\n", key,val);
+ */
+ void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value);
+ // Values of key_tag from every line of the given type. The returned array
+ // is allocated with realloc and its elements point into the header.
+ char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n);
+
+ /*
+ // Usage example
+ int i, j, n;
+ const char *tags[] = {"SN","LN","UR","M5",NULL};
+ void *dict = sam_header_parse2(bam->header->text);
+ char **tbl = sam_header2tbl_n(dict, "SQ", tags, &n);
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<4; j++)
+ if ( tbl[4*i+j] ) printf("\t%s", tbl[4*i+j]);
+ else printf("-");
+ printf("\n");
+ }
+ if (tbl) free(tbl);
+ */
+ char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n);
+
+ // String hash table mapping key_tag values to value_tag values for lines
+ // of the given type; query it with the sam_tbl_* functions below.
+ void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]);
+ const char *sam_tbl_get(void *h, const char *key); // 0 when the key is absent
+ int sam_tbl_size(void *h); // 0 for a NULL table
+ void sam_tbl_destroy(void *h);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/samtools-0.1.19/sam_view.c b/samtools-0.1.19/sam_view.c
new file mode 100644
index 0000000..7f3fdab
--- /dev/null
+++ b/samtools-0.1.19/sam_view.c
@@ -0,0 +1,441 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <math.h>
+#include <inttypes.h>
+#include "sam_header.h"
+#include "sam.h"
+#include "faidx.h"
+#include "kstring.h"
+#include "khash.h"
+KHASH_SET_INIT_STR(rg)
+
+// When counting records instead of printing them,
+// data passed to the bam_fetch callback is encapsulated in this struct.
+typedef struct {
+ bam_header_t *header;
+ int64_t *count; // int does overflow for very big BAMs
+} count_func_data_t;
+
+// Set of read-group names (hash set of strings).
+typedef khash_t(rg) *rghash_t;
+
+// FIXME: we'd better use no global variables...
+// Filter settings, filled in by main_samview() from the command line and read
+// by process_aln():
+// g_rghash: read groups loaded from the -R file (0 when -R is not given)
+static rghash_t g_rghash = 0;
+// -q, -f, -F, -Q and -m respectively
+static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0, g_qual_scale = 0, g_min_qlen = 0;
+// -s: seed derived from the integer part, fraction from the remainder;
+// a fraction <= 0 disables subsampling
+static uint32_t g_subsam_seed = 0;
+static double g_subsam_frac = -1.;
+// -l and -r; both are strdup'ed copies released at the end of main_samview()
+static char *g_library, *g_rg;
+// -L: handle returned by bed_read()
+static void *g_bed;
+
+// BED region helpers, defined outside this file.
+void *bed_read(const char *fn);
+void bed_destroy(void *_h);
+int bed_overlap(const void *_h, const char *chr, int beg, int end);
+
+// Applies the command-line filters to one alignment.
+// Returns 0 when the record is to be kept and 1 when it is to be skipped.
+// Side effect: with a quality scale above 1 the base qualities of `b` are
+// multiplied in place and capped at 93, before any filter is evaluated.
+static int process_aln(const bam_header_t *h, bam1_t *b)
+{
+    if (g_qual_scale > 1) {
+        uint8_t *bq = bam1_qual(b);
+        int pos;
+        for (pos = 0; pos < b->core.l_qseq; ++pos) {
+            int scaled = bq[pos] * g_qual_scale;
+            if (scaled < 93) bq[pos] = scaled;
+            else bq[pos] = 93;
+        }
+    }
+    if (g_min_qlen > 0) {
+        // Query length as implied by the CIGAR, hard clips included.
+        uint32_t *cig = bam1_cigar(b);
+        int op_i, total = 0;
+        for (op_i = 0; op_i < b->core.n_cigar; ++op_i) {
+            if ((bam_cigar_type(bam_cigar_op(cig[op_i]))&1) || bam_cigar_op(cig[op_i]) == BAM_CHARD_CLIP)
+                total += bam_cigar_oplen(cig[op_i]);
+        }
+        if (total < g_min_qlen) return 1;
+    }
+    if (b->core.qual < g_min_mapQ) return 1;
+    if ((b->core.flag & g_flag_on) != g_flag_on) return 1;
+    if (b->core.flag & g_flag_off) return 1;
+    if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))))
+        return 1;
+    if (g_subsam_frac > 0.) {
+        // Hashing the read name keeps both mates of a pair together.
+        uint32_t hashed = __ac_X31_hash_string(bam1_qname(b)) + g_subsam_seed;
+        if ((double)(hashed&0xffffff) / 0x1000000 >= g_subsam_frac) return 1;
+    }
+    if (g_rg || g_rghash) {
+        uint8_t *rg_aux = bam_aux_get(b, "RG");
+        // A record without an RG tag falls through to the library test.
+        if (rg_aux) {
+            const char *rg_name = (char*)(rg_aux + 1);
+            if (g_rg) return strcmp(g_rg, rg_name) != 0;
+            if (g_rghash) {
+                khint_t it = kh_get(rg, g_rghash, rg_name);
+                return it == kh_end(g_rghash);
+            }
+        }
+    }
+    if (g_library) {
+        const char *lib = bam_get_library((bam_header_t*)h, b);
+        if (lib && strcmp(lib, g_library) == 0) return 0;
+        return 1;
+    }
+    return 0;
+}
+
+// Returns a newly allocated copy of the header text `hdtxt` from which every
+// "@RG" line whose ID is not in the set `h` has been dropped; all other lines
+// are copied unchanged. "@RG" lines without an ID field are dropped as well.
+// Copying stops at the first line shorter than three characters.
+// *len receives the length of the result. The result is NULL when no line
+// was kept. `hdtxt` is patched temporarily while an ID is looked up and is
+// restored before returning.
+static char *drop_rg(char *hdtxt, rghash_t h, int *len)
+{
+    char *p = hdtxt, *q, *r, *s;
+    kstring_t str;
+    memset(&str, 0, sizeof(kstring_t));
+    while (1) {
+        int toprint = 0;
+        q = strchr(p, '\n');
+        if (q == 0) q = p + strlen(p);
+        if (q - p < 3) break; // the line is too short; then stop
+        if (strncmp(p, "@RG\t", 4) == 0) {
+            int c;
+            khint_t k;
+            r = strstr(p, "\tID:");
+            // Only an ID field on THIS line counts; strstr would otherwise
+            // happily match the ID of a later @RG line.
+            if (r != 0 && r < q) {
+                r += 4;
+                for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s);
+                c = *s; *s = '\0';
+                k = kh_get(rg, h, r);
+                *s = c;
+                if (k != kh_end(h)) toprint = 1;
+            }
+        } else toprint = 1;
+        if (toprint) {
+            kputsn(p, q - p, &str); kputc('\n', &str);
+        }
+        // The last line may lack a trailing newline; q then points at the
+        // terminating NUL and stepping past it would read out of bounds.
+        if (*q == '\0') break;
+        p = q + 1;
+    }
+    *len = str.l;
+    return str.s;
+}
+
+// bam_fetch() callback for printing: `data` is the output samfile_t, and a
+// record is written to it unless process_aln() asks for it to be skipped.
+static int view_func(const bam1_t *b, void *data)
+{
+    samfile_t *dest = (samfile_t*)data;
+
+    if (process_aln(dest->header, (bam1_t*)b) == 0)
+        samwrite(dest, b);
+    return 0;
+}
+
+// bam_fetch() callback for counting: `data` is a count_func_data_t, and its
+// counter is bumped for every record process_aln() does not skip.
+static int count_func(const bam1_t *b, void *data)
+{
+    count_func_data_t *ctx = (count_func_data_t*)data;
+
+    if (process_aln(ctx->header, (bam1_t*)b) == 0)
+        ++*ctx->count;
+    return 0;
+}
+
+static int usage(int is_long_help);
+
+// Entry point of "samtools view": parses the options, opens the input and
+// (unless -c is given) the output, then either streams the whole file or
+// fetches the regions named on the command line from an indexed BAM. Every
+// record goes through process_aln(); with -c the surviving records are
+// counted and the count is printed instead of the records.
+// Returns 0 on success and 1 on any failure or when usage is printed.
+int main_samview(int argc, char *argv[])
+{
+    int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0;
+    int of_type = BAM_OFDEC, is_long_help = 0, n_threads = 0;
+    int64_t count = 0;
+    samfile_t *in = 0, *out = 0;
+    char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0, *q;
+
+    /* parse command-line options */
+    strcpy(in_mode, "r"); strcpy(out_mode, "w");
+    while ((c = getopt(argc, argv, "SbBct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:Q:@:m:")) >= 0) {
+        switch (c) {
+        case 's':
+            // "SEED.FRAC": the integer part seeds the generator, the rest
+            // (parsed from where strtol stopped) is the fraction to keep.
+            if ((g_subsam_seed = strtol(optarg, &q, 10)) != 0) {
+                srand(g_subsam_seed);
+                g_subsam_seed = rand();
+            }
+            g_subsam_frac = strtod(q, &q);
+            break;
+        case 'm': g_min_qlen = atoi(optarg); break;
+        case 'c': is_count = 1; break;
+        case 'S': is_bamin = 0; break;
+        case 'b': is_bamout = 1; break;
+        case 't': fn_list = strdup(optarg); is_bamin = 0; break;
+        case 'h': is_header = 1; break;
+        case 'H': is_header_only = 1; break;
+        case 'o': fn_out = strdup(optarg); break;
+        case 'f': g_flag_on = strtol(optarg, 0, 0); break;
+        case 'F': g_flag_off = strtol(optarg, 0, 0); break;
+        case 'q': g_min_mapQ = atoi(optarg); break;
+        case 'u': compress_level = 0; break;
+        case '1': compress_level = 1; break;
+        case 'l': g_library = strdup(optarg); break;
+        case 'L': g_bed = bed_read(optarg); break;
+        case 'r': g_rg = strdup(optarg); break;
+        case 'R': fn_rg = strdup(optarg); break;
+        case 'x': of_type = BAM_OFHEX; break;
+        case 'X': of_type = BAM_OFSTR; break;
+        case '?': is_long_help = 1; break;
+        case 'T': fn_ref = strdup(optarg); is_bamin = 0; break;
+        case 'B': bam_no_B = 1; break;
+        case 'Q': g_qual_scale = atoi(optarg); break;
+        case '@': n_threads = strtol(optarg, 0, 0); break;
+        default: return usage(is_long_help);
+        }
+    }
+    if (compress_level >= 0) is_bamout = 1;
+    if (is_header_only) is_header = 1;
+    if (is_bamout) strcat(out_mode, "b");
+    else {
+        if (of_type == BAM_OFHEX) strcat(out_mode, "x");
+        else if (of_type == BAM_OFSTR) strcat(out_mode, "X");
+    }
+    if (is_bamin) strcat(in_mode, "b");
+    if (is_header) strcat(out_mode, "h");
+    if (compress_level >= 0) {
+        char tmp[2];
+        tmp[0] = compress_level + '0'; tmp[1] = '\0';
+        strcat(out_mode, tmp);
+    }
+    if (argc == optind) return usage(is_long_help); // potential memory leak...
+
+    // read the list of read groups
+    if (fn_rg) {
+        FILE *fp_rg;
+        char buf[1024];
+        int absent;
+        g_rghash = kh_init(rg);
+        fp_rg = fopen(fn_rg, "r");
+        if (fp_rg == 0) {
+            fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", fn_rg);
+            ret = 1;
+            goto view_end;
+        }
+        // The field width keeps an over-long token from overrunning buf.
+        while (fscanf(fp_rg, "%1023s", buf) == 1) {
+            char *name = strdup(buf);
+            kh_put(rg, g_rghash, name, &absent);
+            if (absent == 0) free(name); // already in the set: drop the copy
+        }
+        fclose(fp_rg);
+    }
+
+    // generate the fn_list if necessary
+    if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
+    // open file handlers
+    if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+        fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", argv[optind]);
+        ret = 1;
+        goto view_end;
+    }
+    if (in->header == 0) {
+        fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]);
+        ret = 1;
+        goto view_end;
+    }
+    if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for...
+        char *tmp;
+        int l;
+        tmp = drop_rg(in->header->text, g_rghash, &l);
+        free(in->header->text);
+        in->header->text = tmp;
+        in->header->l_text = l;
+    }
+    if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
+        fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
+        ret = 1;
+        goto view_end;
+    }
+    // In count mode no output is opened, so there is nothing to thread.
+    if (out && n_threads > 1) samthreads(out, n_threads, 256);
+    if (is_header_only) goto view_end; // no need to print alignments
+
+    if (argc == optind + 1) { // convert/print the entire file
+        bam1_t *b = bam_init1();
+        int r;
+        while ((r = samread(in, b)) >= 0) { // read one alignment from `in'
+            if (!process_aln(in->header, b)) {
+                if (!is_count) samwrite(out, b); // write the alignment to `out'
+                count++;
+            }
+        }
+        if (r < -1) {
+            fprintf(stderr, "[main_samview] truncated file.\n");
+            ret = 1;
+        }
+        bam_destroy1(b);
+    } else { // retrieve alignments in specified regions
+        int i;
+        bam_index_t *idx = 0;
+        if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index
+        if (idx == 0) { // index is unavailable
+            fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n");
+            ret = 1;
+            goto view_end;
+        }
+        for (i = optind + 1; i < argc; ++i) {
+            int tid, beg, end, result;
+            bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200'
+            if (tid < 0) { // reference name is not found
+                fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
+                continue;
+            }
+            // fetch alignments
+            if (is_count) {
+                count_func_data_t count_data = { in->header, &count };
+                result = bam_fetch(in->x.bam, idx, tid, beg, end, &count_data, count_func);
+            } else
+                result = bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func);
+            if (result < 0) {
+                fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]);
+                ret = 1;
+                break;
+            }
+        }
+        bam_index_destroy(idx); // destroy the BAM index
+    }
+
+view_end:
+    if (is_count && ret == 0)
+        printf("%" PRId64 "\n", count);
+
+    // close files, free and return
+    free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);
+    if (g_bed) bed_destroy(g_bed);
+    if (g_rghash) {
+        khint_t k;
+        for (k = 0; k < kh_end(g_rghash); ++k)
+            if (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k));
+        kh_destroy(rg, g_rghash);
+    }
+    samclose(in);
+    if (!is_count)
+        samclose(out);
+    return ret;
+}
+
+// Prints the option summary of "samtools view" to stderr, followed by the
+// numbered notes when is_long_help is non-zero. Always returns 1, so callers
+// can write "return usage(...)" to exit with a failure status.
+static int usage(int is_long_help)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]\n\n");
+ fprintf(stderr, "Options: -b output BAM\n");
+ fprintf(stderr, " -h print header for the SAM output\n");
+ fprintf(stderr, " -H print header only (no alignments)\n");
+ fprintf(stderr, " -S input is SAM\n");
+ fprintf(stderr, " -u uncompressed BAM output (force -b)\n");
+ fprintf(stderr, " -1 fast compression (force -b)\n");
+ fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n");
+ fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n");
+ fprintf(stderr, " -c print only the count of matching records\n");
+ fprintf(stderr, " -B collapse the backward CIGAR operation\n");
+ fprintf(stderr, " -@ INT number of BAM compression threads [0]\n");
+ fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n");
+ fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n");
+ fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n");
+ fprintf(stderr, " -o FILE output file name [stdout]\n");
+ fprintf(stderr, " -R FILE list of read groups to be outputted [null]\n");
+ fprintf(stderr, " -f INT required flag, 0 for unset [0]\n");
+ fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n");
+ fprintf(stderr, " -q INT minimum mapping quality [0]\n");
+ fprintf(stderr, " -l STR only output reads in library STR [null]\n");
+ fprintf(stderr, " -r STR only output reads in read group STR [null]\n");
+ fprintf(stderr, " -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n");
+ fprintf(stderr, " -? longer help\n");
+ fprintf(stderr, "\n");
+ // The notes below are a single string literal spliced with line continuations.
+ if (is_long_help)
+ fprintf(stderr, "Notes:\n\
+\n\
+ 1. By default, this command assumes the file on the command line is in\n\
+ the BAM format and it prints the alignments in SAM. If `-t' is\n\
+ applied, the input file is assumed to be in the SAM format. The\n\
+ file supplied with `-t' is SPACE/TAB delimited with the first two\n\
+ fields of each line consisting of the reference name and the\n\
+ corresponding sequence length. The `.fai' file generated by `faidx'\n\
+ can be used here. This file may be empty if reads are unaligned.\n\
+\n\
+ 2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\
+\n\
+ 3. BAM->SAM conversion: `samtools view in.bam'.\n\
+\n\
+ 4. A region should be presented in one of the following formats:\n\
+ `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\
+ specified, the input alignment file must be an indexed BAM file.\n\
+\n\
+ 5. Option `-u' is preferred over `-b' when the output is piped to\n\
+ another samtools command.\n\
+\n\
+ 6. In a string FLAG, each character represents one bit with\n\
+ p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\n\
+ U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\n\
+ 1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \n\
+ f=0x200 (failure) and d=0x400 (duplicate). Note that `-x' and\n\
+ `-X' are samtools-C specific. Picard and older samtools do not\n\
+ support HEX or string flags.\n\
+\n");
+ return 1;
+}
+
+// "import <in.ref_list> <in.sam> <out.bam>" is a thin wrapper that rewrites
+// its arguments into "view -o <out.bam> -bt <in.ref_list> <in.sam>" and hands
+// them to main_samview(). Returns 1 on wrong usage, otherwise whatever
+// main_samview() returns.
+int main_import(int argc, char *argv[])
+{
+    char **view_argv;
+    int view_argc, status;
+
+    if (argc != 4) {
+        fprintf(stderr, "Usage: bamtk import <in.ref_list> <in.sam> <out.bam>\n");
+        return 1;
+    }
+
+    view_argc = 6;
+    view_argv = calloc(6, sizeof(char*));
+    view_argv[0] = "import";
+    view_argv[1] = "-o";
+    view_argv[2] = argv[3];
+    view_argv[3] = "-bt";
+    view_argv[4] = argv[1];
+    view_argv[5] = argv[2];
+
+    status = main_samview(view_argc, view_argv);
+    free(view_argv);
+    return status;
+}
+
+int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 };
+
+// Entry point of "samtools bam2fq": converts every record of a BAM file
+// ("-" for standard input) into a four-line FASTQ entry on standard output.
+// Unless -n is given, "/1" or "/2" is appended to the name of a read that
+// carries exactly one of the flags 0x40 / 0x80. Reads flagged 0x10 are
+// written reverse-complemented, with their qualities reversed.
+// Returns 0 on success and 1 on usage error, open failure or lack of memory.
+int main_bam2fq(int argc, char *argv[])
+{
+    bamFile fp;
+    bam_header_t *h;
+    bam1_t *b;
+    int8_t *buf;
+    int max_buf, c, no12 = 0;
+    while ((c = getopt(argc, argv, "n")) > 0)
+        if (c == 'n') no12 = 1;
+    // Compare against optind rather than argc == 1: "bam2fq -n" has
+    // argc == 2 but still no input file, and argv[optind] is NULL then.
+    if (optind >= argc) {
+        fprintf(stderr, "Usage: samtools bam2fq <in.bam>\n");
+        return 1;
+    }
+    fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+    if (fp == 0) return 1;
+    h = bam_header_read(fp);
+    b = bam_init1();
+    buf = 0;
+    max_buf = 0;
+    while (bam_read1(fp, b) >= 0) {
+        int i, qlen = b->core.l_qseq;
+        uint8_t *seq;
+        putchar('@'); fputs(bam1_qname(b), stdout);
+        if (no12) putchar('\n');
+        else {
+            if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1");
+            else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2");
+            else putchar('\n');
+        }
+        if (max_buf < qlen + 1) {
+            int8_t *grown;
+            max_buf = qlen + 1;
+            kroundup32(max_buf);
+            grown = realloc(buf, max_buf);
+            if (grown == 0) { // keep the old buffer so it can still be freed
+                free(buf);
+                bam_destroy1(b);
+                bam_header_destroy(h);
+                bam_close(fp);
+                return 1;
+            }
+            buf = grown;
+        }
+        buf[qlen] = 0;
+        seq = bam1_seq(b);
+        for (i = 0; i < qlen; ++i)
+            buf[i] = bam1_seqi(seq, i);
+        if (b->core.flag & 16) { // reverse complement
+            for (i = 0; i < qlen>>1; ++i) {
+                int8_t t = seq_comp_table[buf[qlen - 1 - i]];
+                buf[qlen - 1 - i] = seq_comp_table[buf[i]];
+                buf[i] = t;
+            }
+            // odd length: the middle base stays in place but is complemented
+            if (qlen&1) buf[i] = seq_comp_table[buf[i]];
+        }
+        for (i = 0; i < qlen; ++i)
+            buf[i] = bam_nt16_rev_table[buf[i]];
+        puts((char*)buf);
+        puts("+");
+        seq = bam1_qual(b);
+        for (i = 0; i < qlen; ++i)
+            buf[i] = 33 + seq[i];
+        if (b->core.flag & 16) { // reverse
+            for (i = 0; i < qlen>>1; ++i) {
+                int8_t t = buf[qlen - 1 - i];
+                buf[qlen - 1 - i] = buf[i];
+                buf[i] = t;
+            }
+        }
+        puts((char*)buf);
+    }
+    free(buf);
+    bam_destroy1(b);
+    bam_header_destroy(h);
+    bam_close(fp);
+    return 0;
+}
diff --git a/samtools-0.1.19/sample.c b/samtools-0.1.19/sample.c
new file mode 100644
index 0000000..830b9d1
--- /dev/null
+++ b/samtools-0.1.19/sample.c
@@ -0,0 +1,107 @@
+#include <stdlib.h>
+#include <string.h>
+#include "sample.h"
+#include "khash.h"
+KHASH_MAP_INIT_STR(sm, int)
+
+// Allocates a zeroed sample registry together with its two (initially empty)
+// hash tables. Release it with bam_smpl_destroy().
+bam_sample_t *bam_smpl_init(void)
+{
+    bam_sample_t *registry = calloc(1, sizeof(bam_sample_t));
+
+    registry->rg2smid = kh_init(sm);
+    registry->sm2id = kh_init(sm);
+    return registry;
+}
+
+// Releases a sample registry created by bam_smpl_init(); NULL is accepted
+// and ignored.
+// The keys of rg2smid are strdup'ed by add_pair() and freed here one by one.
+// The keys of sm2id are the very strings stored in sm->smpl, so they are
+// released through that array and must not be freed a second time.
+void bam_smpl_destroy(bam_sample_t *sm)
+{
+    int i;
+    khint_t k;
+    khash_t(sm) *rg2smid;
+    // Check before touching any member: the table pointer used to be read
+    // from sm ahead of this test.
+    if (sm == 0) return;
+    rg2smid = (khash_t(sm)*)sm->rg2smid;
+    for (i = 0; i < sm->n; ++i) free(sm->smpl[i]);
+    free(sm->smpl);
+    for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k)
+        if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k));
+    kh_destroy(sm, sm->rg2smid);
+    kh_destroy(sm, sm->sm2id);
+    free(sm);
+}
+
+// Registers read-group key `key` as belonging to sample `val`.
+// A key that is already known is left untouched. A sample name seen for the
+// first time is copied into sm->smpl (which doubles in capacity when full)
+// and receives the next free sample id.
+static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val)
+{
+    khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
+    khint_t rg_slot, sm_slot;
+    int absent;
+
+    rg_slot = kh_get(sm, rg2smid, key);
+    if (rg_slot != kh_end(rg2smid))
+        return; // duplicated @RG-ID
+
+    rg_slot = kh_put(sm, rg2smid, strdup(key), &absent);
+    sm_slot = kh_get(sm, sm2id, val);
+    if (sm_slot == kh_end(sm2id)) { // sample not seen before
+        if (sm->n == sm->m) {
+            sm->m = sm->m? sm->m<<1 : 1;
+            sm->smpl = realloc(sm->smpl, sizeof(void*) * sm->m);
+        }
+        sm->smpl[sm->n] = strdup(val);
+        sm_slot = kh_put(sm, sm2id, sm->smpl[sm->n], &absent);
+        kh_val(sm2id, sm_slot) = sm->n++;
+    }
+    kh_val(rg2smid, rg_slot) = kh_val(sm2id, sm_slot);
+}
+
+// Registers the read groups found in header text `txt` of input file `fn`.
+// For every "@RG" occurrence carrying both an ID and an SM field, the key
+// "fn/ID" is mapped to the sample named by SM. When txt is NULL, or no such
+// read group is found, the file name itself serves as both key and sample.
+// When exactly one read group was registered, the bare file name is mapped
+// to that sample as well. Always returns 0.
+// NOTE(review): although txt is declared const, the text is patched in place
+// (and restored) while a pair is extracted, so it has to be writable.
+int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
+{
+ const char *p = txt, *q, *r;
+ kstring_t buf, first_sm;
+ int n = 0;
+ khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
+ if (txt == 0) {
+ add_pair(sm, sm2id, fn, fn);
+ return 0;
+ }
+ memset(&buf, 0, sizeof(kstring_t));
+ memset(&first_sm, 0, sizeof(kstring_t));
+ while ((q = strstr(p, "@RG")) != 0) {
+ p = q + 3;
+ r = q = 0;
+ // NOTE(review): these searches are not limited to the current line, so
+ // an ID or SM field of a later line may be picked up - confirm intent.
+ if ((q = strstr(p, "\tID:")) != 0) q += 4;
+ if ((r = strstr(p, "\tSM:")) != 0) r += 4;
+ if (r && q) {
+ char *u, *v;
+ int oq, or;
+ // find the end of each field, then cut both strings there temporarily
+ for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
+ for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
+ oq = *u; or = *v; *u = *v = '\0';
+ buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
+ add_pair(sm, sm2id, buf.s, r);
+ // remember the first sample name for the single-read-group case below
+ if ( !first_sm.s )
+ kputs(r,&first_sm);
+ // restore the two characters overwritten above
+ *u = oq; *v = or;
+ } else break;
+ // continue after whichever of the two fields lies further ahead
+ p = q > r? q : r;
+ ++n;
+ }
+ if (n == 0) add_pair(sm, sm2id, fn, fn);
+ // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
+ // use the tag instead.
+ else if ( n==1 && first_sm.s )
+ add_pair(sm,sm2id,fn,first_sm.s);
+ if ( first_sm.s )
+ free(first_sm.s);
+
+// add_pair(sm, sm2id, fn, fn);
+ free(buf.s);
+ return 0;
+}
+
+// Looks up the sample id of a read. With a read group the key is "fn/rg",
+// assembled in the caller-provided scratch string `str`; without one the
+// bare file name is used. Returns -1 when the key is unknown.
+int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str)
+{
+    khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
+    khint_t slot;
+
+    if (!rg) {
+        slot = kh_get(sm, rg2smid, fn);
+    } else {
+        str->l = 0;
+        kputs(fn, str);
+        kputc('/', str);
+        kputs(rg, str);
+        slot = kh_get(sm, rg2smid, str->s);
+    }
+    if (slot == kh_end(rg2smid))
+        return -1;
+    return kh_val(rg2smid, slot);
+}
diff --git a/samtools-0.1.19/sample.h b/samtools-0.1.19/sample.h
new file mode 100644
index 0000000..85fe499
--- /dev/null
+++ b/samtools-0.1.19/sample.h
@@ -0,0 +1,17 @@
+#ifndef BAM_SAMPLE_H
+#define BAM_SAMPLE_H
+
+#include "kstring.h"
+
+// Registry that maps read groups (per input file) to sample ids.
+typedef struct {
+ // n: number of sample names stored in smpl; m: allocated capacity of smpl
+ int n, m;
+ // sample names; the index of a name is its sample id
+ char **smpl;
+ // opaque hash tables: rg2smid maps "file/read-group-ID" (or a bare file
+ // name) to a sample id, sm2id maps a sample name to its id
+ void *rg2smid, *sm2id;
+} bam_sample_t;
+
+// Allocate an empty registry.
+bam_sample_t *bam_smpl_init(void);
+// Register the @RG lines of header text `txt`. The first string argument
+// (named `abs` here, `fn` in the definition) identifies the input file.
+int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt);
+// Sample id for read group `rg` of file `fn` (rg may be NULL); -1 if unknown.
+// `str` is scratch space for assembling the lookup key.
+int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str);
+// Release the registry and everything it owns.
+void bam_smpl_destroy(bam_sample_t *sm);
+
+#endif
diff --git a/samtools-0.1.19/samtools.1 b/samtools-0.1.19/samtools.1
new file mode 100644
index 0000000..5923abd
--- /dev/null
+++ b/samtools-0.1.19/samtools.1
@@ -0,0 +1,1066 @@
+.TH samtools 1 "15 March 2013" "samtools-0.1.19" "Bioinformatics tools"
+.SH NAME
+.PP
+samtools - Utilities for the Sequence Alignment/Map (SAM) format
+
+bcftools - Utilities for the Binary Call Format (BCF) and VCF
+.SH SYNOPSIS
+.PP
+samtools view -bt ref_list.txt -o aln.bam aln.sam.gz
+.PP
+samtools sort aln.bam aln.sorted
+.PP
+samtools index aln.sorted.bam
+.PP
+samtools idxstats aln.sorted.bam
+.PP
+samtools view aln.sorted.bam chr2:20,100,000-20,200,000
+.PP
+samtools merge out.bam in1.bam in2.bam in3.bam
+.PP
+samtools faidx ref.fasta
+.PP
+samtools pileup -vcf ref.fasta aln.sorted.bam
+.PP
+samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
+.PP
+samtools tview aln.sorted.bam ref.fasta
+.PP
+bcftools index in.bcf
+.PP
+bcftools view in.bcf chr2:100-200 > out.vcf
+.PP
+bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs
+
+.SH DESCRIPTION
+.PP
+Samtools is a set of utilities that manipulate alignments in the BAM
+format. It imports from and exports to the SAM (Sequence Alignment/Map)
+format, does sorting, merging and indexing, and allows reads in any
+region to be retrieved swiftly.
+
+Samtools is designed to work on a stream. It regards an input file `-'
+as the standard input (stdin) and an output file `-' as the standard
+output (stdout). Several commands can thus be combined with Unix
+pipes. Samtools always outputs warning and error messages to the standard
+error output (stderr).
+
+Samtools is also able to open a BAM (not SAM) file on a remote FTP or
+HTTP server if the BAM file name starts with `ftp://' or `http://'.
+Samtools checks the current working directory for the index file and
+will download the index upon absence. Samtools does not retrieve the
+entire alignment file unless it is asked to do so.
+
+.SH SAMTOOLS COMMANDS AND OPTIONS
+
+.TP 10
+.B view
+samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] [-F
+skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] <in.bam>|<in.sam> [region1 [...]]
+
+Extract/print all or sub alignments in SAM or BAM format. If no region
+is specified, all the alignments will be printed; otherwise only
+alignments overlapping the specified regions will be output. An
+alignment may be given multiple times if it is overlapping several
+regions. A region can be presented, for example, in the following
+format: `chr2' (the whole chr2), `chr2:1000000' (region starting from
+1,000,000bp) or `chr2:1,000,000-2,000,000' (region between 1,000,000 and
+2,000,000bp including the end points). The coordinate is 1-based.
+
+.B OPTIONS:
+.RS
+.TP 10
+.B -b
+Output in the BAM format.
+.TP
+.BI -f \ INT
+Only output alignments with all bits in INT present in the FLAG
+field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
+.TP
+.BI -F \ INT
+Skip alignments with bits present in INT [0]
+.TP
+.B -h
+Include the header in the output.
+.TP
+.B -H
+Output the header only.
+.TP
+.BI -l \ STR
+Only output reads in library STR [null]
+.TP
+.BI -o \ FILE
+Output file [stdout]
+.TP
+.BI -q \ INT
+Skip alignments with MAPQ smaller than INT [0]
+.TP
+.BI -r \ STR
+Only output reads in read group STR [null]
+.TP
+.BI -R \ FILE
+Output reads in read groups listed in
+.I FILE
+[null]
+.TP
+.BI -s \ FLOAT
+Fraction of templates/pairs to subsample; the integer part is treated as the
+seed for the random number generator [-1]
+.TP
+.B -S
+Input is in SAM. If @SQ header lines are absent, the
+.B `-t'
+option is required.
+.TP
+.B -c
+Instead of printing the alignments, only count them and print the
+total number. All filter options, such as
+.B `-f',
+.B `-F'
+and
+.B `-q'
+, are taken into account.
+.TP
+.BI -t \ FILE
+This file is TAB-delimited. Each line must contain the reference name
+and the length of the reference, one line for each distinct reference;
+additional fields are ignored. This file also defines the order of the
+reference sequences in sorting. If you run `samtools faidx <ref.fa>',
+the resultant index file
+.I <ref.fa>.fai
+can be used as this
+.I <in.ref_list>
+file.
+.TP
+.B -u
+Output uncompressed BAM. This option saves time spent on
+compression/decompression and is thus preferred when the output is piped
+to another samtools command.
+.RE
+
+.TP
+.B tview
+samtools tview
+.RB [ \-p
+.IR chr:pos ]
+.RB [ \-s
+.IR STR ]
+.RB [ \-d
+.IR display ]
+.RI <in.sorted.bam>
+.RI [ref.fasta]
+
+Text alignment viewer (based on the ncurses library). In the viewer,
+press `?' for help and press `g' to check the alignment start from a
+region in the format like `chr10:10,000,000' or `=10,000,000' when
+viewing the same reference sequence.
+
+.B Options:
+.RS
+.TP 14
+.BI -d \ display
+Output as (H)tml or (C)urses or (T)ext
+.TP
+.BI -p \ chr:pos
+Go directly to this position
+.TP
+.BI -s \ STR
+Display only reads from this sample or read group
+.RE
+
+.TP
+.B mpileup
+samtools mpileup
+.RB [ \-EBugp ]
+.RB [ \-C
+.IR capQcoef ]
+.RB [ \-r
+.IR reg ]
+.RB [ \-f
+.IR in.fa ]
+.RB [ \-l
+.IR list ]
+.RB [ \-M
+.IR capMapQ ]
+.RB [ \-Q
+.IR minBaseQ ]
+.RB [ \-q
+.IR minMapQ ]
+.I in.bam
+.RI [ in2.bam
+.RI [ ... ]]
+
+Generate BCF or pileup for one or multiple BAM files. Alignment records
+are grouped by sample identifiers in @RG header lines. If sample
+identifiers are absent, each input file is regarded as one sample.
+
+In the pileup format (without
+.BR -u " or " -g ),
+each
+line represents a genomic position, consisting of chromosome name,
+coordinate, reference base, read bases, read qualities and alignment
+mapping qualities. Information on match, mismatch, indel, strand,
+mapping quality and start and end of a read are all encoded at the read
+base column. At this column, a dot stands for a match to the reference
+base on the forward strand, a comma for a match on the reverse strand,
+a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward
+strand and `acgtn' for a mismatch on the reverse strand. A pattern
+`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this
+reference position and the next reference position. The length of the
+insertion is given by the integer in the pattern, followed by the
+inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
+represents a deletion from the reference. The deleted bases will be
+presented as `*' in the following lines. Also at the read base column, a
+symbol `^' marks the start of a read. The ASCII of the character
+following `^' minus 33 gives the mapping quality. A symbol `$' marks the
+end of a read segment.
+
+.B Input Options:
+.RS
+.TP 10
+.B -6
+Assume the quality is in the Illumina 1.3+ encoding.
+.TP
+.B -A
+Do not skip anomalous read pairs in variant calling.
+.TP
+.B -B
+Disable probabilistic realignment for the computation of base alignment
+quality (BAQ). BAQ is the Phred-scaled probability of a read base being
+misaligned. Applying this option greatly helps to reduce false SNPs
+caused by misalignments.
+.TP
+.BI -b \ FILE
+List of input BAM files, one file per line [null]
+.TP
+.BI -C \ INT
+Coefficient for downgrading mapping quality for reads containing
+excessive mismatches. Given a read with a phred-scaled probability q of
+being generated from the mapped position, the new mapping quality is
+about sqrt((INT-q)/INT)*INT. A zero value disables this
+functionality; if enabled, the recommended value for BWA is 50. [0]
+.TP
+.BI -d \ INT
+At a position, read maximally
+.I INT
+reads per input BAM. [250]
+.TP
+.B -E
+Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt
+specificity a little bit.
+.TP
+.BI -f \ FILE
+The
+.BR faidx -indexed
+reference file in the FASTA format. The file can be optionally compressed by
+.BR razip .
+[null]
+.TP
+.BI -l \ FILE
+BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null]
+.TP
+.BI -q \ INT
+Minimum mapping quality for an alignment to be used [0]
+.TP
+.BI -Q \ INT
+Minimum base quality for a base to be considered [13]
+.TP
+.BI -r \ STR
+Only generate pileup in region
+.I STR
+[all sites]
+.TP
+.B Output Options:
+
+.TP
+.B -D
+Output per-sample read depth
+.TP
+.B -g
+Compute genotype likelihoods and output them in the binary call format (BCF).
+.TP
+.B -S
+Output per-sample Phred-scaled strand bias P-value
+.TP
+.B -u
+Similar to
+.B -g
+except that the output is uncompressed BCF, which is preferred for piping.
+
+.TP
+.B Options for Genotype Likelihood Computation (for -g or -u):
+
+.TP
+.BI -e \ INT
+Phred-scaled gap extension sequencing error probability. Reducing
+.I INT
+leads to longer indels. [20]
+.TP
+.BI -h \ INT
+Coefficient for modeling homopolymer errors. Given an
+.IR l -long
+homopolymer
+run, the sequencing error of an indel of size
+.I s
+is modeled as
+.IR INT * s / l .
+[100]
+.TP
+.B -I
+Do not perform INDEL calling
+.TP
+.BI -L \ INT
+Skip INDEL calling if the average per-sample depth is above
+.IR INT .
+[250]
+.TP
+.BI -o \ INT
+Phred-scaled gap open sequencing error probability. Reducing
+.I INT
+leads to more indel calls. [40]
+.TP
+.BI -p
+Apply -m and -F thresholds per sample to increase sensitivity of calling.
+By default both options are applied to reads pooled from all samples.
+.TP
+.BI -P \ STR
+Comma delimited list of platforms (determined by
+.BR @RG-PL )
+from which indel candidates are obtained. It is recommended to collect
+indel candidates from sequencing technologies that have low indel error
+rate such as ILLUMINA. [all]
+.RE
+
+.TP
+.B reheader
+samtools reheader <in.header.sam> <in.bam>
+
+Replace the header in
+.I in.bam
+with the header in
+.IR in.header.sam .
+This command is much faster than replacing the header with a
+BAM->SAM->BAM conversion.
+
+.TP
+.B cat
+samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [ ... ]
+
+Concatenate BAMs. The sequence dictionary of each input BAM must be identical,
+although this command does not check this. This command uses a similar trick
+to
+.B reheader
+which enables fast BAM concatenation.
+
+.TP
+.B sort
+samtools sort [-nof] [-m maxMem] <in.bam> <out.prefix>
+
+Sort alignments by leftmost coordinates. File
+.I <out.prefix>.bam
+will be created. This command may also create temporary files
+.I <out.prefix>.%d.bam
+when the whole alignment cannot be fitted into memory (controlled by
+option -m).
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -o
+Output the final alignment to the standard output.
+.TP
+.B -n
+Sort by read names rather than by chromosomal coordinates
+.TP
+.B -f
+Use
+.I <out.prefix>
+as the full output path and do not append
+.I .bam
+suffix.
+.TP
+.BI -m \ INT
+Approximately the maximum required memory. [500000000]
+.RE
+
+.TP
+.B merge
+samtools merge [-nur1f] [-h inh.sam] [-R reg] <out.bam> <in1.bam> <in2.bam> [...]
+
+Merge multiple sorted alignments.
+The header reference lists of all the input BAM files, and the @SQ headers of
+.IR inh.sam ,
+if any, must all refer to the same set of reference sequences.
+The header reference list and (unless overridden by
+.BR -h )
+`@' headers of
+.I in1.bam
+will be copied to
+.IR out.bam ,
+and the headers of other files will be ignored.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -1
+Use zlib compression level 1 to compress the output
+.TP
+.B -f
+Force to overwrite the output file if present.
+.TP 8
+.BI -h \ FILE
+Use the lines of
+.I FILE
+as `@' headers to be copied to
+.IR out.bam ,
+replacing any header lines that would otherwise be copied from
+.IR in1.bam .
+.RI ( FILE
+is actually in SAM format, though any alignment records it may contain
+are ignored.)
+.TP
+.B -n
+The input alignments are sorted by read names rather than by chromosomal
+coordinates
+.TP
+.BI -R \ STR
+Merge files in the specified region indicated by
+.I STR
+[null]
+.TP
+.B -r
+Attach an RG tag to each alignment. The tag value is inferred from file names.
+.TP
+.B -u
+Uncompressed BAM output
+.RE
+
+.TP
+.B index
+samtools index <aln.bam>
+
+Index sorted alignment for fast random access. Index file
+.I <aln.bam>.bai
+will be created.
+
+.TP
+.B idxstats
+samtools idxstats <aln.bam>
+
+Retrieve and print stats in the index file. The output is TAB delimited
+with each line consisting of reference sequence name, sequence length, #
+mapped reads and # unmapped reads.
+
+.TP
+.B faidx
+samtools faidx <ref.fasta> [region1 [...]]
+
+Index reference sequence in the FASTA format or extract subsequence from
+indexed reference sequence. If no region is specified,
+.B faidx
+will index the file and create
+.I <ref.fasta>.fai
+on the disk. If regions are specified, the subsequences will be
+retrieved and printed to stdout in the FASTA format. The input file can
+be compressed in the
+.B RAZF
+format.
+
+.TP
+.B fixmate
+samtools fixmate <in.nameSrt.bam> <out.bam>
+
+Fill in mate coordinates, ISIZE and mate related flags from a
+name-sorted alignment.
+
+.TP
+.B rmdup
+samtools rmdup [-sS] <input.srt.bam> <out.bam>
+
+Remove potential PCR duplicates: if multiple read pairs have identical
+external coordinates, only retain the pair with highest mapping quality.
+In the paired-end mode, this command
+.B ONLY
+works with FR orientation and requires ISIZE is correctly set. It does
+not work for unpaired reads (e.g. two ends mapped to different
+chromosomes or orphan reads).
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -s
+Remove duplicate for single-end reads. By default, the command works for
+paired-end reads only.
+.TP 8
+.B -S
+Treat paired-end reads as single-end reads.
+.RE
+
+.TP
+.B calmd
+samtools calmd [-EeubSr] [-C capQcoef] <aln.bam> <ref.fasta>
+
+Generate the MD tag. If the MD tag is already present, this command will
+give a warning if the MD tag generated is different from the existing
+tag. Output SAM by default.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -A
+When used jointly with
+.B -r
+this option overwrites the original base quality.
+.TP 8
+.B -e
+Convert the read base to = if it is identical to the aligned reference
+base. Indel caller does not support the = bases at the moment.
+.TP
+.B -u
+Output uncompressed BAM
+.TP
+.B -b
+Output compressed BAM
+.TP
+.B -S
+The input is SAM with header lines
+.TP
+.BI -C \ INT
+Coefficient to cap mapping quality of poorly mapped reads. See the
+.B pileup
+command for details. [0]
+.TP
+.B -r
+Compute the BQ tag (without -A) or cap base quality by BAQ (with -A).
+.TP
+.B -E
+Extended BAQ calculation. This option trades specificity for sensitivity, though the
+effect is minor.
+.RE
+
+.TP
+.B targetcut
+samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>
+
+This command identifies target regions by examining the continuity of read depth, computes
+haploid consensus sequences of targets and outputs a SAM with each sequence corresponding
+to a target. When option
+.B -f
+is in use, BAQ will be applied. This command is
+.B only
+designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)].
+.RE
+
+.TP
+.B phase
+samtools phase [-AF] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] <in.bam>
+
+Call and phase heterozygous SNPs.
+.B OPTIONS:
+.RS
+.TP 8
+.B -A
+Drop reads with ambiguous phase.
+.TP 8
+.BI -b \ STR
+Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file
+.BR STR .0.bam
+and phase-1 reads in
+.BR STR .1.bam.
+Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads
+with switch errors will be saved in
+.BR STR .chimeric.bam.
+[null]
+.TP
+.B -F
+Do not attempt to fix chimeric reads.
+.TP
+.BI -k \ INT
+Maximum length for local phasing. [13]
+.TP
+.BI -q \ INT
+Minimum Phred-scaled LOD to call a heterozygote. [40]
+.TP
+.BI -Q \ INT
+Minimum base quality to be used in het calling. [13]
+.RE
+
+.SH BCFTOOLS COMMANDS AND OPTIONS
+
+.TP 10
+.B view
+.B bcftools view
+.RB [ \-AbFGNQSucgv ]
+.RB [ \-D
+.IR seqDict ]
+.RB [ \-l
+.IR listLoci ]
+.RB [ \-s
+.IR listSample ]
+.RB [ \-i
+.IR gapSNPratio ]
+.RB [ \-t
+.IR mutRate ]
+.RB [ \-p
+.IR varThres ]
+.RB [ \-m
+.IR varThres ]
+.RB [ \-P
+.IR prior ]
+.RB [ \-1
+.IR nGroup1 ]
+.RB [ \-d
+.IR minFrac ]
+.RB [ \-U
+.IR nPerm ]
+.RB [ \-X
+.IR permThres ]
+.RB [ \-T
+.IR trioType ]
+.I in.bcf
+.RI [ region ]
+
+Convert between BCF and VCF, call variant candidates and estimate allele
+frequencies.
+
+.RS
+.TP
+.B Input/Output Options:
+.TP 10
+.B -A
+Retain all possible alternate alleles at variant sites. By default, the view
+command discards unlikely alleles.
+.TP 10
+.B -b
+Output in the BCF format. The default is VCF.
+.TP
+.BI -D \ FILE
+Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null]
+.TP
+.B -F
+Indicate PL is generated by r921 or before (ordering is different).
+.TP
+.B -G
+Suppress all individual genotype information.
+.TP
+.BI -l \ FILE
+List of sites at which information is output [all sites]
+.TP
+.B -N
+Skip sites where the REF field is not A/C/G/T
+.TP
+.B -Q
+Output the QCALL likelihood format
+.TP
+.BI -s \ FILE
+List of samples to use. The first column in the input gives the sample names
+and the second gives the ploidy, which can only be 1 or 2. When the 2nd column
+is absent, the sample ploidy is assumed to be 2. In the output, the ordering of
+samples will be identical to the one in
+.IR FILE .
+[null]
+.TP
+.B -S
+The input is VCF instead of BCF.
+.TP
+.B -u
+Uncompressed BCF output (force -b).
+.TP
+.B Consensus/Variant Calling Options:
+.TP 10
+.B -c
+Call variants using Bayesian inference. This option automatically invokes option
+.BR -e .
+.TP
+.BI -d \ FLOAT
+When
+.B -v
+is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0]
+.TP
+.B -e
+Perform max-likelihood inference only, including estimating the site allele frequency,
+testing Hardy-Weinberg equilibrium and testing associations with LRT.
+.TP
+.B -g
+Call per-sample genotypes at variant sites (force -c)
+.TP
+.BI -i \ FLOAT
+Ratio of INDEL-to-SNP mutation rate [0.15]
+.TP
+.BI -m \ FLOAT
+New model for improved multiallelic and rare-variant calling. Another
+ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The
+parameter seems robust and the actual value usually does not affect the results
+much; a good value to use is 0.99. This is the recommended calling method. [0]
+.TP
+.BI -p \ FLOAT
+A site is considered to be a variant if P(ref|D)<FLOAT [0.5]
+.TP
+.BI -P \ STR
+Prior or initial allele frequency spectrum. STR can be
+.IR full ,
+.IR cond2 ,
+.I flat
+or the file consisting of error output from a previous variant calling
+run.
+.TP
+.BI -t \ FLOAT
+Scaled mutation rate for variant calling [0.001]
+.TP
+.BI -T \ STR
+Enable pair/trio calling. For trio calling, option
+.B -s
+is usually needed to be applied to configure the trio members and their ordering.
+In the file supplied to the option
+.BR -s ,
+the first sample must be the child, the second the father and the third the mother.
+The valid values of
+.I STR
+are `pair', `trioauto', `trioxd' and `trioxs', where `pair' calls differences between two input samples, and `trioxd' (`trioxs') specifies that the input
+is from the X chromosome non-PAR regions and the child is a female (male). [null]
+.TP
+.B -v
+Output variant sites only (force -c)
+.TP
+.B Contrast Calling and Association Test Options:
+.TP
+.BI -1 \ INT
+Number of group-1 samples. This option is used for dividing the samples into
+two groups for contrast SNP calling or association test.
+When this option is in use, the following VCF INFO will be outputted:
+PC2, PCHI2 and QCHI2. [0]
+.TP
+.BI -U \ INT
+Number of permutations for association test (effective only with
+.BR -1 )
+[0]
+.TP
+.BI -X \ FLOAT
+Only perform permutations for P(chi^2)<FLOAT (effective only with
+.BR -U )
+[0.01]
+.RE
+
+.TP
+.B index
+.B bcftools index
+.I in.bcf
+
+Index sorted BCF for random access.
+.RE
+
+.TP
+.B cat
+.B bcftools cat
+.I in1.bcf
+.RI [ "in2.bcf " [ ... "]]]"
+
+Concatenate BCF files. The input files are required to be sorted and
+have identical samples appearing in the same order.
+.RE
+.SH SAM FORMAT
+
+Sequence Alignment/Map (SAM) format is TAB-delimited. Apart from the header lines, which are started
+with the `@' symbol, each alignment line consists of:
+
+.TS
+center box;
+cb | cb | cb
+n | l | l .
+Col Field Description
+_
+1 QNAME Query template/pair NAME
+2 FLAG bitwise FLAG
+3 RNAME Reference sequence NAME
+4 POS 1-based leftmost POSition/coordinate of clipped sequence
+5 MAPQ MAPping Quality (Phred-scaled)
+6 CIGAR extended CIGAR string
+7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
+8 MPOS 1-based Mate POSition
+9 TLEN inferred Template LENgth (insert size)
+10 SEQ query SEQuence on the same strand as the reference
+11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
+12+ OPT variable OPTional fields in the format TAG:VTYPE:VALUE
+.TE
+
+.PP
+Each bit in the FLAG field is defined as:
+
+.TS
+center box;
+cb | cb | cb
+l | c | l .
+Flag Chr Description
+_
+0x0001 p the read is paired in sequencing
+0x0002 P the read is mapped in a proper pair
+0x0004 u the query sequence itself is unmapped
+0x0008 U the mate is unmapped
+0x0010 r strand of the query (1 for reverse)
+0x0020 R strand of the mate
+0x0040 1 the read is the first read in a pair
+0x0080 2 the read is the second read in a pair
+0x0100 s the alignment is not primary
+0x0200 f the read fails platform/vendor quality checks
+0x0400 d the read is either a PCR or an optical duplicate
+.TE
+
+where the second column gives the string representation of the FLAG field.
+
+.SH VCF FORMAT
+
+The Variant Call Format (VCF) is a TAB-delimited format in which each data line consists of the following fields:
+.TS
+center box;
+cb | cb | cb
+n | l | l .
+Col Field Description
+_
+1 CHROM CHROMosome name
+2 POS the left-most POSition of the variant
+3 ID unique variant IDentifier
+4 REF the REFerence allele
+5 ALT the ALTernate allele(s), separated by comma
+6 QUAL variant/reference QUALity
+7 FILTER FILTers applied
+8 INFO INFOrmation related to the variant, separated by semi-colon
+9 FORMAT FORMAT of the genotype fields, separated by colon (optional)
+10+ SAMPLE SAMPLE genotypes and per-sample information (optional)
+.TE
+
+.PP
+The following table gives the
+.B INFO
+tags used by samtools and bcftools.
+
+.TS
+center box;
+cb | cb | cb
+l | l | l .
+Tag Format Description
+_
+AF1 double Max-likelihood estimate of the site allele frequency (AF) of the first ALT allele
+DP int Raw read depth (without quality filtering)
+DP4 int[4] # high-quality reference forward bases, ref reverse, alternate for and alt rev bases
+FQ int Consensus quality. Positive: sample genotypes different; negative: otherwise
+MQ int Root-Mean-Square mapping quality of covering reads
+PC2 int[2] Phred probability of AF in group1 samples being larger (,smaller) than in group2
+PCHI2 double Posterior weighted chi^2 P-value between group1 and group2 samples
+PV4 double[4] P-value for strand bias, baseQ bias, mapQ bias and tail distance bias
+QCHI2 int Phred-scaled PCHI2
+RP int # permutations yielding a smaller PCHI2
+CLR int Phred log ratio of genotype likelihoods with and without the trio/pair constraint
+UGT string Most probable genotype configuration without the trio constraint
+CGT string Most probable configuration with the trio constraint
+VDB float Tests variant positions within reads. Intended for filtering RNA-seq artifacts around splice sites
+RPB float Mann-Whitney rank-sum test for tail distance bias
+HWE float Hardy-Weinberg equilibrium test, Wigginton et al., PMID: 15789306
+.TE
+
+.SH EXAMPLES
+.IP o 2
+Import SAM to BAM when
+.B @SQ
+lines are present in the header:
+
+ samtools view -bS aln.sam > aln.bam
+
+If
+.B @SQ
+lines are absent:
+
+ samtools faidx ref.fa
+ samtools view -bt ref.fa.fai aln.sam > aln.bam
+
+where
+.I ref.fa.fai
+is generated automatically by the
+.B faidx
+command.
+
+.IP o 2
+Attach the
+.B RG
+tag while merging sorted alignments:
+
+ perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt
+ samtools merge -rh rg.txt merged.bam ga.bam 454.bam
+
+The value in a
+.B RG
+tag is determined by the file name the read is coming from. In this
+example, in the
+.IR merged.bam ,
+reads from
+.I ga.bam
+will be attached
+.IR RG:Z:ga ,
+while reads from
+.I 454.bam
+will be attached
+.IR RG:Z:454 .
+
+.IP o 2
+Call SNPs and short INDELs for one diploid individual:
+
+ samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > var.raw.bcf
+ bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > var.flt.vcf
+
+The
+.B -D
+option of varFilter controls the maximum read depth, which should be
+adjusted to about twice the average read depth. One may consider to add
+.B -C50
+to
+.B mpileup
+if mapping quality is overestimated for reads containing excessive
+mismatches. Applying this option usually helps
+.B BWA-short
+but may not help other mappers.
+
+.IP o 2
+Generate the consensus sequence for one diploid individual:
+
+ samtools mpileup -uf ref.fa aln.bam | bcftools view -cg - | vcfutils.pl vcf2fq > cns.fq
+
+.IP o 2
+Call somatic mutations from a pair of samples:
+
+ samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT pair - > var.bcf
+
+In the output INFO field,
+.I CLR
+gives the Phred-log ratio between the likelihood by treating the
+two samples independently, and the likelihood by requiring the genotype to be identical.
+This
+.I CLR
+is effectively a score measuring the confidence of somatic calls. The higher the better.
+
+.IP o 2
+Call de novo and somatic mutations from a family trio:
+
+ samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT trioauto -s samples.txt - > var.bcf
+
+File
+.I samples.txt
+should consist of three lines specifying the member and order of samples (in the order of child-father-mother).
+Similarly,
+.I CLR
+gives the Phred-log likelihood ratio with and without the trio constraint.
+.I UGT
+shows the most likely genotype configuration without the trio constraint, and
+.I CGT
+gives the most likely genotype configuration satisfying the trio constraint.
+
+.IP o 2
+Phase one individual:
+
+ samtools calmd -AEur aln.bam ref.fa | samtools phase -b prefix - > phase.out
+
+The
+.B calmd
+command is used to reduce false heterozygotes around INDELs.
+
+.IP o 2
+Call SNPs and short indels for multiple diploid individuals:
+
+ samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -bcvg - > var.raw.bcf
+ bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > var.flt.vcf
+
+Individuals are identified from the
+.B SM
+tags in the
+.B @RG
+header lines. Individuals can be pooled in one alignment file; one
+individual can also be separated into multiple files. The
+.B -P
+option specifies that indel candidates should be collected only from
+read groups with the
+.B @RG-PL
+tag set to
+.IR ILLUMINA .
+Collecting indel candidates from reads sequenced by an indel-prone
+technology may affect the performance of indel calling.
+
+Note that there is a new calling model which can be invoked by
+
+ bcftools view -m0.99 ...
+
+which fixes some severe limitations of the default method.
+
+For filtering, best results seem to be achieved by first applying the
+.IR SnpGap
+filter and then applying some machine learning approach
+
+ vcf-annotate -f SnpGap=n
+ vcf filter ...
+
+Both can be found in the
+.B vcftools
+and
+.B htslib
+package (links below).
+
+.IP o 2
+Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals:
+
+ samtools mpileup -Igf ref.fa *.bam > all.bcf
+ bcftools view -bl sites.list all.bcf > sites.bcf
+ bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs
+ bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs
+ bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs
+ ......
+
+where
+.I sites.list
+contains the list of sites with each line consisting of the reference
+sequence name and position. The following
+.B bcftools
+commands estimate AFS by EM.
+
+.IP o 2
+Dump BAQ applied alignment for other SNP callers:
+
+ samtools calmd -bAr aln.bam > aln.baq.bam
+
+It adds and corrects the
+.B NM
+and
+.B MD
+tags at the same time. The
+.B calmd
+command also comes with the
+.B -C
+option, the same as the one in
+.B pileup
+and
+.BR mpileup .
+Apply if it helps.
+
+.SH LIMITATIONS
+.PP
+.IP o 2
+Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c.
+.IP o 2
+Samtools paired-end rmdup does not work for unpaired reads (e.g. orphan
+reads or ends mapped to different chromosomes). If this is a concern,
+please use Picard's MarkDuplicates which correctly handles these cases,
+although a little slower.
+
+.SH AUTHOR
+.PP
+Heng Li from the Sanger Institute wrote the C version of samtools. Bob
+Handsaker from the Broad Institute implemented the BGZF library and Jue
+Ruan from Beijing Genomics Institute wrote the RAZF library. John
+Marshall and Petr Danecek contribute to the source code and various
+people from the 1000 Genomes Project have contributed to the SAM format
+specification.
+
+.SH SEE ALSO
+.PP
+Samtools website: <http://samtools.sourceforge.net>
+.br
+Samtools latest source: <https://github.com/samtools/samtools>
+.br
+VCFtools website with stable link to VCF specification: <http://vcftools.sourceforge.net>
+.br
+HTSlib website: <https://github.com/samtools/htslib>
diff --git a/samtools-0.1.19/win32/libcurses.a b/samtools-0.1.19/win32/libcurses.a
new file mode 100644
index 0000000..a3863b8
Binary files /dev/null and b/samtools-0.1.19/win32/libcurses.a differ
diff --git a/samtools-0.1.19/win32/libz.a b/samtools-0.1.19/win32/libz.a
new file mode 100644
index 0000000..23e8d60
Binary files /dev/null and b/samtools-0.1.19/win32/libz.a differ
diff --git a/samtools-0.1.19/win32/xcurses.h b/samtools-0.1.19/win32/xcurses.h
new file mode 100644
index 0000000..6f3ce19
--- /dev/null
+++ b/samtools-0.1.19/win32/xcurses.h
@@ -0,0 +1,1377 @@
+/* Public Domain Curses */
+
+/* $Id: curses.h,v 1.295 2008/07/15 17:13:25 wmcbrine Exp $ */
+
+/*----------------------------------------------------------------------*
+ * PDCurses *
+ *----------------------------------------------------------------------*/
+
+#ifndef __PDCURSES__
+#define __PDCURSES__ 1
+
+/*man-start**************************************************************
+
+PDCurses definitions list: (Only define those needed)
+
+ XCURSES True if compiling for X11.
+ PDC_RGB True if you want to use RGB color definitions
+ (Red = 1, Green = 2, Blue = 4) instead of BGR.
+ PDC_WIDE True if building wide-character support.
+ PDC_DLL_BUILD True if building a Win32 DLL.
+ NCURSES_MOUSE_VERSION Use the ncurses mouse API instead
+ of PDCurses' traditional mouse API.
+
+PDCurses portable platform definitions list:
+
+ PDC_BUILD Defines API build version.
+ PDCURSES Enables access to PDCurses-only routines.
+ XOPEN Always true.
+ SYSVcurses True if you are compiling for SYSV portability.
+ BSDcurses True if you are compiling for BSD portability.
+
+**man-end****************************************************************/
+
+#define PDC_BUILD 3401
+#define PDCURSES 1 /* PDCurses-only routines */
+#define XOPEN 1 /* X/Open Curses routines */
+#define SYSVcurses 1 /* System V Curses routines */
+#define BSDcurses 1 /* BSD Curses routines */
+#define CHTYPE_LONG 1 /* size of chtype; long */
+
+/*----------------------------------------------------------------------*/
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h> /* Required by X/Open usage below */
+
+#ifdef PDC_WIDE
+# include <wchar.h>
+#endif
+
+#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS)
+extern "C"
+{
+# define bool _bool
+#endif
+
+/*----------------------------------------------------------------------
+ *
+ * PDCurses Manifest Constants
+ *
+ */
+
+#ifndef FALSE
+# define FALSE 0
+#endif
+#ifndef TRUE
+# define TRUE 1
+#endif
+#ifndef NULL
+# define NULL (void *)0
+#endif
+#ifndef ERR
+# define ERR (-1)
+#endif
+#ifndef OK
+# define OK 0
+#endif
+
+/*----------------------------------------------------------------------
+ *
+ * PDCurses Type Declarations
+ *
+ */
+
+typedef unsigned char bool; /* PDCurses Boolean type */
+
+#ifdef CHTYPE_LONG
+# if _LP64
+typedef unsigned int chtype; /* chosen when _LP64 is set; presumably keeps chtype at 32 bits where long is 64-bit -- confirm */
+# else
+typedef unsigned long chtype; /* 16-bit attr + 16-bit char */
+# endif
+#else
+typedef unsigned short chtype; /* 8-bit attr + 8-bit char */
+#endif
+
+#ifdef PDC_WIDE
+typedef chtype cchar_t; /* wide-character builds: cchar_t is just an alias of chtype */
+#endif
+
+typedef chtype attr_t; /* attribute values use the same representation as chtype */
+
+/*----------------------------------------------------------------------
+ *
+ * PDCurses Mouse Interface -- SYSVR4, with extensions
+ *
+ */
+
+typedef struct
+{
+ int x; /* absolute column, 0 based, measured in characters */
+ int y; /* absolute row, 0 based, measured in characters */
+ short button[3]; /* state of each button; BUTTON_STATUS(n) reads button[n - 1] */
+ int changes; /* bit flags indicating what has changed with the mouse; tested by the MOUSE_* / BUTTON_CHANGED() macros */
+} MOUSE_STATUS;
+
+#define BUTTON_RELEASED 0x0000 /* action codes 0-6 are consecutive values, not independent bit flags */
+#define BUTTON_PRESSED 0x0001
+#define BUTTON_CLICKED 0x0002
+#define BUTTON_DOUBLE_CLICKED 0x0003
+#define BUTTON_TRIPLE_CLICKED 0x0004
+#define BUTTON_MOVED 0x0005 /* PDCurses */
+#define WHEEL_SCROLLED 0x0006 /* PDCurses */
+#define BUTTON_ACTION_MASK 0x0007 /* PDCurses; low three bits, wide enough for the action codes above */
+
+#define PDC_BUTTON_SHIFT 0x0008 /* PDCurses; modifier bit */
+#define PDC_BUTTON_CONTROL 0x0010 /* PDCurses; modifier bit */
+#define PDC_BUTTON_ALT 0x0020 /* PDCurses; modifier bit */
+#define BUTTON_MODIFIER_MASK 0x0038 /* PDCurses; OR of the three PDC_BUTTON_* modifier bits */
+
+#define MOUSE_X_POS (Mouse_status.x)
+#define MOUSE_Y_POS (Mouse_status.y)
+
+/*
+ * Bits associated with the .changes field:
+ * 3 2 1 0
+ * 210987654321098765432109876543210
+ * 1 <- button 1 has changed
+ * 10 <- button 2 has changed
+ * 100 <- button 3 has changed
+ * 1000 <- mouse has moved
+ * 10000 <- mouse position report
+ * 100000 <- mouse wheel up
+ * 1000000 <- mouse wheel down
+ */
+
+#define PDC_MOUSE_MOVED 0x0008 /* .changes bit: mouse has moved */
+#define PDC_MOUSE_POSITION 0x0010 /* .changes bit: mouse position report */
+#define PDC_MOUSE_WHEEL_UP 0x0020 /* .changes bit: mouse wheel up */
+#define PDC_MOUSE_WHEEL_DOWN 0x0040 /* .changes bit: mouse wheel down */
+
+#define A_BUTTON_CHANGED (Mouse_status.changes & 7) /* nonzero if any of buttons 1-3 changed (low three bits) */
+#define MOUSE_MOVED (Mouse_status.changes & PDC_MOUSE_MOVED) /* nonzero if the moved bit is set */
+#define MOUSE_POS_REPORT (Mouse_status.changes & PDC_MOUSE_POSITION) /* nonzero if the position-report bit is set */
+#define BUTTON_CHANGED(x) (Mouse_status.changes & (1 << ((x) - 1))) /* x is the 1-based button number */
+#define BUTTON_STATUS(x) (Mouse_status.button[(x) - 1]) /* x is the 1-based button number; button[] has 3 entries */
+#define MOUSE_WHEEL_UP (Mouse_status.changes & PDC_MOUSE_WHEEL_UP) /* nonzero if the wheel-up bit is set */
+#define MOUSE_WHEEL_DOWN (Mouse_status.changes & PDC_MOUSE_WHEEL_DOWN) /* nonzero if the wheel-down bit is set */
+
+/* mouse bit-masks */
+
+#define BUTTON1_RELEASED 0x00000001L
+#define BUTTON1_PRESSED 0x00000002L
+#define BUTTON1_CLICKED 0x00000004L
+#define BUTTON1_DOUBLE_CLICKED 0x00000008L
+#define BUTTON1_TRIPLE_CLICKED 0x00000010L
+#define BUTTON1_MOVED 0x00000010L /* PDCurses; same bit as BUTTON1_TRIPLE_CLICKED */
+
+#define BUTTON2_RELEASED 0x00000020L
+#define BUTTON2_PRESSED 0x00000040L
+#define BUTTON2_CLICKED 0x00000080L
+#define BUTTON2_DOUBLE_CLICKED 0x00000100L
+#define BUTTON2_TRIPLE_CLICKED 0x00000200L
+#define BUTTON2_MOVED 0x00000200L /* PDCurses; same bit as BUTTON2_TRIPLE_CLICKED */
+
+#define BUTTON3_RELEASED 0x00000400L
+#define BUTTON3_PRESSED 0x00000800L
+#define BUTTON3_CLICKED 0x00001000L
+#define BUTTON3_DOUBLE_CLICKED 0x00002000L
+#define BUTTON3_TRIPLE_CLICKED 0x00004000L
+#define BUTTON3_MOVED 0x00004000L /* PDCurses; same bit as BUTTON3_TRIPLE_CLICKED */
+
+/* For the ncurses-compatible functions only, BUTTON4_PRESSED and
+ BUTTON5_PRESSED are returned for mouse scroll wheel up and down;
+ otherwise PDCurses doesn't support buttons 4 and 5 */
+
+#define BUTTON4_RELEASED 0x00008000L
+#define BUTTON4_PRESSED 0x00010000L
+#define BUTTON4_CLICKED 0x00020000L
+#define BUTTON4_DOUBLE_CLICKED 0x00040000L
+#define BUTTON4_TRIPLE_CLICKED 0x00080000L
+
+#define BUTTON5_RELEASED 0x00100000L
+#define BUTTON5_PRESSED 0x00200000L
+#define BUTTON5_CLICKED 0x00400000L
+#define BUTTON5_DOUBLE_CLICKED 0x00800000L
+#define BUTTON5_TRIPLE_CLICKED 0x01000000L
+
+#define MOUSE_WHEEL_SCROLL 0x02000000L /* PDCurses */
+#define BUTTON_MODIFIER_SHIFT 0x04000000L /* PDCurses */
+#define BUTTON_MODIFIER_CONTROL 0x08000000L /* PDCurses */
+#define BUTTON_MODIFIER_ALT 0x10000000L /* PDCurses */
+
+#define ALL_MOUSE_EVENTS 0x1fffffffL /* bits 0-28: every button, wheel and modifier bit defined above */
+#define REPORT_MOUSE_POSITION 0x20000000L /* bit 29; not included in ALL_MOUSE_EVENTS */
+
+/* ncurses mouse interface */
+
+typedef unsigned long mmask_t; /* mouse event bit mask; type of MEVENT.bstate */
+
+typedef struct
+{
+ short id; /* unused, always 0 */
+ int x, y, z; /* x, y same as MOUSE_STATUS; z unused */
+ mmask_t bstate; /* equivalent to changes + button[], but
+ in the same format as used for mousemask() */
+} MEVENT;
+
+#ifdef NCURSES_MOUSE_VERSION
+# define BUTTON_SHIFT BUTTON_MODIFIER_SHIFT
+# define BUTTON_CONTROL BUTTON_MODIFIER_CONTROL
+# define BUTTON_CTRL BUTTON_MODIFIER_CONTROL
+# define BUTTON_ALT BUTTON_MODIFIER_ALT
+#else /* traditional PDCurses mouse API; note that BUTTON_CTRL is not defined in this branch */
+# define BUTTON_SHIFT PDC_BUTTON_SHIFT
+# define BUTTON_CONTROL PDC_BUTTON_CONTROL
+# define BUTTON_ALT PDC_BUTTON_ALT
+#endif
+
+/*----------------------------------------------------------------------
+ *
+ * PDCurses Structure Definitions
+ *
+ */
+
+typedef struct _win /* definition of a window */
+{
+ int _cury; /* current pseudo-cursor row */
+ int _curx; /* current pseudo-cursor column */
+ int _maxy; /* max window coordinates (y) */
+ int _maxx; /* max window coordinates (x) */
+ int _begy; /* origin on screen (y) */
+ int _begx; /* origin on screen (x) */
+ int _flags; /* window properties */
+ chtype _attrs; /* standard attributes and colors */
+ chtype _bkgd; /* background, normally blank */
+ bool _clear; /* causes clear at next refresh */
+ bool _leaveit; /* leaves cursor where it is */
+ bool _scroll; /* allows window scrolling */
+ bool _nodelay; /* input character wait flag */
+ bool _immed; /* immediate update flag */
+ bool _sync; /* synchronise window ancestors */
+ bool _use_keypad; /* flags keypad key mode active */
+ chtype **_y; /* pointer to line pointer array */
+ int *_firstch; /* first changed character in line (presumably one entry per line -- confirm) */
+ int *_lastch; /* last changed character in line (presumably one entry per line -- confirm) */
+ int _tmarg; /* top of scrolling region */
+ int _bmarg; /* bottom of scrolling region */
+ int _delayms; /* milliseconds of delay for getch() */
+ int _parx, _pary; /* coords relative to parent (0,0) */
+ struct _win *_parent; /* subwin's pointer to parent win */
+} WINDOW;
+
+/* Avoid using the SCREEN struct directly -- use the corresponding
+ functions if possible. This struct may eventually be made private. */
+
+typedef struct
+{
+ bool alive; /* if initscr() called, and not endwin() */
+ bool autocr; /* if cr -> lf */
+ bool cbreak; /* if terminal unbuffered */
+ bool echo; /* if terminal echo */
+ bool raw_inp; /* raw input mode (v. cooked input) */
+ bool raw_out; /* raw output mode (7 v. 8 bits) */
+ bool audible; /* FALSE if the bell is visual */
+ bool mono; /* TRUE if current screen is mono */
+ bool resized; /* TRUE if TERM has been resized */
+ bool orig_attr; /* TRUE if we have the original colors */
+ short orig_fore; /* original screen foreground color */
+ short orig_back; /* original screen background color */
+ int cursrow; /* position of physical cursor (row) */
+ int curscol; /* position of physical cursor (column) */
+ int visibility; /* visibility of cursor */
+ int orig_cursor; /* original cursor size */
+ int lines; /* new value for LINES */
+ int cols; /* new value for COLS */
+ unsigned long _trap_mbe; /* trap these mouse button events */
+ unsigned long _map_mbe_to_key; /* map mouse buttons to slk */
+ int mouse_wait; /* time to wait (in ms) for a
+ button release after a press, in
+ order to count it as a click */
+ int slklines; /* lines in use by slk_init() */
+ WINDOW *slk_winptr; /* window for slk */
+ int linesrippedoff; /* lines ripped off via ripoffline() */
+ int linesrippedoffontop; /* lines ripped off on
+ top via ripoffline() */
+ int delaytenths; /* 1/10ths second to wait block
+ getch() for */
+ bool _preserve; /* TRUE if screen background
+ to be preserved */
+ int _restore; /* specifies if screen background
+ to be restored, and how */
+ bool save_key_modifiers; /* TRUE if each key modifiers saved
+ with each key press */
+ bool return_key_modifiers; /* TRUE if modifier keys are
+ returned as "real" keys */
+ bool key_code; /* TRUE if last key is a special key;
+ used internally by get_wch() */
+#ifdef XCURSES
+ int XcurscrSize; /* size of Xcurscr shared memory block */
+ bool sb_on; /* NOTE(review): the sb_* fields look like scrollbar state for the X11 port -- confirm */
+ int sb_viewport_y;
+ int sb_viewport_x;
+ int sb_total_y;
+ int sb_total_x;
+ int sb_cur_y;
+ int sb_cur_x;
+#endif
+ short line_color; /* color of line attributes - default -1 */
+} SCREEN;
+
+/*----------------------------------------------------------------------
+ *
+ * PDCurses External Variables
+ *
+ */
+
+#ifdef PDC_DLL_BUILD /* PDCEX: linkage prefix for the external variables below */
+# ifdef CURSES_LIBRARY /* presumably set while building the library itself -- TODO confirm */
+# define PDCEX __declspec(dllexport) extern
+# else /* !CURSES_LIBRARY */
+# define PDCEX __declspec(dllimport)
+# endif /* CURSES_LIBRARY */
+#else /* !PDC_DLL_BUILD */
+# define PDCEX extern
+#endif /* PDC_DLL_BUILD */
+
+PDCEX int LINES; /* terminal height */
+PDCEX int COLS; /* terminal width */
+PDCEX WINDOW *stdscr; /* the default screen window */
+PDCEX WINDOW *curscr; /* the current screen image */
+PDCEX SCREEN *SP; /* curses variables (the SCREEN struct) */
+PDCEX MOUSE_STATUS Mouse_status; /* presumably the latest mouse state -- TODO confirm */
+PDCEX int COLORS; /* presumably number of colors available -- TODO confirm */
+PDCEX int COLOR_PAIRS; /* presumably number of color pairs available -- TODO confirm */
+PDCEX int TABSIZE; /* presumably tab stop width in columns -- TODO confirm */
+PDCEX chtype acs_map[]; /* alternate character set map */
+PDCEX char ttytype[]; /* terminal name/description */
+
+/*man-start**************************************************************
+
+PDCurses Text Attributes
+========================
+
+Originally, PDCurses used a short (16 bits) for its chtype. To include
+color, a number of things had to be sacrificed from the strict Unix and
+System V support. The main problem was fitting all character attributes
+and color into an unsigned char (all 8 bits!).
+
+Today, PDCurses by default uses a long (32 bits) for its chtype, as in
+System V. The short chtype is still available, by undefining CHTYPE_LONG
+and rebuilding the library.
+
+The following is the structure of a win->_attrs chtype:
+
+short form:
+
+-------------------------------------------------
+|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
+-------------------------------------------------
+ color number | attrs | character eg 'a'
+
+The available non-color attributes are bold, reverse and blink. Others
+have no effect. The high order char is an index into an array of
+physical colors (defined in color.c) -- 32 foreground/background color
+pairs (5 bits) plus 3 bits for other attributes.
+
+long form:
+
+----------------------------------------------------------------------------
+|31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|..| 3| 2| 1| 0|
+----------------------------------------------------------------------------
+ color number | modifiers | character eg 'a'
+
+The available non-color attributes are bold, underline, invisible,
+right-line, left-line, protect, reverse and blink. 256 color pairs (8
+bits), 8 bits for other attributes, and 16 bits for character data.
+
+**man-end****************************************************************/
+
+/*** Video attribute macros ***/
+
+#define A_NORMAL (chtype)0
+
+#ifdef CHTYPE_LONG /* 32-bit chtype */
+# define A_ALTCHARSET (chtype)0x00010000
+# define A_RIGHTLINE (chtype)0x00020000
+# define A_LEFTLINE (chtype)0x00040000
+# define A_INVIS (chtype)0x00080000
+# define A_UNDERLINE (chtype)0x00100000
+# define A_REVERSE (chtype)0x00200000
+# define A_BLINK (chtype)0x00400000
+# define A_BOLD (chtype)0x00800000
+
+# define A_ATTRIBUTES (chtype)0xffff0000 /* mask: modifier + color bits */
+# define A_CHARTEXT (chtype)0x0000ffff /* mask: character bits */
+# define A_COLOR (chtype)0xff000000 /* mask: color number bits */
+
+# define A_ITALIC A_INVIS /* shares the A_INVIS bit */
+# define A_PROTECT (A_UNDERLINE | A_LEFTLINE | A_RIGHTLINE) /* no bit of its own */
+
+# define PDC_ATTR_SHIFT 19
+# define PDC_COLOR_SHIFT 24 /* lowest bit of A_COLOR */
+#else /* !CHTYPE_LONG: 16-bit chtype */
+# define A_BOLD (chtype)0x0100 /* X/Open */
+# define A_REVERSE (chtype)0x0200 /* X/Open */
+# define A_BLINK (chtype)0x0400 /* X/Open */
+
+# define A_ATTRIBUTES (chtype)0xff00 /* X/Open */
+# define A_CHARTEXT (chtype)0x00ff /* X/Open */
+# define A_COLOR (chtype)0xf800 /* System V */
+
+# define A_ALTCHARSET A_NORMAL /* X/Open */
+# define A_PROTECT A_NORMAL /* X/Open */
+# define A_UNDERLINE A_NORMAL /* X/Open */
+
+# define A_LEFTLINE A_NORMAL
+# define A_RIGHTLINE A_NORMAL
+# define A_ITALIC A_NORMAL
+# define A_INVIS A_NORMAL
+
+# define PDC_ATTR_SHIFT 8
+# define PDC_COLOR_SHIFT 11 /* lowest bit of A_COLOR */
+#endif /* CHTYPE_LONG */
+
+#define A_STANDOUT (A_REVERSE | A_BOLD) /* X/Open */
+#define A_DIM A_NORMAL
+
+#define CHR_MSK A_CHARTEXT /* Obsolete */
+#define ATR_MSK A_ATTRIBUTES /* Obsolete */
+#define ATR_NRM A_NORMAL /* Obsolete */
+
+/* For use with attr_t -- X/Open says, "these shall be distinct", so
+ this is a non-conforming implementation. */
+
+#define WA_ALTCHARSET A_ALTCHARSET
+#define WA_BLINK A_BLINK
+#define WA_BOLD A_BOLD
+#define WA_DIM A_DIM
+#define WA_INVIS A_INVIS
+#define WA_LEFT A_LEFTLINE
+#define WA_PROTECT A_PROTECT
+#define WA_REVERSE A_REVERSE
+#define WA_RIGHT A_RIGHTLINE
+#define WA_STANDOUT A_STANDOUT
+#define WA_UNDERLINE A_UNDERLINE
+
+#define WA_HORIZONTAL A_NORMAL
+#define WA_LOW A_NORMAL
+#define WA_TOP A_NORMAL
+#define WA_VERTICAL A_NORMAL
+
+/*** Alternate character set macros ***/
+
+/* ACS_PICK(w, n): 'w' = 32-bit chtype; acs_map[] index | A_ALTCHARSET
+ 'n' = 16-bit chtype; it gets the fallback set because no bit is
+ available for A_ALTCHARSET. Arguments are parenthesized for safe expansion. */
+
+#ifdef CHTYPE_LONG
+# define ACS_PICK(w, n) ((chtype)(w) | A_ALTCHARSET)
+#else /* !CHTYPE_LONG */
+# define ACS_PICK(w, n) ((chtype)(n))
+#endif /* CHTYPE_LONG */
+
+/* VT100-compatible symbols -- box chars */
+
+#define ACS_ULCORNER ACS_PICK('l', '+')
+#define ACS_LLCORNER ACS_PICK('m', '+')
+#define ACS_URCORNER ACS_PICK('k', '+')
+#define ACS_LRCORNER ACS_PICK('j', '+')
+#define ACS_RTEE ACS_PICK('u', '+')
+#define ACS_LTEE ACS_PICK('t', '+')
+#define ACS_BTEE ACS_PICK('v', '+')
+#define ACS_TTEE ACS_PICK('w', '+')
+#define ACS_HLINE ACS_PICK('q', '-')
+#define ACS_VLINE ACS_PICK('x', '|')
+#define ACS_PLUS ACS_PICK('n', '+')
+
+/* VT100-compatible symbols -- other */
+
+#define ACS_S1 ACS_PICK('o', '-')
+#define ACS_S9 ACS_PICK('s', '_')
+#define ACS_DIAMOND ACS_PICK('`', '+')
+#define ACS_CKBOARD ACS_PICK('a', ':')
+#define ACS_DEGREE ACS_PICK('f', '\'')
+#define ACS_PLMINUS ACS_PICK('g', '#')
+#define ACS_BULLET ACS_PICK('~', 'o')
+
+/* Teletype 5410v1 symbols -- these are defined in SysV curses, but
+ are not well-supported by most terminals. Stick to VT100 characters
+ for optimum portability. */
+
+#define ACS_LARROW ACS_PICK(',', '<')
+#define ACS_RARROW ACS_PICK('+', '>')
+#define ACS_DARROW ACS_PICK('.', 'v')
+#define ACS_UARROW ACS_PICK('-', '^')
+#define ACS_BOARD ACS_PICK('h', '#')
+#define ACS_LANTERN ACS_PICK('i', '*')
+#define ACS_BLOCK ACS_PICK('0', '#')
+
+/* That goes double for these -- undocumented SysV symbols. Don't use
+ them. */
+
+#define ACS_S3 ACS_PICK('p', '-')
+#define ACS_S7 ACS_PICK('r', '-')
+#define ACS_LEQUAL ACS_PICK('y', '<')
+#define ACS_GEQUAL ACS_PICK('z', '>')
+#define ACS_PI ACS_PICK('{', 'n')
+#define ACS_NEQUAL ACS_PICK('|', '+')
+#define ACS_STERLING ACS_PICK('}', 'L')
+
+/* Box char aliases */
+
+#define ACS_BSSB ACS_ULCORNER
+#define ACS_SSBB ACS_LLCORNER
+#define ACS_BBSS ACS_URCORNER
+#define ACS_SBBS ACS_LRCORNER
+#define ACS_SBSS ACS_RTEE
+#define ACS_SSSB ACS_LTEE
+#define ACS_SSBS ACS_BTEE
+#define ACS_BSSS ACS_TTEE
+#define ACS_BSBS ACS_HLINE
+#define ACS_SBSB ACS_VLINE
+#define ACS_SSSS ACS_PLUS
+
+/* cchar_t aliases */
+
+#ifdef PDC_WIDE /* each WACS_* macro is the address of an acs_map[] entry */
+# define WACS_ULCORNER (&(acs_map['l']))
+# define WACS_LLCORNER (&(acs_map['m']))
+# define WACS_URCORNER (&(acs_map['k']))
+# define WACS_LRCORNER (&(acs_map['j']))
+# define WACS_RTEE (&(acs_map['u']))
+# define WACS_LTEE (&(acs_map['t']))
+# define WACS_BTEE (&(acs_map['v']))
+# define WACS_TTEE (&(acs_map['w']))
+# define WACS_HLINE (&(acs_map['q']))
+# define WACS_VLINE (&(acs_map['x']))
+# define WACS_PLUS (&(acs_map['n']))
+
+# define WACS_S1 (&(acs_map['o']))
+# define WACS_S9 (&(acs_map['s']))
+# define WACS_DIAMOND (&(acs_map['`']))
+# define WACS_CKBOARD (&(acs_map['a']))
+# define WACS_DEGREE (&(acs_map['f']))
+# define WACS_PLMINUS (&(acs_map['g']))
+# define WACS_BULLET (&(acs_map['~']))
+
+# define WACS_LARROW (&(acs_map[',']))
+# define WACS_RARROW (&(acs_map['+']))
+# define WACS_DARROW (&(acs_map['.']))
+# define WACS_UARROW (&(acs_map['-']))
+# define WACS_BOARD (&(acs_map['h']))
+# define WACS_LANTERN (&(acs_map['i']))
+# define WACS_BLOCK (&(acs_map['0']))
+
+# define WACS_S3 (&(acs_map['p']))
+# define WACS_S7 (&(acs_map['r']))
+# define WACS_LEQUAL (&(acs_map['y']))
+# define WACS_GEQUAL (&(acs_map['z']))
+# define WACS_PI (&(acs_map['{']))
+# define WACS_NEQUAL (&(acs_map['|']))
+# define WACS_STERLING (&(acs_map['}']))
+
+# define WACS_BSSB WACS_ULCORNER
+# define WACS_SSBB WACS_LLCORNER
+# define WACS_BBSS WACS_URCORNER
+# define WACS_SBBS WACS_LRCORNER
+# define WACS_SBSS WACS_RTEE
+# define WACS_SSSB WACS_LTEE
+# define WACS_SSBS WACS_BTEE
+# define WACS_BSSS WACS_TTEE
+# define WACS_BSBS WACS_HLINE
+# define WACS_SBSB WACS_VLINE
+# define WACS_SSSS WACS_PLUS
+#endif /* PDC_WIDE */
+
+/*** Color macros ***/
+
+#define COLOR_BLACK 0 /* no component bits set */
+
+#ifdef PDC_RGB /* RGB: red is bit 0, blue is bit 2 */
+# define COLOR_RED 1
+# define COLOR_GREEN 2
+# define COLOR_BLUE 4
+#else /* BGR: blue is bit 0, red is bit 2 */
+# define COLOR_BLUE 1
+# define COLOR_GREEN 2
+# define COLOR_RED 4
+#endif /* PDC_RGB */
+
+#define COLOR_CYAN (COLOR_BLUE | COLOR_GREEN)
+#define COLOR_MAGENTA (COLOR_RED | COLOR_BLUE)
+#define COLOR_YELLOW (COLOR_RED | COLOR_GREEN)
+
+#define COLOR_WHITE 7 /* all three component bits set (1 | 2 | 4) */
+
+/*----------------------------------------------------------------------
+ *
+ * Function and Keypad Key Definitions.
+ * Many are just for compatibility.
+ *
+ */
+
+#define KEY_CODE_YES 0x100 /* If get_wch() gives a key code */
+
+#define KEY_BREAK 0x101 /* Not on PC KBD */
+#define KEY_DOWN 0x102 /* Down arrow key */
+#define KEY_UP 0x103 /* Up arrow key */
+#define KEY_LEFT 0x104 /* Left arrow key */
+#define KEY_RIGHT 0x105 /* Right arrow key */
+#define KEY_HOME 0x106 /* home key */
+#define KEY_BACKSPACE 0x107 /* not on pc */
+#define KEY_F0 0x108 /* function keys; 64 reserved */
+
+#define KEY_DL 0x148 /* delete line */
+#define KEY_IL 0x149 /* insert line */
+#define KEY_DC 0x14a /* delete character */
+#define KEY_IC 0x14b /* insert char or enter ins mode */
+#define KEY_EIC 0x14c /* exit insert char mode */
+#define KEY_CLEAR 0x14d /* clear screen */
+#define KEY_EOS 0x14e /* clear to end of screen */
+#define KEY_EOL 0x14f /* clear to end of line */
+#define KEY_SF 0x150 /* scroll 1 line forward */
+#define KEY_SR 0x151 /* scroll 1 line back (reverse) */
+#define KEY_NPAGE 0x152 /* next page */
+#define KEY_PPAGE 0x153 /* previous page */
+#define KEY_STAB 0x154 /* set tab */
+#define KEY_CTAB 0x155 /* clear tab */
+#define KEY_CATAB 0x156 /* clear all tabs */
+#define KEY_ENTER 0x157 /* enter or send (unreliable) */
+#define KEY_SRESET 0x158 /* soft/reset (partial/unreliable) */
+#define KEY_RESET 0x159 /* reset/hard reset (unreliable) */
+#define KEY_PRINT 0x15a /* print/copy */
+#define KEY_LL 0x15b /* home down/bottom (lower left) */
+#define KEY_ABORT 0x15c /* abort/terminate key (any) */
+#define KEY_SHELP 0x15d /* short help */
+#define KEY_LHELP 0x15e /* long help */
+#define KEY_BTAB 0x15f /* Back tab key */
+#define KEY_BEG 0x160 /* beg(inning) key */
+#define KEY_CANCEL 0x161 /* cancel key */
+#define KEY_CLOSE 0x162 /* close key */
+#define KEY_COMMAND 0x163 /* cmd (command) key */
+#define KEY_COPY 0x164 /* copy key */
+#define KEY_CREATE 0x165 /* create key */
+#define KEY_END 0x166 /* end key */
+#define KEY_EXIT 0x167 /* exit key */
+#define KEY_FIND 0x168 /* find key */
+#define KEY_HELP 0x169 /* help key */
+#define KEY_MARK 0x16a /* mark key */
+#define KEY_MESSAGE 0x16b /* message key */
+#define KEY_MOVE 0x16c /* move key */
+#define KEY_NEXT 0x16d /* next object key */
+#define KEY_OPEN 0x16e /* open key */
+#define KEY_OPTIONS 0x16f /* options key */
+#define KEY_PREVIOUS 0x170 /* previous object key */
+#define KEY_REDO 0x171 /* redo key */
+#define KEY_REFERENCE 0x172 /* ref(erence) key */
+#define KEY_REFRESH 0x173 /* refresh key */
+#define KEY_REPLACE 0x174 /* replace key */
+#define KEY_RESTART 0x175 /* restart key */
+#define KEY_RESUME 0x176 /* resume key */
+#define KEY_SAVE 0x177 /* save key */
+#define KEY_SBEG 0x178 /* shifted beginning key */
+#define KEY_SCANCEL 0x179 /* shifted cancel key */
+#define KEY_SCOMMAND 0x17a /* shifted command key */
+#define KEY_SCOPY 0x17b /* shifted copy key */
+#define KEY_SCREATE 0x17c /* shifted create key */
+#define KEY_SDC 0x17d /* shifted delete char key */
+#define KEY_SDL 0x17e /* shifted delete line key */
+#define KEY_SELECT 0x17f /* select key */
+#define KEY_SEND 0x180 /* shifted end key */
+#define KEY_SEOL 0x181 /* shifted clear line key */
+#define KEY_SEXIT 0x182 /* shifted exit key */
+#define KEY_SFIND 0x183 /* shifted find key */
+#define KEY_SHOME 0x184 /* shifted home key */
+#define KEY_SIC 0x185 /* shifted input key */
+
+#define KEY_SLEFT 0x187 /* shifted left arrow key */
+#define KEY_SMESSAGE 0x188 /* shifted message key */
+#define KEY_SMOVE 0x189 /* shifted move key */
+#define KEY_SNEXT 0x18a /* shifted next key */
+#define KEY_SOPTIONS 0x18b /* shifted options key */
+#define KEY_SPREVIOUS 0x18c /* shifted prev key */
+#define KEY_SPRINT 0x18d /* shifted print key */
+#define KEY_SREDO 0x18e /* shifted redo key */
+#define KEY_SREPLACE 0x18f /* shifted replace key */
+#define KEY_SRIGHT 0x190 /* shifted right arrow */
+#define KEY_SRSUME 0x191 /* shifted resume key */
+#define KEY_SSAVE 0x192 /* shifted save key */
+#define KEY_SSUSPEND 0x193 /* shifted suspend key */
+#define KEY_SUNDO 0x194 /* shifted undo key */
+#define KEY_SUSPEND 0x195 /* suspend key */
+#define KEY_UNDO 0x196 /* undo key */
+
+/* PDCurses-specific key definitions -- PC only */
+
+#define ALT_0 0x197
+#define ALT_1 0x198
+#define ALT_2 0x199
+#define ALT_3 0x19a
+#define ALT_4 0x19b
+#define ALT_5 0x19c
+#define ALT_6 0x19d
+#define ALT_7 0x19e
+#define ALT_8 0x19f
+#define ALT_9 0x1a0
+#define ALT_A 0x1a1
+#define ALT_B 0x1a2
+#define ALT_C 0x1a3
+#define ALT_D 0x1a4
+#define ALT_E 0x1a5
+#define ALT_F 0x1a6
+#define ALT_G 0x1a7
+#define ALT_H 0x1a8
+#define ALT_I 0x1a9
+#define ALT_J 0x1aa
+#define ALT_K 0x1ab
+#define ALT_L 0x1ac
+#define ALT_M 0x1ad
+#define ALT_N 0x1ae
+#define ALT_O 0x1af
+#define ALT_P 0x1b0
+#define ALT_Q 0x1b1
+#define ALT_R 0x1b2
+#define ALT_S 0x1b3
+#define ALT_T 0x1b4
+#define ALT_U 0x1b5
+#define ALT_V 0x1b6
+#define ALT_W 0x1b7
+#define ALT_X 0x1b8
+#define ALT_Y 0x1b9
+#define ALT_Z 0x1ba
+
+#define CTL_LEFT 0x1bb /* Control-Left-Arrow */
+#define CTL_RIGHT 0x1bc
+#define CTL_PGUP 0x1bd
+#define CTL_PGDN 0x1be
+#define CTL_HOME 0x1bf
+#define CTL_END 0x1c0
+
+#define KEY_A1 0x1c1 /* upper left on Virtual keypad */
+#define KEY_A2 0x1c2 /* upper middle on Virt. keypad */
+#define KEY_A3 0x1c3 /* upper right on Vir. keypad */
+#define KEY_B1 0x1c4 /* middle left on Virt. keypad */
+#define KEY_B2 0x1c5 /* center on Virt. keypad */
+#define KEY_B3 0x1c6 /* middle right on Vir. keypad */
+#define KEY_C1 0x1c7 /* lower left on Virt. keypad */
+#define KEY_C2 0x1c8 /* lower middle on Virt. keypad */
+#define KEY_C3 0x1c9 /* lower right on Vir. keypad */
+
+#define PADSLASH 0x1ca /* slash on keypad */
+#define PADENTER 0x1cb /* enter on keypad */
+#define CTL_PADENTER 0x1cc /* ctl-enter on keypad */
+#define ALT_PADENTER 0x1cd /* alt-enter on keypad */
+#define PADSTOP 0x1ce /* stop on keypad */
+#define PADSTAR 0x1cf /* star on keypad */
+#define PADMINUS 0x1d0 /* minus on keypad */
+#define PADPLUS 0x1d1 /* plus on keypad */
+#define CTL_PADSTOP 0x1d2 /* ctl-stop on keypad */
+#define CTL_PADCENTER 0x1d3 /* ctl-center on keypad (TODO confirm) */
+#define CTL_PADPLUS 0x1d4 /* ctl-plus on keypad */
+#define CTL_PADMINUS 0x1d5 /* ctl-minus on keypad */
+#define CTL_PADSLASH 0x1d6 /* ctl-slash on keypad */
+#define CTL_PADSTAR 0x1d7 /* ctl-star on keypad */
+#define ALT_PADPLUS 0x1d8 /* alt-plus on keypad */
+#define ALT_PADMINUS 0x1d9 /* alt-minus on keypad */
+#define ALT_PADSLASH 0x1da /* alt-slash on keypad */
+#define ALT_PADSTAR 0x1db /* alt-star on keypad */
+#define ALT_PADSTOP 0x1dc /* alt-stop on keypad */
+#define CTL_INS 0x1dd /* ctl-insert */
+#define ALT_DEL 0x1de /* alt-delete */
+#define ALT_INS 0x1df /* alt-insert */
+#define CTL_UP 0x1e0 /* ctl-up arrow */
+#define CTL_DOWN 0x1e1 /* ctl-down arrow */
+#define CTL_TAB 0x1e2 /* ctl-tab */
+#define ALT_TAB 0x1e3
+#define ALT_MINUS 0x1e4
+#define ALT_EQUAL 0x1e5
+#define ALT_HOME 0x1e6
+#define ALT_PGUP 0x1e7
+#define ALT_PGDN 0x1e8
+#define ALT_END 0x1e9
+#define ALT_UP 0x1ea /* alt-up arrow */
+#define ALT_DOWN 0x1eb /* alt-down arrow */
+#define ALT_RIGHT 0x1ec /* alt-right arrow */
+#define ALT_LEFT 0x1ed /* alt-left arrow */
+#define ALT_ENTER 0x1ee /* alt-enter */
+#define ALT_ESC 0x1ef /* alt-escape */
+#define ALT_BQUOTE 0x1f0 /* alt-back quote */
+#define ALT_LBRACKET 0x1f1 /* alt-left bracket */
+#define ALT_RBRACKET 0x1f2 /* alt-right bracket */
+#define ALT_SEMICOLON 0x1f3 /* alt-semi-colon */
+#define ALT_FQUOTE 0x1f4 /* alt-forward quote */
+#define ALT_COMMA 0x1f5 /* alt-comma */
+#define ALT_STOP 0x1f6 /* alt-stop */
+#define ALT_FSLASH 0x1f7 /* alt-forward slash */
+#define ALT_BKSP 0x1f8 /* alt-backspace */
+#define CTL_BKSP 0x1f9 /* ctl-backspace */
+#define PAD0 0x1fa /* keypad 0 */
+
+#define CTL_PAD0 0x1fb /* ctl-keypad 0 */
+#define CTL_PAD1 0x1fc
+#define CTL_PAD2 0x1fd
+#define CTL_PAD3 0x1fe
+#define CTL_PAD4 0x1ff
+#define CTL_PAD5 0x200
+#define CTL_PAD6 0x201
+#define CTL_PAD7 0x202
+#define CTL_PAD8 0x203
+#define CTL_PAD9 0x204
+
+#define ALT_PAD0 0x205 /* alt-keypad 0 */
+#define ALT_PAD1 0x206
+#define ALT_PAD2 0x207
+#define ALT_PAD3 0x208
+#define ALT_PAD4 0x209
+#define ALT_PAD5 0x20a
+#define ALT_PAD6 0x20b
+#define ALT_PAD7 0x20c
+#define ALT_PAD8 0x20d
+#define ALT_PAD9 0x20e
+
+#define CTL_DEL 0x20f /* ctl-delete */
+#define ALT_BSLASH 0x210 /* alt-back slash */
+#define CTL_ENTER 0x211 /* ctl-enter */
+
+#define SHF_PADENTER 0x212 /* shift-enter on keypad */
+#define SHF_PADSLASH 0x213 /* shift-slash on keypad */
+#define SHF_PADSTAR 0x214 /* shift-star on keypad */
+#define SHF_PADPLUS 0x215 /* shift-plus on keypad */
+#define SHF_PADMINUS 0x216 /* shift-minus on keypad */
+#define SHF_UP 0x217 /* shift-up on keypad */
+#define SHF_DOWN 0x218 /* shift-down on keypad */
+#define SHF_IC 0x219 /* shift-insert on keypad */
+#define SHF_DC 0x21a /* shift-delete on keypad */
+
+#define KEY_MOUSE 0x21b /* "mouse" key */
+#define KEY_SHIFT_L 0x21c /* Left-shift */
+#define KEY_SHIFT_R 0x21d /* Right-shift */
+#define KEY_CONTROL_L 0x21e /* Left-control */
+#define KEY_CONTROL_R 0x21f /* Right-control */
+#define KEY_ALT_L 0x220 /* Left-alt */
+#define KEY_ALT_R 0x221 /* Right-alt */
+#define KEY_RESIZE 0x222 /* Window resize */
+#define KEY_SUP 0x223 /* Shifted up arrow */
+#define KEY_SDOWN 0x224 /* Shifted down arrow */
+
+#define KEY_MIN KEY_BREAK /* Minimum curses key value */
+#define KEY_MAX KEY_SDOWN /* Maximum curses key */
+
+#define KEY_F(n) (KEY_F0 + (n))
+
+/*----------------------------------------------------------------------
+ *
+ * PDCurses Function Declarations
+ *
+ */
+
+/* Standard */
+
+int addch(const chtype);
+int addchnstr(const chtype *, int);
+int addchstr(const chtype *);
+int addnstr(const char *, int);
+int addstr(const char *);
+int attroff(chtype);
+int attron(chtype);
+int attrset(chtype);
+int attr_get(attr_t *, short *, void *);
+int attr_off(attr_t, void *);
+int attr_on(attr_t, void *);
+int attr_set(attr_t, short, void *);
+int baudrate(void);
+int beep(void);
+int bkgd(chtype);
+void bkgdset(chtype);
+int border(chtype, chtype, chtype, chtype, chtype, chtype, chtype, chtype);
+int box(WINDOW *, chtype, chtype);
+bool can_change_color(void);
+int cbreak(void);
+int chgat(int, attr_t, short, const void *);
+int clearok(WINDOW *, bool);
+int clear(void);
+int clrtobot(void);
+int clrtoeol(void);
+int color_content(short, short *, short *, short *);
+int color_set(short, void *);
+int copywin(const WINDOW *, WINDOW *, int, int, int, int, int, int, int);
+int curs_set(int);
+int def_prog_mode(void);
+int def_shell_mode(void);
+int delay_output(int);
+int delch(void);
+int deleteln(void);
+void delscreen(SCREEN *);
+int delwin(WINDOW *);
+WINDOW *derwin(WINDOW *, int, int, int, int);
+int doupdate(void);
+WINDOW *dupwin(WINDOW *);
+int echochar(const chtype);
+int echo(void);
+int endwin(void);
+char erasechar(void);
+int erase(void);
+void filter(void);
+int flash(void);
+int flushinp(void);
+chtype getbkgd(WINDOW *);
+int getnstr(char *, int);
+int getstr(char *);
+WINDOW *getwin(FILE *);
+int halfdelay(int);
+bool has_colors(void);
+bool has_ic(void);
+bool has_il(void);
+int hline(chtype, int);
+void idcok(WINDOW *, bool);
+int idlok(WINDOW *, bool);
+void immedok(WINDOW *, bool);
+int inchnstr(chtype *, int);
+int inchstr(chtype *);
+chtype inch(void);
+int init_color(short, short, short, short);
+int init_pair(short, short, short);
+WINDOW *initscr(void);
+int innstr(char *, int);
+int insch(chtype);
+int insdelln(int);
+int insertln(void);
+int insnstr(const char *, int);
+int insstr(const char *);
+int instr(char *);
+int intrflush(WINDOW *, bool);
+bool isendwin(void);
+bool is_linetouched(WINDOW *, int);
+bool is_wintouched(WINDOW *);
+char *keyname(int);
+int keypad(WINDOW *, bool);
+char killchar(void);
+int leaveok(WINDOW *, bool);
+char *longname(void);
+int meta(WINDOW *, bool);
+int move(int, int);
+int mvaddch(int, int, const chtype);
+int mvaddchnstr(int, int, const chtype *, int);
+int mvaddchstr(int, int, const chtype *);
+int mvaddnstr(int, int, const char *, int);
+int mvaddstr(int, int, const char *);
+int mvchgat(int, int, int, attr_t, short, const void *);
+int mvcur(int, int, int, int);
+int mvdelch(int, int);
+int mvderwin(WINDOW *, int, int);
+int mvgetch(int, int);
+int mvgetnstr(int, int, char *, int);
+int mvgetstr(int, int, char *);
+int mvhline(int, int, chtype, int);
+chtype mvinch(int, int);
+int mvinchnstr(int, int, chtype *, int);
+int mvinchstr(int, int, chtype *);
+int mvinnstr(int, int, char *, int);
+int mvinsch(int, int, chtype);
+int mvinsnstr(int, int, const char *, int);
+int mvinsstr(int, int, const char *);
+int mvinstr(int, int, char *);
+int mvprintw(int, int, const char *, ...);
+int mvscanw(int, int, const char *, ...);
+int mvvline(int, int, chtype, int);
+int mvwaddchnstr(WINDOW *, int, int, const chtype *, int);
+int mvwaddchstr(WINDOW *, int, int, const chtype *);
+int mvwaddch(WINDOW *, int, int, const chtype);
+int mvwaddnstr(WINDOW *, int, int, const char *, int);
+int mvwaddstr(WINDOW *, int, int, const char *);
+int mvwchgat(WINDOW *, int, int, int, attr_t, short, const void *);
+int mvwdelch(WINDOW *, int, int);
+int mvwgetch(WINDOW *, int, int);
+int mvwgetnstr(WINDOW *, int, int, char *, int);
+int mvwgetstr(WINDOW *, int, int, char *);
+int mvwhline(WINDOW *, int, int, chtype, int);
+int mvwinchnstr(WINDOW *, int, int, chtype *, int);
+int mvwinchstr(WINDOW *, int, int, chtype *);
+chtype mvwinch(WINDOW *, int, int);
+int mvwinnstr(WINDOW *, int, int, char *, int);
+int mvwinsch(WINDOW *, int, int, chtype);
+int mvwinsnstr(WINDOW *, int, int, const char *, int);
+int mvwinsstr(WINDOW *, int, int, const char *);
+int mvwinstr(WINDOW *, int, int, char *);
+int mvwin(WINDOW *, int, int);
+int mvwprintw(WINDOW *, int, int, const char *, ...);
+int mvwscanw(WINDOW *, int, int, const char *, ...);
+int mvwvline(WINDOW *, int, int, chtype, int);
+int napms(int);
+WINDOW *newpad(int, int);
+SCREEN *newterm(const char *, FILE *, FILE *);
+WINDOW *newwin(int, int, int, int);
+int nl(void);
+int nocbreak(void);
+int nodelay(WINDOW *, bool);
+int noecho(void);
+int nonl(void);
+void noqiflush(void);
+int noraw(void);
+int notimeout(WINDOW *, bool);
+int overlay(const WINDOW *, WINDOW *);
+int overwrite(const WINDOW *, WINDOW *);
+int pair_content(short, short *, short *);
+int pechochar(WINDOW *, chtype);
+int pnoutrefresh(WINDOW *, int, int, int, int, int, int);
+int prefresh(WINDOW *, int, int, int, int, int, int);
+int printw(const char *, ...);
+int putwin(WINDOW *, FILE *);
+void qiflush(void);
+int raw(void);
+int redrawwin(WINDOW *);
+int refresh(void);
+int reset_prog_mode(void);
+int reset_shell_mode(void);
+int resetty(void);
+int ripoffline(int, int (*)(WINDOW *, int));
+int savetty(void);
+int scanw(const char *, ...);
+int scr_dump(const char *);
+int scr_init(const char *);
+int scr_restore(const char *);
+int scr_set(const char *);
+int scrl(int);
+int scroll(WINDOW *);
+int scrollok(WINDOW *, bool);
+SCREEN *set_term(SCREEN *);
+int setscrreg(int, int);
+int slk_attroff(const chtype);
+int slk_attr_off(const attr_t, void *);
+int slk_attron(const chtype);
+int slk_attr_on(const attr_t, void *);
+int slk_attrset(const chtype);
+int slk_attr_set(const attr_t, short, void *);
+int slk_clear(void);
+int slk_color(short);
+int slk_init(int);
+char *slk_label(int);
+int slk_noutrefresh(void);
+int slk_refresh(void);
+int slk_restore(void);
+int slk_set(int, const char *, int);
+int slk_touch(void);
+int standend(void);
+int standout(void);
+int start_color(void);
+WINDOW *subpad(WINDOW *, int, int, int, int);
+WINDOW *subwin(WINDOW *, int, int, int, int);
+int syncok(WINDOW *, bool);
+chtype termattrs(void);
+attr_t term_attrs(void);
+char *termname(void);
+void timeout(int);
+int touchline(WINDOW *, int, int);
+int touchwin(WINDOW *);
+int typeahead(int);
+int untouchwin(WINDOW *);
+void use_env(bool);
+int vidattr(chtype);
+int vid_attr(attr_t, short, void *);
+int vidputs(chtype, int (*)(int));
+int vid_puts(attr_t, short, void *, int (*)(int));
+int vline(chtype, int);
+int vw_printw(WINDOW *, const char *, va_list);
+int vwprintw(WINDOW *, const char *, va_list);
+int vw_scanw(WINDOW *, const char *, va_list);
+int vwscanw(WINDOW *, const char *, va_list);
+int waddchnstr(WINDOW *, const chtype *, int);
+int waddchstr(WINDOW *, const chtype *);
+int waddch(WINDOW *, const chtype);
+int waddnstr(WINDOW *, const char *, int);
+int waddstr(WINDOW *, const char *);
+int wattroff(WINDOW *, chtype);
+int wattron(WINDOW *, chtype);
+int wattrset(WINDOW *, chtype);
+int wattr_get(WINDOW *, attr_t *, short *, void *);
+int wattr_off(WINDOW *, attr_t, void *);
+int wattr_on(WINDOW *, attr_t, void *);
+int wattr_set(WINDOW *, attr_t, short, void *);
+void wbkgdset(WINDOW *, chtype);
+int wbkgd(WINDOW *, chtype);
+int wborder(WINDOW *, chtype, chtype, chtype, chtype,
+ chtype, chtype, chtype, chtype);
+int wchgat(WINDOW *, int, attr_t, short, const void *);
+int wclear(WINDOW *);
+int wclrtobot(WINDOW *);
+int wclrtoeol(WINDOW *);
+int wcolor_set(WINDOW *, short, void *);
+void wcursyncup(WINDOW *);
+int wdelch(WINDOW *);
+int wdeleteln(WINDOW *);
+int wechochar(WINDOW *, const chtype);
+int werase(WINDOW *);
+int wgetch(WINDOW *);
+int wgetnstr(WINDOW *, char *, int);
+int wgetstr(WINDOW *, char *);
+int whline(WINDOW *, chtype, int);
+int winchnstr(WINDOW *, chtype *, int);
+int winchstr(WINDOW *, chtype *);
+chtype winch(WINDOW *);
+int winnstr(WINDOW *, char *, int);
+int winsch(WINDOW *, chtype);
+int winsdelln(WINDOW *, int);
+int winsertln(WINDOW *);
+int winsnstr(WINDOW *, const char *, int);
+int winsstr(WINDOW *, const char *);
+int winstr(WINDOW *, char *);
+int wmove(WINDOW *, int, int);
+int wnoutrefresh(WINDOW *);
+int wprintw(WINDOW *, const char *, ...);
+int wredrawln(WINDOW *, int, int);
+int wrefresh(WINDOW *);
+int wscanw(WINDOW *, const char *, ...);
+int wscrl(WINDOW *, int);
+int wsetscrreg(WINDOW *, int, int);
+int wstandend(WINDOW *);
+int wstandout(WINDOW *);
+void wsyncdown(WINDOW *);
+void wsyncup(WINDOW *);
+void wtimeout(WINDOW *, int);
+int wtouchln(WINDOW *, int, int, int);
+int wvline(WINDOW *, chtype, int);
+
+/* Wide-character functions */
+
+#ifdef PDC_WIDE
+int addnwstr(const wchar_t *, int);
+int addwstr(const wchar_t *);
+int add_wch(const cchar_t *);
+int add_wchnstr(const cchar_t *, int);
+int add_wchstr(const cchar_t *);
+int border_set(const cchar_t *, const cchar_t *, const cchar_t *,
+ const cchar_t *, const cchar_t *, const cchar_t *,
+ const cchar_t *, const cchar_t *);
+int box_set(WINDOW *, const cchar_t *, const cchar_t *);
+int echo_wchar(const cchar_t *);
+int erasewchar(wchar_t *);
+int getbkgrnd(cchar_t *);
+int getcchar(const cchar_t *, wchar_t *, attr_t *, short *, void *);
+int getn_wstr(wint_t *, int);
+int get_wch(wint_t *);
+int get_wstr(wint_t *);
+int hline_set(const cchar_t *, int);
+int innwstr(wchar_t *, int);
+int ins_nwstr(const wchar_t *, int);
+int ins_wch(const cchar_t *);
+int ins_wstr(const wchar_t *);
+int inwstr(wchar_t *);
+int in_wch(cchar_t *);
+int in_wchnstr(cchar_t *, int);
+int in_wchstr(cchar_t *);
+char *key_name(wchar_t);
+int killwchar(wchar_t *);
+int mvaddnwstr(int, int, const wchar_t *, int);
+int mvaddwstr(int, int, const wchar_t *);
+int mvadd_wch(int, int, const cchar_t *);
+int mvadd_wchnstr(int, int, const cchar_t *, int);
+int mvadd_wchstr(int, int, const cchar_t *);
+int mvgetn_wstr(int, int, wint_t *, int);
+int mvget_wch(int, int, wint_t *);
+int mvget_wstr(int, int, wint_t *);
+int mvhline_set(int, int, const cchar_t *, int);
+int mvinnwstr(int, int, wchar_t *, int);
+int mvins_nwstr(int, int, const wchar_t *, int);
+int mvins_wch(int, int, const cchar_t *);
+int mvins_wstr(int, int, const wchar_t *);
+int mvinwstr(int, int, wchar_t *);
+int mvin_wch(int, int, cchar_t *);
+int mvin_wchnstr(int, int, cchar_t *, int);
+int mvin_wchstr(int, int, cchar_t *);
+int mvvline_set(int, int, const cchar_t *, int);
+int mvwaddnwstr(WINDOW *, int, int, const wchar_t *, int);
+int mvwaddwstr(WINDOW *, int, int, const wchar_t *);
+int mvwadd_wch(WINDOW *, int, int, const cchar_t *);
+int mvwadd_wchnstr(WINDOW *, int, int, const cchar_t *, int);
+int mvwadd_wchstr(WINDOW *, int, int, const cchar_t *);
+int mvwgetn_wstr(WINDOW *, int, int, wint_t *, int);
+int mvwget_wch(WINDOW *, int, int, wint_t *);
+int mvwget_wstr(WINDOW *, int, int, wint_t *);
+int mvwhline_set(WINDOW *, int, int, const cchar_t *, int);
+int mvwinnwstr(WINDOW *, int, int, wchar_t *, int);
+int mvwins_nwstr(WINDOW *, int, int, const wchar_t *, int);
+int mvwins_wch(WINDOW *, int, int, const cchar_t *);
+int mvwins_wstr(WINDOW *, int, int, const wchar_t *);
+int mvwin_wch(WINDOW *, int, int, cchar_t *);
+int mvwin_wchnstr(WINDOW *, int, int, cchar_t *, int);
+int mvwin_wchstr(WINDOW *, int, int, cchar_t *);
+int mvwinwstr(WINDOW *, int, int, wchar_t *);
+int mvwvline_set(WINDOW *, int, int, const cchar_t *, int);
+int pecho_wchar(WINDOW *, const cchar_t*);
+int setcchar(cchar_t*, const wchar_t*, const attr_t, short, const void*);
+int slk_wset(int, const wchar_t *, int);
+int unget_wch(const wchar_t);
+int vline_set(const cchar_t *, int);
+int waddnwstr(WINDOW *, const wchar_t *, int);
+int waddwstr(WINDOW *, const wchar_t *);
+int wadd_wch(WINDOW *, const cchar_t *);
+int wadd_wchnstr(WINDOW *, const cchar_t *, int);
+int wadd_wchstr(WINDOW *, const cchar_t *);
+int wbkgrnd(WINDOW *, const cchar_t *);
+void wbkgrndset(WINDOW *, const cchar_t *);
+int wborder_set(WINDOW *, const cchar_t *, const cchar_t *,
+ const cchar_t *, const cchar_t *, const cchar_t *,
+ const cchar_t *, const cchar_t *, const cchar_t *);
+int wecho_wchar(WINDOW *, const cchar_t *);
+int wgetbkgrnd(WINDOW *, cchar_t *);
+int wgetn_wstr(WINDOW *, wint_t *, int);
+int wget_wch(WINDOW *, wint_t *);
+int wget_wstr(WINDOW *, wint_t *);
+int whline_set(WINDOW *, const cchar_t *, int);
+int winnwstr(WINDOW *, wchar_t *, int);
+int wins_nwstr(WINDOW *, const wchar_t *, int);
+int wins_wch(WINDOW *, const cchar_t *);
+int wins_wstr(WINDOW *, const wchar_t *);
+int winwstr(WINDOW *, wchar_t *);
+int win_wch(WINDOW *, cchar_t *);
+int win_wchnstr(WINDOW *, cchar_t *, int);
+int win_wchstr(WINDOW *, cchar_t *);
+wchar_t *wunctrl(cchar_t *);
+int wvline_set(WINDOW *, const cchar_t *, int);
+#endif
+
+/* Quasi-standard */
+
+chtype getattrs(WINDOW *);
+int getbegx(WINDOW *);
+int getbegy(WINDOW *);
+int getmaxx(WINDOW *);
+int getmaxy(WINDOW *);
+int getparx(WINDOW *);
+int getpary(WINDOW *);
+int getcurx(WINDOW *);
+int getcury(WINDOW *);
+void traceoff(void);
+void traceon(void);
+char *unctrl(chtype);
+
+int crmode(void);
+int nocrmode(void);
+int draino(int);
+int resetterm(void);
+int fixterm(void);
+int saveterm(void);
+int setsyx(int, int);
+
+int mouse_set(unsigned long);
+int mouse_on(unsigned long);
+int mouse_off(unsigned long);
+int request_mouse_pos(void);
+int map_button(unsigned long);
+void wmouse_position(WINDOW *, int *, int *);
+unsigned long getmouse(void);
+unsigned long getbmap(void);
+
+/* ncurses */
+
+int assume_default_colors(int, int);
+const char *curses_version(void);
+bool has_key(int);
+int use_default_colors(void);
+int wresize(WINDOW *, int, int);
+
+int mouseinterval(int);
+mmask_t mousemask(mmask_t, mmask_t *);
+bool mouse_trafo(int *, int *, bool);
+int nc_getmouse(MEVENT *);
+int ungetmouse(MEVENT *);
+bool wenclose(const WINDOW *, int, int);
+bool wmouse_trafo(const WINDOW *, int *, int *, bool);
+
+/* PDCurses */
+
+int addrawch(chtype);
+int insrawch(chtype);
+bool is_termresized(void);
+int mvaddrawch(int, int, chtype);
+int mvdeleteln(int, int);
+int mvinsertln(int, int);
+int mvinsrawch(int, int, chtype);
+int mvwaddrawch(WINDOW *, int, int, chtype);
+int mvwdeleteln(WINDOW *, int, int);
+int mvwinsertln(WINDOW *, int, int);
+int mvwinsrawch(WINDOW *, int, int, chtype);
+int raw_output(bool);
+int resize_term(int, int);
+WINDOW *resize_window(WINDOW *, int, int);
+int waddrawch(WINDOW *, chtype);
+int winsrawch(WINDOW *, chtype);
+char wordchar(void);
+
+#ifdef PDC_WIDE
+wchar_t *slk_wlabel(int);
+#endif
+
+void PDC_debug(const char *, ...);
+int PDC_ungetch(int);
+int PDC_set_blink(bool);
+int PDC_set_line_color(short);
+void PDC_set_title(const char *);
+
+int PDC_clearclipboard(void);
+int PDC_freeclipboard(char *);
+int PDC_getclipboard(char **, long *);
+int PDC_setclipboard(const char *, long);
+
+unsigned long PDC_get_input_fd(void);
+unsigned long PDC_get_key_modifiers(void);
+int PDC_return_key_modifiers(bool);
+int PDC_save_key_modifiers(bool);
+
+#ifdef XCURSES
+WINDOW *Xinitscr(int, char **);
+void XCursesExit(void);
+int sb_init(void);
+int sb_set_horz(int, int, int);
+int sb_set_vert(int, int, int);
+int sb_get_horz(int *, int *, int *);
+int sb_get_vert(int *, int *, int *);
+int sb_refresh(void);
+#endif
+
+/*** Functions defined as macros ***/
+
+/* getch() and ungetch() conflict with some DOS libraries */
+
+#define getch() wgetch(stdscr)
+#define ungetch(ch) PDC_ungetch(ch)
+
+#define COLOR_PAIR(n) (((chtype)(n) << PDC_COLOR_SHIFT) & A_COLOR)
+#define PAIR_NUMBER(n) (((n) & A_COLOR) >> PDC_COLOR_SHIFT)
+
+/* These will _only_ work as macros */
+
+#define getbegyx(w, y, x) (y = getbegy(w), x = getbegx(w))
+#define getmaxyx(w, y, x) (y = getmaxy(w), x = getmaxx(w))
+#define getparyx(w, y, x) (y = getpary(w), x = getparx(w))
+#define getyx(w, y, x) (y = getcury(w), x = getcurx(w))
+
+#define getsyx(y, x) { if (curscr->_leaveit) (y)=(x)=-1; \
+ else getyx(curscr,(y),(x)); }
+
+#ifdef NCURSES_MOUSE_VERSION
+# define getmouse(x) nc_getmouse(x)
+#endif
+
+/* return codes from PDC_getclipboard() and PDC_setclipboard() calls */
+
+#define PDC_CLIP_SUCCESS 0
+#define PDC_CLIP_ACCESS_ERROR 1
+#define PDC_CLIP_EMPTY 2
+#define PDC_CLIP_MEMORY_ERROR 3
+
+/* PDCurses key modifier masks */
+
+#define PDC_KEY_MODIFIER_SHIFT 1
+#define PDC_KEY_MODIFIER_CONTROL 2
+#define PDC_KEY_MODIFIER_ALT 4
+#define PDC_KEY_MODIFIER_NUMLOCK 8
+
+#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS)
+# undef bool
+}
+#endif
+
+#endif /* __PDCURSES__ */
diff --git a/samtools-0.1.19/win32/zconf.h b/samtools-0.1.19/win32/zconf.h
new file mode 100644
index 0000000..03a9431
--- /dev/null
+++ b/samtools-0.1.19/win32/zconf.h
@@ -0,0 +1,332 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#ifndef ZCONF_H
+#define ZCONF_H
+
+/*
+ * If you *really* need a unique prefix for all types and library functions,
+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ */
+#ifdef Z_PREFIX
+# define deflateInit_ z_deflateInit_
+# define deflate z_deflate
+# define deflateEnd z_deflateEnd
+# define inflateInit_ z_inflateInit_
+# define inflate z_inflate
+# define inflateEnd z_inflateEnd
+# define deflateInit2_ z_deflateInit2_
+# define deflateSetDictionary z_deflateSetDictionary
+# define deflateCopy z_deflateCopy
+# define deflateReset z_deflateReset
+# define deflateParams z_deflateParams
+# define deflateBound z_deflateBound
+# define deflatePrime z_deflatePrime
+# define inflateInit2_ z_inflateInit2_
+# define inflateSetDictionary z_inflateSetDictionary
+# define inflateSync z_inflateSync
+# define inflateSyncPoint z_inflateSyncPoint
+# define inflateCopy z_inflateCopy
+# define inflateReset z_inflateReset
+# define inflateBack z_inflateBack
+# define inflateBackEnd z_inflateBackEnd
+# define compress z_compress
+# define compress2 z_compress2
+# define compressBound z_compressBound
+# define uncompress z_uncompress
+# define adler32 z_adler32
+# define crc32 z_crc32
+# define get_crc_table z_get_crc_table
+# define zError z_zError
+
+# define alloc_func z_alloc_func
+# define free_func z_free_func
+# define in_func z_in_func
+# define out_func z_out_func
+# define Byte z_Byte
+# define uInt z_uInt
+# define uLong z_uLong
+# define Bytef z_Bytef
+# define charf z_charf
+# define intf z_intf
+# define uIntf z_uIntf
+# define uLongf z_uLongf
+# define voidpf z_voidpf
+# define voidp z_voidp
+#endif
+
+#if defined(__MSDOS__) && !defined(MSDOS)
+# define MSDOS
+#endif
+#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2)
+# define OS2
+#endif
+#if defined(_WINDOWS) && !defined(WINDOWS)
+# define WINDOWS
+#endif
+#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__)
+# ifndef WIN32
+# define WIN32
+# endif
+#endif
+#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32)
+# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__)
+# ifndef SYS16BIT
+# define SYS16BIT
+# endif
+# endif
+#endif
+
+/*
+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
+ * than 64k bytes at a time (needed on systems with 16-bit int).
+ */
+#ifdef SYS16BIT
+# define MAXSEG_64K
+#endif
+#ifdef MSDOS
+# define UNALIGNED_OK
+#endif
+
+#ifdef __STDC_VERSION__
+# ifndef STDC
+# define STDC
+# endif
+# if __STDC_VERSION__ >= 199901L
+# ifndef STDC99
+# define STDC99
+# endif
+# endif
+#endif
+#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus))
+# define STDC
+#endif
+#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__))
+# define STDC
+#endif
+#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32))
+# define STDC
+#endif
+#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__))
+# define STDC
+#endif
+
+#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */
+# define STDC
+#endif
+
+#ifndef STDC
+# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
+# define const /* note: need a more gentle solution here */
+# endif
+#endif
+
+/* Some Mac compilers merge all .h files incorrectly: */
+#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__)
+# define NO_DUMMY_DECL
+#endif
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+# ifdef MAXSEG_64K
+# define MAX_MEM_LEVEL 8
+# else
+# define MAX_MEM_LEVEL 9
+# endif
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MAX_WBITS
+# define MAX_WBITS 15 /* 32K LZ77 window */
+#endif
+
+/* The memory requirements for deflate are (in bytes):
+ (1 << (windowBits+2)) + (1 << (memLevel+9))
+ that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+ make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+ The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus a few kilobytes
+ for small objects.
+*/
+
+ /* Type declarations */
+
+#ifndef OF /* function prototypes */
+# ifdef STDC
+# define OF(args) args
+# else
+# define OF(args) ()
+# endif
+#endif
+
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h. If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#ifdef SYS16BIT
+# if defined(M_I86SM) || defined(M_I86MM)
+ /* MSC small or medium model */
+# define SMALL_MEDIUM
+# ifdef _MSC_VER
+# define FAR _far
+# else
+# define FAR far
+# endif
+# endif
+# if (defined(__SMALL__) || defined(__MEDIUM__))
+ /* Turbo C small or medium model */
+# define SMALL_MEDIUM
+# ifdef __BORLANDC__
+# define FAR _far
+# else
+# define FAR far
+# endif
+# endif
+#endif
+
+#if defined(WINDOWS) || defined(WIN32)
+ /* If building or using zlib as a DLL, define ZLIB_DLL.
+ * This is not mandatory, but it offers a little performance increase.
+ */
+# ifdef ZLIB_DLL
+# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
+# ifdef ZLIB_INTERNAL
+# define ZEXTERN extern __declspec(dllexport)
+# else
+# define ZEXTERN extern __declspec(dllimport)
+# endif
+# endif
+# endif /* ZLIB_DLL */
+ /* If building or using zlib with the WINAPI/WINAPIV calling convention,
+ * define ZLIB_WINAPI.
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+ */
+# ifdef ZLIB_WINAPI
+# ifdef FAR
+# undef FAR
+# endif
+# include <windows.h>
+ /* No need for _export, use ZLIB.DEF instead. */
+ /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+# define ZEXPORT WINAPI
+# ifdef WIN32
+# define ZEXPORTVA WINAPIV
+# else
+# define ZEXPORTVA FAR CDECL
+# endif
+# endif
+#endif
+
+#if defined (__BEOS__)
+# ifdef ZLIB_DLL
+# ifdef ZLIB_INTERNAL
+# define ZEXPORT __declspec(dllexport)
+# define ZEXPORTVA __declspec(dllexport)
+# else
+# define ZEXPORT __declspec(dllimport)
+# define ZEXPORTVA __declspec(dllimport)
+# endif
+# endif
+#endif
+
+#ifndef ZEXTERN
+# define ZEXTERN extern
+#endif
+#ifndef ZEXPORT
+# define ZEXPORT
+#endif
+#ifndef ZEXPORTVA
+# define ZEXPORTVA
+#endif
+
+#ifndef FAR
+# define FAR
+#endif
+
+#if !defined(__MACTYPES__)
+typedef unsigned char Byte; /* 8 bits */
+#endif
+typedef unsigned int uInt; /* 16 bits or more */
+typedef unsigned long uLong; /* 32 bits or more */
+
+#ifdef SMALL_MEDIUM
+ /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */
+# define Bytef Byte FAR
+#else
+ typedef Byte FAR Bytef;
+#endif
+typedef char FAR charf;
+typedef int FAR intf;
+typedef uInt FAR uIntf;
+typedef uLong FAR uLongf;
+
+#ifdef STDC
+ typedef void const *voidpc;
+ typedef void FAR *voidpf;
+ typedef void *voidp;
+#else
+ typedef Byte const *voidpc;
+ typedef Byte FAR *voidpf;
+ typedef Byte *voidp;
+#endif
+
+#if 0 /* HAVE_UNISTD_H -- this line is updated by ./configure */
+# include <sys/types.h> /* for off_t */
+# include <unistd.h> /* for SEEK_* and off_t */
+# ifdef VMS
+# include <unixio.h> /* for off_t */
+# endif
+# define z_off_t off_t
+#endif
+#ifndef SEEK_SET
+# define SEEK_SET 0 /* Seek from beginning of file. */
+# define SEEK_CUR 1 /* Seek from current position. */
+# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */
+#endif
+#ifndef z_off_t
+# define z_off_t long
+#endif
+
+#if defined(__OS400__)
+# define NO_vsnprintf
+#endif
+
+#if defined(__MVS__)
+# define NO_vsnprintf
+# ifdef FAR
+# undef FAR
+# endif
+#endif
+
+/* MVS linker does not support external names larger than 8 bytes */
+#if defined(__MVS__)
+# pragma map(deflateInit_,"DEIN")
+# pragma map(deflateInit2_,"DEIN2")
+# pragma map(deflateEnd,"DEEND")
+# pragma map(deflateBound,"DEBND")
+# pragma map(inflateInit_,"ININ")
+# pragma map(inflateInit2_,"ININ2")
+# pragma map(inflateEnd,"INEND")
+# pragma map(inflateSync,"INSY")
+# pragma map(inflateSetDictionary,"INSEDI")
+# pragma map(compressBound,"CMBND")
+# pragma map(inflate_table,"INTABL")
+# pragma map(inflate_fast,"INFA")
+# pragma map(inflate_copyright,"INCOPY")
+#endif
+
+#endif /* ZCONF_H */
diff --git a/samtools-0.1.19/win32/zlib.h b/samtools-0.1.19/win32/zlib.h
new file mode 100644
index 0000000..0228179
--- /dev/null
+++ b/samtools-0.1.19/win32/zlib.h
@@ -0,0 +1,1357 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.3, July 18th, 2005
+
+ Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup at gzip.org madler at alumni.caltech.edu
+
+
+ The data format used by the zlib library is described by RFCs (Request for
+ Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+ (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+#ifndef ZLIB_H
+#define ZLIB_H
+
+#include "zconf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZLIB_VERSION "1.2.3"
+#define ZLIB_VERNUM 0x1230
+
+/*
+ The 'zlib' compression library provides in-memory compression and
+ decompression functions, including integrity checks of the uncompressed
+ data. This version of the library supports only one compression method
+ (deflation) but other algorithms will be added later and will have the same
+ stream interface.
+
+ Compression can be done in a single step if the buffers are large
+ enough (for example if an input file is mmap'ed), or can be done by
+ repeated calls of the compression function. In the latter case, the
+ application must provide more input and/or consume the output
+ (providing more output space) before each call.
+
+ The compressed data format used by default by the in-memory functions is
+ the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+ around a deflate stream, which is itself documented in RFC 1951.
+
+ The library also supports reading and writing files in gzip (.gz) format
+ with an interface similar to that of stdio using the functions that start
+ with "gz". The gzip format is different from the zlib format. gzip is a
+ gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+
+ This library can optionally read and write gzip streams in memory as well.
+
+ The zlib format was designed to be compact and fast for use in memory
+ and on communications channels. The gzip format was designed for single-
+ file compression on file systems, has a larger header than zlib to maintain
+ directory information, and uses a different, slower check method than zlib.
+
+ The library does not install any signal handler. The decoder checks
+ the consistency of the compressed data, so the library should never
+ crash even in case of corrupted input.
+*/
+
+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
+typedef void (*free_func) OF((voidpf opaque, voidpf address));
+
+struct internal_state;
+
+typedef struct z_stream_s {
+ Bytef *next_in; /* next input byte */
+ uInt avail_in; /* number of bytes available at next_in */
+ uLong total_in; /* total number of input bytes read so far */
+
+ Bytef *next_out; /* next output byte should be put there */
+ uInt avail_out; /* remaining free space at next_out */
+ uLong total_out; /* total number of bytes output so far */
+
+ char *msg; /* last error message, NULL if no error */
+ struct internal_state FAR *state; /* not visible by applications */
+
+ alloc_func zalloc; /* used to allocate the internal state */
+ free_func zfree; /* used to free the internal state */
+ voidpf opaque; /* private data object passed to zalloc and zfree */
+
+ int data_type; /* deflate: guessed data type (binary/text); inflate: bit/block state */
+ uLong adler; /* adler32 value of the uncompressed data */
+ uLong reserved; /* reserved for future use */
+} z_stream;
+
+typedef z_stream FAR *z_streamp;
+
+/*
+ gzip header information passed to and from zlib routines. See RFC 1952
+ for more details on the meanings of these fields.
+*/
+typedef struct gz_header_s {
+ int text; /* true if compressed data believed to be text */
+ uLong time; /* modification time */
+ int xflags; /* extra flags (not used when writing a gzip file) */
+ int os; /* operating system */
+ Bytef *extra; /* pointer to extra field or Z_NULL if none */
+ uInt extra_len; /* extra field length (valid if extra != Z_NULL) */
+ uInt extra_max; /* space at extra (only when reading header) */
+ Bytef *name; /* pointer to zero-terminated file name or Z_NULL */
+ uInt name_max; /* space at name (only when reading header) */
+ Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */
+ uInt comm_max; /* space at comment (only when reading header) */
+ int hcrc; /* true if there was or will be a header crc */
+ int done; /* true when done reading gzip header (not used
+ when writing a gzip file) */
+} gz_header;
+
+typedef gz_header FAR *gz_headerp;
+
+/*
+ The application must update next_in and avail_in when avail_in has
+ dropped to zero. It must update next_out and avail_out when avail_out
+ has dropped to zero. The application must initialize zalloc, zfree and
+ opaque before calling the init function. All other fields are set by the
+ compression library and must not be updated by the application.
+
+ The opaque value provided by the application will be passed as the first
+ parameter for calls of zalloc and zfree. This can be useful for custom
+ memory management. The compression library attaches no meaning to the
+ opaque value.
+
+ zalloc must return Z_NULL if there is not enough memory for the object.
+ If zlib is used in a multi-threaded application, zalloc and zfree must be
+ thread safe.
+
+ On 16-bit systems, the functions zalloc and zfree must be able to allocate
+ exactly 65536 bytes, but will not be required to allocate more than this
+ if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
+ pointers returned by zalloc for objects of exactly 65536 bytes *must*
+ have their offset normalized to zero. The default allocation function
+ provided by this library ensures this (see zutil.c). To reduce memory
+ requirements and avoid any allocation of 64K objects, at the expense of
+ compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
+
+ The fields total_in and total_out can be used for statistics or
+ progress reports. After compression, total_in holds the total size of
+ the uncompressed data and may be saved for use in the decompressor
+ (particularly if the decompressor wants to decompress everything in
+ a single step).
+*/
+
+ /* constants */
+
+#define Z_NO_FLUSH 0
+#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */
+#define Z_SYNC_FLUSH 2
+#define Z_FULL_FLUSH 3
+#define Z_FINISH 4
+#define Z_BLOCK 5
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK 0
+#define Z_STREAM_END 1
+#define Z_NEED_DICT 2
+#define Z_ERRNO (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR (-3)
+#define Z_MEM_ERROR (-4)
+#define Z_BUF_ERROR (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative
+ * values are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION 0
+#define Z_BEST_SPEED 1
+#define Z_BEST_COMPRESSION 9
+#define Z_DEFAULT_COMPRESSION (-1)
+/* compression levels */
+
+#define Z_FILTERED 1
+#define Z_HUFFMAN_ONLY 2
+#define Z_RLE 3
+#define Z_FIXED 4
+#define Z_DEFAULT_STRATEGY 0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY 0
+#define Z_TEXT 1
+#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN 2
+/* Possible values of the data_type field (though see inflate()) */
+
+#define Z_DEFLATED 8
+/* The deflate compression method (the only one supported in this version) */
+
+#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+
+ /* basic functions */
+
+ZEXTERN const char * ZEXPORT zlibVersion OF((void));
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+ If the first character differs, the library code actually used is
+ not compatible with the zlib.h header file used by the application.
+ This check is automatically made by deflateInit and inflateInit.
+ */
+
+/*
+ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
+
+ Initializes the internal stream state for compression. The fields
+ zalloc, zfree and opaque must be initialized before by the caller.
+ If zalloc and zfree are set to Z_NULL, deflateInit updates them to
+ use default allocation functions.
+
+ The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+ 1 gives best speed, 9 gives best compression, 0 gives no compression at
+ all (the input data is simply copied a block at a time).
+ Z_DEFAULT_COMPRESSION requests a default compromise between speed and
+ compression (currently equivalent to level 6).
+
+ deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if level is not a valid compression level,
+ Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+ with the version assumed by the caller (ZLIB_VERSION).
+ msg is set to null if there is no error message. deflateInit does not
+ perform any compression: this will be done by deflate().
+*/
+
+
+ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
+/*
+ deflate compresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce some
+ output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. deflate performs one or both of the
+ following actions:
+
+ - Compress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in and avail_in are updated and
+ processing will resume at this point for the next call of deflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. This action is forced if the parameter flush is non zero.
+ Forcing flush frequently degrades the compression ratio, so this parameter
+ should be set only when necessary (in interactive applications).
+ Some output may be provided even if flush is not set.
+
+ Before the call of deflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming
+ more output, and updating avail_in or avail_out accordingly; avail_out
+ should never be zero before the call. The application can consume the
+ compressed output when it wants, for example when the output buffer is full
+ (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
+ and with zero avail_out, it must be called again after making room in the
+ output buffer because there might be more output pending.
+
+ Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+ decide how much data to accumulate before producing output, in order to
+ maximize compression.
+
+ If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+ flushed to the output buffer and the output is aligned on a byte boundary, so
+ that the decompressor can get all input data available so far. (In particular
+ avail_in is zero after the call if enough output space has been provided
+ before the call.) Flushing may degrade compression for some compression
+ algorithms and so it should be used only when necessary.
+
+ If flush is set to Z_FULL_FLUSH, all output is flushed as with
+ Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+ restart from this point if previous compressed data has been damaged or if
+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+ compression.
+
+ If deflate returns with avail_out == 0, this function must be called again
+ with the same value of the flush parameter and more output space (updated
+ avail_out), until the flush is complete (deflate returns with non-zero
+ avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+ avail_out is greater than six to avoid repeated flush markers due to
+ avail_out == 0 on return.
+
+ If the parameter flush is set to Z_FINISH, pending input is processed,
+ pending output is flushed and deflate returns with Z_STREAM_END if there
+ was enough output space; if deflate returns with Z_OK, this function must be
+ called again with Z_FINISH and more output space (updated avail_out) but no
+ more input data, until it returns with Z_STREAM_END or an error. After
+ deflate has returned Z_STREAM_END, the only possible operations on the
+ stream are deflateReset or deflateEnd.
+
+ Z_FINISH can be used immediately after deflateInit if all the compression
+ is to be done in a single step. In this case, avail_out must be at least
+ the value returned by deflateBound (see below). If deflate does not return
+ Z_STREAM_END, then it must be called again as described above.
+
+ deflate() sets strm->adler to the adler32 checksum of all input read
+ so far (that is, total_in bytes).
+
+ deflate() may update strm->data_type if it can make a good guess about
+ the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
+ binary. This field is only for information purposes and does not affect
+ the compression algorithm in any manner.
+
+ deflate() returns Z_OK if some progress has been made (more input
+ processed or more output produced), Z_STREAM_END if all input has been
+ consumed and all output has been produced (only when flush is set to
+ Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+ if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
+ (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
+ fatal, and deflate() can be called again with more input and more output
+ space to continue compressing.
+*/
+
+
+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any
+ pending output.
+
+ deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+ stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+ prematurely (some input or output was discarded). In the error case,
+ msg may be set but then points to a static string (which must not be
+ deallocated).
+*/
+
+
+/*
+ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
+
+ Initializes the internal stream state for decompression. The fields
+ next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+ the caller. If next_in is not Z_NULL and avail_in is large enough (the exact
+ value depends on the compression method), inflateInit determines the
+ compression method from the zlib header and allocates all data structures
+ accordingly; otherwise the allocation will be deferred to the first call of
+ inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to
+ use default allocation functions.
+
+ inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller. msg is set to null if there is no error
+ message. inflateInit does not perform any decompression apart from reading
+ the zlib header if present: this will be done by inflate(). (So next_in and
+ avail_in may be modified, but next_out and avail_out are unchanged.)
+*/
+
+
+ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
+/*
+ inflate decompresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
+ forced to flush.
+
+ The detailed semantics are as follows. inflate performs one or both of the
+ following actions:
+
+ - Decompress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in and avail_in are updated and
+ processing will resume at this point for the next call of inflate().
+
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. inflate() provides as much output as possible, until there
+ is no more input data or no more space in the output buffer (see below
+ about the flush parameter).
+
+ Before the call of inflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming
+ more output, and updating the next_* and avail_* values accordingly.
+ The application can consume the uncompressed output when it wants, for
+ example when the output buffer is full (avail_out == 0), or after each
+ call of inflate(). If inflate returns Z_OK and with zero avail_out, it
+ must be called again after making room in the output buffer because there
+ might be more output pending.
+
+ The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
+ Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
+ output as possible to the output buffer. Z_BLOCK requests that inflate() stop
+ if and when it gets to the next deflate block boundary. When decoding the
+ zlib or gzip format, this will cause inflate() to return immediately after
+ the header and before the first block. When doing a raw inflate, inflate()
+ will go ahead and process the first block, and will return when it gets to
+ the end of that block, or when it runs out of data.
+
+ The Z_BLOCK option assists in appending to or combining deflate streams.
+ Also to assist in this, on return inflate() will set strm->data_type to the
+ number of unused bits in the last byte taken from strm->next_in, plus 64
+ if inflate() is currently decoding the last block in the deflate stream,
+ plus 128 if inflate() returned immediately after decoding an end-of-block
+ code or decoding the complete header up to just before the first byte of the
+ deflate stream. The end-of-block will not be indicated until all of the
+ uncompressed data from that block has been written to strm->next_out. The
+ number of unused bits may in general be greater than seven, except when
+ bit 7 of data_type is set, in which case the number of unused bits will be
+ less than eight.
+
+ inflate() should normally be called until it returns Z_STREAM_END or an
+ error. However if all decompression is to be performed in a single step
+ (a single call of inflate), the parameter flush should be set to
+ Z_FINISH. In this case all pending input is processed and all pending
+ output is flushed; avail_out must be large enough to hold all the
+ uncompressed data. (The size of the uncompressed data may have been saved
+ by the compressor for this purpose.) The next operation on this stream must
+ be inflateEnd to deallocate the decompression state. The use of Z_FINISH
+ is never required, but can be used to inform inflate that a faster approach
+ may be used for the single inflate() call.
+
+ In this implementation, inflate() always flushes as much output as
+ possible to the output buffer, and always uses the faster approach on the
+ first call. So the only effect of the flush parameter in this implementation
+ is on the return value of inflate(), as noted below, or when it returns early
+ because Z_BLOCK is used.
+
+ If a preset dictionary is needed after this call (see inflateSetDictionary
+ below), inflate sets strm->adler to the adler32 checksum of the dictionary
+ chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+ strm->adler to the adler32 checksum of all output produced so far (that is,
+ total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+ below. At the end of the stream, inflate() checks that its computed adler32
+ checksum is equal to that saved by the compressor and returns Z_STREAM_END
+ only if the checksum is correct.
+
+ inflate() will decompress and check either zlib-wrapped or gzip-wrapped
+ deflate data. The header type is detected automatically. Any information
+ contained in the gzip header is not retained, so applications that need that
+ information should instead use raw inflate, see inflateInit2() below, or
+ inflateBack() and perform their own processing of the gzip header and
+ trailer.
+
+ inflate() returns Z_OK if some progress has been made (more input processed
+ or more output produced), Z_STREAM_END if the end of the compressed data has
+ been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+ preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+ corrupted (input stream not conforming to the zlib format or incorrect check
+ value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+ if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
+ Z_BUF_ERROR if no progress is possible or if there was not enough room in the
+ output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+ inflate() can be called again with more input and more output space to
+ continue decompressing. If Z_DATA_ERROR is returned, the application may then
+ call inflateSync() to look for a good compression block if a partial recovery
+ of the data is desired.
+*/
+
+
+ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any
+ pending output.
+
+ inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
+ was inconsistent. In the error case, msg may be set but then points to a
+ static string (which must not be deallocated).
+*/
+
+ /* Advanced functions */
+
+/*
+ The following functions are needed only in some special applications.
+*/
+
+/*
+ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
+ int level,
+ int method,
+ int windowBits,
+ int memLevel,
+ int strategy));
+
+ This is another version of deflateInit with more compression options. The
+ fields next_in, zalloc, zfree and opaque must be initialized beforehand by
+ the caller.
+
+ The method parameter is the compression method. It must be Z_DEFLATED in
+ this version of the library.
+
+ The windowBits parameter is the base two logarithm of the window size
+ (the size of the history buffer). It should be in the range 8..15 for this
+ version of the library. Larger values of this parameter result in better
+ compression at the expense of memory usage. The default value is 15 if
+ deflateInit is used instead.
+
+ windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
+ determines the window size. deflate() will then generate raw deflate data
+ with no zlib header or trailer, and will not compute an adler32 check value.
+
+ windowBits can also be greater than 15 for optional gzip encoding. Add
+ 16 to windowBits to write a simple gzip header and trailer around the
+ compressed data instead of a zlib wrapper. The gzip header will have no
+ file name, no extra data, no comment, no modification time (set to zero),
+ no header crc, and the operating system will be set to 255 (unknown). If a
+ gzip stream is being written, strm->adler is a crc32 instead of an adler32.
+
+ The memLevel parameter specifies how much memory should be allocated
+ for the internal compression state. memLevel=1 uses minimum memory but
+ is slow and reduces compression ratio; memLevel=9 uses maximum memory
+ for optimal speed. The default value is 8. See zconf.h for total memory
+ usage as a function of windowBits and memLevel.
+
+ The strategy parameter is used to tune the compression algorithm. Use the
+ value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+ filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+ string match), or Z_RLE to limit match distances to one (run-length
+ encoding). Filtered data consists mostly of small values with a somewhat
+ random distribution. In this case, the compression algorithm is tuned to
+ compress them better. The effect of Z_FILTERED is to force more Huffman
+ coding and less string matching; it is somewhat intermediate between
+ Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as
+ Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy
+ parameter only affects the compression ratio but not the correctness of the
+ compressed output even if it is not set appropriately. Z_FIXED prevents the
+ use of dynamic Huffman codes, allowing for a simpler decoder for special
+ applications.
+
+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
+ method). msg is set to null if there is no error message. deflateInit2 does
+ not perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the compression dictionary from the given byte sequence
+ without producing any compressed output. This function must be called
+ immediately after deflateInit, deflateInit2 or deflateReset, before any
+ call of deflate. The compressor and decompressor must use exactly the same
+ dictionary (see inflateSetDictionary).
+
+ The dictionary should consist of strings (byte sequences) that are likely
+ to be encountered later in the data to be compressed, with the most commonly
+ used strings preferably put towards the end of the dictionary. Using a
+ dictionary is most useful when the data to be compressed is short and can be
+ predicted with good accuracy; the data can then be compressed better than
+ with the default empty dictionary.
+
+ Depending on the size of the compression data structures selected by
+ deflateInit or deflateInit2, a part of the dictionary may in effect be
+ discarded, for example if the dictionary is larger than the window size
+ provided in deflateInit or deflateInit2. Thus the strings most likely to be useful should be
+ put at the end of the dictionary, not at the front. In addition, the
+ current implementation of deflate will use at most the window size minus
+ 262 bytes of the provided dictionary.
+
+ Upon return of this function, strm->adler is set to the adler32 value
+ of the dictionary; the decompressor may later use this value to determine
+ which dictionary has been used by the compressor. (The adler32 value
+ applies to the whole dictionary even if only a subset of the dictionary is
+ actually used by the compressor.) If a raw deflate was requested, then the
+ adler32 value is not computed and strm->adler is not set.
+
+ deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+ parameter is invalid (such as NULL dictionary) or the stream state is
+ inconsistent (for example if deflate has already been called for this stream
+ or if the compression method is bsort). deflateSetDictionary does not
+ perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
+
+ This function can be useful when several compression strategies will be
+ tried, for example when there are several ways of pre-processing the input
+ data with a filter. The streams that will be discarded should then be freed
+ by calling deflateEnd. Note that deflateCopy duplicates the internal
+ compression state which can be quite large, so this strategy is slow and
+ can consume lots of memory.
+
+ deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being NULL). msg is left unchanged in both source and
+ destination.
+*/
+
+ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
+/*
+ This function is equivalent to deflateEnd followed by deflateInit,
+ but does not free and reallocate all the internal compression state.
+ The stream will keep the same compression level and any other attributes
+ that may have been set by deflateInit2.
+
+ deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
+ int level,
+ int strategy));
+/*
+ Dynamically update the compression level and compression strategy. The
+ interpretation of level and strategy is as in deflateInit2. This can be
+ used to switch between compression and straight copy of the input data, or
+ to switch to a different kind of input data requiring a different
+ strategy. If the compression level is changed, the input available so far
+ is compressed with the old level (and may be flushed); the new level will
+ take effect only at the next call of deflate().
+
+ Before the call of deflateParams, the stream state must be set as for
+ a call of deflate(), since the currently available input may have to
+ be compressed and flushed. In particular, strm->avail_out must be non-zero.
+
+ deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
+ stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR
+ if strm->avail_out was zero.
+*/
+
+ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
+ int good_length,
+ int max_lazy,
+ int nice_length,
+ int max_chain));
+/*
+ Fine tune deflate's internal compression parameters. This should only be
+ used by someone who understands the algorithm used by zlib's deflate for
+ searching for the best matching string, and even then only by the most
+ fanatic optimizer trying to squeeze out the last compressed bit for their
+ specific input data. Read the deflate.c source code for the meaning of the
+ max_lazy, good_length, nice_length, and max_chain parameters.
+
+ deflateTune() can be called after deflateInit() or deflateInit2(), and
+ returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+
+ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
+ uLong sourceLen));
+/*
+ deflateBound() returns an upper bound on the compressed size after
+ deflation of sourceLen bytes. It must be called after deflateInit()
+ or deflateInit2(). This would be used to allocate an output buffer
+ for deflation in a single pass, and so would be called before deflate().
+*/
+
+ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+/*
+ deflatePrime() inserts bits in the deflate output stream. The intent
+ is that this function is used to start off the deflate output with the
+ bits leftover from a previous deflate stream when appending to it. As such,
+ this function can only be used for raw deflate, and must be used before the
+ first deflate() call after a deflateInit2() or deflateReset(). bits must be
+ less than or equal to 16, and that many of the least significant bits of
+ value will be inserted in the output.
+
+ deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
+ gz_headerp head));
+/*
+ deflateSetHeader() provides gzip header information for when a gzip
+ stream is requested by deflateInit2(). deflateSetHeader() may be called
+ after deflateInit2() or deflateReset() and before the first call of
+ deflate(). The text, time, os, extra field, name, and comment information
+ in the provided gz_header structure are written to the gzip header (xflag is
+ ignored -- the extra flags are set according to the compression level). The
+ caller must assure that, if not Z_NULL, name and comment are terminated with
+ a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+ available there. If hcrc is true, a gzip header crc is included. Note that
+ the current versions of the command-line version of gzip (up through version
+ 1.3.x) do not support header crc's, and will report that it is a "multi-part
+ gzip file" and give up.
+
+ If deflateSetHeader is not used, the default gzip header has text false,
+ the time set to zero, and os set to 255, with no extra, name, or comment
+ fields. The gzip header is returned to the default state by deflateReset().
+
+ deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
+ int windowBits));
+
+ This is another version of inflateInit with an extra parameter. The
+ fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+ beforehand by the caller.
+
+ The windowBits parameter is the base two logarithm of the maximum window
+ size (the size of the history buffer). It should be in the range 8..15 for
+ this version of the library. The default value is 15 if inflateInit is used
+ instead. windowBits must be greater than or equal to the windowBits value
+ provided to deflateInit2() while compressing, or it must be equal to 15 if
+ deflateInit2() was not used. If a compressed stream with a larger window
+ size is given as input, inflate() will return with the error code
+ Z_DATA_ERROR instead of trying to allocate a larger window.
+
+ windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+ determines the window size. inflate() will then process raw deflate data,
+ not looking for a zlib or gzip header, not generating a check value, and not
+ looking for any check values for comparison at the end of the stream. This
+ is for use with other formats that use the deflate compressed data format
+ such as zip. Those formats provide their own check values. If a custom
+ format is developed using the raw deflate format for compressed data, it is
+ recommended that a check value such as an adler32 or a crc32 be applied to
+ the uncompressed data as is done in the zlib, gzip, and zip formats. For
+ most applications, the zlib format should be used as is. Note that comments
+ above on the use in deflateInit2() apply to the magnitude of windowBits.
+
+ windowBits can also be greater than 15 for optional gzip decoding. Add
+ 32 to windowBits to enable zlib and gzip decoding with automatic header
+ detection, or add 16 to decode only the gzip format (the zlib format will
+ return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is
+ a crc32 instead of an adler32.
+
+ inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
+ is set to null if there is no error message. inflateInit2 does not perform
+ any decompression apart from reading the zlib header if present: this will
+ be done by inflate(). (So next_in and avail_in may be modified, but next_out
+ and avail_out are unchanged.)
+*/
+
+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the decompression dictionary from the given uncompressed byte
+ sequence. This function must be called immediately after a call of inflate,
+ if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+ can be determined from the adler32 value returned by that call of inflate.
+ The compressor and decompressor must use exactly the same dictionary (see
+ deflateSetDictionary). For raw inflate, this function can be called
+ immediately after inflateInit2() or inflateReset() and before any call of
+ inflate() to set the dictionary. The application must ensure that the
+ dictionary that was used for compression is provided.
+
+ inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+ parameter is invalid (such as NULL dictionary) or the stream state is
+ inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+ expected one (incorrect adler32 value). inflateSetDictionary does not
+ perform any decompression: this will be done by subsequent calls of
+ inflate().
+*/
+
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+/*
+ Skips invalid compressed data until a full flush point (see above the
+ description of deflate with Z_FULL_FLUSH) can be found, or until all
+ available input is skipped. No output is provided.
+
+ inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR
+ if no more input was provided, Z_DATA_ERROR if no flush point has been found,
+ or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
+ case, the application may save the current value of total_in which
+ indicates where valid compressed data was found. In the error case, the
+ application may repeatedly call inflateSync, providing more input each time,
+ until success or end of the input data.
+*/
+
+ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
+
+ This function can be useful when randomly accessing a large stream. The
+ first pass through the stream can periodically record the inflate state,
+ allowing restarting inflate at those points when randomly accessing the
+ stream.
+
+ inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being NULL). msg is left unchanged in both source and
+ destination.
+*/
+
+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+/*
+ This function is equivalent to inflateEnd followed by inflateInit,
+ but does not free and reallocate all the internal decompression state.
+ The stream will keep attributes that may have been set by inflateInit2.
+
+ inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+/*
+ This function inserts bits in the inflate input stream. The intent is
+ that this function is used to start inflating at a bit position in the
+ middle of a byte. The provided bits will be used before any bytes are used
+ from next_in. This function should only be used with raw inflate, and
+ should be used before the first inflate() call after inflateInit2() or
+ inflateReset(). bits must be less than or equal to 16, and that many of the
+ least significant bits of value will be inserted in the input.
+
+ inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
+ gz_headerp head));
+/*
+ inflateGetHeader() requests that gzip header information be stored in the
+ provided gz_header structure. inflateGetHeader() may be called after
+ inflateInit2() or inflateReset(), and before the first call of inflate().
+ As inflate() processes the gzip stream, head->done is zero until the header
+ is completed, at which time head->done is set to one. If a zlib stream is
+ being decoded, then head->done is set to -1 to indicate that there will be
+ no gzip header information forthcoming. Note that Z_BLOCK can be used to
+ force inflate() to return immediately after header processing is complete
+ and before any actual data is decompressed.
+
+ The text, time, xflags, and os fields are filled in with the gzip header
+ contents. hcrc is set to true if there is a header CRC. (The header CRC
+ was valid if done is set to one.) If extra is not Z_NULL, then extra_max
+ contains the maximum number of bytes to write to extra. Once done is true,
+ extra_len contains the actual extra field length, and extra contains the
+ extra field, or that field truncated if extra_max is less than extra_len.
+ If name is not Z_NULL, then up to name_max characters are written there,
+ terminated with a zero unless the length is greater than name_max. If
+ comment is not Z_NULL, then up to comm_max characters are written there,
+ terminated with a zero unless the length is greater than comm_max. When
+ any of extra, name, or comment are not Z_NULL and the respective field is
+ not present in the header, then that field is set to Z_NULL to signal its
+ absence. This allows the use of deflateSetHeader() with the returned
+ structure to duplicate the header. However if those fields are set to
+ allocated memory, then the application will need to save those pointers
+ elsewhere so that they can be eventually freed.
+
+ If inflateGetHeader is not used, then the header information is simply
+ discarded. The header is always checked for validity, including the header
+ CRC if present. inflateReset() will reset the process to discard the header
+ information. The application would need to call inflateGetHeader() again to
+ retrieve the header from the next gzip stream.
+
+ inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window));
+
+ Initialize the internal stream state for decompression using inflateBack()
+ calls. The fields zalloc, zfree and opaque in strm must be initialized
+ before the call. If zalloc and zfree are Z_NULL, then the default library-
+ derived memory allocation routines are used. windowBits is the base two
+ logarithm of the window size, in the range 8..15. window is a caller
+ supplied buffer of that size. Except for special applications where it is
+ assured that deflate was used with small window sizes, windowBits must be 15
+ and a 32K byte window must be supplied to be able to decompress general
+ deflate streams.
+
+ See inflateBack() for the usage of these routines.
+
+ inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+ the parameters are invalid, Z_MEM_ERROR if the internal state could not
+ be allocated, or Z_VERSION_ERROR if the version of the library does not
+ match the version of the header file.
+*/
+
+typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
+typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+
+ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
+ in_func in, void FAR *in_desc,
+ out_func out, void FAR *out_desc));
+/*
+ inflateBack() does a raw inflate with a single call using a call-back
+ interface for input and output. This is more efficient than inflate() for
+ file i/o applications in that it avoids copying between the output and the
+ sliding window by simply making the window itself the output buffer. This
+ function trusts the application to not change the output buffer passed by
+ the output function, at least until inflateBack() returns.
+
+ inflateBackInit() must be called first to allocate the internal state
+ and to initialize the state with the user-provided window buffer.
+ inflateBack() may then be used multiple times to inflate a complete, raw
+ deflate stream with each call. inflateBackEnd() is then called to free
+ the allocated state.
+
+ A raw deflate stream is one with no zlib or gzip header or trailer.
+ This routine would normally be used in a utility that reads zip or gzip
+ files and writes out uncompressed files. The utility would decode the
+ header and process the trailer on its own, hence this routine expects
+ only the raw deflate stream to decompress. This is different from the
+ normal behavior of inflate(), which expects either a zlib or gzip header and
+ trailer around the deflate stream.
+
+ inflateBack() uses two subroutines supplied by the caller that are then
+ called by inflateBack() for input and output. inflateBack() calls those
+ routines until it reads a complete deflate stream and writes out all of the
+ uncompressed data, or until it encounters an error. The function's
+ parameters and return types are defined above in the in_func and out_func
+ typedefs. inflateBack() will call in(in_desc, &buf) which should return the
+ number of bytes of provided input, and a pointer to that input in buf. If
+ there is no input available, in() must return zero--buf is ignored in that
+ case--and inflateBack() will return a buffer error. inflateBack() will call
+ out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out()
+ should return zero on success, or non-zero on failure. If out() returns
+ non-zero, inflateBack() will return with an error. Neither in() nor out()
+ are permitted to change the contents of the window provided to
+ inflateBackInit(), which is also the buffer that out() uses to write from.
+ The length written by out() will be at most the window size. Any non-zero
+ amount of input may be provided by in().
+
+ For convenience, inflateBack() can be provided input on the first call by
+ setting strm->next_in and strm->avail_in. If that input is exhausted, then
+ in() will be called. Therefore strm->next_in must be initialized before
+ calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
+ immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
+ must also be initialized, and then if strm->avail_in is not zero, input will
+ initially be taken from strm->next_in[0 .. strm->avail_in - 1].
+
+ The in_desc and out_desc parameters of inflateBack() are passed as the
+ first parameter of in() and out() respectively when they are called. These
+ descriptors can be optionally used to pass any information that the caller-
+ supplied in() and out() functions need to do their job.
+
+ On return, inflateBack() will set strm->next_in and strm->avail_in to
+ pass back any unused input that was provided by the last in() call. The
+ return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+ if in() or out() returned an error, Z_DATA_ERROR if there was a format
+ error in the deflate stream (in which case strm->msg is set to indicate the
+ nature of the error), or Z_STREAM_ERROR if the stream was not properly
+ initialized. In the case of Z_BUF_ERROR, an input or output error can be
+ distinguished using strm->next_in which will be Z_NULL only if in() returned
+ an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to
+ out() returning non-zero. (in() will always be called before out(), so
+ strm->next_in is assured to be defined if out() returns non-zero.) Note
+ that inflateBack() cannot return Z_OK.
+*/
+
+ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+/*
+ All memory allocated by inflateBackInit() is freed.
+
+ inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+ state was inconsistent.
+*/
+
+ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+/* Return flags indicating compile-time options.
+
+ Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+ 1.0: size of uInt
+ 3.2: size of uLong
+ 5.4: size of voidpf (pointer)
+ 7.6: size of z_off_t
+
+ Compiler, assembler, and debug options:
+ 8: DEBUG
+ 9: ASMV or ASMINF -- use ASM code
+ 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+ 11: 0 (reserved)
+
+ One-time table building (smaller code, but not thread-safe if true):
+ 12: BUILDFIXED -- build static block decoding tables when needed
+ 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+ 14,15: 0 (reserved)
+
+ Library content (indicates missing functionality):
+ 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+ deflate code when not needed)
+ 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+ and decode gzip streams (to avoid linking crc code)
+ 18-19: 0 (reserved)
+
+ Operation variations (changes in library functionality):
+ 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+ 21: FASTEST -- deflate algorithm with only one, lowest compression level
+ 22,23: 0 (reserved)
+
+ The sprintf variant used by gzprintf (zero is best):
+ 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+ 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+ 26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+ Remainder:
+ 27-31: 0 (reserved)
+ */
+
+
+ /* utility functions */
+
+/*
+ The following utility functions are implemented on top of the
+ basic stream-oriented functions. To simplify the interface, some
+ default options are assumed (compression level and memory usage,
+ standard memory allocation functions). The source code of these
+ utility functions can easily be modified if you need special options.
+*/
+
+ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Compresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total
+ size of the destination buffer, which must be at least the value returned
+ by compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+ This function can be used to compress a whole file at once if the
+ input file is mmap'ed.
+ compress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer.
+*/
+
+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen,
+ int level));
+/*
+ Compresses the source buffer into the destination buffer. The level
+ parameter has the same meaning as in deflateInit. sourceLen is the byte
+ length of the source buffer. Upon entry, destLen is the total size of the
+ destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+
+ compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+/*
+ compressBound() returns an upper bound on the compressed size after
+ compress() or compress2() on sourceLen bytes. It would be used before
+ a compress() or compress2() call to allocate the destination buffer.
+*/
+
+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Decompresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total
+ size of the destination buffer, which must be large enough to hold the
+ entire uncompressed data. (The size of the uncompressed data must have
+ been saved previously by the compressor and transmitted to the decompressor
+ by some mechanism outside the scope of this compression library.)
+ Upon exit, destLen is the actual size of the uncompressed buffer.
+ This function can be used to decompress a whole file at once if the
+ input file is mmap'ed.
+
+ uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.
+*/
+
+
+typedef voidp gzFile;
+
+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+/*
+ Opens a gzip (.gz) file for reading or writing. The mode parameter
+ is as in fopen ("rb" or "wb") but can also include a compression level
+ ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for
+ Huffman only compression as in "wb1h", or 'R' for run-length encoding
+ as in "wb1R". (See the description of deflateInit2 for more information
+ about the strategy parameter.)
+
+ gzopen can be used to read a file which is not in gzip format; in this
+ case gzread will directly read from the file without decompression.
+
+ gzopen returns NULL if the file could not be opened or if there was
+ insufficient memory to allocate the (de)compression state; errno
+ can be checked to distinguish the two cases (if errno is zero, the
+ zlib error is Z_MEM_ERROR). */
+
+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+/*
+ gzdopen() associates a gzFile with the file descriptor fd. File
+ descriptors are obtained from calls like open, dup, creat, pipe or
+ fileno (if the file has been previously opened with fopen).
+ The mode parameter is as in gzopen.
+ The next call of gzclose on the returned gzFile will also close the
+ file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file
+ descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
+ gzdopen returns NULL if there was insufficient memory to allocate
+ the (de)compression state.
+*/
+
+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+/*
+ Dynamically update the compression level or strategy. See the description
+ of deflateInit2 for the meaning of these parameters.
+ gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
+ opened for writing.
+*/
+
+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+/*
+ Reads the given number of uncompressed bytes from the compressed file.
+ If the input file was not in gzip format, gzread copies the given number
+ of bytes into the buffer.
+ gzread returns the number of uncompressed bytes actually read (0 for
+ end of file, -1 for error). */
+
+ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
+ voidpc buf, unsigned len));
+/*
+ Writes the given number of uncompressed bytes into the compressed file.
+ gzwrite returns the number of uncompressed bytes actually written
+ (0 in case of error).
+*/
+
+ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...));
+/*
+ Converts, formats, and writes the args to the compressed file under
+ control of the format string, as in fprintf. gzprintf returns the number of
+ uncompressed bytes actually written (0 in case of error). The number of
+ uncompressed bytes written is limited to 4095. The caller should assure that
+ this limit is not exceeded. If it is exceeded, then gzprintf() will return
+ an error (0) with nothing written. In this case, there may also be a
+ buffer overflow with unpredictable consequences, which is possible only if
+ zlib was compiled with the insecure functions sprintf() or vsprintf()
+ because the secure snprintf() or vsnprintf() functions were not available.
+*/
+
+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+/*
+ Writes the given null-terminated string to the compressed file, excluding
+ the terminating null character.
+ gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+/*
+ Reads bytes from the compressed file until len-1 characters are read, or
+ a newline character is read and transferred to buf, or an end-of-file
+ condition is encountered. The string is then terminated with a null
+ character.
+ gzgets returns buf, or Z_NULL in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+/*
+ Writes c, converted to an unsigned char, into the compressed file.
+ gzputc returns the value that was written, or -1 in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+/*
+ Reads one byte from the compressed file. gzgetc returns this byte
+ or -1 in case of end of file or error.
+*/
+
+ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
+/*
+ Push one character back onto the stream to be read again later.
+ Only one character of push-back is allowed. gzungetc() returns the
+ character pushed, or -1 on failure. gzungetc() will fail if a
+ character has been pushed but not read yet, or if c is -1. The pushed
+ character will be discarded if the stream is repositioned with gzseek()
+ or gzrewind().
+*/
+
+ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
+/*
+ Flushes all pending output into the compressed file. The parameter
+ flush is as in the deflate() function. The return value is the zlib
+ error number (see function gzerror below). gzflush returns Z_OK if
+ the flush parameter is Z_FINISH and all output could be flushed.
+ gzflush should be called only when strictly necessary because it can
+ degrade compression.
+*/
+
+ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
+ z_off_t offset, int whence));
+/*
+ Sets the starting position for the next gzread or gzwrite on the
+ given compressed file. The offset represents a number of bytes in the
+ uncompressed data stream. The whence parameter is defined as in lseek(2);
+ the value SEEK_END is not supported.
+ If the file is opened for reading, this function is emulated but can be
+ extremely slow. If the file is opened for writing, only forward seeks are
+ supported; gzseek then compresses a sequence of zeroes up to the new
+ starting position.
+
+ gzseek returns the resulting offset location as measured in bytes from
+ the beginning of the uncompressed stream, or -1 in case of error, in
+ particular if the file is opened for writing and the new starting position
+ would be before the current position.
+*/
+
+ZEXTERN int ZEXPORT gzrewind OF((gzFile file));
+/*
+ Rewinds the given file. This function is supported only for reading.
+
+ gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
+*/
+
+ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file));
+/*
+ Returns the starting position for the next gzread or gzwrite on the
+ given compressed file. This position represents a number of bytes in the
+ uncompressed data stream.
+
+ gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
+ZEXTERN int ZEXPORT gzeof OF((gzFile file));
+/*
+ Returns 1 when EOF has previously been detected reading the given
+ input stream, otherwise zero.
+*/
+
+ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
+/*
+ Returns 1 if file is being read directly without decompression, otherwise
+ zero.
+*/
+
+ZEXTERN int ZEXPORT gzclose OF((gzFile file));
+/*
+ Flushes all pending output if necessary, closes the compressed file
+ and deallocates all the (de)compression state. The return value is the zlib
+ error number (see function gzerror below).
+*/
+
+ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
+/*
+ Returns the error message for the last error which occurred on the
+ given compressed file. errnum is set to zlib error number. If an
+ error occurred in the file system and not in the compression library,
+ errnum is set to Z_ERRNO and the application may consult errno
+ to get the exact error code.
+*/
+
+ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
+/*
+ Clears the error and end-of-file flags for file. This is analogous to the
+ clearerr() function in stdio. This is useful for continuing to read a gzip
+ file that is being written concurrently.
+*/
+
+ /* checksum functions */
+
+/*
+ These functions are not related to compression but are exported
+ anyway because they might be useful in applications using the
+ compression library.
+*/
+
+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+/*
+ Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+ return the updated checksum. If buf is NULL, this function returns
+ the required initial value for the checksum.
+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+ much faster. Usage example:
+
+ uLong adler = adler32(0L, Z_NULL, 0);
+
+ while (read_buffer(buffer, length) != EOF) {
+ adler = adler32(adler, buffer, length);
+ }
+ if (adler != original_adler) error();
+*/
+
+ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
+ z_off_t len2));
+/*
+ Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
+ and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+ each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
+ seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.
+*/
+
+ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
+/*
+ Update a running CRC-32 with the bytes buf[0..len-1] and return the
+ updated CRC-32. If buf is NULL, this function returns the required initial
+ value for the crc. Pre- and post-conditioning (one's complement) is
+ performed within this function so it shouldn't be done by the application.
+ Usage example:
+
+ uLong crc = crc32(0L, Z_NULL, 0);
+
+ while (read_buffer(buffer, length) != EOF) {
+ crc = crc32(crc, buffer, length);
+ }
+ if (crc != original_crc) error();
+*/
+
+ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+
+/*
+ Combine two CRC-32 check values into one. For two sequences of bytes,
+ seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+ calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
+ check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+ len2.
+*/
+
+
+ /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method,
+ int windowBits, int memLevel,
+ int strategy, const char *version,
+ int stream_size));
+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window,
+ const char *version,
+ int stream_size));
+#define deflateInit(strm, level) \
+ deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit(strm) \
+ inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream))
+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+ deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+ (strategy), ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit2(strm, windowBits) \
+ inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
+#define inflateBackInit(strm, windowBits, window) \
+ inflateBackInit_((strm), (windowBits), (window), \
+ ZLIB_VERSION, sizeof(z_stream))
+
+
+#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL)
+ struct internal_state {int dummy;}; /* hack for buggy compilers */
+#endif
+
+ZEXTERN const char * ZEXPORT zError OF((int));
+ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z));
+ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZLIB_H */
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/wgs-assembler.git
More information about the debian-med-commit
mailing list