[med-svn] [ea-utils] 01/03: Imported Upstream version 1.1.2+dfsg
Andreas Tille
tille at debian.org
Sat Jul 25 06:14:41 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository ea-utils.
commit 8ad9ba2d886b19085ec277b1120739ca2a7ad8ce
Author: Andreas Tille <tille at debian.org>
Date: Sat Jul 25 08:11:59 2015 +0200
Imported Upstream version 1.1.2+dfsg
---
CHANGES | 14 +
Makefile | 105 ++++
README | 63 ++
alc | 178 ++++++
bam.c | 474 ++++++++++++++
bam_aux.c | 213 +++++++
bam_cat.c | 185 ++++++
bam_import.c | 489 +++++++++++++++
bam_index.c | 724 ++++++++++++++++++++++
bam_lpileup.c | 198 ++++++
bam_md.c | 389 ++++++++++++
bam_pileup.c | 437 +++++++++++++
bam_reheader.c | 62 ++
bam_sort.c | 566 +++++++++++++++++
bedidx.c | 162 +++++
bgzf.c | 694 +++++++++++++++++++++
determine-phred | 86 +++
ea-utils.spec | 37 ++
faidx.c | 437 +++++++++++++
fastq-clipper.c | 279 +++++++++
fastq-join.c | 424 +++++++++++++
fastq-lib.cpp | 375 +++++++++++
fastq-lib.h | 113 ++++
fastq-mcf.c | 1697 ++++++++++++++++++++++++++++++++++++++++++++++++++
fastq-multx.c | 1087 ++++++++++++++++++++++++++++++++
fastq-stats.cpp | 672 ++++++++++++++++++++
fastx-graph | 149 +++++
gcModel.c | 207 +++++++
gcModel.h | 7 +
gtf2bed | 116 ++++
kaln.c | 486 +++++++++++++++
knetfile.c | 632 +++++++++++++++++++
kprobaln.c | 280 +++++++++
kstring.c | 212 +++++++
padding.c | 479 +++++++++++++++
phase.c | 687 +++++++++++++++++++++
randomFQ | 245 ++++++++
razf.c | 853 +++++++++++++++++++++++++
razip.c | 141 +++++
sam-stats.cpp | 1121 +++++++++++++++++++++++++++++++++
sam.c | 186 ++++++
sam_header.c | 772 +++++++++++++++++++++++
tidx/fastq-lib.cpp | 1 +
tidx/fastq-lib.h | 1 +
tidx/tidx-lib.cpp | 436 +++++++++++++
tidx/tidx.cpp | 220 +++++++
tidx/tidx.h | 43 ++
tidx/utils.cpp | 1 +
tidx/utils.h | 1 +
utils.h | 5 +
varcall.cpp | 1744 ++++++++++++++++++++++++++++++++++++++++++++++++++++
51 files changed, 19185 insertions(+)
diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..f313903
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,14 @@
+r181 - CASAVA purity filtering
+r154 - fixed major bug in RMN's that would invalidate reads
+r152 - allowed short adapters to work at the 'begin' of reads
+r171 - paired-ends get trimmed like anything else....other behavior was too conservative
+r258 - gzip support on input and output, append barcode to unmatched id
+r353 - support for dual-indexed nextera reads in fastq-multx, new defaults based on ROC curve analysis
+r401 - added -L, included google dir, included build for sam-stats
+r408 - updated fastq-join docs, changed default mismatch to 8%
+r425 - fastq-mcf filtering options, multx verif char fixed
+r474 - RNAmode & coverage stats output
+r475 - fix paired-end forward/reverse counts
+r534 - -S can be before -R, also ver num increment
+r551 - eventer -l 0 bug fix, debug output improvement
+r558 - sam-stats snp rate change, fastq-lib poorqual N's issue fixed, buffering added (todo: add to lib)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..36b0b5b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,105 @@
+#
+# $Id: Makefile 670 2013-12-13 16:47:07Z earonesty $
+
+CC=g++
+PREFIX?=/usr
+BINDIR?=$(PREFIX)/bin
+CFLAGS?=-O3 -I.
+CPPFLAGS?=-O3 -I.
+# for debugging:
+# CFLAGS?=-g -I.
+# CPPFLAGS?=-g -I.
+
+PKG=ea-utils
+REL := $(shell svnversion 2>/dev/null | perl -ne 'print $$1 if /:(\d+)/' )
+VER := $(shell grep '%define ver' ${PKG}.spec | perl -ne 'print $$1 if / (\S+) *$$/')
+
+SRC=fastq-clipper.c fastq-mcf.c fastq-multx.c fastq-join.c fastq-stats.cpp gcModel.c
+BIN=fastq-mcf fastq-multx fastq-join fastq-stats fastq-clipper sam-stats varcall
+TOOLS=fastx-graph gtf2bed determine-phred randomFQ alc
+
+all: $(BIN)
+
+# Build varcall with debug symbols.  Every compile recipe in this file uses
+# $(CFLAGS) (CPPFLAGS is never referenced by a recipe), so the debug flags are
+# handed to the sub-make as a command-line CFLAGS override.
+debug:
+	$(MAKE) CFLAGS="-g -I." varcall
+
+# Build every binary, then copy the binaries and the helper scripts into
+# $(BINDIR).  gtf2bed is listed in TOOLS but is not part of the installed set.
+install: $(BIN) $(addprefix $(BINDIR)/,$(BIN) $(filter-out gtf2bed,$(TOOLS)))
+
+# Create the install directory on demand.
+$(BINDIR):
+	mkdir -p $@
+
+# Copy one built program or script into $(BINDIR).  The directory is an
+# order-only prerequisite: it has to exist, but its timestamp (which changes
+# every time a file is copied into it) must not force files to be re-copied.
+$(BINDIR)/%: % | $(BINDIR)
+	cp $< $@
+
+dist: getrel $(PKG).${VER}-${REL}.tar.gz
+
+# these shenanigans are done to ensure that the release in the spec file is the same as the subversion release
+# a less verbose way should be possible
+
+# Touch the .spex template when the spec file does not mention the current
+# subversion release, so that $(PKG).spec gets regenerated from it.
+getrel:
+	grep "${REL}" $(PKG).spec || touch $(PKG).spex
+
+# Targets that name a command rather than a file they create; declaring them
+# keeps a stray file called e.g. "clean" or "install" from disabling them.
+.PHONY: all debug install dist getrel disttest clean
+
+# Generate the spec file from its template, substituting the release number.
+$(PKG).spec: $(PKG).spex
+	perl -pe 's/%RELEASE%/${REL}/' $< > $@
+
+$(PKG).tar.gz: Makefile $(TOOLS) $(SRC) $(PKG).spec fastq-lib.cpp fastq-lib.h sam-stats.cpp fastq-stats.cpp gcModel.c gcModel.h varcall.cpp utils.h README CHANGES google sparsehash samtools/*.c
+ rm -rf $(PKG).${VER}-${REL}
+ mkdir $(PKG).${VER}-${REL}
+ mkdir $(PKG).${VER}-${REL}/tidx
+ mkdir $(PKG).${VER}-${REL}/samtools
+ cp -nr $^ $(PKG).${VER}-${REL}
+ cp -nr tidx/*.cpp tidx/*.h $(PKG).${VER}-${REL}/tidx
+ cp -nr samtools/*.c samtools/*.h samtools/Makefile $(PKG).${VER}-${REL}/samtools
+ tar --exclude=".svn" -cvzf $(PKG).tar.gz $(PKG).${VER}-${REL}
+ rm -rf $(PKG).${VER}-${REL}
+
+# Unpack the distribution tarball, check that it builds, then remove the
+# unpacked tree.  $(MAKE) is used instead of a literal "make" so that
+# command-line flags and the jobserver reach the nested build.
+disttest: $(PKG).tar.gz
+	tar -xzvf $(PKG).tar.gz
+	cd $(PKG).${VER}-${REL} && $(MAKE)
+	rm -rf $(PKG).${VER}-${REL}
+
+$(PKG).${VER}-${REL}.tar.gz: $(PKG).tar.gz
+ cp $< $@
+
+%: %.c fastq-lib.cpp fastq-lib.h
+ $(CC) $(CFLAGS) fastq-lib.cpp -o $@ $<
+
+%: %.cpp fastq-lib.cpp fastq-lib.h
+ $(CC) $(CFLAGS) fastq-lib.cpp -o $@ $<
+
+
+%: %.c gcModel.c gcModel.h
+ $(CC) $(CFLAGS) gcModel.c -o $@ $<
+
+%: %.cpp gcModel.c gcModel.h
+ $(CC) $(CFLAGS) gcModel.c -o $@ $<
+
+# Build sam-stats from sam-stats.cpp, fastq-lib.cpp and the samtools objects;
+# the Windows branch additionally links ws2_32.
+# why the libbam.a doesn't work? not sure... *.o works
+# NOTE(review): samtools/libbam.a is only a prerequisite here; the recipe
+# links samtools/*.o directly, so it relies on the samtools build leaving
+# its object files in place -- confirm against samtools/Makefile.
+sam-stats: sam-stats.cpp samtools/libbam.a samtools/bam.h fastq-lib.h
+ifeq ($(OS),Windows_NT)
+ $(CC) $(CFLAGS) samtools/*.o -lz -lpthread -lws2_32 fastq-lib.cpp $< -o $@
+else
+ $(CC) $(CFLAGS) samtools/*.o -lz -lpthread fastq-lib.cpp $< -o $@
+endif
+
+# Build the BAM library with the samtools sub-makefile.  $(MAKE) (rather than
+# a literal "make") propagates command-line flags and the jobserver.
+samtools/libbam.a: samtools/*.c samtools/*.h
+	$(MAKE) -C samtools libbam.a
+
+varcall: varcall.cpp fastq-lib.cpp tidx/tidx-lib.cpp
+ifeq ($(OS),Windows_NT)
+ echo varcall: not supported yet
+else
+ $(CC) $(CFLAGS) fastq-lib.cpp tidx/tidx-lib.cpp -o $@ $< -lgsl -lgslcblas
+endif
+
+fastq-stats: fastq-stats.cpp fastq-lib.cpp gcModel.c
+ $(CC) $(CFLAGS) fastq-lib.cpp gcModel.c -o $@ $<
+
+# Optional tool (not listed in BIN); links against bamtools.
+# fastq-lib.cpp is compiled into the binary by the recipe, so it is listed as
+# a prerequisite to make changes to it trigger a rebuild.
+bam-filter: bam-filter.cpp fastq-lib.cpp
+	$(CC) $(CFLAGS) fastq-lib.cpp -o $@ $< -lbamtools
+
+# Remove build products here and in the samtools subdirectory.
+# $(MAKE) -C replaces "cd samtools && make" so flags reach the sub-make.
+clean:
+	rm -f *.o $(BIN)
+	$(MAKE) -C samtools clean
diff --git a/README b/README
new file mode 100644
index 0000000..8700228
--- /dev/null
+++ b/README
@@ -0,0 +1,63 @@
+OVERVIEW:
+
+fastq-mcf
+
+Scans a sequence file for adapters, and, based on a log-scaled threshold, determines a set of clipping parameters and performs clipping. Also does skewing detection and quality filtering.
+
+fastq-multx
+
+Demultiplexes a fastq. Capable of auto-determining barcode id's based on a master set fields. Keeps multiple reads in-sync during demultiplexing. Can verify that the reads are in-sync as well, and fail if they're not.
+
+fastq-join
+Similar to audy's stitch program, but in C, more efficient and supports some automatic benchmarking and tuning. It uses the same "squared distance for anchored alignment" as other tools.
+
+fastq-stats
+Outputs stats for fastqs
+
+sam-stats
+Output stats for sam/bam files
+
+varcall
+Variant caller, takes bam or pileup output and does variant calling with advanced features like PCR duplicate filtering, homopolymer repeat filtering, calculation of error rate and detectability (minimum percentage) thresholds.
+
+REQUIRES:
+
+For building sam-stats, please install this first!
+
+https://github.com/pezmaster31/bamtools/wiki/Building-and-installing
+
+QUICK FAQ:
+
+This is based on feedback/emails, etc.
+
+fastq-mcf does a 300k sub-sampling to determine what to do. There are lots of parameters to play with, but the "automatic" mode should do the right thing most of the time. If it doesn't, I really would like to hear why/what it did. The point in this tool is that the basic quality and adapter filtering should be something that's done automagically 90% of the time - not by manually picking parameters for each run. The fact that it's making decisions "for the user" means it will probabl [...]
+
+If you want fastq-mcf to be similar to other tools, you need to pass -m XX, and -s 100, so it's a fixed-length. If you try running with unrealistic, or "test" data, the heuristic won't work. Instead, try with a subsample of 50000 or so "real" reads.
+
+fastq-mcf doubles as a read-filtering program, it supports a broad range of filtering arguments.
+
+fastq-join produces a "report". This is just a list of lengths of joined reads. Also it chooses the "better quality base" when overlapping. Very stable code at this point.
+
+fastq-multx is intended to keep mates in sync, so you can demultiplex in one-pass. For single-reads, it's not better than other tools out there, except that you don't need to predefine your sets... which can help logistics in high-volume situations. Also, notice the output file's "%-sign" substitution... this is instead of lots of prefix and suffix arguments. Mismatch algorithm is "maximal unique"... ie... if it's possible that 2 barcodes can match, it won't use *either*. Qualities [...]
+
+Dual-indexed codes are listed as SEQUENCE-SEQUENCE in the barcode file. I haven't tried mixing them with others on the autodetect code, I can't imagine there's a reason to do that.
+
+The latest version can ignore bases that have extremely low qualities (<5), and refuses to match a barcode that isn't a minimum distance from another best match. It's a lot safer, but for some poor-quality runs these features will need to be disabled.
+
+sam-stats takes a lot of options for a variety of reports. The most important ones to note are -D, which builds a huge hash of probe ids, and -R which produces a coverage matrix. It could autodetect if reads are sorted by probe ID and save RAM. It could also reduce RAM by removing common prefixes from the hash after some X reads. It doesn't do those things now.
+
+INSTALL:
+
+Should be able to run "make install" on most machines that have g++ installed. On windows, install a copy of the MinGW environment. You'll need zlib installed for some tools. fastq-mcf, fastq-stats, etc. are pretty basic, and work without any external libs.
+
+Example:
+
+PREFIX=/usr make install
+
+OR to a subdir:
+
+BINDIR=/usr/bin/ea-utils make install
+
+Or with other options:
+
+CC=g++ PREFIX=/usr/local make install
diff --git a/alc b/alc
new file mode 100755
index 0000000..54615f2
--- /dev/null
+++ b/alc
@@ -0,0 +1,178 @@
+#!/usr/bin/perl
+
+# Copyright (c) 2011 Erik Aronesty (erik at q32.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# NOTE: Please let me know if you use it or like it, AKA: "Thank You Mask Man"
+
+use strict;
+use Getopt::Long;
+our $VERSION = 1.2;
+
+my %nl;
+my $w=200000; # window size
+my $x=10; # segment count
+my $only;
+GetOptions("only"=>\$only, "window=i"=>\$w, "segs=i"=>\$x);
+
+die usage() unless @ARGV;
+die usage() if $only && @ARGV > 1;
+$|=1;
+#$x-=1;
+my $tl=0;
+for my $f (@ARGV) {
+ my $s = -s $f;
+ my $gz = $f =~ /\.gz$/;
+ open (IN, ($gz ? "gunzip -c '$f'|" : $f)) || (warn("$f: $!\n"), next);
+ my $nl = 0;
+ my $gzratio = 1;
+ if ($gz) {
+ if ( $s > $w ) {
+ # todo, this can be done all-in-one stream, by buffering the read and gzipping in a loop
+ # it will be much faster!
+ my (@bz, @lz);
+ $x = 5; # 5 points
+ my $ss = $w/$x;
+ for (my $i=0; $i<$x; ++$i) {
+ my $o = int($ss * ($i+1));
+ $bz[$i]=`gunzip -c '$f' | head -$o | gzip -c | wc -c`;
+ if (abs($s-$bz[$i]) < ($ss+length($f))) {
+ $nl=`gunzip -c '$f' | wc -l`;
+ goto MAXXED;
+ } else {
+ $lz[$i]=$s*$o/$bz[$i];
+ }
+# warn("o: ", $o, ", lzi:", $lz[$i], ", bzi:", $bz[$i], "\n");
+ }
+
+ # simple linear model log(bytes)~lines
+ my ($xxs, $xys, $xs, $ys);
+ for (my $i=0; $i<$x; ++$i) {
+ $xs+=log($bz[$i]);
+ $ys+=$lz[$i];
+ $xxs+=log($bz[$i])*log($bz[$i]);
+ $xys+=$lz[$i]*log($bz[$i]);
+ }
+ my $slope = ($xys - $xs*$ys/$x) / ($xxs - $xs*$xs/$x);
+ my $intercept = ($ys - ($slope*$xs))/$x;
+# warn("intercept: $intercept, slope: $slope, s: $s\n");
+# log size predictive of lines.....
+ $nl = $intercept + $slope * log($s);
+ MAXXED:
+ } else {
+ close(IN);
+ $nl = `gunzip -c '$f' | wc -l` + 0;
+ }
+ } else {
+ my $rc = 0;
+ if ($s > ($w*2)) {
+ my $ss = $x == 1 ? $w : (($s-($w/$x))/($x-1));
+ my $d;
+# warn("segments: $x, window: $w, ss:$ss\n");
+ my $tweight;
+ for (my $i=0;$i<$x;++$i) {
+ seek(IN, $i*$ss, 0) if $i > 0;
+ read(IN, $d, $w/$x);
+ #whole lines only, prevents overcounting
+ if ($i > 0) {
+ # seek before current block, getting more accurate "long-first-lines"
+ my $z;
+ seek(IN, $i*$ss-1000, 0);
+ read(IN, $z, 1000);
+ $z=substr($z, rindex($z, "\n")+1);
+ $d=$z.$d;
+ } else {
+ #$d=substr($d, index($d, "\n")+1); # remove first line
+ }
+ $d=substr($d, 0, rindex($d, "\n")+1); # remove last line
+ $rc += length($d);
+ $nl += xlc($d);
+ # printf STDERR "seek: %d, len: %d, xlc: %d, rc: %d, nl: %d\n", $i*$ss, length($d), xlc($d), $rc, $nl;
+ }
+ } else {
+ my $d;
+ read(IN, $d, $w);
+ $d=substr($d, 0, rindex($d, "\n")+1); # remove last line
+ $rc += length($d);
+ $nl += xlc($d);
+ }
+ $nl = int($rc > 0 ? $nl * $s / $rc : 0);
+ }
+ $tl += $nl;
+ print "$nl" if $only;
+ print "\n" if $only && -t STDOUT;
+ $nl{$f} = $nl if !$only;
+}
+
+if (!$only) {
+ my $m=length($tl)+1;
+ for my $f (@ARGV) {
+ printf "%*d %s\n", $m, $nl{$f}, $f;
+ }
+ printf "%*d %s\n", $m, $tl, "total" if @ARGV > 1;
+}
+
+# Count the lines held in a buffer: one per newline character, plus one more
+# when the buffer ends with an unterminated partial line.
+sub xlc {
+    my ($buf) = @_;
+    my $lines = ($buf =~ tr/\n//);
+    # text after the final newline (or a non-empty buffer with no newline at
+    # all) counts as one extra line
+    ++$lines if length($buf) && substr($buf, -1) ne "\n";
+    return $lines;
+}
+
+sub usage {<<EOF
+Usage: alc [-o] <file1> [<file2>] ...
+
+Approximate line counts for each file. Attempts to be
+somewhat compatible with "wc -l" by default.
+
+-o|--only Output line count only for a single file.
+-w|--window <int> Read <int> bytes from head, mid, and tail.
+-s|--segs <int> Divide file & window into <int> segments.
+EOF
+};
+
+__END__
+
+=head1 NAME
+
+alc - Approximate line count
+
+=head1 DESCRIPTION
+
+Approximate line counts for each file. Attempts to be
+somewhat compatible with "wc -l" by default.
+
+=head1 AUTHOR
+
+Erik Aronesty C<earonesty at cpan.org>
+
+=head1 LICENSE
+
+This program is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+See L<http://www.perl.com/perl/misc/Artistic.html>.
+
+=cut
diff --git a/bam.c b/bam.c
new file mode 100644
index 0000000..b00d6a6
--- /dev/null
+++ b/bam.c
@@ -0,0 +1,474 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+#include "bam.h"
+#include "bam_endian.h"
+#include "kstring.h"
+#include "sam_header.h"
+
+int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0;
+char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
+
+/**************************
+ * CIGAR related routines *
+ **************************/
+
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
+{
+ int k, end = c->pos;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op = bam_cigar_op(cigar[k]);
+ int len = bam_cigar_oplen(cigar[k]);
+ if (op == BAM_CBACK) { // move backward
+ int l, u, v;
+ if (k == c->n_cigar - 1) break; // skip trailing 'B'
+ for (l = k - 1, u = v = 0; l >= 0; --l) {
+ int op1 = bam_cigar_op(cigar[l]);
+ int len1 = bam_cigar_oplen(cigar[l]);
+ if (bam_cigar_type(op1)&1) { // consume query
+ if (u + len1 >= len) { // stop
+ if (bam_cigar_type(op1)&2) v += len - u;
+ break;
+ } else u += len1;
+ }
+ if (bam_cigar_type(op1)&2) v += len1;
+ }
+ end = l < 0? c->pos : end - v;
+ } else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]);
+ }
+ return end;
+}
+
+/* Return the summed length of every CIGAR operation that consumes query
+ * bases, i.e. those whose bam_cigar_type() value has bit 0 set. */
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
+{
+	int32_t qlen = 0;
+	uint32_t idx = 0;
+	while (idx < c->n_cigar) {
+		const uint32_t elem = cigar[idx++];
+		if (bam_cigar_type(bam_cigar_op(elem)) & 1)
+			qlen += bam_cigar_oplen(elem);
+	}
+	return qlen;
+}
+
+/********************
+ * BAM I/O routines *
+ ********************/
+
+bam_header_t *bam_header_init()
+{
+ bam_is_be = bam_is_big_endian();
+ return (bam_header_t*)calloc(1, sizeof(bam_header_t));
+}
+
+void bam_header_destroy(bam_header_t *header)
+{
+ int32_t i;
+ extern void bam_destroy_header_hash(bam_header_t *header);
+ if (header == 0) return;
+ if (header->target_name) {
+ for (i = 0; i < header->n_targets; ++i)
+ free(header->target_name[i]);
+ free(header->target_name);
+ free(header->target_len);
+ }
+ free(header->text);
+ if (header->dict) sam_header_free(header->dict);
+ if (header->rg2lib) sam_tbl_destroy(header->rg2lib);
+ bam_destroy_header_hash(header);
+ free(header);
+}
+
+/* Read and decode the BAM header at the current position of fp.
+ *
+ * Returns a newly allocated header, to be released with
+ * bam_header_destroy(), or 0 when the magic string is wrong, the stream
+ * ends before the header is complete, a count or length field is negative,
+ * or memory cannot be allocated. */
+bam_header_t *bam_header_read(bamFile fp)
+{
+	bam_header_t *header;
+	char buf[4];
+	int magic_len;
+	int32_t i, name_len;
+	// check EOF
+	i = bgzf_check_EOF(fp);
+	if (i < 0) {
+		// If the file is a pipe, checking the EOF marker will *always* fail
+		// with ESPIPE. Suppress the error message in this case.
+		if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
+	}
+	else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
+	// read "BAM1"
+	magic_len = bam_read(fp, buf, 4);
+	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
+		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
+		return 0;
+	}
+	header = bam_header_init();
+	if (header == 0) return 0;
+	// read plain text and the number of reference sequences
+	if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
+	if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+	// size_t arithmetic: the "+ 1" for the terminator cannot wrap to zero
+	header->text = (char*)calloc((size_t)header->l_text + 1, 1);
+	if (header->text == 0) goto fail;
+	if ((int64_t)bam_read(fp, header->text, header->l_text) != (int64_t)header->l_text) goto fail;
+	if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
+	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+	if (header->n_targets < 0) {
+		header->n_targets = 0; // nothing was allocated for the bogus count
+		goto fail;
+	}
+	// read reference sequence names and lengths
+	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+	// calloc(0, ...) may legitimately return 0, so only a non-empty table can fail
+	if (header->n_targets > 0 && (header->target_name == 0 || header->target_len == 0)) goto fail;
+	for (i = 0; i != header->n_targets; ++i) {
+		if (bam_read(fp, &name_len, 4) != 4) goto fail;
+		if (bam_is_be) bam_swap_endian_4p(&name_len);
+		if (name_len < 0) goto fail;
+		// one spare zeroed byte keeps the name NUL-terminated even when the
+		// stored name lacks its terminator (bam_header_write() uses strlen())
+		header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
+		if (header->target_name[i] == 0) goto fail;
+		if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
+		if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
+		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+	}
+	return header;
+
+fail:
+	fprintf(stderr, "[bam_header_read] truncated or corrupt BAM header.\n");
+	// the header came from calloc(), so unread fields are still zero and
+	// bam_header_destroy() frees exactly what has been allocated so far
+	bam_header_destroy(header);
+	return 0;
+}
+
+int bam_header_write(bamFile fp, const bam_header_t *header)
+{
+ char buf[4];
+ int32_t i, name_len, x;
+ // write "BAM1"
+ strncpy(buf, "BAM\001", 4);
+ bam_write(fp, buf, 4);
+ // write plain text and the number of reference sequences
+ if (bam_is_be) {
+ x = bam_swap_endian_4(header->l_text);
+ bam_write(fp, &x, 4);
+ if (header->l_text) bam_write(fp, header->text, header->l_text);
+ x = bam_swap_endian_4(header->n_targets);
+ bam_write(fp, &x, 4);
+ } else {
+ bam_write(fp, &header->l_text, 4);
+ if (header->l_text) bam_write(fp, header->text, header->l_text);
+ bam_write(fp, &header->n_targets, 4);
+ }
+ // write sequence names and lengths
+ for (i = 0; i != header->n_targets; ++i) {
+ char *p = header->target_name[i];
+ name_len = strlen(p) + 1;
+ if (bam_is_be) {
+ x = bam_swap_endian_4(name_len);
+ bam_write(fp, &x, 4);
+ } else bam_write(fp, &name_len, 4);
+ bam_write(fp, p, name_len);
+ if (bam_is_be) {
+ x = bam_swap_endian_4(header->target_len[i]);
+ bam_write(fp, &x, 4);
+ } else bam_write(fp, &header->target_len[i], 4);
+ }
+ bgzf_flush(fp);
+ return 0;
+}
+
+// Byte-swap, in place, the multi-byte fields in a record's variable-length
+// data: every CIGAR word first, then each auxiliary value, dispatched on its
+// (upper-cased) type character.
+//   c        - core fields giving l_qname, n_cigar and l_qseq
+//   data_len - number of bytes in data
+//   data     - buffer holding qname, CIGAR, sequence, qualities, aux fields
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
+{
+ uint8_t *s;
+ uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
+ // s starts just past qname, CIGAR, qualities (l_qseq) and packed bases ((l_qseq+1)/2)
+ s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+ for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
+ while (s < data + data_len) {
+ uint8_t type;
+ s += 2; // skip key
+ type = toupper(*s); ++s; // skip type
+ // toupper() folds the lower-case type codes onto the same branches
+ if (type == 'C' || type == 'A') ++s;
+ else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
+ else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
+ else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
+ else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
+ else if (type == 'B') {
+ // Array value: sub-type byte at s, 4-byte count at s+1, payload from s+5.
+ // NOTE(review): nothing in this branch advances s past the sub-type
+ // byte, the count or the payload, and the loops below step i through
+ // [0, n) by the element size; n is also read before the count itself
+ // is swapped. Confirm all three against the intended layout.
+ int32_t n, Bsize = bam_aux_type2size(*s);
+ memcpy(&n, s + 1, 4);
+ if (1 == Bsize) {
+ } else if (2 == Bsize) {
+ for (i = 0; i < n; i += 2)
+ bam_swap_endian_2p(s + 5 + i);
+ } else if (4 == Bsize) {
+ for (i = 0; i < n; i += 4)
+ bam_swap_endian_4p(s + 5 + i);
+ }
+ bam_swap_endian_4p(s+1);
+ }
+ }
+}
+
+/* Read one alignment record from fp into b, growing b->data when needed.
+ *
+ * Returns the number of bytes consumed (4 + block length) on success,
+ * -1 on a normal end of file, -2 when the 4-byte block length is truncated,
+ * -3 when the fixed-size core is truncated, and -4 when the block length is
+ * smaller than the core, the data buffer cannot be grown, the data is
+ * truncated, or the core's lengths do not fit inside the data. */
+int bam_read1(bamFile fp, bam1_t *b)
+{
+	bam1_core_t *c = &b->core;
+	int32_t block_len, data_len, ret, i;
+	uint32_t x[8];
+
+	assert(BAM_CORE_SIZE == 32);
+	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+		if (ret == 0) return -1; // normal end-of-file
+		else return -2; // truncated
+	}
+	if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
+	if (bam_is_be) {
+		bam_swap_endian_4p(&block_len);
+		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+	}
+	// a block shorter than the core it has to contain is corrupt and would
+	// otherwise yield a negative data length
+	if (block_len < BAM_CORE_SIZE) return -4;
+	c->tid = x[0]; c->pos = x[1];
+	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+	c->l_qseq = x[4];
+	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+	data_len = block_len - BAM_CORE_SIZE;
+	if (b->m_data < data_len) {
+		// grow through temporaries so a failed realloc() leaves the old
+		// buffer and its recorded capacity untouched
+		uint32_t new_size = data_len;
+		uint8_t *grown;
+		kroundup32(new_size);
+		grown = (uint8_t*)realloc(b->data, new_size);
+		if (grown == 0) return -4;
+		b->data = grown;
+		b->m_data = new_size;
+	}
+	b->data_len = data_len;
+	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+	// qname + CIGAR + sequence + qualities must fit inside the data just read
+	if (c->l_qseq < 0 || b->l_aux < 0) return -4;
+	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
+	if (bam_no_B) bam_remove_B(b);
+	return 4 + block_len;
+}
+
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
+{
+ uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y;
+ int i;
+ assert(BAM_CORE_SIZE == 32);
+ x[0] = c->tid;
+ x[1] = c->pos;
+ x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
+ x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
+ x[4] = c->l_qseq;
+ x[5] = c->mtid;
+ x[6] = c->mpos;
+ x[7] = c->isize;
+ bgzf_flush_try(fp, 4 + block_len);
+ if (bam_is_be) {
+ for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+ y = block_len;
+ bam_write(fp, bam_swap_endian_4p(&y), 4);
+ swap_endian_data(c, data_len, data);
+ } else bam_write(fp, &block_len, 4);
+ bam_write(fp, x, BAM_CORE_SIZE);
+ bam_write(fp, data, data_len);
+ if (bam_is_be) swap_endian_data(c, data_len, data);
+ return 4 + block_len;
+}
+
+int bam_write1(bamFile fp, const bam1_t *b)
+{
+ return bam_write1_core(fp, &b->core, b->data_len, b->data);
+}
+
+char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
+{
+ uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
+ int i;
+ const bam1_core_t *c = &b->core;
+ kstring_t str;
+ str.l = str.m = 0; str.s = 0;
+
+ kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str);
+ if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); }
+ else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
+ else { // BAM_OFSTR
+ for (i = 0; i < 16; ++i)
+ if ((c->flag & 1<<i) && bam_flag2char_table[i])
+ kputc(bam_flag2char_table[i], &str);
+ kputc('\t', &str);
+ }
+ if (c->tid < 0) kputsn("*\t", 2, &str);
+ else {
+ if (header) kputs(header->target_name[c->tid] , &str);
+ else kputw(c->tid, &str);
+ kputc('\t', &str);
+ }
+ kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
+ if (c->n_cigar == 0) kputc('*', &str);
+ else {
+ uint32_t *cigar = bam1_cigar(b);
+ for (i = 0; i < c->n_cigar; ++i) {
+ kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
+ kputc(bam_cigar_opchr(cigar[i]), &str);
+ }
+ }
+ kputc('\t', &str);
+ if (c->mtid < 0) kputsn("*\t", 2, &str);
+ else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
+ else {
+ if (header) kputs(header->target_name[c->mtid], &str);
+ else kputw(c->mtid, &str);
+ kputc('\t', &str);
+ }
+ kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
+ if (c->l_qseq) {
+ for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
+ kputc('\t', &str);
+ if (t[0] == 0xff) kputc('*', &str);
+ else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+ } else kputsn("*\t*", 3, &str);
+ s = bam1_aux(b);
+ while (s < b->data + b->data_len) {
+ uint8_t type, key[2];
+ key[0] = s[0]; key[1] = s[1];
+ s += 2; type = *s; ++s;
+ kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
+ if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
+ else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }
+ else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }
+ else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; }
+ else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; }
+ else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; }
+ else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
+ else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
+ else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
+ else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
+ else if (type == 'B') {
+ uint8_t sub_type = *(s++);
+ int32_t n;
+ memcpy(&n, s, 4);
+ s += 4; // no point to the start of the array
+ kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing
+ for (i = 0; i < n; ++i) {
+ kputc(',', &str);
+ if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
+ else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
+ else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; }
+ else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; }
+ else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; }
+ else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; }
+ else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; }
+ }
+ }
+ }
+ return str.s;
+}
+
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
+{
+ return bam_format1_core(header, b, BAM_OFDEC);
+}
+
+void bam_view1(const bam_header_t *header, const bam1_t *b)
+{
+ char *s = bam_format1(header, b);
+ puts(s);
+ free(s);
+}
+
+int bam_validate1(const bam_header_t *header, const bam1_t *b)
+{
+ char *s;
+
+ if (b->core.tid < -1 || b->core.mtid < -1) return 0;
+ if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0;
+
+ if (b->data_len < b->core.l_qname) return 0;
+ s = memchr(bam1_qname(b), '\0', b->core.l_qname);
+ if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0;
+
+ // FIXME: Other fields could also be checked, especially the auxiliary data
+
+ return 1;
+}
+
+// FIXME: we should also check the LB tag associated with each alignment
+const char *bam_get_library(bam_header_t *h, const bam1_t *b)
+{
+ const uint8_t *rg;
+ if (h->dict == 0) h->dict = sam_header_parse2(h->text);
+ if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
+ rg = bam_aux_get(b, "RG");
+ return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1));
+}
+
+/************
+ * Remove B *
+ ************/
+
+int bam_remove_B(bam1_t *b)
+{
+ int i, j, end_j, k, l, no_qual;
+ uint32_t *cigar, *new_cigar;
+ uint8_t *seq, *qual, *p;
+ // test if removal is necessary
+ if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
+ cigar = bam1_cigar(b);
+ for (k = 0; k < b->core.n_cigar; ++k)
+ if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
+ if (k == b->core.n_cigar) return 0; // no 'B'
+ if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
+ // allocate memory for the new CIGAR
+ if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
+ b->m_data = b->data_len + b->core.n_cigar * 4;
+ kroundup32(b->m_data);
+ b->data = (uint8_t*)realloc(b->data, b->m_data);
+ cigar = bam1_cigar(b); // after realloc, cigar may be changed
+ }
+ new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
+ // the core loop
+ seq = bam1_seq(b); qual = bam1_qual(b);
+ no_qual = (qual[0] == 0xff); // test whether base quality is available
+ i = j = 0; end_j = -1;
+ for (k = l = 0; k < b->core.n_cigar; ++k) {
+ int op = bam_cigar_op(cigar[k]);
+ int len = bam_cigar_oplen(cigar[k]);
+ if (op == BAM_CBACK) { // the backward operation
+ int t, u;
+ if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
+ if (len > j) goto rmB_err; // an excessively long backward
+ for (t = l - 1, u = 0; t >= 0; --t) { // look back
+ int op1 = bam_cigar_op(new_cigar[t]);
+ int len1 = bam_cigar_oplen(new_cigar[t]);
+ if (bam_cigar_type(op1)&1) { // consume the query
+ if (u + len1 >= len) { // stop
+ new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
+ break;
+ } else u += len1;
+ }
+ }
+ if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
+ l = t + 1;
+ end_j = j; j -= len;
+ } else { // other CIGAR operations
+ new_cigar[l++] = cigar[k];
+ if (bam_cigar_type(op)&1) { // consume the query
+ if (i != j) { // no need to copy if i == j
+ int u, c, c0;
+ for (u = 0; u < len; ++u) { // construct the consensus
+ c = bam1_seqi(seq, i+u);
+ if (j + u < end_j) { // in an overlap
+ c0 = bam1_seqi(seq, j+u);
+ if (c != c0) { // a mismatch; choose the better base
+ if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
+ bam1_seq_seti(seq, j+u, c);
+ qual[j+u] = qual[i+u] - qual[j+u];
+ } else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
+ } else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
+ } else { // not in an overlap; copy over
+ bam1_seq_seti(seq, j+u, c);
+ qual[j+u] = qual[i+u];
+ }
+ }
+ }
+ i += len, j += len;
+ }
+ }
+ }
+ if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
+ // merge adjacent operations if possible
+ for (k = 1; k < l; ++k)
+ if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
+ new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
+ // kill zero length operations
+ for (k = i = 0; k < l; ++k)
+ if (new_cigar[k] >> BAM_CIGAR_SHIFT)
+ new_cigar[i++] = new_cigar[k];
+ l = i;
+ // update b
+ memcpy(cigar, new_cigar, l * 4); // set CIGAR
+ p = b->data + b->core.l_qname + l * 4;
+ memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
+ memmove(p, qual, j); p += j; // set QUAL
+ memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields
+ b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
+ b->data_len = p - b->data; // update record length
+ return 0;
+
+rmB_err:
+ b->core.flag |= BAM_FUNMAP;
+ return -1;
+}
diff --git a/bam_aux.c b/bam_aux.c
new file mode 100644
index 0000000..28b22e3
--- /dev/null
+++ b/bam_aux.c
@@ -0,0 +1,213 @@
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+/* Alias for char*, used as the value type of the "r2l" hash map below. */
+typedef char *str_p;
+/* Instantiate khash map type "s": C-string keys -> int values.
+ * In this file it backs header->hash (target name -> target index). */
+KHASH_MAP_INIT_STR(s, int)
+/* Instantiate khash map type "r2l": C-string keys -> str_p values.
+ * NOTE(review): no user of "r2l" is visible in this file section. */
+KHASH_MAP_INIT_STR(r2l, str_p)
+
+/*
+ * Append one auxiliary field to the end of the record's data buffer.
+ *
+ * The field is laid out as: tag[0], tag[1], type, followed by `len` bytes
+ * copied from `data`.  b->data_len and b->l_aux both grow by 3 + len, and
+ * the buffer is enlarged (capacity rounded up by kroundup32) when needed.
+ */
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+    const int field_size = 3 + len;   /* 2-byte tag + 1-byte type + payload */
+    const int tail = b->data_len;     /* offset where the new field starts */
+    uint8_t *dst;
+
+    b->data_len += field_size;
+    b->l_aux += field_size;
+    if (b->m_data < b->data_len) {
+        /* grow the buffer so that the new field fits */
+        b->m_data = b->data_len;
+        kroundup32(b->m_data);
+        b->data = (uint8_t*)realloc(b->data, b->m_data);
+    }
+    dst = b->data + tail;
+    dst[0] = tag[0];
+    dst[1] = tag[1];
+    dst[2] = type;
+    memcpy(dst + 3, data, len);
+}
+
+/*
+ * Thin wrapper: looks up the auxiliary field `tag` in record `b` by
+ * delegating to bam_aux_get() and returns whatever that call returns.
+ */
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
+{
+    uint8_t *field = bam_aux_get(b, tag);
+    return field;
+}
+
+/*
+ * __skip_tag(s): advance the pointer `s` past one auxiliary value.
+ *
+ * On entry `s` must point at the type byte of a field (i.e. just after the
+ * two-byte tag).  The macro steps over the type byte and then over the value:
+ *   - 'Z' / 'H': skips up to and including the terminating NUL byte;
+ *   - 'B'      : skips 1 subtype byte + a 4-byte element count + count
+ *                elements of bam_aux_type2size(subtype) bytes each
+ *                (the count is read through an int32_t* cast at s+1);
+ *   - otherwise: skips bam_aux_type2size(type) bytes.
+ * The type byte is upper-cased with toupper() before it is compared and
+ * before it is passed to bam_aux_type2size().
+ *
+ * `s` is evaluated and modified several times, so the argument must be a
+ * plain pointer variable without side effects.
+ */
+#define __skip_tag(s) do { \
+ int type = toupper(*(s)); \
+ ++(s); \
+ if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \
+ else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \
+ else (s) += bam_aux_type2size(type); \
+ } while(0)
+
+/*
+ * Find the auxiliary field whose two-character tag equals `tag`.
+ *
+ * Scans the aux area of `b` from bam1_aux(b) up to the end of the record
+ * data.  Returns a pointer to the field's type byte (the byte right after
+ * the matching tag), or 0 when no field carries that tag.
+ */
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+{
+    const int wanted = tag[0]<<8 | tag[1];
+    uint8_t *cur = bam1_aux(b);
+    for (;;) {
+        int key;
+        if (!(cur < b->data + b->data_len)) return 0; /* ran off the end: not found */
+        key = (int)cur[0]<<8 | cur[1];
+        cur += 2;                      /* now at the type byte */
+        if (key == wanted) break;
+        __skip_tag(cur);               /* step over type byte and value */
+    }
+    return cur;
+}
+/*
+ * Remove one auxiliary field from record `b`.
+ *
+ * `s` MUST be a pointer previously returned by bam_aux_get(), i.e. it points
+ * at the field's type byte.  The bytes following the field are moved down
+ * over it and b->data_len / b->l_aux shrink by the field's total size
+ * (tag + type + value).  Always returns 0.
+ */
+int bam_aux_del(bam1_t *b, uint8_t *s)
+{
+    uint8_t *const aux_begin = bam1_aux(b);
+    uint8_t *const field_begin = s - 2;    /* back up over the 2-byte tag */
+    int removed;
+
+    __skip_tag(s);                          /* s now points just past the field */
+    removed = (int)(s - field_begin);
+    memmove(field_begin, s, b->l_aux - (s - aux_begin));
+    b->data_len -= removed;
+    b->l_aux -= removed;
+    return 0;
+}
+
+/*
+ * Drop every auxiliary field of `b` except the one `s` points into.
+ *
+ * When `s` is non-null it is treated like a pointer returned by
+ * bam_aux_get(): that single field (tag + type + value) is moved to the
+ * start of the aux area and becomes the only aux field.  When `s` is null
+ * all aux fields are dropped.  b->data_len and b->l_aux are updated
+ * accordingly.  Always returns 0.
+ */
+int bam_aux_drop_other(bam1_t *b, uint8_t *s)
+{
+    int kept = 0;                           /* bytes of aux data that survive */
+    if (s != 0) {
+        uint8_t *const aux_begin = bam1_aux(b);
+        uint8_t *const field_begin = s - 2; /* include the 2-byte tag */
+        __skip_tag(s);
+        kept = (int)(s - field_begin);
+        memmove(aux_begin, field_begin, kept);
+    }
+    b->data_len -= b->l_aux - kept;
+    b->l_aux = kept;
+    return 0;
+}
+
+/*
+ * Lazily build header->hash, a map from target name to target index.
+ *
+ * Does nothing when header->hash is already set.  The map keys are the
+ * header->target_name[] pointers themselves (the strings are not copied).
+ */
+void bam_init_header_hash(bam_header_t *header)
+{
+    khash_t(s) *table;
+    int absent, tid;
+
+    if (header->hash != 0) return;          /* already initialised */
+    table = kh_init(s);
+    header->hash = table;
+    tid = 0;
+    while (tid < header->n_targets) {
+        khiter_t slot = kh_put(s, table, header->target_name[tid], &absent);
+        kh_value(table, slot) = tid;
+        ++tid;
+    }
+}
+
+/*
+ * Release the name->index map created by bam_init_header_hash().
+ *
+ * The pointer is reset to 0 after the map is destroyed so that a second
+ * call is a harmless no-op (instead of a double free) and so that a later
+ * bam_init_header_hash() rebuilds the map rather than keeping a dangling
+ * pointer.
+ */
+void bam_destroy_header_hash(bam_header_t *header)
+{
+    if (header->hash) {
+        kh_destroy(s, (khash_t(s)*)header->hash);
+        header->hash = 0;
+    }
+}
+
+/*
+ * Look up the target index of `seq_name` in header->hash.
+ *
+ * Returns the stored index, or -1 when the name is not in the map.
+ * header->hash is used as is; it is not built here.
+ */
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
+{
+    khash_t(s) *table = (khash_t(s)*)header->hash;
+    khint_t slot = kh_get(s, table, seq_name);
+    if (slot == kh_end(table)) return -1;
+    return kh_value(table, slot);
+}
+
+/*
+ * Parse a region string such as "name", "name:beg" or "name:beg-end".
+ *
+ * White space is removed first and commas inside the numeric part are
+ * ignored.  The last ':' separates the sequence name from the interval; if
+ * what follows it is not a well-formed interval, or the name before it is
+ * unknown, the whole input string is tried as the sequence name instead.
+ *
+ * Outputs:
+ *   *ref_id  index of the sequence in the header (from header->hash)
+ *   *beg     start; a positive value from the string is decremented by one,
+ *            0 when no interval is given
+ *   *end     end as written in the string, 1<<29 when it is omitted
+ * All three are set to -1 before parsing starts.
+ *
+ * Returns 0 on success and -1 when the sequence name cannot be resolved,
+ * when memory cannot be allocated, or when *beg > *end.
+ */
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
+{
+    char *s;
+    int i, l, k, name_end;
+    khiter_t iter;
+    khash_t(s) *h;
+
+    bam_init_header_hash(header);
+    h = (khash_t(s)*)header->hash;
+
+    *ref_id = *beg = *end = -1;
+    name_end = l = strlen(str);
+    s = (char*)malloc(l+1);
+    if (s == 0) return -1;
+    // remove space (cast: ctype functions require an unsigned char value)
+    for (i = k = 0; i < l; ++i)
+        if (!isspace((unsigned char)str[i])) s[k++] = str[i];
+    s[k] = 0; l = k;
+    // determine the sequence name
+    for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+    if (i >= 0) name_end = i;
+    if (name_end < l) { // check if this is really the end
+        int n_hyphen = 0;
+        for (i = name_end + 1; i < l; ++i) {
+            if (s[i] == '-') ++n_hyphen;
+            else if (!isdigit((unsigned char)s[i]) && s[i] != ',') break;
+        }
+        if (i < l || n_hyphen > 1) name_end = l; // malformatted region string; then take str as the name
+        s[name_end] = 0;
+        iter = kh_get(s, h, s);
+        if (iter == kh_end(h)) { // cannot find the sequence name
+            iter = kh_get(s, h, str); // try str as the name
+            if (iter == kh_end(h)) {
+                if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
+                free(s); return -1;
+            } else s[name_end] = ':', name_end = l;
+        }
+    } else {
+        iter = kh_get(s, h, str);
+        // Without this check an unknown name would read kh_val() at kh_end(h),
+        // i.e. past the last valid bucket.
+        if (iter == kh_end(h)) {
+            if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
+            free(s); return -1;
+        }
+    }
+    *ref_id = kh_val(h, iter);
+    // parse the interval
+    if (name_end < l) {
+        // squeeze the commas out of the numeric part
+        for (i = k = name_end + 1; i < l; ++i)
+            if (s[i] != ',') s[k++] = s[i];
+        s[k] = 0;
+        *beg = atoi(s + name_end + 1);
+        for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+        *end = i < k? atoi(s + i + 1) : 1<<29;
+        if (*beg > 0) --*beg;
+    } else *beg = 0, *end = 1<<29;
+    free(s);
+    return *beg <= *end? 0 : -1;
+}
+
+/*
+ * Decode an integer auxiliary value.
+ *
+ * `s` points at the type byte of the field (as returned by bam_aux_get()).
+ * Types 'c', 'C', 's', 'S', 'i' and 'I' are read from the bytes that follow
+ * and widened to int32_t ('I' is read through an int32_t pointer as well).
+ * Returns 0 for a null pointer or any other type.
+ */
+int32_t bam_aux2i(const uint8_t *s)
+{
+    const uint8_t *payload;
+    if (s == 0) return 0;
+    payload = s + 1;
+    switch (*s) {
+    case 'c': return (int32_t)*(int8_t*)payload;
+    case 'C': return (int32_t)*(uint8_t*)payload;
+    case 's': return (int32_t)*(int16_t*)payload;
+    case 'S': return (int32_t)*(uint16_t*)payload;
+    case 'i':
+    case 'I': return *(int32_t*)payload;
+    default:  return 0;
+    }
+}
+
+/*
+ * Decode a single-precision float auxiliary value.
+ *
+ * `s` points at the type byte of the field.  Returns the float stored after
+ * an 'f' type byte, and 0.0 for a null pointer or any other type.
+ *
+ * The null check is done before the pointer is dereferenced, and the value
+ * is copied out with memcpy because the payload sits at an arbitrary byte
+ * offset in the record.
+ */
+float bam_aux2f(const uint8_t *s)
+{
+    float value;
+    if (s == 0) return 0.0;
+    if (*s != 'f') return 0.0;
+    memcpy(&value, s + 1, sizeof(value));
+    return value;
+}
+
+/*
+ * Decode a double-precision auxiliary value.
+ *
+ * `s` points at the type byte of the field.  Returns the double stored after
+ * a 'd' type byte, and 0.0 for a null pointer or any other type.
+ *
+ * The null check is done before the pointer is dereferenced, and the value
+ * is copied out with memcpy because the payload sits at an arbitrary byte
+ * offset in the record.
+ */
+double bam_aux2d(const uint8_t *s)
+{
+    double value;
+    if (s == 0) return 0.0;
+    if (*s != 'd') return 0.0;
+    memcpy(&value, s + 1, sizeof(value));
+    return value;
+}
+
+/*
+ * Decode a single-character auxiliary value.
+ *
+ * `s` points at the type byte of the field.  Returns the character stored
+ * after an 'A' type byte, and 0 for a null pointer or any other type.
+ * The null check is done before the pointer is dereferenced.
+ */
+char bam_aux2A(const uint8_t *s)
+{
+    if (s == 0) return 0;
+    if (*s != 'A') return 0;
+    return *(const char*)(s + 1);
+}
+
+/*
+ * Return the string payload of a 'Z' or 'H' auxiliary field.
+ *
+ * `s` points at the type byte of the field.  The returned pointer refers to
+ * the bytes right after the type byte inside the caller's buffer (nothing
+ * is copied).  Returns 0 for a null pointer or any other type.
+ * The null check is done before the pointer is dereferenced.
+ */
+char *bam_aux2Z(const uint8_t *s)
+{
+    if (s == 0) return 0;
+    if (*s != 'Z' && *s != 'H') return 0;
+    return (char*)(s + 1);
+}
+
+#ifdef _WIN32
+/*
+ * Replacement for drand48() on Windows builds, based on rand().
+ *
+ * Dividing by RAND_MAX + 1.0 keeps the result in the half-open interval
+ * [0, 1); dividing by RAND_MAX alone would return exactly 1.0 whenever
+ * rand() yields RAND_MAX.
+ */
+double drand48()
+{
+    return (double)rand() / ((double)RAND_MAX + 1.0);
+}
+#endif
diff --git a/bam_cat.c b/bam_cat.c
new file mode 100644
index 0000000..a7502b9
--- /dev/null
+++ b/bam_cat.c
@@ -0,0 +1,185 @@
+/*
+
+bam_cat -- efficiently concatenates bam files
+
+bam_cat can be used to concatenate BAM files. Under special
+circumstances, it can be used as an alternative to 'samtools merge' to
+concatenate multiple sorted files into a single sorted file. For this
+to work each file must be sorted, and the sorted files must be given
+as command line arguments in order such that the final read in file i
+is less than or equal to the first read in file i+1.
+
+This code is derived from the bam_reheader function in samtools 0.1.8
+and modified to perform concatenation by Chris Saunders on behalf of
+Illumina.
+
+
+########## License:
+
+The MIT License
+
+Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
+Modified SAMtools work copyright (c) 2010 Illumina, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+
+/*
+makefile:
+"""
+CC=gcc
+CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR)
+LDFLAGS+=-L$(SAMTOOLS_DIR)
+LDLIBS+=-lbam -lz
+
+all:bam_cat
+"""
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "knetfile.h"
+#include "bgzf.h"
+#include "bam.h"
+
+/* Size in bytes of the buffer bam_cat() uses to copy raw file data (64 KiB). */
+#define BUF_SIZE 0x10000
+
+/* Expected first two bytes of the trailing block (31 = 0x1f, 139 = 0x8b,
+ * the gzip magic numbers); bam_cat() checks ebuf[0] and ebuf[1] against them. */
+#define GZIPID1 31
+#define GZIPID2 139
+
+/* Number of trailing bytes bam_cat() holds back from every input file and
+ * validates as an empty BGZF block before dropping them. */
+#define BGZF_EMPTY_BLOCK_SIZE 28
+
+
+/*
+ * Concatenate `nfn` BAM files into `outbam` ("-" means stdout).
+ *
+ * When `h` is non-null it is written as the output header; otherwise the
+ * header of the first input file is used.  For every input, the header is
+ * read, any uncompressed data left in the current block is re-written
+ * through BGZF, and the rest of the file is copied as raw bytes.  The last
+ * BGZF_EMPTY_BLOCK_SIZE bytes of each input are held back in `ebuf`; they
+ * are dropped when they look like an empty gzip block, otherwise a warning
+ * is printed and they are written out as well.
+ *
+ * Returns 0 on success, 1 when the output cannot be opened and -1 on any
+ * other error.  All handles and buffers are released on every path (the
+ * output is therefore closed even when an input fails).
+ */
+int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
+{
+    BGZF *fp, *in = 0;
+    bam_header_t *old = 0;
+    FILE* fp_file;
+    uint8_t *buf;
+    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
+    const int es=BGZF_EMPTY_BLOCK_SIZE;
+    int i, ret = 0;
+
+    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
+    if (fp == 0) {
+        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
+        return 1;
+    }
+    if (h) bam_header_write(fp, h);
+
+    buf = (uint8_t*) malloc(BUF_SIZE);
+    if (buf == 0) {
+        fprintf(stderr, "[%s] ERROR: fail to allocate the copy buffer.\n", __func__);
+        bgzf_close(fp);
+        return -1;
+    }
+    for(i = 0; i < nfn; ++i){
+        int len,j;
+
+        in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r");
+        if (in == 0) {
+            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+            ret = -1; goto cleanup;
+        }
+        if (in->is_write) { ret = -1; goto cleanup; }
+
+        old = bam_header_read(in);
+        if (old == 0) { // defensive: never hand a null header to bam_header_write()
+            fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, fn[i]);
+            ret = -1; goto cleanup;
+        }
+        if (h == 0 && i == 0) bam_header_write(fp, old);
+
+        // flush what is left of the block that contained the end of the header
+        if (in->block_offset < in->block_length) {
+            bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
+            bgzf_flush(fp);
+        }
+
+        j=0; // becomes 1 once ebuf holds the tail of the data read so far
+#ifdef _USE_KNETFILE
+        fp_file = fp->fp;
+        while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) {
+#else
+        fp_file = fp->fp;
+        while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
+#endif
+            if(len<es){
+                int diff=es-len;
+                if(j==0) {
+                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
+                    ret = -1; goto cleanup;
+                }
+                // emit the oldest `len` held-back bytes, then slide the window
+                fwrite(ebuf, 1, len, fp_file);
+                memmove(ebuf,ebuf+len,diff); // source and destination may overlap
+                memcpy(ebuf+diff,buf,len);
+            } else {
+                if(j!=0) fwrite(ebuf, 1, es, fp_file);
+                len-= es;
+                memcpy(ebuf,buf+len,es);
+                fwrite(buf, 1, len, fp_file);
+            }
+            j=1;
+        }
+
+        /* check final gzip block; skipped when no raw bytes were read at all,
+           because ebuf then holds nothing to validate or to write */
+        if (j != 0) {
+            const uint8_t gzip1=ebuf[0];
+            const uint8_t gzip2=ebuf[1];
+            uint32_t isize;
+            memcpy(&isize, ebuf+es-4, sizeof(isize)); // ebuf has no alignment guarantee
+            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
+                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
+                fprintf(stderr, " Possible output corruption.\n");
+                fwrite(ebuf, 1, es, fp_file);
+            }
+        }
+        bam_header_destroy(old); old = 0;
+        bgzf_close(in); in = 0;
+    }
+
+cleanup:
+    if (old) bam_header_destroy(old);
+    if (in) bgzf_close(in);
+    free(buf);
+    bgzf_close(fp);
+    return ret;
+}
+
+
+
+/*
+ * Command-line entry point of "samtools cat".
+ *
+ * Options:
+ *   -h FILE  read the output header from the SAM file FILE
+ *   -o FILE  write to FILE instead of stdout
+ * At least two input BAM files are required.
+ *
+ * Returns 1 on a usage error or when the header file cannot be opened,
+ * otherwise the return value of bam_cat().  The header and the copied
+ * output name are released on every path.
+ */
+int main_cat(int argc, char *argv[])
+{
+    bam_header_t *h = 0;
+    char *outfn = 0;
+    int c, ret;
+    while ((c = getopt(argc, argv, "h:o:")) >= 0) {
+        switch (c) {
+            case 'h': {
+                tamFile fph = sam_open(optarg);
+                if (fph == 0) {
+                    // report the file that was actually given to -h (not argv[1])
+                    fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
+                    free(outfn);
+                    if (h) bam_header_destroy(h);
+                    return 1;
+                }
+                if (h) bam_header_destroy(h); // -h given more than once
+                h = sam_header_read(fph);
+                sam_close(fph);
+                break;
+            }
+            case 'o': free(outfn); outfn = strdup(optarg); break;
+        }
+    }
+    if (argc - optind < 2) {
+        fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
+        ret = 1;
+    } else {
+        ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+    }
+    free(outfn);
+    if (h) bam_header_destroy(h);
+    return ret;
+}
diff --git a/bam_import.c b/bam_import.c
new file mode 100644
index 0000000..da2bf94
--- /dev/null
+++ b/bam_import.c
@@ -0,0 +1,489 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#ifdef _WIN32
+#include <fcntl.h>
+#endif
+#include "kstring.h"
+#include "bam.h"
+#include "sam_header.h"
+#include "kseq.h"
+#include "khash.h"
+
+/* Instantiate the kstream reader over gzFile handles, reading through
+ * gzread() with a 16384-byte buffer. */
+KSTREAM_INIT(gzFile, gzread, 16384)
+/* Instantiate khash map type "ref": C-string keys -> uint64_t values.
+ * sam_header_read2() packs (length << 32 | index) into the value. */
+KHASH_MAP_INIT_STR(ref, uint64_t)
+
+/* Helpers used below; forward-declared here. */
+void bam_init_header_hash(bam_header_t *header);
+void bam_destroy_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+/*
+ * Maps a character code (0-255) to a 4-bit nucleotide code.
+ *
+ * Entries other than 15: '0'..'3' -> 1,2,4,8; '=' -> 0; and, for both upper
+ * and lower case, A=1 C=2 M=3 G=4 R=5 S=6 V=7 T=8 W=9 Y=10 H=11 K=12 D=13
+ * B=14.  Every other character (including 'N') maps to 15.  The codes are
+ * the indices into bam_nt16_rev_table ("=ACMGRSVTWYHKDBN").
+ */
+unsigned char bam_nt16_table[256] = {
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+/*
+ * Maps a character code (0-255) to a BAM flag bit; used by sam_read1() to
+ * decode a FLAG column that is written as letters rather than as a number.
+ *
+ * Non-zero entries: '1' BAM_FREAD1, '2' BAM_FREAD2, 'P' BAM_FPROPER_PAIR,
+ * 'R' BAM_FMREVERSE, 'U' BAM_FMUNMAP, 'd' BAM_FDUP, 'f' BAM_FQCFAIL,
+ * 'p' BAM_FPAIRED, 'r' BAM_FREVERSE, 's' BAM_FSECONDARY, 'u' BAM_FUNMAP.
+ * Every other character contributes no flag (0).
+ */
+unsigned short bam_char2flag_table[256] = {
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
+ BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
+};
+
+/* Inverse of bam_nt16_table: the character at index i is the letter for the
+ * 4-bit nucleotide code i (0 '=', 1 'A', 2 'C', ... 15 'N'). */
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
+
+/* State of an open SAM text file (see sam_open() / sam_close()). */
+struct __tamFile_t {
+ /* zlib handle the text is read from */
+ gzFile fp;
+ /* kstream reader created over fp by sam_open() */
+ kstream_t *ks;
+ /* scratch string holding the most recently read field */
+ kstring_t *str;
+ /* number of lines consumed so far (incremented by sam_header_read() and sam_read1()) */
+ uint64_t n_lines;
+ /* set by sam_header_read(): str already holds the first field of the
+    first alignment line, so sam_read1() must not read a new one */
+ int is_first;
+};
+
+/*
+ * Read the lines of the (optionally gzip-compressed) text file `fn`
+ * ("-" means stdin) into a newly allocated array of newly allocated strings.
+ *
+ * A trailing '\r' is stripped from each line.  Reading stops at end of
+ * file or at the first empty line (ks_getuntil() must return > 0).
+ * The number of lines is stored in *_n.  Returns the array, or 0 with
+ * *_n == 0 when the file cannot be opened or holds no lines.
+ */
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
+{
+    char **list = 0, *s;
+    int n = 0, dret, m = 0;
+    gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+    kstream_t *ks;
+    kstring_t *str;
+    if (fp == 0) { // cannot open: report an empty list instead of reading through a null handle
+        *_n = 0;
+        return 0;
+    }
+    str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    ks = ks_init(fp);
+    while (ks_getuntil(ks, '\n', str, &dret) > 0) {
+        if (n == m) { // grow the array geometrically, starting at 16 slots
+            m = m? m << 1 : 16;
+            list = (char**)realloc(list, m * sizeof(char*));
+        }
+        if (str->s[str->l-1] == '\r')
+            str->s[--str->l] = '\0';
+        s = list[n++] = (char*)calloc(str->l + 1, 1);
+        strcpy(s, str->s);
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    *_n = n;
+    return list;
+}
+
+/*
+ * Build a BAM header from a "ref" hash whose values pack
+ * (sequence length << 32 | target index).
+ *
+ * The key pointers are stored directly in header->target_name[] (they are
+ * not copied).  The name->index hash of the new header is initialised
+ * before it is returned.
+ */
+static bam_header_t *hash2header(const kh_ref_t *hash)
+{
+    bam_header_t *header = bam_header_init();
+    khiter_t slot;
+
+    header->n_targets = kh_size(hash);
+    header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
+    header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
+    for (slot = kh_begin(hash); slot != kh_end(hash); ++slot) {
+        int tid;
+        if (!kh_exist(hash, slot)) continue;    /* empty or deleted bucket */
+        tid = (int)kh_value(hash, slot);        /* low 32 bits: target index */
+        header->target_name[tid] = (char*)kh_key(hash, slot);
+        header->target_len[tid] = kh_value(hash, slot)>>32; /* high 32 bits: length */
+    }
+    bam_init_header_hash(header);
+    return header;
+}
+/*
+ * Build a BAM header from a sequence list file ("-" means stdin).
+ *
+ * Each line supplies a sequence name in its first field and a length in its
+ * second field; the rest of the line is ignored.  Sequences are numbered in
+ * the order they appear.
+ *
+ * Returns the new header, or 0 when `fn` is null, the file cannot be opened
+ * or a sequence name occurs twice.  On the error path the temporary hash
+ * and the name copies it owns are released.
+ */
+bam_header_t *sam_header_read2(const char *fn)
+{
+    bam_header_t *header;
+    int c, dret, ret, error = 0;
+    gzFile fp;
+    kstream_t *ks;
+    kstring_t *str;
+    kh_ref_t *hash;
+    khiter_t k;
+    if (fn == 0) return 0;
+    fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+    if (fp == 0) return 0;
+    hash = kh_init(ref);
+    ks = ks_init(fp);
+    str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    while (ks_getuntil(ks, 0, str, &dret) > 0) {
+        char *s = strdup(str->s);
+        int len, i;
+        i = kh_size(hash);
+        ks_getuntil(ks, 0, str, &dret);
+        len = atoi(str->s);
+        k = kh_put(ref, hash, s, &ret);
+        if (ret == 0) {
+            fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s);
+            error = 1;
+            free(s); // the key was already present, so this copy is not owned by the hash
+        } else kh_value(hash, k) = (uint64_t)len<<32 | i;
+        if (dret != '\n') // skip the remainder of the line
+            while ((c = ks_getc(ks)) != '\n' && c != -1);
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+    if (error) {
+        // no header will take ownership of the names: free them with the hash
+        for (k = kh_begin(hash); k != kh_end(hash); ++k)
+            if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
+        kh_destroy(ref, hash);
+        return 0;
+    }
+    header = hash2header(hash);
+    kh_destroy(ref, hash);
+    return header;
+}
+/*
+ * Make sure b->data can hold at least `size` bytes and return b->data.
+ *
+ * The capacity b->m_data is rounded up with kroundup32() when the buffer
+ * has to grow.  Like append_text() in this file, the process is aborted
+ * when the reallocation fails, so callers never see a null buffer.
+ */
+static inline uint8_t *alloc_data(bam1_t *b, int size)
+{
+    if (b->m_data < size) {
+        uint8_t *grown;
+        b->m_data = size;
+        kroundup32(b->m_data);
+        grown = (uint8_t*)realloc(b->data, b->m_data);
+        if (grown == 0) {
+            fprintf(stderr, "[alloc_data] realloc failed to alloc %ld bytes\n", (long)b->m_data);
+            abort();
+        }
+        b->data = grown;
+    }
+    return b->data;
+}
+/*
+ * Report a fatal SAM parse error on stderr, quoting the line number and
+ * `msg`, then abort the process.  Does not return.
+ */
+static inline void parse_error(int64_t n_lines, const char * __restrict msg)
+{
+    const long long line_no = (long long)n_lines;
+    fprintf(stderr, "Parse error at line %lld: %s\n", line_no, msg);
+    abort();
+}
+/*
+ * Append str->l + 1 bytes of `str` (the text plus the delimiter byte the
+ * caller stored at str->s[str->l]) to header->text and NUL-terminate it.
+ *
+ * The text buffer is grown to the kroundup32()-rounded size when needed.
+ * Aborts the process when the reallocation fails or when the sanity check
+ * on the buffer size fails.
+ */
+static inline void append_text(bam_header_t *header, kstring_t *str)
+{
+    size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
+    kroundup32(x); kroundup32(y);
+    if (x < y)
+    {
+        header->n_text = y;
+        header->text = (char*)realloc(header->text, y);
+        if ( !header->text )
+        {
+            // size_t values are cast so that they match the %ld conversions
+            fprintf(stderr,"realloc failed to alloc %ld bytes\n", (long)y);
+            abort();
+        }
+    }
+    // Sanity check
+    if ( header->l_text+str->l+1 >= header->n_text )
+    {
+        fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", (long)(header->l_text+str->l+1),(long)header->n_text,(long)x,(long)y);
+        abort();
+    }
+    strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
+    header->l_text += str->l + 1;
+    header->text[header->l_text] = 0;
+}
+
+/*
+ * Rebuild h->target_name[] / h->target_len[] / h->n_targets from the header
+ * text.
+ *
+ * The previous arrays are freed first.  When the text is shorter than three
+ * bytes nothing else happens.  Otherwise the header dictionary is parsed
+ * (only if h->dict is not set yet) and the "SN" and "LN" values of the "SQ"
+ * lines are copied into the arrays.
+ *
+ * Returns the number of targets (0 when there are none).
+ */
+int sam_header_parse(bam_header_t *h)
+{
+    char **names, **lens;
+    int t;
+
+    free(h->target_len);
+    free(h->target_name);
+    h->n_targets = 0;
+    h->target_len = 0;
+    h->target_name = 0;
+    if (h->l_text < 3) return 0;
+    if (h->dict == 0) h->dict = sam_header_parse2(h->text);
+
+    /* sequence names */
+    names = sam_header2list(h->dict, "SQ", "SN", &h->n_targets);
+    if (h->n_targets == 0) return 0;
+    h->target_name = calloc(h->n_targets, sizeof(void*));
+    t = 0;
+    while (t < h->n_targets) {
+        h->target_name[t] = strdup(names[t]);
+        ++t;
+    }
+    free(names);
+
+    /* sequence lengths; note that this call stores its count in h->n_targets too */
+    lens = sam_header2list(h->dict, "SQ", "LN", &h->n_targets);
+    h->target_len = calloc(h->n_targets, 4);
+    t = 0;
+    while (t < h->n_targets) {
+        h->target_len[t] = atoi(lens[t]);
+        ++t;
+    }
+    free(lens);
+    return h->n_targets;
+}
+
+/*
+ * Read the '@' header lines of an open SAM file into a new header.
+ *
+ * Fields are read one at a time; as long as the first field of a line
+ * starts with '@', that field and the rest of its line are appended to the
+ * header text.  The first field that does not start with '@' stays in
+ * fp->str and fp->is_first is set, so that sam_read1() can pick it up.
+ * The header text is then parsed and the name hash initialised.
+ */
+bam_header_t *sam_header_read(tamFile fp)
+{
+    bam_header_t *header = bam_header_init();
+    kstring_t *field = fp->str;
+    int ret, dret;
+
+    for (;;) {
+        ret = ks_getuntil(fp->ks, KS_SEP_TAB, field, &dret);
+        if (ret < 0) break;                 // end of input
+        if (field->s[0] != '@') break;      // first alignment line reached
+        field->s[field->l] = dret; // note that field->s is NOT null terminated!!
+        append_text(header, field);
+        if (dret != '\n') {
+            // copy the remainder of the header line
+            ret = ks_getuntil(fp->ks, '\n', field, &dret);
+            field->s[field->l] = '\n'; // NOT null terminated!!
+            append_text(header, field);
+        }
+        ++fp->n_lines;
+    }
+    sam_header_parse(header);
+    bam_init_header_hash(header);
+    fp->is_first = 1;
+    return header;
+}
+
+/*
+ * Parse one alignment line of an open SAM file into the BAM record `b`.
+ *
+ * The mandatory columns (name, flag, reference, position, mapping quality,
+ * CIGAR, mate reference, mate position, insert size, sequence, quality) are
+ * read in order, followed by any TAG:TYPE:VALUE auxiliary fields, and packed
+ * into b->core and b->data.  If fp->is_first is set, the first column is
+ * taken from fp->str, where sam_header_read() left it.
+ *
+ * Returns the number of bytes consumed (each field's length plus one) on
+ * success, -1 at end of input, and -2 ... -6 when the line ends before the
+ * mapping quality, CIGAR, insert size, sequence or quality column
+ * respectively.  Malformed content is reported through parse_error(), which
+ * aborts the process; a missing header makes the function call exit(1).
+ */
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+    int ret, doff, doff0, dret, z = 0;
+    bam1_core_t *c = &b->core;
+    kstring_t *str = fp->str;
+    kstream_t *ks = fp->ks;
+
+    if (fp->is_first) {
+        // the first field is already in str (left there by sam_header_read())
+        fp->is_first = 0;
+        ret = str->l;
+    } else {
+        do { // special consideration for empty lines
+            ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+            if (ret >= 0) z += str->l + 1;
+        } while (ret == 0);
+    }
+    if (ret < 0) return -1;
+    ++fp->n_lines;
+    doff = 0; // write offset into b->data
+
+    { // name
+        c->l_qname = strlen(str->s) + 1;
+        memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
+        doff += c->l_qname;
+    }
+    { // flag
+        long flag;
+        char *s;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        flag = strtol((char*)str->s, &s, 0);
+        if (*s) { // not the end of the string
+            // letter-coded flag; index as unsigned char so bytes >= 0x80 stay inside the table
+            flag = 0;
+            for (s = str->s; *s; ++s)
+                flag |= bam_char2flag_table[(unsigned char)*s];
+        }
+        c->flag = flag;
+    }
+    { // tid, pos, qual
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+        if (c->tid < 0 && strcmp(str->s, "*")) {
+            if (header->n_targets == 0) {
+                fprintf(stderr, "[sam_read1] missing header? Abort!\n");
+                exit(1);
+            } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
+        }
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
+        if (ret < 0) return -2;
+    }
+    { // cigar
+        char *s, *t;
+        int i, op;
+        long x;
+        c->n_cigar = 0;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
+        z += str->l + 1;
+        if (str->s[0] != '*') {
+            uint32_t *cigar;
+            // first pass: count the operations and reject stray characters
+            for (s = str->s; *s; ++s) {
+                if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar;
+                else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
+            }
+            b->data = alloc_data(b, doff + c->n_cigar * 4);
+            cigar = bam1_cigar(b);
+            // second pass: encode every <length><op> pair
+            for (i = 0, s = str->s; i != c->n_cigar; ++i) {
+                x = strtol(s, &t, 10);
+                op = toupper(*t);
+                if (op == 'M') op = BAM_CMATCH;
+                else if (op == 'I') op = BAM_CINS;
+                else if (op == 'D') op = BAM_CDEL;
+                else if (op == 'N') op = BAM_CREF_SKIP;
+                else if (op == 'S') op = BAM_CSOFT_CLIP;
+                else if (op == 'H') op = BAM_CHARD_CLIP;
+                else if (op == 'P') op = BAM_CPAD;
+                else if (op == '=') op = BAM_CEQUAL;
+                else if (op == 'X') op = BAM_CDIFF;
+                else if (op == 'B') op = BAM_CBACK;
+                else parse_error(fp->n_lines, "invalid CIGAR operation");
+                s = t + 1;
+                cigar[i] = bam_cigar_gen(x, op);
+            }
+            if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
+            c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar));
+            doff += c->n_cigar * 4;
+        } else {
+            if (!(c->flag&BAM_FUNMAP)) {
+                fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
+                c->flag |= BAM_FUNMAP;
+            }
+            c->bin = bam_reg2bin(c->pos, c->pos + 1);
+        }
+    }
+    { // mtid, mpos, isize
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+        ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+        c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
+        if (ret < 0) return -4;
+    }
+    { // seq and qual
+        int i;
+        uint8_t *p = 0;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
+        z += str->l + 1;
+        if (strcmp(str->s, "*")) {
+            c->l_qseq = strlen(str->s);
+            if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
+                fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
+                        (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
+                parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+            }
+            p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+            memset(p, 0, (c->l_qseq+1)/2);
+            // two bases per byte, first base in the high nibble;
+            // index as unsigned char so bytes >= 0x80 stay inside the table
+            for (i = 0; i < c->l_qseq; ++i)
+                p[i/2] |= bam_nt16_table[(unsigned char)str->s[i]] << 4*(1-i%2);
+        } else c->l_qseq = 0;
+        if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
+        z += str->l + 1;
+        if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
+            parse_error(fp->n_lines, "sequence and quality are inconsistent");
+        p += (c->l_qseq+1)/2;
+        if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
+        else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
+        doff += c->l_qseq + (c->l_qseq+1)/2;
+    }
+    doff0 = doff; // start of the auxiliary data
+    if (dret != '\n' && dret != '\r') { // aux
+        while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
+            uint8_t *s, type, key[2];
+            z += str->l + 1;
+            if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
+                parse_error(fp->n_lines, "missing colon in auxiliary data");
+            key[0] = str->s[0]; key[1] = str->s[1];
+            type = str->s[3];
+            s = alloc_data(b, doff + 3) + doff;
+            s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
+            if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
+                s = alloc_data(b, doff + 2) + doff;
+                *s++ = 'A'; *s = str->s[5];
+                doff += 2;
+            } else if (type == 'I' || type == 'i') {
+                // stored with the smallest integer type that can hold the value
+                long long x;
+                s = alloc_data(b, doff + 5) + doff;
+                x = (long long)atoll(str->s + 5);
+                if (x < 0) {
+                    if (x >= -127) {
+                        *s++ = 'c'; *(int8_t*)s = (int8_t)x;
+                        s += 1; doff += 2;
+                    } else if (x >= -32767) {
+                        *s++ = 's'; *(int16_t*)s = (int16_t)x;
+                        s += 2; doff += 3;
+                    } else {
+                        *s++ = 'i'; *(int32_t*)s = (int32_t)x;
+                        s += 4; doff += 5;
+                        if (x < -2147483648ll)
+                            fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
+                                    (long long)fp->n_lines, x);
+                    }
+                } else {
+                    if (x <= 255) {
+                        *s++ = 'C'; *s++ = (uint8_t)x;
+                        doff += 2;
+                    } else if (x <= 65535) {
+                        *s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+                        s += 2; doff += 3;
+                    } else {
+                        *s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+                        s += 4; doff += 5;
+                        if (x > 4294967295ll)
+                            fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
+                                    (long long)fp->n_lines, x);
+                    }
+                }
+            } else if (type == 'f') {
+                s = alloc_data(b, doff + 5) + doff;
+                *s++ = 'f';
+                *(float*)s = (float)atof(str->s + 5);
+                s += 4; doff += 5;
+            } else if (type == 'd') {
+                // the value text starts at offset 5 ("XX:d:") and all 8 bytes
+                // of the double are stored
+                double d;
+                s = alloc_data(b, doff + 9) + doff;
+                *s++ = 'd';
+                d = atof(str->s + 5);
+                memcpy(s, &d, 8);
+                s += 8; doff += 9;
+            } else if (type == 'Z' || type == 'H') {
+                int size = 1 + (str->l - 5) + 1; // type byte + text + NUL
+                if (type == 'H') { // check whether the hex string is valid
+                    int i;
+                    if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+                    for (i = 0; i < str->l - 5; ++i) {
+                        int c = toupper(str->s[5 + i]);
+                        if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
+                            parse_error(fp->n_lines, "invalid hex character");
+                    }
+                }
+                s = alloc_data(b, doff + size) + doff;
+                *s++ = type;
+                memcpy(s, str->s + 5, str->l - 5);
+                s[str->l - 5] = 0;
+                doff += size;
+            } else if (type == 'B') {
+                int32_t n = 0, Bsize, k = 0, size;
+                char *p;
+                if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B");
+                Bsize = bam_aux_type2size(str->s[5]); // the size of each element
+                for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array
+                    if (*p == ',') ++n;
+                p = str->s + 7; // now p points to the first number in the array
+                size = 6 + Bsize * n; // total number of bytes allocated to this tag
+                s = alloc_data(b, doff + size) + doff; // allocate exactly the bytes written below
+                *s++ = 'B'; *s++ = str->s[5];
+                memcpy(s, &n, 4); s += 4; // write the number of elements
+                if (str->s[5] == 'c')      while (p < str->s + str->l) ((int8_t*)s)[k++]   = (int8_t)strtol(p, &p, 0),   ++p;
+                else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++]  = (uint8_t)strtol(p, &p, 0),  ++p;
+                else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++]  = (int16_t)strtol(p, &p, 0),  ++p; // FIXME: avoid unaligned memory
+                else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p;
+                else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++]  = (int32_t)strtol(p, &p, 0),  ++p;
+                else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p;
+                else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++]    = (float)strtod(p, &p),       ++p;
+                else parse_error(fp->n_lines, "unrecognized array type");
+                s += Bsize * n; doff += size;
+            } else parse_error(fp->n_lines, "unrecognized type");
+            if (dret == '\n' || dret == '\r') break;
+        }
+    }
+    b->l_aux = doff - doff0;
+    b->data_len = doff;
+    if (bam_no_B) bam_remove_B(b);
+    return z;
+}
+
+/*
+ * Open a SAM text file for reading ("-" means stdin).
+ *
+ * Returns a new handle with its gz stream, kstream reader and scratch
+ * string set up, or 0 when the file cannot be opened.
+ */
+tamFile sam_open(const char *fn)
+{
+    tamFile fp;
+    gzFile gzfp;
+
+    if (strcmp(fn, "-") == 0) gzfp = gzdopen(fileno(stdin), "rb");
+    else gzfp = gzopen(fn, "rb");
+    if (gzfp == 0) return 0;
+
+    fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+    fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
+    fp->fp = gzfp;
+    fp->ks = ks_init(gzfp);
+    return fp;
+}
+
+/*
+ * Close a handle obtained from sam_open() and free everything it owns.
+ * A null handle is ignored.
+ */
+void sam_close(tamFile fp)
+{
+    if (fp == 0) return;
+    ks_destroy(fp->ks);
+    gzclose(fp->fp);
+    free(fp->str->s);
+    free(fp->str);
+    free(fp);
+}
diff --git a/bam_index.c b/bam_index.c
new file mode 100644
index 0000000..d6b94e2
--- /dev/null
+++ b/bam_index.c
@@ -0,0 +1,724 @@
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+/*!
+ @header
+
+ Alignment indexing. Before indexing, BAM must be sorted based on the
+ leftmost coordinate of alignments. In indexing, BAM uses two indices:
+ a UCSC binning index and a simple linear index. The binning index is
+ efficient for alignments spanning long distance, while the auxiliary
+ linear index helps to reduce unnecessary seek calls especially for
+ short alignments.
+
+ The UCSC binning scheme was suggested by Richard Durbin and Lincoln
+ Stein and is explained by Kent et al. (2002). In this scheme, each bin
+ represents a contiguous genomic region which can be fully contained in
+ another bin; each alignment is associated with a bin which represents
+ the smallest region containing the entire alignment. The binning
+ scheme is essentially another representation of an R-tree. A distinct bin
+ uniquely corresponds to a distinct internal node in an R-tree. Bin A is
+ a child of Bin B if region A is contained in B.
+
+ In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
+ 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
+ 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
+ find the alignments overlapping a region [rbeg,rend), we need to
+ calculate the list of bins that may overlap the region and test
+ the alignments in those bins to confirm the overlaps. If the specified
+ region is short, typically only a few alignments in six bins need to
+ be retrieved. The overlapping alignments can be quickly fetched.
+
+ */
+
+// Under BAM_TRUE_OFFSET, two chunks whose gap is smaller than this many
+// offset units are merged into one (see merge_chunks/bam_iter_query).
+#define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
+#define BAM_LIDX_SHIFT 14
+
+// Key of the pseudo-bin that stores per-reference metadata instead of
+// ordinary chunks: list[0] = (first offset, last offset) and
+// list[1] = (number of mapped reads, number of unmapped reads).
+#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
+
+// A pair of 64-bit values; for ordinary bins u/v are the begin/end file
+// offsets of one chunk.
+typedef struct {
+ uint64_t u, v;
+} pair64_t;
+
+// Chunks are ordered by their begin offset only.
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(off, pair64_t, pair64_lt)
+
+// Growable chunk list of one bin: n entries in use out of m allocated.
+typedef struct {
+ uint32_t m, n;
+ pair64_t *list;
+} bam_binlist_t;
+
+// Linear index of one reference: offset[w] is the file offset recorded for
+// window w (windows are 1<<BAM_LIDX_SHIFT bp wide); n entries in use out
+// of m allocated. A zero entry means "not set".
+typedef struct {
+ int32_t n, m;
+ uint64_t *offset;
+} bam_lidx_t;
+
+// Hash table type "i": bin number -> bam_binlist_t.
+KHASH_MAP_INIT_INT(i, bam_binlist_t)
+
+// Whole index: n references, each with a binning index (index[tid]) and a
+// linear index (index2[tid]).
+struct __bam_index_t {
+ int32_t n;
+ uint64_t n_no_coor; // unmapped reads without coordinate
+ khash_t(i) **index;
+ bam_lidx_t *index2;
+};
+
+/*
+ * Append the chunk (beg, end) to the list stored under key `bin` in hash
+ * table `h`. The list is created with room for one chunk the first time
+ * the key is seen, and its capacity is doubled whenever it is full.
+ */
+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
+{
+    int absent;
+    khint_t slot = kh_put(i, h, bin, &absent);
+    bam_binlist_t *chunks = &kh_value(h, slot);
+    pair64_t *tail;
+    if (absent) { // the key was just inserted: start an empty list
+        chunks->m = 1;
+        chunks->n = 0;
+        chunks->list = (pair64_t*)calloc(chunks->m, 16);
+    }
+    if (chunks->n == chunks->m) { // no room left: double the capacity
+        chunks->m <<= 1;
+        chunks->list = (pair64_t*)realloc(chunks->list, chunks->m * 16);
+    }
+    tail = &chunks->list[chunks->n++];
+    tail->u = beg;
+    tail->v = end;
+}
+
+/*
+ * Record `offset` in the linear index `index2` for every window of
+ * 1<<BAM_LIDX_SHIFT bp touched by alignment `b`.
+ *
+ * Only windows that are still unset (zero) are written, so each window
+ * keeps the first offset that was recorded for it. The offset array is
+ * enlarged (and the new part zero-filled) when the alignment reaches
+ * beyond the allocated windows.
+ *
+ * The number of used windows never decreases: an alignment that ends
+ * earlier than a previously inserted one must not truncate the index.
+ */
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
+{
+    int i, beg, end;
+    beg = b->core.pos >> BAM_LIDX_SHIFT;
+    end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+    if (index2->m < end + 1) { // grow and zero-fill the newly added tail
+        int old_m = index2->m;
+        index2->m = end + 1;
+        kroundup32(index2->m);
+        index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+        memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
+    }
+    for (i = beg; i <= end; ++i)
+        if (index2->offset[i] == 0) index2->offset[i] = offset;
+    // keep the high-water mark; previously this was overwritten unconditionally
+    if (index2->n < end + 1) index2->n = end + 1;
+}
+
+/*
+ * Coalesce neighbouring chunks inside every ordinary bin of every
+ * reference, in place, shrinking each list to the merged count.
+ *
+ * The whole body is compiled only when BAM_TRUE_OFFSET or
+ * BAM_VIRTUAL_OFFSET16 is defined; otherwise the function does nothing.
+ * With BAM_TRUE_OFFSET a chunk is merged into the previous one when it
+ * starts less than BAM_MIN_CHUNK_GAP after the previous end; otherwise it
+ * is merged when both offsets agree in all bits above the low 16.
+ */
+static void merge_chunks(bam_index_t *idx)
+{
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+ khash_t(i) *index;
+ int i, l, m;
+ khint_t k;
+ for (i = 0; i < idx->n; ++i) {
+ index = idx->index[i];
+ for (k = kh_begin(index); k != kh_end(index); ++k) {
+ bam_binlist_t *p;
+ // skip empty buckets and the BAM_MAX_BIN pseudo-bin, whose list holds metadata rather than chunks
+ if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
+ p = &kh_value(index, k);
+ // m is the index of the last chunk kept so far
+ m = 0;
+ for (l = 1; l < p->n; ++l) {
+#ifdef BAM_TRUE_OFFSET
+ if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
+#else
+ if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
+#endif
+ else p->list[++m] = p->list[l];
+ } // ~for(l)
+ p->n = m + 1;
+ } // ~for(k)
+ } // ~for(i)
+#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+}
+
+/*
+ * In the linear index of every reference, replace each unset (zero)
+ * window after the first one with the value of the window before it.
+ */
+static void fill_missing(bam_index_t *idx)
+{
+    int tid;
+    for (tid = 0; tid < idx->n; ++tid) {
+        bam_lidx_t *lidx = idx->index2 + tid;
+        int w = 1;
+        while (w < lidx->n) {
+            if (lidx->offset[w] == 0)
+                lidx->offset[w] = lidx->offset[w-1];
+            ++w;
+        }
+    }
+}
+
+/*
+ * Build an in-memory index for the BAM stream `fp`.
+ *
+ * The header is read first, then every record is scanned once. Records
+ * must be sorted by reference and leftmost coordinate, with records that
+ * have no reference (tid < 0) at the end.
+ *
+ * For every reference the function fills
+ *   - the binning index (chunks per bin, via insert_offset),
+ *   - the linear index (via insert_offset2, mapped reads only),
+ *   - a BAM_MAX_BIN pseudo-bin with (first offset, last offset) and
+ *     (mapped count, unmapped count).
+ *
+ * Returns the new index, or NULL when the header is invalid, the input is
+ * not sorted, or the file offset does not advance. On every failure path
+ * the partially built index and the record buffer are released.
+ */
+bam_index_t *bam_index_core(bamFile fp)
+{
+    void bam_index_destroy(bam_index_t *idx); // defined further down in this file
+    bam1_t *b;
+    bam_header_t *h;
+    int i, ret;
+    bam_index_t *idx;
+    uint32_t last_bin, save_bin;
+    int32_t last_coor, last_tid, save_tid;
+    bam1_core_t *c;
+    uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
+
+    h = bam_header_read(fp);
+    if(h == 0) {
+        fprintf(stderr, "[bam_index_core] Invalid BAM header.");
+        return NULL;
+    }
+
+    idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+    b = (bam1_t*)calloc(1, sizeof(bam1_t));
+    c = &b->core;
+
+    idx->n = h->n_targets;
+    bam_header_destroy(h);
+    idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+    for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
+    idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+
+    save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
+    save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+    n_mapped = n_unmapped = n_no_coor = off_end = 0;
+    off_beg = off_end = bam_tell(fp);
+    while ((ret = bam_read1(fp, b)) >= 0) {
+        if (c->tid < 0) ++n_no_coor;
+        if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
+            last_tid = c->tid;
+            last_bin = 0xffffffffu;
+        } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
+            fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
+                    bam1_qname(b), last_tid+1, c->tid+1);
+            goto fail;
+        } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
+            fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
+                    bam1_qname(b), last_coor, c->pos, c->tid+1);
+            goto fail;
+        }
+        if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+        if (c->bin != last_bin) { // then possibly write the binning index
+            if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+                insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
+            if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
+                off_end = last_off;
+                insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
+                insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+                n_mapped = n_unmapped = 0;
+                off_beg = off_end;
+            }
+            save_off = last_off;
+            save_bin = last_bin = c->bin;
+            save_tid = c->tid;
+            if (save_tid < 0) break; // reached the reads without coordinates
+        }
+        if (bam_tell(fp) <= last_off) {
+            fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
+                    (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
+            goto fail;
+        }
+        if (c->flag & BAM_FUNMAP) ++n_unmapped;
+        else ++n_mapped;
+        last_off = bam_tell(fp);
+        last_coor = b->core.pos;
+    }
+    if (save_tid >= 0) { // flush the last bin and the last meta element
+        insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+        insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
+        insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+    }
+    merge_chunks(idx);
+    fill_missing(idx);
+    if (ret >= 0) { // the main loop stopped early: count the remaining coordinate-less reads
+        while ((ret = bam_read1(fp, b)) >= 0) {
+            ++n_no_coor;
+            if (c->tid >= 0 && n_no_coor) {
+                fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
+                goto fail;
+            }
+        }
+    }
+    if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
+    free(b->data); free(b);
+    idx->n_no_coor = n_no_coor;
+    return idx;
+
+fail: // release everything built so far instead of leaking it
+    free(b->data); free(b);
+    bam_index_destroy(idx);
+    return NULL;
+}
+
+/*
+ * Free an index and everything it owns: the chunk list of every bin, the
+ * per-reference hash tables, the linear-index arrays and the index
+ * structure itself. A null index is ignored.
+ */
+void bam_index_destroy(bam_index_t *idx)
+{
+    int tid;
+    if (idx == 0) return;
+    for (tid = 0; tid < idx->n; ++tid) {
+        khash_t(i) *bins = idx->index[tid];
+        khint_t it = kh_begin(bins);
+        while (it != kh_end(bins)) {
+            if (kh_exist(bins, it)) free(kh_value(bins, it).list);
+            ++it;
+        }
+        kh_destroy(i, bins);
+        free(idx->index2[tid].offset);
+    }
+    free(idx->index);
+    free(idx->index2);
+    free(idx);
+}
+
+/*
+ * Write index `idx` to the stdio stream `fp` and flush the stream.
+ *
+ * Layout written, in order:
+ *   - the 4 bytes "BAI\1",
+ *   - the number of references (4 bytes),
+ *   - for each reference: the number of bins, then for each bin its key,
+ *     its chunk count and its chunks (16 bytes each); then the number of
+ *     linear-index windows followed by the 8-byte offsets,
+ *   - the number of reads without coordinates (8 bytes).
+ *
+ * When bam_is_be is set, every value is byte-swapped before it is written;
+ * arrays are swapped in place, written, and swapped back so that `idx` is
+ * left unchanged. Return values of fwrite are not checked.
+ */
+void bam_index_save(const bam_index_t *idx, FILE *fp)
+{
+ int32_t i, size;
+ khint_t k;
+ fwrite("BAI\1", 1, 4, fp);
+ if (bam_is_be) {
+ uint32_t x = idx->n;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&idx->n, 4, 1, fp);
+ for (i = 0; i < idx->n; ++i) {
+ khash_t(i) *index = idx->index[i];
+ bam_lidx_t *index2 = idx->index2 + i;
+ // write binning index
+ size = kh_size(index);
+ if (bam_is_be) { // big endian
+ uint32_t x = size;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&size, 4, 1, fp);
+ for (k = kh_begin(index); k != kh_end(index); ++k) {
+ if (kh_exist(index, k)) {
+ bam_binlist_t *p = &kh_value(index, k);
+ if (bam_is_be) { // big endian
+ uint32_t x;
+ x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ // swap the chunk list in place for writing ...
+ for (x = 0; (int)x < p->n; ++x) {
+ bam_swap_endian_8p(&p->list[x].u);
+ bam_swap_endian_8p(&p->list[x].v);
+ }
+ fwrite(p->list, 16, p->n, fp);
+ // ... and swap it back so the in-memory index stays usable
+ for (x = 0; (int)x < p->n; ++x) {
+ bam_swap_endian_8p(&p->list[x].u);
+ bam_swap_endian_8p(&p->list[x].v);
+ }
+ } else {
+ fwrite(&kh_key(index, k), 4, 1, fp);
+ fwrite(&p->n, 4, 1, fp);
+ fwrite(p->list, 16, p->n, fp);
+ }
+ }
+ }
+ // write linear index (index2)
+ if (bam_is_be) {
+ int x = index2->n;
+ fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+ } else fwrite(&index2->n, 4, 1, fp);
+ if (bam_is_be) { // big endian
+ int x;
+ // same swap / write / swap-back scheme as for the chunk lists
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ fwrite(index2->offset, 8, index2->n, fp);
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ } else fwrite(index2->offset, 8, index2->n, fp);
+ }
+ { // write the number of reads coor-less records.
+ uint64_t x = idx->n_no_coor;
+ if (bam_is_be) bam_swap_endian_8p(&x);
+ fwrite(&x, 8, 1, fp);
+ }
+ fflush(fp);
+}
+
+/*
+ * Read an index from the stdio stream `fp`, which must be positioned at
+ * the start of the index data.
+ *
+ * The stream is expected to start with the 4 bytes "BAI\1", followed by
+ * the number of references and, per reference, the binning index and the
+ * linear index. A trailing 8-byte count of reads without coordinates is
+ * optional; it is taken as 0 when absent. Values are byte-swapped when
+ * bam_is_be is set.
+ *
+ * Returns the loaded index, or 0 when `fp` is null or the magic bytes are
+ * missing or wrong. The stream is never closed here: the caller owns `fp`
+ * and closes it in every case.
+ */
+static bam_index_t *bam_index_load_core(FILE *fp)
+{
+    int i;
+    char magic[4];
+    bam_index_t *idx;
+    if (fp == 0) {
+        fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+        return 0;
+    }
+    // a short read leaves magic[] partly uninitialised, so check the count first
+    if (fread(magic, 1, 4, fp) != 4 || strncmp(magic, "BAI\1", 4)) {
+        fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+        return 0; // fp stays open; closing it here too would be a double fclose
+    }
+    idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+    fread(&idx->n, 4, 1, fp);
+    if (bam_is_be) bam_swap_endian_4p(&idx->n);
+    idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+    idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+    for (i = 0; i < idx->n; ++i) {
+        khash_t(i) *index;
+        bam_lidx_t *index2 = idx->index2 + i;
+        uint32_t key, size;
+        khint_t k;
+        int j, ret;
+        bam_binlist_t *p;
+        index = idx->index[i] = kh_init(i);
+        // load binning index
+        fread(&size, 4, 1, fp);
+        if (bam_is_be) bam_swap_endian_4p(&size);
+        for (j = 0; j < (int)size; ++j) {
+            fread(&key, 4, 1, fp);
+            if (bam_is_be) bam_swap_endian_4p(&key);
+            k = kh_put(i, index, key, &ret);
+            p = &kh_value(index, k);
+            fread(&p->n, 4, 1, fp);
+            if (bam_is_be) bam_swap_endian_4p(&p->n);
+            p->m = p->n;
+            p->list = (pair64_t*)malloc(p->m * 16);
+            fread(p->list, 16, p->n, fp);
+            if (bam_is_be) {
+                int x;
+                for (x = 0; x < p->n; ++x) {
+                    bam_swap_endian_8p(&p->list[x].u);
+                    bam_swap_endian_8p(&p->list[x].v);
+                }
+            }
+        }
+        // load linear index
+        fread(&index2->n, 4, 1, fp);
+        if (bam_is_be) bam_swap_endian_4p(&index2->n);
+        index2->m = index2->n;
+        index2->offset = (uint64_t*)calloc(index2->m, 8);
+        fread(index2->offset, index2->n, 8, fp);
+        if (bam_is_be)
+            for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+    }
+    // older index files end here; treat a missing trailer as "no such reads"
+    if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
+    if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
+    return idx;
+}
+
+/*
+ * Load the index that belongs to BAM file `_fn` from the local file
+ * system.
+ *
+ * For an ftp:// or http:// name only the part after the last '/' is used,
+ * i.e. the index is looked up in the current working directory. Two names
+ * are tried in order: "<name>.bai", and, when <name> ends in "bam", the
+ * same name with its last character replaced by 'i' ("x.bam" -> "x.bai").
+ *
+ * Returns the loaded index, or 0 when no index file can be opened or its
+ * content is rejected by bam_index_load_core().
+ */
+bam_index_t *bam_index_load_local(const char *_fn)
+{
+    FILE *fp;
+    char *fnidx, *fn;
+    size_t len;
+
+    if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
+        const char *p;
+        int l = strlen(_fn);
+        for (p = _fn + l - 1; p >= _fn; --p)
+            if (*p == '/') break;
+        fn = strdup(p + 1);
+    } else fn = strdup(_fn);
+    len = strlen(fn);
+    fnidx = (char*)calloc(len + 5, 1);
+    strcpy(fnidx, fn); strcat(fnidx, ".bai");
+    fp = fopen(fnidx, "rb");
+    // try "{base}.bai"; compare the real suffix, because searching for the
+    // first "bam" fails on names such as "bam_sample.bam"
+    if (fp == 0 && len >= 3 && strcmp(fn + len - 3, "bam") == 0) {
+        strcpy(fnidx, fn);
+        fnidx[len-1] = 'i';
+        fp = fopen(fnidx, "rb");
+    }
+    free(fnidx); free(fn);
+    if (fp) {
+        bam_index_t *idx = bam_index_load_core(fp);
+        fclose(fp);
+        return idx;
+    } else return 0;
+}
+
+#ifdef _USE_KNETFILE
+/*
+ * Copy the remote file at `url` into the current working directory, under
+ * the name that follows the last '/' of the URL. Only ftp:// and http://
+ * URLs are handled; anything else is silently ignored. Failures to open
+ * the remote or the local file are reported on stderr.
+ */
+static void download_from_remote(const char *url)
+{
+    const int chunk_size = 1 * 1024 * 1024;
+    char *local_name;
+    FILE *out;
+    uint8_t *chunk;
+    knetFile *in;
+    int n_read;
+    int is_ftp = (strstr(url, "ftp://") == url);
+    int is_http = (strstr(url, "http://") == url);
+    if (!is_ftp && !is_http) return;
+    // walk back from the end until the character before us is a '/'
+    local_name = (char*)url + strlen(url);
+    while (local_name > url && local_name[-1] != '/') --local_name;
+    in = knet_open(url, "r");
+    if (in == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+        return;
+    }
+    out = fopen(local_name, "wb");
+    if (out == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+        knet_close(in);
+        return;
+    }
+    chunk = (uint8_t*)calloc(chunk_size, 1);
+    for (;;) {
+        n_read = knet_read(in, chunk, chunk_size);
+        if (n_read == 0) break;
+        fwrite(chunk, 1, n_read, out);
+    }
+    free(chunk);
+    fclose(out);
+    knet_close(in);
+}
+#else
+/* Built without _USE_KNETFILE: remote download is a no-op. */
+static void download_from_remote(const char *url)
+{
+    (void)url;
+}
+#endif
+
+/*
+ * Load the index of BAM file `fn`.
+ *
+ * A local index is tried first. If none is found and `fn` is an ftp:// or
+ * http:// URL, "<fn>.bai" is downloaded into the working directory and the
+ * local lookup is repeated.
+ *
+ * Returns the index, or 0 (after printing a message) when it cannot be
+ * loaded.
+ */
+bam_index_t *bam_index_load(const char *fn)
+{
+    bam_index_t *idx;
+    idx = bam_index_load_local(fn);
+    if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
+        char *fnidx = calloc(strlen(fn) + 5, 1);
+        strcat(strcpy(fnidx, fn), ".bai");
+        fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
+        download_from_remote(fnidx);
+        free(fnidx); // the URL string was previously leaked
+        idx = bam_index_load_local(fn);
+    }
+    if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
+    return idx;
+}
+
+/*
+ * Index BAM file `fn` and write the index to `_fnidx`, or to "<fn>.bai"
+ * when `_fnidx` is null.
+ *
+ * Returns 0 on success and -1 when the BAM file cannot be opened, cannot
+ * be indexed, or the index file cannot be created. A message is printed on
+ * stderr for each failure.
+ */
+int bam_index_build2(const char *fn, const char *_fnidx)
+{
+    char *fnidx;
+    FILE *fpidx;
+    bamFile fp;
+    bam_index_t *idx;
+    if ((fp = bam_open(fn, "r")) == 0) {
+        fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
+        return -1;
+    }
+    idx = bam_index_core(fp);
+    bam_close(fp);
+    if(idx == 0) {
+        fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
+        return -1;
+    }
+    if (_fnidx == 0) {
+        fnidx = (char*)calloc(strlen(fn) + 5, 1);
+        strcpy(fnidx, fn); strcat(fnidx, ".bai");
+    } else fnidx = strdup(_fnidx);
+    fpidx = fopen(fnidx, "wb");
+    if (fpidx == 0) {
+        fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
+        free(fnidx);
+        bam_index_destroy(idx); // the freshly built index was previously leaked here
+        return -1;
+    }
+    bam_index_save(idx, fpidx);
+    bam_index_destroy(idx);
+    fclose(fpidx);
+    free(fnidx);
+    return 0;
+}
+
+/*
+ * Index BAM file `fn`, letting bam_index_build2() choose the output name
+ * ("<fn>.bai"). Returns whatever bam_index_build2() returns.
+ */
+int bam_index_build(const char *fn)
+{
+    const char *default_index_name = 0;
+    return bam_index_build2(fn, default_index_name);
+}
+
+/*
+ * Command-line entry point for "samtools index <in.bam> [out.index]".
+ *
+ * argv[1] is the BAM file; the optional argv[2] is the index file name.
+ * Returns 1 when no BAM file is given or when building the index fails,
+ * and 0 on success, so that callers can detect a failed run.
+ */
+int bam_index(int argc, char *argv[])
+{
+    int ret;
+    if (argc < 2) {
+        fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
+        return 1;
+    }
+    if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
+    else ret = bam_index_build(argv[1]);
+    return (ret < 0)? 1 : 0; // the build result was previously discarded
+}
+
+/*
+ * Command-line entry point for "samtools idxstats <in.bam>".
+ *
+ * Prints one line per reference: name, length, number of mapped reads and
+ * number of unmapped reads, taken from the BAM_MAX_BIN pseudo-bin of the
+ * index ("0 0" when the pseudo-bin is missing or incomplete). A final "*"
+ * line reports the reads without coordinates.
+ *
+ * Returns 0 on success and 1 when arguments are missing or the BAM file,
+ * its header or its index cannot be read.
+ */
+int bam_idxstats(int argc, char *argv[])
+{
+    bam_index_t *idx;
+    bam_header_t *header;
+    bamFile fp;
+    int i;
+    if (argc < 2) {
+        fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
+        return 1;
+    }
+    fp = bam_open(argv[1], "r");
+    if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+    header = bam_header_read(fp);
+    bam_close(fp);
+    if (header == 0) { fprintf(stderr, "[%s] fail to read the BAM header.\n", __func__); return 1; }
+    idx = bam_index_load(argv[1]);
+    if (idx == 0) {
+        fprintf(stderr, "[%s] fail to load the index.\n", __func__);
+        bam_header_destroy(header); // previously leaked on this path
+        return 1;
+    }
+    // never read past the header tables if index and header disagree
+    for (i = 0; i < idx->n && i < header->n_targets; ++i) {
+        khint_t k;
+        khash_t(i) *h = idx->index[i];
+        printf("%s\t%d", header->target_name[i], header->target_len[i]);
+        k = kh_get(i, h, BAM_MAX_BIN);
+        // list[1] holds the counts; it exists only when the pseudo-bin has two entries
+        if (k != kh_end(h) && kh_val(h, k).n >= 2)
+            printf("\t%llu\t%llu", (unsigned long long)kh_val(h, k).list[1].u, (unsigned long long)kh_val(h, k).list[1].v);
+        else printf("\t0\t0");
+        putchar('\n');
+    }
+    printf("*\t0\t0\t%llu\n", (unsigned long long)idx->n_no_coor);
+    bam_header_destroy(header);
+    bam_index_destroy(idx);
+    return 0;
+}
+
+/*
+ * Fill `list` with the numbers of all bins that may hold alignments
+ * overlapping [beg, end) and return how many were written.
+ *
+ * `end` is clamped to 1<<29. Bin 0 is always listed; the five finer levels
+ * start at bins 1, 9, 73, 585 and 4681 and cover 2^26, 2^23, 2^20, 2^17
+ * and 2^14 bp per bin. Returns 0, writing nothing, when beg >= end.
+ */
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
+{
+    static const int first_bin[5] = { 1, 9, 73, 585, 4681 };
+    static const int bin_shift[5] = { 26, 23, 20, 17, 14 };
+    int n_listed = 0, level, k;
+    if (beg >= end) return 0;
+    if (end >= 1u<<29) end = 1u<<29;
+    --end; // work with the last covered position
+    list[n_listed++] = 0;
+    for (level = 0; level < 5; ++level) {
+        for (k = first_bin[level] + (beg>>bin_shift[level]); k <= first_bin[level] + (end>>bin_shift[level]); ++k)
+            list[n_listed++] = k;
+    }
+    return n_listed;
+}
+
+/*
+ * Return 1 when alignment `b` overlaps [beg, end) and 0 otherwise. An
+ * alignment without CIGAR operations is treated as covering the single
+ * position at its start.
+ */
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+    uint32_t rbeg = b->core.pos;
+    uint32_t rend;
+    if (b->core.n_cigar) rend = bam_calend(&b->core, bam1_cigar(b));
+    else rend = b->core.pos + 1;
+    if (rend <= beg) return 0;
+    return rbeg < end;
+}
+
+// State of one region query, created by bam_iter_query() and advanced by
+// bam_iter_read().
+struct __bam_iter_t {
+ int from_first; // read from the first record; no random access
+ // tid/beg/end: the queried reference and region; n_off: number of chunks
+ // in off[]; i: index of the chunk being read (-1 before the first one);
+ // finished: set once the iteration has ended
+ int tid, beg, end, n_off, i, finished;
+ // file offset right after the most recently read record (0 = nothing read yet)
+ uint64_t curr_off;
+ // chunks to scan, sorted by begin offset; null when there is nothing to read
+ pair64_t *off;
+};
+
+/*
+ * Create an iterator over the alignments of reference `tid` that may
+ * overlap [beg, end); this is the lookup step used by bam_fetch().
+ *
+ * The bins overlapping the region are collected with reg2bins(), their
+ * chunks are gathered, chunks ending at or before the linear-index offset
+ * of `beg` are dropped, and the rest is sorted and de-overlapped.
+ *
+ * A negative `beg` is treated as 0. Returns 0 when end < beg. Otherwise
+ * returns a newly allocated iterator, to be released with
+ * bam_iter_destroy(); it holds no chunks when nothing can overlap the
+ * region or when `tid` is not a reference of the index.
+ */
+bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
+{
+    uint16_t *bins;
+    int i, n_bins, n_off;
+    pair64_t *off;
+    khint_t k;
+    khash_t(i) *index;
+    uint64_t min_off;
+    bam_iter_t iter = 0;
+
+    if (beg < 0) beg = 0;
+    if (end < beg) return 0;
+    // initialize iter
+    iter = calloc(1, sizeof(struct __bam_iter_t));
+    iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
+    // an unknown reference has no chunks; do not index the tables with it
+    if (tid < 0 || tid >= idx->n) return iter;
+    bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
+    n_bins = reg2bins(beg, end, bins);
+    index = idx->index[tid];
+    // min_off: no alignment overlapping the region ends before this offset
+    if (idx->index2[tid].n > 0) {
+        min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+            : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+        if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
+            int n = beg>>BAM_LIDX_SHIFT;
+            if (n > idx->index2[tid].n) n = idx->index2[tid].n;
+            for (i = n - 1; i >= 0; --i)
+                if (idx->index2[tid].offset[i] != 0) break;
+            if (i >= 0) min_off = idx->index2[tid].offset[i];
+        }
+    } else min_off = 0; // tabix 0.1.2 may produce such index files
+    // first pass: upper bound on the number of chunks
+    for (i = n_off = 0; i < n_bins; ++i) {
+        if ((k = kh_get(i, index, bins[i])) != kh_end(index))
+            n_off += kh_value(index, k).n;
+    }
+    if (n_off == 0) {
+        free(bins); return iter;
+    }
+    // second pass: keep the chunks that end after min_off
+    off = (pair64_t*)calloc(n_off, 16);
+    for (i = n_off = 0; i < n_bins; ++i) {
+        if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
+            int j;
+            bam_binlist_t *p = &kh_value(index, k);
+            for (j = 0; j < p->n; ++j)
+                if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+        }
+    }
+    free(bins);
+    if (n_off == 0) {
+        free(off); return iter;
+    }
+    {
+        int l;
+        ks_introsort(off, n_off, off);
+        // resolve completely contained adjacent blocks
+        for (i = 1, l = 0; i < n_off; ++i)
+            if (off[l].v < off[i].v)
+                off[++l] = off[i];
+        n_off = l + 1;
+        // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+        for (i = 1; i < n_off; ++i)
+            if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+        { // merge adjacent blocks
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+            for (i = 1, l = 0; i < n_off; ++i) {
+#ifdef BAM_TRUE_OFFSET
+                if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
+#else
+                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+#endif
+                else off[++l] = off[i];
+            }
+            n_off = l + 1;
+#endif
+        }
+    }
+    iter->n_off = n_off; iter->off = off;
+    return iter;
+}
+
+/*
+ * Return the chunk list that bam_iter_query() computes for the region and
+ * store its length in *cnt_off. The caller owns the returned array and
+ * frees it with free(). When the query yields no iterator (end < beg) or
+ * no chunks, 0 is returned and *cnt_off is set to 0.
+ */
+pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
+{ // for pysam compatibility
+    bam_iter_t iter;
+    pair64_t *off;
+    iter = bam_iter_query(idx, tid, beg, end);
+    if (iter == 0) { // bam_iter_query() returns 0 for an inverted region
+        *cnt_off = 0;
+        return 0;
+    }
+    off = iter->off; *cnt_off = iter->n_off;
+    free(iter); // only the wrapper is freed; the chunk array is handed to the caller
+    return off;
+}
+
+/* Free an iterator and its chunk list; a null iterator is ignored. */
+void bam_iter_destroy(bam_iter_t iter)
+{
+    if (iter == 0) return;
+    free(iter->off);
+    free(iter);
+}
+
+/*
+ * Read the next alignment selected by `iter` from `fp` into `b`.
+ *
+ * With a null iterator, or one whose from_first flag is set, records are
+ * read sequentially with bam_read1() and its result is returned.
+ * Otherwise the chunks of the iterator are scanned in order and records
+ * that do not overlap the queried region are skipped.
+ *
+ * Returns the non-negative result of bam_read1() for a record that was
+ * stored in `b`; -1 when the iterator is finished, has no chunks, or the
+ * region has been passed; another negative value when bam_read1() fails or
+ * the record that ended the scan does not pass bam_validate1() (-5).
+ */
+int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
+{
+ int ret;
+ if (iter && iter->finished) return -1;
+ if (iter == 0 || iter->from_first) {
+ ret = bam_read1(fp, b);
+ if (ret < 0 && iter) iter->finished = 1;
+ return ret;
+ }
+ // a query that matched no chunk leaves off null
+ if (iter->off == 0) return -1;
+ for (;;) {
+ if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
+ if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
+ if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
+ if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
+ bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
+ iter->curr_off = bam_tell(fp);
+ }
+ ++iter->i;
+ }
+ if ((ret = bam_read1(fp, b)) >= 0) {
+ iter->curr_off = bam_tell(fp);
+ // a record on another reference, or starting at/after the region end, ends the scan
+ if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
+ ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
+ break;
+ }
+ else if (is_overlap(iter->beg, iter->end, b)) return ret;
+ } else break; // end of file or error
+ }
+ iter->finished = 1;
+ return ret;
+}
+
+/*
+ * Call func(record, data) for every alignment of reference `tid` that
+ * overlaps [beg, end), using index `idx` to locate the records in `fp`.
+ *
+ * Returns 0 when the iteration ended normally (bam_iter_read() returned
+ * -1), otherwise the negative value that stopped it.
+ */
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+    int status;
+    bam1_t *rec = bam_init1();
+    bam_iter_t it = bam_iter_query(idx, tid, beg, end);
+    for (;;) {
+        status = bam_iter_read(fp, it, rec);
+        if (status < 0) break;
+        func(rec, data);
+    }
+    bam_iter_destroy(it);
+    bam_destroy1(rec);
+    if (status == -1) return 0;
+    return status;
+}
diff --git a/bam_lpileup.c b/bam_lpileup.c
new file mode 100644
index 0000000..d4dd63b
--- /dev/null
+++ b/bam_lpileup.c
@@ -0,0 +1,198 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "bam.h"
+#include "ksort.h"
+
+// Value given to freenode_t.cnt when a node is handed back to the pool
+// (see mp_free); tview_func() decrements cnt once per pileup position and
+// reuses a freed level only after cnt has reached 0.
+#define TV_GAP 2
+
+// One entry of the list of free display levels: `level` is the level
+// number, `cnt` the countdown described above, `next` the following entry.
+typedef struct __freenode_t {
+ uint32_t level:28, cnt:4;
+ struct __freenode_t *next;
+} freenode_t, *freenode_p;
+
+// Ordering of free nodes: smaller cnt first, ties broken by smaller level.
+#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level))
+KSORT_INIT(node, freenode_p, freenode_lt)
+
+/* Memory pool, similar to the one in bam_pileup.c */
+// cnt: nodes currently handed out; n: recycled nodes stored in buf;
+// max: allocated capacity of buf.
+typedef struct {
+ int cnt, n, max;
+ freenode_t **buf;
+} mempool_t;
+
+/* Allocate an empty, zero-initialised node pool. */
+static mempool_t *mp_init()
+{
+    mempool_t *pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+    return pool;
+}
+/* Free every recycled node held by the pool, then the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+    int slot = 0;
+    while (slot < mp->n) {
+        free(mp->buf[slot]);
+        ++slot;
+    }
+    free(mp->buf);
+    free(mp);
+}
+/*
+ * Hand out a node: a recycled one when the pool holds any, otherwise a
+ * freshly zero-allocated one. The count of outstanding nodes is increased
+ * in both cases.
+ */
+static inline freenode_t *mp_alloc(mempool_t *mp)
+{
+    ++mp->cnt;
+    if (mp->n != 0) return mp->buf[--mp->n];
+    return (freenode_t*)calloc(1, sizeof(freenode_t));
+}
+/*
+ * Return node `p` to the pool for reuse. The node is unlinked (next = 0)
+ * and its countdown is reset to TV_GAP; the pool's storage starts at 256
+ * slots and doubles whenever it is full.
+ */
+static inline void mp_free(mempool_t *mp, freenode_t *p)
+{
+    --mp->cnt;
+    p->next = 0;
+    p->cnt = TV_GAP;
+    if (mp->n == mp->max) {
+        if (mp->max) mp->max <<= 1;
+        else mp->max = 256;
+        mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
+    }
+    mp->buf[mp->n] = p;
+    ++mp->n;
+}
+
+/* core part */
+// Pileup buffer that additionally assigns a display level to every read
+// before forwarding each pileup position to the user's callback.
+struct __bam_lplbuf_t {
+ // max: allocated length of cur_level/pre_level; n_cur/n_pre: entries in use
+ int max, n_cur, n_pre;
+ // max_level: highest level in use; cur_level/pre_level: levels of the
+ // reads at the current position and of the surviving reads of the previous one
+ int max_level, *cur_level, *pre_level;
+ // pool the free-level nodes are taken from and returned to
+ mempool_t *mp;
+ // head/tail: list of free levels (tail is an unused sentinel node);
+ // aux: scratch array used to sort that list
+ freenode_t **aux, *head, *tail;
+ // n_nodes: entries in the free list (sentinel excluded); m_aux: allocated length of aux
+ int n_nodes, m_aux;
+ // user callback and its opaque argument
+ bam_pileup_f func;
+ void *user_data;
+ // underlying pileup buffer, which calls tview_func()
+ bam_plbuf_t *plbuf;
+};
+
+/*
+ * Reset the buffer to its initial state: the underlying pileup buffer is
+ * reset, every node of the free-level list except the sentinel is returned
+ * to the pool, and all level bookkeeping is cleared.
+ */
+void bam_lplbuf_reset(bam_lplbuf_t *buf)
+{
+    freenode_t *node, *after;
+    bam_plbuf_reset(buf->plbuf);
+    node = buf->head;
+    while (node->next) {
+        after = node->next; // mp_free() clears node->next, so save it first
+        mp_free(buf->mp, node);
+        node = after;
+    }
+    buf->head = buf->tail;
+    buf->n_nodes = 0;
+    buf->n_cur = buf->n_pre = 0;
+    buf->max_level = 0;
+}
+
+/*
+ * Pileup callback registered by bam_lplbuf_init(); `data` is the
+ * bam_lplbuf_t that owns the underlying pileup buffer.
+ *
+ * For the `n` pileup entries in `pl` at (tid, pos) it assigns a level to
+ * every entry, stores it in pl[i].level, and then calls the user callback
+ * with the same arguments. A read that starts here takes the level at the
+ * head of the free list when that node's countdown has reached 0, and a
+ * brand-new level otherwise; every other read keeps the level it had at
+ * the previous position. Levels of reads that end here are appended to the
+ * free list. Afterwards the free list is pruned and re-sorted and the
+ * levels of the surviving reads are saved for the next call.
+ *
+ * Always returns 0.
+ */
+static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+ bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
+ freenode_t *p;
+ int i, l, max_level;
+ // allocate memory if necessary
+ if (tv->max < n) { // enlarge
+ tv->max = n;
+ kroundup32(tv->max);
+ tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
+ tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
+ }
+ tv->n_cur = n;
+ // update cnt
+ for (p = tv->head; p->next; p = p->next)
+ if (p->cnt > 0) --p->cnt;
+ // calculate cur_level[]
+ max_level = 0;
+ // l walks pre_level[]: one entry is consumed per read that did not start here
+ for (i = l = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->is_head) {
+ if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
+ freenode_t *p = tv->head->next;
+ tv->cur_level[i] = tv->head->level;
+ mp_free(tv->mp, tv->head);
+ tv->head = p;
+ --tv->n_nodes;
+ } else tv->cur_level[i] = ++tv->max_level;
+ } else {
+ tv->cur_level[i] = tv->pre_level[l++];
+ if (p->is_tail) { // then return a free slot
+ // the old sentinel receives the level and a new sentinel is appended
+ tv->tail->level = tv->cur_level[i];
+ tv->tail->next = mp_alloc(tv->mp);
+ tv->tail = tv->tail->next;
+ ++tv->n_nodes;
+ }
+ }
+ if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
+ // pl is const for the caller; the level field is written through a cast
+ ((bam_pileup1_t*)p)->level = tv->cur_level[i];
+ }
+ assert(l == tv->n_pre);
+ tv->func(tid, pos, n, pl, tv->user_data);
+ // sort the linked list
+ if (tv->n_nodes) {
+ freenode_t *q;
+ if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
+ tv->m_aux = tv->n_nodes + 1;
+ kroundup32(tv->m_aux);
+ tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
+ }
+ for (p = tv->head, i = l = 0; p->next;) {
+ if (p->level > max_level) { // then discard this entry
+ q = p->next;
+ mp_free(tv->mp, p);
+ p = q;
+ } else {
+ tv->aux[i++] = p;
+ p = p->next;
+ }
+ }
+ tv->aux[i] = tv->tail; // add a proper tail for the loop below
+ tv->n_nodes = i;
+ if (tv->n_nodes) {
+ ks_introsort(node, tv->n_nodes, tv->aux);
+ // relink the nodes in sorted order, ending at the sentinel
+ for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1];
+ tv->head = tv->aux[0];
+ } else tv->head = tv->tail;
+ }
+ // clean up
+ tv->max_level = max_level;
+ memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4);
+ // squeeze out terminated levels
+ for (i = l = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (!p->is_tail)
+ tv->pre_level[l++] = tv->pre_level[i];
+ }
+ tv->n_pre = l;
+/*
+ fprintf(stderr, "%d\t", pos+1);
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (p->is_head) fprintf(stderr, "^");
+ if (p->is_tail) fprintf(stderr, "$");
+ fprintf(stderr, "%d,", p->level);
+ }
+ fprintf(stderr, "\n");
+*/
+ return 0;
+}
+
+/*
+ * Create a level-assigning pileup buffer. `func` is called with `data` for
+ * every pileup position after the levels have been filled in. The free
+ * list starts with a single sentinel node, and the underlying pileup
+ * buffer is set up to call tview_func() with the new object.
+ */
+bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
+{
+    bam_lplbuf_t *lp = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
+    lp->mp = mp_init();
+    lp->tail = mp_alloc(lp->mp);
+    lp->head = lp->tail;
+    lp->func = func;
+    lp->user_data = data;
+    lp->plbuf = bam_plbuf_init(tview_func, lp);
+    return lp;
+}
+
+/*
+ * Destroy a buffer created by bam_lplbuf_init(): the level arrays, the
+ * underlying pileup buffer, the sort scratch array, every node of the free
+ * list including the sentinel, the node pool and the object itself. The
+ * pool must have no outstanding nodes at that point (asserted).
+ */
+void bam_lplbuf_destroy(bam_lplbuf_t *tv)
+{
+    freenode_t *node, *after;
+    free(tv->cur_level);
+    free(tv->pre_level);
+    bam_plbuf_destroy(tv->plbuf);
+    free(tv->aux);
+    node = tv->head;
+    while (node->next) {
+        after = node->next; // saved because mp_free() clears node->next
+        mp_free(tv->mp, node);
+        node = after;
+    }
+    mp_free(tv->mp, node); // the sentinel
+    assert(tv->mp->cnt == 0);
+    mp_destroy(tv->mp);
+    free(tv);
+}
+
+/*
+ * Feed alignment `b` to the underlying pileup buffer and return the
+ * result of bam_plbuf_push().
+ */
+int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
+{
+    int status = bam_plbuf_push(b, tv->plbuf);
+    return status;
+}
diff --git a/bam_md.c b/bam_md.c
new file mode 100644
index 0000000..ce40a12
--- /dev/null
+++ b/bam_md.c
@@ -0,0 +1,389 @@
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+#include "faidx.h"
+#include "sam.h"
+#include "kstring.h"
+#include "kaln.h"
+#include "kprobaln.h"
+
+#define USE_EQUAL 1 // clear the base code of read bases that match the reference (option -e: "change identical bases to '='")
+#define DROP_TAG 2 // drop every aux tag except RG (option -d)
+#define BIN_QUAL 4 // coarsen base qualities to multiples of 10 plus 7 (option -q)
+#define UPDATE_NM 8 // (re)compute the NM tag
+#define UPDATE_MD 16 // (re)compute the MD tag
+#define HASH_QNM 32 // set by option -h; not consulted anywhere else in this file
+
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; // 4-bit base code -> 0..3 for codes 1, 2, 4, 8; every other code maps to 4
+
+int bam_aux_drop_other(bam1_t *b, uint8_t *s); // defined in another file; called below with the RG tag pointer
+
+/*
+ * Recompute alignment-derived annotations of one record against its reference.
+ *
+ *   b       record, modified in place
+ *   ref     NUL-terminated reference sequence; scanning stops at the NUL
+ *   flag    bitwise OR of USE_EQUAL, DROP_TAG, BIN_QUAL, UPDATE_NM, UPDATE_MD
+ *   max_nm  when > 0 and the computed edit distance reaches it, every base
+ *           that matches the reference gets base code 15 and quality 0
+ *
+ * If UPDATE_NM or UPDATE_MD is requested for an unmapped read the function
+ * stops at that point; DROP_TAG and BIN_QUAL are then not applied.
+ *
+ * Changes from the previous revision:
+ *   - the MD string lives on the stack and is released on every exit path
+ *     (the early exits for unmapped reads used to leak it);
+ *   - reference bytes are converted to unsigned char before indexing
+ *     bam_nt16_table, so a byte >= 0x80 cannot yield a negative index.
+ */
+void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
+{
+    uint8_t *seq = bam1_seq(b);
+    uint32_t *cigar = bam1_cigar(b);
+    bam1_core_t *c = &b->core;
+    int i, x, y, u = 0; // x: reference offset, y: query offset, u: current run of matches
+    kstring_t str = { 0, 0, 0 }; // the MD string being built
+    int32_t old_nm_i = -1, nm = 0;
+
+    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            for (j = 0; j < l; ++j) {
+                int z = y + j;
+                int c1, c2;
+                if (ref[x+j] == 0) break; // out of boundary
+                c1 = bam1_seqi(seq, z);
+                c2 = bam_nt16_table[(unsigned char)ref[x+j]];
+                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+                    ++u;
+                } else {
+                    kputw(u, &str); kputc(ref[x+j], &str);
+                    u = 0; ++nm;
+                }
+            }
+            if (j < l) break;
+            x += l; y += l;
+        } else if (op == BAM_CDEL) {
+            kputw(u, &str); kputc('^', &str);
+            for (j = 0; j < l; ++j) {
+                if (ref[x+j] == 0) break;
+                kputc(ref[x+j], &str);
+            }
+            u = 0;
+            if (j < l) break;
+            x += l; nm += l;
+        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+            y += l;
+            if (op == BAM_CINS) nm += l;
+        } else if (op == BAM_CREF_SKIP) {
+            x += l;
+        }
+    }
+    kputw(u, &str);
+    // apply max_nm
+    if (max_nm > 0 && nm >= max_nm) {
+        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                for (j = 0; j < l; ++j) {
+                    int z = y + j;
+                    int c1, c2;
+                    if (ref[x+j] == 0) break; // out of boundary
+                    c1 = bam1_seqi(seq, z);
+                    c2 = bam_nt16_table[(unsigned char)ref[x+j]];
+                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
+                        bam1_qual(b)[z] = 0;
+                    }
+                }
+                if (j < l) break;
+                x += l; y += l;
+            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
+            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+        }
+    }
+    // update NM
+    if (flag & UPDATE_NM) {
+        uint8_t *old_nm = bam_aux_get(b, "NM");
+        if (c->flag & BAM_FUNMAP) goto cleanup;
+        if (old_nm) old_nm_i = bam_aux2i(old_nm);
+        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+        else if (nm != old_nm_i) {
+            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+            bam_aux_del(b, old_nm);
+            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+        }
+    }
+    // update MD
+    if (flag & UPDATE_MD) {
+        uint8_t *old_md = bam_aux_get(b, "MD");
+        if (c->flag & BAM_FUNMAP) goto cleanup;
+        if (!old_md) bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
+        else {
+            int is_diff = 0;
+            // compare case-insensitively; old_md[0] is the type byte
+            if (strlen((char*)old_md+1) == str.l) {
+                for (i = 0; i < str.l; ++i)
+                    if (toupper(old_md[i+1]) != toupper(str.s[i]))
+                        break;
+                if (i < str.l) is_diff = 1;
+            } else is_diff = 1;
+            if (is_diff) {
+                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str.s);
+                bam_aux_del(b, old_md);
+                bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
+            }
+        }
+    }
+    // drop all tags but RG
+    if (flag&DROP_TAG) {
+        uint8_t *q = bam_aux_get(b, "RG");
+        bam_aux_drop_other(b, q);
+    }
+    // reduce the resolution of base quality
+    if (flag&BIN_QUAL) {
+        uint8_t *qual = bam1_qual(b);
+        for (i = 0; i < b->core.l_qseq; ++i)
+            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
+    }
+cleanup:
+    free(str.s);
+}
+
+/* Convenience wrapper: bam_fillmd1_core() with the max_nm argument set to 0. */
+void bam_fillmd1(bam1_t *b, char *ref, int flag)
+{
+    const int no_nm_cap = 0;
+    bam_fillmd1_core(b, ref, flag, no_nm_cap);
+}
+
+/*
+ * Derive a cap for the mapping quality of b from its high-quality mismatches
+ * and clipped bases.
+ *
+ *   b      aligned record (read only)
+ *   ref    NUL-terminated reference sequence; scanning stops at the NUL
+ *   thres  scaling threshold; a negative value selects the default of 40
+ *
+ * Returns -1 when the computed penalty exceeds thres, otherwise the cap
+ * rounded to an integer in [0, thres].
+ * NOTE(review): thres == 0 leads to a 0/0 division below; the only caller in
+ * this file passes values > 10.
+ *
+ * Changes from the previous revision: reference bytes are converted to
+ * unsigned char before indexing bam_nt16_table (no negative index for bytes
+ * >= 0x80), and the repeated `qual[z] >= 13` test was dropped.
+ */
+int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
+{
+    uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
+    uint32_t *cigar = bam1_cigar(b);
+    bam1_core_t *c = &b->core;
+    int i, x, y, mm, q, len, clip_l, clip_q;
+    double t;
+    if (thres < 0) thres = 40; // set the default
+    mm = q = len = clip_l = clip_q = 0;
+    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            for (j = 0; j < l; ++j) {
+                int z = y + j;
+                int c1, c2;
+                if (ref[x+j] == 0) break; // out of boundary
+                c1 = bam1_seqi(seq, z);
+                c2 = bam_nt16_table[(unsigned char)ref[x+j]];
+                if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
+                    ++len;
+                    if (c1 && c1 != c2) { // mismatch
+                        ++mm;
+                        q += qual[z] > 33? 33 : qual[z]; // each mismatch contributes at most 33
+                    }
+                }
+            }
+            if (j < l) break;
+            x += l; y += l; len += l;
+        } else if (op == BAM_CDEL) {
+            for (j = 0; j < l; ++j)
+                if (ref[x+j] == 0) break;
+            if (j < l) break;
+            x += l;
+        } else if (op == BAM_CSOFT_CLIP) {
+            for (j = 0; j < l; ++j) clip_q += qual[y+j];
+            clip_l += l;
+            y += l;
+        } else if (op == BAM_CHARD_CLIP) {
+            clip_q += 13 * l; // hard-clipped bases have no stored quality; 13 is charged per base
+            clip_l += l;
+        } else if (op == BAM_CINS) y += l;
+        else if (op == BAM_CREF_SKIP) x += l;
+    }
+    // t = len^mm / mm!
+    for (i = 0, t = 1; i < mm; ++i)
+        t *= (double)len / (i+1);
+    t = q - 4.343 * log(t) + clip_q / 5.;
+    if (t > thres) return -1;
+    if (t < 0) t = 0;
+    t = sqrt((thres - t) / thres) * thres;
+//  fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
+    return (int)(t + .499);
+}
+
+int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
+{ /* Compute base alignment qualities for b via kpa_glocal() and store them
+   * as a BQ tag (offsets, base 64) or, when BAQ is applied to the quality
+   * string, as a ZQ tag.
+   *   flag bit 0 (1): apply BAQ to qual[] and write ZQ instead of BQ
+   *   flag bit 1 (2): extended BAQ
+   *   flag bit 2 (4): discard an existing BQ tag and recompute
+   * Returns -1 for unmapped or empty reads and for CIGARs with a reference
+   * skip, -3 when the existing tag already matches the requested mode, and 0
+   * otherwise. ref is read up to its terminating NUL. */
+ int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
+ uint32_t *cigar = bam1_cigar(b);
+ bam1_core_t *c = &b->core;
+ kpa_par_t conf = kpa_par_def;
+ uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
+ if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
+ // test if BQ or ZQ is present
+ if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
+ if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; // NOTE(review): zq stays on the type byte when the type is not 'Z', yet later code uses zq-1 / zq-3 - confirm
+ if (bq && redo_baq)
+ {
+ bam_aux_del(b, bq-1); // NOTE(review): zq was fetched before this deletion - verify it is still valid afterwards
+ bq = 0;
+ }
+ if (bq && zq) { // remove the ZQ tag
+ bam_aux_del(b, zq-1);
+ zq = 0;
+ }
+ if (bq || zq) {
+ if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
+ if (bq && apply_baq) { // then convert BQ to ZQ
+ for (i = 0; i < c->l_qseq; ++i)
+ qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
+ *(bq - 3) = 'Z'; // bq-3 is the first byte of the two-letter tag name: "BQ" becomes "ZQ"
+ } else if (zq && !apply_baq) { // then convert ZQ to BQ
+ for (i = 0; i < c->l_qseq; ++i)
+ qual[i] += (int)zq[i] - 64;
+ *(zq - 3) = 'B'; // "ZQ" becomes "BQ"
+ }
+ return 0;
+ }
+ // find the start and end of the alignment
+ x = c->pos, y = 0, yb = ye = xb = xe = -1;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op, l;
+ op = cigar[k]&0xf; l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ if (yb < 0) yb = y;
+ if (xb < 0) xb = x;
+ ye = y + l; xe = x + l;
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
+ }
+ // set bandwidth and the start and the end
+ bw = 7;
+ if (abs((xe - xb) - (ye - yb)) > bw)
+ bw = abs((xe - xb) - (ye - yb)) + 3;
+ conf.bw = bw;
+ xb -= yb + bw/2; if (xb < 0) xb = 0;
+ xe += c->l_qseq - ye + bw/2;
+ if (xe - xb - c->l_qseq > bw)
+ xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
+ { // glocal
+ uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq; // this bq shadows the outer tag pointer
+ int *state;
+ bq = calloc(c->l_qseq + 1, 1); // one extra zero byte: the array is appended as a 'Z' string below
+ memcpy(bq, qual, c->l_qseq);
+ s = calloc(c->l_qseq, 1);
+ for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
+ r = calloc(xe - xb, 1);
+ for (i = xb; i < xe; ++i) {
+ if (ref[i] == 0) { xe = i; break; }
+ r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
+ }
+ state = calloc(c->l_qseq, sizeof(int));
+ q = calloc(c->l_qseq, 1);
+ kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
+ if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
+ for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k]&0xf, l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ for (i = y; i < y + l; ++i) {
+ if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
+ else bq[i] = bq[i] < q[i]? bq[i] : q[i];
+ }
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ }
+ for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
+ } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
+ uint8_t *left, *rght;
+ left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
+ for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k]&0xf, l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ for (i = y; i < y + l; ++i)
+ bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
+ for (left[y] = bq[y], i = y + 1; i < y + l; ++i) // running maximum from the left end of the block
+ left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
+ for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) // running maximum from the right end
+ rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
+ for (i = y; i < y + l; ++i)
+ bq[i] = left[i] < rght[i]? left[i] : rght[i];
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ }
+ for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
+ free(left); free(rght);
+ }
+ if (apply_baq) {
+ for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
+ bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
+ } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
+ free(bq); free(s); free(r); free(q); free(state);
+ }
+ return 0;
+}
+
+/* Convenience wrapper: bam_prob_realn_core() with flag 1 (apply BAQ to the quality string). */
+int bam_prob_realn(bam1_t *b, const char *ref)
+{
+    const int apply_baq_only = 1;
+    return bam_prob_realn_core(b, ref, apply_baq_only);
+}
+
+/*
+ * Entry point of the `fillmd` sub-command: stream <aln.bam>, recompute
+ * NM/MD (and optionally BAQ / capped mapQ) against <ref.fasta>, and write the
+ * records to standard output.
+ *
+ * Returns 0 on success and 1 on a usage error or when the input, the output
+ * or the reference index cannot be opened.
+ *
+ * Changes from the previous revision:
+ *   - the results of fai_load() and of the output samopen() are checked;
+ *   - the input is closed on the error paths that follow its opening;
+ *   - when a reference sequence cannot be fetched, the records on it are
+ *     written through unmodified instead of handing a null reference to
+ *     bam_prob_realn_core() / bam_cap_mapQ();
+ *   - the "unrecognized option" message prints optopt (the offending
+ *     character) rather than getopt()'s '?' return value.
+ */
+int bam_fillmd(int argc, char *argv[])
+{
+    int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+    samfile_t *fp, *fpout = 0;
+    faidx_t *fai;
+    char *ref = 0, mode_w[8], mode_r[8];
+    bam1_t *b;
+
+    flt_flag = UPDATE_NM | UPDATE_MD;
+    is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
+    strcpy(mode_r, "r"); strcpy(mode_w, "w");
+    while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
+        switch (c) {
+        case 'r': is_realn = 1; break;
+        case 'e': flt_flag |= USE_EQUAL; break;
+        case 'd': flt_flag |= DROP_TAG; break;
+        case 'q': flt_flag |= BIN_QUAL; break;
+        case 'h': flt_flag |= HASH_QNM; break;
+        case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
+        case 'b': is_bam_out = 1; break;
+        case 'u': is_uncompressed = is_bam_out = 1; break;
+        case 'S': is_sam_in = 1; break;
+        case 'n': max_nm = atoi(optarg); break;
+        case 'C': capQ = atoi(optarg); break;
+        case 'A': baq_flag |= 1; break;
+        case 'E': baq_flag |= 2; break;
+        default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", optopt); return 1;
+        }
+    }
+    // build the open modes: at most "rb" and "wbu", well within the 8-byte buffers
+    if (!is_sam_in) strcat(mode_r, "b");
+    if (is_bam_out) strcat(mode_w, "b");
+    else strcat(mode_w, "h");
+    if (is_uncompressed) strcat(mode_w, "u");
+    if (optind + 1 >= argc) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage:   samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
+        fprintf(stderr, "Options: -e       change identical bases to '='\n");
+        fprintf(stderr, "         -u       uncompressed BAM output (for piping)\n");
+        fprintf(stderr, "         -b       compressed BAM output\n");
+        fprintf(stderr, "         -S       the input is SAM with header\n");
+        fprintf(stderr, "         -A       modify the quality string\n");
+        fprintf(stderr, "         -r       compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
+        fprintf(stderr, "         -E       extended BAQ for better sensitivity but lower specificity\n\n");
+        return 1;
+    }
+    fp = samopen(argv[optind], mode_r, 0);
+    if (fp == 0) return 1;
+    if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
+        fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
+        samclose(fp);
+        return 1;
+    }
+    fpout = samopen("-", mode_w, fp->header);
+    if (fpout == 0) {
+        fprintf(stderr, "[bam_fillmd] fail to open the standard output for writing.\n");
+        samclose(fp);
+        return 1;
+    }
+    fai = fai_load(argv[optind+1]);
+    if (fai == 0) {
+        fprintf(stderr, "[bam_fillmd] fail to load the index of reference '%s'.\n", argv[optind+1]);
+        samclose(fp); samclose(fpout);
+        return 1;
+    }
+
+    b = bam_init1();
+    while ((ret = samread(fp, b)) >= 0) {
+        if (b->core.tid >= 0) {
+            if (tid != b->core.tid) { // moved to another reference sequence: swap the cached one
+                free(ref);
+                ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
+                tid = b->core.tid;
+                if (ref == 0)
+                    fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
+                            fp->header->target_name[tid]);
+            }
+            if (ref) { // without the reference none of the steps below can run
+                if (is_realn) bam_prob_realn_core(b, ref, baq_flag);
+                if (capQ > 10) {
+                    int q = bam_cap_mapQ(b, ref, capQ);
+                    if (b->core.qual > q) b->core.qual = q;
+                }
+                bam_fillmd1_core(b, ref, flt_flag, max_nm);
+            }
+        }
+        samwrite(fpout, b);
+    }
+    bam_destroy1(b);
+
+    free(ref);
+    fai_destroy(fai);
+    samclose(fp); samclose(fpout);
+    return 0;
+}
diff --git a/bam_pileup.c b/bam_pileup.c
new file mode 100644
index 0000000..57434e0
--- /dev/null
+++ b/bam_pileup.c
@@ -0,0 +1,437 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include "sam.h"
+
+typedef struct { // per-read CIGAR cursor maintained by resolve_cigar2()
+ int k, x, y, end; // k: index of the current CIGAR op (-1 = never processed); x, y: reference / query coordinate at the start of op k; end: set to (alignment end - 1) by bam_plp_push(), compared with pos to flag the tail
+} cstate_t;
+
+static cstate_t g_cstate_null = { -1, 0, 0, 0 }; // initial cursor value: k == -1 marks a read not yet visited
+
+typedef struct __linkbuf_t { // one buffered alignment in the iterator's singly linked list
+ bam1_t b; // the record itself (filled with bam_copy1() in bam_plp_push())
+ uint32_t beg, end; // beg = core.pos, end = bam_calend() of the record
+ cstate_t s; // CIGAR cursor of this read
+ struct __linkbuf_t *next; // next node; 0 on the trailing placeholder node
+} lbnode_t;
+
+/* --- BEGIN: Memory pool */
+
+typedef struct { // recycling pool for lbnode_t
+ int cnt, n, max; // cnt: nodes currently handed out; n: nodes parked in buf; max: capacity of buf
+ lbnode_t **buf; // stack of returned nodes awaiting reuse
+} mempool_t;
+
+/* Create an empty pool: all counters zero and no parked nodes. */
+static mempool_t *mp_init()
+{
+    return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+/* Free every node parked in the pool (with its record buffer), then the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+    lbnode_t **slot = mp->buf, **stop = mp->buf + mp->n;
+    for (; slot != stop; ++slot) {
+        lbnode_t *node = *slot;
+        free(node->b.data);
+        free(node);
+    }
+    free(mp->buf);
+    free(mp);
+}
+/* Hand out a node: the most recently parked one if any, otherwise a fresh zeroed node. */
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+    ++mp->cnt;
+    if (mp->n != 0) {
+        --mp->n;
+        return mp->buf[mp->n];
+    }
+    return (lbnode_t*)calloc(1, sizeof(lbnode_t));
+}
+/*
+ * Return node p to the pool for later reuse.
+ * The node's `next` link is cleared; its record buffer is kept.
+ * If the array of parked nodes cannot be grown, the node is released
+ * outright instead of being parked (previously the realloc() result was
+ * stored unchecked, which lost the array and dereferenced a null pointer).
+ */
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+    --mp->cnt; p->next = 0; // clear lbnode_t::next here
+    if (mp->n == mp->max) {
+        int new_max = mp->max? mp->max<<1 : 256;
+        lbnode_t **new_buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * new_max);
+        if (new_buf == 0) { // cannot park the node: free it so nothing leaks
+            free(p->b.data);
+            free(p);
+            return;
+        }
+        mp->buf = new_buf;
+        mp->max = new_max;
+    }
+    mp->buf[mp->n++] = p;
+}
+
+/* --- END: Memory pool */
+
+/* --- BEGIN: Auxiliary functions */
+
+/* s->k: the index of the CIGAR operator that has just been processed.
+ s->x: the reference coordinate of the start of s->k
+ s->y: the query coordinate of the start of s->k
+ */
+static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
+{ /* Fill the pileup entry p (qpos, indel, is_del, is_refskip, is_head,
+   * is_tail) for reference position pos, advancing the CIGAR cursor s of
+   * the read p->b as needed. Always returns 1. */
+#define _cop(c) ((c)&BAM_CIGAR_MASK) // operation code of a packed CIGAR element
+#define _cln(c) ((c)>>BAM_CIGAR_SHIFT) // length of a packed CIGAR element
+
+ bam1_t *b = p->b;
+ bam1_core_t *c = &b->core;
+ uint32_t *cigar = bam1_cigar(b);
+ int k, is_head = 0; // NOTE(review): is_head is assigned below but never read; p->is_head is computed from pos at the end
+ // determine the current CIGAR operation
+// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y);
+ if (s->k == -1) { // never processed
+ is_head = 1;
+ if (c->n_cigar == 1) { // just one operation, save a loop
+ if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; // NOTE(review): for any other single op s->k stays -1 and cigar[s->k] is read below - confirm callers exclude this
+ } else { // find the first match or deletion
+ for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
+ int op = _cop(cigar[k]);
+ int l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+ else if (op == BAM_CREF_SKIP) s->x += l;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+ }
+ assert(k < c->n_cigar);
+ s->k = k;
+ }
+ } else { // the read has been processed before
+ int op, l = _cln(cigar[s->k]);
+ if (pos - s->x >= l) { // jump to the next operation
+ assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
+ op = _cop(cigar[s->k+1]); // NOTE(review): reads element k+1 while the assert above only bounds k
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
+ s->x += l;
+ ++s->k;
+ } else { // find the next M/D/N/=/X
+ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
+ s->x += l;
+ for (k = s->k + 1; k < c->n_cigar; ++k) {
+ op = _cop(cigar[k]), l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+ }
+ s->k = k;
+ }
+ assert(s->k < c->n_cigar); // otherwise a bug
+ } // else, do nothing
+ }
+ { // collect pileup information
+ int op, l;
+ op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
+ p->is_del = p->indel = p->is_refskip = 0;
+ if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
+ int op2 = _cop(cigar[s->k+1]);
+ int l2 = _cln(cigar[s->k+1]);
+ if (op2 == BAM_CDEL) p->indel = -(int)l2;
+ else if (op2 == BAM_CINS) p->indel = l2;
+ else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
+ int l3 = 0; // total insertion length found after the padding
+ for (k = s->k + 2; k < c->n_cigar; ++k) {
+ op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
+ if (op2 == BAM_CINS) l3 += l2;
+ else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
+ }
+ if (l3 > 0) p->indel = l3;
+ }
+ }
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ p->qpos = s->y + (pos - s->x);
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+ p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
+ p->is_refskip = (op == BAM_CREF_SKIP);
+ } // cannot be other operations; otherwise a bug
+ p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
+ }
+ return 1;
+}
+
+/* --- END: Auxiliary functions */
+
+/*******************
+ * pileup iterator *
+ *******************/
+
+struct __bam_plp_t { // state of one pileup iterator
+ mempool_t *mp; // node pool backing the list below
+ lbnode_t *head, *tail, *dummy; // head..tail: buffered reads, tail being an unused placeholder; dummy: scratch pre-head node used while unlinking in bam_plp_next()
+ int32_t tid, pos, max_tid, max_pos; // tid/pos: position to emit next; max_tid/max_pos: start of the last read pushed
+ int is_eof, flag_mask, max_plp, error, maxcnt; // is_eof: no more input; flag_mask: reads with any of these flag bits are skipped; max_plp: capacity of plp; error: sticky failure flag; maxcnt: see bam_plp_push()
+ bam_pileup1_t *plp; // output array, grown on demand and reused between calls
+ // for the "auto" interface only
+ bam1_t *b; // record buffer handed to func
+ bam_plp_auto_f func; // callback that reads the next alignment into b
+ void *data; // opaque argument passed to func
+};
+
+/*
+ * Create a pileup iterator. When `func` is non-null it is stored together
+ * with `data` for use by bam_plp_auto(), and a record buffer is allocated
+ * for it.
+ */
+bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
+{
+    bam_plp_t plp = calloc(1, sizeof(struct __bam_plp_t));
+    plp->mp = mp_init();
+    /* the list starts as one placeholder node that is both head and tail */
+    plp->tail = mp_alloc(plp->mp);
+    plp->head = plp->tail;
+    plp->dummy = mp_alloc(plp->mp);
+    plp->max_tid = -1;
+    plp->max_pos = -1;
+    plp->flag_mask = BAM_DEF_MASK;
+    plp->maxcnt = 8000;
+    if (func == 0) return plp;
+    plp->func = func;
+    plp->data = data;
+    plp->b = bam_init1();
+    return plp;
+}
+
+/*
+ * Destroy a pileup iterator and everything it owns.
+ * Every node still linked from head (not just the first one) is returned to
+ * the pool before the pool is destroyed, so destroying an iterator that
+ * still buffers reads no longer leaks those nodes and their record data.
+ */
+void bam_plp_destroy(bam_plp_t iter)
+{
+    lbnode_t *p, *q;
+    mp_free(iter->mp, iter->dummy);
+    for (p = iter->head; p; p = q) {
+        q = p->next; // mp_free() clears p->next, so fetch the link first
+        mp_free(iter->mp, p);
+    }
+    if (iter->mp->cnt != 0)
+        fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
+    mp_destroy(iter->mp);
+    if (iter->b) bam_destroy1(iter->b);
+    free(iter->plp);
+    free(iter);
+}
+
+const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{ /* Produce the pileup at the iterator's current position if the buffered
+   * reads are known to cover it completely (input already past it, or EOF).
+   * On success returns iter->plp with *_tid, *_pos and *_n_plp filled in;
+   * the array is reused by the next call. Returns 0 with *_n_plp == 0 when
+   * nothing can be emitted yet, and 0 with *_n_plp == -1 on error. */
+ if (iter->error) { *_n_plp = -1; return 0; }
+ *_n_plp = 0;
+ if (iter->is_eof && iter->head->next == 0) return 0;
+ while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
+ int n_plp = 0;
+ lbnode_t *p, *q;
+ // write iter->plp at iter->pos
+ iter->dummy->next = iter->head;
+ for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
+ if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
+ q->next = p->next; mp_free(iter->mp, p); p = q;
+ } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
+ if (n_plp == iter->max_plp) { // then double the capacity
+ iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
+ iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
+ }
+ iter->plp[n_plp].b = &p->b;
+ if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
+ }
+ }
+ iter->head = iter->dummy->next; // dummy->next may be changed
+ *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
+ // update iter->tid and iter->pos
+ if (iter->head->next) {
+ if (iter->tid > iter->head->b.core.tid) {
+ fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
+ iter->error = 1;
+ *_n_plp = -1;
+ return 0;
+ }
+ }
+ if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
+ iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
+ } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
+ iter->pos = iter->head->beg; // jump to the next position
+ } else ++iter->pos; // scan contiguously
+ // return
+ if (n_plp) return iter->plp;
+ if (iter->is_eof && iter->head->next == 0) break;
+ }
+ return 0;
+}
+
+/*
+ * Feed one alignment to the iterator; b == 0 signals the end of the input.
+ * Records with a negative tid, with a flag bit in flag_mask, or starting at
+ * the current position once more than maxcnt nodes are handed out are
+ * silently skipped. Returns 0 on success and -1 when the iterator is in the
+ * error state or the input turns out to be unsorted.
+ */
+int bam_plp_push(bam_plp_t iter, const bam1_t *b)
+{
+    lbnode_t *node;
+    if (iter->error) return -1;
+    if (b == 0) {
+        iter->is_eof = 1;
+        return 0;
+    }
+    if (b->core.tid < 0) return 0;
+    if (b->core.flag & iter->flag_mask) return 0;
+    if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0;
+    /* the tail node is a spare placeholder: fill it in first, link it last */
+    node = iter->tail;
+    bam_copy1(&node->b, b);
+    node->beg = b->core.pos;
+    node->end = bam_calend(&b->core, bam1_cigar(b));
+    node->s = g_cstate_null;
+    node->s.end = node->end - 1; // initialize cstate_t
+    if (b->core.tid < iter->max_tid) {
+        fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+        iter->error = 1;
+        return -1;
+    }
+    if ((b->core.tid == iter->max_tid) && (node->beg < iter->max_pos)) {
+        fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+        iter->error = 1;
+        return -1;
+    }
+    iter->max_tid = b->core.tid;
+    iter->max_pos = node->beg;
+    /* keep the read only if it can still contribute to a position not yet emitted */
+    if (node->end > iter->pos || node->b.core.tid > iter->tid) {
+        node->next = mp_alloc(iter->mp);
+        iter->tail = node->next;
+    }
+    return 0;
+}
+
+/*
+ * "Auto" interface: return the next pileup column, pulling alignments
+ * through the callback given to bam_plp_init() whenever the buffered reads
+ * are not sufficient. Returns 0 when the input is exhausted (*_n_plp == 0)
+ * or on error / missing callback (*_n_plp == -1).
+ */
+const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+    const bam_pileup1_t *plp;
+    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
+    plp = bam_plp_next(iter, _tid, _pos, _n_plp);
+    if (plp != 0) return plp;
+    // no pileup line can be obtained; read alignments
+    *_n_plp = 0;
+    if (iter->is_eof) return 0;
+    for (;;) {
+        if (iter->func(iter->data, iter->b) < 0) break; // input exhausted
+        if (bam_plp_push(iter, iter->b) < 0) {
+            *_n_plp = -1;
+            return 0;
+        }
+        plp = bam_plp_next(iter, _tid, _pos, _n_plp);
+        if (plp != 0) return plp;
+        // otherwise no pileup line can be returned; read the next alignment.
+    }
+    bam_plp_push(iter, 0); // mark EOF, then drain what is left
+    return bam_plp_next(iter, _tid, _pos, _n_plp);
+}
+
+/*
+ * Rewind the iterator: forget the position bookkeeping and the EOF mark and
+ * return every buffered read to the pool, leaving only the placeholder tail.
+ */
+void bam_plp_reset(bam_plp_t iter)
+{
+    lbnode_t *cur, *nxt;
+    iter->max_tid = iter->max_pos = -1;
+    iter->tid = iter->pos = 0;
+    iter->is_eof = 0;
+    cur = iter->head;
+    while (cur->next) {
+        nxt = cur->next; /* read the link before the node is recycled */
+        mp_free(iter->mp, cur);
+        cur = nxt;
+    }
+    iter->head = iter->tail;
+}
+
+/* Set the flag bits that exclude a read: a negative mask selects BAM_DEF_MASK, otherwise BAM_FUNMAP is always added. */
+void bam_plp_set_mask(bam_plp_t iter, int mask)
+{
+    if (mask < 0) iter->flag_mask = BAM_DEF_MASK;
+    else iter->flag_mask = (BAM_FUNMAP | mask);
+}
+
+/* Set the node-count limit that bam_plp_push() checks before buffering a read at the current position. */
+void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
+{
+    bam_plp_t it = iter;
+    it->maxcnt = maxcnt;
+}
+
+/*****************
+ * callback APIs *
+ *****************/
+
+/*
+ * Run the callback-style pileup over a whole BAM stream: every record read
+ * from fp is pushed, then a null record flushes the remaining columns.
+ * Always returns 0.
+ */
+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
+{
+    bam1_t *rec = bam_init1();
+    bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+    bam_plbuf_set_mask(plbuf, mask);
+    for (;;) {
+        if (bam_read1(fp, rec) < 0) break;
+        bam_plbuf_push(rec, plbuf);
+    }
+    bam_plbuf_push(0, plbuf);
+    bam_plbuf_destroy(plbuf);
+    bam_destroy1(rec);
+    return 0;
+}
+
+/* Apply a read-filter mask to the iterator wrapped by buf (see bam_plp_set_mask). */
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+    bam_plp_t inner = buf->iter;
+    bam_plp_set_mask(inner, mask);
+}
+
+/* Rewind the iterator wrapped by buf (see bam_plp_reset). */
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{
+    bam_plp_t inner = buf->iter;
+    bam_plp_reset(inner);
+}
+
+/*
+ * Create a callback-style pileup buffer: a push-driven iterator (no reader
+ * callback) plus the user function and its argument.
+ */
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+    bam_plbuf_t *plbuf = calloc(1, sizeof(bam_plbuf_t));
+    plbuf->iter = bam_plp_init(0, 0);
+    plbuf->data = data;
+    plbuf->func = func;
+    return plbuf;
+}
+
+/* Destroy the wrapped iterator, then the buffer object itself. */
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+    bam_plp_t inner = buf->iter;
+    bam_plp_destroy(inner);
+    free(buf);
+}
+
+/*
+ * Push one alignment (or 0 for end of input) and invoke the user callback
+ * for every pileup column that becomes available. Returns the negative
+ * status of bam_plp_push() on failure, 0 otherwise.
+ */
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{
+    int tid, pos, n_plp;
+    const bam_pileup1_t *plp;
+    int status = bam_plp_push(buf->iter, b);
+    if (status < 0) return status;
+    for (;;) {
+        plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp);
+        if (plp == 0) break;
+        buf->func(tid, pos, n_plp, plp, buf->data);
+    }
+    return 0;
+}
+
+/***********
+ * mpileup *
+ ***********/
+
+struct __bam_mplp_t { // merges n single-input pileup iterators by position
+ int n; // number of iterators
+ uint64_t min, *pos; // pos[i]: tid<<32|pos key of iterator i's current column; min: smallest key found by the last bam_mplp_auto() call
+ bam_plp_t *iter; // the n underlying iterators
+ int *n_plp; // n_plp[i]: depth of iterator i's current column
+ const bam_pileup1_t **plp; // plp[i]: iterator i's current column
+};
+
+/*
+ * Create a multi-input pileup iterator over n inputs. The same reader
+ * callback `func` is used for all inputs, each with its own data[i].
+ * Every position key starts equal to `min` (all ones) so that the first
+ * bam_mplp_auto() call polls every input.
+ */
+bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+{
+    int k = 0;
+    bam_mplp_t mp = calloc(1, sizeof(struct __bam_mplp_t));
+    mp->pos = calloc(n, sizeof(uint64_t));
+    mp->n_plp = calloc(n, sizeof(int));
+    mp->plp = calloc(n, sizeof(void*));
+    mp->iter = calloc(n, sizeof(void*));
+    mp->n = n;
+    mp->min = (uint64_t)-1;
+    while (k < n) {
+        mp->iter[k] = bam_plp_init(func, data[k]);
+        mp->pos[k] = mp->min;
+        ++k;
+    }
+    return mp;
+}
+
+/* Set the same maxcnt limit on every underlying iterator. */
+void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+{
+    int k = 0;
+    while (k < iter->n) {
+        iter->iter[k]->maxcnt = maxcnt;
+        ++k;
+    }
+}
+
+/* Destroy every underlying iterator, then the bookkeeping arrays and the object. */
+void bam_mplp_destroy(bam_mplp_t iter)
+{
+    int k = 0;
+    while (k < iter->n) {
+        bam_plp_destroy(iter->iter[k]);
+        ++k;
+    }
+    free(iter->iter);
+    free(iter->pos);
+    free(iter->n_plp);
+    free(iter->plp);
+    free(iter);
+}
+
+/*
+ * Advance the multi-input pileup to the next covered position.
+ *
+ *   _tid, _pos  receive the position of the returned column
+ *   n_plp, plp  arrays of iter->n entries; entry i receives input i's depth
+ *               and column, or 0 / null if input i does not cover it
+ *
+ * Returns the number of inputs that have a column at the position, or 0
+ * when all inputs are exhausted.
+ *
+ * Fix: bam_plp_auto() can return 0 without writing tid/pos; the key of such
+ * an input was then built from uninitialised values (the valgrind report
+ * noted in the old FIXME). An input without a column now gets the all-ones
+ * key, which can never equal a returned `min`, so it is neither reported
+ * nor polled again until every input is exhausted.
+ */
+int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+{
+    int i, ret = 0;
+    uint64_t new_min = (uint64_t)-1;
+    for (i = 0; i < iter->n; ++i) {
+        if (iter->pos[i] == iter->min) { // this input supplied the last column (or was never polled): advance it
+            int tid = 0, pos = 0;
+            iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
+            if (iter->plp[i]) iter->pos[i] = (uint64_t)tid<<32 | pos;
+            else iter->pos[i] = (uint64_t)-1; // nothing to report: park the key
+        }
+        if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
+    }
+    iter->min = new_min;
+    if (new_min == (uint64_t)-1) return 0;
+    *_tid = new_min>>32; *_pos = (uint32_t)new_min;
+    for (i = 0; i < iter->n; ++i) {
+        if (iter->pos[i] == iter->min) {
+            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
+            ++ret;
+        } else n_plp[i] = 0, plp[i] = 0;
+    }
+    return ret;
+}
diff --git a/bam_reheader.c b/bam_reheader.c
new file mode 100644
index 0000000..6619428
--- /dev/null
+++ b/bam_reheader.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "knetfile.h"
+#include "bgzf.h"
+#include "bam.h"
+
+#define BUF_SIZE 0x10000
+
+/*
+ * Copy the BAM stream `in` to file descriptor `fd`, replacing its header
+ * with `h`. The old header is read only to position `in` at the first
+ * alignment; the rest of the partially consumed block is re-compressed and
+ * the remaining compressed bytes are copied verbatim.
+ *
+ * Returns 0 on success, -1 if `in` is open for writing, the old header
+ * cannot be read, or a buffer / the output stream cannot be set up.
+ *
+ * Changes from the previous revision: the old header is released (it used
+ * to leak) and the results of malloc(), bam_header_read() and
+ * bgzf_fdopen() are checked before use.
+ */
+int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
+{
+    BGZF *fp;
+    bam_header_t *old;
+    int len;
+    uint8_t *buf;
+    if (in->is_write) return -1;
+    buf = malloc(BUF_SIZE);
+    if (buf == 0) return -1;
+    old = bam_header_read(in); // skip over the old header
+    if (old == 0) { free(buf); return -1; }
+    bam_header_destroy(old); // only the stream position matters from here on
+    fp = bgzf_fdopen(fd, "w");
+    if (fp == 0) { free(buf); return -1; }
+    bam_header_write(fp, h);
+    if (in->block_offset < in->block_length) { // flush what is left of the block that held the old header
+        bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
+        bgzf_flush(fp);
+    }
+#ifdef _USE_KNETFILE
+    while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0)
+        fwrite(buf, 1, len, fp->fp);
+#else
+    while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0)
+        fwrite(buf, 1, len, fp->file);
+#endif
+    free(buf);
+    fp->block_offset = in->block_offset = 0; // nothing is pending in either block buffer
+    bgzf_close(fp);
+    return 0;
+}
+
+/*
+ * Entry point of the `reheader` sub-command:
+ *   reheader <in.header.sam> <in.bam>
+ * Writes <in.bam> with the header of <in.header.sam> to standard output;
+ * "-" as <in.bam> reads the BAM from standard input.
+ *
+ * Returns 0 on success and 1 on a usage error, when either input cannot be
+ * read, or - new in this revision - when bam_reheader() reports a failure.
+ * The header is now also released before returning.
+ */
+int main_reheader(int argc, char *argv[])
+{
+    bam_header_t *h;
+    BGZF *in;
+    int ret;
+    if (argc != 3) {
+        fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
+        return 1;
+    }
+    { // read the header
+        tamFile fph = sam_open(argv[1]);
+        if (fph == 0) {
+            fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+            return 1;
+        }
+        h = sam_header_read(fph);
+        sam_close(fph);
+        if (h == 0) {
+            fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+            return 1;
+        }
+    }
+    in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r");
+    if (in == 0) {
+        fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]);
+        bam_header_destroy(h);
+        return 1;
+    }
+    ret = bam_reheader(in, h, fileno(stdout));
+    bgzf_close(in);
+    bam_header_destroy(h);
+    return ret < 0? 1 : 0;
+}
diff --git a/bam_sort.c b/bam_sort.c
new file mode 100644
index 0000000..7d00cd1
--- /dev/null
+++ b/bam_sort.c
@@ -0,0 +1,566 @@
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "bam.h"
+#include "ksort.h"
+
+static int g_is_by_qname = 0;
+
+static int strnum_cmp(const char *_a, const char *_b)
+{ // Natural-order comparison: runs of digits compare as numbers, everything else bytewise; returns <0, 0 or >0.
+	const unsigned char *lhs0 = (const unsigned char*)_a, *rhs0 = (const unsigned char*)_b;
+	const unsigned char *l = lhs0, *r = rhs0;
+	while (*l && *r) {
+		if (!isdigit(*l) || !isdigit(*r)) { // at least one side is not a digit: plain byte comparison
+			if (*l != *r) return (int)*l - (int)*r;
+			++l; ++r; continue;
+		}
+		while (*l == '0') ++l; // leading zeros do not change the numeric value
+		while (*r == '0') ++r;
+		while (isdigit(*l) && isdigit(*r) && *l == *r) ++l, ++r; // skip the digits both numbers share
+		if (isdigit(*l) && isdigit(*r)) { // digits differ here: the number with more digits left is larger
+			int k = 0;
+			while (isdigit(l[k]) && isdigit(r[k])) ++k;
+			return isdigit(l[k])? 1 : isdigit(r[k])? -1 : (int)*l - (int)*r;
+		}
+		if (isdigit(*l)) return 1; // only the left number still has digits
+		if (isdigit(*r)) return -1;
+		if (l - lhs0 != r - rhs0) return l - lhs0 < r - rhs0? 1 : -1; // same value, different number of skipped zeros
+	}
+	return *l? 1 : *r? -1 : 0; // the string with characters left over sorts later
+}
+
+#define HEAP_EMPTY 0xffffffffffffffffull
+
+typedef struct { // one merge-heap entry per input file
+ int i; // index of the input file this entry belongs to (into fp[], fn[], iter[])
+ uint64_t pos, idx; // pos: coordinate key of the current record, or HEAP_EMPTY; idx: running counter assigned as records are read
+ bam1_t *b; // current record of that file; set to 0 once the file is exhausted
+} heap1_t;
+
+#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
+
+static inline int heap_lt(const heap1_t a, const heap1_t b)
+{ // Heap predicate for the merge: non-zero when entry `a` ranks after entry `b`.
+	int cmp;
+	if (!g_is_by_qname) return __pos_cmp(a, b); // coordinate mode: compare pos, then file index, then idx
+	if (a.b == 0) return 1; // an entry without a record ranks after everything else
+	if (b.b == 0) return 0;
+	cmp = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
+	return (cmp > 0 || (cmp == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0))); // equal names: order by flag bits 0xc0
+}
+
+KSORT_INIT(heap, heap1_t, heap_lt)
+
+static void swap_header_targets(bam_header_t *h1, bam_header_t *h2)
+{ // Exchange n_targets, target_name and target_len between the two headers; no other field is touched.
+	bam_header_t tmp; // scratch holder; only the three swapped fields are ever written
+	tmp.target_len = h1->target_len; h1->target_len = h2->target_len; h2->target_len = tmp.target_len;
+	tmp.target_name = h1->target_name; h1->target_name = h2->target_name; h2->target_name = tmp.target_name;
+	tmp.n_targets = h1->n_targets; h1->n_targets = h2->n_targets; h2->n_targets = tmp.n_targets;
+}
+
+static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
+{ // Exchange the text buffers of the two headers together with their recorded lengths.
+	char *saved_text = h1->text;
+	int saved_len = h1->l_text;
+	h1->text = h2->text; h1->l_text = h2->l_text;
+	h2->text = saved_text; h2->l_text = saved_len;
+}
+
+#define MERGE_RG 1
+#define MERGE_UNCOMP 2
+#define MERGE_LEVEL1 4
+#define MERGE_FORCE 8
+
+/*!
+ @abstract Merge multiple sorted BAM files.
+ @param by_qname whether to sort by query name
+ @param out output BAM file name ("-" for standard output)
+ @param headers name of SAM file from which to copy '@' header lines,
+ or NULL to copy them from the first file to be merged
+ @param n number of files to be merged
+ @param fn names of files to be merged
+
+ @discussion Padding information may NOT be correctly maintained. This
+ function is NOT thread safe.
+ */
+int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level)
+{ // flag: MERGE_* bits; reg: optional region (index of each input is then loaded); level: 0..9 or <0 for default. Returns 0, or -1 on error.
+	bamFile fpout, *fp;
+	heap1_t *heap;
+	bam_header_t *hout = 0;
+	bam_header_t *hheaders = NULL;
+	int i, j, *RG_len = 0;
+	uint64_t idx = 0; // running counter stored in each heap entry; last tie-breaker in __pos_cmp
+	char **RG = 0, mode[8];
+	bam_iter_t *iter = 0;
+
+	if (headers) {
+		tamFile fpheaders = sam_open(headers);
+		if (fpheaders == 0) {
+			const char *message = strerror(errno);
+			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+			return -1;
+		}
+		hheaders = sam_header_read(fpheaders);
+		sam_close(fpheaders);
+	}
+
+	g_is_by_qname = by_qname; // global read by heap_lt(); this is why the function is not thread safe
+	fp = (bamFile*)calloc(n, sizeof(bamFile));
+	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
+	// prepare RG tag
+	if (flag & MERGE_RG) {
+		RG = (char**)calloc(n, sizeof(void*));
+		RG_len = (int*)calloc(n, sizeof(int));
+		for (i = 0; i != n; ++i) { // RG value = file name without directory and without a trailing ".bam"
+			int l = strlen(fn[i]);
+			const char *s = fn[i];
+			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
+			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
+			++j; l -= j;
+			RG[i] = calloc(l + 1, 1);
+			RG_len[i] = l;
+			strncpy(RG[i], s + j, l);
+		}
+	}
+	// read the first
+	for (i = 0; i != n; ++i) {
+		bam_header_t *hin;
+		fp[i] = bam_open(fn[i], "r");
+		if (fp[i] == 0) {
+			int j;
+			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+			for (j = 0; j < i; ++j) bam_close(fp[j]);
+			free(fp); free(heap); free(iter);
+			// FIXME: RG/RG_len, hout and hheaders are still leaked on this path
+			return -1;
+		}
+		hin = bam_header_read(fp[i]);
+		if (i == 0) { // the first BAM
+			hout = hin;
+		} else { // validate multiple baf
+			int min_n_targets = hout->n_targets;
+			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;
+
+			for (j = 0; j < min_n_targets; ++j)
+				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
+					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
+							hout->target_name[j], hin->target_name[j], fn[i]);
+					return -1;
+				}
+
+			// If this input file has additional target reference sequences,
+			// add them to the headers to be output
+			if (hin->n_targets > hout->n_targets) {
+				swap_header_targets(hout, hin);
+				// FIXME Possibly we should also create @SQ text headers
+				// for the newly added reference sequences
+			}
+
+			bam_header_destroy(hin);
+		}
+	}
+
+	if (hheaders) {
+		// If the text headers to be swapped in include any @SQ headers,
+		// check that they are consistent with the existing binary list
+		// of reference information.
+		if (hheaders->n_targets > 0) {
+			if (hout->n_targets != hheaders->n_targets) {
+				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
+				if (!reg) return -1;
+			}
+			for (j = 0; j < hout->n_targets; ++j) // NOTE(review): with reg set and fewer @SQ lines than targets this indexes past hheaders->target_name
+				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
+					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
+					if (!reg) return -1;
+				}
+		}
+
+		swap_header_text(hout, hheaders);
+		bam_header_destroy(hheaders);
+	}
+
+	if (reg) {
+		int tid, beg, end;
+		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
+			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
+			return -1;
+		}
+		for (i = 0; i < n; ++i) {
+			bam_index_t *idx = bam_index_load(fn[i]);
+			if (idx == 0) { fprintf(stderr, "[%s] fail to load the index of '%s'\n", __func__, fn[i]); return -1; }
+			iter[i] = bam_iter_query(idx, tid, beg, end);
+			bam_index_destroy(idx);
+		}
+	}
+
+	for (i = 0; i < n; ++i) { // prime the heap with the first record of every input
+		heap1_t *h = heap + i;
+		h->i = i;
+		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
+		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
+			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
+			h->idx = idx++;
+		}
+		else h->pos = HEAP_EMPTY;
+	}
+	if (flag & MERGE_UNCOMP) level = 0;
+	else if (flag & MERGE_LEVEL1) level = 1;
+	strcpy(mode, "w");
+	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
+	if ((fpout = strcmp(out, "-")? bam_open(out, mode) : bam_dopen(fileno(stdout), mode)) == 0) { // pass `mode`, so the requested level is actually applied
+		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
+		return -1;
+	}
+	bam_header_write(fpout, hout);
+	bam_header_destroy(hout);
+	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
+
+	ks_heapmake(heap, n, heap);
+	while (heap->pos != HEAP_EMPTY) { // heap[0] is always the next record to output
+		bam1_t *b = heap->b;
+		if (flag & MERGE_RG) { // replace any existing RG tag with the one derived from the file name
+			uint8_t *rg = bam_aux_get(b, "RG");
+			if (rg) bam_aux_del(b, rg);
+			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
+		}
+		bam_write1_core(fpout, &b->core, b->data_len, b->data);
+		if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
+			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
+			heap->idx = idx++;
+		} else if (j == -1) { // this input is exhausted
+			heap->pos = HEAP_EMPTY;
+			free(heap->b->data); free(heap->b);
+			heap->b = 0;
+		} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+		ks_heapadjust(heap, 0, n, heap);
+	}
+
+	if (flag & MERGE_RG) {
+		for (i = 0; i != n; ++i) free(RG[i]);
+		free(RG); free(RG_len);
+	}
+	for (i = 0; i != n; ++i) {
+		bam_iter_destroy(iter[i]);
+		bam_close(fp[i]);
+	}
+	bam_close(fpout);
+	free(fp); free(heap); free(iter);
+	return 0;
+}
+
+int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
+{ // Wrapper around bam_merge_core2() that passes n_threads = 0 and level = -1; returns its result unchanged.
+ return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, 0, -1);
+}
+
+int bam_merge(int argc, char *argv[])
+{ // "merge" command: parse options, refuse to overwrite an existing output unless -f, then call bam_merge_core2(). Returns 0 on success, 1 otherwise.
+	int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
+	char *fn_headers = NULL, *reg = 0; // heap copies of the -h and -R arguments; freed on every exit path
+
+	while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) {
+		switch (c) {
+		case 'r': flag |= MERGE_RG; break;
+		case 'f': flag |= MERGE_FORCE; break;
+		case 'h': free(fn_headers); fn_headers = strdup(optarg); break; // a repeated option replaces the earlier value
+		case 'n': is_by_qname = 1; break;
+		case '1': flag |= MERGE_LEVEL1; break;
+		case 'u': flag |= MERGE_UNCOMP; break;
+		case 'R': free(reg); reg = strdup(optarg); break;
+		case 'l': level = atoi(optarg); break;
+		case '@': n_threads = atoi(optarg); break;
+		}
+	}
+	if (optind + 2 >= argc) { // need the output name plus at least two inputs
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
+		fprintf(stderr, "Options: -n sort by read names\n");
+		fprintf(stderr, " -r attach RG tag (inferred from file names)\n");
+		fprintf(stderr, " -u uncompressed BAM output\n");
+		fprintf(stderr, " -f overwrite the output BAM if exist\n");
+		fprintf(stderr, " -1 compress level 1\n");
+		fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
+		fprintf(stderr, " -@ INT number of BAM compression threads [0]\n");
+		fprintf(stderr, " -R STR merge file in the specified region STR [all]\n");
+		fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n");
+		fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
+		fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n");
+		fprintf(stderr, " the header dictionary in merging.\n\n");
+		free(reg); free(fn_headers); return 1;
+	}
+	if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { // existence check is skipped for stdout and with -f
+		FILE *fp = fopen(argv[optind], "rb");
+		if (fp != NULL) {
+			fclose(fp);
+			fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
+			free(reg); free(fn_headers); return 1;
+		}
+	}
+	if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1;
+	free(reg);
+	free(fn_headers);
+	return ret;
+}
+
+/***************
+ * BAM sorting *
+ ***************/
+
+#include <pthread.h>
+
+typedef bam1_t *bam1_p;
+
+static int change_SO(bam_header_t *h, const char *so)
+{ // Make the @HD line of h->text carry "SO:<so>", adding the field or a whole @HD line if missing; returns 0, or -1 if @HD has no newline.
+	char *p, *q, *beg = 0, *end = 0, *newtext;
+	if (h->l_text > 3) {
+		if (strncmp(h->text, "@HD", 3) == 0) {
+			if ((p = strchr(h->text, '\n')) == 0) return -1;
+			*p = '\0'; // temporarily cut the text after the first line so strstr() only searches @HD
+			if ((q = strstr(h->text, "\tSO:")) != 0) {
+				*p = '\n'; // change back
+				beg = q;
+				for (q += 4; *q != '\n' && *q != '\t'; ++q); // the SO value ends at the next tab or at end of line
+				end = q;
+				// compare the whole value, not just a prefix, so that e.g. "SO:coord" or an empty "SO:" is rewritten
+				if ((size_t)(end - beg - 4) == strlen(so) && strncmp(beg + 4, so, end - beg - 4) == 0) return 0; // no need to change
+			} else beg = end = p, *p = '\n'; // no SO field: insert one at the end of the @HD line
+		}
+	}
+	if (beg == 0) { // no @HD
+		h->l_text += strlen(so) + 15; // 15 = strlen("@HD\tVN:1.3\tSO:") + 1 for the newline
+		newtext = malloc(h->l_text + 1);
+		sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so);
+		strcat(newtext, h->text);
+	} else { // has @HD but different or no SO
+		h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end);
+		newtext = malloc(h->l_text + 1);
+		strncpy(newtext, h->text, beg - h->text); // not terminated here; the sprintf below writes right after the copied prefix
+		sprintf(newtext + (beg - h->text), "\tSO:%s", so);
+		strcat(newtext, end);
+	}
+	free(h->text);
+	h->text = newtext;
+	return 0;
+}
+
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{ // Sort predicate: non-zero when record a must come before record b under the ordering selected by g_is_by_qname.
+	if (g_is_by_qname) {
+		int t = strnum_cmp(bam1_qname(a), bam1_qname(b));
+		return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0)));
+	} // coordinate key built as in bam_merge_core2(): the uint32_t cast keeps (pos+1)<<1 out of signed-int overflow
+	return (((uint64_t)a->core.tid<<32|(uint32_t)((int32_t)a->core.pos+1)<<1|bam1_strand(a)) < ((uint64_t)b->core.tid<<32|(uint32_t)((int32_t)b->core.pos+1)<<1|bam1_strand(b)));
+}
+KSORT_INIT(sort, bam1_p, bam1_lt)
+
+typedef struct { // work item handed to one sorting thread by sort_blocks()
+ size_t buf_len; // number of records in buf
+ const char *prefix; // prefix of the temporary file name
+ bam1_p *buf; // records to sort
+ const bam_header_t *h; // header written at the top of the temporary file
+ int index; // number used in the temporary file name "<prefix>.<index>.bam"
+} worker_t;
+
+static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, int n_threads)
+{ // Write header h followed by the l records of buf to fn ("-" means stdout); returns silently if the output cannot be opened.
+	bamFile out;
+	size_t n;
+	if (strcmp(fn, "-") == 0) out = bam_dopen(fileno(stdout), mode);
+	else out = bam_open(fn, mode);
+	if (out == 0) return;
+	bam_header_write(out, h);
+	if (n_threads > 1) bgzf_mt(out, n_threads, 256); // multi-threaded compression only when more than one thread was requested
+	for (n = 0; n != l; ++n) bam_write1_core(out, &buf[n]->core, buf[n]->data_len, buf[n]->data);
+	bam_close(out);
+}
+
+static void *worker(void *data)
+{ // Thread entry point: sort one slice of records and write it to "<prefix>.<index>.bam" with mode "w1". Always returns 0.
+	worker_t *job = (worker_t*)data;
+	char *path;
+	ks_mergesort(sort, job->buf_len, job->buf, 0);
+	path = (char*)calloc(strlen(job->prefix) + 20, 1); // 20 extra bytes for the ".NNNN.bam" suffix and the terminator
+	sprintf(path, "%s.%.4d.bam", job->prefix, job->index);
+	write_buffer(path, "w1", job->buf_len, job->buf, job->h, 0);
+	free(path);
+	return 0;
+}
+
+static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads)
+{ // Split the k records of buf across n_threads workers, each writing one temporary file numbered from n_files; returns the new file count.
+	int i;
+	size_t rest;
+	bam1_p *b;
+	pthread_t *tid;
+	pthread_attr_t attr;
+	worker_t *w;
+
+	if (n_threads < 1) n_threads = 1;
+	if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records
+	pthread_attr_init(&attr);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+	w = calloc(n_threads, sizeof(worker_t));
+	tid = calloc(n_threads, sizeof(pthread_t));
+	b = buf; rest = k;
+	for (i = 0; i < n_threads; ++i) {
+		w[i].buf_len = rest / (n_threads - i); // even share of what is left, so the last worker takes the remainder
+		w[i].buf = b;
+		w[i].prefix = prefix;
+		w[i].h = h;
+		w[i].index = n_files + i;
+		b += w[i].buf_len; rest -= w[i].buf_len;
+		pthread_create(&tid[i], &attr, worker, &w[i]);
+	}
+	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+	pthread_attr_destroy(&attr); free(tid); free(w); // the attribute object used to be left undestroyed
+	return n_files + n_threads;
+}
+
+/*!
+ @abstract Sort an unsorted BAM file based on the chromosome order
+ and the leftmost position of an alignment
+
+ @param is_by_qname whether to sort by query name
+ @param fn name of the file to be sorted
+ @param prefix prefix of the output and the temporary files; upon
+ success, prefix.bam will be written.
+ @param max_mem approximate maximum memory (very inaccurate)
+
+ @discussion It may create multiple temporary subalignment files
+ and then merge them by calling bam_merge_core2(). This function is
+ NOT thread safe.
+ */
+void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level)
+{ // Sort BAM file fn ("-" = stdin) into "<prefix>.bam", or to stdout when is_stdout is set; errors are only reported on stderr.
+ int ret, i, n_files = 0;
+ size_t mem, max_k, k, max_mem;
+ bam_header_t *header;
+ bamFile fp;
+ bam1_t *b, **buf;
+ char *fnout = 0;
+
+ if (n_threads < 2) n_threads = 1;
+ g_is_by_qname = is_by_qname; // global read by bam1_lt() and heap_lt()
+ max_k = k = 0; mem = 0;
+ max_mem = _max_mem * n_threads; // _max_mem is per thread; the in-memory batch is shared by all threads
+ buf = 0;
+ fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+ if (fp == 0) {
+ fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
+ return;
+ }
+ header = bam_header_read(fp);
+ if (is_by_qname) change_SO(header, "queryname");
+ else change_SO(header, "coordinate");
+ // write sub files
+ for (;;) {
+ if (k == max_k) { // record array is full: double it (first size 0x10000) and zero the new slots
+ size_t old_max = max_k;
+ max_k = max_k? max_k<<1 : 0x10000;
+ buf = realloc(buf, max_k * sizeof(void*));
+ memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max));
+ }
+ if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); // record structs are reused across batches
+ b = buf[k];
+ if ((ret = bam_read1(fp, b)) < 0) break;
+ if (b->data_len < b->m_data>>2) { // shrink
+ b->m_data = b->data_len;
+ kroundup32(b->m_data);
+ b->data = realloc(b->data, b->m_data);
+ }
+ mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
+ ++k;
+ if (mem >= max_mem) { // budget reached: sort this batch, write it to temporary files, start a new batch
+ n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+ mem = k = 0;
+ }
+ }
+ if (ret != -1) // any read result below -1 is reported as truncation
+ fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+ // output file name
+ fnout = calloc(strlen(prefix) + 20, 1);
+ if (is_stdout) sprintf(fnout, "-");
+ else sprintf(fnout, "%s.bam", prefix);
+ // write the final output
+ if (n_files == 0) { // a single block
+ char mode[8];
+ strcpy(mode, "w");
+ if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
+ ks_mergesort(sort, k, buf, 0);
+ write_buffer(fnout, mode, k, buf, header, n_threads);
+ } else { // then merge
+ char **fns;
+ n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); // flush the last, partial batch
+ fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
+ fns = (char**)calloc(n_files, sizeof(char*));
+ for (i = 0; i < n_files; ++i) { // rebuild the names that worker() used for the temporary files
+ fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+ sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+ }
+ bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level);
+ for (i = 0; i < n_files; ++i) { // remove the temporary files
+ unlink(fns[i]);
+ free(fns[i]);
+ }
+ free(fns);
+ }
+ free(fnout);
+ // free
+ for (k = 0; k < max_k; ++k) {
+ if (!buf[k]) continue;
+ free(buf[k]->data);
+ free(buf[k]);
+ }
+ free(buf);
+ bam_header_destroy(header);
+ bam_close(fp);
+}
+
+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
+{ // Wrapper around bam_sort_core_ext() with is_stdout = 0, n_threads = 0 and level = -1.
+ bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0, 0, -1);
+}
+
+int bam_sort(int argc, char *argv[])
+{ // "sort" command: parse options and call bam_sort_core_ext(); returns 1 after printing usage, 0 otherwise.
+ size_t max_mem = 768<<20; // 768MB, the default shown in the usage text below
+ int c, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1;
+ while ((c = getopt(argc, argv, "nom:@:l:")) >= 0) {
+ switch (c) {
+ case 'o': is_stdout = 1; break;
+ case 'n': is_by_qname = 1; break;
+ case 'm': { // number with an optional K/M/G suffix (either case)
+ char *q;
+ max_mem = strtol(optarg, &q, 0);
+ if (*q == 'k' || *q == 'K') max_mem <<= 10;
+ else if (*q == 'm' || *q == 'M') max_mem <<= 20;
+ else if (*q == 'g' || *q == 'G') max_mem <<= 30;
+ break;
+ }
+ case '@': n_threads = atoi(optarg); break;
+ case 'l': level = atoi(optarg); break;
+ }
+ }
+ if (optind + 2 > argc) { // both <in.bam> and <out.prefix> are required
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools sort [options] <in.bam> <out.prefix>\n\n");
+ fprintf(stderr, "Options: -n sort by read name\n");
+ fprintf(stderr, " -o final output to stdout\n");
+ fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
+ fprintf(stderr, " -@ INT number of sorting and compression threads [1]\n");
+ fprintf(stderr, " -m INT max memory per thread; suffix K/M/G recognized [768M]\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level);
+ return 0;
+}
diff --git a/bedidx.c b/bedidx.c
new file mode 100644
index 0000000..ec75a10
--- /dev/null
+++ b/bedidx.c
@@ -0,0 +1,162 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#ifdef _WIN32
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint64_t)
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct { // intervals of one chromosome plus their linear index
+ int n, m; // n: number of intervals in a; m: capacity of a while reading, later overwritten by bed_index() with the count returned by bed_index_core()
+ uint64_t *a; // packed intervals: begin in the high 32 bits, end in the low 32 bits
+ int *idx; // linear index built by bed_index_core(); 0 until bed_index() has run
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+int *bed_index_core(int n, uint64_t *a, int *n_idx)
+{ // Build a linear index over the n packed intervals in a: idx[w] is the first interval touching window w (windows of 1<<LIDX_SHIFT bases), or -1.
+ int i, j, m, *idx;
+ m = *n_idx = 0; idx = 0;
+ for (i = 0; i < n; ++i) {
+ int beg, end;
+ beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; // first and last window covered by interval i
+ if (m < end + 1) { // grow the index to hold window `end`; new slots start as -1
+ int oldm = m;
+ m = end + 1;
+ kroundup32(m);
+ idx = realloc(idx, m * sizeof(int));
+ for (j = oldm; j < m; ++j) idx[j] = -1;
+ }
+ if (beg == end) {
+ if (idx[beg] < 0) idx[beg] = i;
+ } else {
+ for (j = beg; j <= end; ++j)
+ if (idx[j] < 0) idx[j] = i; // only the first interval to reach a window is recorded
+ }
+ *n_idx = end + 1; // NOTE(review): reflects the last interval processed, not the largest end seen - confirm this is intended
+ }
+ return idx; // 0 when n == 0
+}
+
+void bed_index(void *_h)
+{ // (Re)build the linear index of every chromosome in the hash: sort its intervals, then index them.
+	reghash_t *hash = (reghash_t*)_h;
+	khint_t it;
+	for (it = 0; it < kh_end(hash); ++it) {
+		bed_reglist_t *lst;
+		if (!kh_exist(hash, it)) continue; // skip buckets that hold no entry
+		lst = &kh_val(hash, it);
+		free(lst->idx); // drop any previous index; free(NULL) is a no-op
+		ks_introsort(uint64_t, lst->n, lst->a);
+		lst->idx = bed_index_core(lst->n, lst->a, &lst->m);
+	}
+}
+
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
+{ // Return 1 if some interval in p has end > beg and begin < end, otherwise 0.
+ int i, min_off;
+ if (p->n == 0) return 0;
+ min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; // NOTE(review): idx is bounded by p->n (interval count) here although bed_index() stores the index length in p->m - verify
+ if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
+ int n = beg>>LIDX_SHIFT;
+ if (n > p->n) n = p->n;
+ for (i = n - 1; i >= 0; --i) // walk back to the nearest earlier window that has an entry
+ if (p->idx[i] >= 0) break;
+ min_off = i >= 0? p->idx[i] : 0;
+ }
+ for (i = min_off; i < p->n; ++i) { // scan forward from the starting interval
+ if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+ if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+ return 1; // find the overlap; return
+ }
+ return 0;
+}
+
+int bed_overlap(const void *_h, const char *chr, int beg, int end)
+{ // Return 1 if the range beg..end overlaps an interval stored for chr; 0 otherwise, including for a NULL hash or an unknown chr.
+	const reghash_t *hash = (const reghash_t*)_h;
+	khint_t it;
+	if (hash == 0) return 0;
+	it = kh_get(reg, hash, chr);
+	if (it != kh_end(hash)) return bed_overlap_core(&kh_val(hash, it), beg, end);
+	return 0;
+}
+
+void *bed_read(const char *fn)
+{ // Read intervals from the (optionally gzipped) file fn, "-" for stdin; returns an indexed hash keyed by chromosome name, or 0 if it cannot be opened.
+	reghash_t *h = kh_init(reg);
+	gzFile fp;
+	kstream_t *ks;
+	int dret;
+	kstring_t *str;
+	// read the list
+	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) { kh_destroy(reg, h); return 0; } // the empty hash used to be leaked here
+	str = calloc(1, sizeof(kstring_t));
+	ks = ks_init(fp);
+	while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+		int beg = -1, end = -1;
+		bed_reglist_t *p;
+		khint_t k = kh_get(reg, h, str->s);
+		if (k == kh_end(h)) { // absent from the hash table
+			int ret;
+			char *s = strdup(str->s); // the hash keeps this copy; bed_destroy() frees it
+			k = kh_put(reg, h, s, &ret);
+			memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+		}
+		p = &kh_val(h, k);
+		if (dret != '\n') { // if the lines has other characters
+			if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) { // cast: isdigit() needs an unsigned char value
+				beg = atoi(str->s); // begin
+				if (dret != '\n') {
+					if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
+						end = atoi(str->s); // end
+						if (end < beg) end = -1;
+					}
+				}
+			}
+		}
+		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+		if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+		if (beg >= 0 && end > beg) { // store only well-formed, non-empty intervals
+			if (p->n == p->m) {
+				p->m = p->m? p->m<<1 : 4;
+				p->a = realloc(p->a, p->m * 8);
+			}
+			p->a[p->n++] = (uint64_t)beg<<32 | end;
+		}
+	}
+	ks_destroy(ks);
+	gzclose(fp);
+	free(str->s); free(str);
+	bed_index(h);
+	return h;
+}
+
+void bed_destroy(void *_h)
+{ // Free every interval array, index and chromosome-name key, then the hash itself; a NULL handle is ignored.
+	reghash_t *h = (reghash_t*)_h;
+	khint_t k;
+	if (h == 0) return; // bed_read() returns 0 on failure; accept that here, as bed_overlap() does
+	for (k = 0; k < kh_end(h); ++k) {
+		if (!kh_exist(h, k)) continue;
+		free(kh_val(h, k).a);
+		free(kh_val(h, k).idx);
+		free((char*)kh_key(h, k)); // keys are the strdup() copies made in bed_read()
+	}
+	kh_destroy(reg, h);
+}
diff --git a/bgzf.c b/bgzf.c
new file mode 100644
index 0000000..880d5af
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,694 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include "bgzf.h"
+
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+typedef knetFile *_bgzf_file_t;
+#define _bgzf_open(fn, mode) knet_open(fn, mode)
+#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
+#define _bgzf_close(fp) knet_close(fp)
+#define _bgzf_fileno(fp) ((fp)->fd)
+#define _bgzf_tell(fp) knet_tell(fp)
+#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
+#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
+#else // ~defined(_USE_KNETFILE)
+#if defined(_WIN32) || defined(_MSC_VER)
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else // ~defined(_WIN32)
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif // ~defined(_WIN32)
+typedef FILE *_bgzf_file_t;
+#define _bgzf_open(fn, mode) fopen(fn, mode)
+#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
+#define _bgzf_close(fp) fclose(fp)
+#define _bgzf_fileno(fp) fileno(fp)
+#define _bgzf_tell(fp) ftello(fp)
+#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
+#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
+#endif // ~define(_USE_KNETFILE)
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+
+/* BGZF/GZIP header (speciallized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
+
+#ifdef BGZF_CACHE
+typedef struct {
+ int size;
+ uint8_t *block;
+ int64_t end_offset;
+} cache_t;
+#include "khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t)
+#endif
+
+static inline void packInt16(uint8_t *buffer, uint16_t value)
+{ // Store value into buffer[0..1], least significant byte first.
+	buffer[0] = (uint8_t)(value & 0xff);
+	buffer[1] = (uint8_t)((value >> 8) & 0xff);
+}
+
+static inline int unpackInt16(const uint8_t *buffer)
+{ // Read an unsigned 16-bit value from buffer[0..1], least significant byte first; the result is in 0..65535.
+	return ((int)buffer[1] << 8) | (int)buffer[0];
+}
+
+static inline void packInt32(uint8_t *buffer, uint32_t value)
+{ // Store value into buffer[0..3], least significant byte first.
+	int byte;
+	for (byte = 0; byte < 4; ++byte)
+		buffer[byte] = (uint8_t)(value >> (8 * byte));
+	// buffer[0] now holds the lowest byte and buffer[3] the highest
+}
+
+static BGZF *bgzf_read_init()
+{ // Allocate a zeroed BGZF handle for reading, with both block buffers (and the block cache when BGZF_CACHE is defined).
+	BGZF *bz;
+	bz = calloc(1, sizeof(BGZF)); // allocation results are not checked here
+	bz->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	bz->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	bz->is_write = 0; // already zero after calloc; stated explicitly to mark the mode
+#ifdef BGZF_CACHE
+	bz->cache = kh_init(cache);
+#endif
+	return bz;
+}
+
+static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
+{ // Allocate a BGZF handle for writing; a level outside 0..9 falls back to Z_DEFAULT_COMPRESSION.
+	BGZF *bz = calloc(1, sizeof(BGZF));
+	int level = compress_level;
+	if (level < 0 || level > 9) level = Z_DEFAULT_COMPRESSION; // Z_DEFAULT_COMPRESSION==-1
+	bz->is_write = 1;
+	bz->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	bz->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	bz->compress_level = level;
+	return bz;
+}
+// get the compress level from the mode string
+static int mode2level(const char *__restrict mode)
+{ // The first decimal digit in mode is the level; a 'u' anywhere forces 0; with neither the result is -1.
+	const char *c = mode;
+	int level = -1;
+	if (strchr(mode, 'u')) return 0; // 'u' overrides any digit
+	while (*c && (*c < '0' || *c > '9')) ++c; // advance to the first digit, if there is one
+	if (*c) level = (int)*c - '0';
+	return level;
+}
+
+BGZF *bgzf_open(const char *path, const char *mode)
+{ // Open path for reading ('r'/'R' in mode) or writing ('w'/'W'); returns 0 on failure or when mode contains neither letter.
+	BGZF *bz = 0;
+	assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+	if (strpbrk(mode, "rR") != 0) { // reading is chosen when both letters are present
+		_bgzf_file_t src = _bgzf_open(path, "r");
+		if (src == 0) return 0;
+		bz = bgzf_read_init();
+		bz->fp = src;
+	} else if (strpbrk(mode, "wW") != 0) {
+		FILE *dst = fopen(path, "w");
+		if (dst == 0) return 0;
+		bz = bgzf_write_init(mode2level(mode)); // the compression level is taken from the mode string
+		bz->fp = dst;
+	}
+	return bz;
+}
+
+BGZF *bgzf_dopen(int fd, const char *mode)
+{ // Same as bgzf_open() but wraps the already-open descriptor fd; returns 0 on failure or when mode contains neither 'r' nor 'w'.
+	BGZF *bz = 0;
+	assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+	if (strpbrk(mode, "rR") != 0) { // reading is chosen when both letters are present
+		_bgzf_file_t src = _bgzf_dopen(fd, "r");
+		if (src == 0) return 0;
+		bz = bgzf_read_init();
+		bz->fp = src;
+	} else if (strpbrk(mode, "wW") != 0) {
+		FILE *dst = fdopen(fd, "w");
+		if (dst == 0) return 0;
+		bz = bgzf_write_init(mode2level(mode)); // the compression level is taken from the mode string
+		bz->fp = dst;
+	}
+	return bz;
+}
+
+static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
+{ // Compress slen bytes of src into one BGZF block at _dst; *dlen is the capacity on entry and the block size on return. 0 on success, -1 on error.
+	uint32_t crc;
+	z_stream zs;
+	uint8_t *dst = (uint8_t*)_dst;
+
+	// compress the body
+	zs.zalloc = NULL; zs.zfree = NULL; zs.opaque = NULL; // zlib requires all three to be set before deflateInit2()
+	zs.next_in = src;
+	zs.avail_in = slen;
+	zs.next_out = dst + BLOCK_HEADER_LENGTH;
+	zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+	if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
+	if (deflate(&zs, Z_FINISH) != Z_STREAM_END) { deflateEnd(&zs); return -1; } // release the zlib state instead of leaking it
+	if (deflateEnd(&zs) != Z_OK) return -1;
+	*dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+	// write the header
+	memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+	packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
+	// write the footer
+	crc = crc32(crc32(0L, NULL, 0L), src, slen);
+	packInt32((uint8_t*)&dst[*dlen - 8], crc); // footer: CRC32 of the uncompressed data, then its length
+	packInt32((uint8_t*)&dst[*dlen - 4], slen);
+	return 0;
+}
+
+// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
+static int deflate_block(BGZF *fp, int block_length)
+{ // Returns the size of the compressed block, or -1 with BGZF_ERR_ZLIB recorded in fp->errcode.
+	int out_len = BGZF_MAX_BLOCK_SIZE; // in: capacity of compressed_block; out: bytes produced
+	if (bgzf_compress(fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level) == 0) {
+		fp->block_offset = 0; // the uncompressed buffer starts over
+		return out_len;
+	}
+	fp->errcode |= BGZF_ERR_ZLIB;
+	return -1;
+}
+
+// Inflate the block in fp->compressed_block into fp->uncompressed_block
+static int inflate_block(BGZF* fp, int block_length)
+{ // Returns the number of uncompressed bytes, or -1 with BGZF_ERR_ZLIB recorded in fp->errcode.
+	z_stream strm;
+	int status;
+	strm.zalloc = NULL;
+	strm.zfree = NULL;
+	strm.next_in = fp->compressed_block + 18; // start after the 18-byte block header
+	strm.avail_in = block_length - 16; // NOTE(review): 16 rather than header + footer; presumably harmless because inflate stops at end of stream - confirm
+	strm.next_out = fp->uncompressed_block;
+	strm.avail_out = BGZF_MAX_BLOCK_SIZE;
+
+	status = inflateInit2(&strm, -15); // negative window bits: raw deflate data
+	if (status != Z_OK) goto fail;
+	status = inflate(&strm, Z_FINISH);
+	if (status != Z_STREAM_END) {
+		inflateEnd(&strm); // release the zlib state before reporting the error
+		goto fail;
+	}
+	status = inflateEnd(&strm);
+	if (status != Z_OK) goto fail;
+	return strm.total_out;
+fail:
+	fp->errcode |= BGZF_ERR_ZLIB;
+	return -1;
+}
+
+static int check_header(const uint8_t *header)
+{ // Returns 1 when header has the gzip magic, method 8, flag bit 4 set, a 6-byte extra field and a "BC" subfield of length 2; else 0.
+	if (header[0] != 31 || header[1] != 139 || header[2] != 8) return 0;
+	if ((header[3] & 4) == 0) return 0; // the extra-field flag must be set
+	if (unpackInt16((uint8_t*)&header[10]) != 6) return 0; // total length of the extra field
+	return header[12] == 'B' && header[13] == 'C' && unpackInt16((uint8_t*)&header[14]) == 2;
+}
+
+#ifdef BGZF_CACHE
+static void free_cache(BGZF *fp)
+{ // Free every cached block and then the cache table; does nothing for a handle opened for writing.
+	khash_t(cache) *tbl = (khash_t(cache)*)fp->cache;
+	khint_t it;
+	if (fp->is_write) return;
+	for (it = kh_begin(tbl); it != kh_end(tbl); ++it)
+		if (kh_exist(tbl, it)) free(kh_val(tbl, it).block);
+	kh_destroy(cache, tbl);
+}
+
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{ // If the block at block_address is cached, install it as the current block and return its size; return 0 on a miss.
+ khint_t k;
+ cache_t *p;
+ khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+ k = kh_get(cache, h, block_address);
+ if (k == kh_end(h)) return 0; // not cached
+ p = &kh_val(h, k);
+ if (fp->block_length != 0) fp->block_offset = 0; // same rule as in bgzf_read_block(): the offset is kept when block_length is 0
+ fp->block_address = block_address;
+ fp->block_length = p->size;
+ memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE);
+ _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); // move the file to the end offset recorded when the block was cached
+ return p->size;
+}
+
+static void cache_block(BGZF *fp, int size)
+{ // Store a copy of the current uncompressed block in the cache, keyed by fp->block_address; size is the block's length in the file.
+ int ret;
+ khint_t k;
+ cache_t *p;
+ khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+ if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; // cache too small to hold even one block
+ if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) {
+ /* A better way would be to remove the oldest block in the
+ * cache, but here we remove a random one for simplicity. This
+ * should not have a big impact on performance. */
+ for (k = kh_begin(h); k < kh_end(h); ++k)
+ if (kh_exist(h, k)) break; // the victim is the first occupied bucket
+ if (k < kh_end(h)) {
+ free(kh_val(h, k).block);
+ kh_del(cache, h, k);
+ }
+ }
+ k = kh_put(cache, h, fp->block_address, &ret);
+ if (ret == 0) return; // if this happens, a bug!
+ p = &kh_val(h, k);
+ p->size = fp->block_length;
+ p->end_offset = fp->block_address + size; // file offset just past this block
+ p->block = malloc(BGZF_MAX_BLOCK_SIZE);
+ memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+}
+#else
+/* Block caching disabled (BGZF_CACHE not defined): no-op replacements. The
+ * load stub always reports a miss (0), so callers fall back to reading. */
+static void free_cache(BGZF *fp) {}
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
+static void cache_block(BGZF *fp, int size) {}
+#endif
+
+/* Load the block at the current file position into fp->uncompressed_block.
+ * Returns 0 on success or at end of file (fp->block_length is then 0) and
+ * -1 on failure, with the reason OR-ed into fp->errcode. */
+int bgzf_read_block(BGZF *fp)
+{
+    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+    int count, size = 0, block_length, remaining;
+    int64_t block_address;
+    block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
+    count = _bgzf_read(fp->fp, header, sizeof(header));
+    if (count == 0) { // no data read
+        fp->block_length = 0;
+        return 0;
+    }
+    if (count != sizeof(header) || !check_header(header)) {
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    size = count;
+    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+    if (block_length < BLOCK_HEADER_LENGTH) {
+        // corrupt size field: the remainder would be negative and be
+        // handed to the read call as a byte count
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    compressed_block = (uint8_t*)fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+    if (count != remaining) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    size += count;
+    if ((count = inflate_block(fp, block_length)) < 0) return -1;
+    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+    fp->block_address = block_address;
+    fp->block_length = count;
+    cache_block(fp, size);
+    return 0;
+}
+
+/* Copy up to `length` uncompressed bytes into `data`, loading further blocks
+ * with bgzf_read_block() whenever the current one is used up.
+ * Returns the number of bytes copied (fewer than `length` when the data runs
+ * out), 0 when length <= 0, or -1 when loading a block fails. */
+ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
+{
+ ssize_t bytes_read = 0;
+ uint8_t *output = data;
+ if (length <= 0) return 0;
+ assert(fp->is_write == 0);
+ while (bytes_read < length) {
+ int copy_length, available = fp->block_length - fp->block_offset;
+ uint8_t *buffer;
+ if (available <= 0) {
+ if (bgzf_read_block(fp) != 0) return -1;
+ available = fp->block_length - fp->block_offset;
+ if (available <= 0) break; // nothing more to read
+ }
+ copy_length = length - bytes_read < available? length - bytes_read : available;
+ buffer = fp->uncompressed_block;
+ memcpy(output, buffer + fp->block_offset, copy_length);
+ fp->block_offset += copy_length;
+ output += copy_length;
+ bytes_read += copy_length;
+ }
+ // block fully consumed: record where the next block starts and mark the buffer empty
+ if (fp->block_offset == fp->block_length) {
+ fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+ fp->block_offset = fp->block_length = 0;
+ }
+ return bytes_read;
+}
+
+/***** BEGIN: multi-threading *****/
+
+// State private to one compression worker.
+typedef struct {
+ BGZF *fp; // file handle; its compress_level is used when compressing
+ struct mtaux_t *mt; // shared state of all workers
+ void *buf; // scratch output buffer, BGZF_MAX_BLOCK_SIZE bytes (allocated in bgzf_mt)
+ int i, errcode, toproc; // worker index; error bits of the last round; set to 1 by mt_flush when there is work
+} worker_t;
+
+// State shared between the master thread and the workers.
+typedef struct mtaux_t {
+ int n_threads, n_blks, curr, done; // worker count; queue capacity; blocks currently queued; set to 1 by mt_destroy to stop workers
+ volatile int proc_cnt; // number of workers that finished the current round
+ void **blk; // queued blocks (raw before compression, compressed afterwards)
+ int *len; // length of each queued block
+ worker_t *w; // one entry per worker
+ pthread_t *tid; // thread ids; tid[0] is unused
+ pthread_mutex_t lock; // protects toproc/done together with cv
+ pthread_cond_t cv; // signalled when there is work or on shutdown
+} mtaux_t;
+
+/* One round of a compression worker: wait until work is flagged or shutdown
+ * is requested, then compress every n_threads-th queued block starting at
+ * the worker's own index, in place. Returns 1 when the worker should quit,
+ * 0 after a completed round. Compression failures are recorded in
+ * w->errcode. */
+static int worker_aux(worker_t *w)
+{
+    int i, stop = 0;
+    // wait for condition: to process or all done
+    pthread_mutex_lock(&w->mt->lock);
+    while (!w->toproc && !w->mt->done)
+        pthread_cond_wait(&w->mt->cv, &w->mt->lock);
+    if (w->mt->done) stop = 1;
+    w->toproc = 0;
+    pthread_mutex_unlock(&w->mt->lock);
+    if (stop) return 1; // to quit the thread
+    w->errcode = 0;
+    for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {
+        int clen = BGZF_MAX_BLOCK_SIZE;
+        if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0)
+            w->errcode |= BGZF_ERR_ZLIB;
+        memcpy(w->mt->blk[i], w->buf, clen);
+        w->mt->len[i] = clen;
+    }
+    // tell the master this worker is done (the previous count is not needed)
+    __sync_fetch_and_add(&w->mt->proc_cnt, 1);
+    return 0;
+}
+
+/* Thread entry point: run worker rounds until worker_aux() asks to stop. */
+static void *mt_worker(void *data)
+{
+    for (;;) {
+        if (worker_aux(data) != 0) break;
+    }
+    return 0;
+}
+
+/* Enable multi-threaded compression on a file opened for writing.
+ *   n_threads  - total number of workers including the calling thread (> 1)
+ *   n_sub_blks - blocks queued per worker before a flush is forced (> 0)
+ * Returns 0 on success and -1 when the file is not writable, threading is
+ * already enabled, or an argument is out of range. */
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+{
+    int i;
+    mtaux_t *mt;
+    pthread_attr_t attr;
+    // n_sub_blks <= 0 would give a queue with no room for a single block
+    if (!fp->is_write || fp->mt || n_threads <= 1 || n_sub_blks <= 0) return -1;
+    mt = calloc(1, sizeof(mtaux_t));
+    mt->n_threads = n_threads;
+    mt->n_blks = n_threads * n_sub_blks;
+    mt->len = calloc(mt->n_blks, sizeof(int));
+    mt->blk = calloc(mt->n_blks, sizeof(void*));
+    for (i = 0; i < mt->n_blks; ++i)
+        mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
+    mt->tid = calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
+    mt->w = calloc(mt->n_threads, sizeof(worker_t));
+    for (i = 0; i < mt->n_threads; ++i) {
+        mt->w[i].i = i;
+        mt->w[i].mt = mt;
+        mt->w[i].fp = fp;
+        mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
+    }
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    pthread_mutex_init(&mt->lock, 0);
+    pthread_cond_init(&mt->cv, 0);
+    for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
+        pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]);
+    pthread_attr_destroy(&attr); // the attribute object is no longer needed once the threads exist
+    fp->mt = mt;
+    return 0;
+}
+
+/* Stop and join all worker threads, then release everything bgzf_mt()
+ * allocated, including `mt` itself. */
+static void mt_destroy(mtaux_t *mt)
+{
+ int i;
+ // signal all workers to quit
+ pthread_mutex_lock(&mt->lock);
+ mt->done = 1; mt->proc_cnt = 0;
+ pthread_cond_broadcast(&mt->cv);
+ pthread_mutex_unlock(&mt->lock);
+ for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread
+ // free other data allocated on heap
+ for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+ for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+ free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+ // the synchronisation objects are destroyed only after every thread has been joined
+ pthread_cond_destroy(&mt->cv);
+ pthread_mutex_destroy(&mt->lock);
+ free(mt);
+}
+
+/* Move the pending uncompressed bytes of `fp` into the next free queue slot
+ * and reset the write buffer. The caller must ensure a slot is free. */
+static void mt_queue(BGZF *fp)
+{
+    mtaux_t *mt = (mtaux_t*)fp->mt;
+    int slot;
+    assert(mt->curr < mt->n_blks); // guaranteed by the caller
+    slot = mt->curr;
+    memcpy(mt->blk[slot], fp->uncompressed_block, fp->block_offset);
+    mt->len[slot] = fp->block_offset;
+    mt->curr = slot + 1;
+    fp->block_offset = 0;
+}
+
+/* Compress all queued blocks with every worker (the caller acts as worker 0)
+ * and write them to the file in queue order. Always returns 0; compression
+ * and write failures are reported through fp->errcode only. */
+static int mt_flush(BGZF *fp)
+{
+ int i;
+ mtaux_t *mt = (mtaux_t*)fp->mt;
+ if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
+ // signal all the workers to compress
+ pthread_mutex_lock(&mt->lock);
+ for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
+ mt->proc_cnt = 0;
+ pthread_cond_broadcast(&mt->cv);
+ pthread_mutex_unlock(&mt->lock);
+ // worker 0 is doing things here
+ worker_aux(&mt->w[0]);
+ // wait for all the threads to complete
+ // (busy-wait: each worker increments proc_cnt at the end of its round)
+ while (mt->proc_cnt < mt->n_threads);
+ // dump data to disk
+ for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
+ for (i = 0; i < mt->curr; ++i)
+ if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i])
+ fp->errcode |= BGZF_ERR_IO;
+ mt->curr = 0; // queue is empty again
+ return 0;
+}
+
+/* Queue the pending bytes and flush only once the queue is full. Returns
+ * the result of mt_flush() when a flush happened, otherwise -1. */
+static int mt_lazy_flush(BGZF *fp)
+{
+    mtaux_t *mt = (mtaux_t*)fp->mt;
+    if (fp->block_offset != 0) mt_queue(fp);
+    return (mt->curr == mt->n_blks) ? mt_flush(fp) : -1;
+}
+
+/* Multi-threaded counterpart of bgzf_write(): append `length` bytes to the
+ * write buffer, handing each full block to mt_lazy_flush(). Returns the
+ * number of bytes consumed. */
+static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length)
+{
+    const uint8_t *src = data;
+    ssize_t left;
+    for (left = length; left != 0; ) {
+        int room = BGZF_BLOCK_SIZE - fp->block_offset;
+        int chunk = room < left ? room : left;
+        memcpy(fp->uncompressed_block + fp->block_offset, src, chunk);
+        fp->block_offset += chunk;
+        src += chunk;
+        left -= chunk;
+        if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp);
+    }
+    return length - left;
+}
+
+/***** END: multi-threading *****/
+
+/* Compress and write out whatever is pending in the write buffer.
+ * Returns 0 on success (or when the file is read-only) and -1 on a
+ * compression or write error. With threading enabled the work is delegated
+ * to mt_flush(). */
+int bgzf_flush(BGZF *fp)
+{
+ if (!fp->is_write) return 0;
+ if (fp->mt) return mt_flush(fp);
+ // NOTE(review): the loop relies on deflate_block() lowering fp->block_offset - confirm in its definition
+ while (fp->block_offset > 0) {
+ int block_length;
+ block_length = deflate_block(fp, fp->block_offset);
+ if (block_length < 0) return -1;
+ if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
+ fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+ return -1;
+ }
+ fp->block_address += block_length;
+ }
+ return 0;
+}
+
+/* Flush only if appending `size` more bytes would overflow the current
+ * block. Returns -1 when no flush was needed, otherwise the result of the
+ * flush routine that was used. */
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+    if (fp->block_offset + size <= BGZF_BLOCK_SIZE) return -1;
+    return fp->mt ? mt_lazy_flush(fp) : bgzf_flush(fp);
+}
+
+/* Append `length` bytes from `data` to a file opened for writing, flushing
+ * each time a block fills up. Returns the number of bytes accepted, which is
+ * smaller than `length` if a flush failed. */
+ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
+{
+    const uint8_t *input = data;
+    const int block_length = BGZF_BLOCK_SIZE;
+    ssize_t bytes_written = 0; // same width as `length`, so large writes cannot overflow the counter
+    assert(fp->is_write);
+    if (fp->mt) return mt_write(fp, data, length);
+    while (bytes_written < length) {
+        uint8_t* buffer = fp->uncompressed_block;
+        ssize_t remaining = length - bytes_written;
+        int room = block_length - fp->block_offset;
+        int copy_length = room < remaining ? room : (int)remaining;
+        memcpy(buffer + fp->block_offset, input, copy_length);
+        fp->block_offset += copy_length;
+        input += copy_length;
+        bytes_written += copy_length;
+        if (fp->block_offset == block_length && bgzf_flush(fp)) break;
+    }
+    return bytes_written;
+}
+
+/* Close a BGZF handle and free its buffers. For a file opened for writing,
+ * pending data is flushed and a final empty block is appended first.
+ * Returns 0 on success and -1 on failure (including fp == NULL). As before,
+ * the handle is not freed when an error is returned. */
+int bgzf_close(BGZF* fp)
+{
+    int ret, block_length;
+    if (fp == 0) return -1;
+    if (fp->is_write) {
+        if (bgzf_flush(fp) != 0) return -1;
+        fp->compress_level = -1;
+        block_length = deflate_block(fp, 0); // write an empty block
+        if (block_length < 0) return -1; // a negative length must not reach fwrite
+        if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != (size_t)block_length) {
+            fp->errcode |= BGZF_ERR_IO; // the end-of-file marker block is missing
+            return -1;
+        }
+        if (fflush(fp->fp) != 0) {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        if (fp->mt) mt_destroy(fp->mt);
+    }
+    ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp);
+    if (ret != 0) return -1;
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free_cache(fp);
+    free(fp);
+    return 0;
+}
+
+/* Record the cache size limit on `fp`; a NULL handle is ignored. */
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+    if (fp == 0) return;
+    fp->cache_size = cache_size;
+}
+
+/* Check whether the file ends with the 28-byte empty block used as the
+ * end-of-file marker. The file position is restored afterwards.
+ * Returns 1 when the marker is present and 0 when it is absent, when the
+ * file cannot be searched, or when fewer than 28 bytes could be read. */
+int bgzf_check_EOF(BGZF *fp)
+{
+    static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
+    uint8_t buf[28];
+    off_t offset;
+    int n;
+    offset = _bgzf_tell((_bgzf_file_t)fp->fp);
+    if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
+    n = _bgzf_read(fp->fp, buf, 28);
+    _bgzf_seek(fp->fp, offset, SEEK_SET);
+    if (n != 28) return 0; // short read: buf is not fully initialised
+    return (memcmp(magic, buf, 28) == 0)? 1 : 0;
+}
+
+/* Position a read-only handle at virtual offset `pos`: the upper bits
+ * (pos >> 16) are the file offset of a block, the low 16 bits the offset
+ * inside the uncompressed block. Only SEEK_SET is supported.
+ * Returns 0 on success and -1 on misuse or seek failure (see fp->errcode). */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    int within_block;
+    int64_t file_offset;
+
+    if (where != SEEK_SET || fp->is_write) {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+    file_offset = pos >> 16;
+    within_block = pos & 0xFFFF;
+    if (_bgzf_seek(fp->fp, file_offset, SEEK_SET) < 0) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_address = file_offset;
+    fp->block_offset = within_block;
+    fp->block_length = 0; // indicates current block has not been loaded
+    return 0;
+}
+
+/* Return 1 when the first 16 bytes of file `fn` equal g_magic, 0 otherwise
+ * (including when the file cannot be opened or is shorter than 16 bytes). */
+int bgzf_is_bgzf(const char *fn)
+{
+    uint8_t head[16];
+    int got;
+    _bgzf_file_t in;
+    in = _bgzf_open(fn, "r");
+    if (in == 0) return 0;
+    got = _bgzf_read(in, head, 16);
+    _bgzf_close(in);
+    if (got != 16) return 0;
+    if (memcmp(g_magic, head, 16) != 0) return 0;
+    return 1;
+}
+
+/* Read one uncompressed byte. Returns the byte (0..255), -1 at end of file,
+ * or -2 when loading the next block fails. */
+int bgzf_getc(BGZF *fp)
+{
+ int c;
+ if (fp->block_offset >= fp->block_length) {
+ if (bgzf_read_block(fp) != 0) return -2; /* error */
+ if (fp->block_length == 0) return -1; /* end-of-file */
+ }
+ c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+ // block used up: remember where the next block starts and mark the buffer empty
+ if (fp->block_offset == fp->block_length) {
+ fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ }
+ return c;
+}
+
+/* Round x up in place to the next power of two by smearing the highest set
+ * bit of (x - 1) into all lower bits (shifts up to 16 cover 32 bits). x must
+ * be a modifiable lvalue; it is evaluated several times. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/* Read bytes up to, but not including, the next `delim` into `str`, growing
+ * the string as needed. The text is NUL-terminated and the delimiter is
+ * consumed. Returns the line length, or, when nothing was read, -1 at end of
+ * file and -2 on a block read error. */
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+ int l, state = 0; // state: 0 = keep reading, 1 = delimiter found, -1 = EOF, -2 = error
+ unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+ str->l = 0;
+ do {
+ if (fp->block_offset >= fp->block_length) {
+ if (bgzf_read_block(fp) != 0) { state = -2; break; }
+ if (fp->block_length == 0) { state = -1; break; }
+ }
+ // scan the current block for the delimiter
+ for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+ if (l < fp->block_length) state = 1;
+ l -= fp->block_offset; // number of bytes to append
+ if (str->l + l + 1 >= str->m) {
+ str->m = str->l + l + 2;
+ kroundup32(str->m);
+ // NOTE(review): the realloc result is not checked for NULL
+ str->s = (char*)realloc(str->s, str->m);
+ }
+ memcpy(str->s + str->l, buf + fp->block_offset, l);
+ str->l += l;
+ fp->block_offset += l + 1; // +1 steps over the delimiter
+ if (fp->block_offset >= fp->block_length) {
+ fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ }
+ } while (state == 0);
+ if (str->l == 0 && state < 0) return state;
+ str->s[str->l] = 0;
+ return str->l;
+}
diff --git a/determine-phred b/determine-phred
new file mode 100755
index 0000000..ac8d39f
--- /dev/null
+++ b/determine-phred
@@ -0,0 +1,86 @@
+#!/usr/bin/perl
+use strict;
+
+my $ssiz=7000; # sample size
+
+if (@ARGV && $ARGV[0] =~ /^-[h?]/) {
+ print "Usage: determine-phred FILE
+
+Reads a sam, fastq or pileup, possibly gzipped and returns the phred-scale,
+ either 64 or 33, based on a quick scan of the data in the file.
+";
+ exit 0;
+}
+my $cnt;
+my $dphred = 64;
+my $qual;
+my $comm;
+my $fmt;
+
+# Map a file name to what <> should open: *.gz files are read through a
+# gunzip pipe, everything else is opened as is.
+sub input_source {
+    my ($f) = @_;
+    return $f =~ /\.gz$/ ? "gunzip -c '$f'|" : $f;
+}
+
+if (@ARGV > 1) {
+    my @mult = @ARGV;
+    for my $f (@mult) {
+        # every file starts from a clean state
+        $cnt = 0;
+        $dphred = 64;
+        # drop the handle of the previous file; otherwise <> would go on
+        # reading its remaining lines instead of opening $f
+        close(ARGV);
+        @ARGV = (input_source($f));
+        determine();
+        print "$f\t$dphred\n"; # report the name as given, not the pipe command
+    }
+} else {
+    @ARGV = (input_source($ARGV[0])) if @ARGV;
+    determine();
+    print "$dphred\n";
+}
+
+# Scan up to $ssiz records from the input opened through <> and set $dphred
+# to 33 when a quality character below 64 is seen in the second half of a
+# quality string (or when fewer than nine records were scanned); otherwise
+# leave it at 64. Dies with "Unknown file format" when no quality string can
+# be located.
+sub determine {
+    # start clean so that a second call does not inherit earlier results
+    $dphred = 64;
+    $cnt = 0;
+    $_ = <>;
+    # skip SAM header lines so that the first alignment decides the format
+    while (defined($_) && /^\@(?:HD|SQ|RG|PG|CO)\t/) {
+        $_ = <>;
+    }
+    die "Unknown file format\n" if !defined $_;
+    chomp; # keeps a trailing newline out of the last field
+    if (/^\@/) {
+        # fastq
+        scalar <>; # read
+        $comm = scalar <>; # comment
+        if (!(substr($comm,0,1) eq '+')) {
+            die "Unknown file format\n";
+        }
+        $qual = <>;
+        chomp $qual;
+        $fmt = 'fq';
+    } elsif (/^\S+\t\d+\t[ACTGN]\t\d+\t\S+\t(\S+)$/i) {
+        $qual = $1;
+        $fmt = 'pileup';
+    } else {
+        # sam
+        $fmt = 'sam';
+        $qual = (split(/\t/, $_))[10];
+    }
+    if (!$qual) {
+        die "Unknown file format\n";
+    }
+    my $rc = 1;
+    while($qual) {
+        ++$rc;
+        for (my $i =length($qual)/2; $i < length($qual); ++$i) {
+            if (ord(substr($qual,$i,1)) < 64) {
+                $dphred = 33;
+                $cnt=$ssiz; # last
+                last;
+            }
+        }
+        $qual = '';
+        last if ++$cnt >= $ssiz; # got enough
+        if ($fmt eq 'fq') {
+            # fastq
+            last if ! scalar <>; # id
+            last if ! scalar <>; # read
+            last if ! scalar <>; # comment
+            $qual = <>;
+            chomp $qual;
+        } else {
+            # pileup / sam: fetch the NEXT record before splitting; the
+            # old code kept re-reading the first line
+            $_ = <>;
+            last if !defined $_;
+            chomp;
+            $qual = (split(/\t/, $_))[$fmt eq 'pileup' ? 5 : 10];
+        }
+    }
+    if ($rc < 10) {
+        $dphred = 33;
+    }
+}
diff --git a/ea-utils.spec b/ea-utils.spec
new file mode 100644
index 0000000..633c489
--- /dev/null
+++ b/ea-utils.spec
@@ -0,0 +1,37 @@
+%define name ea-utils
+%define ver 1.1.2
+%define rel 686
+
+Summary: fastq-processing utilities
+Name: %{name}
+Version: %{ver}
+Release: %{rel}
+Source: %{name}.tar.gz
+Prefix: /usr
+BuildRoot: /tmp/%{name}-%{ver}-root
+Vendor: Expression Analysis <earonesty at expressionanalysis.com>
+URL: https://code.google.com/p/ea-utils/
+License: MIT
+Group: Applications/Engineering
+Distribution: Centos 5
+Packager: Erik Aronesty <earonesty at expressionanalysis.com>
+
+%description
+Utilities for processing fastq files, stitching paired-end reads,
+demultiplexing paired-end in-sync, adapter-trimming & skew removal.
+
+%prep
+%setup -c
+
+%install
+make PREFIX=%{buildroot}/%{_prefix} install
+
+%clean
+rm -rf %{buildroot}
+
+%files
+
+%{_bindir}/fastq-join
+%{_bindir}/fastq-clipper
+%{_bindir}/fastq-mcf
+%{_bindir}/fastq-multx
diff --git a/faidx.c b/faidx.c
new file mode 100644
index 0000000..51c82ac
--- /dev/null
+++ b/faidx.c
@@ -0,0 +1,437 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "faidx.h"
+#include "khash.h"
+
+/* Index record of one sequence; the hash below maps sequence name to record. */
+typedef struct {
+ int32_t line_len, line_blen; // bytes per line including the line terminator; printable characters per line
+ int64_t len; // total number of printable characters in the sequence
+ uint64_t offset; // file offset of the first sequence line
+} faidx1_t;
+KHASH_MAP_INIT_STR(s, faidx1_t)
+
+#ifndef _NO_RAZF
+#include "razf.h"
+#else
+#ifdef _WIN32
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
+#define RAZF FILE
+#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
+#define razf_open(fn, mode) fopen(fn, mode)
+#define razf_close(fp) fclose(fp)
+#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define razf_tell(fp) ftello(fp)
+#endif
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+/* In-memory FASTA index. */
+struct __faidx_t {
+ RAZF *rz; // open sequence file, or NULL (set by fai_load)
+ int n, m; // number of names stored; allocated capacity of `name`
+ char **name; // sequence names in insertion order (strdup'ed, also used as hash keys)
+ khash_t(s) *hash; // name -> faidx1_t
+};
+
+/* Round x up in place to the next power of two (32-bit values); x must be a
+ * modifiable lvalue and is evaluated several times. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/* Append one sequence to the index: the name is duplicated and stored in
+ * insertion order, and its record is stored in the hash.
+ * `len` is 64-bit like faidx1_t.len, so long sequences are not truncated. */
+static inline void fai_insert_index(faidx_t *idx, const char *name, int64_t len, int line_len, int line_blen, uint64_t offset)
+{
+    khint_t k;
+    int ret;
+    faidx1_t t;
+    if (idx->n == idx->m) { // name array full: double it (16 entries to start with)
+        idx->m = idx->m? idx->m<<1 : 16;
+        idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
+    }
+    idx->name[idx->n] = strdup(name);
+    k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
+    t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
+    kh_value(idx->hash, k) = t;
+    ++idx->n;
+}
+
+/* Scan a FASTA stream byte by byte and build its index. Returns the new
+ * index, or 0 (after printing a message) when the last header has no
+ * sequence or the line layout inside a sequence is inconsistent.
+ * `state` tracks the layout: 1 = header just read, first sequence line
+ * pending; 0 = inside a sequence; 2 = a line differing from the first line's
+ * lengths was seen; 3 = another line started after such a line.
+ * NOTE(review): for an input without any '>' header `name` is still NULL at
+ * the final fai_insert_index() call - confirm how strdup(NULL) behaves. */
+faidx_t *fai_build_core(RAZF *rz)
+{
+ char c, *name;
+ int l_name, m_name, ret;
+ int line_len, line_blen, state;
+ int l1, l2;
+ faidx_t *idx;
+ uint64_t offset;
+ int64_t len;
+
+ idx = (faidx_t*)calloc(1, sizeof(faidx_t));
+ idx->hash = kh_init(s);
+ name = 0; l_name = m_name = 0;
+ len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
+ while (razf_read(rz, &c, 1)) {
+ if (c == '\n') { // an empty line
+ if (state == 1) {
+ // blank line right after the header: the sequence starts later
+ offset = razf_tell(rz);
+ continue;
+ } else if ((state == 0 && len < 0) || state == 2) continue;
+ }
+ if (c == '>') { // fasta header
+ // flush the previous sequence, if any
+ if (len >= 0)
+ fai_insert_index(idx, name, len, line_len, line_blen, offset);
+ l_name = 0;
+ // the name is everything up to the first whitespace
+ while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
+ if (m_name < l_name + 2) {
+ m_name = l_name + 2;
+ kroundup32(m_name);
+ name = (char*)realloc(name, m_name);
+ }
+ name[l_name++] = c;
+ }
+ name[l_name] = '\0';
+ if (ret == 0) {
+ fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
+ free(name); fai_destroy(idx);
+ return 0;
+ }
+ // skip the rest of the header line
+ if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
+ state = 1; len = 0;
+ offset = razf_tell(rz);
+ } else {
+ if (state == 3) {
+ fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
+ free(name); fai_destroy(idx);
+ return 0;
+ }
+ if (state == 2) state = 3;
+ // l1 counts every byte of the line, l2 only printable ones
+ l1 = l2 = 0;
+ do {
+ ++l1;
+ if (isgraph(c)) ++l2;
+ } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
+ if (state == 3 && l2) {
+ fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
+ free(name); fai_destroy(idx);
+ return 0;
+ }
+ ++l1; len += l2; // ++l1 accounts for the line terminator
+ if (state == 1) line_len = l1, line_blen = l2, state = 0;
+ else if (state == 0) {
+ if (l1 != line_len || l2 != line_blen) state = 2;
+ }
+ }
+ }
+ // flush the final sequence
+ fai_insert_index(idx, name, len, line_len, line_blen, offset);
+ free(name);
+ return idx;
+}
+
+/* Write the index to `fp`, one line per sequence in insertion order:
+ * name, length, offset, printable characters per line, bytes per line
+ * (tab-separated). */
+void fai_save(const faidx_t *fai, FILE *fp)
+{
+    khint_t k;
+    int i;
+    for (i = 0; i < fai->n; ++i) {
+        faidx1_t x;
+        k = kh_get(s, fai->hash, fai->name[i]);
+        x = kh_value(fai->hash, k);
+#ifdef _WIN32
+        fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
+#else
+        // print the 64-bit length in full rather than through an int cast
+        fprintf(fp, "%s\t%lld\t%lld\t%d\t%d\n", fai->name[i], (long long)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
+#endif
+    }
+}
+
+/* Parse an index written by fai_save() from `fp` and return it as a new
+ * faidx_t (with no sequence file attached). Lines that do not carry a name
+ * followed by four numeric fields are reported on stderr and skipped. */
+faidx_t *fai_read(FILE *fp)
+{
+    faidx_t *fai;
+    char *buf, *p;
+    int line_len, line_blen, n_fields;
+#ifdef _WIN32
+    int len;
+    long offset;
+#else
+    long long len, offset;
+#endif
+    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
+    fai->hash = kh_init(s);
+    buf = (char*)calloc(0x10000, 1);
+    while (fgets(buf, 0x10000, fp)) {
+        // the name ends at the first non-printable character
+        for (p = buf; *p && isgraph((unsigned char)*p); ++p);
+        if (*p == 0) { // no separator at all: do not step past the terminator
+            fprintf(stderr, "[fai_read] skipping malformed index line '%s'\n", buf);
+            continue;
+        }
+        *p = 0; ++p;
+#ifdef _WIN32
+        n_fields = sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
+#else
+        n_fields = sscanf(p, "%lld%lld%d%d", &len, &offset, &line_blen, &line_len);
+#endif
+        if (n_fields != 4) { // otherwise uninitialised values would be stored
+            fprintf(stderr, "[fai_read] skipping malformed index line '%s'\n", buf);
+            continue;
+        }
+        fai_insert_index(fai, buf, len, line_len, line_blen, offset);
+    }
+    free(buf);
+    return fai;
+}
+
+/* Free an index: all stored names, the name array, the hash, the attached
+ * sequence file (if any) and the index object itself. */
+void fai_destroy(faidx_t *fai)
+{
+    int idx = 0;
+    while (idx < fai->n) {
+        free(fai->name[idx]);
+        ++idx;
+    }
+    free(fai->name);
+    kh_destroy(s, fai->hash);
+    if (fai->rz != 0) razf_close(fai->rz);
+    free(fai);
+}
+
+/* Build the index of FASTA file `fn` and write it to "<fn>.fai".
+ * Returns 0 on success and -1 when the FASTA file cannot be opened, cannot
+ * be indexed, or the index file cannot be written. */
+int fai_build(const char *fn)
+{
+    char *str;
+    RAZF *rz;
+    FILE *fp;
+    faidx_t *fai;
+    str = (char*)calloc(strlen(fn) + 5, 1); // room for ".fai" and the terminator
+    sprintf(str, "%s.fai", fn);
+    rz = razf_open(fn, "r");
+    if (rz == 0) {
+        fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
+        free(str);
+        return -1;
+    }
+    fai = fai_build_core(rz);
+    razf_close(rz);
+    if (fai == 0) { // fai_build_core already reported the reason
+        free(str);
+        return -1;
+    }
+    fp = fopen(str, "wb");
+    if (fp == 0) {
+        fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
+        fai_destroy(fai); free(str);
+        return -1;
+    }
+    fai_save(fai, fp);
+    fclose(fp);
+    free(str);
+    fai_destroy(fai);
+    return 0;
+}
+
+#ifdef _USE_KNETFILE
+/* Open the file named by the last path component of URL `fn` from the
+ * working directory; if it does not exist there, download the URL into it
+ * first. Returns a stream opened for reading, or NULL on failure. */
+FILE *download_and_open(const char *fn)
+{
+    const int buf_size = 1 * 1024 * 1024;
+    uint8_t *buf;
+    FILE *fp;
+    knetFile *fp_remote;
+    const char *url = fn;
+    const char *p;
+    int l = strlen(fn);
+    for (p = fn + l - 1; p >= fn; --p)
+        if (*p == '/') break;
+    fn = p + 1;
+
+    // First try to open a local copy
+    fp = fopen(fn, "r");
+    if (fp)
+        return fp;
+
+    // If failed, download from remote and open
+    fp_remote = knet_open(url, "rb");
+    if (fp_remote == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
+        return NULL;
+    }
+    if ((fp = fopen(fn, "wb")) == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
+        knet_close(fp_remote);
+        return NULL;
+    }
+    buf = (uint8_t*)calloc(buf_size, 1);
+    if (buf == 0) {
+        fprintf(stderr, "[download_from_remote] out of memory\n");
+        fclose(fp);
+        knet_close(fp_remote);
+        return NULL;
+    }
+    // stop on a negative count as well: it must not reach fwrite as a size
+    while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+        fwrite(buf, 1, l, fp);
+    free(buf);
+    fclose(fp);
+    knet_close(fp_remote);
+
+    return fopen(fn, "r");
+}
+#endif
+
+/* Load the index "<fn>.fai", building it first when it does not exist, and
+ * attach the opened sequence file `fn`. Returns the index, or 0 on failure
+ * (a message is printed on stderr). */
+faidx_t *fai_load(const char *fn)
+{
+    char *str;
+    FILE *fp;
+    faidx_t *fai;
+    str = (char*)calloc(strlen(fn) + 5, 1);
+    sprintf(str, "%s.fai", fn);
+
+#ifdef _USE_KNETFILE
+    if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
+    {
+        fp = download_and_open(str);
+        if ( !fp )
+        {
+            fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
+            free(str);
+            return 0;
+        }
+    }
+    else
+#endif
+        fp = fopen(str, "rb");
+    if (fp == 0) {
+        fprintf(stderr, "[fai_load] build FASTA index.\n");
+        fai_build(fn);
+        fp = fopen(str, "rb");
+        if (fp == 0) {
+            fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
+            free(str);
+            return 0;
+        }
+    }
+
+    fai = fai_read(fp);
+    fclose(fp);
+
+    fai->rz = razf_open(fn, "rb");
+    free(str);
+    if (fai->rz == 0) {
+        fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
+        fai_destroy(fai); // do not leak the index that was just read
+        return 0;
+    }
+    return fai;
+}
+
+/* Fetch the sequence of region `str`, written as "name", "name:beg" or
+ * "name:beg-end" (1-based; commas and whitespace are ignored). Returns a
+ * newly allocated NUL-terminated string and stores its length in *len; the
+ * caller frees it. When the sequence name is unknown, 0 is returned and
+ * *len is set to 0. */
+char *fai_fetch(const faidx_t *fai, const char *str, int *len)
+{
+    char *s, c;
+    int i, l, k, name_end;
+    khiter_t iter;
+    faidx1_t val;
+    khash_t(s) *h;
+    int beg, end;
+
+    beg = end = -1;
+    h = fai->hash;
+    name_end = l = strlen(str);
+    s = (char*)malloc(l+1);
+    // remove space
+    for (i = k = 0; i < l; ++i)
+        if (!isspace(str[i])) s[k++] = str[i];
+    s[k] = 0; l = k;
+    // determine the sequence name
+    for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+    if (i >= 0) name_end = i;
+    if (name_end < l) { // check if this is really the end
+        int n_hyphen = 0;
+        for (i = name_end + 1; i < l; ++i) {
+            if (s[i] == '-') ++n_hyphen;
+            else if (!isdigit(s[i]) && s[i] != ',') break;
+        }
+        if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
+        s[name_end] = 0;
+        iter = kh_get(s, h, s);
+        if (iter == kh_end(h)) { // cannot find the sequence name
+            iter = kh_get(s, h, str); // try str as the name
+            if (iter == kh_end(h)) {
+                *len = 0;
+                free(s); return 0;
+            } else s[name_end] = ':', name_end = l;
+        }
+    } else iter = kh_get(s, h, str);
+    if(iter == kh_end(h)) {
+        fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
+        *len = 0; // callers read *len even when 0 is returned
+        free(s);
+        return 0;
+    };
+    val = kh_value(h, iter);
+    // parse the interval
+    if (name_end < l) {
+        for (i = k = name_end + 1; i < l; ++i)
+            if (s[i] != ',') s[k++] = s[i];
+        s[k] = 0;
+        beg = atoi(s + name_end + 1);
+        for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+        end = i < k? atoi(s + i + 1) : val.len;
+        if (beg > 0) --beg; // to 0-based
+    } else beg = 0, end = val.len;
+    if (beg >= val.len) beg = val.len;
+    if (end >= val.len) end = val.len;
+    if (beg > end) beg = end;
+    free(s);
+
+    // now retrieve the sequence
+    l = 0;
+    s = (char*)malloc(end - beg + 2);
+    razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
+    while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
+        if (isgraph(c)) s[l++] = c;
+    s[l] = '\0';
+    *len = l;
+    return s;
+}
+
+/* Command-line driver: with one argument build the index of the FASTA file;
+ * with more, print each requested region in FASTA format, 60 characters per
+ * line. Returns 0 on success and 1 on usage error or failure. */
+int faidx_main(int argc, char *argv[])
+{
+    if (argc == 1) {
+        fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
+        return 1;
+    } else {
+        if (argc == 2) {
+            if (fai_build(argv[1]) != 0) return 1; // report a failed build to the caller
+        } else {
+            int i, j, k, l;
+            char *s;
+            faidx_t *fai;
+            fai = fai_load(argv[1]);
+            if (fai == 0) return 1;
+            for (i = 2; i != argc; ++i) {
+                printf(">%s\n", argv[i]);
+                l = 0; // stays 0 if the fetch fails without setting it
+                s = fai_fetch(fai, argv[i], &l);
+                if (s == 0) l = 0;
+                for (j = 0; j < l; j += 60) {
+                    for (k = 0; k < 60 && k < l - j; ++k)
+                        putchar(s[j + k]);
+                    putchar('\n');
+                }
+                free(s);
+            }
+            fai_destroy(fai);
+        }
+    }
+    return 0;
+}
+
+/* Number of sequences stored in the index. */
+int faidx_fetch_nseq(const faidx_t *fai)
+{
+    const int n_seq = fai->n;
+    return n_seq;
+}
+
+/* Fetch sequence `c_name` between 0-based inclusive positions p_beg_i and
+ * p_end_i, clamped to the sequence. Returns a newly allocated NUL-terminated
+ * string with its length in *len; the caller frees it. For an unknown name
+ * 0 is returned and *len is set to 0. */
+char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
+{
+    int l;
+    char c;
+    khiter_t iter;
+    faidx1_t val;
+    char *seq=NULL;
+
+    *len = 0; // defined result on every early return
+    // Adjust position
+    iter = kh_get(s, fai->hash, c_name);
+    if(iter == kh_end(fai->hash)) return 0;
+    val = kh_value(fai->hash, iter);
+    if (val.len <= 0) {
+        // empty sequence: clamping would produce position -1 and read
+        // bytes that belong to a neighbouring record
+        return (char*)calloc(1, 1);
+    }
+    if(p_end_i < p_beg_i) p_beg_i = p_end_i;
+    if(p_beg_i < 0) p_beg_i = 0;
+    else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
+    if(p_end_i < 0) p_end_i = 0;
+    else if(val.len <= p_end_i) p_end_i = val.len - 1;
+
+    // Now retrieve the sequence
+    l = 0;
+    seq = (char*)malloc(p_end_i - p_beg_i + 2);
+    razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
+    while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
+        if (isgraph(c)) seq[l++] = c;
+    seq[l] = '\0';
+    *len = l;
+    return seq;
+}
+
+/* Stand-alone entry point, compiled only when FAIDX_MAIN is defined. */
+#ifdef FAIDX_MAIN
+int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
+#endif
diff --git a/fastq-clipper.c b/fastq-clipper.c
new file mode 100644
index 0000000..842c464
--- /dev/null
+++ b/fastq-clipper.c
@@ -0,0 +1,279 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+
+Replaced, largely, by fastq-mcf.
+
+See "void usage" below for usage.
+
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+
+#include "fastq-lib.h"
+
+#define MAX_ADAPTER_NUM 20
+#define MAX_ADAPTER_LEN 160
+
+void usage(FILE *f);
+int hd(char *a, char *b, int n);
+int debug=0;
+int main (int argc, char **argv) {
+ char c;
+ bool eol;
+ int nmin = 4, nkeep = 15, xmax=-1, pctdiff = 20;
+ char *outfile = NULL;
+
+ int i;
+
+ char *a = NULL, *f = NULL;
+ while ( (c = getopt (argc, argv, "-hedbp:i:o:l:m:x::")) != -1) {
+ switch (c) {
+ case '\1':
+ if (!f)
+ f=optarg;
+ else if (!a)
+ a=optarg;
+ else {
+ usage(stderr); return 1;
+ }
+ break;
+ case 'm': nmin = atoi(optarg); break;
+ case 'p': pctdiff = atoi(optarg); break;
+ case 'l': nkeep = atoi(optarg); break;
+ case 'e': eol = 1; break;
+ case 'h': usage(stdout); return 1;
+ case 'b': eol = 0; break;
+ case 'd': debug = 1; break;
+ case 'x': xmax = optarg ? atoi(optarg) : -1; break;
+ case 'o': outfile = optarg; break;
+ case 'i': f = optarg; break;
+ case '?':
+ if (strchr("lm", optopt))
+ fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+ else if (isprint(optopt))
+ fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+ else
+ fprintf (stderr,
+ "Unknown option character `\\x%x'.\n",
+ optopt);
+ usage(stderr);
+ return 1;
+ }
+ }
+
+ if (argc < 3 || !a || !f) {
+ usage(stderr);
+ return 1;
+ }
+
+ FILE *fin = strcmp(f,"-") ? fopen(f, "r") : stdin;
+ if (!fin) {
+ fprintf(stderr, "Error opening file '%s': %s\n",f, strerror(errno));
+ return 1;
+ }
+
+ FILE *fout = stdout;
+ FILE *fstat = stderr;
+ if (outfile ) {
+ fout = fopen(outfile, "w");
+ if (!fout) {
+ fprintf(stderr, "Error opening output file '%s': %s",outfile, strerror(errno));
+ return 1;
+ }
+ fstat = stdout;
+ }
+
+ char *adapters[MAX_ADAPTER_NUM+1];
+ int adapter_len[MAX_ADAPTER_NUM+1];
+ char *p;
+ int adapter_count=0;
+ while (p=strtok(a,":")) {
+ a = NULL; // strtok requirement
+ adapters[adapter_count] = p;
+ adapter_len[adapter_count] = strlen(p); // append to list
+ ++adapter_count;
+ if (adapter_count >= MAX_ADAPTER_NUM) {
+ break;
+ }
+ }
+
+ char *s[4] = {0,0,0,0}; // id, sequence, comment, quality
+ size_t na[4] = {0,0,0,0}; // lengths of above
+ int ns[4] = {0,0,0,0}; // lengths of above
+ int nrec=0;
+ int nerr=0;
+ int nok=0;
+ int ntooshort=0;
+ int ntrim=0;
+ int nbtrim=0;
+ while (1) {
+ int i;
+ for (i = 0; i < 4; ++i ) {
+ ns[i] = getline(&s[i], &na[i], fin);
+ }
+
+ if (ns[1] <= 0) {
+ break;
+ }
+
+ ++nrec;
+
+ // skip malformed records
+ if (ns[1] != ns[3] || s[0][0] != '@' || s[2][0] != '+') {
+ if (nerr < 10) {
+ fprintf(stderr, "Malformed fastq record at line %d\n", nrec*4-3);
+ }
+ ++nerr;
+ continue;
+ }
+
+ // chomp
+ s[1][ns[1]-1]='\0';
+ --ns[1];
+ s[3][ns[3]-1]='\0';
+ --ns[3];
+
+ if (debug) fprintf(stderr, "seq: %s %d\n", s[1], ns[1]);
+
+ bool skip = 0;
+ int bestscore = 999, bestoff = 0, bestlen = 0;
+
+ for (i =0; i < adapter_count; ++i) {
+ int nmatch = nmin;
+ if (!nmatch) nmatch = adapter_len[i]; // full match required if nmin == 0
+
+ // how far in to search for a match?
+ int mx = adapter_len[i];
+ if (xmax) {
+ mx = ns[1];
+ if (xmax > 0 && (xmax+adapter_len[i]) < mx)
+ mx = xmax+adapter_len[i]; // xmax is added to adapter length
+ }
+
+ if (debug)
+ fprintf(stderr, "adapter: %s, adlen: %d, nmatch: %d, mx: %d\n", adapters[i], adapter_len[i], nmatch, mx);
+
+ int off;
+ for (off = nmatch; off <= mx; ++off) { // off is distance from tail of sequence
+ char *seqtail = s[1]+ns[1]-off; // search at tail
+ int ncmp = off<adapter_len[i] ? off : adapter_len[i];
+ int mind = (pctdiff * ncmp) / 100;
+ int d = hd(adapters[i],seqtail,ncmp); // # differences
+ if (debug)
+ fprintf(stderr, "tail: %s, bestoff: %d, off: %d, ncmp: %d, mind: %d, hd %d\n", seqtail, bestoff, off, ncmp, mind, d);
+ // calc squared distance over length score
+ if (d <= mind) {
+ int score = (d*d+1)/ncmp;
+ if (score <= bestscore) { // better score?
+ bestscore = score; // save max score
+ bestoff = off; // offset at max
+ bestlen = ncmp; // cmp length at max
+ }
+ if (d == 0 && (ncmp == adapter_len[i])) {
+ break;
+ }
+ }
+ }
+
+ // assure time wasn't wasted running a comparison that couldn't matter
+ assert((bestlen == 0) || (bestlen >= nmatch));
+
+ if (bestoff > 0) {
+ if ( (ns[1]-bestoff) < nkeep) {
+ ++ntooshort;
+ skip = 1;
+ break;
+ }
+ }
+ }
+
+ if (!skip) {
+ if (bestoff > 0) {
+ ++ntrim;
+ s[1][ns[1]-bestoff]='\0';
+ s[3][ns[1]-bestoff]='\0';
+ }
+ fputs(s[0],fout);
+ fputs(s[1],fout);
+ fputc('\n',fout);
+ fputs(s[2],fout);
+ fputs(s[3],fout);
+ fputc('\n',fout);
+ }
+ }
+ fprintf(fstat, "Total: %d\n", nrec);
+ fprintf(fstat, "Too Short: %d\n", ntooshort);
+ fprintf(fstat, "Trimmed: %d\n", ntrim);
+ fprintf(fstat, "Errors: %d\n", nerr);
+ return 0;
+}
+
+/* Print the command-line help to `f`. The defaults shown match the initial
+ * values used in main() (pctdiff 20, nmin 4, nkeep 15). */
+void usage(FILE *f) {
+    fprintf(f,
+"usage: fastq-clipper [options] <fastq-file> <adapters>\n"
+"\n"
+"Removes one or more adapter sequences from the fastq file.\n"
+"Adapter sequences are colon-delimited.\n"
+"Stats go to stderr, unless -o is specified.\n"
+"\n"
+"Options:\n"
+" -h This help\n"
+" -o FIL Output file (stats to stdout)\n"
+" -p N Maximum difference percentage (20)\n"
+" -m N Minimum clip length (4)\n"
+" -l N Minimum remaining sequence length (15)\n"
+" -x [N] Extra match length past adapter length, \n"
+" N =-1 : search all\n"
+" N = 0 : search only up to adapter length\n"
+" -e End-of-line (default)\n"
+" -b Beginning-of-line (not supported yet)\n"
+    );
+}
+
+/*
+#!/usr/bin/perl
+
+my ($f, $a) = @ARGV;
+
+my @a = split(/,/, $a);
+
+open (F, $f) || die;
+
+while (my $r = <F>) {
+ for my $a (@a) {
+ for (my $i = 1; $i < length($a); ++$i) {
+
+ }
+ }
+}
+# http://www.perlmonks.org/?node_id=500235
+sub hd{ length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
+*/
diff --git a/fastq-join.c b/fastq-join.c
new file mode 100644
index 0000000..30b54a3
--- /dev/null
+++ b/fastq-join.c
@@ -0,0 +1,424 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "fastq-lib.h"
+
+/*
+
+See "void usage" below for usage.
+
+*/
+
+#define VERSION "1.01"
+#define SVNREV atoi(strchr("$LastChangedRevision: 679 $", ':')+1)
+
+void usage(FILE *f);
+int debug=0;
+
+// fastq-join entry point.
+//
+// Reads two paired-end fastq files (plus an optional third "mate" file) record by
+// record, looks for the best overlap between the end of read 1 and the (by default
+// reverse-complemented) read 2, and writes joined reads to the "join" output and
+// everything else to the "un1"/"un2" outputs.  Mate records follow their pair into
+// "join2" or "un3".  Summary statistics are printed to stdout.
+//
+// Returns 0 on success, 1 on usage or I/O errors.
+int main (int argc, char **argv) {
+    // getopt() returns an int; keeping it in a plain char breaks the "!= -1"
+    // test on platforms where char is unsigned
+    int c;
+    char *in[3] = {0,0,0};
+    char *out[5] = {0,0,0,0,0};
+    char *orep=NULL;
+    int out_n = 0;
+    int in_n = 0;
+    int threads = 1; // not really necessary
+    char verify='\0';
+
+    int i;
+    int mino = 6;
+    int pctdiff = 8; // this number tested well on exome data... tweak for best results
+    bool norevcomp = false;
+    bool allow_ex = false;
+
+    // the leading '-' in the option string makes getopt hand back each
+    // non-option argument as option code 1
+    while ( (c = getopt (argc, argv, "-dRnbeo:t:v:m:p:r:xV")) != -1) {
+        switch (c) {
+        case '\1':
+            if (!in[0])
+                in[0]=optarg;
+            else if (!in[1])
+                in[1]=optarg;
+            else if (!in[2])
+                in[2]=optarg;
+            else {
+                usage(stderr); return 1;
+            }
+            ++in_n;
+            break;
+        case 'o':
+            // up to 5 explicit outputs are accepted (un1, un2, join, un3, join2);
+            // the count is validated against the inputs after parsing
+            if (out_n == 5) {
+                usage(stderr); return 1;
+            }
+            out[out_n++] = optarg;
+            break;
+        case 'r': orep = optarg; break;
+        case 't': threads = atoi(optarg); break;
+        case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); return 0; break;
+        case 'm': mino = atoi(optarg); break;
+        case 'x': allow_ex = true; break;
+        case 'p': pctdiff = atoi(optarg); break;
+        case 'R': norevcomp = true; break;
+        case 'd': debug = 1; break;
+        case 'v':
+            if (strlen(optarg)>1) {
+                fprintf(stderr, "Option -v requires a single character argument");
+                exit(1);
+            }
+            verify = *optarg; break;
+        case '?':
+            if (strchr("otvmpr", optopt))
+                fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+            else if (isprint(optopt))
+                fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+            else
+                fprintf (stderr,
+                    "Unknown option character `\\x%x'.\n",
+                    optopt);
+            usage(stderr);
+            return 1;
+        }
+    }
+
+    // two inputs need 1 (template) or 3 outputs; three inputs need 1 or 5
+    if (argc < 3 || !in[1] || (!in[2] && out_n != 1 && out_n != 3) || (in[2] && out_n != 1 && out_n != 5)) {
+        usage(stderr);
+        return 1;
+    }
+
+    // sized for the optional third (mate) input as well
+    FILE *fin[3]; meminit(fin);
+    bool gzin[3]; meminit(gzin);
+    for (i = 0; i < in_n; ++i) {
+        fin[i] = gzopen(in[i], "r",&gzin[i]);
+        if (!fin[i]) {
+            fprintf(stderr, "Error opening file '%s': %s\n",in[i], strerror(errno));
+            return 1;
+        }
+    }
+
+    const char *suffix[5]={"un1", "un2", "join", "un3", "join2"};
+    FILE *fout[5]; meminit(fout);
+    bool gzout[5]; meminit(gzout);
+    char *pre = out[0];
+    for (i = 0; i < (in[2] ? 5 : 3); ++i) {
+        // prefix out
+        if (out_n == 1) {
+            out[i]=(char *)malloc(strlen(pre)+10);
+            strcpy(out[i], pre);
+            char *p = strchr(out[i], '%');
+            if (p) {
+                // substitute instead of append
+                strcpy(p, suffix[i]);
+                strcpy(p+strlen(suffix[i]), pre+(p-out[i])+1);
+            } else {
+                strcat(out[i], suffix[i]);
+            }
+        } // else explicit
+        fout[i] = gzopen(out[i], "w",&gzout[i]);
+        if (!fout[i]) {
+            fprintf(stderr, "Error opening output file '%s': %s\n",out[i], strerror(errno));
+            return 1;
+        }
+    }
+
+    FILE *frep = NULL;
+    if (orep) {
+        frep = fopen(orep, "w");
+        // test the stream that was just opened, and name the report file in the message
+        if (!frep) {
+            fprintf(stderr, "Error opening report file '%s': %s\n",orep, strerror(errno));
+            return 1;
+        }
+    }
+
+    // some basic validation of the file formats
+    for (i=0;i<in_n;++i) {
+        int first=getc(fin[i]);
+        if (first != '@') {
+            fprintf(stderr, "%s doesn't appear to be a fastq file (%c)\n", in[i], first);
+            return 1;
+        }
+        ungetc(first, fin[i]);
+    }
+
+    struct fq fq[3];
+    meminit(fq);
+
+    int nrec=0;
+    int joincnt=0;
+    double tlen=0;
+    double tlensq=0;
+    int read_ok;
+
+    struct fq rc;
+    meminit(rc);
+
+    // read in 1 record from each file
+    while ((read_ok=read_fq(fin[0], nrec, &fq[0])) != 0) {
+        for (i=1;i<in_n;++i) {
+            int mate_ok=read_fq(fin[i], nrec, &fq[i]);
+            if (read_ok != mate_ok) {
+                fprintf(stderr, "# of rows in mate file '%s' doesn't match primary file, quitting!\n", in[i]);
+                return 1;
+            }
+            if (verify) {
+                // verify 1 in 100
+                if (0 == (nrec % 100)) {
+                    char *p=strchr(fq[i].id.s,verify);
+                    if (!p) {
+                        fprintf(stderr, "File %s is missing id verification char %c at line %d", in[i], verify, nrec*4+1);
+                        return 1;
+                    }
+                    int l = p-fq[i].id.s;
+                    if (strncmp(fq[0].id.s, fq[i].id.s, l)) {
+                        fprintf(stderr, "File %s, id doesn't match file %s at line %d", in[0], in[i], nrec*4+1);
+                        return 1;
+                    }
+                }
+            }
+        }
+
+        ++nrec;
+        if (read_ok < 0) continue;
+
+        if (debug) fprintf(stderr, "seq: %s %d\n", fq[0].seq.s, fq[0].seq.n);
+
+        if (!norevcomp) {
+            revcomp(&rc, &fq[1]);
+        } else {
+            rc=fq[1];
+        }
+
+        if (debug) fprintf(stderr, "comp: %s %d\n", rc.seq.s, rc.seq.n);
+
+        // try every overlap length from mino up to the shorter read; lower score is better
+        int maxo = min(fq[0].seq.n, rc.seq.n);
+        int bestscore=INT_MAX;
+        int besto=-1;
+        for (i=mino; i <= maxo; ++i) {
+            int mind = (pctdiff * i) / 100;
+            int d;
+            d=hd(fq[0].seq.s+fq[0].seq.n-i, rc.seq.s, i);
+            if (debug) fprintf(stderr, "hd: %d, %d\n", i, d);
+            if (d <= mind) {
+                // squared-distance over length, probably can be proven better (like pearson's)
+                int score = (1000*(d*d+1))/i;
+                if (score < bestscore) {
+                    bestscore=score;
+                    besto=i;
+                }
+            }
+        }
+
+        // -x: also slide the shorter read inside the longer one ("negative overlap")
+        int hasex=0;
+        if (allow_ex && besto<maxo) {
+            if (fq[0].seq.n > rc.seq.n) {
+                int mind = (pctdiff * maxo) / 100;
+                for (i=0; i < fq[0].seq.n-maxo; ++i ) {
+                    int d;
+                    d=hd(fq[0].seq.s+fq[0].seq.n-rc.seq.n-i-1, rc.seq.s, maxo);
+                    if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d);
+                    if (d <= mind) {
+                        // squared-distance over length, probably can be proven better (like pearson's)
+                        int score = (1000*(d*d+1))/maxo;
+                        if (score < bestscore) {
+                            bestscore=score;
+                            // negative overlap!
+                            hasex=-i;
+                            besto=maxo;
+                        }
+                    }
+                }
+            } else if (fq[0].seq.n < rc.seq.n) {
+                int mind = (pctdiff * maxo) / 100;
+                for (i=0; i < rc.seq.n-maxo; ++i ) {
+                    int d;
+                    d=hd(fq[0].seq.s, rc.seq.s+i, maxo);
+                    if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d);
+                    if (d <= mind) {
+                        // squared-distance over length, probably can be proven better (like pearson's)
+                        int score = (1000*(d*d+1))/maxo;
+                        if (score < bestscore) {
+                            bestscore=score;
+                            // negative overlap!
+                            hasex=-i;
+                            besto=maxo;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (debug) {
+            fprintf(stderr, "best: %d %d\n", besto-hasex, bestscore);
+        }
+
+        FILE *fmate = NULL;
+        int olen = besto-hasex;
+
+        if (besto > 0) {
+            ++joincnt;
+
+            int l=besto/2;          // discard from left
+            int r=besto-(besto/2);  // discard from right
+
+            tlen+=olen;
+            tlensq+=olen*olen;
+
+            char *sav_fqs=NULL, *sav_rcs=NULL;
+            char *sav_fqq=NULL, *sav_rcq=NULL;
+
+            if (hasex) {
+                // temporarily re-point the buffers at the overlapping window;
+                // the original pointers are restored after the record is written
+                sav_fqs=fq[0].seq.s;
+                sav_fqq=fq[0].qual.s;
+                sav_rcs=rc.seq.s;
+                sav_rcq=rc.qual.s;
+                if (fq[0].seq.n < rc.seq.n) {
+                    rc.seq.s=rc.seq.s-hasex;
+                    rc.qual.s=rc.qual.s-hasex;
+                    rc.seq.n=maxo;
+                    rc.qual.n=maxo;
+                } else {
+                    fq[0].seq.s=fq[0].seq.s+fq[0].seq.n-maxo+hasex-1;
+                    fq[0].qual.s=fq[0].qual.s+fq[0].seq.n-maxo+hasex-1;
+                    fq[0].seq.n=maxo;
+                    fq[0].qual.n=maxo;
+                }
+                // ok now pretend everythings normal, 100% overlap
+            }
+
+            FILE *f=fout[2];
+
+            if (verify) {
+                // cut the id at the verification character
+                char *p=strchr(fq[0].id.s,verify);
+                if (p) {
+                    *p++ = '\n';
+                    *p = '\0';
+                }
+            }
+            fputs(fq[0].id.s,f);
+            // reconcile the overlapping bases of the two reads
+            for (i = 0; i < besto; ++i ) {
+                int li = fq[0].seq.n-besto+i;
+                int ri = i;
+                if (fq[0].seq.s[li] == rc.seq.s[ri]) {
+                    fq[0].qual.s[li] = max(fq[0].qual.s[li], rc.qual.s[ri]);
+                    rc.qual.s[ri] = max(fq[0].qual.s[li], rc.qual.s[ri]);
+                } else {
+                    // use the better-quality read, although the qual should be downgraded due to the difference!
+                    if (fq[0].qual.s[li] > rc.qual.s[ri]) {
+                        rc.seq.s[ri] = fq[0].seq.s[li];
+                    } else {
+                        fq[0].seq.s[li] = rc.seq.s[ri];
+                    }
+                }
+            }
+
+            fwrite(fq[0].seq.s,1,fq[0].seq.n-l,f);
+            fputs(rc.seq.s+r,f);
+            fputc('\n',f);
+            fputs(fq[0].com.s,f);
+            fwrite(fq[0].qual.s,1,fq[0].qual.n-l,f);
+            fputs(rc.qual.s+r,f);
+            fputc('\n',f);
+            fmate=fout[4];
+
+            if (sav_fqs) {
+                fq[0].seq.s=sav_fqs;
+                fq[0].qual.s=sav_fqq;
+                rc.seq.s=sav_rcs;
+                rc.qual.s=sav_rcq;
+            }
+
+            if (frep) {
+                fprintf(frep, "%d\n", besto);
+            }
+        } else {
+            for (i=0;i<2;++i) {
+                FILE *f=fout[i];
+                fputs(fq[i].id.s,f);
+                fputs(fq[i].seq.s,f);
+                fputc('\n',f);
+                fputs(fq[i].com.s,f);
+                fputs(fq[i].qual.s,f);
+                fputc('\n',f);
+            }
+            fmate=fout[3];
+        }
+
+        // fout[3]/fout[4] are only opened when a mate file was given
+        if (fmate) {
+            fputs(fq[2].id.s,fmate);
+            fputs(fq[2].seq.s,fmate);
+            fputc('\n',fmate);
+            fputs(fq[2].com.s,fmate);
+            fputs(fq[2].qual.s,fmate);
+            fputc('\n',fmate);
+        }
+    }
+
+    // close everything explicitly so compressor pipes are waited on before we exit
+    for (i = 0; i < 5; ++i) {
+        if (fout[i]) gzclose(fout[i], gzout[i]);
+    }
+    for (i = 0; i < in_n; ++i) {
+        if (fin[i]) gzclose(fin[i], gzin[i]);
+    }
+    if (frep) fclose(frep);
+
+    // the sample stdev needs at least two joins and the mean at least one;
+    // report 0 instead of dividing by zero
+    double dev = 0;
+    if (joincnt > 1) {
+        dev = sqrt((((double)joincnt)*tlensq-pow((double)tlen,2)) / ((double)joincnt*((double)joincnt-1)) );
+    }
+    double avg = (joincnt > 0) ? ((double) tlen / (double) joincnt) : 0;
+    printf("Total reads: %d\n", nrec);
+    printf("Total joined: %d\n", joincnt);
+    printf("Average join len: %.2f\n", avg);
+    printf("Stdev join len: %.2f\n", dev);
+    printf("Version: %s.%d\n", VERSION, SVNREV);
+
+    return 0;
+}
+
+// Write the fastq-join command-line help to the given stream.
+void usage(FILE *f) {
+    // emitted with fputs because the text contains literal '%' characters
+    static const char help_text[] =
+"Usage: fastq-join [options] <read1.fq> <read2.fq> [mate.fq] -o <read.%.fq>\n"
+"\n"
+"Joins two paired-end reads on the overlapping ends.\n"
+"\n"
+"Options:\n"
+"\n"
+"-o FIL See 'Output' below\n"
+"-v C Verifies that the 2 files probe id's match up to char C\n"
+" use ' ' (space) for Illumina reads\n"
+"-p N N-percent maximum difference (8)\n"
+"-m N N-minimum overlap (6)\n"
+"-r FIL Verbose stitch length report\n"
+"-R No reverse complement\n"
+"-x Allow insert < read length\n"
+"\n"
+"Output: \n"
+"\n"
+" You can supply 3 -o arguments, for un1, un2, join files, or one \n"
+"argument as a file name template. The suffix 'un1, un2, or join' is \n"
+"appended to the file, or they replace a %-character if present.\n"
+"\n"
+" If a 'mate' input file is present (barcode read), then the files\n"
+"'un3' and 'join2' are also created.\n"
+"\n"
+    ;
+    fputs(help_text, f);
+}
diff --git a/fastq-lib.cpp b/fastq-lib.cpp
new file mode 100644
index 0000000..17f90ec
--- /dev/null
+++ b/fastq-lib.cpp
@@ -0,0 +1,375 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "fastq-lib.h"
+
+#ifdef __MAIN__
+// Placeholder entry point, compiled only when __MAIN__ is defined; it currently does nothing.
+int main(int argc, char **argv) {
+ // TODO: put testing stuff in here, so the lib can be tested independently of the other components
+}
+#endif
+
+// Read the next line of 'in' into l via getline(), which may grow l.s / l.a.
+// getline()'s result is stored in l.n and returned.
+int read_line(FILE *in, struct line &l) {
+    l.n = getline(&l.s, &l.a, in);
+    const int got = l.n;
+    return got;
+}
+
+// Strip one trailing '\n' and then one trailing '\r' from l, when present.
+// Never indexes before the start of the buffer.
+static void chomp_line(struct line *l) {
+    if (l->n > 0 && l->s[l->n-1] == '\n')
+        l->s[--l->n] = '\0';
+    if (l->n > 0 && l->s[l->n-1] == '\r')
+        l->s[--l->n] = '\0';
+}
+
+// Read one record from 'in' into fq.
+//
+// A record whose id line starts with '>' is read as FASTA: the sequence runs up to
+// the next '>' or EOF with whitespace dropped, and a fake quality string of 'h'
+// characters plus a "+" comment line are synthesized so it looks like fastq.
+// Otherwise the four fastq lines are read as-is.
+//
+//   rno   record number, used only for the line number in error messages
+//   name  optional file name for error messages (may be NULL)
+//
+// Returns 0 when there is nothing more to read, 1 on success, and -1 (after
+// printing a message to stderr) for a malformed record.
+int read_fq(FILE *in, int rno, struct fq *fq, const char *name) {
+    read_line(in, fq->id);
+    if (fq->id.n > 0 && fq->id.s && (*fq->id.s == '>')) {
+        fq->id.s[0] = '@';
+        // read fasta instead; start from an empty sequence rather than
+        // appending after the previous record's bases
+        fq->seq.n = 0;
+        // fgetc() returns int so that EOF can be told apart from a data byte
+        int c = fgetc(in);
+        while (c != '>' && c != EOF) {
+            if (!fq->seq.s || fq->seq.a <= (size_t)(fq->seq.n+1)) {
+                fq->seq.s=(char *)realloc(fq->seq.s, fq->seq.a=(fq->seq.a+16)*2);
+            }
+            if (!isspace(c))
+                fq->seq.s[fq->seq.n++]=c;
+            c = fgetc(in);
+        }
+        if (c != EOF) {
+            ungetc(c, in);
+        }
+        // an empty sequence may not have allocated anything yet
+        if (!fq->seq.s) {
+            fq->seq.s=(char *)malloc(fq->seq.a=16);
+        }
+        // make it look like a fastq
+        fq->qual.s=(char *)realloc(fq->qual.s, fq->qual.a=(fq->seq.n+1));
+        memset(fq->qual.s, 'h', fq->seq.n);
+        fq->qual.s[fq->qual.n=fq->seq.n]=fq->seq.s[fq->seq.n]='\0';
+        // reuse the comment buffer instead of allocating a new one per record
+        if (!fq->com.s || fq->com.a < 2) {
+            fq->com.s=(char *)realloc(fq->com.s, fq->com.a=2);
+        }
+        fq->com.n=1;
+        strcpy(fq->com.s,"+");
+    } else {
+        read_line(in, fq->seq);
+        read_line(in, fq->com);
+        read_line(in, fq->qual);
+    }
+
+    if (fq->qual.n <= 0)
+        return 0;
+
+    // win32-safe chomp, done before the length comparison so that a final
+    // record without a trailing newline (and FASTA input, which has none)
+    // is neither rejected nor robbed of its last base
+    chomp_line(&fq->seq);
+    chomp_line(&fq->qual);
+
+    if (fq->id.s[0] != '@' || fq->com.s[0] != '+' || fq->seq.n != fq->qual.n) {
+        const char *errtyp = (fq->seq.n != fq->qual.n) ? "length mismatch" : fq->id.s[0] != '@' ? "no '@' for id" : "no '+' for comment";
+        // NOTE(review): the reported line number assumes 2 lines per record; fastq has 4 - confirm intent
+        if (name) {
+            fprintf(stderr, "Malformed fastq record (%s) in file '%s', line %d\n", errtyp, name, rno*2+1);
+        } else {
+            fprintf(stderr, "Malformed fastq record (%s) at line %d\n", errtyp, rno*2+1);
+        }
+        return -1;
+    }
+    return 1;
+}
+
+// Running quality totals updated by poorqual(); quals[n] belongs to "file number" n.
+struct qual_str {
+    long long int cnt;  // bases counted so far
+    long long int sum;  // sum of the quality characters
+    long long int ssq;  // sum of the squared quality characters
+    long long int ns;   // 'N' bases counted so far
+} quals[MAX_FILENO_QUALS+1] = {};  // every counter starts at zero
+
+// Close a stream obtained from gzopen(): pipes (isgz) go through pclose(),
+// plain files through fclose().  Returns that call's result.
+int gzclose(FILE *f, bool isgz) {
+    if (isgz) {
+        return pclose(f);
+    }
+    return fclose(f);
+}
+
+// Build the shell command  <before>'<f>'<after>  with f safely single-quoted:
+// every embedded ' is written as '\'' so the file name can neither end the
+// quoting early nor inject extra shell syntax.  The caller frees the result.
+static char *gz_shell_cmd(const char *before, const char *f, const char *after) {
+    // worst case every byte of f is a quote and expands to 4 bytes;
+    // +3 covers the two surrounding quotes and the terminating NUL
+    size_t cap = strlen(before) + 4*strlen(f) + strlen(after) + 3;
+    char *cmd = (char *)malloc(cap);
+    if (!cmd) {
+        fprintf(stderr, "Out of memory building command for '%s'\n", f);
+        exit(1);
+    }
+    strcpy(cmd, before);
+    char *w = cmd + strlen(cmd);
+    *w++ = '\'';
+    for (const char *r = f; *r; ++r) {
+        if (*r == '\'') {
+            memcpy(w, "'\\''", 4);
+            w += 4;
+        } else {
+            *w++ = *r;
+        }
+    }
+    *w++ = '\'';
+    strcpy(w, after);
+    return cmd;
+}
+
+// Open file f with stdio mode m ("r"/"w").
+//
+// Names ending in ".gz" or ".zip" are opened as a pipe to/from the matching
+// external (de)compressor and *isgz is set to 1; anything else is opened with
+// fopen() and *isgz is set to 0.  Pass the flag to gzclose() when done.
+// Prints a message and exits the process if the open fails.
+FILE *gzopen(const char *f, const char *m, bool*isgz) {
+    // maybe use zlib some day?
+    FILE *h;
+    const char *ext = fext(f);
+    const bool writing = (strchr(m,'w') != NULL);
+    char *cmd = NULL;
+
+    if (!strcmp(ext,".gz")) {
+        cmd = writing ? gz_shell_cmd("gzip --rsyncable > ", f, "")
+                      : gz_shell_cmd("gunzip -c ", f, "");
+    } else if (!strcmp(ext,".zip")) {
+        cmd = writing ? gz_shell_cmd("zip -q ", f, " -")
+                      : gz_shell_cmd("unzip -p ", f, "");
+    }
+
+    if (cmd) {
+        h = popen(cmd, m);
+        *isgz=1;
+    } else {
+        h = fopen(f, m);
+        *isgz=0;
+    }
+    // keep the open's errno; free() below must not be able to disturb the message
+    int open_errno = errno;
+    free(cmd);
+
+    if (!h) {
+        fprintf(stderr, "Error opening file '%s': %s\n",f, strerror(open_errno));
+        exit(1);
+    }
+    return h;
+}
+
+// Return the part of f starting at its last '.', or "" when f contains no '.'.
+// The result points into f (or at a string literal); do not free it.
+const char *fext(const char *f) {
+    const char *dot = strrchr(f, '.');
+    if (dot == NULL) {
+        return "";
+    }
+    return dot;
+}
+
+// Decide whether a read looks like junk, while folding it into the running
+// statistics kept in quals[n].
+//
+//   n  slot in quals[] to update ("file number")
+//   l  number of bases / quality characters to look at
+//   s  base characters
+//   q  quality characters (raw; 33 is subtracted where a phred value is needed)
+//
+// Until 20000 bases have been counted for slot n, a read is poor when its mean
+// quality minus 33 is below 18 or it has more than one 'N'.  After that the
+// threshold is derived from the running mean and deviation, and the allowed
+// number of 'N's from the running 'N' rate.
+// Returns true (1) for a poor read.  An empty read (l <= 0) is reported as poor
+// without touching the statistics, instead of dividing by zero.
+bool poorqual(int n, int l, const char *s, const char *q) {
+    if (l <= 0)
+        return 1;
+
+    int i=0, sum=0, ns=0;
+    for (i=0;i<l;++i) {
+        if (s[i] == 'N')
+            ++ns;
+        quals[n].cnt++;
+        quals[n].ssq += q[i] * q[i];
+        sum+=q[i];
+    }
+    quals[n].sum += sum;
+    quals[n].ns += ns;
+    int xmean = sum/l;
+    if (quals[n].cnt < 20000) {
+        // mean qual < 18 = junk
+        return ((xmean-33) < 18) || (ns > 1);
+    }
+    // enough data? use stdev
+    int pmean = quals[n].sum / quals[n].cnt;                         // mean q
+    double pdev = stdev(quals[n].cnt, quals[n].sum, quals[n].ssq);   // dev q
+    int serr = min(pmean/2,max(1,pdev/sqrt(l)));                     // stderr for length l
+    // mean qual < min(18,pmean-serr*3) = junk/skip it
+    // cap low qual, because adapters often are low qual
+    // but you still need to calculate something, in case we're doing ion/pacbio
+    int thr = min((33+18), (pmean - serr * 3));
+    if (xmean < thr) {
+        return 1; // ditch it
+    }
+    if (ns > (1+(l*quals[n].ns / quals[n].cnt))) { // 1 more n than average?
+        return 1; // ditch it
+    }
+    return 0;
+}
+
+#define comp(c) ((c)=='A'?'T':(c)=='a'?'t':(c)=='C'?'G':(c)=='c'?'g':(c)=='G'?'C':(c)=='g'?'c':(c)=='T'?'A':(c)=='t'?'a':(c))
+
+// Store the reverse complement of s's sequence, and its quality string reversed,
+// in d.  d's seq/qual buffers are allocated when d->seq.s is NULL and grown when
+// d->seq.a is too small.  Bases are handled in mirrored pairs, with the source
+// values read before either position is written.
+void revcomp(struct fq *d, struct fq *s) {
+    const int slen = s->seq.n;
+    const int qlen = s->qual.n;
+
+    if (!d->seq.s) {
+        d->seq.a = slen+1;
+        d->seq.s = (char *) malloc(d->seq.a);
+        d->qual.a = qlen+1;
+        d->qual.s = (char *) malloc(d->qual.a);
+    } else if (d->seq.a <= s->seq.n) {
+        d->seq.a = slen+1;
+        d->seq.s = (char *) realloc(d->seq.s, d->seq.a);
+        d->qual.a = qlen+1;
+        d->qual.s = (char *) realloc(d->qual.s, d->qual.a);
+    }
+
+    // walk inwards from both ends, swapping (and complementing) as we go
+    int lo, hi;
+    for (lo = 0, hi = slen-1; lo < hi; ++lo, --hi) {
+        const char left_base = s->seq.s[lo];
+        const char left_qual = s->qual.s[lo];
+        d->seq.s[lo] = comp(s->seq.s[hi]);
+        d->qual.s[lo] = s->qual.s[qlen-lo-1];
+        d->seq.s[hi] = comp(left_base);
+        d->qual.s[hi] = left_qual;
+    }
+
+    // odd length: the middle base only needs complementing
+    if (slen % 2) {
+        const int mid = slen/2;
+        d->seq.s[mid] = comp(s->seq.s[mid]);
+        d->qual.s[mid] = s->qual.s[mid];
+    }
+
+    d->seq.n = slen;
+    d->qual.n = qlen;
+    d->seq.s[slen] = '\0';
+    d->qual.s[slen] = '\0';
+}
+
+// Release the buffer owned by l (if any) and return l to the empty state.
+// The length and capacity are cleared together with the pointer, so later code
+// that trusts l->a (e.g. the FASTA path of read_fq) cannot write through a
+// NULL buffer that still claims to have room.  Passing NULL is a no-op.
+void free_line(struct line *l) {
+    if (!l)
+        return;
+    free(l->s);     // free(NULL) is harmless
+    l->s=NULL;
+    l->n=0;
+    l->a=0;
+}
+
+// Release the four line buffers (id, seq, com, qual) of a record.  NULL is ignored.
+void free_fq(struct fq *f) {
+    if (f == NULL)
+        return;
+
+    struct line *parts[] = { &f->id, &f->seq, &f->com, &f->qual };
+    for (size_t k = 0; k < sizeof(parts)/sizeof(parts[0]); ++k) {
+        free_line(parts[k]);
+    }
+}
+
+
+/* getline.c -- Replacement for GNU C library function getline
+
+Copyright (C) 1993 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/* Written by Jan Brittenson, bson at gnu.ai.mit.edu. */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+
+/* Read up to (and including) a TERMINATOR from STREAM into *LINEPTR
+ + OFFSET (and null-terminate it). *LINEPTR is a pointer returned from
+ malloc (or NULL), pointing to *N characters of space. It is realloc'd
+ as necessary. Return the number of characters read (not including the
+ null terminator), or -1 on error or EOF. */
+
+/* Read up to (and including) TERMINATOR from STREAM into *LINEPTR + OFFSET and
+   NUL-terminate it.  *LINEPTR is a malloc'd buffer of *N bytes (or NULL, in
+   which case 64 bytes are allocated); it is grown as needed and *N updated.
+
+   Returns the number of characters stored (not counting the NUL), or -1 on bad
+   arguments, allocation failure, or when EOF/error is hit before anything was
+   stored.  On allocation failure while growing, the existing buffer and *N are
+   left untouched so the caller can still free it.  */
+int getstr (char ** lineptr, size_t *n, FILE * stream, char terminator, int offset)
+{
+    int nchars_avail;   /* Allocated but unused chars in *LINEPTR. */
+    char *read_pos;     /* Where we're reading into *LINEPTR. */
+    int ret;
+
+    if (!lineptr || !n || !stream)
+        return -1;
+
+    if (!*lineptr)
+    {
+        *n = 64;
+        *lineptr = (char *) malloc (*n);
+        if (!*lineptr)
+            return -1;
+    }
+
+    nchars_avail = *n - offset;
+    read_pos = *lineptr + offset;
+
+    for (;;)
+    {
+        int c = getc (stream);
+
+        /* We always want at least one char left in the buffer after storing
+           C, since we always (unless we get an error while reading the first
+           char) NUL-terminate the line buffer.  That means two free slots are
+           needed here: one for C and one for the NUL.  */
+
+        assert((long) (*n - nchars_avail) == (long) (read_pos - *lineptr));
+        if (nchars_avail < 2)
+        {
+            size_t used = read_pos - *lineptr;
+            size_t grown = (*n > 64) ? (*n * 2) : (*n + 64);
+            char *bigger = (char *) realloc (*lineptr, grown);
+            if (!bigger)
+                return -1;
+
+            *lineptr = bigger;
+            *n = grown;
+            read_pos = bigger + used;
+            nchars_avail = grown - used;
+            assert((long) (*n - nchars_avail) == (long) (read_pos - *lineptr));
+        }
+
+        if (c == EOF || ferror (stream))
+        {
+            /* Return partial line, if any. */
+            if (read_pos == *lineptr)
+                return -1;
+            else
+                break;
+        }
+
+        *read_pos++ = c;
+        nchars_avail--;
+
+        if (c == terminator)
+            /* Return the line. */
+            break;
+    }
+
+    /* Done - NUL terminate and return the number of chars read. */
+    *read_pos = '\0';
+
+    ret = read_pos - (*lineptr + offset);
+    return ret;
+}
+
+#if !defined(__GNUC__) || defined(__APPLE__) || defined(WIN32)
+
+// Fallback getline(), built only when the preprocessor guard above holds:
+// reads up to and including the next '\n' from stream by delegating to
+// getstr() with offset 0, and returns getstr()'s result.
+ssize_t getline(char **lineptr, size_t *n, FILE *stream)
+{
+ return getstr (lineptr, n, stream, '\n', 0);
+}
+
+/*
+ * public domain strtok_r() by Charlie Gordon
+ *
+ * from comp.lang.c 9/14/2007
+ *
+ * http://groups.google.com/group/comp.lang.c/msg/2ab1ecbb86646684
+ *
+ * (Declaration that it's public domain):
+ * http://groups.google.com/group/comp.lang.c/msg/7c7b39328fefab9c
+ */
+
+// Re-entrant tokenizer: returns the next token of str (or of the position saved
+// in *nextp when str is NULL), skipping leading delimiters.  The delimiter that
+// ends the token is overwritten with NUL and *nextp is set just past it.
+// Returns NULL, leaving *nextp alone, when only delimiters or nothing remain.
+char* strtok_r(char *str, const char *delim, char **nextp)
+{
+    char *tok = (str != NULL) ? str : *nextp;
+
+    tok += strspn(tok, delim);
+    if (*tok == '\0') {
+        return NULL;
+    }
+
+    char *stop = tok + strcspn(tok, delim);
+    if (*stop != '\0') {
+        *stop = '\0';
+        ++stop;
+    }
+
+    *nextp = stop;
+    return tok;
+}
+
+#endif
+
+
diff --git a/fastq-lib.h b/fastq-lib.h
new file mode 100644
index 0000000..d602ccd
--- /dev/null
+++ b/fastq-lib.h
@@ -0,0 +1,113 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// 32-bit o/s support
+#if defined(__i386__)
+ #define _FILE_OFFSET_BITS 64
+#endif
+
+// standard libs
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <getopt.h>
+#include <assert.h>
+#include <math.h>
+#include <sys/stat.h>
+#include <search.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#if defined(__APPLE__)
+ #define getopt(a,b,c) getopt_long(a,b,c,NULL,NULL)
+#endif
+
+// misc useful macros
+#define max(a,b) ((a)>(b)?(a):(b))
+#define min(a,b) ((a)<(b)?(a):(b))
+#define meminit(l) (memset(&l,0,sizeof(l)))
+#define fail(s,...) ((fprintf(stderr,s,##__VA_ARGS__), exit(1)))
+#define warn(s,...) ((fprintf(stderr,s,##__VA_ARGS__)))
+#define stdev(cnt, sum, ssq) sqrt((((double)cnt)*ssq-pow((double)sum,2)) / ((double)cnt*((double)cnt-1)))
+
+// maximum number of files that can be tracked by poorquals lib
+#define MAX_FILENO_QUALS 6
+
+// read line, read fq
+// One text line held in a growable heap buffer, in the shape getline() expects.
+typedef struct line {
+ // s: the buffer; n: length as returned by the last read; a: allocated size of s
+ char *s; int n; size_t a;
+} line;
+
+// The four lines of one fastq record, in file order.
+struct fq {
+ line id; // header line, expected to start with '@'
+ line seq; // sequence line
+ line com; // comment line, expected to start with '+'
+ line qual; // quality line, expected to be as long as seq
+};
+
+
+void free_line(struct line *l);
+void free_fq(struct fq *fq);
+
+// not GNU? probably no getline & strtok_r...
+#if !defined( __GNUC__) || defined(WIN32) || defined(__APPLE__)
+ ssize_t getline(char **lineptr, size_t *n, FILE *stream);
+ char* strtok_r(char *str, const char *delim, char **nextp);
+#endif
+
+// get file extension
+const char *fext(const char *f);
+
+// read fq
+int read_line(FILE *in, struct line &l); // 0=done, 1=ok, -1=err+continue
+int read_fq(FILE *in, int rno, struct fq *fq, const char *name=NULL); // 0=done, 1=ok, -1=err+continue
+void free_fq(struct fq *fq);
+
+// open a file, possibly gzipped, exit on failure
+FILE *gzopen(const char *in, const char *mode, bool *isgz);
+int gzclose(FILE *f, bool isgz);
+
+// keep track of poor quals (n == "file number", maybe should have persistent stat struct instead?)
+bool poorqual(int n, int l, const char *s, const char *q);
+
+// Counts the positions at which a and b differ, looking at no more than n
+// characters.  If either string ends before n characters were compared, every
+// position that could not be compared is counted as a difference as well.
+inline int hd(char *a, char *b, int n) {
+    int diffs = 0;
+    for (; *a && *b && n > 0; --n, ++a, ++b) {
+        if (*a != *b) {
+            diffs += 1;
+        }
+    }
+    // n now holds how many positions were left unchecked
+    return diffs + n;
+}
+
+// reverse complement an fq entry into a blank (memset 0) one
+void revcomp(struct fq *dest, struct fq* src);
+
+
diff --git a/fastq-mcf.c b/fastq-mcf.c
new file mode 100644
index 0000000..ee25914
--- /dev/null
+++ b/fastq-mcf.c
@@ -0,0 +1,1697 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+
+See "void usage" below for usage.
+
+*/
+
+#include <google/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <string>
+
+#include "fastq-lib.h"
+
+#define VERSION "1.04"
+#define SVNREV atoi(strchr("$LastChangedRevision: 676 $", ':')+1)
+
+#define MAX_ADAPTER_NUM 1000
+#define SCANLEN 15
+#define SCANMIDP ((int) SCANLEN/2)
+#define MAX_FILES 5
+#define MAX_REF 10
+#define B_A 0
+#define B_C 1
+#define B_G 2
+#define B_T 3
+#define B_N 4
+#define B_CNT 5
+#define MAXWARN 10
+#define MAX_PHRED 100
+
+// Adapter record. The per-file arrays have MAX_FILES entries each.
+// NOTE(review): presumably one entry per adapter read by read_fa(), indexed by input file - confirm.
+struct ad {
+ // id text with its length (nid) and allocated size (naid) - presumably the getline() triple; confirm
+ char *id; int nid; size_t naid;
+ // sequence text with its length (nseq) and allocated size (naseq)
+ char *seq; int nseq; size_t naseq;
+ char escan[SCANLEN+1]; // scan sequence
+ int bcnt[MAX_FILES]; // number found at beginning
+ int bcntz[MAX_FILES]; // number found at beginning (NOTE(review): how this differs from bcnt is not visible here)
+ int ecnt[MAX_FILES]; // number found at end
+ int ecntz[MAX_FILES]; // number found at end (NOTE(review): how this differs from ecnt is not visible here)
+
+ char end[MAX_FILES]; // 'b' or 'e'
+ int thr[MAX_FILES]; // min-length for clip
+};
+
+int read_fa(FILE *in, int rno, struct ad *ad); // 0=done, 1=ok, -1=err+continue
+int meanqwin(const char *q, int qn, int i, int w); // mean quality within window win, at position i
+bool evalqual(struct fq &fq, int file_num);
+
+int char2bp(char c);
+char bp2char(int b);
+void saveskip(FILE **fout, int fo_n, struct fq *fq);
+
+void valid_arg(char c, const char *a);
+
+void usage(FILE *f, const char *msg=NULL);
+int debug=0;
+int warncount = 0;
+
+// used to filter out other genomes, spike in controls, etc
+
+const char *cmd_align_se = "bowtie -S %i -f %1";
+const char *cmd_align_pe = "bowtie -S %i -1 %1 -2 %2";
+
+// quality filter args
+int qf_mean=0, qf_max_ns=-1, qf_xgt_num=0, qf_xgt_min=0, qf_max_n_pct=-1;
+int qf2_mean=0, qf2_max_ns=-1, qf2_xgt_num=0, qf2_xgt_min=0, qf2_max_n_pct=0;
+
+// qual adjust
+// One --cycle-adjust entry: the two integers of its "CYC,ADJ" argument.
+class adjustment {
+public:
+    int pos;    // first number of the argument (CYC)
+    int adj;    // second number of the argument (ADJ)
+
+    // both fields start at zero
+    adjustment() : pos(0), adj(0) {}
+};
+std::vector<adjustment> cycle_adjust;
+int phred_adjust[MAX_PHRED];
+int phred_adjust_max=0;
+bool have_phred_adjust=false;
+
+std::string arg2cmdstr(int argc, char** argv);
+
+// phred used
+char phred = 0;
+
+google::sparse_hash_map <std::string, int> dupset;
+int dupmax = 40000000; // this should be configurable, but right now it isn't
+int max_in_buffer = 2400000;
+
+// Input stream wrapper that remembers the lines it has read so the start of a
+// file can be replayed after reset().
+//
+// Lines are appended to 'buf' as they are read until more than max_buf lines
+// are held; at that point the buffer is discarded and buffering is switched off
+// for good (max_buf becomes 0).  'bp' is the index of the next buffered line to
+// hand out.  The stream is closed by the destructor.
+class inbuffer {
+    int max_buf;
+
+    // Strip one trailing '\n' and then one trailing '\r', when present.
+    // Never indexes before the start of the buffer.
+    static void chomp(struct line &l) {
+        if (l.n > 0 && l.s[l.n-1] == '\n')
+            l.s[--l.n] = '\0';
+        if (l.n > 0 && l.s[l.n-1] == '\r')
+            l.s[--l.n] = '\0';
+    }
+
+public:
+    inbuffer() {fin=0; gz=0; bp=0; max_buf=max_in_buffer;};
+    ~inbuffer() {close();};
+
+    FILE *fin;                      // underlying stream (file or pipe)
+    bool gz;                        // true when fin must be closed with pclose()
+    int bp;                         // next buffered line to return
+    std::vector<std::string> buf;   // lines read so far
+
+    // getline() look-alike: serves the next buffered line when one is pending,
+    // otherwise reads from fin (and buffers the line while buffering is on).
+    // Returns the line length, or ::getline()'s result when reading from fin.
+    ssize_t getline(char **lineptr, size_t *n) {
+        if ((size_t) bp < buf.size()) {
+            // return buffered
+            size_t l=buf[bp].length();  // length without null char
+            if (!*lineptr || *n < (l+1)) {
+                // alloc with room for null
+                *lineptr=(char*)realloc(*lineptr,*n=(l+1));
+            }
+            memcpy(*lineptr,buf[bp].data(),l);
+            (*lineptr)[l]='\0';
+            ++bp;
+            return l;
+        } else {
+            int l=::getline(lineptr, n, fin);
+            if (max_buf > 0) {
+                if (buf.size() > (size_t) max_buf) {
+                    if (debug) fprintf(stderr, "Clearing buffer at %d lines\n", (int) buf.size());
+                    buf.resize(0);
+                    bp=0;
+                    max_buf = 0;
+                } else {
+                    if (l > 0) {
+                        // keep all l bytes, even if the line contains a NUL
+                        buf.push_back(std::string(*lineptr, l));
+                        ++bp;
+                    }
+                }
+            }
+            return l;
+        }
+    }
+
+    // Read one fastq record.  While buffered lines are pending they are parsed
+    // here; otherwise the call is forwarded to the global read_fq() on fin.
+    // Returns 0 when nothing is left (or the record is empty), -1 for a
+    // malformed record (message on stderr), non-zero otherwise.
+    int read_fq(int rno, struct fq *fq, const char *name=NULL) {
+        if ((size_t) bp < buf.size()) {
+            fq->id.n=getline(&fq->id.s, &fq->id.a);
+            fq->seq.n=getline(&fq->seq.s, &fq->seq.a);
+            fq->com.n=getline(&fq->com.s, &fq->com.a);
+            fq->qual.n=getline(&fq->qual.s, &fq->qual.a);
+            if (fq->qual.n <= 0)
+                return 0;
+
+            // strip line ends before comparing lengths, so a final record
+            // without a trailing newline is not rejected, and an empty line
+            // never makes us look at s[-1]
+            chomp(fq->seq);
+            chomp(fq->qual);
+
+            if (fq->id.s[0] != '@' || fq->com.s[0] != '+' || fq->seq.n != fq->qual.n) {
+                const char *errtyp = (fq->seq.n != fq->qual.n) ? "length mismatch" : fq->id.s[0] != '@' ? "no '@' for id" : "no '+' for comment";
+                if (name) {
+                    fprintf(stderr, "Malformed fastq record (%s) in file '%s', line %d\n", errtyp, name, rno*2+1);
+                } else {
+                    fprintf(stderr, "Malformed fastq record (%s) at line %d\n", errtyp, rno*2+1);
+                }
+                return -1;
+            }
+
+            return fq->qual.n > 0;
+        } else {
+            return ::read_fq(fin, rno, fq, name);
+        }
+    }
+
+    // Rewind to the first buffered line; only valid while buffering is still on.
+    void reset() {
+        assert(max_buf > 0);
+        bp=0;
+    }
+
+    // True once at least max_buf lines are held (always true after buffering was switched off).
+    bool full() {
+        return buf.size()>=(size_t) max_buf;
+    }
+
+    // Close the stream if it is open; returns pclose()/fclose()'s result, or 1 when nothing was open.
+    int close() {
+        int ret=true;
+        if (fin) {
+            ret = gz ? pclose(fin) : fclose(fin);
+            fin=NULL;
+        }
+        return ret;
+    }
+};
+
+/*
+ * fastq-mcf entry point.
+ *
+ * Phases, in order:
+ *   1. parse command line options (first bare argument is the adapter file,
+ *      the remaining bare arguments are input fastq files);
+ *   2. open inputs and load the adapter list;
+ *   3. sample pass #1: read length, phred offset and Illumina purity flag detection;
+ *   4. sample pass #2: per-position base counts, end quality counts and adapter hits;
+ *   5. derive skew trims and per-adapter clip thresholds, reporting them to fstat;
+ *   6. unless -n was given, stream every record through trimming/filtering and
+ *      write the survivors (and optionally the discards) out;
+ *   7. print summary statistics.
+ *
+ * Returns 0 on success, 1 on usage/open errors, 2 if any record failed to parse,
+ * 3 if an output write/close failed.  In -n (noclip) mode the process exits with
+ * 1 when nothing needs doing and 0 otherwise.
+ */
+int main (int argc, char **argv) {
+    int c;                  // getopt_long() returns int; a char can never compare equal to -1 where char is unsigned
+    int nmin = 1, nkeep = 19, nmax=0;
+    int qf2_min_len=0;
+    float minpct = 0.25;
+    int pctdiff = 10;
+    int sampcnt = 300000;   // # of reads to sample to determine adapter profile, and base skewing
+    int xmax = -1;
+    float scale = 2.2;
+    int noclip=0;
+    char end[MAX_FILES]; meminit(end);
+    float skewpct = 2;      // any base at any position is less than skewpct of reads
+    float pctns = 20;       // any base that is more than 20% n's
+    bool rmns = 1;          // remove n's at the end of the read
+    int qthr = 7;           // remove end of-read with quality < qthr
+    int qwin = 1;           // remove end of read with mean quality < qthr
+    int ilv3 = -1;
+    int duplen = 0;
+    int dupskip = 0;
+    bool hompol_filter = 0;
+    bool lowcom_filter = 0;
+    float hompol_pct = .92;
+    float lowcom_pct = .90;
+
+    dupset.set_deleted_key("<>");
+
+    int i;
+
+    char *afil = NULL;
+    char *ifil[MAX_FILES]; meminit(ifil);
+    const char *ofil[MAX_FILES]; meminit(ofil);
+    int i_n = 0;
+    int o_n = 0;
+    int e_n = 0;
+    bool skipb = 0;
+    char *fref[MAX_REF]; meminit(fref);
+    int fref_n = 0;
+    char *qspec = NULL;
+
+    static struct option long_options[] = {
+        {"qual-mean", 1, 0, 0},
+        {"max-ns", 1, 0, 0},
+        {"qual-gt", 1, 0, 0},
+        {"min-len", 1, 0, 'l'},
+        {"cycle-adjust", 1, 0, 0},
+        {"phred-adjust", 1, 0, 0},
+        {"phred-adjust-max", 1, 0, 0},
+        {"mate-qual-mean", 1, 0, 0},
+        {"mate-max-ns", 1, 0, 0},
+        {"mate-qual-gt", 1, 0, 0},
+        {"mate-min-len", 1, 0, 0},
+        {"homopolymer-pct", 1, 0, 0},
+        {"lowcomplex-pct", 1, 0, 0},
+        {0, 0, 0, 0}
+    };
+
+    meminit(phred_adjust);
+
+    int option_index = 0;
+    // leading '-' in the option string makes getopt return non-option arguments as option '\1'
+    while ( (c = getopt_long(argc, argv, "-nf0uXUVHSRdbehp:o:l:s:m:t:k:x:P:q:L:C:w:F:D:",long_options,&option_index)) != -1) {
+        switch (c) {
+            case '\0':
+                {
+                    // long-only option: dispatch on its name
+                    const char *oname=long_options[option_index].name;
+                    if(!strcmp(oname, "qual-mean")) {
+                        qf_mean=qf2_mean=atoi(optarg);
+                    } else if(!strcmp(oname, "mate-qual-mean")) {
+                        qf2_mean=atoi(optarg);
+                    } else if(!strcmp(oname, "homopolymer-pct")) {
+                        hompol_pct=atof(optarg)/100.0;
+                        hompol_filter=1;
+                    } else if(!strcmp(oname, "lowcomplex-pct")) {
+                        lowcom_pct=atof(optarg)/100.0;
+                        lowcom_filter=1;
+                    } else if(!strcmp(oname, "qual-gt")) {
+                        if (!strchr(optarg, ',')) {
+                            fprintf(stderr, "Error, %s requires NUM,THR as argument\n", oname);
+                            exit(1);
+                        }
+                        qf_xgt_num=qf2_xgt_num=atoi(optarg);
+                        qf_xgt_min=qf2_xgt_min=atoi(strchr(optarg, ',')+1);
+                    } else if(!strcmp(oname, "mate-qual-gt")) {
+                        if (!strchr(optarg, ',')) {
+                            fprintf(stderr, "Error, %s requires NUM,THR as argument\n", oname);
+                            exit(1);
+                        }
+                        qf2_xgt_num=atoi(optarg);
+                        qf2_xgt_min=atoi(strchr(optarg, ',')+1);
+                    } else if(!strcmp(oname, "cycle-adjust")) {
+                        if (!strchr(optarg, ',')) {
+                            fprintf(stderr, "Error, %s requires CYC,ADJ as argument\n", oname);
+                            exit(1);
+                        }
+                        adjustment a;
+                        a.pos=atoi(optarg);
+                        a.adj=atoi(strchr(optarg, ',')+1);
+                        cycle_adjust.push_back(a);
+                    } else if(!strcmp(oname, "phred-adjust-max")) {
+                        phred_adjust_max=atoi(optarg);
+                    } else if(!strcmp(oname, "phred-adjust")) {
+                        if (!strchr(optarg, ',')) {
+                            fprintf(stderr, "Error, %s requires CYC,ADJ as argument\n", oname);
+                            exit(1);
+                        }
+                        // note: this local 'phred' shadows the global phred offset
+                        int phred=atoi(optarg);
+                        int adj=atoi(strchr(optarg, ',')+1);
+                        assert(phred<MAX_PHRED && phred >= 0);
+                        if (adj)
+                            have_phred_adjust=true;
+                        phred_adjust[phred]=adj;
+                    } else if(!strcmp(oname, "max-ns")) {
+                        if (strchr(optarg,'%')) {
+                            qf_max_n_pct=atoi(optarg);
+                            qf2_max_n_pct=atoi(optarg);
+                        } else {
+                            qf_max_ns=atoi(optarg);
+                            qf2_max_ns=atoi(optarg);
+                        }
+
+                    } else if(!strcmp(oname, "mate-max-ns")) {
+                        if (strchr(optarg,'%')) {
+                            qf2_max_n_pct=atoi(optarg);
+                        } else {
+                            qf2_max_ns=atoi(optarg);
+                        }
+                    } else if(!strcmp(oname, "mate-min-len")) {
+                        qf2_min_len=atoi(optarg);
+                    }
+                    break;
+                }
+            case '\1':
+                // bare argument: first one is the adapter file, the rest are inputs
+                if (!afil)
+                    afil = optarg;
+                else if (i_n<MAX_FILES)
+                    ifil[i_n++] = optarg;
+                else {
+                    usage(stderr, "Too many input files."); return 1;
+                }
+                break;
+            case 't': minpct = atof(optarg); break;
+            case 'm': nmin = atoi(optarg); break;
+            case 'l': nkeep = atoi(optarg); break;
+            case 'L': nmax = atoi(optarg); break;
+            case '0': nmax=0; skewpct=0; pctns=0; rmns=0; qthr=0; nkeep=0; ilv3=-1; break;
+            case 'u': ilv3=1; break;
+            case 'U': ilv3=0; break;
+            case 'H': hompol_filter=1; break;
+            case 'X': lowcom_filter=1; break;
+            case 'k': skewpct = atof(optarg); break;
+            case 'q': qthr = atoi(optarg); valid_arg(c,optarg); break;
+            // NOTE(review): 'Q' is not in the option string above, so this case is currently unreachable
+            case 'Q': qspec = optarg; break;
+            case 'w': qwin = atoi(optarg); break;
+            case 'C': sampcnt = atoi(optarg); if (sampcnt*8 > max_in_buffer) max_in_buffer = sampcnt * 8; break;
+            case 'F':
+                // bounds-checked: fref[] only has MAX_REF slots
+                if (fref_n < MAX_REF)
+                    fref[fref_n++] = optarg;
+                else {
+                    usage(stderr, "Too many reference files."); return 1;
+                }
+                break;
+            case 'x': pctns = atof(optarg); break;
+            case 'R': rmns = false; break;
+            case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); return 0; break;
+            case 'p': pctdiff = atoi(optarg); break;
+            case 'P': phred = (char) atoi(optarg); break;
+            case 'D': duplen = atoi(optarg); break;
+            case 'h': usage(stdout); return 1;
+            case 'o':
+                // was "!o_n < MAX_FILES", which is always true and let ofil[] overflow
+                if (o_n < MAX_FILES)
+                    ofil[o_n++] = optarg;
+                else {
+                    usage(stderr, "Too many output files."); return 1;
+                }
+                break;
+            case 's': scale = atof(optarg); break;
+            case 'S': skipb = 1; break;
+            // NOTE(review): 'i' is not in the option string above, so this case is currently unreachable
+            case 'i': if (i_n<MAX_FILES)
+                          ifil[i_n++] = optarg;
+                      else
+                          return usage(stderr, "Too many input files."), 1;
+                      break;
+            case 'n': noclip = 1; break;
+            case 'd': ++debug; break;
+            case 'b':
+            case 'e':
+                // bounds-checked: end[] only has MAX_FILES slots
+                if (e_n < MAX_FILES)
+                    end[e_n++] = (char) c;
+                else {
+                    usage(stderr, "Too many -b/-e options."); return 1;
+                }
+                break;
+            case '?':
+                if (strchr("polsmtkx", optopt))
+                    fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+                else if (isprint(optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr,
+                            "Unknown option character `\\x%x'.\n",
+                            optopt);
+                usage(stderr);
+                return 1;
+        }
+    }
+
+    if (duplen > 75) {
+        fprintf(stderr, "WARNING: duplen of %d is probably too long, do you really need it?\n", duplen);
+    }
+
+    // single input with no -o: write clipped reads to stdout
+    if (i_n == 1 && o_n == 0) {
+        ofil[o_n++]="-";
+    }
+
+    if (!noclip && o_n != i_n) {
+        fprintf(stderr, "Error: number of input files must match number of '-o' output files.\n");
+        return 1;
+    }
+
+    if (argc < 3 || !afil || !i_n) {
+        usage(stderr);
+        return 1;
+    }
+
+    // "n/a", "/dev/null" and "NUL" all mean: no adapter file
+    FILE *ain = NULL;
+    if (strcasecmp(afil, "n/a") && strcasecmp(afil, "/dev/null") && strcasecmp(afil, "NUL")) {
+        ain = fopen(afil, "r");
+        if (!ain) {
+            fprintf(stderr, "Error opening adapter file '%s': %s\n",afil, strerror(errno));
+            return 1;
+        }
+    }
+
+    // stats go to stderr only when the reads themselves are going to stdout
+    FILE *fstat = stderr;
+    if (!noclip && strcmp(ofil[0], "-")) {
+        fstat = stdout;
+    }
+    if (noclip) {
+        fstat = stdout;
+    }
+
+    fprintf(fstat, "Command Line: %s\n", arg2cmdstr(argc, argv).c_str());
+
+    FILE *fout[MAX_FILES]; meminit(fout);
+    bool gzout[MAX_FILES]; meminit(gzout);
+    inbuffer fin[MAX_FILES];
+
+    // if (debug) fprintf(stderr,"i_n:%d, ifil[0]:%s\n",i_n, ifil[0]);
+
+    for (i=0;i<i_n;++i) {
+        if ((i_n==1) && !strcmp(ifil[0], "-")) {
+            fin[i].fin=stdin;
+            fin[i].gz=0;
+        } else {
+            fin[i].fin=gzopen(ifil[i], "r", &fin[i].gz);
+            // fail early instead of reading from a NULL stream later on
+            if (!fin[i].fin) {
+                fprintf(stderr, "Error opening input file '%s': %s\n", ifil[i], strerror(errno));
+                return 1;
+            }
+        }
+    }
+
+    struct ad ad[MAX_ADAPTER_NUM+1];
+    memset(ad, 0, sizeof(ad));
+
+    int acnt=0, ok=0, rno=0;    // adapter count, ok flag, record number
+
+    if (ain) {
+        while (acnt < MAX_ADAPTER_NUM && (ok = read_fa(ain, rno, &ad[acnt]))) {
+            ++rno;
+            if (ok < 0)
+                break;
+            // copy in truncated to max scan length
+            strncpy(ad[acnt].escan, ad[acnt].seq, SCANLEN);
+            ad[acnt].escan[SCANLEN] = '\0';
+            //fprintf(stderr, "escan: %s, %s\n", ad[acnt].id, ad[acnt].escan);
+            ++acnt;
+        }
+
+        if (acnt == 0) {
+            fprintf(stderr, "No adapters in file '%s'\n",afil);
+        }
+    }
+
+    fprintf(fstat, "Scale used: %g\n", scale);
+    int maxns = 0;                          // max sequence length
+    int avgns[MAX_FILES]; meminit(avgns);   // average sequence length per file
+    // read length
+    for (i=0;i<i_n;++i) {
+
+        char *s = NULL; size_t na = 0; int nr = 0, ns = 0;
+        char *q = NULL; size_t naq = 0; int nq =0;
+        int j;
+        int ilv3det=2;
+        int skipped = 0;
+
+        // stat() fails for stdin ("-"); zero the struct first so st_size is 0
+        // rather than indeterminate, which disables poor-quality skipping
+        struct stat st;
+        memset(&st, 0, sizeof(st));
+        stat(ifil[i], &st);
+
+        while (fin[i].getline(&s, &na) > 0) {
+            if (*s == '@') {
+                // look for illumina purity filtering flags
+                if (ilv3det==2) {
+                    ilv3det=0;
+                    const char *p=strchr(s, ':');
+                    if (p) {
+                        ++p;
+                        if (isdigit(*p)) {
+                            p=strchr(s, ' ');
+                            if (p) {
+                                ++p;
+                                if (isdigit(*p)) {
+                                    ++p;
+                                    if (*p ==':') {
+                                        ++p;
+                                        if (*p =='Y') {
+                                            // filtering found
+                                            ilv3det=1;
+                                        } else if (*p =='N') {
+                                            // still illumina
+                                            ilv3det=2;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+
+                if ((ns=fin[i].getline(&s, &na)) <=0) {
+                    // reached EOF
+                    if (debug) fprintf(stderr, "Dropping out of sampling loop\n");
+                    break;
+                }
+
+                nq=fin[i].getline(&q, &naq);
+                nq=fin[i].getline(&q, &naq);        // qual is 2 lines down
+
+                // skip poor quals/lots of N's when doing sampling
+                if (st.st_size > (sampcnt * 500) && (skipped < sampcnt) && poorqual(i, ns, s, q)) {
+                    if (debug) fprintf(stderr, "Skip poorqual\n");
+                    ++skipped;
+                    continue;
+                }
+
+                if (phred == 0) {
+                    --nq;
+                    for (j=0;j<nq;++j) {
+                        if (q[j] < 64) {
+                            if (debug) fprintf(stderr, "Using phred 33, because saw: %c\n", q[j]);
+                            // default to sanger 33, if you see a qual < 64
+                            phred = 33;
+                            break;
+                        }
+                    }
+                }
+                --ns;                               // don't count newline for read len
+                ++nr;
+                avgns[i] += ns;
+                if (ns > maxns) maxns = ns;
+
+                // just 10000 reads for readlength sampling
+                if (nr >= 10000) {
+                    if (debug) fprintf(stderr, "Read 10000\n");
+                    break;
+                }
+            } else {
+                fprintf(stderr, "Invalid FASTQ format : %s\n", ifil[i]);
+                break;
+            }
+        }
+        if (ilv3det == 1 && (ilv3 == -1)) {
+            ilv3=1;
+        }
+        if (debug) fprintf(stderr,"Ilv3det: %d\n", ilv3det);
+        if (s) free(s);
+        if (q) free(q);
+        if (nr)
+            avgns[i] = avgns[i]/nr;
+    }
+
+    if (ilv3 == -1) {
+        ilv3 = 0;
+    }
+
+    if (ilv3) {
+        fprintf(fstat, "Filtering Illumina reads on purity field\n");
+    }
+
+    // default to illumina 64 if you never saw a qual < 33
+    if (phred == 0) phred = 64;
+    fprintf(fstat, "Phred: %d\n", phred);
+
+    for (i=0;i<i_n;++i) {
+        if (avgns[i] == 0) {
+            fprintf(stderr, "No records in file %s\n", ifil[i]);
+            exit(1);
+        }
+    }
+
+    for (i=0;i<i_n;++i) {
+        fin[i].reset();
+    }
+
+    if (debug) fprintf(stderr,"Max ns: %d, Avg[0]: %d\n", maxns, avgns[0]);
+
+    // total base count per read position in sample
+    int balloc = maxns;
+    bool dobcnt = 1;
+    if (maxns > 500) {
+        // very long reads: skip per-position counting and keep the array tiny
+        dobcnt = 0;
+        balloc = 1;
+    }
+
+    int bcnt[MAX_FILES][2][balloc][6]; meminit(bcnt);
+    int qcnt[MAX_FILES][2]; meminit(qcnt);
+    char qmin=127, qmax=0;
+    int nsampcnt = 0;
+    double stat_lowcom_total=0, stat_lowcom_ssq=0, stat_lowcom_b4_total=0, stat_lowcom_b4_ssq=0;
+    long stat_lowcom_cnt=0, stat_lowcom_b4_cnt=0;
+
+    for (i=0;i<i_n;++i) {
+
+        // see the first sampling pass: zero so a failed stat() yields size 0
+        struct stat st;
+        memset(&st, 0, sizeof(st));
+        stat(ifil[i], &st);
+
+        // todo, use readfq
+        char *s = NULL; size_t na = 0; int ns = 0, nr = 0;
+        char *q = NULL; size_t naq = 0; int nq =0;
+        char *d = NULL; size_t nad = 0; int nd =0;
+
+        int skipped = 0;
+        while ((nd=fin[i].getline(&d, &nad)) > 0) {
+            if (*d == '@') {
+                if ((ns=fin[i].getline(&s, &na)) <=0)
+                    break;
+                nq=fin[i].getline(&q, &naq);
+                nq=fin[i].getline(&q, &naq);        // qual is 2 lines down
+
+                --nq; --ns;                         // don't count newline for read len
+
+                // skip poor quals/lots of N's when doing sampling (otherwise you'll miss some)
+                if ((st.st_size > (sampcnt * 500)) && (skipped < sampcnt) && poorqual(i, ns, s, q)) {
+                    ++skipped;
+                    continue;
+                }
+
+                if (nq != ns) {
+                    if (warncount < MAXWARN) {
+                        fprintf(stderr, "Warning, corrupt quality for sequence: %s", s);
+                        ++warncount;
+                    }
+                    continue;
+                }
+
+                if (i > 0 && avgns[i] < 11)         // reads of avg length < 11 ? barcode lane, skip it
+                    continue;
+
+                if (ilv3) {                         // illumina purity filtering
+                    char * p = strchr(d, ' ');
+                    // p[1] is checked first so p[2]/p[3] never read past the terminator
+                    if (p && p[1] && p[2]==':' && p[3]=='Y') {
+                        continue;
+                    }
+                }
+
+                ++nr;
+
+                // to be safe, we don't assume reads are fixed-length, not any slower, just a little more code
+                if (dobcnt) {
+                    int b;
+                    for (b = 0; b < ns/2 && b < maxns; ++b) {
+                        ++bcnt[i][0][b][char2bp(s[b])];         // count from begin
+                        ++bcnt[i][0][b][B_CNT];                 // count of samples at position
+                        ++bcnt[i][1][b][char2bp(s[ns-b-1])];    // count from end
+                        ++bcnt[i][1][b][B_CNT];                 // count of samples at offset-from-end position
+                    }
+                }
+                qcnt[i][0]+=((q[0]-phred)<qthr);    // count of q<thr for last (first trimmable) base
+                qcnt[i][1]+=((q[ns-1]-phred)<qthr);
+                //fprintf(stderr,"qcnt i%d e0=%d, e1=%d\n", i, qcnt[i][0], qcnt[i][1]);
+
+                int a;
+                char buf[SCANLEN+1];
+                strncpy(buf, s, SCANLEN);
+                buf[SCANLEN] = '\0';                // strncpy does not terminate when s is SCANLEN chars or longer
+                for(a=0;a<acnt;++a) {
+                    char *p;
+                    // search whole seq for 15 char "end" of adap string
+                    if ((p = strstr(s+1, ad[a].escan))) {
+                        if (debug > 1) fprintf(stderr, " END S: %s A: %s (%s), P: %d, SL: %d, Z:%d\n", s, ad[a].id, ad[a].escan, (int) (p-s), ns, (p-s) == ns-SCANLEN);
+                        // found at the very end
+                        if ((p-s) == ns-SCANLEN)
+                            ++ad[a].ecntz[i];
+                        ++ad[a].ecnt[i];
+                    }
+                    // search 15 char begin of seq in longer adap string
+                    int slen;
+                    if (SCANLEN <= ad[a].nseq) {
+                        slen = SCANLEN;
+                        p = strstr(ad[a].seq, buf);
+                    } else {
+                        slen = ad[a].nseq;
+                        if (!strncmp(ad[a].seq,buf,ad[a].nseq))
+                            p=ad[a].seq;
+                        else
+                            p=NULL;
+                    }
+                    if (p) {
+                        if (debug > 1) fprintf(stderr, "BEGIN S: %s A: %s (%s), P: %d, SL: %d, Z:%d\n", buf, ad[a].id, ad[a].seq, (int) (p-ad[a].seq), ns, (p-ad[a].seq ) == ad[a].nseq-slen);
+                        // found the end of the adapter
+                        if (p-ad[a].seq == ad[a].nseq-slen)
+                            ++ad[a].bcntz[i];
+                        ++ad[a].bcnt[i];
+                    }
+                }
+            }
+            if (fin[i].full() || nr >= sampcnt)     // enough samples
+                break;
+        }
+        if (s) free(s);
+        if (d) free(d);
+        if (q) free(q);
+        if (i == 0 || avgns[i] >= 11) {
+            if (nsampcnt == 0 || nr < nsampcnt)     // fewer than max, set for thresholds
+                nsampcnt=nr;
+        }
+    }
+
+    if (nsampcnt == 0) {
+        fprintf(stderr, "ERROR: Unable to read file for subsampling\n");
+        exit(1);
+    }
+
+    sampcnt = nsampcnt;
+    int sktrim[i_n][2]; meminit(sktrim);
+
+    // look for severe base skew, and auto-trim ends based on it
+    int needqtrim=0;
+    if (dobcnt) {
+        if (sampcnt > 0 && skewpct > 0) {
+            for (i=0;i<i_n;++i) {
+                if (avgns[i] < 11)                  // reads of avg length < 11 ? barcode lane, skip it
+                    continue;
+                int e;
+                for (e = 0; e < 2; ++e) {
+                    // 5% qual less than low-threshold? need qualtrim
+                    if (qthr > 0 && (100.0*qcnt[i][e])/sampcnt > 5) {
+                        needqtrim = 1;
+                    }
+
+                    int p;
+                    for (p = 0; p < maxns/2; ++p) {
+                        int b;
+
+                        int skth = (int) ( (float) bcnt[i][e][p][B_CNT] * ( skewpct / 100.0 ) ) ;   // skew threshold
+                        int thr_n = (int) ( (float) bcnt[i][e][p][B_CNT] * ( pctns / 100.0 ) );     // n-threshold
+
+                        if (debug > 1)
+                            fprintf(stderr,"Sk Prof [%d, %d]: skth=%d, bcnt=%d, ncnt=%d, a=%d, c=%d, g=%d, t=%d\n", e, p, skth,
+                                    bcnt[i][e][p][B_CNT], bcnt[i][e][p][B_N], bcnt[i][e][p][B_A],
+                                    bcnt[i][e][p][B_C], bcnt[i][e][p][B_G], bcnt[i][e][p][B_T]);
+
+                        if (skth < 10)              // too few samples to detect skew
+                            continue;
+
+                        int tr = 0;
+                        for (b = 0; b < 4; ++b) {
+                            if (bcnt[i][e][p][b] < skth) {      // too few bases of this type
+                                tr=1;
+                                if (debug > 1)
+                                    fprintf(stderr, "Skew at i:%d e:%d p:%d b:%d\n", i, e, p, b);
+                                break;
+                            }
+                        }
+                        if (bcnt[i][e][p][B_N] > thr_n) {       // too many n's
+                            if (debug > 1)
+                                fprintf(stderr, "Too many N's at i:%d e:%d p:%d b:%d ( %d > %d )\n", i, e, p, b, bcnt[i][e][p][B_N], thr_n);
+                            tr=1;
+                        }
+
+                        if (tr) {
+                            if (p == sktrim[i][e]) {            // adjacent, so increase trim
+                                ++sktrim[i][e];
+                            } else {
+                                fprintf(fstat, "Within-read Skew: Position %d from the %s of reads is skewed!\n", p, e==0?"start":"end");
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    }
+
+    int e;
+    bool someskew = false;
+    // if skew trimming would leave less than nkeep bases, give up on it for all files
+    for (i=0;i<i_n;++i) {
+        int totskew = sktrim[i][0] + sktrim[i][1];
+        if ((maxns - totskew) < nkeep) {
+            if (totskew > 0) {
+                fprintf(fstat, "Warning: Too much skewing found (%d), disabling skew clipping\n", totskew);
+            }
+            meminit(sktrim);
+            break;
+        }
+    }
+
+    for (i=0;i<i_n;++i) {
+        for (e=0;e<2;++e) {
+            if (sktrim[i][e] > 0) {
+                fprintf(fstat, "Trim '%s': %d from %s\n", e==0?"start":"end", sktrim[i][e], ifil[i]);
+                someskew=true;
+            }
+        }
+    }
+
+    int athr = (int) ((float)sampcnt * minpct) / 100;
+    fprintf(fstat, "Threshold used: %d out of %d\n", athr+1, sampcnt);
+
+    // keep only adapters seen often enough in at least one file, compacting ad[] in place
+    int a;
+    int newc=0;
+    for(a=0;a<acnt;++a) {
+        int any=0;
+        for (i=0;i<i_n;++i) {
+            if (debug) fprintf(stderr, "ad:%s, EC:%d, BC:%d, ECZ: %d, BCZ: %d\n", ad[a].id, ad[a].ecnt[i], ad[a].bcnt[i], ad[a].ecntz[i], ad[a].bcntz[i]);
+            if (ad[a].ecnt[i] > athr || ad[a].bcnt[i] > athr) {
+                int cnt;
+                // heavily weighted toward start/end maches
+                if ((ad[a].ecnt[i] + 10*ad[a].ecntz[i]) >= (ad[a].bcnt[i] + 10*ad[a].bcntz[i])) {
+                    ad[a].end[i]='e';
+                    cnt = ad[a].ecnt[i];
+                } else {
+                    ad[a].end[i]='b';
+                    cnt = ad[a].bcnt[i];
+                }
+
+                // an "_3p" / "_5p" tag in the adapter id forces the end / start respectively
+                char *p;
+                if ((p=strstr(ad[a].id, "_3p"))) {
+                    if (p[3] == '\0' || p[3] == '_') {
+                        ad[a].end[i]='e';
+                        cnt = ad[a].ecnt[i];
+                    }
+                } else if ((p=strstr(ad[a].id, "_5p"))) {
+                    if (p[3] == '\0' || p[3] == '_') {
+                        ad[a].end[i]='b';
+                        cnt = ad[a].bcnt[i];
+                    }
+                }
+
+                // user supplied end.... don't clip elsewhere
+                if (end[i] && ad[a].end[i] != end[i])
+                    continue;
+
+                if (scale >= 100)
+                    ad[a].thr[i] = ad[a].nseq;
+                else
+                    ad[a].thr[i] = min(ad[a].nseq,max(nmin,(int) (-log(cnt / (float) sampcnt)/log(scale))));
+
+                fprintf(fstat, "Adapter %s (%s): counted %d at the '%s' of '%s', clip set to %d", ad[a].id, ad[a].seq, cnt, ad[a].end[i] == 'e' ? "end" : "start", ifil[i], ad[a].thr[i]);
+                if (abs((ad[a].bcnt[i]-ad[a].ecnt[i])) < athr/4) {
+                    fprintf(fstat, ", warning end was not reliable\n");
+                } else {
+                    fputc('\n', fstat);
+                }
+                ++any;
+            }
+        }
+        if (!any)
+            continue;
+        ad[newc++]=ad[a];
+    }
+
+    acnt=newc;
+
+    if (acnt == 0 && !someskew && !needqtrim && !ilv3) {
+        fprintf(fstat, "No adapters found");
+        if (skewpct > 0) fprintf(fstat, ", no skewing detected");
+        if (qthr > 0) fprintf(fstat, ", and no trimming needed");
+        fprintf(fstat, ".\n");
+        if (noclip) exit (1);                       // for including in a test
+    } else {
+        if (debug) fprintf(stderr, "acnt: %d, ssk: %d, needq: %d\n", acnt, someskew, needqtrim);
+        if (noclip) {
+            if (acnt == 0) fprintf(fstat, "No adapters found. ");
+            if (someskew) fprintf(fstat, "Skewing detected. ");
+            if (needqtrim) fprintf(fstat, "Quality trimming is needed. ");
+            fprintf(fstat, "\n");
+        }
+    }
+
+    if (noclip)
+        exit(0);
+
+    for (i=0;i<o_n;++i) {
+        if (!strcmp(ofil[i],"-")) {
+            fout[i]=stdout;
+        } else {
+            fout[i]=gzopen(ofil[i], "w", &gzout[i]);
+            // fail early instead of writing to a NULL stream later on
+            if (!fout[i]) {
+                fprintf(stderr, "Error opening output file '%s': %s\n", ofil[i], strerror(errno));
+                return 1;
+            }
+        }
+    }
+
+    FILE *fskip[MAX_FILES]; meminit(fskip);
+    bool gzskip[MAX_FILES]; meminit(gzskip);
+
+    if (skipb) {
+        for (i=0;i<o_n;++i) {
+            if (!strcmp(ofil[i],"-")) {
+                fskip[i]=stderr;
+            } else {
+                // build "<out>.skip" or, for gzipped output, "<out-without-.gz>.skip.gz"
+                char *skipfil = (char *) malloc(strlen(ofil[i])+10);
+                if (!strcmp(fext(ofil[i]),".gz")) {
+                    char *p=(char *)strrchr(ofil[i],'.');
+                    *p='\0';
+                    sprintf(skipfil, "%s.skip.gz", ofil[i]);
+                    *p='.';
+                } else {
+                    sprintf(skipfil, "%s.skip", ofil[i]);
+                }
+                if (!(fskip[i]=gzopen(skipfil, "w", &gzskip[i]))) {
+                    fprintf(stderr, "Error opening skip file '%s': %s\n",skipfil, strerror(errno));
+                    free(skipfil);
+                    return 1;
+                }
+                free(skipfil);
+            }
+        }
+    }
+
+    struct fq fq[MAX_FILES];
+    memset(&fq, 0, sizeof(fq));
+
+    int nrec=0;
+    int nerr=0;
+    int nok=0;
+    int ntooshort=0;
+    int ntoohompol=0;
+    int ntoolowcom=0;
+    int nfiltered=0;
+    // total per read
+    int ntrim[MAX_FILES]; meminit(ntrim);
+
+    // total per end
+    int cnttrim[MAX_FILES][2]; meminit(cnttrim);
+    double tottrim[MAX_FILES][2]; meminit(tottrim);
+    double ssqtrim[MAX_FILES][2]; meminit(ssqtrim);
+    int trimql[MAX_FILES]; meminit(trimql);
+    int trimqb[MAX_FILES]; meminit(trimqb);
+    int nilv3pf=0;          // number of illumina version 3 purity filitered
+    int read_ok;
+
+    if (i_n > 0)
+        fprintf(fstat, "Files: %d\n", i_n);
+
+    for (i=0;i<i_n;++i) {
+        fin[i].reset();
+    }
+
+    google::sparse_hash_map <std::string, int>::const_iterator lookup_it;
+
+    bool io_ok = true;
+
+    while ((read_ok=fin[0].read_fq(nrec, &fq[0]))) {
+        for (i=1;i<i_n;++i) {
+            // read the i-th mate from the i-th input (was hard-coded to fin[1])
+            int mok=fin[i].read_fq(nrec, &fq[i]);
+            if (mok != read_ok) {
+                fprintf(stderr, "# of rows in mate file '%s' doesn't match, quitting!\n", ifil[i]);
+                return 1;
+            }
+        }
+        ++nrec;
+        if (read_ok < 0) {
+            ++nerr;
+            continue;
+        }
+
+        if (ilv3) {
+            char * p = strchr(fq[0].id.s, ' ');
+            // p[1] is checked first so p[2]/p[3] never read past the terminator
+            if (p && p[1] && p[2]==':' && p[3]=='Y') {
+                ++nilv3pf;
+                if (skipb) saveskip(fskip, i_n, fq);
+                continue;
+            }
+        }
+
+        // chomp
+
+        int dotrim[MAX_FILES][2];
+        int skip = 0;       // skip whole record?
+        int hompol_seq=0;
+        int hompol_cnt=0;
+        int lowcom_seq=0;
+        int lowcom_cnt=0;
+        int f;
+        for (f=0;f<i_n;++f) {
+            dotrim[f][0] = sktrim[f][0];            // default, trim to detected skew levels
+            dotrim[f][1] = sktrim[f][1];
+            if (avgns[f] < 11)
+                // reads of avg length < 11 ? barcode lane, skip it
+                continue;
+
+
+            if (have_phred_adjust) {
+                for (i=0;i<fq[f].qual.n;++i) {
+                    // only index phred_adjust[] with scores inside the range --phred-adjust accepts
+                    int qi = fq[f].qual.s[i]-phred;
+                    if (qi >= 0 && qi < MAX_PHRED && phred_adjust[qi]) {
+                        fq[f].qual.s[i]+=phred_adjust[qi];
+                    }
+                }
+            }
+
+            if (phred_adjust_max) {
+                for (i=0;i<fq[f].qual.n;++i) {
+                    if ((fq[f].qual.s[i]-phred)>phred_adjust_max) {
+                        fq[f].qual.s[i]=phred_adjust_max+phred;
+                    }
+                }
+            }
+
+
+            // positive cycle = 1-based from the start, negative = offset from the end
+            for (i=0;i<cycle_adjust.size();++i) {
+                if (abs(cycle_adjust[i].pos) < fq[f].qual.n) {
+                    if (cycle_adjust[i].pos>0) {
+                        fq[f].qual.s[cycle_adjust[i].pos-1]+=cycle_adjust[i].adj;
+                    } else {
+                        fq[f].qual.s[fq[f].qual.n+cycle_adjust[i].pos]+=cycle_adjust[i].adj;
+                    }
+                }
+            }
+
+
+            if (rmns) {
+                for (i=dotrim[f][0];i<(fq[f].seq.n);++i) {
+                    // trim N's from the front
+                    if (fq[f].seq.s[i] == 'N')
+                        dotrim[f][0] = i + 1;
+                    else
+                        break;
+                }
+                for (i=dotrim[f][1];i<(fq[f].seq.n);++i) {
+                    // trim N's from the end
+                    if (fq[f].seq.s[fq[f].seq.n-i-1] == 'N')
+                        dotrim[f][1] = i + 1;
+                    else
+                        break;
+                }
+            }
+
+            if (hompol_filter) {
+                for (i = dotrim[f][0]+1;i<fq[f].seq.n;++i) {
+                    // N's always match everything
+                    if (fq[f].seq.s[i] == 'N' || (fq[f].seq.s[i] == fq[f].seq.s[i-1])) {
+                        ++hompol_seq;
+                    }
+                    ++hompol_cnt;
+                }
+            }
+
+            if (lowcom_filter) {
+                for (i = dotrim[f][0]+1;i<fq[f].seq.n;++i) {
+                    // N's always match everything
+                    if (fq[f].seq.s[i] == 'N' || (fq[f].seq.s[i] == fq[f].seq.s[i-1])) {
+                        ++lowcom_seq;
+                    } else if (i >= dotrim[f][0]+3) {
+                        if (fq[f].seq.s[i] == fq[f].seq.s[i-2] && fq[f].seq.s[i-1] == fq[f].seq.s[i-3]) {
+                            ++lowcom_seq;
+                        }
+                    } else if (i >= dotrim[f][0]+3) {
+                        // NOTE(review): same condition as the branch above, so this is never
+                        // reached; left untouched because enabling it would change filter results
+                        if (fq[f].seq.s[i] == fq[f].seq.s[i-3] && fq[f].seq.s[i-1] == fq[f].seq.s[i-4] && fq[f].seq.s[i-3] == fq[f].seq.s[i-5]) {
+                            ++lowcom_seq;
+                        }
+                    }
+                    ++lowcom_cnt;
+                }
+            }
+
+            if (qthr > 0) {
+                bool istrimq = false;
+
+                // trim qual from the begin
+                for (i=dotrim[f][0];i<(fq[f].seq.n);++i) {
+                    if (qwin > 1 && (meanqwin(fq[f].qual.s,fq[f].seq.n,i,qwin)-phred) < qthr) {
+                        ++trimqb[f];
+                        istrimq = true;
+                        dotrim[f][0] = i + 1;
+                    } else if ((fq[f].qual.s[i]-phred) < qthr) {
+                        ++trimqb[f];
+                        istrimq = true;
+                        dotrim[f][0] = i + 1;
+                    } else
+                        break;
+                }
+
+                // trim qual from the end
+                for (i=dotrim[f][1];i<(fq[f].seq.n);++i) {
+                    if (qwin > 1 && (meanqwin(fq[f].qual.s,fq[f].seq.n,fq[f].seq.n-i-1,qwin)-phred) < qthr) {
+                        ++trimqb[f];
+                        istrimq = true;
+                        dotrim[f][1] = i + 1;
+                    } else if ((fq[f].qual.s[fq[f].seq.n-i-1]-phred) < qthr) {
+                        ++trimqb[f];
+                        istrimq = true;
+                        dotrim[f][1] = i + 1;
+                    } else
+                        break;
+                }
+                if (istrimq) ++trimql[f];
+            }
+
+            int bestscore_e = INT_MAX, bestoff_e = 0, bestlen_e = 0;
+            int bestscore_b = INT_MAX, bestoff_b = 0, bestlen_b = 0;
+
+            for (i =0; i < acnt; ++i) {
+                if (debug) fprintf(stderr, "seq[%d]: %s %d\n", f, fq[f].seq.s, fq[f].seq.n);
+
+                if (!ad[i].end[f])
+                    continue;
+
+                int nmatch = ad[i].thr[f];
+                if (!nmatch) nmatch = ad[i].nseq;   // full match required if nmin == 0
+
+                // how far in to search for a match?
+                int mx = ad[i].nseq;
+                if (xmax) {
+                    mx = fq[f].seq.n;
+                    if (xmax > 0 && (xmax+ad[i].nseq) < mx)
+                        mx = xmax+ad[i].nseq;       // xmax is added to adapter length
+                }
+
+                if (debug)
+                    fprintf(stderr, "adapter: %s, adlen: %d, nmatch: %d, mx: %d\n", ad[i].seq, ad[i].nseq, nmatch, mx);
+
+                if (ad[i].end[f] == 'e') {
+                    int off;
+                    for (off = nmatch; off <= mx; ++off) {              // off is distance from tail of sequence
+                        char *seqtail = fq[f].seq.s+fq[f].seq.n-off;    // search at tail
+                        int ncmp = off<ad[i].nseq ? off : ad[i].nseq;
+                        int mind = (pctdiff * ncmp) / 100;
+                        int d = hd(ad[i].seq,seqtail,ncmp);             // # differences
+                        if (debug>1)
+                            fprintf(stderr, "tail: %s, bestoff: %d, off: %d, ncmp: %d, mind: %d, hd %d\n", seqtail, bestoff_e, off, ncmp, mind, d);
+                        if (d <= mind) {
+                            // squared-distance over length
+                            int score = (1000*(d*d+1))/ncmp;
+                            if (score <= bestscore_e) {                 // better score?
+                                bestscore_e = score;                    // save max score
+                                bestoff_e = off;                        // offset at max
+                                bestlen_e = ncmp;                       // cmp length at max
+                            }
+                            if (d == 0 && (ncmp == ad[i].nseq)) {
+                                break;
+                            }
+                        }
+                    }
+                } else {
+                    int off;
+                    for (off = nmatch; off <= mx; ++off) {              // off is distance from start of sequence
+                        int ncmp = off<ad[i].nseq ? off : ad[i].nseq;   // number we are comparing
+                        char *matchtail = ad[i].seq+ad[i].nseq-ncmp;    // tail of adapter
+                        char *seqstart = fq[f].seq.s+off-ncmp;          // offset into sequence (if any)
+                        int mind = (pctdiff * ncmp) / 100;
+                        int d = hd(matchtail,seqstart,ncmp);            // # differences
+                        if (debug>1)
+                            fprintf(stderr, "bestoff: %d, off: %d, ncmp: %d, mind: %d, hd %d\n", bestoff_e, off, ncmp, mind, d);
+
+                        if (d <= mind) {
+                            int score = (1000*(d*d+1))/ncmp;
+                            if (score <= bestscore_b) {                 // better score?
+                                bestscore_b = score;                    // save max score
+                                bestoff_b = off;                        // offset at max
+                                bestlen_b = ncmp;                       // cmp length at max
+                            }
+                            if (d == 0 && (ncmp == ad[i].nseq)) {
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+            // lengthen trim based on best level
+            if (bestoff_b > dotrim[f][0])
+                dotrim[f][0]=bestoff_b;
+
+            if (bestoff_e > dotrim[f][1])
+                dotrim[f][1]=bestoff_e;
+
+            int totclip = min(fq[f].seq.n,dotrim[f][0] + dotrim[f][1]);
+
+//          if (debug > 1) fprintf(stderr,"totclip %d\n", totclip);
+
+            if (totclip > 0) {
+                // keep length > X, X based on mate
+                int tkeep = f == 0 ? nkeep : qf2_min_len > 0 ? qf2_min_len : nkeep;
+
+                if ( (fq[f].seq.n-totclip) < tkeep) {
+                    // skip all reads if one is severely truncated ??
+                    // maybe not... ?
+                    skip = 1;
+                    break;
+                }
+
+                // count number of adapters clipped, not the number of rows trimmed
+                if (bestoff_b > 0 || bestoff_e > 0)
+                    ++ntrim[f];
+
+                // save some stats
+                if (bestoff_b > 0) {
+                    cnttrim[f][0]++;
+                    tottrim[f][0]+=bestoff_b;
+                    ssqtrim[f][0]+=bestoff_b*bestoff_b;
+                }
+                if (bestoff_e > 0) {
+                    cnttrim[f][1]++;
+                    tottrim[f][1]+=bestoff_e;
+                    ssqtrim[f][1]+=bestoff_e*bestoff_e;
+                }
+
+            } else {
+                // skip even if the original was too short
+                if (fq[f].seq.n < nkeep)
+                    skip = 1;
+            }
+        }
+
+        int hompol_skip=0;
+        if (hompol_filter) {
+            int hompol_max = hompol_pct * hompol_cnt;
+            if (debug>0) printf("%s: hompol cnt:%d, max:%d, seq:%d\n", fq[0].id.s, hompol_cnt, hompol_max, hompol_seq);
+            if (hompol_seq>=hompol_max) hompol_skip = skip = true;
+        }
+
+        int lowcom_skip=0;
+        if (!hompol_skip && lowcom_filter) {
+            int lowcom_max = lowcom_pct * lowcom_cnt;
+            if (debug>0) printf("%s: lowcom cnt:%d, max:%d, seq:%d\n", fq[0].id.s, lowcom_cnt, lowcom_max, lowcom_seq);
+            if (lowcom_seq>=lowcom_max) lowcom_skip = skip = true;
+            // guard the ratio: with no scored positions 0/0 would put NaN into the running totals
+            double lowcom_ratio = lowcom_cnt > 0 ? ((double)lowcom_seq/(double)lowcom_cnt) : 0.0;
+            if (!lowcom_skip) {
+                stat_lowcom_total+=lowcom_ratio;
+                stat_lowcom_ssq+=pow(lowcom_ratio,2);
+                stat_lowcom_cnt+=1;
+            }
+            stat_lowcom_b4_total+=lowcom_ratio;
+            stat_lowcom_b4_ssq+=pow(lowcom_ratio,2);
+            stat_lowcom_b4_cnt+=1;
+        }
+
+
+        if (!skip) {
+            int f;
+            for (f=0;f<o_n;++f) {
+                if (dotrim[f][1] >= strlen(fq[f].seq.s)) {
+                    if (debug) fprintf(stderr,"trimmming full sequence from end (%d), %s", dotrim[f][1], fq[f].id.s);
+                    skip=1;
+                    continue;
+                }
+                if (dotrim[f][1] > 0) {
+                    if (debug) fprintf(stderr,"trimming %d from end, %s", dotrim[f][1], fq[f].id.s);
+                    fq[f].seq.s[fq[f].seq.n -=dotrim[f][1]]='\0';
+                    fq[f].qual.s[fq[f].qual.n-=dotrim[f][1]]='\0';
+                }
+                if (dotrim[f][0] > 0) {
+                    if (debug) fprintf(stderr,"trimming %d from begin, %s", dotrim[f][0], fq[f].id.s);
+                    fq[f].seq.n -= dotrim[f][0];
+                    fq[f].qual.n -= dotrim[f][0];
+                    if (fq[f].seq.n < 0) {
+                        fq[f].seq.n = 0;
+                        fq[f].qual.n = 0;
+                    }
+                    memmove(fq[f].seq.s ,fq[f].seq.s +dotrim[f][0],fq[f].seq.n );
+                    memmove(fq[f].qual.s,fq[f].qual.s+dotrim[f][0],fq[f].qual.n);
+                    fq[f].seq.s[fq[f].seq.n]='\0';
+                    fq[f].qual.s[fq[f].qual.n]='\0';
+                }
+                if (nmax > 0) {
+                    if (fq[f].seq.n >= nmax ) {
+                        fq[f].seq.s[nmax]='\0';
+                        fq[f].qual.s[nmax]='\0';
+                    }
+                }
+                if (avgns[f]>=11 && !evalqual(fq[f],f)) {
+                    skip = 2;       // 2==qual
+                }
+            }
+
+            if (duplen > 0 && !skip) {
+                // lookup dupset
+                for (f=0;!skip&&f<o_n;++f) {
+                    if (avgns[f]>=11) {
+                        char t = '\0';
+                        if (fq[f].seq.a > duplen) {
+                            // truncate if needed
+                            t = fq[f].seq.s[duplen];
+                            fq[f].seq.s[duplen] = '\0';
+                        }
+                        lookup_it = dupset.find(fq[f].seq.s);
+                        if (lookup_it != dupset.end()) {
+                            skip=1; // 1==dup
+                        } else {
+                            if (dupset.size() < dupmax) {
+                                dupset[fq[f].seq.s]=1;
+                            }
+                        }
+                        if (fq[f].seq.a > duplen) {
+                            // restore full length
+                            fq[f].seq.s[duplen] = t;
+                        }
+                    }
+                }
+            }
+            if (!skip) {
+                for (f=0;f<o_n;++f) {
+                    io_ok=io_ok&&(fputs(fq[f].id.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputs(fq[f].seq.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputc('\n',fout[f])>=0);
+                    io_ok=io_ok&&(fputs(fq[f].com.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputs(fq[f].qual.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputc('\n',fout[f])>=0);
+                }
+            } else {
+                if (skipb) saveskip(fskip, i_n, fq);
+                if (skip==2) ++nfiltered;
+                if (skip==1) ++dupskip;
+            }
+        } else {
+            if (skipb) saveskip(fskip, i_n, fq);
+            if (hompol_skip) {
+                ++ntoohompol;
+            } else if (lowcom_skip) {
+                ++ntoolowcom;
+            } else {
+                ++ntooshort;
+            }
+        }
+    }
+
+    for (i=0;i<i_n;++i) {
+        if (fout[i]) { io_ok = io_ok && ( gzout[i] ? !pclose(fout[i]) : !fclose(fout[i]) ); }
+        fin[i].close();
+        if (fskip[i]) { if (gzskip[i]) pclose(fskip[i]); else fclose(fskip[i]); }
+    }
+
+    if (!io_ok) {
+        fprintf(fstat, "Error during file close, possible partial write, failing\n");
+    }
+
+    fprintf(fstat, "Total reads: %d\n", nrec);
+    fprintf(fstat, "Too short after clip: %d\n", ntooshort);
+    if (nfiltered)
+        fprintf(fstat, "Filtered on quality: %d\n", nfiltered);
+    if (dupskip)
+        fprintf(fstat, "Filtered on duplicates: %d\n", dupskip);
+    if (ntoohompol)
+        fprintf(fstat, "Filtered on hompolymer: %d\n", ntoohompol);
+    if (ntoolowcom)
+        fprintf(fstat, "Filtered on low complexity: %d\n", ntoolowcom);
+    if (stat_lowcom_b4_total > 0) {
+        fprintf(fstat, "Mean lowcom score: %2.2f(%2.2f), %2.2f(%2.2f) after\n",
+                100*(stat_lowcom_b4_total/(double)stat_lowcom_b4_cnt), 100*stdev(stat_lowcom_b4_cnt,stat_lowcom_b4_total,stat_lowcom_b4_ssq),
+                100*(stat_lowcom_total/(double)stat_lowcom_cnt), 100*stdev(stat_lowcom_cnt,stat_lowcom_total,stat_lowcom_ssq)
+               );
+    }
+
+    int f;
+    if (i_n == 1) {
+        f=0;
+        for (e=0;e<2;++e) {
+            if (cnttrim[f][e]>0) {
+                fprintf(fstat, "Clipped '%s' reads: Count: %d, Mean: %.2f, Sd: %.2f\n", e==0?"start":"end", cnttrim[f][e], (double) tottrim[f][e] / cnttrim[f][e], stdev(cnttrim[f][e], tottrim[f][e], ssqtrim[f][e]));
+            }
+        }
+        if (trimql[f] > 0) {
+            fprintf(fstat, "Trimmed %d reads by an average of %.2f bases on quality < %d\n", trimql[f], (float) trimqb[f]/trimql[f], qthr);
+        }
+    } else
+        for (f=0;f<i_n;++f) {
+            for (e=0;e<2;++e) {
+                if (cnttrim[f][e]>0) {
+                    fprintf(fstat, "Clipped '%s' reads (%s): Count %d, Mean: %.2f, Sd: %.2f\n", e==0?"start":"end", ifil[f], cnttrim[f][e], (double) tottrim[f][e] / cnttrim[f][e], stdev(cnttrim[f][e], tottrim[f][e], ssqtrim[f][e]));
+                }
+            }
+            if (trimql[f] > 0) {
+                fprintf(fstat, "Trimmed %d reads (%s) by an average of %.2f bases on quality < %d\n", trimql[f], ifil[f], (float) trimqb[f]/trimql[f], qthr);
+            }
+        }
+    if (nilv3pf > 0) {
+        fprintf(fstat, "Filtered %d reads on purity flag\n", nilv3pf);
+    }
+    if (nerr > 0) {
+        // nerr only counts parse failures of the first input; f is past the
+        // end of ifil[] here when there are several inputs, so name ifil[0]
+        fprintf(fstat, "Errors (%s): %d\n", ifil[0], nerr);
+        return 2;
+    }
+    if (!io_ok) {
+        return 3;
+    }
+    return 0;
+}
+
+/*
+ * Read one adapter record (an id line followed by ONE sequence line) from a
+ * fasta stream into *fa.
+ *
+ *   in   stream to read from
+ *   rno  zero-based record number, used only for the error message
+ *   fa   record to fill; id/seq buffers are (re)allocated by getline()
+ *
+ * Returns 1 on success, 0 at end of input, -1 if the id line does not start
+ * with '>'.  Line terminators are removed only when present, so a final line
+ * without a newline keeps its last base and CRLF files do not leave a '\r'
+ * in the sequence.  Leading spaces after '>' are dropped from the id and
+ * 'U' is converted to 'T'.
+ */
+int read_fa(FILE *in, int rno, struct ad *fa) {
+// note: this only reads one line of sequence!
+    fa->nid = getline(&fa->id, &fa->naid, in);
+    fa->nseq = getline(&fa->seq, &fa->naseq, in);
+    if (fa->nid <= 0 || fa->nseq <= 0)
+        return 0;
+    if (fa->id[0] != '>') {
+        fprintf(stderr, "Malformed adapter fasta record at line %d\n", rno*2+1);
+        return -1;
+    }
+    // chomp: strip trailing "\n" / "\r\n" only if they are really there
+    while (fa->nseq > 0 && (fa->seq[fa->nseq-1] == '\n' || fa->seq[fa->nseq-1] == '\r'))
+        fa->seq[--fa->nseq] = '\0';
+    while (fa->nid > 0 && (fa->id[fa->nid-1] == '\n' || fa->id[fa->nid-1] == '\r'))
+        fa->id[--fa->nid] = '\0';
+    // drop the '>' and any spaces that follow it
+    char *p = fa->id+1;
+    while (*p == ' ') {
+        ++p;
+    }
+    memmove(fa->id, p, strlen(p)+1);
+    fa->nid=strlen(fa->id);
+
+    // rna 2 dna
+    int i;
+    for (i=0;i<fa->nseq;++i) {
+        if (fa->seq[i]=='U') fa->seq[i] = 'T';
+    }
+    return 1;
+}
+
+/*
+ * Print the fastq-mcf help text to `f`.
+ *
+ * f   - stream the text is written to
+ * msg - optional message printed on its own line before the help; skipped when NULL
+ *
+ * The help is a single printf format built from adjacent string literals, so a
+ * literal percent sign must be written as %%.  VERSION and SVNREV fill in the
+ * "Version:" line.  The commented-out literals below are deliberately excluded
+ * from the concatenation.
+ */
+void usage(FILE *f, const char *msg) {
+    if(msg)
+        fprintf(f, "%s\n", msg);
+
+    fprintf(f,
+"Usage: fastq-mcf [options] <adapters.fa> <reads.fq> [mates1.fq ...] \n"
+"Version: %s.%d\n"
+"\n"
+"Detects levels of adapter presence, computes likelihoods and\n"
+"locations (start, end) of the adapters. Removes the adapter\n"
+"sequences from the fastq file(s).\n"
+"\n"
+"Stats go to stderr, unless -o is specified.\n"
+"\n"
+"Specify -0 to turn off all default settings\n"
+"\n"
+"If you specify multiple 'paired-end' inputs, then a -o option is\n"
+"required for each. IE: -o read1.clip.q -o read2.clip.fq\n"
+"\n"
+"Options:\n"
+" -h This help\n"
+" -o FIL Output file (stats to stdout)\n"
+" -s N.N Log scale for adapter minimum-length-match (2.2)\n"
+" -t N %% occurrence threshold before adapter clipping (0.25)\n"
+" -m N Minimum clip length, overrides scaled auto (1)\n"
+" -p N Maximum adapter difference percentage (10)\n"
+" -l N Minimum remaining sequence length (19)\n"
+" -L N Maximum remaining sequence length (none)\n"
+" -D N Remove duplicate reads : Read_1 has an identical N bases (0)\n"
+" -k N sKew percentage-less-than causing cycle removal (2)\n"
+" -x N 'N' (Bad read) percentage causing cycle removal (20)\n"
+" -q N quality threshold causing base removal (10)\n"
+" -w N window-size for quality trimming (1)\n"
+" -H remove >95%% homopolymer reads (no)\n"
+" -X remove low complexity reads (no)\n"
+//" -F FIL remove sequences that align to FIL\n"
+" -0 Set all default parameters to zero/do nothing\n"
+" -U|u Force disable/enable Illumina PF filtering (auto)\n"
+" -P N Phred-scale (auto)\n"
+" -R Don't remove N's from the fronts/ends of reads\n"
+" -n Don't clip, just output what would be done\n"
+" -C N Number of reads to use for subsampling (300k)\n"
+" -S Save all discarded reads to '.skip' files\n"
+" -d Output lots of random debugging stuff\n"
+"\n"
+"Quality adjustment options:\n"
+" --cycle-adjust CYC,AMT Adjust cycle CYC (negative = offset from end) by amount AMT\n"
+" --phred-adjust SCORE,AMT Adjust score SCORE by amount AMT\n"
+" --phred-adjust-max SCORE Adjust scores > SCORE to SCORE\n"
+"\n"
+"Filtering options*:\n"
+" --[mate-]qual-mean NUM Minimum mean quality score\n"
+" --[mate-]qual-gt NUM,THR At least NUM quals > THR\n"
+" --[mate-]max-ns NUM Maximum N-calls in a read (can be a %%)\n"
+" --[mate-]min-len NUM Minimum remaining length (same as -l)\n"
+" --homopolymer-pct PCT Homopolymer filter percent (95)\n"
+" --lowcomplex-pct PCT Complexity filter percent (95)\n"
+"\n"
+"If mate- prefix is used, then applies to second non-barcode read only\n"
+/*
+"Config:\n"
+"\n"
+"Some options are best set globally, such as the aligner to use\n"
+"for filtering, these can be ENV vars, or in /etc/ea-utils.conf:\n"
+"\n"
+"Command line options, if specified, always override config vars.\n"
+"\n"
+"When uses as environment vars, they are all caps, and with\n"
+"EAUTILS_ as the prefix (IE: EAUTILS_PHRED=33)\n"
+"\n"
+" phred (auto)\n"
+" sample_reads 100000\n"
+" scale_clip_len 2.2\n"
+" trim_skew 2\n"
+" trim_quality 10\n"
+" min_clip_len 0\n"
+" min_seq_remain 15\n"
+" max_adap_diff 20\n"
+//" cmd_align_se bowtie -S %%i -f %%1\n"
+//" cmd_align_pe bowtie -S %%i -1 %%1 -2 %%2\n"
+//"\n"
+//"Command lines must return SAM formatted lines, %%i is the filter FIL,\n"
+//"%%1 and %%2 are the first and second fastq's\n"
+*/
+"\n"
+"Adapter files are 'fasta' formatted:\n"
+"\n"
+"Specify n/a to turn off adapter clipping, and just use filters\n"
+"\n"
+"Increasing the scale makes recognition-lengths longer, a scale\n"
+"of 100 will force full-length recognition of adapters.\n"
+"\n"
+"Adapter sequences with _5p in their label will match 'end's,\n"
+"and sequences with _3p in their label will match 'start's,\n"
+"otherwise the 'end' is auto-determined.\n"
+"\n"
+"Skew is when one cycle is poor, 'skewed' toward a particular base.\n"
+"If any nucleotide is less than the skew percentage, then the\n"
+"whole cycle is removed. Disable for methyl-seq, etc.\n"
+"\n"
+"Set the skew (-k) or N-pct (-x) to 0 to turn it off (should be done\n"
+"for miRNA, amplicon and other low-complexity situations!)\n"
+"\n"
+"Duplicate read filtering is appropriate for assembly tasks, and\n"
+"never when read length < expected coverage. -D 50 will use\n"
+"4.5GB RAM on 100m DNA reads - be careful. Great for RNA assembly.\n"
+"\n"
+"*Quality filters are evaluated after clipping/trimming\n"
+"\n"
+"Homopolymer filtering is a subset of low-complexity, but will not\n"
+"be separately tracked unless both are turned on.\n"
+    ,VERSION, SVNREV);
+}
+
+// Translate a nucleotide letter (upper or lower case) into its B_* base code.
+// Any character other than A/C/G/T maps to B_N.
+inline int char2bp(char c) {
+    switch (c) {
+        case 'A': case 'a': return B_A;
+        case 'C': case 'c': return B_C;
+        case 'G': case 'g': return B_G;
+        case 'T': case 't': return B_T;
+        default:            return B_N;
+    }
+}
+
+// Translate a nucleotide letter (upper or lower case) into the B_* code of its
+// complement: A<->T and C<->G.  Any other character maps to B_N.
+inline int char2bp_rc(char c) {
+    switch (c) {
+        case 'A': case 'a': return B_T;
+        case 'C': case 'c': return B_G;
+        case 'G': case 'g': return B_C;
+        case 'T': case 't': return B_A;
+        default:            return B_N;
+    }
+}
+
+
+// Inverse of char2bp: turn a B_* base code back into an upper-case letter.
+// Codes other than B_A/B_C/B_G/B_T yield 'N'.
+inline char bp2char(int b) {
+    return b == B_A ? 'A'
+         : b == B_C ? 'C'
+         : b == B_G ? 'G'
+         : b == B_T ? 'T'
+         :            'N';
+}
+
+// Write one record per stream: element k of `fq` goes to fout[k], for the
+// first fo_n streams.  The id and comment fields are written as stored; a
+// '\n' is appended after the sequence and after the quality string.
+void saveskip(FILE **fout, int fo_n, struct fq *fq) {
+    for (int k = 0; k < fo_n; ++k) {
+        FILE *dst = fout[k];
+        struct fq *rec = &fq[k];
+
+        fputs(rec->id.s, dst);
+
+        fputs(rec->seq.s, dst);
+        fputc('\n', dst);
+
+        fputs(rec->com.s, dst);
+
+        fputs(rec->qual.s, dst);
+        fputc('\n', dst);
+    }
+}
+
+/*
+ * Mean of the raw quality characters in a window centred on position i.
+ *
+ * q  - quality characters (no phred offset is subtracted here)
+ * qn - number of usable characters in q
+ * i  - centre position of the window
+ * w  - requested window size; the window spans i-w/2 .. i+w/2 inclusive
+ *
+ * A window that would start before 0 or end past qn-1 is shifted back inside
+ * the read.  Returns the integer mean, or 0 when qn <= 0.
+ */
+int meanqwin(const char *q, int qn, int i, int w) {
+    if (qn <= 0) return 0;                      // nothing to average; also avoids dividing by zero
+    if (w > qn) w=qn/4;                         // maximum window is length/4
+    int s = i-w/2;                              // start/end window
+    int e = i+w/2;
+    if (s < 0) {e-=s;s=0;}                      // shift window over if you're past the start
+    if (e >= qn) {s-=((e-qn)+1);e=qn-1;}        // shift window over if you're past the end
+    if (s < 0) s=0;                             // window at least as wide as the read: the shift above overshoots 0
+    int t = 0;
+    for (int k=s;k<=e;++k) {
+        t+=q[k];
+    }
+    return t / (e-s+1);                         // mean quality within the window at that position
+}
+
+/*
+ * Apply the read-level quality filters to one record.
+ *
+ * fq       - the record to evaluate (sequence and quality strings are read)
+ * file_num - index of the input file the record came from; <= 0 selects the
+ *            primary thresholds (qf_*), anything else selects the mate
+ *            thresholds (qf2_*) where they are set and the primary ones otherwise
+ *
+ * Returns true when the record passes every enabled filter:
+ *   - mean quality (after subtracting `phred`) is at least t_mean
+ *   - the number of 'N' calls does not exceed the limit
+ *   - at least t_xgt_num quality values reach t_xgt_min
+ */
+bool evalqual(struct fq &fq, int file_num) {
+    int t_mean, t_max_ns, t_xgt_num, t_xgt_min, t_max_n_pct;
+
+    if (file_num <= 0) {
+        // applies to file 1
+        t_mean=qf_mean;
+        t_max_ns=qf_max_ns;
+        t_max_n_pct=qf_max_n_pct;
+        t_xgt_num=qf_xgt_num;
+        t_xgt_min=qf_xgt_min;
+    } else {
+        // applies to file 2 or greater, only if they are set:
+        // each test looks at the mate value itself, not at the primary one
+        t_mean=qf2_mean > 0 ? qf2_mean : qf_mean;
+        t_max_ns=qf2_max_ns > -1 ? qf2_max_ns : qf_max_ns;
+        t_max_n_pct=qf2_max_n_pct > -1 ? qf2_max_n_pct : qf_max_n_pct;
+        t_xgt_num=qf2_xgt_num > 0 ? qf2_xgt_num : qf_xgt_num;
+        t_xgt_min=qf2_xgt_min > 0 ? qf2_xgt_min : qf_xgt_min;
+    }
+
+    if (t_max_n_pct>=0) {
+        // a percentage limit becomes an absolute count for this read length
+        t_max_ns=max(t_max_ns,(fq.qual.n*t_max_n_pct)/100);
+    }
+
+    if (t_mean > 0) {
+        if (fq.qual.n <= 0) {
+            // an empty read cannot reach a positive minimum mean
+            return false;
+        }
+        int t = 0;
+        int i;
+        for (i=0;i<fq.qual.n;++i) {
+            t+=fq.qual.s[i];
+        }
+        if ((t/fq.qual.n-phred) < t_mean) {
+            return false;
+        }
+    }
+    if (t_max_ns >= 0) {
+        int t = 0;
+        int i;
+        for (i=0;i<fq.seq.n;++i) {
+            t+=(fq.seq.s[i]=='N');
+        }
+
+//      if (debug > 2) fprintf(stderr,"maxn: max:%d,t:%d,i:%d,id:%s", t_max_ns, t, i, fq.id.s);
+
+        if (t > t_max_ns) {
+            return false;
+        }
+    }
+
+    if (t_xgt_num > 0) {
+        int t = 0;
+        int i;
+        int h = t_xgt_min+phred;    // threshold expressed as a raw quality character
+        for (i=0;i<fq.qual.n;++i) {
+            t+=(fq.qual.s[i]>=h);
+        }
+        if (t < t_xgt_num) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Abort with a usage message unless `arg` looks like a real option value.
+// A value is rejected when it is NULL, empty, or begins with '-'.
+void valid_arg(char opt, const char *arg) {
+    const bool usable = arg && *arg && *arg != '-';
+    if (usable)
+        return;
+
+    fprintf(stderr,"Option '%c' requires an argument.\n\n", opt);
+    usage(stderr);
+    exit(1);
+}
+
+/*
+ * Parse an option value of the form "A,B" into two integers.
+ *
+ * optarg - text to parse
+ * a, b   - receive atoi() of the text before and after the first comma
+ *
+ * Returns false (leaving a and b untouched) when there is no comma, true
+ * otherwise.  The original fell off the end without returning a value on the
+ * success path, which is undefined behaviour.
+ */
+bool arg_int_pair(const char *optarg, int &a, int&b) {
+    const char *comma = strchr(optarg, ',');
+    if (!comma) {
+        return false;
+    }
+    a=atoi(optarg);
+    b=atoi(comma+1);
+    return true;
+}
+
+/*
+ * Rebuild a printable command line from argv[1..argc-1].
+ *
+ * Arguments are joined with single spaces.  An argument containing a space is
+ * wrapped in single quotes, or in double quotes when it already contains a
+ * single quote.
+ *
+ * Returns a malloc'd NUL-terminated string the caller must free(); an empty
+ * string when there are no arguments; NULL only when allocation fails.
+ */
+char *arg2cmd(int argc, char** argv) {
+    char *buf=NULL;
+    int n = 0;
+    int i;
+    for (i=1; i <argc;++i) {
+        int k=strlen(argv[i]);
+        // room for: opening quote + k characters + closing quote + separator + NUL
+        char *grown=( char *)realloc(buf,n+k+4);
+        if (!grown) {
+            free(buf);
+            return NULL;
+        }
+        buf=grown;
+        char *p=buf+n;
+        char endq=0;
+        // this is a poor mans quoting, which is good enough for anything that's not rediculous
+        if (strchr(argv[i], ' ')) {
+            if (!strchr(argv[i], '\'')) {
+                *p++='\'';
+                endq='\'';
+            } else {
+                *p++='\"';
+                endq='\"';
+            }
+        }
+        memcpy(p, argv[i], k);
+        p+=k;
+        // close the quote before the separator, so the space stays outside the quoted argument
+        if (endq) *p++=endq;
+        if (i < (argc-1)) *p++=' ';
+        *p='\0';
+        n = p-buf;
+    }
+    if (!buf) {
+        // no arguments: hand back an empty string rather than NULL
+        buf=(char *)malloc(1);
+        if (buf) *buf='\0';
+    }
+    return buf;
+}
+
+// std::string wrapper around arg2cmd(): returns the rebuilt command line and
+// releases the temporary buffer.  Yields an empty string when arg2cmd() hands
+// back NULL, since constructing a std::string from a null pointer is undefined.
+std::string arg2cmdstr(int argc, char **argv) {
+    char *tmp=arg2cmd(argc, argv);
+    if (!tmp)
+        return std::string();
+    std::string ret=tmp;
+    free(tmp);
+    return ret;
+}
+
+
+/* vim: set noai ts=4 sw=4: */
diff --git a/fastq-multx.c b/fastq-multx.c
new file mode 100644
index 0000000..ddb3f72
--- /dev/null
+++ b/fastq-multx.c
@@ -0,0 +1,1087 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+
+See "void usage" below for usage.
+
+*/
+
+#include "fastq-lib.h"
+
+// Upper bound on barcodes in use at once (bc[] has one extra slot, used for "unmatched")
+#define MAX_BARCODE_NUM 6000
+// Upper bound on distinct barcode groups (size of grs[])
+#define MAX_GROUP_NUM 500
+// factor to divide max by
+#define THFIXFACTOR 20
+// Printable name of an end code: 'e' -> "end", 'b' -> "start", anything else -> "n/a"
+#define endstr(e) (e=='e'?"end":e=='b'?"start":"n/a")
+
+const char * VERSION = "1.02";
+// Numeric revision: atoi() of whatever follows the ':' in the svn keyword string
+#define SVNREV atoi(strchr("$LastChangedRevision: 684 $", ':')+1)
+
+// barcode
+struct bc {
+ line id;
+ line seq;
+ char *out[6]; // one output per input
+ FILE *fout[6];
+ bool gzout[6];
+ int cnt; // count found
+ bool shifted; // count found in 1-shifted position
+ char * dual; // is this a dual-indexed barcode? if so, this points to the second index.
+ int dual_n; // length of dual
+};
+
+// group of barcodes
+struct group {
+ char *id;
+ int tcnt; // number of codes past thresh
+ int i; // my index
+};
+
+// barcode group
+struct bcg {
+ struct bc b; // barcode
+ line group; // group (fluidigm, truseq, etc)
+ int bcnt[6]; // matched begin of file n
+ int ecnt[6]; // matched end of file n
+ int bscnt[6]; // matched begin of file n, shifted by 1
+ int escnt[6]; // matched end of file n, shifted by 1
+ int dbcnt[6]; // dual matched begin of file n
+ int decnt[6]; // dual matched end of file n
+ struct group *gptr;
+};
+
+// Look up a group by name (case-insensitive), creating it on first use
+struct group* getgroup(char *s);
+
+void usage(FILE *f);
+static int debug=0;
+// it's times like this when i think a class might be handy, but nah, not worth it
+// one entry of the guide-file barcode table: a candidate sequence and how often it was seen
+typedef struct bnode {
+ char *seq;
+ int cnt;
+} bnode;
+
+// all groups seen so far, and how many of the slots are in use
+struct group grs[MAX_GROUP_NUM];
+static int grcnt=0;
+
+// barcodes selected for demultiplexing, and how many of the slots are in use
+struct bc bc[MAX_BARCODE_NUM+1];
+static int bcnt=0;
+
+// state shared between main() and the pickbest() tree walk
+static int pickmax=0;
+static void *picktab=NULL;
+void pickbest(const void *nodep, const VISIT which, const int depth);
+// tsearch() ordering for bnode entries: plain strcmp() on the sequence text
+int bnodecomp(const void *a, const void *b) {return strcmp(((bnode*)a)->seq,((bnode*)b)->seq);};
+static float pickmaxpct=0.10;
+
+/*
+ * fastq-multx entry point: demultiplex one or more fastq files by barcode.
+ *
+ * Steps, in order:
+ *   1. parse options and positional file names
+ *   2. obtain the barcode set from one of: a master list with groups (-l/-L),
+ *      a barcode-only guide read file (-g), or an explicit barcode file (-B,
+ *      or the first positional argument)
+ *   3. when the barcode end is still unknown, or the barcodes are dual-indexed,
+ *      resample the reads to choose the end (and prune unused dual barcodes)
+ *   4. open one output per barcode per input, plus an "unmatched" set
+ *   5. read records in lockstep from all inputs and route each to its barcode
+ *   6. close the outputs and print a per-barcode count table to stdout
+ *
+ * Returns 0 on success, 1 on usage or input errors, 3 when closing an output
+ * failed.  Some error paths call exit(1) directly.
+ */
+int main (int argc, char **argv) {
+    int c;                          // getopt() returns int; a char cannot hold -1 where char is unsigned
+    bool trim = true;
+    int mismatch = 1;
+    int distance = 2;
+    int poor_distance = 0;          // count of skipped reads on distance only
+    int quality = 0;
+    char end = '\0';
+    char dend = '\0';
+    bool dual = false;
+    char *in[6];
+    const char *out[6];
+    int f_n=0;
+    int f_oarg=0;
+    const char* guide=NULL;         // use an indexed-read
+    const char* list=NULL;          // use a barcode master list
+    char verify='\0';
+    bool noexec = false;
+    const char *group = NULL;
+    bool usefile1 = false;
+    int phred = 33;
+    double threshfactor = 1;
+
+    int i;
+    bool omode = false;
+    char *bfil = NULL;
+    while ( (c = getopt (argc, argv, "-Dzxnbeov:m:B:g:L:l:G:q:d:t:")) != -1) {
+        switch (c) {
+            case '\1':
+                // positional argument: output name after -o, else barcode file, else input
+                if (omode) {
+                    if (f_oarg<5)
+                        out[f_oarg++] = optarg;
+                    else {
+                        usage(stderr); return 1;
+                    }
+                } else if (!bfil && !guide && !list)
+                    bfil = optarg;
+                else if (f_n<5) {
+                    in[f_n++] = optarg;
+                } else {
+                    usage(stderr); return 1;
+                }
+                break;
+            case 'o': omode=true; break;
+            case 'v':
+                if (strlen(optarg)>1) {
+                    fprintf(stderr, "Option -v requires a single character argument");
+                    exit(1);
+                }
+                verify = *optarg; break;
+            case 'b': end = 'b'; break;
+            case 'e': end = 'e'; break;
+            case 'G': group = optarg; break;
+            case 'g':
+                // the guide file takes an input slot and a dummy output slot; keep both inside the arrays
+                if (f_n >= 6 || f_oarg >= 6) {
+                    usage(stderr); return 1;
+                }
+                guide = optarg;
+                in[f_n++] = optarg;
+                out[f_oarg++] = "n/a";
+                break;
+            case 'l': list = optarg; usefile1=0; break;
+            case 'L': list = optarg; usefile1=1; break;
+            case 'B': bfil = optarg; list = NULL; break;
+            case 'x': trim = false; break;
+            case 'n': noexec = true; break;
+            case 't': threshfactor = atof(optarg); break;
+            case 'm': mismatch = atoi(optarg); break;
+            case 'd': distance = atoi(optarg); break;
+            case 'q': quality = atoi(optarg); break;
+            case 'D': ++debug; break;
+            case '?':
+                // every option that takes a value is listed here
+                if (strchr("vmBgLlGqdt", optopt))
+                    fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+                else if (isprint(optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr,
+                        "Unknown option character `\\x%x'.\n",
+                        optopt);
+                usage(stderr);
+                return 1;
+        }
+    }
+
+    if (group && !list) {
+        fprintf(stderr, "Error: -G only works with -l\n");
+        return 1;
+    }
+
+    if ((list && guide) || (list && bfil) || (guide && bfil)) {
+        fprintf(stderr, "Error: Only one of -B -g or -l\n");
+        return 1;
+    }
+
+    if (f_n != f_oarg) {
+        fprintf(stderr, "Error: number of input files (%d) must match number of output files following '-o'.\n", f_n);
+        return 1;
+    }
+
+    if (argc < 3 || !f_n || (!bfil && !guide && !list)) {
+        usage(stderr);
+        return 1;
+    }
+
+    quality+=phred;
+
+    FILE *fin[6];
+    bool gzin[6]; meminit(gzin);
+    for (i = 0; i < f_n; ++i) {
+        fin[i]=gzopen(in[i],"r",&gzin[i]);
+        if (!fin[i]) {
+            // every later read would dereference this handle
+            fprintf(stderr, "Error opening file '%s': %s\n",in[i], strerror(errno));
+            return 1;
+        }
+    }
+
+    // set all to null, zero
+    meminit(bc);
+
+
+    // 3 ways to get barcodes
+    if (list) {
+        // use a list of barcode groups... determine the best set, then use the determined set
+        // calloc gives the same all-zero table without a separate pass over the whole allocation
+        struct bcg *bcg = (struct bcg *) calloc((size_t) MAX_GROUP_NUM * MAX_BARCODE_NUM, sizeof(*bcg));
+        if (!bcg) {
+            fprintf(stderr, "Out of memory\n");
+            return 1;
+        }
+        int bgcnt=0;
+        int b;
+        FILE *lin = fopen(list, "r");
+        if (!lin) {
+            fprintf(stderr, "Error opening file '%s': %s\n",list, strerror(errno));
+            return 1;
+        }
+        // read barcode groups
+        int ok;
+
+        while (bgcnt < (MAX_GROUP_NUM * MAX_BARCODE_NUM) && (ok = read_line(lin, bcg[bgcnt].b.id))) {
+            if (ok <= 0) break;
+            if (bcg[bgcnt].b.id.s[0]=='#') continue;
+            bcg[bgcnt].b.id.s=strtok(bcg[bgcnt].b.id.s, "\t\n\r ");
+            bcg[bgcnt].b.seq.s=strtok(NULL, "\t\n\r ");
+            char *g=strtok(NULL, "\n\r");
+            if (!g) {
+                if (bgcnt==0){
+                    fprintf(stderr,"Barcode guide list needs to be ID<whitespace>SEQUENCE<whitespace>GROUP");
+                    return 1;
+                } else {
+                    continue;
+                }
+            }
+            if (group) {
+                if (strcasecmp(group, g)) {
+                    continue;
+                }
+            }
+            if (!strcmp(bcg[bgcnt].b.seq.s,"seq")) continue;
+
+            // dual indexed indicated by a dash in the sequence...
+            if ((bcg[bgcnt].b.dual=strchr(bcg[bgcnt].b.seq.s,'-'))) {
+                *bcg[bgcnt].b.dual = '\0';
+                ++bcg[bgcnt].b.dual;
+                bcg[bgcnt].b.dual_n = strlen(bcg[bgcnt].b.dual);
+            }
+            // group pointer for this group
+            bcg[bgcnt].gptr = getgroup(g);
+            bcg[bgcnt].b.id.n=strlen(bcg[bgcnt].b.id.s);
+            bcg[bgcnt].b.seq.n=strlen(bcg[bgcnt].b.seq.s);
+
+            if (debug) fprintf(stderr, "BCG: %d bc:%s n:%d\n", bgcnt, bcg[bgcnt].b.seq.s, bcg[bgcnt].b.seq.n);
+            ++bgcnt;
+        }
+
+        if (!bgcnt) {
+            fprintf(stderr,"No barcodes %s from guide list %s.\n", group ? "matched" : "read", list);
+            return 1;
+        }
+
+        int sampcnt = 200000;
+        struct stat st;
+        int fsum[f_n], fmax[f_n]; int bestcnt=0, besti=-1, bestdual=0;
+        int dfsum[f_n], dfmax[f_n]; int dbestcnt=0, dbesti=-1;
+        meminit(fsum); meminit(fmax); meminit(dfsum); meminit(dfmax);
+
+        // subsample to determine group to use
+        for (i=0;i<(usefile1?1:f_n);++i) {
+            char *s = NULL; size_t na = 0; int nr = 0, ns = 0;
+            char *q = NULL; size_t nq = 0;
+            double tots=0, totsq=0;
+
+            stat(in[i], &st);
+
+            while (getline(&s, &na, fin[i]) > 0) {
+                if (*s != '@') {
+                    fprintf(stderr,"Invalid fastq file: %s.\n", in[i]);
+                    exit(1);
+                }
+
+                if ((ns=getline(&s, &na, fin[i])) <=0)
+                    break;
+
+                getline(&q, &nq, fin[i]);
+                getline(&q, &nq, fin[i]);
+
+                s[--ns]='\0'; q[ns]='\0';
+
+                // skip if quality is below average
+                if (st.st_size > (sampcnt * 500) && poorqual(i, ns, s, q))
+                    continue;
+
+                for (b=0;b<bgcnt;++b) {
+                    // matches front of read?
+                    if (!strncasecmp(s, bcg[b].b.seq.s, bcg[b].b.seq.n)) {
+                        ++bcg[b].bcnt[i];
+                    } else if (!strncasecmp(s+1, bcg[b].b.seq.s, bcg[b].b.seq.n)) {
+                        // shifted read?
+                        ++bcg[b].bscnt[i];
+                    }
+
+                    if (ns >= bcg[b].b.seq.n && !strcasecmp(s+ns-bcg[b].b.seq.n, bcg[b].b.seq.s)) {
+                        ++bcg[b].ecnt[i];
+                    } else if (ns > bcg[b].b.seq.n && !strncasecmp(s+ns-bcg[b].b.seq.n-1, bcg[b].b.seq.s, bcg[b].b.seq.n)) {
+                        ++bcg[b].escnt[i];
+                    }
+
+                    if (bcg[b].b.dual) {
+                        if (!strncasecmp(s, bcg[b].b.dual, bcg[b].b.dual_n)) {
+                            ++bcg[b].dbcnt[i];
+                        }
+
+                        if (ns >= bcg[b].b.dual_n && !strcasecmp(s+ns-bcg[b].b.dual_n, bcg[b].b.dual)) {
+                            ++bcg[b].decnt[i];
+                        }
+                    }
+                }
+
+                ++nr;
+                // got enough reads?
+                if (nr >= sampcnt)
+                    break;
+            }
+
+            for (b=0;b<bgcnt;++b) {
+                // highest count
+                int hcnt = (int) (max(bcg[b].bcnt[i],bcg[b].ecnt[i]) * log(bcg[b].b.seq.n));
+                fsum[i]+=hcnt;
+                if (hcnt > fmax[i])
+                    fmax[i]=hcnt;
+
+                if (fsum[i] > bestcnt) {
+                    if (debug > 1)
+                        fprintf(stderr,"file %d(%s), bcg: %s, file-sum: %d, bestsum: %d\n", i, in[i], bcg[b].gptr->id, fsum[i], bestcnt);
+
+                    bestcnt=fsum[i];
+                    besti=i;
+                    bestdual=(bcg[b].b.dual!=NULL);
+                }
+
+                if (debug > 1)
+                    fprintf(stderr,"dual %d(%s), bcg: %s, file-sum: %d, bestsum: %d\n", i, in[i], bcg[b].gptr->id, dfsum[i], dbestcnt);
+
+                if (bcg[b].b.dual) {
+                    // highest count
+                    int dcnt = (int) (max(bcg[b].dbcnt[i],bcg[b].decnt[i]) * log(bcg[b].b.dual_n));
+                    dfsum[i]+=dcnt;
+                    if (dcnt > dfmax[i])
+                        dfmax[i]=dcnt;
+                    if (dfsum[i] > dbestcnt) {
+                        if (debug > 1)
+                            fprintf(stderr,"dual %d(%s), bcg: %s, file-sum: %d, bestsum: %d\n", i, in[i], bcg[b].gptr->id, dfsum[i], dbestcnt);
+                        dbestcnt=dfsum[i];
+                        dbesti=i;
+                    }
+                }
+            }
+            // besti/dbesti stay -1 until something matched; do not index with them before that
+            if (debug > 0 && besti >= 0) fprintf(stderr,"file-best %d sum:%d, max:%d\n", besti, fsum[besti], fmax[besti]);
+            if (debug > 0 && bestdual && dbesti >= 0) fprintf(stderr,"dual file-best %d sum:%d, max:%d\n", dbesti, dfsum[dbesti], dfmax[dbesti]);
+        }
+
+        // chosen file is "besti"
+        i=usefile1?0:besti;
+        if (i < 0) {
+            // no barcode matched in any file, so there is nothing to rank
+            fprintf(stderr, "Unable to determine barcode group\n");
+            exit(1);
+        }
+
+        int gmax=0, gindex=-1, scnt = 0, ecnt=0, dscnt = 0, decnt = 0;
+        int thresh = (int) (pickmaxpct*fmax[i]);
+
+        if (debug > 0) fprintf(stderr,"besti: %d thresh: %d, dual: %d\n", besti, thresh, bestdual);
+        for (b=0;b<bgcnt;++b) {
+            int hcnt = (int) (max(bcg[b].bcnt[i],bcg[b].ecnt[i]) * log(bcg[b].b.seq.n));
+            if (debug > 1) fprintf(stderr,"cnt: %s %s hc:%d bc:%d ec: %d\n", bcg[b].b.id.s, bcg[b].b.seq.s, hcnt, bcg[b].bcnt[i], bcg[b].ecnt[i]);
+            if (hcnt >= thresh) {
+                // increase group count
+                bcg[b].gptr->tcnt += hcnt;
+                if (bcg[b].gptr->tcnt > gmax) {
+                    gindex=bcg[b].gptr->i;
+                    gmax=bcg[b].gptr->tcnt;
+                }
+            }
+        }
+        if (gindex == -1) {
+            fprintf(stderr, "Unable to determine barcode group\n");
+            exit(1);
+        }
+//      printf("gmax: %d, gindex %d, %s, thresh: %d\n", gmax, gindex, grs[gindex].id, thresh);
+
+        for (b=0;b<bgcnt;++b) {
+            if (bcg[b].gptr->i == gindex) {
+                // primary end: add up the primary counters (the dual counters are
+                // always zero for single-index sets, which forced 'b' every time)
+                if (bcg[b].bcnt[i] > bcg[b].ecnt[i]) {
+                    scnt+=bcg[b].bcnt[i];
+                } else if (bcg[b].bcnt[i] < bcg[b].ecnt[i]) {
+                    ecnt+=bcg[b].ecnt[i];
+                }
+                // dual end: only when some file actually matched a dual index
+                if (dbesti >= 0) {
+                    if (bcg[b].dbcnt[dbesti] > bcg[b].decnt[dbesti]) {
+                        dscnt+=bcg[b].dbcnt[dbesti];
+                    } else if (bcg[b].dbcnt[dbesti] < bcg[b].decnt[dbesti]) {
+                        decnt+=bcg[b].decnt[dbesti];
+                    }
+                }
+            }
+        };
+        end = scnt >= ecnt ? 'b' : 'e';
+
+        if (debug) fprintf(stderr,"scnt: %d, ecnt, %d, end: %c\n", scnt, ecnt, end);
+
+        thresh/=threshfactor;
+        if (bestdual)
+            thresh/=5;
+
+        // since this is a known good set, use a very low threshold, just to catch them all
+        fprintf(stderr, "Using Barcode Group: %s on File: %s (%s), Threshold %2.2f%%\n",
+            grs[gindex].id, in[i], endstr(end), 100.0 * (float) ((float)thresh/THFIXFACTOR)/sampcnt);
+
+        if (bestdual) {
+            if (dbesti < 0) {
+                // a dual set was chosen but no file matched its second index
+                fprintf(stderr, "Unable to determine dual index file\n");
+                exit(1);
+            }
+            dend = dscnt >= decnt ? 'b' : 'e';
+            fprintf(stderr, "Dual index on File: %s (%s)\n", in[dbesti], endstr(dend));
+            dual=true;
+            for (b=0;b<bgcnt;++b) {
+                // trim down a bit, but later should trim down to "both-match"
+                if (bcg[b].gptr->i == gindex) {
+                    if (bcg[b].decnt[dbesti] < bcg[b].ecnt[i])
+                        bcg[b].ecnt[i] = bcg[b].decnt[dbesti];
+                    if (bcg[b].dbcnt[dbesti] < bcg[b].bcnt[i])
+                        bcg[b].bcnt[i] = bcg[b].dbcnt[dbesti];
+                }
+            }
+        }
+
+        for (b=0;b<bgcnt;++b) {
+            if (bcg[b].gptr->i == gindex) {
+                int cnt = (end == 'e' ? (bcg[b].ecnt[i]+bcg[b].escnt[i]) : ( bcg[b].bcnt[i] + bcg[b].bscnt[i] ));
+                if (cnt > thresh/THFIXFACTOR) {
+                    // count exceeds threshold... use it
+                    bc[bcnt]=bcg[b].b;
+                    if ((end == 'e' && (bcg[b].escnt[i] < 1.2*bcg[b].ecnt[i])) ||
+                        (end == 'b' && (bcg[b].bscnt[i] < 1.2*bcg[b].bcnt[i]))
+                        ) {
+                        if (!dual)
+                            fprintf(stderr, "Using Barcode %s (%s)\n", bcg[b].b.id.s, bcg[b].b.seq.s);
+
+                        if (debug) fprintf(stderr, "Debug Barcode %s (%s-%s) ... ecnt:%d, escnt:%d,bcnt:%d, bscnt:%d\n", bcg[b].b.id.s, bcg[b].b.seq.s, bcg[b].b.dual, bcg[b].ecnt[i], bcg[b].escnt[i], bcg[b].bcnt[i], bcg[b].bscnt[i]);
+
+                    } else {
+                        bc[bcnt].shifted=1;
+
+                        if (!dual)
+                            fprintf(stderr, "Using Barcode %s (%s) shifted\n", bcg[b].b.id.s, bcg[b].b.seq.s);
+
+                        if (debug) printf("Debug Barcode %s (%s-%s) shifted ... ecnt:%d, escnt:%d,bcnt:%d, bscnt:%d\n", bcg[b].b.id.s, bcg[b].b.seq.s, bcg[b].b.dual, bcg[b].ecnt[i], bcg[b].escnt[i], bcg[b].bcnt[i], bcg[b].bscnt[i]);
+                    }
+                    ++bcnt;
+                }
+            }
+        }
+
+        if (i != 0) {
+            // in[0] needs to be the guide file
+            FILE *f = fin[0];
+            char *n = in[0];
+            const char *o = out[0];
+            bool gzi = gzin[0];
+            fin[0]=fin[i];
+            in[0]=in[i];
+            out[0]=out[i];
+            gzin[0]=gzin[i];
+            fin[i]=f;
+            in[i]=n;
+            out[i]=o;
+            gzin[i]=gzi;
+            // swap file in to position 1 if dual
+            // NOTE(review): this swap only runs when the primary file was not already
+            // first, and does not account for the swap just made - confirm intent
+            if (dual && dbesti != 1) {
+                FILE *f = fin[1];
+                char *n = in[1];
+                const char *o = out[1];
+                bool gzi = gzin[1];
+                fin[1]=fin[dbesti];
+                in[1]=in[dbesti];
+                out[1]=out[dbesti];
+                gzin[1]=gzin[dbesti];
+                fin[dbesti]=f;
+                in[dbesti]=n;
+                out[dbesti]=o;
+                gzin[dbesti]=gzi;
+            }
+        }
+        if (bcg) free(bcg);
+    } else if (guide) {
+        // use the first file as a "guide file" ... and select a set of codes from that
+        FILE *gin = fin[0];
+
+        int blen = 0;
+
+        int sampcnt = 100000;
+        struct stat st;
+        stat(guide, &st);
+
+        char *s = NULL; size_t na = 0; int nr = 0, ns = 0;
+        char *q = NULL; size_t nq = 0;
+
+        // small sample to get lengths
+        double tots=0, totsq=0;
+        while (getline(&s, &na, gin) > 0) {
+            if (*s != '@') {
+                fprintf(stderr,"Invalid fastq file: %s.\n", in[0]);
+                exit(1);
+            }
+            if ((ns=getline(&s, &na, gin)) <=0)
+                break;
+            getline(&q, &nq, gin);
+            getline(&q, &nq, gin);
+            --ns;
+            tots+=ns;
+            totsq+=ns*ns;
+            ++nr;
+            if (nr >= 200) break;
+        }
+        double dev = stdev(nr, tots, totsq);
+
+        // short, and nonvarying (by much, depends on the tech used)
+        if (dev < .25 && roundl(tots/nr) < 12) {
+            // most probably a barcode-only read
+            blen = (int) round(tots/nr);
+            end = 'b';
+        } else if (round(tots/nr) < 12) {
+            fprintf(stderr, "File %s looks to be barcode-only, but it's length deviation is too high (%.4g)\n", in[0], dev);
+            return 1;
+        } else {
+            fprintf(stderr, "File %s isn't a barcode-only file, try using -l instead\n", in[0]);
+            return 1;
+        }
+
+        fprintf(stderr, "Barcode length used: %d (%s)\n", blen, endstr(end));
+
+        // load a table of possble codes
+        pickmax=0;
+        picktab=NULL;
+        bnode * ent = NULL;
+        while (getline(&s, &na, gin) > 0) {
+            if (*s != '@') {
+                // the guide is always input 0; `i` still holds the value left by the open loop
+                fprintf(stderr,"Invalid fastq file: %s.\n", in[0]);
+                exit(1);
+            }
+
+            if ((ns=getline(&s, &na, gin)) <=0)
+                break;
+
+            getline(&q, &nq, gin);
+            if (getline(&q, &nq, gin) != ns)
+                break;
+
+            s[--ns]='\0'; q[ns]='\0';
+
+            if (st.st_size > (sampcnt * 500) && poorqual(0, ns, s, q))
+                continue;
+
+            ++nr;
+
+            char *p;
+            if (end == 'b') {
+                p=s;
+            } else {
+                // barcode sits at the end of this read, so offset by the read length
+                p=s+ns-blen;
+            }
+            p[blen]='\0';
+            if (!ent)       // make a new ent
+                ent = (bnode *) malloc(sizeof(*ent));
+
+            if (strchr(p, 'N')||strchr(p, 'n'))
+                continue;
+
+            ent->cnt=0;
+            strcpy(ent->seq=(char*)malloc(strlen(p)+1), p);
+
+            bnode *fent = * (bnode**) tsearch(ent, &picktab, bnodecomp);
+
+            if (fent == ent)        // used the ent, added to tree
+                ent = NULL;         // need a new one
+            else
+                free(ent->seq);     // key already in the tree: this copy is not referenced anywhere
+
+            ++fent->cnt;
+
+            if (fent->cnt > pickmax) pickmax=fent->cnt;
+
+            if (nr > sampcnt)
+                break;
+        }
+        pickmax=max(1,(int)(pickmaxpct*pickmax));
+        fprintf(stderr, "Threshold used: %d\n", pickmax);
+        twalk(picktab, pickbest);
+    } else {
+        // user specifies a list of barcodes, indexed read is f[0] and f[1] if dual
+        FILE *bin = fopen(bfil, "r");
+        if (!bin) {
+            fprintf(stderr, "Error opening file '%s': %s\n",bfil, strerror(errno));
+            return 1;
+        }
+
+        bcnt = 0;
+        int ok;
+        while (bcnt < MAX_BARCODE_NUM && (ok = read_line(bin, bc[bcnt].id))) {
+            if (ok <= 0) break;
+            if (bc[bcnt].id.s[0]=='#') continue;
+            bc[bcnt].id.s=strtok(bc[bcnt].id.s, "\t\n\r ");
+            bc[bcnt].seq.s=strtok(NULL, "\t\n\r ");
+            if (!bc[bcnt].seq.s) {
+                fprintf(stderr, "Barcode file '%s' required format is 'ID SEQ'\n",bfil);
+                return 1;
+            }
+            if ((bc[bcnt].dual=strchr(bc[bcnt].seq.s,'-'))) {
+                *bc[bcnt].dual = '\0';
+                ++bc[bcnt].dual;
+                bc[bcnt].dual_n = strlen(bc[bcnt].dual);
+                dual=true;
+            }
+            bc[bcnt].id.n=strlen(bc[bcnt].id.s);
+            bc[bcnt].seq.n=strlen(bc[bcnt].seq.s);
+            if (debug) fprintf(stderr, "BC: %d bc:%s n:%d\n", bcnt, bc[bcnt].seq.s, bc[bcnt].seq.n);
+            ++bcnt;
+        }
+
+        fprintf(stderr, "Using Barcode File: %s\n", bfil);
+    }
+
+    if (noexec) {
+        int b;
+        for (b=0;b<bcnt;++b) {
+            fprintf(stdout, "%s %s\n", bc[b].id.s, bc[b].seq.s);
+        }
+        exit(0);
+    }
+
+    if (dual && f_n < 2) {
+        // the second index is read from input 1; without it fin[1] is uninitialised
+        fprintf(stderr, "Error: dual-indexed barcodes need at least two input files\n");
+        return 1;
+    }
+
+    // for whatever reason, the end is not supplied... easy enough to determine accurately
+    // or it's dual... which means we need to resample stuff
+    if (end == '\0' || dual) {
+        for (i=0;i<f_n;++i) {
+            if (!gzin[i])
+                fseek(fin[i],0,0);
+            else {
+                pclose(fin[i]);
+                fin[i]=gzopen(in[i],"r",&gzin[i]);
+            }
+        }
+
+        int sampcnt = dual ? 200000 : 10000;
+        struct stat st;
+        stat(in[0], &st);
+        char *s = NULL; size_t na = 0; int nr = 0, ns = 0;
+        char *q = NULL; size_t nq = 0;
+        int ne=0, nb=0, dne=0, dnb=0, tcount=0, read_ok=0;
+
+        int *recount = dual ? ((int *) malloc(sizeof(int)*bcnt)) : NULL;
+        if (dual) memset(recount, 0, sizeof(int)*bcnt);
+
+        struct fq fq[2]; meminit(fq);
+
+        while ((read_ok=read_fq(fin[0], nr, &fq[0]))) {
+            fq[0].id.s[--fq[0].id.n]='\0';
+
+            if (dual)
+                read_fq(fin[1], nr, &fq[1]);
+            ++nr;
+
+            if (st.st_size > (sampcnt * 500) && poorqual(0, fq[0].seq.n, fq[0].seq.s, fq[0].qual.s))
+                continue;
+
+            if (dual)
+                if (st.st_size > (sampcnt * 500) && poorqual(1, fq[1].seq.n, fq[1].seq.s, fq[1].qual.s))
+                    continue;
+
+            for (i=0;i<bcnt;++i) {
+                int dok = 0;
+                if (debug > 1) fprintf(stderr, "check %s vs %s: %s vs %s", fq[0].id.s, bc[i].id.s, fq[0].seq.s, bc[i].seq.s);
+                if (!strncmp(fq[0].seq.s, bc[i].seq.s, bc[i].seq.n)) {
+                    ++nb;
+                    ++dok;
+                } else if (fq[0].seq.n >= bc[i].seq.n &&    // a shorter read has no such suffix
+                           !strncmp(fq[0].seq.s+fq[0].seq.n-bc[i].seq.n, bc[i].seq.s, bc[i].seq.n)) {
+                    ++ne;
+                    ++dok;
+                }
+                if (dual) {
+                    if (debug > 1) fprintf(stderr, ", dual: %s vs %s, ", fq[1].seq.s, bc[i].dual);
+                    if (!strncmp(fq[1].seq.s, bc[i].dual, bc[i].dual_n)) {
+                        ++dnb;
+                        ++dok;
+                    } else if (fq[1].seq.n >= bc[i].dual_n &&
+                               !strncmp(fq[1].seq.s+fq[1].seq.n-bc[i].dual_n, bc[i].dual, bc[i].dual_n)) {
+                        ++dne;
+                        ++dok;
+                    }
+                }
+                if (debug > 1) fprintf(stderr, ", dok:%d, i:%d\n", dok, i);
+                // dok reaches 2 only in dual mode, so recount is allocated here
+                if (dok == 2) {
+                    ++recount[i];
+                    ++tcount;
+                    break;
+                }
+            }
+
+            if (nr >= sampcnt)
+                break;
+        }
+
+        end = (ne > nb) ? 'e' : 'b';
+        fprintf(stderr, "End used: %s\n", endstr(end));
+
+        if (dual && list) {
+            // trim down possiblities to reduce number of open files, and small stub files
+            dend = (dne > dnb) ? 'e' : 'b';
+            fprintf(stderr, "Dual-end used: %s\n", endstr(dend));
+            int ocnt = bcnt;
+            // this should allow up to a 300 plex
+            int thresh = max(1,tcount/1000);
+            thresh /= threshfactor;
+            bcnt=0;
+            if (debug)
+                fprintf(stderr, "dual resample threshold: %d out of %d\n", thresh, tcount);
+            for (i=0;i<ocnt;++i) {
+                if (recount[i] >= thresh) {
+                    fprintf(stderr, "Using Barcode %s (%s-%s)\n", bc[i].id.s, bc[i].seq.s, bc[i].dual);
+                    if (debug)
+                        fprintf(stderr, "%d >= %d\n", recount[i], thresh);
+                    bc[bcnt].seq=bc[i].seq;
+                    bc[bcnt].id=bc[i].id;
+                    bc[bcnt].dual=bc[i].dual;
+                    bc[bcnt].dual_n=bc[i].dual_n;
+                    ++bcnt;
+                } else {
+                    if (debug)
+                        fprintf(stderr, "skipping barcode %s (%s-%s), %d < %d\n", bc[i].id.s, bc[i].seq.s, bc[i].dual, recount[i], thresh);
+                }
+            }
+        }
+        free(recount);
+    }
+
+    if (bcnt == 0) {
+        fprintf(stderr, "No barcodes defined, quitting.\n");
+        exit(1);
+    }
+
+    // one beyond barcode count is unmatched
+    bc[bcnt].id.s=(char *)"unmatched";
+
+    // TODO: output barcode read ...but only for unmatched?
+    int b;
+    for (b=0;b<=bcnt;++b) {
+        for (i=0;i<f_n;++i) {
+            if (!strcasecmp(out[i],"n/a") || !strcasecmp(out[i],"/dev/null")) {
+                bc[b].out[i] = NULL;
+                bc[b].fout[i] = NULL;
+                continue;
+            }
+            const char *p=strchr(out[i],'%');
+            if (!p) fail("Each output file name must contain a '%%' sign, which is replaced by the barcode id\n");
+            // output name = text before '%' + barcode id + text after '%'
+            bc[b].out[i]=(char *) malloc(strlen(out[i])+strlen(bc[b].id.s)+100);
+            strncpy(bc[b].out[i], out[i], p-out[i]);
+            bc[b].out[i][p-out[i]]='\0';
+            strcat(bc[b].out[i], bc[b].id.s);
+            strcat(bc[b].out[i], p+1);
+            if (!(bc[b].fout[i]=gzopen(bc[b].out[i], "w", &bc[b].gzout[i]))) {
+                fprintf(stderr, "Error opening output file '%s': %s\n",bc[b].out[i], strerror(errno));
+                return 1;
+            }
+        }
+    }
+
+    // seek back to beginning of fastq
+    for (i=0;i<f_n;++i) {
+        if (!gzin[i])
+            fseek(fin[i],0,0);
+        else {
+            pclose(fin[i]);
+            fin[i]=gzopen(in[i],"r",&gzin[i]);
+        }
+    }
+
+    // don't trim if you're not outputting the read
+
+    struct fq fq[6];
+    meminit(fq);
+
+    int nrec=0;
+    int nerr=0;
+    int nok=0;
+    int ntooshort=0;
+    int ntrim=0;
+    int nbtrim=0;
+    int read_ok;
+
+    // ACTUAL DEMUX HAPPENS HERE
+    // read in 1 record from EACH file supplied
+    while ((read_ok=read_fq(fin[0], nrec, &fq[0]))) {
+        for (i=1;i<f_n;++i) {
+            int mate_ok=read_fq(fin[i], nrec, &fq[i]);
+            if (read_ok != mate_ok) {
+                fprintf(stderr, "# of rows in mate file '%s' doesn't match primary file, quitting!\n", in[i]);
+                return 1;
+            }
+            if (verify) {
+                // verify 1 in 100
+                if (0 == (nrec % 100)) {
+                    char *p=strchr(fq[i].id.s,verify);
+                    if (!p) {
+                        fprintf(stderr, "File %s is missing id verification char %c at line %d", in[i], verify, nrec*4+1);
+                        return 1;
+                    }
+                    int l = p-fq[i].id.s;
+                    if (strncmp(fq[0].id.s, fq[i].id.s, l)) {
+                        fprintf(stderr, "File %s, id doesn't match file %s at line %d", in[0], in[i], nrec*4+1);
+                        return 1;
+                    }
+                }
+            }
+        }
+        ++nrec;
+        if (read_ok < 0) continue;
+
+        int i, best=-1, bestmm=mismatch+distance+1, bestd=mismatch+distance+1, next_best=mismatch+distance*2+1;
+
+        if (debug) {
+            // hide the id's trailing newline for the debug line, then put it back in the same place
+            fq[0].id.s[fq[0].id.n-1] = '\0';
+            fprintf(stderr, "id: %s, seq: %s %d", fq[0].id.s, fq[0].seq.s, fq[0].seq.n);
+            if (dual) fprintf(stderr, ", sdual: %s %d", fq[1].seq.s, fq[1].seq.n);
+            fq[0].id.s[fq[0].id.n-1] = '\n';
+            if (debug > 1) printf("\n");
+        }
+
+        if (quality > 0) {
+            // mask low-quality bases so they cannot contribute to a barcode match
+            for (i=0;i<fq[0].seq.n;++i) {
+                if (fq[0].qual.s[i]<quality) {
+                    fq[0].seq.s[i]='N';
+                }
+            }
+        }
+
+        // for each barcode
+        for (i =0; i < bcnt; ++i) {
+            int d;
+            if (end == 'e') {
+                if (bc[i].shifted) {
+                    if (fq[0].seq.n > bc[i].seq.n) {
+                        d=hd(fq[0].seq.s+fq[0].seq.n-bc[i].seq.n-1, bc[i].seq.s, bc[i].seq.n);
+                    } else {
+                        d=bc[i].seq.n;
+                    }
+                } else {
+                    if (fq[0].seq.n >= bc[i].seq.n) {
+                        d=hd(fq[0].seq.s+fq[0].seq.n-bc[i].seq.n, bc[i].seq.s, bc[i].seq.n);
+                    } else {
+                        d=bc[i].seq.n;
+                    }
+                }
+
+                if (dual) {
+                    // distance is added in for duals
+                    if (fq[1].seq.n >= bc[i].dual_n) {
+                        d+=hd(fq[1].seq.s+fq[1].seq.n-bc[i].dual_n, bc[i].dual, bc[i].dual_n);
+                    } else {
+                        d+=bc[i].dual_n;
+                    }
+                }
+            } else {
+                if (bc[i].shifted)
+                    d=hd(fq[0].seq.s+1,bc[i].seq.s, bc[i].seq.n);
+                else
+                    d=hd(fq[0].seq.s,bc[i].seq.s, bc[i].seq.n);
+
+                // distance is added in for duals
+                if (dual)
+                    d+=hd(fq[1].seq.s,bc[i].dual, bc[i].dual_n);
+
+                // if (debug > 1) {
+                //     fprintf(stderr, "index: %d dist: %d bc:%s n:%d", i, d, bc[i].seq.s, bc[i].seq.n);
+                //     if (dual) fprintf(stderr, ", idual: %s %d", bc[i].dual, bc[i].dual_n);
+                //     fprintf(stderr, "\n");
+                // }
+            }
+            // simple...
+            if (d < bestd) {
+                next_best=bestd;
+                bestd=d;
+                if (debug > 1) fprintf(stderr,"next_dist: %d, best_seq: %s:%d\n", next_best, bc[i].seq.s, bestd);
+            }
+            // if exact match
+            if (d==0) {
+                if (debug) fprintf(stderr, ", found bc: %d bc:%s n:%d, bestd: %d, next_best: %d", i, bc[i].seq.s, bc[i].seq.n, bestd, next_best);
+                best=i;
+                break;
+            } else if (d <= mismatch) {
+                // if ok match
+                if (d == bestmm) {
+                    best=-1;        // more than 1 match... bad
+                } else if (d < bestmm) {
+                    bestmm=d;       // best match...ok
+                    best=i;
+                }
+            }
+        }
+
+        if ((best >= 0) && distance && (next_best-bestd) < distance) {
+            if (debug) fprintf(stderr, "%d<%d, skipping", next_best-bestd, distance);
+            // match is ok, but distance is poor
+            ++poor_distance;
+            best=-1;
+        }
+
+        bool trimmed = false;
+        // only trim if you're outputting the sequence
+        if (trim && best >= 0 && bc[best].fout[0]) {
+            // todo: save trimmed
+            trimmed = true;
+            int len=bc[best].seq.n;
+            // never remove more than the read holds, or the sizes below go negative
+            if (len > fq[0].seq.n) len=fq[0].seq.n;
+            if (len > fq[0].qual.n) len=fq[0].qual.n;
+            if (end =='b') {
+                memmove(fq[0].seq.s, fq[0].seq.s+len, fq[0].seq.n-len);
+                memmove(fq[0].qual.s, fq[0].qual.s+len, fq[0].seq.n-len);
+            }
+            fq[0].seq.s[fq[0].seq.n-len]='\0';
+            fq[0].qual.s[fq[0].qual.n-len]='\0';
+        }
+
+        if (best < 0) {
+            // shuttle to unmatched file
+            best=bcnt;
+        }
+
+        if (debug) fprintf(stderr, ", best: %d %s\n", best, bc[best].id.s);
+
+        ++bc[best].cnt;
+
+        for (i=0;i<f_n;++i) {
+            FILE *f=bc[best].fout[i];
+            if (!f) continue;
+            if (!trimmed) {
+                // todo: capture always, not just when trim is off
+                // the barcode read is appended to the id line, so the id's own newline is cut first
+                char *nl=strrchr(fq[i].id.s, '\n');
+                if (nl) *nl = '\0';
+                fputs(fq[i].id.s,f);
+                fputc(' ', f);
+                fputs(fq[0].seq.s,f);
+                if (dual) {
+                    fputc('-', f);
+                    fputs(fq[1].seq.s,f);
+                }
+                fputc('\n', f);
+            } else {
+                // id still has chr
+                fputs(fq[i].id.s,f);
+            }
+            fputs(fq[i].seq.s,f);
+            fputc('\n',f);
+            fputs(fq[i].com.s,f);
+            fputs(fq[i].qual.s,f);
+            fputc('\n',f);
+        }
+    }
+
+    bool io_ok=1;
+    for (b=0;b<=bcnt;++b) {
+        for (i=0;i<f_n;++i) {
+            if (bc[b].fout[i]) {
+                if (bc[b].gzout[i]) {
+                    io_ok = io_ok && !pclose(bc[b].fout[i]);
+                } else {
+                    io_ok = io_ok && !fclose(bc[b].fout[i]);
+                }
+            }
+        }
+    }
+
+
+    if (poor_distance > 0)
+        fprintf(stderr, "Skipped because of distance < %d : %d\n", distance, poor_distance);
+
+    if (!io_ok)
+        fprintf(stderr, "Returning error because of i/o error during file close\n");
+
+    int j;
+    printf("Id\tCount\tFile(s)\n");
+    int tot=0;
+    for (i=0;i<=bcnt;++i) {
+        printf("%s\t%d", bc[i].id.s, bc[i].cnt);
+        tot+=bc[i].cnt;
+        for (j=0;j<f_n;++j) {
+            if (bc[i].out[j])
+                printf("\t%s", bc[i].out[j]);
+        }
+        printf("\n");
+    }
+    printf("total\t%d\n", tot);
+
+    if (!io_ok)
+        return 3;
+
+    return 0;
+}
+
+/*
+ * Find the barcode group named s (compared case-insensitively), creating it
+ * when it is not in the table yet.
+ *
+ * Returns a pointer into the global grs[] table. A newly created entry owns a
+ * heap copy of the name, starts with tcnt == 0 and records its own index in .i.
+ * Exits the process when the table is full or the name cannot be allocated.
+ */
+struct group* getgroup(char *s) {
+    int i;
+    for (i=0;i<grcnt;++i) {
+        if (!strcasecmp(s,grs[i].id)) {
+            return &grs[i];
+        }
+    }
+    if (grcnt >= MAX_GROUP_NUM) {
+        fprintf(stderr,"Too many barcode groups, quitting\n");
+        exit(1);
+    }
+    grs[grcnt].id=(char *)malloc(strlen(s)+1);
+    if (!grs[grcnt].id) {
+        // strcpy() into a NULL pointer would crash; fail the same way the table-full case does
+        fprintf(stderr,"Out of memory allocating barcode group name, quitting\n");
+        exit(1);
+    }
+    strcpy(grs[grcnt].id,s);
+    grs[grcnt].tcnt=0;
+    grs[grcnt].i=grcnt;
+    return &grs[grcnt++];
+}
+
+/*
+ * Tree-walk visitor (VISIT callback) that promotes counted candidate
+ * sequences into the global barcode table bc[].
+ *
+ * A node is handled once: when it is a leaf, or on the endorder visit.
+ * Candidates whose count exceeds pickmax are appended to bc[] until
+ * MAX_BARCODE_NUM entries exist. Neither the node nor its string is freed
+ * here; the new bc[] entry points at the node's string for both id and seq.
+ */
+void pickbest(const void *nodep, const VISIT which, const int depth)
+{
+    if (which != endorder && which != leaf)
+        return;
+
+    bnode *node = *(bnode **) nodep;
+
+    // keep only candidates above the threshold, and only while bc[] has room
+    if (node->cnt <= pickmax || bcnt >= MAX_BARCODE_NUM)
+        return;
+
+    bc[bcnt].seq.s = node->seq;
+    bc[bcnt].id.s = node->seq;
+    bc[bcnt].id.n = strlen(bc[bcnt].id.s);
+    bc[bcnt].seq.n = strlen(bc[bcnt].seq.s);
+    ++bcnt;
+}
+
+/*
+ * Print the fastq-multx command-line help, including the version string
+ * built from VERSION and SVNREV, to the stream f.
+ */
+void usage(FILE *f) {
+    fprintf(f,
+"Usage: fastq-multx [-g|-l|-B] <barcodes.fil> <read1.fq> -o r1.%%.fq [mate.fq -o r2.%%.fq] ...\n"
+"Version: %s.%d\n"
+"\n"
+"Output files must contain a '%%' sign which is replaced with the barcode id in the barcodes file.\n"
+"Output file can be n/a to discard the corresponding data (use this for the barcode read)\n"
+"\n"
+"Barcodes file (-B) looks like this:\n"
+"\n"
+"<id1> <sequence1>\n"
+"<id2> <sequence2> ...\n"
+"\n"
+"Default is to guess the -bol or -eol based on clear stats.\n"
+"\n"
+"If -g is used, then its parameter is an index lane, and frequently occurring sequences are used.\n"
+"\n"
+"If -l is used then all barcodes in the file are tried, and the *group* with the *most* matches is chosen.\n"
+"\n"
+"Grouped barcodes file (-l or -L) looks like this:\n"
+"\n"
+"<id1> <sequence1> <group1>\n"
+"<id1> <sequence1> <group1>\n"
+"<id2> <sequence2> <group2>...\n"
+"\n"
+"Mated reads, if supplied, are kept in-sync\n"
+"\n"
+"Options:\n"
+"\n"
+"-o FIL1 Output files (one per input, required)\n"
+"-g SEQFIL Determine barcodes from indexed read SEQFIL\n"
+"-l BCFIL Determine barcodes from any read, using BCFIL as a master list\n"
+"-L BCFIL Determine barcodes from <read1.fq>, using BCFIL as a master list\n"
+"-B BCFIL Use barcodes from the specified file, don't run a determination step\n"
+"-b Force beginning of line (5') for barcode matching\n"
+"-e Force end of line (3') for barcode matching\n"
+"-t NUM Divide threshold for auto-determine by factor NUM (1), > 1 = more sensitive\n"
+"-G NAME Use group(s) matching NAME only\n"
+"-x Don't trim barcodes off before writing out destination\n"
+"-n Don't execute, just print likely barcode list\n"
+"-v C Verify that mated id's match up to character C (Use ' ' for illumina)\n"
+"-m N Allow up to N mismatches, as long as they are unique (1)\n"
+"-d N Require a minimum distance of N between the best and next best (2)\n"
+"-q N Require a minimum phred quality of N to accept a barcode base (0)\n"
+    ,VERSION,SVNREV);
+}
diff --git a/fastq-stats.cpp b/fastq-stats.cpp
new file mode 100644
index 0000000..0a2d092
--- /dev/null
+++ b/fastq-stats.cpp
@@ -0,0 +1,672 @@
+/*
+Copyright (c) 2011 Expression Analysis / Gunjan Hariani, Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+$Id: fastq-stats.cpp 652 2013-09-17 17:40:32Z earonesty $
+*/
+const char * VERSION = "1.01 $Id: fastq-stats.cpp 652 2013-09-17 17:40:32Z earonesty $";
+
+#include <ctype.h>
+#include <stdio.h>
+
+/*
+ * Print the fastq-stats command-line help (options and a description of the
+ * statistics written to stdout), including VERSION, to the stream f.
+ */
+void usage( FILE * f ) {
+    fprintf( f,
+        "\nUsage: fastq-stats [options] <fastq-file>\n\n"
+        "Version: %s\n"
+        "\n"
+        "Produces lots of easily digested statistics for the files listed\n"
+        "\n"
+        "Options\n"
+        "\n"
+        "-c cyclemax: max cycles for which following quality stats are produced [35]\n"
+        "-w INT window: max window size for generating duplicate read statistics [2000000]\n"
+        "-d debug: prints out debug statements\n"
+        "-D don't do duplicate read statistics\n"
+        "-s INT number of top duplicate reads to display\n"
+        "-x FIL output fastx statistics (requires an output filename)\n"
+        "-b FIL output base breakdown by per phred quality at every cycle.\n"
+        " It sets cyclemax to longest read length\n"
+        "-L FIL Output length counts \n"
+        "-g FIL output gc content distribution (requires an output filename)\n"
+        "-h print this help\n\n"
+
+        "\n"
+        "The following data are printed to stdout:\n" "\n"
+        " reads : #reads in the fastq file\n"
+        " len : read length. mean and stdev are provided for variable read lengths\n"
+        " phred : phred scale used\n"
+        " window-size : Number of reads used to generate duplicate read statistics\n"
+        " cycle-max : Number of bases to assess for duplicity\n"
+        " dups : Number of reads that are duplicates\n"
+        " %%dup : Pct reads that are duplicate\n"
+        " unique-dup seq : Number sequences that are duplicated\n"
+        " min dup count : Smallest duplicate tally for any duplicate sequence\n"
+        " dup seq <rank> <count> <sequence> \n"
+        " : Lists top 10 most frequent duplicate reads along with count mean and stdev\n"
+        " qual : Base Quality min, max and mean\n"
+        " %%A,%%T,%%C,%%G : base percentages\n"
+        " total bases : total number of bases\n"
+        "\n"
+        ,VERSION);
+
+} //end usage function
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <string>
+#include <sparsehash/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <iostream>
+#include "fastq-lib.h"
+#include "gcModel.h"
+
+using namespace std;
+
+// Offsets of the four bases from 'A' (ASCII 65). main() indexes its 26-entry
+// per-letter count tables with toupper(base)-65, so these select A, C, G and T.
+#define T_A 0
+#define T_C 2
+#define T_G 6
+#define T_T 19
+// Adds 0.5 and truncates to long. The argument is pasted without parentheses,
+// so compound expressions must be parenthesized by the caller.
+#define roundgt0(x) (long)(x+0.5)
+
+// One sequence together with the number of times it was counted.
+class ent {
+public:
+    std::string seq;   // the sequence text
+    int cnt;           // its tally
+
+    ent(const std::string &s, int c) : seq(s), cnt(c) {}
+
+    // Ordering predicate for sorting by descending count.
+    static bool comp_cnt (const ent &a, const ent &b) {
+        return b.cnt < a.cnt;
+    }
+
+};
+
+// Per-cycle accumulator: base tallies plus quality tallies and extremes.
+class countPerCycle {
+    public:
+    int basecount[26];        // one slot per letter A..Z
+    int qc;                   // number of quality values accumulated
+    double qsum;              // sum of the accumulated quality values
+    //vector<int> qual;
+    int counts_by_qual[127];  // tally per raw quality byte
+    int qmin;                 // smallest quality byte seen (starts high)
+    int qmax;                 // largest quality byte seen (starts at 0)
+
+    // Start with every tally at zero and qmin above any real quality value.
+    countPerCycle() : qc(0), qsum(0), qmin(10000), qmax(0) {
+        memset(basecount, 0, sizeof(basecount));
+        memset(counts_by_qual, 0, sizeof(counts_by_qual));
+    }
+};
+
+// Per-cycle tally of raw quality bytes only (no base counts).
+class count_perCycle_perQual {
+    public:
+    int counts_by_qual[127];  // tally per raw quality byte
+
+    // All tallies start at zero.
+    count_perCycle_perQual() {
+        memset(counts_by_qual, 0, sizeof(counts_by_qual));
+    }
+};
+
+
+void usage( FILE * f );
+double std_dev( double count , double total, double sqsum );
+double quantile( const std::vector <int> & vec, double p );
+double quantiles_with_counts(int* v, int start, int end, double p, bool dbug);
+std::string string_format( const std::string &fmt, ... );
+
+// getopt() state; main() resets it to 0 before parsing and uses it afterwards
+// to pick the input file name out of argv.
+extern int optind;
+// -D: skip the duplicate-read statistics
+bool nodup = 0;
+// sequence (truncated to cyclemax bases) -> number of times seen
+google::sparse_hash_map <std::string, int> dups;
+
+vector <std::string> dup_reads; // do i need this
+
+// -w: number of leading reads that may add new entries to dups
+int window = 2000000;
+// -c: number of leading bases per read used for quality/base stats and duplicate keys
+int cyclemax = 35;
+int gcCyclemax = 100; // to compare with fastqc, seq is rounded to nearest 100 to reduce # of gc models; for < 200 length, this is the same as max=100
+// running sum of per-read GC fractions, and the number of reads in that sum
+float gcSum;
+int gcTotal;
+
+// -s: how many of the most frequent duplicate sequences to list
+int show_max = 10;
+// -d: print debug output
+bool debug = 0;
+// -x FIL: write the fastx statistics table to fastx_outfile
+bool fastx = 0;
+char *fastx_outfile = NULL;
+// -b FIL: write the per-cycle quality breakdown to brkdown_outfile
+bool brkdown = 0;
+char *brkdown_outfile = NULL;
+// -L FIL: write the read-length histogram to lenhist_outfile
+bool len_hist = 0;
+vector<int> vlen; //all read lengths
+char *lenhist_outfile = NULL;
+// -g FIL: write the GC distribution to gc_outfile
+bool gc = 0;
+char *gc_outfile = NULL;
+
+/* Slot of a base character in a 26-entry per-letter table ('A' -> 0 ... 'Z' -> 25).
+   Anything that is not a letter (for example '.') is counted in the 'N' slot
+   instead of indexing outside the table. */
+static int stats_base_slot(char base) {
+    int slot = toupper((unsigned char) base) - 'A';
+    return (slot >= 0 && slot < 26) ? slot : ('N' - 'A');
+}
+
+/* Raw quality byte as an index that is always valid for the 127-entry
+   counts_by_qual tables (values above 126 are clamped to 126). */
+static int stats_qual_slot(char q) {
+    int v = (unsigned char) q;
+    return v > 126 ? 126 : v;
+}
+
+/* Open an output table for writing; on failure report it on stderr and return NULL. */
+static FILE *stats_open_output(const char *path) {
+    FILE *f = fopen(path, "wd");
+    if (!f)
+        fprintf(stderr, "Error opening file '%s': %s\n", path, strerror(errno));
+    return f;
+}
+
+/*
+ * fastq-stats entry point.
+ *
+ * Parses the options, reads every record from the named fastq file (or stdin
+ * when no file is given), accumulates length, quality, base-composition,
+ * duplicate and optional GC statistics, writes the optional -x/-b/-L/-g
+ * tables and prints the summary to stdout.
+ *
+ * Returns 0 on success or after printing help, 1 on a usage error or when an
+ * output file cannot be opened, otherwise the result of closing the input.
+ */
+int main( int argc, char**argv ) {
+
+    int c;      // int, not char: getopt() signals the end with -1
+    optind = 0;
+    char *filename = NULL;
+
+    // No argc check on purpose: without a file argument the reads come from stdin.
+
+    while ( (c = getopt (argc, argv, "?DdL:g:x:b:c:w:s:h")) != -1) {
+        switch (c) {
+            case 'c': cyclemax = atoi(optarg); break;
+            case 'D': nodup = 1; break;
+            case 'd': debug = 1; break;
+            case 'w': window = atoi(optarg); break;
+            case 's': show_max = atoi(optarg); break;
+            case 'x': fastx_outfile = optarg; fastx = 1; break;
+            case 'b': brkdown_outfile = optarg; brkdown = 1; break;
+            case 'L': len_hist = 1; lenhist_outfile = optarg; break;
+            case 'g': gc_outfile = optarg; gc = 1; break;
+            case 'h': usage(stdout); return 0;
+            case '?':
+                // optopt unset: the user typed -? and wants help; otherwise it is a usage error
+                if (!optopt) {
+                    usage(stdout); return 0;
+                }
+                usage(stderr);
+                return 1;
+        }
+    }
+
+    filename = argv[optind];
+
+    int lenmax = 0;
+    int lenmin = 100000000;
+    double lensum = 0;
+    double lenssq = 0;
+    double nbase = 0;
+    int qualmax = 0;
+    int qualmin = 100000;
+    double qualsum = 0;
+    double qualssq = 0;
+    int errs = 0;
+    long long nreads = 0;
+    int ndups = 0;
+    double dupss = 0;
+    bool fixlen = 0; //is fixed length
+    FILE *file;
+    struct fq newFq; meminit(newFq);
+    bool isgz;
+    vector<countPerCycle> qcStats (1);
+    vector<count_perCycle_perQual> qcStats_by_qual (1);
+    int phred = 64;
+    double ACGTN_count[26];
+    double total_bases = 0;
+
+
+    for(int i=0; i<26; i++) {
+        ACGTN_count[i] = 0;
+    }
+    dups.set_deleted_key("<>");
+
+    if(debug) {
+        cout << endl;
+        cout << "Parameters: " << endl;
+        printf("cyclemax: %d, window: %d, nodup: %d, debug: %d, showmax: %d, fastx: %d, outfile: %s, breakdown: %s, gc: %s\n",
+            cyclemax, window, nodup, debug, show_max, fastx, fastx_outfile, brkdown_outfile, gc_outfile);
+        cout << endl;
+    }
+
+    if(gc) {
+        gcInit(gcCyclemax);
+    }
+
+    //read file
+    file = filename ? gzopen(filename,"r",&isgz) : stdin;
+    // nreads is incremented by the loop condition, so inside the body it is the 1-based read number
+    while(read_fq(file,nreads++,&newFq)) {
+
+        if(newFq.seq.n != newFq.qual.n) {
+            errs++;
+        }
+
+        // NOTE(review): at this point lensum/lenssq cover only nreads-1 reads, so this
+        // zero-deviation test looks like it cannot succeed for real data - confirm intent.
+        if(nreads == 10000) {
+            if(!std_dev((double)nreads,lensum,lenssq)) {
+                fixlen = 1;
+            }
+        }
+
+        total_bases += newFq.seq.n;
+        if(len_hist) {
+            // vlen[n] needs n+1 slots, so grow when n == size() as well
+            if((size_t)newFq.seq.n >= vlen.size())
+                vlen.resize(newFq.seq.n+1);
+            ++vlen[newFq.seq.n];
+        }
+
+        if(!fixlen) {
+            if(newFq.seq.n > lenmax) {
+                lenmax = newFq.seq.n;
+            }
+            if(newFq.seq.n < lenmin) {
+                lenmin = newFq.seq.n;
+            }
+            lensum += newFq.seq.n;
+            lenssq += newFq.seq.n*newFq.seq.n;
+        }
+
+
+        if((newFq.seq.n > qcStats.size()) && (fastx)) {
+            qcStats.resize(newFq.seq.n,countPerCycle());
+
+        }
+
+        if((newFq.seq.n > qcStats_by_qual.size()) && (brkdown) && (!fastx)) {
+            qcStats_by_qual.resize(newFq.seq.n,count_perCycle_perQual());
+        }
+
+        int gcTally = 0;
+        //compute quality stats for the first cyclemax bases
+        for(int i=0; i < newFq.seq.n; i++) {
+            // both indexes are range-checked so odd input bytes cannot write outside the tables
+            int ascii_val = stats_qual_slot(newFq.qual.s[i]);
+            int base = stats_base_slot(newFq.seq.s[i]);
+            // past the first `window` reads only every tenth read is sampled
+            bool sampled = (nreads < window) || (nreads%10 == 0);
+            if(fastx && sampled) {
+                qcStats[i].qc++;
+                qcStats[i].counts_by_qual[ascii_val]++;
+                qcStats[i].qsum += ascii_val;
+                qcStats[i].basecount[base]++;
+                if(ascii_val < qcStats[i].qmin) {
+                    qcStats[i].qmin = ascii_val;
+                }
+                if(ascii_val > qcStats[i].qmax) {
+                    qcStats[i].qmax = ascii_val;
+                }
+            }
+            if(brkdown && (!fastx) && sampled) {
+                qcStats_by_qual[i].counts_by_qual[ascii_val]++;
+            }
+
+            if (i < cyclemax) {
+                nbase++;
+
+                if(ascii_val > qualmax) {
+                    qualmax = ascii_val;
+                }
+                if(ascii_val < qualmin) {
+                    qualmin = ascii_val;
+                }
+                qualsum += ascii_val;
+                qualssq += ascii_val*ascii_val;
+
+                ACGTN_count[base]++;
+            }
+
+            if (gc && i < gcCyclemax) {
+                if(base == T_G || base == T_C) {
+                    gcTally++;
+                }
+            }
+        }
+        if(gc) {
+            int gcReadLength = newFq.seq.n > gcCyclemax? gcCyclemax : newFq.seq.n;
+            // an empty read has no GC fraction; including it would turn gcSum into NaN
+            if(gcReadLength > 0) {
+                gcProcessSequence(gcReadLength, gcTally);
+                gcSum += (float)( gcTally )/gcReadLength;
+                gcTotal++;
+            }
+        }
+
+        if(!nodup) {//if you want to look at duplicate counts
+            // duplicates are keyed on the first cyclemax bases only
+            if(newFq.seq.n > cyclemax) {
+                newFq.seq.s[cyclemax] = '\0';
+                newFq.seq.n = cyclemax;
+            }
+
+            if(nreads < window) {
+                dups[newFq.seq.s]++;
+            } else {
+                if(dups.find(newFq.seq.s) != dups.end()) {
+                    dups[newFq.seq.s]++;
+                }//make sure the element already exists in the key
+
+                // when the window closes, drop everything seen only once
+                if(nreads==window) {
+                    google::sparse_hash_map<string,int>::iterator it = dups.begin();
+                    while(it != dups.end()) {
+                        if((*it).second <= 1) {
+                            dups.erase(it);
+                        }
+                        it++;
+                    } //end while loop
+                }
+            }//if nreads > window
+        } //end if you want to look for dups
+
+    } //end reading all fastq reads
+
+    // the failed read attempt that ended the loop was counted too
+    nreads--;
+
+    int inputReadError = gzclose(file, isgz);
+
+
+    if(gc) {
+        FILE *myfile;
+        myfile = fopen(gc_outfile, "w");
+        // gcPrintDistribution() writes to stdout when handed NULL
+        gcPrintDistribution(myfile);
+        if(myfile) {
+            fclose(myfile);
+        }
+        gcClose();
+    }
+
+    std::vector<ent> dup_sort;
+    google::sparse_hash_map<string,int>::iterator it = dups.begin();
+    while(it != dups.end()) {
+        if((*it).second > 1) {
+            ent e((*it).first,(*it).second);
+            //printf("seq: %s dups:%d\n", e.seq.c_str(), e.cnt);
+            dup_sort.push_back(e);
+            ndups += (*it).second;
+            dupss += (*it).second*(*it).second;
+        }
+        it++;
+    } //end while loop
+    dups.clear();
+
+    std::sort(dup_sort.begin(),dup_sort.end(),ent::comp_cnt);
+
+    if(nreads < window) {
+        window = nreads;
+    }
+
+    if(nreads < 1) {
+        // filename is NULL when reading stdin; streaming a NULL char* is undefined
+        cout << "No reads in " << (filename ? filename : "stdin") << ", not generating output" << endl;
+        return 0;
+    }
+    //autodetect phred
+    if(qualmin < 64) {
+        phred = 33;
+    }
+    printf("reads\t%lld\n",nreads);
+
+    if(!fixlen) {
+        printf("len\t%d\n", lenmax);
+        printf("len mean\t%.4f\n", (double)lensum/nreads);
+        if(nreads > 1) {
+            printf("len stdev\t%.4f\n", std_dev((double)nreads,lensum,lenssq));
+        }
+        printf("len min\t%d\n", lenmin);
+    } else {
+        printf("len\t%d\n",lenmax);
+    }
+
+    printf("phred\t%d\n", phred);
+    if(errs > 0) {
+        printf("errors\t%d\n", errs);
+    }
+
+
+    printf("window-size\t%d\n", window);
+    printf("cycle-max\t%d\n", cyclemax);
+
+    if(fastx) {
+
+        if(brkdown) {
+            FILE *myfile = stats_open_output(brkdown_outfile);
+            if(!myfile) {
+                return 1;
+            }
+            fprintf(myfile,"Cycle\tQuality\tCount\n");
+
+            for(int i=0; i<qcStats.size(); i++) {
+                for(int j=qcStats[i].qmin; j<=qcStats[i].qmax; j++) {
+                    fprintf(myfile,"%d\t%d\t%d\n",(i+1),(j-phred),qcStats[i].counts_by_qual[j]);
+                }
+            }
+            fclose(myfile);
+        }
+
+
+        FILE *myfile = stats_open_output(fastx_outfile);
+        if(!myfile) {
+            return 1;
+        }
+        fprintf(myfile,"column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\tA_count\tC_count\tG_count\tT_count\tN_count\tMax_count\n");
+        for(int i=0; i<qcStats.size(); i++) {
+            int A_tot = 0;
+            int C_tot = 0;
+            int G_tot = 0;
+            int T_tot = 0;
+            int N_tot = 0;
+            // every letter other than A/C/G/T is reported as N
+            for(int j=0; j<26; j++) {
+                if(j==T_A) {
+                    A_tot += qcStats[i].basecount[j];
+                } else if(j==T_C) {
+                    C_tot += qcStats[i].basecount[j];
+                } else if(j==T_G) {
+                    G_tot += qcStats[i].basecount[j];
+                } else if(j==T_T) {
+                    T_tot += qcStats[i].basecount[j];
+                } else {
+                    N_tot += qcStats[i].basecount[j];
+                }
+            }
+
+            double q1 = quantiles_with_counts(qcStats[i].counts_by_qual,qcStats[i].qmin,qcStats[i].qmax,.25,0)-phred;
+            double med = quantiles_with_counts(qcStats[i].counts_by_qual,qcStats[i].qmin,qcStats[i].qmax,.5,0)-phred;
+            double q3 = quantiles_with_counts(qcStats[i].counts_by_qual,qcStats[i].qmin,qcStats[i].qmax,.75,0)-phred;
+
+            double iqr = q3-q1;
+            int lW = 0;
+            int rW = 0;
+
+            // left whisker: lowest observed quality at or above Q1 - 1.5*IQR
+            int low_bound = round(q1-iqr*1.5);
+            if(low_bound <= (qcStats[i].qmin-phred)) {
+                lW = qcStats[i].qmin-phred;
+            } else {
+                for(int low=(low_bound+phred);low<=qcStats[i].qmax;low++) {
+                    if(qcStats[i].counts_by_qual[low] > 0) {
+                        lW = low-phred;
+                        low = qcStats[i].qmax+1;
+                    }
+                }
+            }
+
+            // right whisker: highest observed quality at or below Q3 + 1.5*IQR
+            int up_bound = round(q3+iqr*1.5);
+            if(up_bound >= (qcStats[i].qmax-phred)) {
+                rW = qcStats[i].qmax-phred;
+            } else {
+                for(int up=(up_bound+phred);up>=qualmin;up--) {
+                    if(qcStats[i].counts_by_qual[up] > 0) {
+                        rW = up-phred;
+                        up = qcStats[i].qmin-1;
+                    }
+                }
+            }
+
+            fprintf(myfile,"%d\t%d\t%d\t%d\t%.0f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t", (i+1), qcStats[i].qc, (qcStats[i].qmin-phred),
+                (qcStats[i].qmax-phred), (qcStats[i].qsum-qcStats[i].qc*phred),
+                (qcStats[i].qsum/qcStats[i].qc-phred),
+                q1, med, q3,iqr, lW, rW);
+            fprintf(myfile,"%d\t%d\t%d\t%d\t%d\t%lld\n", A_tot, C_tot, G_tot, T_tot, N_tot,nreads);
+
+        }
+        fclose(myfile);
+    }
+
+    if(brkdown && (!fastx)) {
+        FILE *myfile = stats_open_output(brkdown_outfile);
+        if(!myfile) {
+            return 1;
+        }
+        fprintf(myfile,"Cycle\tQuality\tCount\n");
+        for(int i=0; i<qcStats_by_qual.size(); i++) {
+            for(int j=qualmin; j<=qualmax; j++) {
+                fprintf(myfile,"%d\t%d\t%d\n",(i+1),(j-phred),qcStats_by_qual[i].counts_by_qual[j]);
+            }
+        }
+        fclose(myfile);
+    }
+
+    if(len_hist) {
+        FILE *myfile = stats_open_output(lenhist_outfile);
+        if(!myfile) {
+            return 1;
+        }
+        fprintf(myfile,"Length\tCount\n");
+        // strictly less than size(): vlen[size()] does not exist
+        for(int len_i=0; len_i<(int)vlen.size(); len_i++) {
+            if(vlen[len_i]) {
+                fprintf(myfile,"%d\t%d\n", len_i,vlen[len_i]);
+            }
+        }
+        fclose(myfile);
+    }
+
+    int uniq_dup = (int)dup_sort.size();
+    if(debug) {
+        cout << endl;
+        cout << "unique duplicates\t" << uniq_dup << endl;
+        cout << "total duplicates\t" << ndups << endl;
+        cout << endl;
+    }
+    if (uniq_dup && !nodup) {
+        // ndups counts every occurrence, so subtract one per distinct sequence
+        printf("dups\t%d\n",ndups-uniq_dup);
+        printf("%%dup\t%.4f\n", ((double)(ndups-uniq_dup)/nreads)*100);
+        printf("unique-dup seq\t%d\n", uniq_dup);
+        printf("min dup count\t%d\n", dup_sort.back().cnt);
+
+
+        for(int i=0; i<show_max; i++) {
+            if(i < dup_sort.size()) {
+                if(dup_sort.at(i).cnt != 0) {
+                    cout << "dup seq \t" << (i+1) << "\t" << (dup_sort.at(i).cnt-1) << "\t" << dup_sort.at(i).seq << endl;
+                }
+            } else { i = show_max; }
+        }
+
+        if(uniq_dup > 1) {
+            printf("dup mean\t%.4f\n", (double)ndups/uniq_dup);
+            printf("dup stddev\t%.4f\n", (std_dev((double)uniq_dup, ndups, dupss)));
+        }
+    }
+    printf("qual min\t%d\n", qualmin-phred);
+    printf("qual max\t%d\n", qualmax-phred);
+    printf("qual mean\t%.4f\n", ((double)qualsum/nbase)-phred);
+    printf("qual stdev\t%.4f\n", std_dev((double)nbase,qualsum,qualssq));
+
+
+    if(gc) {
+        // put these where they belong
+        printf("pct-gc cycle-max\t%d\n", gcCyclemax);
+        printf("pct-gc mean\t%.2f\n", 100.0 * gcSum / gcTotal);
+    }
+
+    printf("%%A\t%.4f\n", ((double)ACGTN_count[T_A]/nbase*100));
+    printf("%%C\t%.4f\n", ((double)ACGTN_count[T_C]/nbase*100));
+    printf("%%G\t%.4f\n", ((double)ACGTN_count[T_G]/nbase*100));
+    printf("%%T\t%.4f\n", ((double)ACGTN_count[T_T]/nbase*100));
+    double ACGT_total = ACGTN_count[T_A] + ACGTN_count[T_C] + ACGTN_count[T_G] + ACGTN_count[T_T];
+    printf("%%N\t%.4f\n", ((double)(nbase-ACGT_total)/nbase*100));
+    printf("total bases\t%.0f\n",total_bases);
+
+    if (inputReadError) {
+        printf("error\t%s\n", "error during close, output may be invalid");
+    }
+
+    // fail if input read failed.... even if we don't know why and reported all the stats
+    return inputReadError;
+
+} //end main method
+
+/*
+ * p-quantile of vec by linear interpolation between neighbouring elements.
+ *
+ * The position (size-1)*p is split into an integer index and a fractional
+ * part; the result is vec[index] moved toward vec[index+1] by that fraction.
+ * vec is read as given (presumably the caller passes it sorted) and must not
+ * be empty.
+ */
+double quantile( const std::vector <int> & vec, double p ) {
+    const int count = vec.size();
+    const double pos = ( (double) count - 1 ) * p;
+    const int idx = (int) pos;
+    const int here = vec[idx];
+
+    if ( pos > (double) idx ) {
+        const double frac = pos - idx;
+        return here + frac * ( vec[idx + 1] - here );
+    }
+    return here;
+} //end quantile function
+
+/*
+ * printf-style formatting into a std::string.
+ *
+ * Formats into a buffer that starts at 100 bytes and is enlarged until the
+ * output fits. The returned string holds exactly the formatted characters:
+ * it is trimmed to the length vsnprintf() reports rather than being left at
+ * the buffer size padded with NUL bytes.
+ */
+std::string string_format( const std::string &fmt, ... ) {
+    int size = 100;
+    std::string str;
+    va_list ap;
+    while (1) {
+        str . resize(size);
+        va_start( ap, fmt );
+        // write through the string's own storage instead of casting away c_str()'s const
+        int n = vsnprintf( &str[0], size, fmt . c_str(), ap );
+        va_end(ap);
+        if ( n > -1 && n < size ) {
+            str . resize(n);
+            return str;
+        }
+        // n >= size: exact size needed is known; n < 0: unknown, so just double
+        if ( n > -1 ) size = n + 1;
+        else size *= 2;
+    }
+} //end string_format function
+
+/*
+ * Sample standard deviation from running sums.
+ *
+ * count - number of values
+ * total - sum of the values
+ * sqsum - sum of the squared values
+ *
+ * Returns 0 when fewer than two values are given. A variance that comes out
+ * slightly negative through floating-point rounding (all values equal) is
+ * treated as 0 instead of producing NaN.
+ */
+double std_dev(double count, double total, double sqsum) {
+    if(debug) {
+        cout << endl;
+        cout << "count " << count << " total " << total << " sqsum " << sqsum << endl;
+        cout << endl;
+    }
+    if(count <= 1) {
+        return 0;
+    }
+    double variance = sqsum/(count-1)-(total/count *total/(count-1));
+    if(variance < 0) {
+        variance = 0;
+    }
+    return sqrt(variance);
+}
+
+/*
+ * p-quantile of a histogram.
+ *
+ * v[i] is the number of observations with value i, for i in [start, end].
+ * With n observations in total the quantile sits at rank q = p*(n-1). The
+ * value holding rank floor(q) is located and, when q has a fractional part,
+ * the result is interpolated toward the value holding the next rank.
+ * Returns -1 when the range holds no observations. dbug prints the
+ * intermediate values to stdout.
+ */
+double quantiles_with_counts(int *v, int start, int end, double p, bool dbug) {
+    int total = 0;
+    for(int k=start; k<=end; k++) {
+        if(dbug)
+            std::cout << "i: " << k << " v[i]: " << v[k] << std::endl;
+        total += v[k];
+    }
+
+    const double q = p*(total-1);
+    const int rank = (int) q;
+
+    if(dbug) {
+        std::cout << "p : " << p << std::endl;
+        std::cout << "v-size: " << total << std::endl;
+        std::cout << "q : " << q << std::endl;
+        std::cout << "count-skip: " << rank << std::endl;
+    }
+
+    double lower = -1;      // value at rank floor(q)
+    int upper = -1;         // value at the following rank
+    bool have_lower = false;
+    int seen = 0;
+    for(int k=start; k<=end; k++) {
+        seen += v[k];
+        if(!have_lower && seen>rank) {
+            lower = k;
+            have_lower = true;
+            if(dbug)
+                std::cout << "val : " << lower << " val-count: " << v[k] << std::endl;
+        }
+        if(seen>(rank+1)) {
+            upper = k;
+            if(dbug)
+                std::cout << "val_next : " << upper << " val-count: " << v[k] << std::endl;
+            break;
+        }
+    }
+
+    if(!(q > rank))
+        return lower;
+
+    if(dbug)
+        std::cout << "v_next - val " << (upper-lower) << std::endl;
+    return (lower + (q-rank)*(upper-lower));
+}
+
diff --git a/fastx-graph b/fastx-graph
new file mode 100755
index 0000000..a06a5ab
--- /dev/null
+++ b/fastx-graph
@@ -0,0 +1,149 @@
+#!/usr/bin/Rscript --vanilla
+
+# Copyright (c) 2011 Expression Analysis / Gunjan Hariani, Erik Aronesty
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# $Id: fastx-graph 525 2012-12-25 19:41:22Z earonesty $
+
+# Load getopt (command-line parsing); if it is not installed, announce the
+# host on stderr, install it from R-Forge and load it.
+if (!require(getopt)) {
+ write(c("Installing package on:",system("hostname",intern=T)),file=stderr())
+ install.packages('getopt',repos='http://R-Forge.R-project.org')
+ # will die if it fails at this point
+ library(getopt)
+}
+
+# Same for Hmisc, which supplies minor.tick() used for the axis ticks below.
+if (!require("Hmisc")) {
+ write(c("Installing package on:",system("hostname",intern=T)),file=stderr())
+ install.packages('Hmisc',repos='http://R-Forge.R-project.org')
+ # will die if it fails at this point
+ library("Hmisc")
+}
+
+# getopt specification, one row per option:
+# long name, short flag, argument mode (1 = takes a value, 0 = flag), type, description
+spec <- matrix(c(
+ 'input' , 'i', 1, "character", "file from fastq-stats -x (required)",
+ 'gc' , 'G', 1, "character", "input gc content file (optional)",
+ 'out' , 'o', 1, "character", "output filename (optional)",
+ 'help' , 'h', 0, "logical", "this help"
+),ncol=5,byrow=T)
+
+opt = getopt(spec);
+
+# print the usage text and quit when help is requested or no input file is given
+if (!is.null(opt$help) || is.null(opt$input)) {
+ cat(paste(getopt(spec, usage=T, command="fastx-graph"),"\n"));
+ q();
+}
+
+in.file <- opt$input
+gc.file <- opt$gc
+out.file <- opt$out
+
+# per-cycle statistics table written by fastq-stats -x (tab separated, with header)
+fastx <- read.table(in.file,sep="\t",header=T,as.is=T)
+
+# output is in, minus txt, plus png
+if (is.null(out.file)) {
+    # the dot is escaped so that only a literal ".txt" extension is removed;
+    # an unescaped "." would also strip names that merely end in e.g. "_txt"
+    in.file <- gsub("\\.txt$","",in.file,perl=T)
+    out.file <- paste(in.file,".png",sep="")
+}
+
+# png() treats '%' in a file name as the start of a page-number format,
+# so a literal % has to be doubled
+out.file <- gsub("%","%%",out.file)
+
+# 1000x500 PNG device; everything below is drawn onto this one page
+png(out.file,width=1000, height=500)
+
+# empty frame: x spans the cycles, y spans the quality range (at least 0..41)
+par(mar=c(4,3,5,5),xaxs="i",yaxs="i",xpd=T)
+plot(c(0,0),pch="",ylim=c(min(c(0,fastx$lW)),max(41,fastx$rW)),xlim=c(0,(nrow(fastx)+1)),
+ xlab="",ylab="",las=2,cex.axis=.85,cex.lab=.85,xaxt="n")
+
+# major ticks follow R's default x axis layout; minor ticks subdivide them
+lims <- par("xaxp")[1:2]
+major.ticks <- pretty(lims,n=par("xaxp")[3])
+minor.tick(nx=(major.ticks[2]-major.ticks[1]),ny=10)
+
+# one labelled minor tick per cycle along the top axis
+minor.ticks <- 1:nrow(fastx)
+mtext("Cycle",side=3,line=3,cex=.85)
+axis(1,at=major.ticks,cex.axis=.85,labels=F)
+axis(3,at=major.ticks,cex.axis=.85,labels=F)
+axis(3,at=minor.ticks,tcl=par("tcl")*0.5,labels=minor.ticks,cex.axis=.80,las=3)
+
+# force the expected column names regardless of what the header row said
+colnames(fastx) <- c("column","count","min","max","sum","mean","Q1","med",
+ "Q3","IQR","lW","rW","A_count","C_count","G_count",
+ "T_count","N_count","Max_count")
+
+# hand-drawn box plot per cycle: grey Q1..Q3 box, red median line,
+# dashed whiskers out to lW and rW with a cap at each end
+for(i in 1:nrow(fastx)) {
+ par(new=T)
+ rect(fastx$column[i]-.35,fastx$Q1[i],fastx$column[i]+0.35,fastx$Q3[i],col="gray")
+ segments(fastx$column[i]-.35,fastx$med[i],fastx$column[i]+0.35,fastx$med[i],col="red",lwd=1.5)
+ segments(fastx$column[i],fastx$lW[i],fastx$column[i],fastx$Q1[i],lty="dashed")
+ segments(fastx$column[i],fastx$Q3[i],fastx$column[i],fastx$rW[i],lty="dashed")
+ segments(fastx$column[i]-.35,fastx$lW[i],fastx$column[i]+0.35,fastx$lW[i])
+ segments(fastx$column[i]-.35,fastx$rW[i],fastx$column[i]+0.35,fastx$rW[i])
+}
+
+# if theres a significant difference
+# tots = number of bases counted at each cycle (A+C+G+T+N)
+tots<-fastx$N_count+fastx$T_count+fastx$G_count+fastx$C_count+fastx$A_count
+
+# overlay the per-cycle totals (grey '+' marks joined by a dashed line) only
+# when the smallest total is more than 2% below the largest
+if (min(tots) < (max(tots)*.98)) {
+ par(new=T)
+ plot(tots,col="gray",xaxt="n",yaxt="n",xlab="",ylab="",pch="+",ylim=c(min(tots),max(tots)))
+ lines(fastx$N_count+fastx$T_count+fastx$G_count+fastx$C_count+fastx$A_count,col="gray",xaxt="n",yaxt="n",xlab="",ylab="", lty=2)
+}
+
+# percentage of each base per cycle, overlaid on a 0..100 scale:
+# A red, C blue, G green, T black as lines, N as orange bars
+par(new=T)
+plot(fastx$column,fastx$A_count*100/fastx$count,col="red",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+ ylim=c(0,100),lwd=2)
+par(new=T)
+plot(fastx$column,fastx$C_count*100/fastx$count,col="blue",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+ ylim=c(0,100),lwd=2)
+par(new=T)
+plot(fastx$column,fastx$G_count*100/fastx$count,col="green",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+ ylim=c(0,100),lwd=2)
+par(new=T)
+plot(fastx$column,fastx$T_count*100/fastx$count,col="black",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+ ylim=c(0,100),lwd=2)
+par(new=T)
+barplot(fastx$N_count*100/fastx$count,col="orange",xaxt="n",yaxt="n",xlab="",ylab="",
+ ylim=c(0,100))
+
+# right-hand axis carries the percentage scale; the left axis is base quality
+axis(4,at=seq(0,100,25),cex.axis=.85)
+mtext("Pct Base Distribution",side=4,line=3,cex=.85)
+mtext("Base Quality",side=2,line=2,cex=.85)
+legend(0,-4,col=c("red","blue","green","black","orange"),lty=1,
+ legend=c("A","C","G","T","N"),cex=.70,horiz=T,lwd=2)
+
+# optional inset: %GC-per-read distribution read from the -G file
+if(!is.null(gc.file)) {
+ # first pass without a header, only to recognise the file format from its first cell
+ tmp <- read.table(gc.file,sep="\t",header=F,as.is=T)
+ if (tmp[1,1] == 'pct-GC') {
+ # silly legacy format
+ GC <- read.table(gc.file,sep="\t",header=T,as.is=T,skip=3)
+ } else if (tmp[1,1] == 'pct_GC') {
+ # NOTE(review): with skip=1 and header=T the line after the "pct_GC" line is
+ # consumed as the header - confirm this format really has an extra line there
+ GC <- read.table(gc.file,sep="\t",header=T,as.is=T,skip=1)
+ } else {
+ GC <- read.table(gc.file,sep="\t",header=T,as.is=T)
+ }
+ colnames(GC)=c("pct_gc", "count")
+ # draw the curve in a small sub-region of the existing figure
+ par(new=T)
+ par(fig=c(0.1,0.2,0.45,0.60))
+ par(mar=c(0,0,1,0))
+ plot(GC$pct_gc,GC$count,type="l",xaxt="n",yaxt="n",
+ main="%GC per read",cex.main=.90)
+ axis(1,seq(0,100,20),labels=seq(0,100,20),las=2,tck=-0.1,cex.axis=.75)
+}
+
+
+# close the PNG device, which writes the file
+graphics.off()
diff --git a/gcModel.c b/gcModel.c
new file mode 100644
index 0000000..9627dc2
--- /dev/null
+++ b/gcModel.c
@@ -0,0 +1,207 @@
+/*
+$Id: gcModel.c 564 2013-03-08 17:16:42Z earonesty $
+*/
+#include <ctype.h>
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <string>
+#include <iostream>
+
+#include "fastq-lib.h"
+#include "gcModel.h"
+
+// #define UNIT_TEST
+
+void gcInit(int maxReadLength);
+void gcProcessSequence(int l,int c);
+void gcPrintDistribution(FILE *fp);
+void gcClose();
+
+using namespace std;
+
+#define roundgt0(x) (long)(x<0.5?0:x+0.5)
+
+// One contribution of a read to the GC histogram: add `increment` to the
+// bucket for `percentage`.
+typedef struct GCModelValue {
+ int percentage;
+ double increment;
+} GC_MODEL_VALUE;
+
+// All contributions made by one (read length, GC count) combination.
+typedef struct GCModelValues {
+ GC_MODEL_VALUE * values;
+ int valuesLength;
+} GC_MODEL_VALUES;
+
+// Model for one read length: an array indexed by GC count.
+typedef GC_MODEL_VALUES *GC_MODELS;
+
+
+// scratch tally used by calcModels(): how many GC counts claim each percentage
+static int claimingCounts[101];
+// accumulated histogram over 0..100 percent GC
+static double gcDistribution[101];
+// cachedModels[readLength][gcCount], built by gcInit()
+static GC_MODELS *cachedModels;
+// largest read length gcInit() built models for; -1 until gcInit() runs
+static int gMaxReadLength = -1;
+
+// Whole-percent range [*low, *high] claimed by a read with `pos` G/C bases out
+// of readLength: the count is widened by +-0.5, clamped to [0, readLength] and
+// each end is rounded to a percentage. A zero-length read claims 0% only, which
+// avoids dividing by zero.
+static void gcPercentageRange(int pos, int readLength, int *low, int *high) {
+    double lowCount = pos-0.5;
+    double highCount = pos+0.5;
+
+    if (lowCount < 0.0) lowCount = 0.0;
+    if (highCount < 0.0) highCount = 0.0;
+    if (highCount > readLength) highCount = readLength;
+    if (lowCount > readLength) lowCount = readLength;
+
+    if (readLength <= 0) {
+        *low = 0;
+        *high = 0;
+        return;
+    }
+
+    *low = (int)roundgt0(((lowCount*100) / readLength));
+    *high = (int)roundgt0(((highCount*100) / readLength));
+}
+
+/*
+ * Build the GC model for reads of the given length.
+ *
+ * Returns a heap array with readLength+1 entries, indexed by GC count. Entry
+ * `pos` lists every percentage bucket that GC count contributes to, each with
+ * the weight 1/(number of GC counts claiming that bucket). The caller owns the
+ * array and each entry's `values`. Exits the process when memory runs out.
+ */
+GC_MODEL_VALUES *calcModels(int readLength) {
+
+    memset(claimingCounts,0,sizeof(claimingCounts));
+
+    GC_MODEL_VALUES *models = (GC_MODEL_VALUES *) calloc(readLength+1, sizeof(GC_MODEL_VALUES ));
+    if (!models) {
+        fprintf(stderr, "Out of memory building GC model for read length %d\n", readLength);
+        exit(1);
+    }
+
+    // First pass: count how many GC counts claim each percentage bucket.
+    for (int pos=0;pos<=readLength;pos++) {
+        int lowPercentage, highPercentage;
+        gcPercentageRange(pos, readLength, &lowPercentage, &highPercentage);
+
+        for (int p=lowPercentage;p<=highPercentage;p++) {
+            claimingCounts[p]++;
+        }
+    }
+
+    // We now do a second pass to make up the model using the weightings
+    // we calculated previously.
+
+    for (int pos=0;pos<=readLength;pos++) {
+        int lowPercentage, highPercentage;
+        gcPercentageRange(pos, readLength, &lowPercentage, &highPercentage);
+
+        int span = (highPercentage-lowPercentage)+1;
+        models[pos].values = (GC_MODEL_VALUE *) calloc(span, sizeof(GC_MODEL_VALUE) );
+        if (!models[pos].values) {
+            fprintf(stderr, "Out of memory building GC model for read length %d\n", readLength);
+            exit(1);
+        }
+        models[pos].valuesLength = span;
+
+        for (int p=lowPercentage;p<=highPercentage;p++) {
+            models[pos].values[p-lowPercentage].percentage = p;
+            models[pos].values[p-lowPercentage].increment = 1.0/claimingCounts[p];
+        }
+    }
+
+    return (models);
+}
+
+
+/*
+ * Add one read to the GC histogram.
+ *
+ * l - read length, at most the maximum given to gcInit()
+ * c - number of G/C bases in the read, at most l
+ *
+ * Out-of-range arguments are reported and the read is skipped, because the
+ * model table has no entry for them.
+ */
+void gcProcessSequence(int l,int c) {
+
+    if(l < 0 || c < 0) { printf("Error: negative read length (%d) or GC-count (%d)\n", l, c); return; }
+    if(l > gMaxReadLength) { printf("Error: read length (%d) exceeds specified maximum length(%d)\n", l, gMaxReadLength); return; }
+    if(c > l) { printf("Error: GC-count (%d) exceeds actual read length(%d)\n", c, l) ; return; }
+
+    GC_MODEL_VALUE *values = cachedModels[l][c].values;
+
+    // spread this read over every percentage bucket its model lists
+    for(int i=0; i < cachedModels[l][c].valuesLength; i++) {
+        gcDistribution[values[i].percentage] += values[i].increment;
+    }
+
+}
+
+// Debug dump to stdout of the cached model for read length rl: one line per
+// GC count, listing each "percentage,increment" pair it contributes.
+void printModels(int rl) {
+    GC_MODEL_VALUES *model = cachedModels[rl];
+
+    printf("## Model values for read length=%d\n",rl);
+
+    for(int gcCount = 0; gcCount <= rl; gcCount++) {
+        const GC_MODEL_VALUES *entry = &model[gcCount];
+        printf("%d: ",gcCount);
+        for(int k = 0; k < entry->valuesLength; k++) {
+            const GC_MODEL_VALUE *val = &entry->values[k];
+            printf("%d,%.2f ",val->percentage, val->increment);
+        }
+        printf("\n");
+    }
+}
+
+// Write the accumulated GC histogram as a two-column table (header line, then
+// one row per percentage 0..100) to fp, or to stdout when fp is NULL.
+void gcPrintDistribution(FILE *fp) {
+    FILE *out = (fp == NULL) ? stdout : fp;
+
+    fprintf(out, "pct_GC\tCount\n");
+    int pct = 0;
+    while(pct <= 100) {
+        fprintf(out, "%d\t%.2f\n",pct,gcDistribution[pct]);
+        pct++;
+    }
+}
+
+/*
+ * Release every model built by gcInit().
+ *
+ * Does nothing when gcInit() was never called. Afterwards the module is back
+ * in its uninitialized state, so a second call is harmless.
+ */
+void gcClose() {
+    if(gMaxReadLength < 0)return; // never initialized
+
+    // gcInit() builds models for lengths 0..gMaxReadLength inclusive
+    for(int rl = 0; rl <= gMaxReadLength; rl++) {
+        GC_MODEL_VALUES * m = cachedModels[rl];
+        for(int i = 0; i <= rl; i++) {
+            free(m[i].values);
+        }
+        free(m);
+    }
+
+    free(cachedModels);
+    cachedModels = NULL;
+    gMaxReadLength = -1;
+}
+
+/*
+ * Prepare the GC module for reads of up to maxReadLength bases: clear the
+ * histogram and build the model for every read length 0..maxReadLength.
+ * Exits the process when the model table cannot be allocated.
+ */
+void gcInit(int maxReadLength) {
+    gMaxReadLength = maxReadLength;
+
+    memset(gcDistribution,0,sizeof(gcDistribution));
+    // Build all models for a given max readlength:
+    cachedModels = (GC_MODELS*)malloc((maxReadLength+1) * sizeof(GC_MODELS));
+    if (!cachedModels) {
+        fprintf(stderr, "Out of memory allocating GC models for max read length %d\n", maxReadLength);
+        exit(1);
+    }
+    // original code fills this in,caching, as necessary
+    // here, we just build all models at outset:
+    int pos;
+    for( pos = 0; pos <= maxReadLength; pos++) {
+        cachedModels[pos] = calcModels(pos);
+    }
+}
+
+#ifdef UNIT_TEST
+// Manual smoke test, compiled only when UNIT_TEST is defined: builds the
+// models for a small maximum read length and prints the one for length 4.
+int main() {
+
+    // int maxReadLength = 35;
+    int maxReadLength = 5;
+
+    gcInit(maxReadLength);
+
+    // ***
+    // simulate processing A sequence:
+    // int seqLength = 3; // this sequence's length
+    // int gcCount = 2; // total G's & C's -- count 'em
+    /*
+    for(int i = 0; i < 10000000; i++) {
+        gcProcessSequence(35,15);
+    }
+    for(int i = 0; i < 5000000; i++) {
+        gcProcessSequence(35,10);
+    }
+    */
+
+    printModels(4);
+    // exit(0);
+
+    // for(int pos=0; pos <= maxReadLength; pos++) {
+    //     gcProcessSequence(maxReadLength,pos);
+    // }
+
+    // gcProcessSequence(3,0);
+    // gcProcessSequence(4,2);
+    // gcProcessSequence(5,4);
+
+    // gcPrintDistribution(NULL);
+
+
+    gcClose();
+
+    return 0;
+}
+#endif
diff --git a/gcModel.h b/gcModel.h
new file mode 100644
index 0000000..e75fcf5
--- /dev/null
+++ b/gcModel.h
@@ -0,0 +1,7 @@
+/*
+$Id: gcModel.h 556 2013-03-01 15:32:36Z earonesty $
+
+Public interface of the GC-content model implemented in gcModel.c.
+NOTE(review): FILE is used below but <stdio.h> is not included here, so the
+including file has to include it first.
+*/
+extern void gcInit(int maxReadLength); /* zero the distribution and build the models for read lengths 0..maxReadLength */
+extern void gcProcessSequence(int l,int c); /* presumably l = sequence length, c = number of G/C bases in it -- confirm in gcModel.c */
+extern void gcPrintDistribution(FILE *fp); /* print the pct_GC/Count table for 0..100 to fp; NULL means stdout */
+void gcClose(); /* free the models allocated by gcInit(); returns immediately if gMaxReadLength < 0 */
diff --git a/gtf2bed b/gtf2bed
new file mode 100755
index 0000000..c624e81
--- /dev/null
+++ b/gtf2bed
@@ -0,0 +1,116 @@
+#!/usr/bin/perl
+
+# Copyright (c) 2011 Erik Aronesty (erik at q32.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
+
+use Data::Dumper;
+
+# Input: a GTF or GFF (version 2/3) file, optionally compressed (.gz or .zip).
+$in = shift @ARGV;
+die "usage: gtf2bed <file>[.gz|.zip]\n" unless defined $in && length $in;
+
+# Compressed input is read from a decompressor through a pipe.  The original
+# two-argument open passed "gunzip -c FILE" without a trailing "|", which asks
+# Perl to open a *file* of that name, so compressed input was never read.
+# The list form of the pipe open also keeps the file name away from the shell.
+if ($in =~ /\.gz$/) {
+    open(IN, '-|', 'gunzip', '-c', $in) or die "cannot run gunzip on $in: $!\n";
+} elsif ($in =~ /\.zip$/) {
+    open(IN, '-|', 'unzip', '-p', $in) or die "cannot run unzip on $in: $!\n";
+} else {
+    # two-argument open kept on purpose so that "-" still means STDIN
+    open(IN, $in) or die "cannot open $in: $!\n";
+}
+
+# Collect, per transcript id:
+#   %exons - all exon (and miRNA) records
+#   %trans - the first exon/miRNA record seen (source of chr and strand)
+#   %sc    - [start_codon start, stop_codon end]
+#   %cds / %cdx - CDS records / first CDS record (collected, not used below)
+while (<IN>) {
+    $gff = 2 if /^##gff-version 2/;
+    $gff = 3 if /^##gff-version 3/;
+    next if /^#/ && $gff;
+
+    s/\s+$//;
+    # 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr
+    my @f = split /\t/;
+    if ($gff) {
+        # most ver 2's stick gene names in the id field
+        ($id) = $f[8] =~ /\bID="([^"]+)"/;
+        # most ver 3's stick unquoted names in the name field
+        ($id) = $f[8] =~ /\bName=([^";]+)/ if !$id && $gff == 3;
+    } else {
+        ($id) = $f[8] =~ /transcript_id "([^"]+)"/;
+    }
+
+    # lines without an id or a chromosome (comments, blanks) are ignored
+    next unless $id && $f[0];
+
+    if ($f[2] eq 'exon') {
+        die "no position at exon on line $." if ! $f[3];
+        # gff3 puts :\d in exons sometimes
+        $id =~ s/:\d+$// if $gff == 3;
+        push @{$exons{$id}}, \@f;
+        # remember the first record seen for this transcript
+        $trans{$id} = \@f if !$trans{$id};
+    } elsif ($f[2] eq 'start_codon') {
+        # optional, output codon start/stop as "thick" region in bed
+        $sc{$id}->[0] = $f[3];
+    } elsif ($f[2] eq 'CDS') {
+        # optional, output codon start/stop as "thick" region in bed
+        push @{$cds{$id}}, \@f;
+        # remember the first CDS record seen for this transcript
+        $cdx{$id} = \@f if !$cdx{$id};
+    } elsif ($f[2] eq 'stop_codon') {
+        $sc{$id}->[1] = $f[4];
+    } elsif ($f[2] eq 'miRNA' ) {
+        $trans{$id} = \@f if !$trans{$id};
+        push @{$exons{$id}}, \@f;
+    }
+}
+
+# Emit one 12-column BED line per transcript, ordered by chromosome and then
+# by the start position of the transcript's first-seen record.
+for $id (
+    sort {
+        $trans{$a}->[0] cmp $trans{$b}->[0]
+            or
+        $trans{$a}->[3] <=> $trans{$b}->[3]
+    } (keys(%trans)) ) {
+    # Only chromosome (field 0) and strand (field 6) are needed from the
+    # record.  The original also declared $cds/$cde twice with "my" in this
+    # scope (the second declaration masking the first) and an unused $attr.
+    my ($chr, $dir) = @{$trans{$id}}[0, 6];
+    my ($cds, $cde);
+    ($cds, $cde) = @{$sc{$id}} if $sc{$id};
+
+    # exons sorted by start position
+    my @ex = sort {
+        $a->[3] <=> $b->[3]
+    } @{$exons{$id}};
+
+    my $beg = $ex[0][3];
+    my $end = $ex[-1][4];
+
+    if ($dir eq '-') {
+        # on the minus strand the stop codon is the low end of the thick
+        # region: swap, then widen each side by the two remaining codon bases
+        ($cds, $cde) = ($cde, $cds);
+        $cds -= 2 if $cds;
+        $cde += 2 if $cde;
+    }
+
+    # not specified, just use exons
+    $cds = $beg if !$cds;
+    $cde = $end if !$cde;
+
+    # adjust start for bed (0-based starts)
+    --$beg; --$cds;
+
+    my $exn = @ex; # exon count
+    my $exst = join ",", map {$_->[3]-$beg-1} @ex; # exon start, relative to $beg
+    my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex; # exon size
+
+    # added an extra comma to make it look exactly like ucsc's beds
+    print "$chr\t$beg\t$end\t$id\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,\n";
+}
+
+
+close IN;
diff --git a/kaln.c b/kaln.c
new file mode 100644
index 0000000..9c0bbaa
--- /dev/null
+++ b/kaln.c
@@ -0,0 +1,486 @@
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3 at gmail.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "kaln.h"
+
+#define FROM_M 0 /* DP state / path operation codes: M steps back in both sequences during backtrace */
+#define FROM_I 1 /* I steps back in seq2 only (--j) */
+#define FROM_D 2 /* D steps back in seq1 only (--i) */
+
+typedef struct { /* one step of a backtrace path, as filled in by ka_global_core() */
+ int i, j; /* DP cell: i indexes seq1, j indexes seq2 */
+ unsigned char ctype; /* FROM_M, FROM_I or FROM_D */
+} path_t;
+
+int aln_sm_blosum62[] = { /* 22 rows of 22 scores, stored row by row; symbol order is given on the next line */
+/* A R N D C Q E G H I L K M F P S T W Y V * X */
+ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
+ -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
+ -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
+ -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
+ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
+ -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
+ -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
+ -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
+ -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
+ -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
+ -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+ -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
+ -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
+ -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
+ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
+ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
+ -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
+ -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
+ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
+ -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
+ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
+};
+
+int aln_sm_blast[] = { /* 5x5 matrix: 1 on the diagonal of the first four symbols, -3 between them, -2 for anything involving the fifth symbol */
+ 1, -3, -3, -3, -2,
+ -3, 1, -3, -3, -2,
+ -3, -3, 1, -3, -2,
+ -3, -3, -3, 1, -2,
+ -2, -2, -2, -2, -2
+};
+
+int aln_sm_qual[] = { /* 5x5 matrix: 0 on the diagonal, -23 between the first four symbols, 0 for anything involving the fifth symbol */
+ 0, -23, -23, -23, 0,
+ -23, 0, -23, -23, 0,
+ -23, -23, 0, -23, 0,
+ -23, -23, -23, 0, 0,
+ 0, 0, 0, 0, 0
+};
+
+ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; /* NOTE(review): field order is defined in kaln.h (not shown); presumably gap_open, gap_ext, gap_end_open, gap_end_ext, matrix, row, band_width -- confirm */
+ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; /* same layout, using the 22x22 matrix above */
+
+ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 }; /* parameter set passed to ka_global_score() by the _MAIN test driver; field order per kaln.h -- confirm */
+
+/*
+ * Convert a backtrace path into a packed CIGAR array.
+ *
+ * path     : path_len steps stored end-to-start (path[path_len-1] is the
+ *            first operation of the alignment), as written by the backtrace
+ *            in ka_global_core().
+ * n_cigar  : receives the number of CIGAR entries (0 on failure or for an
+ *            empty path).
+ *
+ * Each entry is (run_length << 4) | ctype, with ctype one of FROM_M, FROM_I,
+ * FROM_D.  The array is heap-allocated and handed to the caller, who must
+ * free() it.  Returns 0 when the path is empty or allocation fails.
+ */
+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
+{
+    int i, n;
+    uint32_t *cigar;
+    unsigned char last_type;
+
+    /* "<= 0" rather than "== 0": a negative length would index before path */
+    if (path_len <= 0 || path == 0) {
+        *n_cigar = 0;
+        return 0;
+    }
+
+    /* first pass: count the runs of identical operations */
+    last_type = path->ctype;
+    for (i = n = 1; i < path_len; ++i) {
+        if (last_type != path[i].ctype) ++n;
+        last_type = path[i].ctype;
+    }
+    cigar = (uint32_t*)calloc(n, sizeof(uint32_t));
+    if (cigar == 0) { /* out of memory: report an empty CIGAR instead of crashing */
+        *n_cigar = 0;
+        return 0;
+    }
+    *n_cigar = n;
+
+    /* second pass: walk the path backwards so the CIGAR comes out in
+     * alignment order, extending the current run or opening a new one */
+    cigar[0] = 1u << 4 | path[path_len-1].ctype;
+    last_type = path[path_len-1].ctype;
+    for (i = path_len - 2, n = 0; i >= 0; --i) {
+        if (path[i].ctype == last_type) cigar[n] += 1u << 4;
+        else {
+            cigar[++n] = 1u << 4 | path[i].ctype;
+            last_type = path[i].ctype;
+        }
+    }
+
+    return cigar;
+}
+
+/***************************/
+/* START OF common_align.c */
+/***************************/
+
+#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; /* set all three state scores of cell s to MINOR_INF (defined outside this chunk, presumably in kaln.h); note the macro already ends in ';' */
+
+#define set_M(MM, cur, p, sc) /* MM = max(p->M, p->I, p->D) + sc; the winning state is stored in cur->Mt */ \
+{ /* ties: M wins over I and D; D wins over I */ \
+ if ((p)->M >= (p)->I) { \
+ if ((p)->M >= (p)->D) { \
+ (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
+ } else { \
+ (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
+ } \
+ } else { \
+ if ((p)->I > (p)->D) { \
+ (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
+ } else { \
+ (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
+ } \
+ } \
+}
+#define set_I(II, cur, p) /* II = better of opening a gap from p->M or extending p->I; source stored in cur->It */ \
+{ /* uses gap_open and gap_ext from the enclosing scope */ \
+ if ((p)->M - gap_open > (p)->I) { \
+ (cur)->It = FROM_M; \
+ (II) = (p)->M - gap_open - gap_ext; \
+ } else { \
+ (cur)->It = FROM_I; \
+ (II) = (p)->I - gap_ext; \
+ } \
+}
+#define set_end_I(II, cur, p) /* like set_I but with gap_end_open/gap_end_ext; used on the band/matrix edges */ \
+{ /* a negative gap_end_ext disables the end penalties and falls back to set_I */ \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->I) { \
+ (cur)->It = FROM_M; \
+ (II) = (p)->M - gap_end_open - gap_end_ext; \
+ } else { \
+ (cur)->It = FROM_I; \
+ (II) = (p)->I - gap_end_ext; \
+ } \
+ } else set_I(II, cur, p); \
+}
+#define set_D(DD, cur, p) /* DD = better of opening a gap from p->M or extending p->D; source stored in cur->Dt */ \
+{ /* uses gap_open and gap_ext from the enclosing scope */ \
+ if ((p)->M - gap_open > (p)->D) { \
+ (cur)->Dt = FROM_M; \
+ (DD) = (p)->M - gap_open - gap_ext; \
+ } else { \
+ (cur)->Dt = FROM_D; \
+ (DD) = (p)->D - gap_ext; \
+ } \
+}
+#define set_end_D(DD, cur, p) /* like set_D but with gap_end_open/gap_end_ext; used on the first and last rows */ \
+{ /* a negative gap_end_ext disables the end penalties and falls back to set_D */ \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->D) { \
+ (cur)->Dt = FROM_M; \
+ (DD) = (p)->M - gap_end_open - gap_end_ext; \
+ } else { \
+ (cur)->Dt = FROM_D; \
+ (DD) = (p)->D - gap_end_ext; \
+ } \
+ } else set_D(DD, cur, p); \
+}
+
+typedef struct { /* traceback cell: for each state, which state (FROM_M/FROM_I/FROM_D) its score came from */
+ uint8_t Mt:3, It:2, Dt:3; /* bit-fields written by set_M / set_I / set_D */
+} dpcell_t;
+
+typedef struct { /* score cell of one DP row */
+ int M, I, D; /* best score ending in the M, I and D state */
+} dpscore_t;
+
+/***************************
+ * banded global alignment *
+ ***************************
+ * Aligns seq1 (len1 symbols) against seq2 (len2 symbols) with affine gap
+ * penalties and a diagonal band, all taken from *ap.  Sequence elements are
+ * used directly as indices into ap->matrix (row stride ap->row).
+ *
+ * _score  : receives the M-state score of the final cell (last[len1].M).
+ * n_cigar : if non-NULL, a backtrace is done, *n_cigar receives the number of
+ *           CIGAR entries and the array built by ka_path2cigar32() is
+ *           returned (heap memory, not freed here).  If NULL, only the score
+ *           is computed and 0 is returned.
+ *
+ * Returns 0 immediately, without writing *_score, when len1 or len2 is 0.
+ * NOTE(review): none of the malloc() results below is checked.
+ */
+uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
+{
+ int i, j;
+ dpcell_t **dpcell, *q;
+ dpscore_t *curr, *last, *s;
+ int b1, b2, tmp_end;
+ int *mat, end, max = 0;
+ uint8_t type, ctype;
+ uint32_t *cigar = 0;
+
+ int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
+ int *score_matrix, N_MATRIX_ROW;
+
+ /* initialize some align-related parameters. just for compatibility */
+ gap_open = ap->gap_open; /* these locals are the names the set_* macros refer to */
+ gap_ext = ap->gap_ext;
+ gap_end_open = ap->gap_end_open;
+ gap_end_ext = ap->gap_end_ext;
+ b = ap->band_width;
+ score_matrix = ap->matrix;
+ N_MATRIX_ROW = ap->row;
+
+ if (n_cigar) *n_cigar = 0;
+ if (len1 == 0 || len2 == 0) return 0;
+
+ /* calculate b1 and b2 */
+ if (len1 > len2) { /* widen the band on the side of the longer sequence by the length difference */
+ b1 = len1 - len2 + b;
+ b2 = b;
+ } else {
+ b1 = b;
+ b2 = len2 - len1 + b;
+ }
+ if (b1 > len1) b1 = len1;
+ if (b2 > len2) b2 = len2;
+ --seq1; --seq2; /* switch to 1-based indexing: seq1[1..len1], seq2[1..len2] */
+
+ /* allocate memory */
+ end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); /* number of traceback cells kept per row */
+ dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
+ for (j = 0; j <= len2; ++j)
+ dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
+ for (j = b2 + 1; j <= len2; ++j)
+ dpcell[j] -= j - b2; /* bias the row pointer so rows past b2 can be indexed by the absolute column i; undone before free() */
+ curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+ last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+
+ /* set first row */
+ SET_INF(*curr); curr->M = 0;
+ for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
+ SET_INF(*s);
+ set_end_D(s->D, dpcell[0] + i, s - 1);
+ }
+ s = curr; curr = last; last = s; /* swap the two row buffers: the row just filled becomes "last" */
+
+ /* core dynamic programming, part 1 */
+ tmp_end = (b2 < len2)? b2 : len2 - 1;
+ for (j = 1; j <= tmp_end; ++j) { /* rows whose band still starts at column 0 */
+ q = dpcell[j]; s = curr; SET_INF(*s);
+ set_end_I(s->I, q, last);
+ end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; /* last column of the band in this row */
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW; /* score row for symbol seq2[j] */
+ ++s; ++q;
+ for (i = 1; i != end; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+ set_I(s->I, q, last + i);
+ set_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* last column of the row is handled separately */
+ set_D(s->D, q, s - 1);
+ if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+ set_end_I(s->I, q, last + i);
+ } else s->I = MINOR_INF;
+ s = curr; curr = last; last = s;
+ }
+ /* last row for part 1, use set_end_D() instead of set_D() */
+ if (j == len2 && b2 != len2 - 1) {
+ q = dpcell[j]; s = curr; SET_INF(*s);
+ set_end_I(s->I, q, last);
+ end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ ++s; ++q;
+ for (i = 1; i != end; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+ set_I(s->I, q, last + i);
+ set_end_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_end_D(s->D, q, s - 1);
+ if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+ set_end_I(s->I, q, last + i);
+ } else s->I = MINOR_INF;
+ s = curr; curr = last; last = s;
+ ++j;
+ }
+
+ /* core dynamic programming, part 2 */
+ for (; j <= len2 - b2 + 1; ++j) { /* rows where the band is cut on both sides */
+ SET_INF(curr[j - b2]); /* cell just left of the band start */
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ end = j + b1 - 1;
+ for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_I(s->I, q, last + i);
+ set_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_D(s->D, q, s - 1);
+ s->I = MINOR_INF; /* the cell above the band's last column is outside the band */
+ s = curr; curr = last; last = s;
+ }
+
+ /* core dynamic programming, part 3 */
+ for (; j < len2; ++j) { /* rows whose band runs up to column len1 */
+ SET_INF(curr[j - b2]);
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_I(s->I, q, last + i);
+ set_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+ set_end_I(s->I, q, last + i); /* last column: end-gap penalties for I */
+ set_D(s->D, q, s - 1);
+ s = curr; curr = last; last = s;
+ }
+ /* last row */
+ if (j == len2) {
+ SET_INF(curr[j - b2]);
+ mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+ for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+ set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+ set_I(s->I, q, last + i);
+ set_end_D(s->D, q, s - 1);
+ }
+ set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+ set_end_I(s->I, q, last + i);
+ set_end_D(s->D, q, s - 1);
+ s = curr; curr = last; last = s;
+ }
+
+ *_score = last[len1].M; /* after the final swap "last" holds the last computed row */
+ if (n_cigar) { /* backtrace */
+ path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
+ i = len1; j = len2;
+ q = dpcell[j] + i;
+ s = last + len1;
+ max = s->M; type = q->Mt; ctype = FROM_M; /* start from the best of the three states in the final cell */
+ if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
+ if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
+
+ p = path;
+ p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
+ ++p;
+ do {
+ switch (ctype) { /* step to the predecessor cell of the current operation */
+ case FROM_M: --i; --j; break;
+ case FROM_I: --j; break;
+ case FROM_D: --i; break;
+ }
+ q = dpcell[j] + i;
+ ctype = type; /* the state we came from becomes the current operation */
+ switch (type) {
+ case FROM_M: type = q->Mt; break;
+ case FROM_I: type = q->It; break;
+ case FROM_D: type = q->Dt; break;
+ }
+ p->ctype = ctype; p->i = i; p->j = j;
+ ++p;
+ } while (i || j);
+ cigar = ka_path2cigar32(path, p - path - 1, n_cigar); /* the final entry, written at cell (0,0), is not part of the path */
+ free(path);
+ }
+
+ /* free memory */
+ for (j = b2 + 1; j <= len2; ++j)
+ dpcell[j] += j - b2; /* restore the pointers returned by malloc() */
+ for (j = 0; j <= len2; ++j)
+ free(dpcell[j]);
+ free(dpcell);
+ free(curr); free(last);
+
+ return cigar;
+}
+
+typedef struct { /* score cell used by ka_global_score(): best score ending in each of the three states */
+ int M, I, D;
+} score_aux_t;
+
+#define MINUS_INF -0x40000000 /* "unreachable" score; NOTE(review): expansion is not parenthesized */
+
+/* Score-only banded global alignment of _seq1 (len1) against _seq2 (len2).
+ * matrix: len2 rows and len1 columns
+ *
+ * Only two rows of len1 + 2 cells are kept, so no traceback is available.
+ * The band half-width is |len1 - len2| + ap->band_width, capped at
+ * max(len1, len2) + 1.  Sequence elements index ap->matrix (row stride
+ * ap->row).  In the code below ap->edo/ede are used for D on the zero-th and
+ * last row, ap->eio/eie for I in the zero-th and last column, and
+ * ap->iio/iie/ido/ide everywhere else.
+ *
+ * Returns the best of the M, I and D scores of the final cell.
+ * NOTE(review): calloc() results are not checked.
+ */
+int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
+{
+
+#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { /* fill cell _p; _q0 is the diagonal predecessor in the previous row */ \
+ int t1, t2; \
+ score_aux_t *_q; \
+ _q = _q0; \
+ _p->M = _q->M >= _q->I? _q->M : _q->I; \
+ _p->M = _p->M >= _q->D? _p->M : _q->D; \
+ _p->M += (_sc); \
+ ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; /* _q now is the cell directly above */ \
+ _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; /* _q now is the cell to the left */ \
+ }
+
+ int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
+ const uint8_t *seq1, *seq2;
+ score_aux_t *curr, *last, *swap;
+ bw = abs(len1 - len2) + ap->band_width;
+ i = len1 > len2? len1 : len2;
+ if (bw > i + 1) bw = i + 1;
+ seq1 = _seq1 - 1; seq2 = _seq2 - 1; /* 1-based views of the two sequences */
+ curr = calloc(len1 + 2, sizeof(score_aux_t));
+ last = calloc(len1 + 2, sizeof(score_aux_t));
+ { // the zero-th row
+ int x, end = len1;
+ score_aux_t *p;
+ j = 0;
+ x = j + bw; end = len1 < x? len1 : x; // band end
+ p = curr;
+ p->M = 0; p->I = p->D = MINUS_INF;
+ for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
+ p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
+ p->M = p->I = p->D = MINUS_INF; /* sentinel cell just past the band */
+ swap = curr; curr = last; last = swap;
+ }
+ for (j = 1; j < len2; ++j) {
+ int x, beg = 0, end = len1, *scrow, col_end;
+ score_aux_t *p;
+ x = j - bw; beg = 0 > x? 0 : x; // band start
+ x = j + bw; end = len1 < x? len1 : x; // band end
+ if (beg == 0) { // from zero-th column
+ p = curr;
+ p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
+ ++beg; // then beg = 1
+ }
+ scrow = scmat + seq2[j] * scmat_size;
+ if (end == len1) col_end = 1, --end; /* the last column is scored separately with the end-gap I penalties */
+ else col_end = 0;
+ for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
+ if (col_end) {
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
+ ++p;
+ }
+ p->M = p->I = p->D = MINUS_INF; /* sentinel cell just past the band */
+// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+ swap = curr; curr = last; last = swap;
+ }
+ { // the last row
+ int x, beg = 0, *scrow;
+ score_aux_t *p;
+ j = len2;
+ x = j - bw; beg = 0 > x? 0 : x; // band start
+ if (beg == 0) { // from zero-th column
+ p = curr;
+ p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
+ ++beg; // then beg = 1
+ }
+ scrow = scmat + seq2[j] * scmat_size;
+ for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
+// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+ }
+ ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; /* no swap after the last row, so the result is in curr */
+ ret = ret >= curr[len1].D? ret : curr[len1].D;
+ free(curr); free(last);
+ return ret;
+}
+
+#ifdef _MAIN
+int main(int argc, char *argv[]) /* test driver (built only with -D_MAIN): scores two hard-coded 4-symbol sequences and prints the result */
+{
+// int len1 = 35, len2 = 35;
+// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
+// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
+ int len1 = 4, len2 = 4;
+ uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; /* sequences are given as symbol codes, not letters */
+ uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
+ int sc;
+// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
+ sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);
+ printf("%d\n", sc);
+ return 0;
+}
+#endif
diff --git a/knetfile.c b/knetfile.c
new file mode 100644
index 0000000..af09146
--- /dev/null
+++ b/knetfile.c
@@ -0,0 +1,632 @@
+/* The MIT License
+
+ Copyright (c) 2008 by Genome Research Ltd (GRL).
+ 2010 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Probably I will not do socket programming in the next few years and
+ therefore I decide to heavily annotate this file, for Linux and
+ Windows as well. -ac */
+
+#include <time.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#include "knetfile.h"
+
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* Wait up to five seconds for fd to become readable (is_read != 0) or
+ * writable (is_read == 0).  Returns the select() result unchanged: a positive
+ * count when fd is ready, 0 on time-out, and the error value otherwise.
+ * Errors are reported on stderr (on Windows the time-out is reported too). */
+static int socket_wait(int fd, int is_read)
+{
+    fd_set watched;
+    struct timeval timeout;
+    int status;
+
+    FD_ZERO(&watched);
+    FD_SET(fd, &watched);
+    timeout.tv_usec = 0;
+    timeout.tv_sec = 5; // 5 seconds time out
+
+    /* the same set is passed either as the read set or as the write set */
+    status = is_read ? select(fd+1, &watched, 0, 0, &timeout)
+                     : select(fd+1, 0, &watched, 0, &timeout);
+#ifdef _WIN32
+    if (status == SOCKET_ERROR)
+        fprintf(stderr, "select: %d\n", WSAGetLastError());
+    else if (status == 0)
+        fprintf(stderr, "select time-out\n");
+#else
+    if (status == -1) perror("select");
+#endif
+    return status;
+}
+
+#ifndef _WIN32
+/* This function does not work with Windows due to the lack of
+ * getaddrinfo() in winsock. It is adapted from an example in "Beej's
+ * Guide to Network Programming" (http://beej.us/guide/bgnet/).
+ *
+ * Resolves host/port (port may be a number or a service name such as "ftp")
+ * and opens a TCP connection to the first address returned.  Returns the
+ * connected descriptor, or -1 after printing a diagnostic.
+ *
+ * Fixes over the previous version:
+ *  - the socket is closed on every failure path (it used to leak);
+ *  - getaddrinfo() failures are reported with gai_strerror(), because
+ *    getaddrinfo() returns its own error code and does not set errno;
+ *  - freeaddrinfo() is only called on a list that was actually returned. */
+static int socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); if (fd != -1) close(fd); if (res) freeaddrinfo(res); return -1; } while (0)
+
+    int on = 1, fd = -1, gai;
+    struct linger lng = { 0, 0 };
+    struct addrinfo hints, *res = 0;
+    memset(&hints, 0, sizeof(struct addrinfo));
+    hints.ai_family = AF_UNSPEC;
+    hints.ai_socktype = SOCK_STREAM;
+    /* In Unix/Mac, getaddrinfo() is the most convenient way to get
+     * server information. */
+    if ((gai = getaddrinfo(host, port, &hints, &res)) != 0) {
+        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(gai));
+        return -1; /* nothing was allocated, so nothing to free */
+    }
+    if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+    /* The following two setsockopt() are used by ftplib
+     * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they are
+     * necessary. */
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+    if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+    if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+    freeaddrinfo(res);
+    return fd;
+}
+#else
+/* MinGW's printf has problems with "%lld", hence this hand-written
+ * conversion.
+ *
+ * Writes the decimal representation of x into buf (NUL-terminated) and
+ * returns buf.  buf must have room for at least 21 bytes (sign, up to 19
+ * digits, terminator).  Negative values are now written with a leading '-';
+ * the previous version produced non-digit characters for them because
+ * x % 10 is negative for negative x. */
+char *int64tostr(char *buf, int64_t x)
+{
+    int cnt;
+    int i = 0;
+    int neg = x < 0;
+    /* work on the magnitude as unsigned so that INT64_MIN is handled too */
+    uint64_t u = neg ? (uint64_t)0 - (uint64_t)x : (uint64_t)x;
+    do { /* digits come out least-significant first */
+        buf[i++] = (char)('0' + (int)(u % 10));
+        u /= 10;
+    } while (u);
+    if (neg) buf[i++] = '-';
+    buf[i] = 0;
+    for (cnt = i, i = 0; i < cnt/2; ++i) { /* reverse in place */
+        int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
+    }
+    return buf;
+}
+
+/* Parse a decimal integer from the start of buf.
+ *
+ * An optional '+' or '-' sign is accepted and conversion stops at the first
+ * non-digit character.  The previous version folded every character up to
+ * the terminating NUL into the value, so any trailing text produced a
+ * meaningless number.  For strings consisting only of digits the result is
+ * unchanged.  Overflow is not detected. */
+int64_t strtoint64(const char *buf)
+{
+    int64_t x = 0;
+    int neg = 0;
+    if (*buf == '-' || *buf == '+') {
+        neg = (*buf == '-');
+        ++buf;
+    }
+    for (; *buf >= '0' && *buf <= '9'; ++buf)
+        x = x * 10 + (*buf - '0');
+    return neg ? -x : x;
+}
+/* Windows only (this code sits in the _WIN32 branch): Winsock has to be
+ * started before any socket call.  Requests Winsock version 2.2 and returns
+ * whatever WSAStartup() returns. */
+int knet_win32_init()
+{
+ WSADATA wsaData;
+ return WSAStartup(MAKEWORD(2, 2), &wsaData);
+}
+void knet_win32_destroy() /* Windows only: counterpart of knet_win32_init(), calls WSACleanup() */
+{
+ WSACleanup();
+}
+/* A slightly modified version of the following function also works on
+ * Mac (and presumably Linux). However, this function is not stable on
+ * my Mac. It sometimes works fine but sometimes does not. Therefore for
+ * non-Windows OS, I do not use this one.
+ *
+ * Opens an IPv4 TCP connection to host:port (port must be numeric) and
+ * returns the connected socket, or -1 after printing the Winsock error.
+ *
+ * Fixes over the previous version: the socket is closed on every failure
+ * path (it used to leak), the sockaddr_in is zeroed before use, and the
+ * resolved address is copied with memcpy() instead of being read through an
+ * unsigned long pointer. */
+static SOCKET socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) \
+    do { \
+        fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
+        if (fd != INVALID_SOCKET) closesocket(fd); \
+        return -1; \
+    } while (0)
+
+    int on = 1;
+    SOCKET fd = INVALID_SOCKET;
+    struct linger lng = { 0, 0 };
+    struct sockaddr_in server;
+    struct hostent *hp = 0;
+    // open socket
+    if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
+    if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+    // get host info
+    if (isalpha((unsigned char)host[0])) hp = gethostbyname(host);
+    else {
+        struct in_addr addr;
+        addr.s_addr = inet_addr(host);
+        hp = gethostbyaddr((char*)&addr, 4, AF_INET);
+    }
+    if (hp == 0) __err_connect("gethost");
+    // connect
+    memset(&server, 0, sizeof(server));
+    memcpy(&server.sin_addr.s_addr, hp->h_addr, sizeof(server.sin_addr.s_addr));
+    server.sin_family= AF_INET;
+    server.sin_port = htons(atoi(port));
+    if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
+    // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
+    return fd;
+}
+#endif
+
+/* Read up to len bytes from fd into buf, calling netread() repeatedly
+ * because a single recv()/read() may deliver less than requested.
+ *
+ * Stops early when the socket does not become readable within the
+ * socket_wait() time-out, at end of file, or on a read error.  Returns the
+ * number of bytes actually stored in buf.
+ *
+ * Fixes over the previous version: a negative netread() result (error) is no
+ * longer added to the byte counters, and pointer arithmetic is done on a
+ * char pointer instead of void* (a compiler extension). */
+static off_t my_netread(int fd, void *buf, off_t len)
+{
+    char *dst = (char*)buf;
+    off_t rest = len, curr, l = 0;
+    while (rest > 0) {
+        if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
+        curr = netread(fd, dst + l, rest);
+        /* According to the glibc manual, section 13.2, a zero returned
+         * value indicates end-of-file (EOF), which should mean that
+         * read() will not return zero if EOF has not been met but data
+         * are not immediately available. A negative value is an error. */
+        if (curr <= 0) break;
+        l += curr; rest -= curr;
+    }
+    return l;
+}
+
+/*************************
+ * FTP specific routines *
+ *************************/
+
+/* Read one complete FTP reply from the control connection into
+ * ftp->response (grown on demand) and return its numeric reply code.
+ *
+ * Lines are read byte by byte; every line that is not the final line of the
+ * reply (three digits not followed by '-') is discarded.  The trailing CRLF
+ * is cut off the stored line.
+ *
+ * Returns 0 if the control socket does not become readable in time, -1 if no
+ * usable line was received or memory ran out, otherwise the reply code.
+ *
+ * Fixes over the previous version: the read loop now ends on a read error
+ * (netread() < 0 used to count as "true" and loop forever), a failed
+ * realloc() no longer loses the buffer and crashes, and isdigit() is given
+ * unsigned char values. */
+static int kftp_get_response(knetFile *ftp)
+{
+#ifndef _WIN32
+    unsigned char c;
+#else
+    char c;
+#endif
+    int n = 0;
+    if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
+    while (netread(ftp->ctrl_fd, &c, 1) > 0) { // FIXME: this is *VERY BAD* for unbuffered I/O
+        //fputc(c, stderr);
+        if (n >= ftp->max_response) {
+            int grown = ftp->max_response? ftp->max_response<<1 : 256;
+            void *tmp = realloc(ftp->response, grown);
+            if (tmp == 0) return -1; /* old buffer stays valid and owned by ftp */
+            ftp->response = tmp;
+            ftp->max_response = grown;
+        }
+        ftp->response[n++] = c;
+        if (c == '\n') {
+            if (n >= 4 && isdigit((unsigned char)ftp->response[0]) && isdigit((unsigned char)ftp->response[1])
+                && isdigit((unsigned char)ftp->response[2]) && ftp->response[3] != '-') break;
+            n = 0; /* intermediate line of a multi-line reply: start over */
+            continue;
+        }
+    }
+    if (n < 2) return -1;
+    ftp->response[n-2] = 0; /* drop the trailing "\r\n" */
+    return strtol(ftp->response, 0, 0);
+}
+
+/* Send one command line on the FTP control connection.
+ *
+ * cmd    : complete command including the trailing "\r\n".
+ * is_get : non-zero to wait for and return the server's reply code (see
+ *          kftp_get_response()); zero to return 0 right after sending.
+ *
+ * Returns -1 if the socket is not writable in time or, new in this version,
+ * if the command could not be written completely; previously a failed write
+ * was ignored and the caller then waited for a reply that could never come. */
+static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
+{
+    size_t len = strlen(cmd);
+    if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+    if ((long)netwrite(ftp->ctrl_fd, cmd, len) != (long)len) return -1;
+    return is_get? kftp_get_response(ftp) : 0;
+}
+
+/* Ask the server for a passive-mode data port (PASV) and store the address
+ * it announces in ftp->pasv_ip[0..3] and ftp->pasv_port.
+ *
+ * The reply carries "(h1,h2,h3,h4,p1,p2)"; the port is p1*256 + p2.
+ * Returns 0 on success and -1 if no reply was received or it cannot be
+ * parsed.
+ *
+ * Fixes over the previous version: a missing reply buffer is detected, and
+ * the six numbers are only used when sscanf() really converted all of them
+ * and each lies in 0..255 (the array used to be read uninitialised after a
+ * malformed reply). */
+static int kftp_pasv_prep(knetFile *ftp)
+{
+    char *p;
+    int v[6], k;
+    if (kftp_send_cmd(ftp, "PASV\r\n", 1) <= 0 || ftp->response == 0) return -1;
+    for (p = ftp->response; *p && *p != '('; ++p);
+    if (*p != '(') return -1;
+    ++p;
+    if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) return -1;
+    for (k = 0; k < 6; ++k)
+        if (v[k] < 0 || v[k] > 255) return -1;
+    memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
+    ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
+    return 0;
+}
+
+
+/* Open the FTP data connection to the address stored by kftp_pasv_prep()
+ * and keep the descriptor in ftp->fd.  Returns 0 on success, -1 if no
+ * passive address is available or the connection fails.
+ *
+ * The port buffer is sized for any int value: the previous 10-byte buffer
+ * could overflow when pasv_port was outside the usual 0..65535 range. */
+static int kftp_pasv_connect(knetFile *ftp)
+{
+    char host[80], port[16];
+    if (ftp->pasv_port == 0) {
+        fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
+        return -1;
+    }
+    sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
+    sprintf(port, "%d", ftp->pasv_port);
+    ftp->fd = socket_connect(host, port);
+    if (ftp->fd == -1) return -1;
+    return 0;
+}
+
+/* Open the FTP control connection to ftp->host:ftp->port, consume the
+ * server greeting and log in anonymously in binary (TYPE I) mode.
+ * Returns -1 if the connection cannot be opened, otherwise 0; the replies to
+ * the login commands are read but not checked. */
+int kftp_connect(knetFile *ftp)
+{
+    static const char *const login[] = {
+        "USER anonymous\r\n",
+        "PASS kftp@\r\n",
+        "TYPE I\r\n"
+    };
+    size_t k;
+
+    if ((ftp->ctrl_fd = socket_connect(ftp->host, ftp->port)) == -1) return -1;
+    kftp_get_response(ftp); /* server greeting */
+    for (k = 0; k < sizeof(login) / sizeof(login[0]); ++k)
+        kftp_send_cmd(ftp, login[k], 1);
+    return 0;
+}
+
+/* Drop both FTP connections (control and data) and log in again.
+ * Returns the result of kftp_connect().
+ *
+ * The data descriptor is now closed only when it is open, matching the
+ * handling of ctrl_fd; previously netclose() was called on -1 whenever no
+ * data connection existed. */
+int kftp_reconnect(knetFile *ftp)
+{
+    if (ftp->ctrl_fd != -1) {
+        netclose(ftp->ctrl_fd);
+        ftp->ctrl_fd = -1;
+    }
+    if (ftp->fd != -1) {
+        netclose(ftp->fd);
+        ftp->fd = -1;
+    }
+    return kftp_connect(ftp);
+}
+
+/* Build a knetFile for an "ftp://host/path" URL without connecting.
+ *
+ * Initializes ->type, ->host, ->port ("21"), ->retr ("RETR <path>\r\n"),
+ * ->size_cmd ("SIZE <path>\r\n"), ->seek_offset and both descriptors.
+ * A 'c' in mode sets ->no_reconnect.
+ * Returns 0 if fn is not an ftp:// URL with a path, or if memory runs out.
+ *
+ * Fixes over the previous version:
+ *  - ->ctrl_fd is set to -1 like ->fd (as khttp_parse_url() does); it used
+ *    to stay 0 from calloc(), so kftp_reconnect() on a not-yet-connected
+ *    handle would have closed descriptor 0;
+ *  - allocation failures are detected instead of dereferencing NULL. */
+knetFile *kftp_parse_url(const char *fn, const char *mode)
+{
+    knetFile *fp;
+    const char *p;
+    int l;
+    if (strstr(fn, "ftp://") != fn) return 0;
+    for (p = fn + 6; *p && *p != '/'; ++p);
+    if (*p != '/') return 0;
+    l = p - fn - 6; /* length of the host part */
+    fp = calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0;
+    fp->type = KNF_TYPE_FTP;
+    fp->fd = fp->ctrl_fd = -1;
+    /* the Linux/Mac version of socket_connect() also recognizes a port
+     * like "ftp", but the Windows version does not. */
+    fp->port = strdup("21");
+    fp->host = calloc(l + 1, 1);
+    /* "RETR " / "SIZE " (5) + path + "\r\n" (2) + NUL (1) */
+    fp->retr = calloc(strlen(p) + 8, 1);
+    fp->size_cmd = calloc(strlen(p) + 8, 1);
+    if (fp->port == 0 || fp->host == 0 || fp->retr == 0 || fp->size_cmd == 0) {
+        free(fp->port); free(fp->host); free(fp->retr); free(fp->size_cmd);
+        free(fp);
+        return 0;
+    }
+    if (strchr(mode, 'c')) fp->no_reconnect = 1;
+    strncpy(fp->host, fn + 6, l);
+    sprintf(fp->retr, "RETR %s\r\n", p);
+    sprintf(fp->size_cmd, "SIZE %s\r\n", p);
+    fp->seek_offset = 0;
+    return fp;
+}
+/* (Re)open the FTP data connection so that ->fd delivers the file starting
+ * at byte ->offset.
+ *
+ * Sequence: PASV, SIZE (stored in ->file_size), REST <offset>, RETR, then
+ * connect to the passive port and expect reply 150.
+ * Returns 0 and sets ->is_ready on success; returns -1 with ->fd == -1 on
+ * failure.
+ *
+ * Fixes over the previous version:
+ *  - ->fd is reset to -1 after closing the old data connection, so an early
+ *    error return no longer leaves a stale descriptor behind;
+ *  - a failed PASV or a missing reply buffer ends the attempt instead of
+ *    continuing with unusable data;
+ *  - the Windows reply scan cannot run past the end of the reply;
+ *  - a failed data connection is reported as an error even if the server
+ *    answered 150, and netclose() is not called on -1. */
+int kftp_connect_file(knetFile *fp)
+{
+    int ret;
+    long long file_size;
+    if (fp->fd != -1) {
+        netclose(fp->fd);
+        fp->fd = -1;
+        if (fp->no_reconnect) kftp_get_response(fp);
+    }
+    if (kftp_pasv_prep(fp) != 0) return -1;
+    kftp_send_cmd(fp, fp->size_cmd, 1);
+    if (fp->response == 0) return -1;
+#ifndef _WIN32
+    if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
+    {
+        fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+        return -1;
+    }
+#else
+    {
+        /* skip the reply code, then everything up to the first digit */
+        const char *p = fp->response;
+        while (*p && *p != ' ') ++p;
+        while (*p && (*p < '0' || *p > '9')) ++p;
+        if (*p == 0) {
+            fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+            return -1;
+        }
+        file_size = strtoint64(p);
+    }
+#endif
+    fp->file_size = file_size;
+    if (fp->offset>=0) {
+        char tmp[32];
+#ifndef _WIN32
+        sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
+#else
+        strcpy(tmp, "REST ");
+        int64tostr(tmp + 5, fp->offset);
+        strcat(tmp, "\r\n");
+#endif
+        kftp_send_cmd(fp, tmp, 1);
+    }
+    kftp_send_cmd(fp, fp->retr, 0);
+    kftp_pasv_connect(fp);
+    /* the reply is read even if the data connection failed, so that it does
+     * not stay queued on the control connection */
+    ret = kftp_get_response(fp);
+    if (ret != 150 || fp->fd == -1) {
+        fprintf(stderr, "[kftp_connect_file] %s\n", fp->response? fp->response : "no response");
+        if (fp->fd != -1) netclose(fp->fd);
+        fp->fd = -1;
+        return -1;
+    }
+    fp->is_ready = 1;
+    return 0;
+}
+
+
+/**************************
+ * HTTP specific routines *
+ **************************/
+
+/* Build a knetFile for an "http://host[:port][/path]" URL without
+ * connecting.
+ *
+ * Without a proxy, ->host/->port come from the URL (port defaults to "80")
+ * and ->path is the path part ("/" if absent).  With the http_proxy
+ * environment variable set, ->host/->port name the proxy and ->path is the
+ * complete URL.  ->http_host always holds the URL's host name for the Host:
+ * header.  mode is not used.
+ * Returns 0 if fn is not an http:// URL or memory runs out.
+ *
+ * Fixes over the previous version:
+ *  - a proxy value with a trailing path such as "http://proxy:3128/" no
+ *    longer yields the unusable port string "3128/";
+ *  - an empty http_proxy is treated as "no proxy";
+ *  - failed allocations of the handle and host strings are detected. */
+knetFile *khttp_parse_url(const char *fn, const char *mode)
+{
+    knetFile *fp;
+    char *p, *proxy, *q;
+    int l;
+    if (strstr(fn, "http://") != fn) return 0;
+    // set ->http_host
+    for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+    l = p - fn - 7;
+    fp = calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0;
+    fp->http_host = calloc(l + 1, 1);
+    if (fp->http_host == 0) { free(fp); return 0; }
+    strncpy(fp->http_host, fn + 7, l);
+    fp->http_host[l] = 0;
+    for (q = fp->http_host; *q && *q != ':'; ++q);
+    if (*q == ':') *q++ = 0; /* q now points at the port digits, or at the NUL */
+    // get http_proxy
+    proxy = getenv("http_proxy");
+    // set ->host, ->port and ->path
+    if (proxy == 0 || *proxy == 0) {
+        fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
+        fp->port = strdup(*q? q : "80");
+        fp->path = strdup(*p? p : "/");
+    } else {
+        fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+        if (fp->host == 0) { free(fp->http_host); free(fp); return 0; }
+        /* keep only "host[:port]": cut at the first '/' */
+        for (q = fp->host; *q && *q != '/'; ++q);
+        *q = 0;
+        for (q = fp->host; *q && *q != ':'; ++q);
+        if (*q == ':') *q++ = 0;
+        fp->port = strdup(*q? q : "80");
+        fp->path = strdup(fn);
+    }
+    fp->type = KNF_TYPE_HTTP;
+    fp->ctrl_fd = fp->fd = -1;
+    fp->seek_offset = 0;
+    return fp;
+}
+
+/* (Re)open the HTTP connection so that ->fd delivers the file starting at
+ * byte ->offset.
+ *
+ * Sends "GET <path> HTTP/1.0" with a Range header, reads the response header
+ * byte by byte and accepts status 206, or 200 (in which case the first
+ * ->offset bytes of the body are read and discarded).
+ * Returns 0 and sets ->is_ready on success; returns -1 with ->fd == -1 on
+ * failure.
+ *
+ * Fixes over the previous version:
+ *  - a failed socket_connect() is detected before the socket is used;
+ *  - the 64KB work buffer is freed on the "header too short" path (leak);
+ *  - request and header can no longer overflow the work buffer;
+ *  - the header loop ends on a read error, and the skip loop ends when the
+ *    stream stops delivering data (both could spin forever). */
+int khttp_connect_file(knetFile *fp)
+{
+    const int buf_size = 0x10000; // FIXME: I am lazy... But in principle, 64KB should be large enough.
+    int ret, l = 0;
+    char *buf, *p;
+    if (fp->fd != -1) netclose(fp->fd);
+    fp->fd = socket_connect(fp->host, fp->port);
+    if (fp->fd == -1) return -1;
+    buf = calloc(buf_size, 1);
+    if (buf == 0) goto fail;
+    l = snprintf(buf, buf_size, "GET %s HTTP/1.0\r\nHost: %s\r\nRange: bytes=%lld-\r\n\r\n",
+                 fp->path, fp->http_host, (long long)fp->offset);
+    if (l < 0 || l >= buf_size) goto fail; /* request does not fit */
+    netwrite(fp->fd, buf, l);
+    l = 0;
+    while (l < buf_size - 1 && netread(fp->fd, buf + l, 1) > 0) { // read HTTP header; FIXME: bad efficiency
+        if (buf[l] == '\n' && l >= 3)
+            if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+        ++l;
+    }
+    buf[l] = 0;
+    if (l < 14) goto fail; // prematured header
+    ret = strtol(buf + 8, &p, 0); // HTTP return code
+    if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
+        off_t rest = fp->offset;
+        while (rest > 0) {
+            off_t want = rest < buf_size? rest : buf_size;
+            off_t got = my_netread(fp->fd, buf, want);
+            if (got <= 0) break; /* EOF, time-out or error */
+            rest -= got;
+        }
+        if (rest > 0) goto fail; /* file is shorter than the requested offset */
+    } else if (ret != 206 && ret != 200) {
+        fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
+        goto fail;
+    }
+    free(buf);
+    fp->is_ready = 1;
+    return 0;
+
+fail:
+    free(buf); /* free(NULL) is a no-op */
+    netclose(fp->fd);
+    fp->fd = -1;
+    return -1;
+}
+
+/********************
+ * Generic routines *
+ ********************/
+
+/* Open fn for reading and return a knetFile handle.
+ *
+ * fn:   "ftp://..." or "http://..." URL, or otherwise a local path.
+ * mode: must start with 'r'; no other mode is supported.
+ * Returns the handle, or 0 if the mode is unsupported, the URL cannot be
+ * parsed, the connection or open() fails, or memory runs out. */
+knetFile *knet_open(const char *fn, const char *mode)
+{
+    knetFile *fp = 0;
+    if (mode[0] != 'r') {
+        fprintf(stderr, "[knet_open] only mode \"r\" is supported.\n");
+        return 0;
+    }
+    if (strstr(fn, "ftp://") == fn) {
+        fp = kftp_parse_url(fn, mode);
+        if (fp == 0) return 0;
+        if (kftp_connect(fp) == -1) {
+            knet_close(fp);
+            return 0;
+        }
+        kftp_connect_file(fp); // failure is detected below through fp->fd == -1
+    } else if (strstr(fn, "http://") == fn) {
+        fp = khttp_parse_url(fn, mode);
+        if (fp == 0) return 0;
+        khttp_connect_file(fp); // failure is detected below through fp->fd == -1
+    } else { // local file
+#ifdef _WIN32
+        /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
+         * be undefined on some systems, although it is defined on my
+         * Mac and the Linux I have tested on. */
+        int fd = open(fn, O_RDONLY | O_BINARY);
+#else
+        int fd = open(fn, O_RDONLY);
+#endif
+        if (fd == -1) {
+            perror("open");
+            return 0;
+        }
+        fp = (knetFile*)calloc(1, sizeof(knetFile));
+        if (fp == 0) { // out of memory: do not leak the descriptor
+            close(fd);
+            return 0;
+        }
+        fp->type = KNF_TYPE_LOCAL;
+        fp->fd = fd;
+        fp->ctrl_fd = -1;
+    }
+    if (fp && fp->fd == -1) {
+        knet_close(fp);
+        return 0;
+    }
+    return fp;
+}
+
+/* Wrap an already open descriptor in a knetFile of type KNF_TYPE_LOCAL.
+ *
+ * fd:   the descriptor to read from.
+ * mode: accepted but not consulted.
+ * Returns the handle, or 0 if memory cannot be allocated. */
+knetFile *knet_dopen(int fd, const char *mode)
+{
+    knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
+    if (fp == 0) return 0;
+    fp->type = KNF_TYPE_LOCAL;
+    fp->fd = fd;
+    // knet_close() closes ctrl_fd unless it is -1; calloc left it at 0, which is a real descriptor
+    fp->ctrl_fd = -1;
+    return fp;
+}
+
+/* Read up to len bytes from fp into buf and advance fp->offset.
+ *
+ * For FTP/HTTP handles whose is_ready flag is 0 (set by knet_seek()), the
+ * data connection is re-established first.
+ * Returns the number of bytes read (0 when fp has no open descriptor or the
+ * stream is at its end), or -1 if read() fails on a local file. */
+off_t knet_read(knetFile *fp, void *buf, off_t len)
+{
+    off_t l = 0;
+    if (fp->fd == -1) return 0;
+    if (fp->type == KNF_TYPE_FTP) {
+        if (fp->is_ready == 0) {
+            if (!fp->no_reconnect) kftp_reconnect(fp);
+            kftp_connect_file(fp);
+        }
+    } else if (fp->type == KNF_TYPE_HTTP) {
+        if (fp->is_ready == 0)
+            khttp_connect_file(fp);
+    }
+    // a failed reconnect leaves fd at -1; report it the same way as the entry check does
+    if (fp->fd == -1) return 0;
+    if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
+        off_t rest = len, curr;
+        while (rest) {
+            do {
+                // cast: pointer arithmetic on void* is a compiler extension
+                curr = read(fp->fd, (char*)buf + l, rest);
+            } while (curr < 0 && EINTR == errno); // retry reads interrupted by a signal
+            if (curr < 0) return -1;
+            if (curr == 0) break; // end of file
+            l += curr; rest -= curr;
+        }
+    } else l = my_netread(fp->fd, buf, len);
+    fp->offset += l;
+    return l;
+}
+
+/* Reposition fp.
+ *
+ * Local files are moved with lseek(). For FTP and HTTP only fp->offset is
+ * updated and is_ready is cleared, so that the next knet_read() reconnects
+ * at the new position. SEEK_END is rejected for HTTP.
+ * Returns 0 on success and -1 on failure. */
+off_t knet_seek(knetFile *fp, int64_t off, int whence)
+{
+    // already at the requested absolute position
+    if (whence == SEEK_SET && off == fp->offset) return 0;
+
+    if (fp->type == KNF_TYPE_LOCAL) {
+        /* Be aware that lseek() returns the offset after seeking,
+         * while fseek() returns zero on success. */
+        off_t landed = lseek(fp->fd, off, whence);
+        // Be silent on failure: it is OK for knet_seek to fail when the file is streamed
+        if (landed == -1) return -1;
+        fp->offset = landed;
+        return 0;
+    }
+
+    if (fp->type == KNF_TYPE_FTP) {
+        switch (whence) {
+        case SEEK_CUR: fp->offset += off; break;
+        case SEEK_SET: fp->offset = off; break;
+        case SEEK_END: fp->offset = fp->file_size+off; break;
+        default: break; // any other whence leaves the offset alone
+        }
+        fp->is_ready = 0;
+        return 0;
+    }
+
+    if (fp->type == KNF_TYPE_HTTP) {
+        switch (whence) {
+        case SEEK_END: // FIXME: can we allow SEEK_END in future?
+            fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
+            errno = ESPIPE;
+            return -1;
+        case SEEK_CUR: fp->offset += off; break;
+        case SEEK_SET: fp->offset = off; break;
+        default: break;
+        }
+        fp->is_ready = 0;
+        return 0;
+    }
+
+    // unknown handle type
+    errno = EINVAL;
+    fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
+    return -1;
+}
+
+/* Close the descriptors held by fp and release all memory it owns.
+ * A null fp is accepted. Always returns 0. */
+int knet_close(knetFile *fp)
+{
+    if (!fp) return 0;
+    // control connection (FTP specific)
+    if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd);
+    // data descriptor
+    if (fp->fd != -1) {
+        /* On Linux/Mac, netclose() is an alias of close(), but on
+         * Windows, it is an alias of closesocket(). */
+        if (fp->type != KNF_TYPE_LOCAL) netclose(fp->fd);
+        else close(fp->fd);
+    }
+    // strings shared by all remote types
+    free(fp->host);
+    free(fp->port);
+    // FTP specific
+    free(fp->response);
+    free(fp->retr);
+    // HTTP specific
+    free(fp->path);
+    free(fp->http_host);
+    free(fp);
+    return 0;
+}
+
+#ifdef KNETFILE_MAIN
+/* Manual smoke test for the knet_* routines; the scenario is selected by the
+ * hard-coded `type`. Returns 0 on success and 1 if the buffer cannot be
+ * allocated or the file cannot be opened. */
+int main(void)
+{
+    enum { BUF_SIZE = 0x100000 };
+    char *buf;
+    const char *url = 0;
+    knetFile *fp;
+    int type = 4, l = 0;
+#ifdef _WIN32
+    knet_win32_init();
+#endif
+    switch (type) {
+    case 0: url = "knetfile.c"; break;
+    case 1: // NCBI FTP, large file
+        url = "ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam";
+        break;
+    case 2: url = "ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml"; break;
+    case 3: url = "http://www.sanger.ac.uk/Users/lh3/index.shtml"; break;
+    case 4: url = "http://www.sanger.ac.uk/Users/lh3/ex1.bam"; break;
+    }
+    if (url == 0) return 1;
+    buf = calloc(BUF_SIZE, 1);
+    if (buf == 0) return 1;
+    fp = knet_open(url, "r");
+    if (fp == 0) { // do not seek/read through a null handle
+        free(buf);
+        return 1;
+    }
+    if (type == 1) {
+        knet_seek(fp, 2500000000ll, SEEK_SET);
+        l = knet_read(fp, buf, 255);
+    } else if (type == 4) {
+        knet_read(fp, buf, 10000);
+        knet_seek(fp, 20000, SEEK_SET);
+        knet_seek(fp, 10000, SEEK_SET);
+        // never ask for more than what is left of the buffer after the first 10000 bytes
+        l = knet_read(fp, buf+10000, BUF_SIZE - 10000) + 10000;
+    } else knet_seek(fp, 1000, SEEK_SET);
+    if (type != 4 && type != 1) {
+        knet_read(fp, buf, 255);
+        buf[255] = 0;
+        printf("%s\n", buf);
+    } else if (l > 0) write(fileno(stdout), buf, l);
+    knet_close(fp);
+    free(buf);
+    return 0;
+}
+#endif
diff --git a/kprobaln.c b/kprobaln.c
new file mode 100644
index 0000000..04e526a
--- /dev/null
+++ b/kprobaln.c
@@ -0,0 +1,280 @@
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008-2010, by Heng Li <lh3lh3 at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "kprobaln.h"
+
+/*****************************************
+ * Probabilistic banded glocal alignment *
+ *****************************************/
+
+#define EI .25
+#define EM .33333333333
+
+static float g_qual2prob[256];
+
+#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
+
+kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
+kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
+
+/*
+ The topology of the profile HMM:
+
+ /\ /\ /\ /\
+ I[1] I[k-1] I[k] I[L]
+ ^ \ \ ^ \ ^ \ \ ^
+ | \ \ | \ | \ \ |
+ M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
+ \ \/ \/ \/ /
+ \ /\ /\ /\ /
+ -> D[k-1] -> D[k] ->
+
+ M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
+
+ On input, _ref is the reference sequence and _query is the query
+ sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
+ ambiguous residue. iqual is the base quality. c sets the gap open
+ probability, gap extension probability and band width.
+
+ On output, state and q are arrays of length l_query. The higher 30
+ bits give the reference position the query base is matched to and the
+ lower two bits can be 0 (an alignment match) or 1 (an
+ insertion). q[i] gives the phred scaled posterior probability of
+ state[i] being wrong.
+ */
+/* Forward(-backward) pass of the banded profile HMM described above.
+ *
+ * _ref/l_ref, _query/l_query: the two sequences and their lengths.
+ * iqual: per-base quality of the query; when null, quality 30 is used for every base.
+ * c:     parameters; c->d, c->e and c->bw are read here.
+ * state, q: output arrays; the backward and MAP steps run only when both are non-null.
+ *
+ * Returns Pr, the rounded value of -4.343*log() of the product of the scaling
+ * factors times l_ref*l_query (4.343 is approximately 10/ln(10)), or 0 when
+ * either length is not positive. */
+int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
+ const kpa_par_t *c, int *state, uint8_t *q)
+{
+ double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
+ float *qual, *_qual;
+ const uint8_t *ref, *query;
+ int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
+
+ if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents segfault
+
+ /*** initialization ***/
+ is_backward = state && q? 1 : 0;
+ ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
+ // band width: at most c->bw, but never smaller than the length difference
+ bw = l_ref > l_query? l_ref : l_query;
+ if (bw > c->bw) bw = c->bw;
+ if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
+ bw2 = bw * 2 + 1;
+ // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
+ f = calloc(l_query+1, sizeof(void*));
+ if (is_backward) b = calloc(l_query+1, sizeof(void*));
+ for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0
+ f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
+ if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
+ }
+ s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
+ // initialize qual
+ _qual = calloc(l_query, sizeof(float));
+ // the quality-to-probability table is filled on first use (g_qual2prob[0] becomes 1)
+ if (g_qual2prob[0] == 0)
+ for (i = 0; i < 256; ++i)
+ g_qual2prob[i] = pow(10, -i/10.);
+ for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
+ qual = _qual - 1;
+ // initialize transition probability
+ // m[] is a 3x3 matrix in row-major order; rows and columns are ordered M, I, D
+ sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
+ m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
+ m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
+ m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
+ bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
+ /*** forward ***/
+ // f[0]
+ set_u(k, bw, 0, 0);
+ f[0][k] = s[0] = 1.;
+ { // f[1]
+ double *fi = f[1], sum;
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u;
+ // emission: 1 if either base is ambiguous (>3), 1-qual on a match, qual*EM on a mismatch
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+ set_u(u, bw, 1, k);
+ fi[u+0] = e * bM; fi[u+1] = EI * bI;
+ sum += fi[u] + fi[u+1];
+ }
+ // rescale
+ s[1] = sum;
+ set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
+ for (k = _beg; k <= _end; ++k) fi[k] /= sum;
+ }
+ // f[2..l_query]
+ for (i = 2; i <= l_query; ++i) {
+ double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
+ int beg = 1, end = l_ref, x, _beg, _end;
+ uint8_t qyi = query[i];
+ x = i - bw; beg = beg > x? beg : x; // band start
+ x = i + bw; end = end < x? end : x; // band end
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u, v11, v01, v10;
+ double e;
+ e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
+ // u: cell (i,k); v11: (i-1,k-1); v10: (i-1,k); v01: (i,k-1)
+ set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
+ fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
+ fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
+ fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
+ sum += fi[u] + fi[u+1] + fi[u+2];
+// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
+ }
+ // rescale
+ s[i] = sum;
+ set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+ for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
+ }
+ { // f[l_query+1]
+ double sum;
+ for (k = 1, sum = 0.; k <= l_ref; ++k) {
+ int u;
+ set_u(u, bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue; // skip cells whose index falls outside this range
+ sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
+ }
+ s[l_query+1] = sum; // the last scaling factor
+ }
+ { // compute likelihood
+ double p = 1., Pr1 = 0.;
+ for (i = 0; i <= l_query + 1; ++i) {
+ p *= s[i];
+ // move the running product into the log-scale accumulator before it gets too small
+ if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
+ }
+ Pr1 += -4.343 * log(p * l_ref * l_query);
+ Pr = (int)(Pr1 + .499);
+ if (!is_backward) { // skip backward and MAP
+ for (i = 0; i <= l_query; ++i) free(f[i]);
+ free(f); free(s); free(_qual);
+ return Pr;
+ }
+ }
+ /*** backward ***/
+ // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
+ for (k = 1; k <= l_ref; ++k) {
+ int u;
+ double *bi = b[l_query];
+ set_u(u, bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
+ }
+ // b[l_query-1..1]
+ for (i = l_query - 1; i >= 1; --i) {
+ int beg = 1, end = l_ref, x, _beg, _end;
+ // y is 0 for i == 1 and 1 otherwise; it zeroes the D state of the first row below
+ double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
+ uint8_t qyi1 = query[i+1];
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = end; k >= beg; --k) {
+ int u, v11, v01, v10;
+ double e;
+ set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
+ e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
+ bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been folded into e.
+ bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
+ bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
+// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
+ }
+ // rescale
+ set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+ for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
+ }
+ { // b[0]
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
+ double sum = 0.;
+ for (k = end; k >= beg; --k) {
+ int u;
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+ set_u(u, bw, 1, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
+ }
+ set_u(k, bw, 0, 0);
+ pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
+ }
+ // NOTE(review): is_diff is computed here but not read again in this function
+ is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
+ /*** MAP ***/
+ for (i = 1; i <= l_query; ++i) {
+ double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
+ int beg = 1, end = l_ref, x, max_k = -1;
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = beg; k <= end; ++k) {
+ int u;
+ double z;
+ set_u(u, bw, i, k);
+ // max_k packs the 0-based reference position (k-1) above two state bits: 0 = match, 1 = insertion
+ z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
+ z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
+ }
+ max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
+ if (state) state[i-1] = max_k;
+ // values above 100 are stored as 99; a value of exactly 100 is stored unchanged
+ if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
+#ifdef _MAIN
+ fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
+ "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
+#endif
+ }
+ /*** free ***/
+ for (i = 0; i <= l_query; ++i) {
+ free(f[i]); free(b[i]);
+ }
+ free(f); free(b); free(s); free(_qual);
+ return Pr;
+}
+
+#ifdef _MAIN
+#include <unistd.h>
+/* Command-line driver: kprobaln [-q qual] [-b bandwidth] <ref> <query>.
+ * Converts both sequences to 0..4 codes in place, runs kpa_glocal() without
+ * the backward pass and prints the returned score on stderr. */
+int main(int argc, char *argv[])
+{
+    uint8_t code[256], *iqual, *ref, *query;
+    int opt, ref_len, query_len, pos, base_q = 30, band = 10, score;
+
+    for (opt = getopt(argc, argv, "b:q:"); opt >= 0; opt = getopt(argc, argv, "b:q:")) {
+        if (opt == 'b') band = atoi(optarg);
+        else if (opt == 'q') base_q = atoi(optarg);
+    }
+    if (optind + 2 > argc) {
+        fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], base_q, band); // example: acttc attc
+        return 1;
+    }
+    // A/C/G/T in either case map to 0/1/2/3, every other byte to 4
+    memset(code, 4, 256);
+    for (pos = 0; pos < 4; ++pos)
+        code[(int)"ACGT"[pos]] = code[(int)"acgt"[pos]] = pos;
+    ref = (uint8_t*)argv[optind];
+    query = (uint8_t*)argv[optind+1];
+    ref_len = strlen((char*)ref);
+    query_len = strlen((char*)query);
+    for (pos = 0; pos < ref_len; ++pos) ref[pos] = code[ref[pos]];
+    for (pos = 0; pos < query_len; ++pos) query[pos] = code[query[pos]];
+    // every query base gets the same quality
+    iqual = malloc(query_len);
+    memset(iqual, base_q, query_len);
+    // NOTE(review): the band width is stored in kpa_par_def, but kpa_par_alt is what is passed below - confirm which is intended
+    kpa_par_def.bw = band;
+    score = kpa_glocal(ref, ref_len, query, query_len, iqual, &kpa_par_alt, 0, 0);
+    fprintf(stderr, "%d\n", score);
+    free(iqual);
+    return 0;
+}
+#endif
diff --git a/kstring.c b/kstring.c
new file mode 100644
index 0000000..b8ff45c
--- /dev/null
+++ b/kstring.c
@@ -0,0 +1,212 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include "kstring.h"
+
+/* Append printf-style formatted text to s, growing s->s when needed.
+ *
+ * Returns the number of characters appended, a negative value if vsnprintf()
+ * reports an error, or -1 if the buffer cannot be enlarged. In the failure
+ * cases s->l is not advanced. */
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+    va_list ap;
+    int l;
+    va_start(ap, fmt);
+    l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
+    va_end(ap);
+    if (l < 0) return l; // formatting error: keep s->l intact
+    if (l + 1 > s->m - s->l) {
+        // first attempt was truncated: enlarge the buffer and format again
+        size_t new_m = s->l + l + 2;
+        char *grown;
+        kroundup32(new_m);
+        grown = (char*)realloc(s->s, new_m);
+        if (grown == 0) return -1; // the old buffer is still valid and still owned by s
+        s->s = grown;
+        s->m = new_m;
+        va_start(ap, fmt);
+        l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+        va_end(ap); // each va_start() is paired with exactly one va_end()
+        if (l < 0) return l;
+    }
+    s->l += l;
+    return l;
+}
+
+/* Tokenizer that keeps its state in aux instead of modifying the string.
+ *
+ * str: the string to start scanning, or 0 to continue after the previous token.
+ * sep: separator characters, or 0 to keep the separators from the previous call.
+ * aux: state; on return aux->p points at the end of the token just returned.
+ *
+ * Returns a pointer to the start of the next token (its length is
+ * aux->p - token), or 0 once the end of the string has been consumed.
+ * Characters are handled as unsigned bytes, so bytes >= 0x80 are safe in
+ * both str and sep. */
+char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
+{
+    const unsigned char *p, *start, *usep = (const unsigned char*)sep;
+    if (usep) { // set up the table
+        if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
+        aux->finished = 0;
+        if (usep[0] && usep[1]) { // two or more separators: use the 256-bit lookup table
+            aux->sep = -1;
+            aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+            for (p = usep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
+        } else aux->sep = usep[0]; // zero or one separator: compare directly
+    }
+    if (aux->finished) return 0;
+    else if (str) aux->p = str - 1, aux->finished = 0;
+    start = (const unsigned char*)aux->p + 1;
+    if (aux->sep < 0) {
+        for (p = start; *p; ++p)
+            if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
+    } else {
+        for (p = start; *p; ++p)
+            if (*p == aux->sep) break;
+    }
+    aux->p = (const char*)p; // end of token
+    if (*p == 0) aux->finished = 1; // no more tokens
+    return (char*)start;
+}
+
+// s MUST BE a null terminated string; l = strlen(s)
+/* Split s into fields.
+ *
+ * s:         null terminated string; when _offsets is given, the separator
+ *            that ends each field is overwritten with 0.
+ * delimiter: the separator character, or 0 to split on runs of whitespace.
+ * _max:      in/out capacity of the offsets array (may be null).
+ * _offsets:  in/out array receiving the start offset of each field; it is
+ *            grown with realloc() as needed. Pass null to only count the
+ *            fields; s is then left untouched.
+ *
+ * Empty fields are skipped. Returns the number of fields. */
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+    int i, n, max, last_char, last_start, *offsets, l;
+    n = 0;
+    max = _max? *_max : 0;
+    offsets = _offsets? *_offsets : 0;
+    l = strlen(s);
+
+#define __ksplit_aux do { \
+        if (_offsets) { \
+            s[i] = 0; \
+            if (n == max) { \
+                max = max? max<<1 : 2; \
+                offsets = (int*)realloc(offsets, sizeof(int) * max); \
+            } \
+            offsets[n++] = last_start; \
+        } else ++n; \
+    } while (0)
+
+    for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+        if (delimiter == 0) {
+            // the <ctype.h> functions require a value representable as unsigned char
+            if (isspace((unsigned char)s[i]) || s[i] == 0) {
+                if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
+            } else {
+                if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+            }
+        } else {
+            if (s[i] == delimiter || s[i] == 0) {
+                if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+            } else {
+                if (last_char == delimiter || last_char == 0) last_start = i;
+            }
+        }
+        last_char = s[i]; // reads the 0 just written when a field ended here
+    }
+    if (_max) *_max = max;
+    if (_offsets) *_offsets = offsets;
+    return n;
+}
+
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+typedef unsigned char ubyte_t;
+
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+/* Build the Boyer-Moore tables for pattern pat of length m.
+ *
+ * The returned array holds m + 256 ints: the first m are the good-suffix
+ * shifts (bmGs), the following 256 the bad-character shifts (bmBc). The
+ * caller releases it with free(). A negative m is treated as 0, for which
+ * only the bad-character part exists. Returns 0 if memory runs out. */
+static int *ksBM_prep(const ubyte_t *pat, int m)
+{
+    int i, *suff, *prep, *bmGs, *bmBc;
+    if (m < 0) m = 0;
+    prep = (int*)calloc(m + 256, sizeof(int));
+    if (prep == 0) return 0;
+    bmGs = prep; bmBc = prep + m;
+    { // preBmBc()
+        for (i = 0; i < 256; ++i) bmBc[i] = m;
+        for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
+    }
+    if (m == 0) return prep; // empty pattern: the code below would write suff[-1]
+    suff = (int*)calloc(m, sizeof(int));
+    if (suff == 0) {
+        free(prep);
+        return 0;
+    }
+    { // suffixes()
+        int f = 0, g;
+        suff[m - 1] = m;
+        g = m - 1;
+        for (i = m - 2; i >= 0; --i) {
+            if (i > g && suff[i + m - 1 - f] < i - g)
+                suff[i] = suff[i + m - 1 - f];
+            else {
+                if (i < g) g = i;
+                f = i;
+                while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
+                suff[i] = f - g;
+            }
+        }
+    }
+    { // preBmGs()
+        int j = 0;
+        for (i = 0; i < m; ++i) bmGs[i] = m;
+        for (i = m - 1; i >= 0; --i)
+            if (suff[i] == i + 1)
+                for (; j < m - 1 - i; ++j)
+                    if (bmGs[j] == m)
+                        bmGs[j] = m - 1 - i;
+        for (i = 0; i <= m - 2; ++i)
+            bmGs[m - 1 - suff[i]] = m - 1 - i;
+    }
+    free(suff);
+    return prep;
+}
+
+/* Boyer-Moore search for the m bytes at _pat within the n bytes at _str.
+ *
+ * _prep: optional cache for the preprocessed pattern tables.
+ *        - null: the tables are built and freed inside this call;
+ *        - points to a null pointer: the tables are built and stored there,
+ *          and the caller frees them;
+ *        - points to tables from an earlier call: they are reused as is.
+ *
+ * Returns a pointer to the first match, or 0 if there is none or the tables
+ * cannot be allocated. */
+void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
+{
+    int i, j, *prep = 0, *bmGs, *bmBc;
+    const ubyte_t *str, *pat;
+    void *hit = 0;
+    str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
+    prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
+    if (prep == 0) return 0;
+    if (_prep && *_prep == 0) *_prep = prep;
+    bmGs = prep; bmBc = prep + m;
+    j = 0;
+    while (j <= n - m) {
+        // compare right to left; i < 0 afterwards means the whole pattern matched
+        for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
+        if (i < 0) {
+            hit = (void*)(str + j);
+            break;
+        } else {
+            // shift by the larger of the bad-character and good-suffix rules
+            int max = bmBc[str[i+j]] - m + 1 + i;
+            if (max < bmGs[i]) max = bmGs[i];
+            j += max;
+        }
+    }
+    // tables not handed back to the caller are released on every path, match or not
+    if (_prep == 0) free(prep);
+    return hit;
+}
+
+/* Find pat in the null terminated string str with kmemmem(); _prep is passed
+ * through unchanged. Returns the first match or 0. */
+char *kstrstr(const char *str, const char *pat, int **_prep)
+{
+    int hay_len = strlen(str);
+    int pat_len = strlen(pat);
+    void *found = kmemmem(str, hay_len, pat, pat_len, _prep);
+    return (char*)found;
+}
+
+/* Like kstrstr(), but only the first n bytes of str are searched. */
+char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
+{
+    int pat_len = strlen(pat);
+    void *found = kmemmem(str, n, pat, pat_len, _prep);
+    return (char*)found;
+}
+
+/***********************
+ * The main() function *
+ ***********************/
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+/* Self-test: exercises ksprintf(), ksplit(), kstrtok() and kstrstr() and
+ * prints the results on stdout. */
+int main()
+{
+    kstring_t *ks;
+    int *offsets, n_fields, idx;
+    ks_tokaux_t tok;
+    char *piece;
+
+    ks = (kstring_t*)calloc(1, sizeof(kstring_t));
+
+    // test ksprintf()
+    ksprintf(ks, " abcdefg: %d ", 100);
+    printf("'%s'\n", ks->s);
+
+    // test ksplit()
+    offsets = ksplit(ks, 0, &n_fields);
+    for (idx = 0; idx < n_fields; ++idx)
+        printf("field[%d] = '%s'\n", idx, ks->s + offsets[idx]);
+
+    // test kstrtok(): collect one token per line in the (emptied) string
+    ks->l = 0;
+    piece = kstrtok("ab:cde:fg/hij::k", ":/", &tok);
+    while (piece) {
+        kputsn(piece, tok.p - piece, ks);
+        kputc('\n', ks);
+        piece = kstrtok(0, 0, &tok);
+    }
+    printf("%s", ks->s);
+
+    // free
+    free(ks->s);
+    free(ks);
+    free(offsets);
+
+    {
+        // test kstrstr(): report every match, reusing the preprocessed pattern
+        static char *str = "abcdefgcdgcagtcakcdcd";
+        static char *pat = "cd";
+        char *hit, *cursor = str;
+        int *prep = 0;
+        for (hit = kstrstr(cursor, pat, &prep); hit != 0; hit = kstrstr(cursor, pat, &prep)) {
+            printf("match: %s\n", hit);
+            cursor = hit + prep[0];
+        }
+        free(prep);
+    }
+    return 0;
+}
+#endif
diff --git a/padding.c b/padding.c
new file mode 100644
index 0000000..a8da562
--- /dev/null
+++ b/padding.c
@@ -0,0 +1,479 @@
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include "kstring.h"
+#include "sam_header.h"
+#include "sam.h"
+#include "bam.h"
+#include "faidx.h"
+
+bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/
+
+/* Replace the CIGAR stored in b by the n operations in cigar.
+ * When the operation count changes, the data after the CIGAR is moved and
+ * b->data is enlarged if necessary. */
+static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
+{
+    int old_tail;
+
+    // same number of operations: overwrite in place
+    if (n == b->core.n_cigar) {
+        memcpy(b->data + b->core.l_qname, cigar, n * 4);
+        return;
+    }
+
+    // offset of the first byte after the old CIGAR
+    old_tail = b->core.l_qname + b->core.n_cigar * 4;
+    if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) {
+        b->m_data = b->data_len + (n - b->core.n_cigar) * 4;
+        kroundup32(b->m_data);
+        b->data = (uint8_t*)realloc(b->data, b->m_data);
+    }
+    // shift everything behind the CIGAR, then drop the new CIGAR into the gap
+    memmove(b->data + b->core.l_qname + n * 4, b->data + old_tail, b->data_len - old_tail);
+    memcpy(b->data + b->core.l_qname, cigar, n * 4);
+    b->data_len += (n - b->core.n_cigar) * 4;
+    b->core.n_cigar = n;
+}
+
+/* Append CIGAR value _v to the growable uint32_t array _c.
+ * _n is the number of elements in use and _m the capacity; both are updated.
+ * The capacity starts at 4 and doubles when full. _c, _n and _m must be
+ * lvalues and are evaluated more than once. */
+#define write_cigar(_c, _n, _m, _v) do { \
+        if ((_n) == (_m)) { \
+            (_m) = (_m)? (_m)<<1 : 4; \
+            (_c) = (uint32_t*)realloc((_c), (_m) * 4); \
+        } \
+        (_c)[(_n)++] = (_v); \
+    } while (0)
+
+/* Expand the aligned part of read b into s, one byte per padded reference
+ * column covered by the read.
+ *
+ * M/=/X operations copy the value of bam1_seqi() for each base, D operations
+ * write 0, soft clips skip the clipped bases and hard clips are ignored. Any
+ * other operation is reported on stderr and trips an assertion. On return
+ * s->l is the summed length of the M/=/X/D operations. */
+static void unpad_seq(bam1_t *b, kstring_t *s)
+{
+    int k, j, i;
+    int length;
+    uint32_t *cigar = bam1_cigar(b);
+    uint8_t *seq = bam1_seq(b);
+    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
+    // We need the padded length after alignment from the CIGAR (excluding
+    // soft clips S, but including pads from CIGAR D operations)
+    length = 0;
+    for (k = 0; k < b->core.n_cigar; ++k) {
+        int op, ol;
+        op= bam_cigar_op(cigar[k]);
+        ol = bam_cigar_oplen(cigar[k]);
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL)
+            length += ol;
+    }
+    ks_resize(s, length);
+    // j walks the bases stored in the record, s->l the output columns
+    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
+        int op, ol;
+        op = bam_cigar_op(cigar[k]);
+        ol = bam_cigar_oplen(cigar[k]);
+        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
+        } else if (op == BAM_CSOFT_CLIP) {
+            j += ol;
+        } else if (op == BAM_CHARD_CLIP) {
+            /* do nothing */
+        } else if (op == BAM_CDEL) {
+            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
+        } else {
+            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b));
+            assert(0); // was assert(-1), which is non-zero and therefore could never fire
+        }
+    }
+    assert(length == s->l);
+}
+
+/* Load padded reference ref_name from the FASTA index into seq, one byte per
+ * column: 0 for a gap ('-' or '*'), otherwise the bam_nt16_table code of the
+ * base.
+ *
+ * ref_len is the expected (padded) length. Returns 0 on success and -1 if
+ * the sequence cannot be fetched, its length differs from ref_len, or it
+ * contains a character whose code is 0 or 16. */
+int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
+{
+    char base;
+    char *fai_ref = 0;
+    int fai_ref_len = 0, k;
+
+    fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
+    if (fai_ref == 0 || fai_ref_len != ref_len) {
+        fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
+        free(fai_ref);
+        return -1;
+    }
+    ks_resize(seq, ref_len);
+    seq->l = 0;
+    for (k = 0; k < ref_len; ++k) {
+        base = fai_ref[k];
+        if (base == '-' || base == '*') {
+            // Map gaps to null to match unpad_seq function
+            seq->s[seq->l++] = 0;
+        } else {
+            // index with the unsigned byte value: a plain char may be negative
+            int i = bam_nt16_table[(unsigned char)base];
+            if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
+                fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
+                free(fai_ref);
+                return -1;
+            }
+            seq->s[seq->l++] = i;
+        }
+    }
+    assert(ref_len == seq->l);
+    free(fai_ref);
+    return 0;
+}
+
+/* Count the non-gap characters of padded reference ref_name.
+ *
+ * padded_len is the expected length including gaps ('-' or '*'). Returns the
+ * number of bases, or -1 if the sequence cannot be fetched, its length
+ * differs from padded_len, or it contains a character whose bam_nt16_table
+ * code is 0 or 16. */
+int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len)
+{
+    char base;
+    char *fai_ref = 0;
+    int fai_ref_len = 0, k;
+    int bases=0, gaps=0;
+
+    fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
+    if (fai_ref == 0 || fai_ref_len != padded_len) {
+        fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len);
+        free(fai_ref);
+        return -1;
+    }
+    for (k = 0; k < padded_len; ++k) {
+        base = fai_ref[k];
+        if (base == '-' || base == '*') {
+            gaps += 1;
+        } else {
+            // index with the unsigned byte value: a plain char may be negative
+            int i = bam_nt16_table[(unsigned char)base];
+            if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
+                fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
+                free(fai_ref);
+                return -1;
+            }
+            bases += 1;
+        }
+    }
+    free(fai_ref);
+    assert (padded_len == bases + gaps);
+    return bases;
+}
+
+/* Rebuild the padded-to-unpadded coordinate map for reference ref.
+ *
+ * posmap is resized to ref.m ints; posmap[i] becomes the number of non-zero
+ * (non-gap) entries of ref.s before position i. Returns the (possibly
+ * moved) array.
+ *
+ * Defined without `inline`: under C99 and later a plain `inline` definition
+ * provides no external definition, so a call that is not inlined would fail
+ * to link. */
+int * update_posmap(int *posmap, kstring_t ref)
+{
+    int i, k;
+    posmap = realloc(posmap, ref.m * sizeof(int));
+    for (i = k = 0; i < ref.l; ++i) {
+        posmap[i] = k;
+        if (ref.s[i]) ++k;
+    }
+    return posmap;
+}
+
+/* Convert alignments against padded references into alignments against the
+ * unpadded references.
+ *
+ * Records are read from `in` with samread() until it returns a negative
+ * value; each one is rewritten and passed to samwrite() on `out`.
+ * fai: optional FASTA index of the padded references; when null, a reference
+ *      must appear as an embedded record before the reads that use it.
+ *
+ * Returns 0 on success, 1 if samread() ended with a value below -1
+ * (truncated input), and -1 on the error paths below.
+ * NOTE(review): the `return -1` paths leave without the cleanup at the end of
+ * the function, and cigar2 is not freed on any path - looks like a leak. */
+int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
+{
+ bam_header_t *h = 0;
+ bam1_t *b = 0;
+ kstring_t r, q;
+ int r_tid = -1;
+ uint32_t *cigar2 = 0;
+ int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
+
+ b = bam_init1();
+ r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
+ int read_ret;
+ h = in->header;
+ while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in'
+ uint32_t *cigar = bam1_cigar(b);
+ n2 = 0;
+ // a record at position 0 whose name equals its reference name is taken as the embedded reference
+ if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
+ // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam1_qname(b));
+ r_tid = b->core.tid;
+ unpad_seq(b, &r);
+ if (h->target_len[r_tid] != r.l) {
+ fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l);
+ return -1;
+ }
+ if (fai) {
+ // Check the embedded reference matches the FASTA file
+ if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
+ fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
+ return -1;
+ }
+ assert(r.l == q.l);
+ int i;
+ for (i = 0; i < r.l; ++i) {
+ if (r.s[i] != q.s[i]) {
+ // Show gaps as ASCII 45
+ fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
+ h->target_name[b->core.tid], i+1,
+ r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45,
+ q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45);
+ return -1;
+ }
+ }
+ }
+ // the embedded reference itself gets a single match operation of length l_qseq
+ write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
+ replace_cigar(b, n2, cigar2);
+ posmap = update_posmap(posmap, r);
+ } else if (b->core.n_cigar > 0) {
+ int i, k, op;
+ // make sure r/posmap describe the reference this read is aligned to
+ if (b->core.tid < 0) {
+ fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b));
+ return -1;
+ } else if (b->core.tid == r_tid) {
+ ; // good case, reference available
+ //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b));
+ } else if (fai) {
+ if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
+ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+ return -1;
+ }
+ posmap = update_posmap(posmap, r);
+ r_tid = b->core.tid;
+ // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]);
+ } else {
+ fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
+ return -1;
+ }
+ unpad_seq(b, &q);
+ // carry leading clips over unchanged (hard clip first, then an optional soft clip)
+ if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
+ write_cigar(cigar2, n2, m2, cigar[0]);
+ } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
+ write_cigar(cigar2, n2, m2, cigar[0]);
+ if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
+ write_cigar(cigar2, n2, m2, cigar[1]);
+ }
+ }
+ /* Determine CIGAR operator for each base in the aligned read */
+ // read base + ref base: M; read base + ref gap: I; read gap + ref base: D; both gaps: P
+ for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
+ q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
+ /* Include any pads if starts with an insert */
+ if (q.s[0] == BAM_CINS) {
+ for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
+ if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
+ }
+ /* Count consecutive CIGAR operators to turn into a CIGAR string */
+ for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
+ if (op != q.s[i]) {
+ write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
+ op = q.s[i]; k = 1;
+ } else ++k;
+ }
+ write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
+ // carry trailing clips over unchanged (optional soft clip, then hard clip)
+ if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
+ write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
+ } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
+ if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
+ write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
+ }
+ write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
+ }
+ /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
+ int pre_op, post_op;
+ for (i = 2; i < n2; ++i)
+ if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
+ pre_op = bam_cigar_op(cigar2[i-2]);
+ post_op = bam_cigar_op(cigar2[i]);
+ /* Note don't need to check for X/= as code above will use M only */
+ if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
+ /* This is a redundant P operator */
+ cigar2[i-1] = 0; // i.e. 0M
+ /* If had same operator either side, combine them in post_op */
+ if (pre_op == post_op) {
+ /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
+ cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
+ cigar2[i-2] = 0; // i.e. 0M
+ }
+ }
+ }
+ /* Remove the zero'd operators (0M) */
+ for (i = k = 0; i < n2; ++i)
+ if (cigar2[i]) cigar2[k++] = cigar2[i];
+ n2 = k;
+ replace_cigar(b, n2, cigar2);
+ // translate the padded start position into unpadded coordinates
+ b->core.pos = posmap[b->core.pos];
+ if (b->core.mtid < 0 || b->core.mpos < 0) {
+ /* Nice case, no mate to worry about*/
+ // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b));
+ /* TODO - Warning if FLAG says mate should be mapped? */
+ /* Clean up funny input where mate position is given but mate reference is missing: */
+ b->core.mtid = -1;
+ b->core.mpos = -1;
+ } else if (b->core.mtid == b->core.tid) {
+ /* Nice case, same reference */
+ // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b));
+ b->core.mpos = posmap[b->core.mpos];
+ } else {
+ /* Nasty case, Must load alternative posmap */
+ // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]);
+ if (!fai) {
+ fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
+ return -1;
+ }
+ /* Temporarily load the other reference sequence */
+ if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
+ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
+ return -1;
+ }
+ posmap = update_posmap(posmap, r);
+ b->core.mpos = posmap[b->core.mpos];
+ /* Restore the reference and posmap*/
+ if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
+ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+ return -1;
+ }
+ posmap = update_posmap(posmap, r);
+ }
+ }
+ // records without a CIGAR that are not an embedded reference are written unchanged
+ samwrite(out, b);
+ }
+ if (read_ret < -1) {
+ fprintf(stderr, "[depad] truncated file.\n");
+ ret = 1;
+ }
+ free(r.s); free(q.s); free(posmap);
+ bam_destroy1(b);
+ return ret;
+}
+
+/*
+ * Build a de-padded copy of the header `old`.
+ *
+ * The copy is made with bam_header_dup().  Every target length for which
+ * get_unpadded_len() succeeds is replaced by the unpadded length; a failure
+ * is reported on stderr and the padded length is kept.  All "@SQ" lines are
+ * then removed from the header text (see the TODO below), and the text
+ * buffer is shrunk to fit when it became shorter.
+ *
+ * old - header of the padded input; not modified.
+ * fai - FASTA index handle, passed through to get_unpadded_len().
+ *
+ * Returns the new header; the caller owns it.
+ */
+bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
+{
+    int i = 0, unpadded_len = 0;
+    bam_header_t *header = 0 ;
+    const char *text;
+    char *dst;
+
+    header = bam_header_dup(old);
+    for (i = 0; i < old->n_targets; ++i) {
+        unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
+        if (unpadded_len < 0) {
+            fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
+        } else {
+            header->target_len[i] = unpadded_len;
+            //fprintf(stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]);
+        }
+    }
+    /* The caller only warns about a header without text, so it can get here
+     * with a NULL text pointer.  There are no @SQ lines to strip in that case,
+     * and strlen()/strcmp() below must not be handed NULL. */
+    if (old->text == 0 || header->text == 0) return header;
+    /* Duplicating the header allocated new buffer for header string */
+    /* After modifying the @SQ lines it will only get smaller, since */
+    /* the LN entries will be the same or shorter, and we'll remove */
+    /* any MD entries (MD5 checksums). */
+    assert(strlen(old->text) == strlen(header->text));
+    assert (0==strcmp(old->text, header->text));
+    text = old->text;
+    dst = header->text; /* Reuse the allocated buffer */
+    *dst = '\0';
+    while (text[0]=='@') {
+        const char *end = strchr(text, '\n');
+        /* Line length including its newline.  A final line without a newline
+         * runs up to the terminating NUL instead of tripping over end == NULL. */
+        size_t len = end ? (size_t)(end - text) + 1 : strlen(text);
+        if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
+            /* TODO - edit the @SQ line here to remove MD and fix LN. */
+            /* For now just remove the @SQ line, and samtools will */
+            /* automatically generate a minimal replacement with LN. */
+            /* However, that discards any other tags like AS, SP, UR. */
+            //fprintf(stderr, "[depad] Removing @SQ line\n");
+        } else {
+            /* Copy this line to the new header.  Appending at a tracked write
+             * position avoids rescanning the output for every line. */
+            memcpy(dst, text, len);
+            dst += len;
+            *dst = '\0';
+        }
+        text += len;
+    }
+    assert (text[0]=='\0');
+    /* Check we didn't overflow the buffer */
+    assert (strlen(header->text) <= strlen(old->text));
+    if (strlen(header->text) < header->l_text) {
+        //fprintf(stderr, "[depad] Reallocating header buffer\n");
+        size_t new_len = (size_t)(dst - header->text);
+        char *shrunk = malloc(new_len + 1);
+        if (shrunk) {
+            memcpy(shrunk, header->text, new_len + 1);
+            free(header->text);
+            header->text = shrunk;
+        }
+        /* If malloc failed the larger buffer is kept; it is still a valid
+         * NUL-terminated string, so only the recorded length changes. */
+        header->l_text = new_len;
+    }
+    //fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text);
+    return header;
+}
+
+static int usage(int is_long_help);
+
+/*
+ * Entry point of the "depad" command.
+ *
+ * Parses the options (-S, -s, -o, -u, -1, -T, -?), opens the input and output
+ * with samopen(), builds a corrected header with fix_header() when a FASTA
+ * reference was given with -T, and runs bam_pad2unpad().
+ *
+ * Returns the value of usage() when the arguments are unusable, 1 when a file
+ * cannot be opened or loaded, otherwise the result of bam_pad2unpad().
+ */
+int main_pad2unpad(int argc, char *argv[])
+{
+    samfile_t *in = 0, *out = 0;
+    bam_header_t *h = 0;
+    faidx_t *fai = 0;
+    int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0;
+    char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
+    int ret=0;
+
+    /* parse command-line options */
+    strcpy(in_mode, "r"); strcpy(out_mode, "w");
+    while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) {
+        switch (c) {
+        case 'S': is_bamin = 0; break;
+        case 's': assert(compress_level == -1); is_bamout = 0; break;
+        /* free() first so that a repeated option does not leak the earlier copy */
+        case 'o': free(fn_out); fn_out = strdup(optarg); break;
+        case 'u': assert(is_bamout == 1); compress_level = 0; break;
+        case '1': assert(is_bamout == 1); compress_level = 1; break;
+        case 'T': free(fn_ref); fn_ref = strdup(optarg); break;
+        case '?': is_long_help = 1; break;
+        default: ret = usage(is_long_help); goto depad_end;
+        }
+    }
+    if (argc == optind) {
+        ret = usage(is_long_help);
+        goto depad_end;
+    }
+
+    /* Longest possible mode string is "wbh1": four characters plus the NUL. */
+    if (is_bamin) strcat(in_mode, "b");
+    if (is_bamout) strcat(out_mode, "b");
+    strcat(out_mode, "h");
+    if (compress_level >= 0) {
+        char tmp[2];
+        tmp[0] = compress_level + '0'; tmp[1] = '\0';
+        strcat(out_mode, tmp);
+    }
+
+    // Load FASTA reference (also needed for SAM -> BAM if missing header)
+    if (fn_ref) {
+        fn_list = samfaipath(fn_ref);
+        fai = fai_load(fn_ref);
+        if (fai == 0) {
+            /* Without this check fix_header() would be called with a NULL index. */
+            fprintf(stderr, "[depad] failed to load reference \"%s\".\n", fn_ref);
+            ret = 1;
+            goto depad_end;
+        }
+    }
+    // open file handlers
+    if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+        fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
+        ret = 1;
+        goto depad_end;
+    }
+    if (in->header == 0) {
+        fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+        ret = 1;
+        goto depad_end;
+    }
+    if (in->header->text == 0 || in->header->l_text == 0) {
+        fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]);
+        assert (0 == in->header->l_text);
+        assert (0 == in->header->text);
+    }
+    if (fn_ref) {
+        h = fix_header(in->header, fai);
+    } else {
+        fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
+        h = in->header;
+    }
+    if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) {
+        fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
+        ret = 1;
+        goto depad_end;
+    }
+
+    // Do the depad
+    ret = bam_pad2unpad(in, out, fai);
+
+depad_end:
+    // close files, free and return
+    if (fai) fai_destroy(fai);
+    /* h is only set once `in` is open.  It aliases in->header when no
+     * reference was given, so only a separately built header is destroyed. */
+    if (h && h != in->header) bam_header_destroy(h);
+    if (in) samclose(in);
+    if (out) samclose(out);
+    free(fn_list); free(fn_out); free(fn_ref);
+    return ret;
+}
+
+/*
+ * Print the "depad" usage text to stderr.
+ *
+ * is_long_help - when non-zero, the "Notes:" section is printed as well.
+ *
+ * Always returns 1, so callers can return the result as their exit status.
+ */
+static int usage(int is_long_help)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools depad <in.bam>\n\n");
+ fprintf(stderr, "Options: -s output is SAM (default is BAM)\n");
+ fprintf(stderr, " -S input is SAM (default is BAM)\n");
+ fprintf(stderr, " -u uncompressed BAM output (can't use with -s)\n");
+ fprintf(stderr, " -1 fast compression BAM output (can't use with -s)\n");
+ fprintf(stderr, " -T FILE reference sequence file [null]\n");
+ fprintf(stderr, " -o FILE output file name [stdout]\n");
+ fprintf(stderr, " -? longer help\n");
+ fprintf(stderr, "\n");
+ /* The notes below are one string literal continued with backslash-newline. */
+ if (is_long_help)
+ fprintf(stderr, "Notes:\n\
+\n\
+ 1. Requires embedded reference sequences (before the reads for that reference),\n\
+ with the future aim to also support a FASTA padded reference sequence file.\n\
+\n\
+ 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\
+\n");
+ return 1;
+}
diff --git a/phase.c b/phase.c
new file mode 100644
index 0000000..ef4eff9
--- /dev/null
+++ b/phase.c
@@ -0,0 +1,687 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <math.h>
+#include <zlib.h>
+#include "bam.h"
+#include "errmod.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MAX_VARS 256
+#define FLIP_PENALTY 2
+#define FLIP_THRES 4
+#define MASK_THRES 3
+
+#define FLAG_FIX_CHIMERA 0x1
+#define FLAG_LIST_EXCL 0x4
+#define FLAG_DROP_AMBI 0x8
+
+// Global state of one "phase" run; zeroed and then filled in by main_phase().
+typedef struct {
+ // configurations, initialized in the main function
+ // flag: bit set of FLAG_* values; k: block length (-k); min_baseQ: minimum
+ // base quality for het calling (-Q); min_varLOD: minimum het phred-LOD (-q);
+ // max_depth: maximum read depth (-D).
+ int flag, k, min_baseQ, min_varLOD, max_depth;
+ // other global variables
+ // vpos_shift: number of variant sites already reported on the current
+ // chromosome; phase() adds to it and main_phase() resets it on a chromosome change.
+ int vpos_shift;
+ // fp: input BAM being read by readaln().
+ bamFile fp;
+ // pre: prefix of the output BAMs (-b); NULL when no BAMs are written.
+ char *pre;
+ // out[0], out[1]: "<pre>.0.bam" and "<pre>.1.bam"; out[2]: "<pre>.chimera.bam".
+ bamFile out[3];
+ // alignment queue
+ // n: alignments currently queued in b; m: allocated capacity of b.
+ int n, m;
+ // b: duplicated alignments waiting to be written by dump_aln().
+ bam1_t **b;
+} phaseg_t;
+
+// One fragment (all alignments sharing a read name) and its alleles at consecutive variant sites.
+typedef struct {
+ // Allele code per covered site, as assigned in main_phase(): 1 = matches the
+ // first consensus allele, 2 = matches the second, 0 = neither / not covered.
+ int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation!
+ // vpos: index of the first covered variant site; beg: core.pos of the first
+ // alignment seen; end: bam_calend() of the most recent alignment seen.
+ int vpos, beg, end;
+ // vlen: number of sites spanned in seq[]; single: fragment spans one site only;
+ // flip: a chimera was repaired in fragphase(); phase: assigned haplotype (0/1);
+ // phased: 0 when in == out; ambig: set by the ambiguity test in fragphase().
+ uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1;
+ uint32_t in:16, out:16; // in-phase and out-phase
+} frag_t, *frag_p;
+
+#define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+KHASH_MAP_INIT_INT64(64, frag_t)
+
+typedef khash_t(64) nseq_t;
+
+#include "ksort.h"
+KSORT_INIT(rseq, frag_p, rseq_lt)
+
+static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+// 64-bit multiplicative string hash: start from the first character, then
+// hash = hash * 31 + next character for the rest of the string.  An empty
+// string hashes to 0.
+static inline uint64_t X31_hash_string(const char *s)
+{
+    uint64_t hash = *s;
+    if (hash == 0) return hash;
+    while (*++s)
+        hash = hash * 31 + *s; // same value as (hash << 5) - hash + *s modulo 2^64
+    return hash;
+}
+
+// Add one window of l allele codes to the pattern table cnt.
+//
+// seq[i] is 0 for an unknown site, otherwise 1 or 2.  Known sites contribute
+// the bit seq[i]-1; each unknown site is expanded to both 0 and 1, so a window
+// with u unknown sites increments 2^u entries of cnt.  The l bits are packed
+// with seq[0] as the most significant bit.  Nothing is counted when the last
+// site is unknown or when at most one site is known.
+static void count1(int l, const uint8_t *seq, int *cnt)
+{
+    int pos, n_unknown = 0;
+    uint32_t fill, n_fill;
+    if (seq[l-1] == 0) return; // the window must end on a known base
+    for (pos = 0; pos < l; ++pos)
+        n_unknown += (seq[pos] == 0);
+    if (l - n_unknown <= 1) return; // only one SNP
+    n_fill = 1u << n_unknown;
+    for (fill = 0; fill < n_fill; ++fill) {
+        uint32_t pattern = 0, pending = fill;
+        for (pos = 0; pos < l; ++pos) {
+            uint32_t bit;
+            if (seq[pos]) {
+                bit = seq[pos] - 1;
+            } else { // take the next bit of the current fill value
+                bit = pending & 1;
+                pending >>= 1;
+            }
+            pattern = pattern << 1 | bit;
+        }
+        ++cnt[pattern];
+    }
+}
+
+// Build the per-site pattern tables used by dynaprog().
+//
+// Returns an array of vpos tables, each holding 1<<l zero-initialised counters.
+// For every fragment that starts before vpos and is not a singleton, and for
+// every site j >= 1 of that fragment, the l-long window ending at site j is
+// tallied with count1() into cnt[f->vpos + j].  Fragments with vlen == 1 are
+// marked single and skipped.  phase() frees each cnt[i] and then cnt.
+static int **count_all(int l, int vpos, nseq_t *hash)
+{
+ khint_t k;
+ int i, j, **cnt;
+ uint8_t *seq;
+ // seq is the scratch window handed to count1()
+ seq = calloc(l, 1);
+ cnt = calloc(vpos, sizeof(void*));
+ for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos >= vpos || f->single) continue; // out of region; or singleton
+ if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right
+ f->single = 1;
+ continue;
+ }
+ for (j = 1; j < f->vlen; ++j) {
+ // Window slot i maps to fragment site j - (l-1-i); slots that would fall
+ // before the start of the fragment are filled with 0 (unknown).
+ for (i = 0; i < l; ++i)
+ seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)];
+ count1(l, seq, cnt[f->vpos + j]);
+ }
+ }
+ }
+ free(seq);
+ return cnt;
+}
+
+// phasing
+//
+// Dynamic programming over the vpos variant sites.  w[i] is the pattern table
+// of site i (1<<l counters, as built by count_all()).  A state x is an l-bit
+// local haplotype whose top bit is clear, so there are z = 1<<(l-1) states.
+// Returns a calloc'd array of vpos values, one 0/1 per site, which phase()
+// uses as the haplotype path and frees.
+static int8_t *dynaprog(int l, int vpos, int **w)
+{
+ int *f[2], *curr, *prev, max, i;
+ int8_t **b, *h = 0;
+ uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1;
+ // f[0] and f[1] are the two score rows (previous and current site); b[i][x]
+ // records which predecessor was chosen for state x at site i.
+ f[0] = calloc(z, sizeof(int));
+ f[1] = calloc(z, sizeof(int));
+ b = calloc(vpos, sizeof(void*));
+ prev = f[0]; curr = f[1];
+ // fill the backtrack matrix
+ for (i = 0; i < vpos; ++i) {
+ int *wi = w[i], *tmp;
+ int8_t *bi;
+ bi = b[i] = calloc(z, 1);
+ /* In the following, x is the current state, which is the
+ * lexicographically smaller local haplotype. xc is the complement of
+ * x, or the larger local haplotype; y0 and y1 are the two predecessors
+ * of x. */
+ for (x = 0; x < z; ++x) { // x0 is the smaller
+ uint32_t y0, y1, xc;
+ int c0, c1;
+ xc = ~x&mask; y0 = x>>1; y1 = xc>>1;
+ // Both candidates add the same site weight wi[x] + wi[xc]; they differ
+ // only in the predecessor score.
+ c0 = prev[y0] + wi[x] + wi[xc];
+ c1 = prev[y1] + wi[x] + wi[xc];
+ // ties go to predecessor y1 (bi[x] = 1)
+ if (c0 > c1) bi[x] = 0, curr[x] = c0;
+ else bi[x] = 1, curr[x] = c1;
+ }
+ tmp = prev; prev = curr; curr = tmp; // swap
+ }
+ { // backtrack
+ uint32_t max_x = 0;
+ int which = 0;
+ h = calloc(vpos, 1);
+ // After the final swap prev holds the scores of the last site.  max starts
+ // at 0, so state 0 is kept unless some state has a strictly positive score.
+ for (x = 0, max = 0, max_x = 0; x < z; ++x)
+ if (prev[x] > max) max = prev[x], max_x = x;
+ for (i = vpos - 1, x = max_x; i >= 0; --i) {
+ // `which` toggles every time the walk passes through a complemented
+ // predecessor; while it is set the emitted bit is inverted.
+ h[i] = which? (~x&1) : (x&1);
+ which = b[i][x]? !which : which;
+ x = b[i][x]? (~x&mask)>>1 : x>>1;
+ }
+ }
+ // free
+ for (i = 0; i < vpos; ++i) free(b[i]);
+ free(f[0]); free(f[1]); free(b);
+ return h;
+}
+
+// phase each fragment
+//
+// For every fragment starting before vpos: choose its haplotype (f->phase) by
+// comparing its alleles with `path`, fill in f->in, f->out, f->phased and
+// f->ambig, optionally repair chimeric fragments (when `flip` is non-zero),
+// and add the fragment's evidence to the per-site counters.
+//
+// Returns a calloc'd array of vpos 64-bit counters, freed by phase().  Each
+// counter packs four 16-bit fields:
+//   bits  0-15  phase-0 fragments agreeing with path
+//   bits 16-31  phase-0 fragments disagreeing
+//   bits 32-47  phase-1 fragments agreeing
+//   bits 48-63  phase-1 fragments disagreeing
+// NOTE(review): the fields are incremented without any overflow check.
+static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
+{
+ khint_t k;
+ uint64_t *pcnt;
+ uint32_t *left, *rght, max;
+ // left/rght are scratch arrays for the chimera test, grown on demand
+ left = rght = 0; max = 0;
+ pcnt = calloc(vpos, 8);
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ int i, c[2];
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos >= vpos) continue;
+ // get the phase
+ // c[0]: sites where the allele code equals path + 1; c[1]: all other known sites
+ c[0] = c[1] = 0;
+ for (i = 0; i < f->vlen; ++i) {
+ if (f->seq[i] == 0) continue;
+ ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
+ }
+ f->phase = c[0] > c[1]? 0 : 1;
+ f->in = c[f->phase]; f->out = c[1 - f->phase];
+ f->phased = f->in == f->out? 0 : 1;
+ f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0;
+ // fix chimera
+ // only attempted when both haplotypes are supported by at least 3 sites
+ f->flip = 0;
+ if (flip && c[0] >= 3 && c[1] >= 3) {
+ int sum[2], m, mi, md;
+ if (f->vlen > max) { // enlarge the array
+ max = f->vlen;
+ kroundup32(max);
+ left = realloc(left, max * 4);
+ rght = realloc(rght, max * 4);
+ }
+ // left[i] / rght[i] pack the running counts over sites 0..i / i..vlen-1 as
+ // (disagreeing << 16 | agreeing), relative to the phase chosen above.
+ for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
+ if (f->seq[i]) {
+ int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ ++sum[c == path[f->vpos + i]? 0 : 1];
+ }
+ left[i] = sum[1]<<16 | sum[0];
+ }
+ for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
+ if (f->seq[i]) {
+ int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ ++sum[c == path[f->vpos + i]? 0 : 1];
+ }
+ rght[i] = sum[1]<<16 | sum[0];
+ }
+ // find the best flip point
+ // a[0] scores "keep the head, flip the tail after site i"; a[1] scores
+ // "flip the head up to site i, keep the tail".  The counts on the tail side
+ // that contradict the split are weighted by FLIP_PENALTY.
+ for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
+ int a[2];
+ a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
+ a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
+ if (a[0] > a[1]) {
+ if (a[0] > m) m = a[0], md = 0, mi = i;
+ } else {
+ if (a[1] > m) m = a[1], md = 1, mi = i;
+ }
+ }
+ // flip only when the best split beats both unsplit counts by FLIP_THRES
+ if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
+ f->flip = 1;
+ if (md == 0) { // flip the tail
+ for (i = mi + 1; i < f->vlen; ++i)
+ if (f->seq[i] == 1) f->seq[i] = 2;
+ else if (f->seq[i] == 2) f->seq[i] = 1;
+ } else { // flip the head
+ for (i = 0; i <= mi; ++i)
+ if (f->seq[i] == 1) f->seq[i] = 2;
+ else if (f->seq[i] == 2) f->seq[i] = 1;
+ }
+ }
+ }
+ // update pcnt[]
+ // NOTE(review): this uses seq[] after a possible flip, together with the
+ // phase that was computed before the flip.
+ if (!f->single) {
+ for (i = 0; i < f->vlen; ++i) {
+ int c;
+ if (f->seq[i] == 0) continue;
+ c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ if (c == path[f->vpos + i]) {
+ if (f->phase == 0) ++pcnt[f->vpos + i];
+ else pcnt[f->vpos + i] += 1ull<<32;
+ } else {
+ if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
+ else pcnt[f->vpos + i] += 1ull<<48;
+ }
+ }
+ }
+ }
+ }
+ free(left); free(rght);
+ return pcnt;
+}
+
+// Find runs of sites whose phasing evidence is poor.
+//
+// pcnt holds the packed per-site counters produced by fragphase().  A running
+// score, clamped at 0, goes up at sites with errors and down at clean sites.
+// Each time the score falls back to 0, or the last site is reached, and the
+// peak score seen was at least MASK_THRES, the region from where the score
+// became positive to the peak is recorded.
+//
+// Returns a realloc'd list of *_n entries, each (first site << 32 | peak site),
+// or NULL when nothing was recorded.  phase() frees the list.
+static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
+{
+ int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0;
+ uint64_t *list = 0;
+ for (i = 0; i < vpos; ++i) {
+ uint64_t x = pcnt[i];
+ int c[4], pre = score, s;
+ // c[0]/c[1]: supports/errors of phase 0; c[2]/c[3]: supports/errors of phase 1
+ c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff;
+ // A site without errors lowers the score by its support count; otherwise the
+ // score rises by (errors - 1), plus any excess of errors over supports per phase.
+ s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1);
+ if (c[3] > c[2]) s += c[3] - c[2];
+ if (c[1] > c[0]) s += c[1] - c[0];
+ score += s;
+ if (score < 0) score = 0;
+ if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
+ if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) {
+ if (n == m) {
+ m = m? m<<1 : 4;
+ list = realloc(list, m * 8);
+ }
+ list[n++] = (uint64_t)beg<<32 | max_i;
+ // scanning resumes at the site after the recorded peak
+ i = max_i; // reset i to max_i
+ score = 0;
+ } else if (score > max) max = score, max_i = i;
+ if (score == 0) max = 0;
+ }
+ *_n = n;
+ return list;
+}
+
+// Strip unknown (0) allele codes from both ends of every fragment that starts
+// before vpos.  A fragment with no known allele left is removed from the hash;
+// otherwise its seq, vpos, vlen and single fields are updated.
+// Returns 1 when at least one fragment starts at or after vpos, else 0.
+static int clean_seqs(int vpos, nseq_t *hash)
+{
+    khint_t it;
+    int found_later = 0;
+    for (it = 0; it < kh_end(hash); ++it) {
+        frag_t *frag;
+        int first, last_plus1;
+        if (!kh_exist(hash, it)) continue;
+        frag = &kh_val(hash, it);
+        if (frag->vpos >= vpos) {
+            found_later = 1;
+            continue;
+        }
+        // first known site from the left
+        first = 0;
+        while (first < frag->vlen && frag->seq[first] == 0) ++first;
+        // one past the last known site from the right
+        last_plus1 = frag->vlen;
+        while (last_plus1 > 0 && frag->seq[last_plus1 - 1] == 0) --last_plus1;
+        if (last_plus1 - first <= 0) {
+            kh_del(64, hash, it);
+            continue;
+        }
+        if (first != 0) memmove(frag->seq, frag->seq + first, last_plus1 - first);
+        frag->vpos += first;
+        frag->vlen = last_plus1 - first;
+        frag->single = frag->vlen == 1? 1 : 0;
+    }
+    return found_later;
+}
+
+// Write out and release the queued alignments that end at or before min_pos.
+//
+// Each alignment is looked up in `hash` by the hash of its read name and sent to
+// one of the three outputs: g->out[0] or g->out[1] for a haplotype, g->out[2]
+// for chimeric reads (and for ambiguous reads when FLAG_DROP_AMBI is set).
+// Reads that are unknown, unphased, or ambiguous without FLAG_DROP_AMBI are
+// sent to output 0 or 1 at random.  Flushing stops at the first queued
+// alignment that ends after min_pos; the remaining ones are moved to the
+// front of the queue.
+static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+{
+ int i, is_flip, drop_ambi;
+ drop_ambi = g->flag & FLAG_DROP_AMBI;
+ // one coin toss per call decides whether haplotype labels 0 and 1 are swapped
+ is_flip = (drand48() < 0.5);
+ for (i = 0; i < g->n; ++i) {
+ int end, which;
+ uint64_t key;
+ khint_t k;
+ bam1_t *b = g->b[i];
+ key = X31_hash_string(bam1_qname(b));
+ end = bam_calend(&b->core, bam1_cigar(b));
+ if (end > min_pos) break;
+ k = kh_get(64, hash, key);
+ // which: 0/1 = haplotype output, 2 = chimera output, 3 = pick 0 or 1 at random
+ if (k == kh_end(hash)) which = 3;
+ else {
+ frag_t *f = &kh_val(hash, k);
+ if (f->ambig) which = drop_ambi? 2 : 3;
+ else if (f->phased && f->flip) which = 2;
+ else if (f->phased == 0) which = 3;
+ else { // phased and not flipped
+ char c = 'Y';
+ which = f->phase;
+ // confidently phased reads are tagged ZP:A:Y
+ bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c);
+ }
+ if (which < 2 && is_flip) which = 1 - which; // increase the randomness
+ }
+ if (which == 3) which = (drand48() < 0.5);
+ bam_write1(g->out[which], b);
+ bam_destroy1(b);
+ g->b[i] = 0;
+ }
+ // i is now the number of alignments written; compact the queue
+ memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
+ g->n -= i;
+}
+
+// Phase one block of vpos variant sites and print the result to stdout.
+//
+// cns[i] holds site i: the position in the upper 32 bits and the consensus
+// code from gl2cns() in the lower bits.  Output is a "PS" line, optional "FL"
+// lines for filtered regions, one "M?" line per site, one "EV" line per
+// supporting fragment, and a closing "//".  Queued alignments are then
+// flushed with dump_aln() and g->vpos_shift is advanced.
+//
+// Returns the number of sites handled: 0 when vpos is 0, 1 for the single-site
+// shortcut, otherwise vpos.
+static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
+{
+ int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
+ khint_t k;
+ frag_t **seqs;
+ int8_t *path, *sitemask;
+ uint64_t *pcnt, *regmask;
+
+ if (vpos == 0) return 0;
+ i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
+ // When a fragment already starts at the next site, alignments are only flushed
+ // up to the position of that site (cns[vpos]); otherwise everything is flushed.
+ min_pos = i? cns[vpos]>>32 : 0x7fffffff;
+ if (vpos == 1) {
+ // Single site: nothing to phase.  It is reported as an M0 marker and each
+ // fragment's phase is taken directly from its allele code.
+ printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
+ printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
+ "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos) continue;
+ f->flip = 0;
+ if (f->seq[0] == 0) f->phased = 0;
+ else f->phased = 1, f->phase = f->seq[0] - 1;
+ }
+ }
+ dump_aln(g, min_pos, hash);
+ ++g->vpos_shift;
+ return 1;
+ }
+ { // phase
+ int **cnt;
+ uint64_t *mask;
+ printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
+ sitemask = calloc(vpos, 1);
+ cnt = count_all(g->k, vpos, hash);
+ path = dynaprog(g->k, vpos, cnt);
+ for (i = 0; i < vpos; ++i) free(cnt[i]);
+ free(cnt);
+ pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
+ mask = genmask(vpos, pcnt, &n_masked);
+ regmask = calloc(n_masked, 8);
+ // Convert each masked run of site indices into (start position << 32 | end
+ // position) and flag every site inside it in sitemask.
+ for (i = 0; i < n_masked; ++i) {
+ regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32;
+ for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
+ sitemask[j] = 1;
+ }
+ free(mask);
+ // With chimera fixing enabled, the per-site counts are recomputed after repair.
+ if (g->flag & FLAG_FIX_CHIMERA) {
+ free(pcnt);
+ pcnt = fragphase(vpos, path, hash, 1);
+ }
+ }
+ for (i = 0; i < n_masked; ++i)
+ printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
+ for (i = 0; i < vpos; ++i) {
+ uint64_t x = pcnt[i];
+ int8_t c[2];
+ // An allele whose half of the consensus word has nothing above its two base
+ // bits is printed as 'X' (index 4).
+ c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
+ c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
+ // M1 for a normal site, M2 for a site inside a filtered region
+ printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
+ i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
+ }
+ free(path); free(pcnt); free(regmask); free(sitemask);
+ // Collect the non-singleton fragments of this block, sort them by first site
+ // (rseq_lt) and print them as EV lines.
+ seqs = calloc(n_seqs, sizeof(void*));
+ for (k = 0, i = 0; k < kh_end(hash); ++k)
+ if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
+ seqs[i++] = &kh_val(hash, k);
+ n_seqs = i;
+ ks_introsort_rseq(n_seqs, seqs);
+ for (i = 0; i < n_seqs; ++i) {
+ frag_t *f = seqs[i];
+ printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
+ for (j = 0; j < f->vlen; ++j) {
+ uint32_t c = cns[f->vpos + j];
+ if (f->seq[j] == 0) putchar('N');
+ else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
+ }
+ printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
+ }
+ free(seqs);
+ printf("//\n");
+ fflush(stdout);
+ g->vpos_shift += vpos;
+ dump_aln(g, min_pos, hash);
+ return vpos;
+}
+
+// Re-base the fragment table after `vpos` sites have been consumed: fragments
+// that start before vpos are deleted, and the start index of every remaining
+// fragment is lowered by vpos.
+static void update_vpos(int vpos, nseq_t *hash)
+{
+    khint_t it;
+    for (it = 0; it < kh_end(hash); ++it) {
+        frag_t *frag;
+        if (!kh_exist(hash, it)) continue;
+        frag = &kh_val(hash, it);
+        if (frag->vpos >= vpos) {
+            frag->vpos -= vpos;
+            continue;
+        }
+        kh_del(64, hash, it); // TODO: if frag_t::seq is allocated dynamically, free it
+    }
+}
+
+// Placeholder for compacting the fragment table; currently returns its
+// argument unchanged.
+static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement
+{
+ return hash;
+}
+
+// Read callback handed to bam_plp_init(): fetches the next alignment from
+// g->fp into b.  When output BAMs were requested (g->pre is set), a copy of
+// every alignment that is not unmapped, secondary, QC-failed or duplicate is
+// appended to the g->b queue.  Returns the result of bam_read1().
+static int readaln(void *data, bam1_t *b)
+{
+    phaseg_t *g = (phaseg_t*)data;
+    int status = bam_read1(g->fp, b);
+    if (status < 0) return status;
+    if (b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) return status;
+    if (!g->pre) return status;
+    if (g->n == g->m) { // queue is full: start at 16 slots, then double
+        if (g->m) g->m <<= 1;
+        else g->m = 16;
+        g->b = realloc(g->b, g->m * sizeof(void*));
+    }
+    g->b[g->n] = bam_dup1(b);
+    ++g->n;
+    return status;
+}
+
+// Load a list of sites from the (optionally gzip-compressed) text file `fn`,
+// or from standard input when fn is "-".
+//
+// The first two fields of each line are read as a reference name and a
+// 1-based position; the rest of the line is ignored.  Lines whose reference
+// name is not found in header `h`, or whose position is below 1, are skipped.
+//
+// Returns a set of keys (tid << 32 | 0-based position).  When the file cannot
+// be opened an error is printed and the returned set is empty.  The caller
+// owns the set.
+static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
+{
+    gzFile fp;
+    kstream_t *ks;
+    int ret, dret;
+    kstring_t *str;
+    khash_t(set64) *hash;
+
+    hash = kh_init(set64);
+    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+    if (fp == 0) {
+        // Previously a NULL stream was handed on to ks_init() and gzclose().
+        fprintf(stderr, "[phase] failed to open the site list \"%s\".\n", fn);
+        return hash;
+    }
+    str = calloc(1, sizeof(kstring_t));
+    ks = ks_init(fp);
+    while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+        int tid = bam_get_tid(h, str->s);
+        if (tid >= 0 && dret != '\n') {
+            if (ks_getuntil(ks, 0, str, &dret) >= 0) {
+                int site = atoi(str->s);
+                // A position below 1 would wrap around when converted to the
+                // unsigned key and could never match a pileup position.
+                if (site >= 1) {
+                    uint64_t x = (uint64_t)tid<<32 | (uint64_t)(site - 1);
+                    kh_put(set64, hash, x, &ret);
+                }
+            } else break;
+        }
+        // skip whatever remains of the current line
+        if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+        if (dret < 0) break;
+    }
+    ks_destroy(ks);
+    gzclose(fp);
+    free(str->s); free(str);
+    return hash;
+}
+
+// Turn a 4x4 genotype table into a consensus code.
+//
+// q is indexed by (a<<2 | b); only the entries with a <= b are examined.  The
+// smallest entry selects the genotype.  Returns 0 when that genotype is
+// homozygous.  For a heterozygous genotype the result packs bit 18 set, the
+// first allele in bits 16-17, the second allele in bits 0-1, and the rounded
+// gap between the second-smallest and the smallest entry starting at bit 2.
+static int gl2cns(float q[16])
+{
+    float best = 1e30, second = 1e30;
+    int a, b, best_gt = -1, allele1, allele2, gap;
+    for (a = 0; a < 4; ++a) {
+        for (b = a; b < 4; ++b) {
+            int gt = a<<2 | b;
+            if (q[gt] < best) {
+                second = best;
+                best = q[gt];
+                best_gt = gt;
+            } else if (q[gt] < second) {
+                second = q[gt];
+            }
+        }
+    }
+    allele1 = best_gt>>2 & 3;
+    allele2 = best_gt & 3;
+    if (allele1 == allele2) return 0;
+    gap = (int)(second - best + .499);
+    return 1<<18 | allele1<<16 | allele2 | gap<<2;
+}
+
+// Entry point of the "phase" command.
+//
+// Walks the pileup of the input BAM, calls heterozygous sites with
+// errmod_cal()/gl2cns(), records for every fragment which allele it carries at
+// each site, and calls phase() whenever a site is reached that no earlier
+// fragment covers (and on each chromosome change).  Results are printed to
+// stdout; with -b the reads are also split into three BAM files.
+//
+// Returns 0 on success and 1 on a usage error or when a file cannot be opened.
+int main_phase(int argc, char *argv[])
+{
+    extern void bam_init_header_hash(bam_header_t *header);
+    // tid starts at -1 so that the final flush is skipped when the pileup
+    // never yields a position (it used to be read uninitialised in that case).
+    int c, tid = -1, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
+    const bam_pileup1_t *plp;
+    bam_plp_t iter;
+    bam_header_t *h;
+    nseq_t *seqs;
+    uint64_t *cns = 0;
+    phaseg_t g;
+    char *fn_list = 0;
+    khash_t(set64) *set = 0;
+    errmod_t *em;
+    uint16_t *bases;
+
+    memset(&g, 0, sizeof(phaseg_t));
+    g.flag = FLAG_FIX_CHIMERA;
+    g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
+    // -A is documented as a plain flag.  It was declared as "A:" before, which
+    // made it swallow the following argument (typically the input file name).
+    while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A")) >= 0) {
+        switch (c) {
+            case 'D': g.max_depth = atoi(optarg); break;
+            case 'q': g.min_varLOD = atoi(optarg); break;
+            case 'Q': g.min_baseQ = atoi(optarg); break;
+            case 'k': g.k = atoi(optarg); break;
+            case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
+            case 'e': g.flag |= FLAG_LIST_EXCL; break;
+            case 'A': g.flag |= FLAG_DROP_AMBI; break;
+            // free() first so that a repeated option does not leak the earlier copy
+            case 'b': free(g.pre); g.pre = strdup(optarg); break;
+            case 'l': free(fn_list); fn_list = strdup(optarg); break;
+        }
+    }
+    if (argc == optind) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: samtools phase [options] <in.bam>\n\n");
+        fprintf(stderr, "Options: -k INT block length [%d]\n", g.k);
+        fprintf(stderr, " -b STR prefix of BAMs to output [null]\n");
+        fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD);
+        fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ);
+        fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth);
+//      fprintf(stderr, " -l FILE list of sites to phase [null]\n");
+        fprintf(stderr, " -F do not attempt to fix chimeras\n");
+        fprintf(stderr, " -A drop reads with ambiguous phase\n");
+//      fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
+        fprintf(stderr, "\n");
+        free(g.pre); free(fn_list);
+        return 1;
+    }
+    g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+    if (g.fp == 0) {
+        fprintf(stderr, "[phase] failed to open \"%s\" for reading.\n", argv[optind]);
+        free(g.pre); free(fn_list);
+        return 1;
+    }
+    h = bam_header_read(g.fp);
+    if (h == 0) {
+        fprintf(stderr, "[phase] failed to read the header from \"%s\".\n", argv[optind]);
+        bam_close(g.fp);
+        free(g.pre); free(fn_list);
+        return 1;
+    }
+    if (fn_list) { // read the list of sites to phase
+        bam_init_header_hash(h);
+        set = loadpos(fn_list, h);
+        free(fn_list);
+    } else g.flag &= ~FLAG_LIST_EXCL;
+    if (g.pre) { // open BAMs to write
+        char *s = malloc(strlen(g.pre) + 20);
+        int n_failed = 0;
+        strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
+        strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
+        strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
+        free(s);
+        for (c = 0; c <= 2; ++c)
+            if (g.out[c] == 0) ++n_failed;
+        if (n_failed) {
+            fprintf(stderr, "[phase] failed to open the output BAMs with prefix \"%s\".\n", g.pre);
+            for (c = 0; c <= 2; ++c)
+                if (g.out[c]) bam_close(g.out[c]);
+            kh_destroy(set64, set);
+            bam_header_destroy(h);
+            bam_close(g.fp);
+            free(g.pre);
+            return 1;
+        }
+        for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
+    }
+
+    iter = bam_plp_init(readaln, &g);
+    g.vpos_shift = 0;
+    seqs = kh_init(64);
+    em = errmod_init(1. - 0.83);
+    bases = calloc(g.max_depth, 2);
+    printf("CC\n");
+    printf("CC\tDescriptions:\nCC\n");
+    printf("CC\t CC comments\n");
+    printf("CC\t PS start of a phase set\n");
+    printf("CC\t FL filtered region\n");
+    printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n");
+    printf("CC\t EV supporting reads; SAM format\n");
+    printf("CC\t // end of a phase set\nCC\n");
+    printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
+    printf("CC\t PS chr phaseSetStart phaseSetEnd\n");
+    printf("CC\t FL chr filterStart filterEnd\n");
+    printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n");
+    printf("CC\nCC\n");
+    fflush(stdout);
+    while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
+        int i, k, c, tmp, dophase = 1, in_set = 0;
+        float q[16];
+        if (tid < 0) break;
+        if (tid != lasttid) { // change of chromosome
+            g.vpos_shift = 0;
+            if (lasttid >= 0) {
+                seqs = shrink_hash(seqs);
+                phase(&g, h->target_name[lasttid], vpos, cns, seqs);
+                update_vpos(0x7fffffff, seqs);
+            }
+            lasttid = tid;
+            vpos = 0;
+        }
+        if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
+        if (n > g.max_depth) continue; // do not proceed if the depth is too high
+        // fill the bases array and check if there is a variant
+        for (i = k = 0; i < n; ++i) {
+            const bam_pileup1_t *p = plp + i;
+            uint8_t *seq;
+            int qual, baseQ, b;
+            if (p->is_del || p->is_refskip) continue;
+            baseQ = bam1_qual(p->b)[p->qpos];
+            if (baseQ < g.min_baseQ) continue;
+            seq = bam1_seq(p->b);
+            b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+            if (b > 3) continue;
+            // use the smaller of base and mapping quality, clamped to [4, 63]
+            qual = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
+            if (qual < 4) qual = 4;
+            if (qual > 63) qual = 63;
+            bases[k++] = qual<<5 | (int)bam1_strand(p->b)<<4 | b;
+        }
+        if (k == 0) continue;
+        errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
+        c = gl2cns(q); // get the consensus
+        // tell if to proceed
+        if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
+        if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
+        // add the variant
+        if (vpos == max_vpos) {
+            max_vpos = max_vpos? max_vpos<<1 : 128;
+            cns = realloc(cns, max_vpos * 8);
+        }
+        cns[vpos] = (uint64_t)pos<<32 | c;
+        for (i = 0; i < n; ++i) {
+            const bam_pileup1_t *p = plp + i;
+            uint64_t key;
+            khint_t itr;
+            uint8_t *seq = bam1_seq(p->b);
+            frag_t *f;
+            if (p->is_del || p->is_refskip) continue;
+            if (p->b->core.qual == 0) continue;
+            // get the base code
+            // 1 = first consensus allele, 2 = second consensus allele, 0 = neither
+            c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
+            if (c == (cns[vpos]&3)) c = 1;
+            else if (c == (cns[vpos]>>16&3)) c = 2;
+            else c = 0;
+            // write to seqs
+            key = X31_hash_string(bam1_qname(p->b));
+            itr = kh_put(64, seqs, key, &tmp);
+            f = &kh_val(seqs, itr);
+            if (tmp == 0) { // present in the hash table
+                if (vpos - f->vpos + 1 < MAX_VARS) {
+                    f->vlen = vpos - f->vpos + 1;
+                    f->seq[f->vlen-1] = c;
+                    f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+                }
+                // an earlier fragment reaches this site, so the block goes on
+                dophase = 0;
+            } else { // absent
+                memset(f->seq, 0, MAX_VARS);
+                f->beg = p->b->core.pos;
+                f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+                f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0;
+            }
+        }
+        if (dophase) {
+            // No known fragment covers the new site: phase the sites collected so
+            // far and make the new site the first one of the next block.
+            seqs = shrink_hash(seqs);
+            phase(&g, h->target_name[tid], vpos, cns, seqs);
+            update_vpos(vpos, seqs);
+            cns[0] = cns[vpos];
+            vpos = 0;
+        }
+        ++vpos;
+    }
+    if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs);
+    bam_header_destroy(h);
+    bam_plp_destroy(iter);
+    bam_close(g.fp);
+    kh_destroy(64, seqs);
+    kh_destroy(set64, set);
+    free(cns);
+    errmod_destroy(em);
+    free(bases);
+    if (g.pre) {
+        for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
+        free(g.pre); free(g.b);
+    }
+    return 0;
+}
diff --git a/randomFQ b/randomFQ
new file mode 100755
index 0000000..272cd73
--- /dev/null
+++ b/randomFQ
@@ -0,0 +1,245 @@
+#!/usr/bin/perl
+use strict;
+use Getopt::Long;
+
+# Author: Erik Aronesty (earonesty at xpressionanalysis.com)
+# Outputs a random sampled fastq or fasta file, from an input fastq
+# Copyright (c) 2011 Expression Analysis
+
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+# Command-line options; filled in by GetOptions below.
+my $cnt;
+my $fasta;
+my $out;
+my $seed=1;
+my $window;
+my $always;
+my $append;
+my $pct;
+GetOptions("count=i"=>\$cnt, "pct=f"=>\$pct, "fasta"=>\$fasta, "out=s"=>\$out, "seed=i"=>\$seed, "window=i"=>\$window, "always|a"=>\$always, "append|A"=>\$append) || die usage();
+# Default sampling window: five times the requested count, but at least 100000.
+if ($window < $cnt) {
+ $window = $cnt * 5;
+ $window = 100000 if $window < 100000;
+}
+
+# A seed of 0 leaves Perl's random generator unseeded by this script.
+if ($seed) {
+ srand($seed);
+}
+
+die usage() unless $cnt>0||$pct>0;
+
+# Positional arguments: the input fastq, an optional mate file and an optional third file.
+my $in = shift;
+my $mate = shift;
+my $mate2 = shift;
+
+die "Can't see $in\n" unless -e $in;
+die "Can't see $mate\n" unless !$mate || -e $mate;
+die "Can't see $mate2\n" unless !$mate2 || -e $mate2;
+
+# File sizes in bytes.
+# NOTE(review): these three values are not referenced again in this script.
+my $s = -s $in;
+my $sm = -s $mate;
+my $sm2 = -s $mate2;
+
+die "Need -out <prefix> for paired-end" if $mate && !$out;
+
+# Everything from a "%" onwards in -out is removed and kept as the suffix that
+# is appended after "_1", "_2" and "_3".
+my $suff;
+if ($out =~ s/\%(.*)//) {
+ $suff = $1;
+}
+$out =~ s/_$//;
+
+# Paired-end output is piped through gzip when the suffix contains ".gz".
+my $gzmeth = $suff =~ /\.gz/ ? "|gzip -c " : "";
+
+open(IN, $in=~/\.gz$/?"gunzip -c $in|":$in) || die;
+
+# From here on $append holds the shell redirection operator.
+$append = $append ? ">>" : ">";
+
+if ($mate) {
+ open(MI, $mate=~/\.gz$/?"gunzip -c $mate|":$mate) || die;
+ open(MI2, $mate2=~/\.gz$/?"gunzip -c $mate2|":$mate2) if $mate2;
+ open(O1, "$gzmeth$append${out}_1$suff") || die;
+ open(O2, "$gzmeth$append${out}_2$suff") || die;
+ if ($mate2) {
+ open(O3, "$gzmeth$append${out}_3$suff") || die;
+ }
+} else {
+ # Single-end: gzip is decided from the -out name itself.
+ my $gzmeth = $out =~ /\.gz/ ? "|gzip -c " : "";
+ if ($out) {
+ open(O1, "$gzmeth$append$out");
+ } else {
+ open(O1, ">&STDOUT");
+ }
+}
+
+# Size estimate for the input, taken from the external "alc" helper.
+# NOTE(review): presumably an approximate record count - confirm against alc.
+# A "reads" entry in a sibling "<input>.stats" file takes precedence when present.
+my $lc = 0+`alc -o $in`;
+my $stats = $in; $stats =~ s/\.gz$//; $stats .= ".stats";
+if (-e $stats) {
+ my $rlc = `grep ^reads $stats | cut -f 2`+0;
+ $lc = $rlc if $rlc;
+} else {
+ if ($in =~ /gz$/) {
+ $lc *= .90; # lower guess
+ }
+}
+
+# Shortcut: when more than half of the estimated reads are requested (and
+# -always is not set), no sampling is done and a fixed run of lines is copied.
+if (!$always && ($cnt > $lc/2)) {
+ # four lines per fastq record: $cnt is a line count from here on
+ $cnt *= 4;
+ # NOTE(review): the message says "tail", but gzip inputs are cut with "head".
+ warn "Source is too small relative to count requested, just returning tail -$cnt\n";
+ if ($mate) {
+ if ($in=~/gz$/) {
+ system("gunzip -c $in | head -$cnt $gzmeth $append ${out}_1$suff");
+ } else {
+ system("tail -$cnt $in $gzmeth $append ${out}_1$suff");
+ }
+ if ($mate=~/gz$/) {
+ system("gunzip -c $mate | head -$cnt $gzmeth $append ${out}_2$suff");
+ } else {
+ system("tail -$cnt $mate $gzmeth $append ${out}_2$suff");
+ }
+ if ($mate2=~/gz$/) {
+ system("gunzip -c $mate2 | head -$cnt $gzmeth $append ${out}_3$suff");
+ } else {
+ system("tail -$cnt $mate2 $gzmeth $append ${out}_3$suff") if ($mate2);
+ }
+ exit 0;
+ } else {
+ if ($out) {
+ $gzmeth = $out =~ /\.gz/ ? "|gzip -c " : "";
+ $out = "$gzmeth $append $out";
+ }
+ # exec only returns when it fails, hence the die after each call
+ if ($in=~/gz$/) {
+ exec("gunzip -c $in | head -$cnt $out");
+ die("Exec failed : $!\n");
+ } else {
+ exec("tail -$cnt $in $out");
+ die("Exec failed : $!\n");
+ }
+ }
+}
+
+# reads to sample from....whole file or a smaller part?
+$window = $lc if $window > $lc;
+
+my $fudge = 1.5; # top weighted
+$fudge = 1.2 if $always; # less top-weighted
+
+# larger = more chance to keep read
+my $prob;
+
+# With -pct every read is kept with probability pct/100 and the count limit is
+# made very large; otherwise the probability is derived from count and window.
+if ($pct) {
+ $prob = $pct/100;
+ $cnt = 10000000000;
+} else {
+ $prob = ($cnt*$fudge)/$window;
+}
+
+# we used to seek ... but this broke too much ... if you want to fix... fix randomFQ-broken ...
+if (!$always && !$pct) {
+ # skip some reads at the beginning
+ # one tenth of the reads outside the window, capped at 200000 records
+ my $skip = ($lc-$window) / 10;
+ $skip = 200000 if $skip > 200000;
+ for (my $i=0;$i<$skip;++$i) {
+ scalar <IN>; scalar <IN>; scalar <IN>; scalar <IN>;
+ scalar <MI>; scalar <MI>; scalar <MI>; scalar <MI>;
+ scalar <MI2>; scalar <MI2>; scalar <MI2>; scalar <MI2>;
+ }
+}
+
+# Main sampling loop.  Each pass either discards the next 4-line record from
+# every input (with probability 1 - $prob) or copies it to the outputs.  The
+# loop ends when $cnt records have been written or the first input is exhausted.
+# All inputs are advanced together so that mates stay in step.
+while ($cnt > 0) {
+    if (rand() > $prob) {
+        # discard
+        <IN>; <IN>; <IN>; <IN>;
+        if ($mate)  { <MI>; <MI>; <MI>; <MI>; }
+        if ($mate2) { <MI2>; <MI2>; <MI2>; <MI2>; }
+        next;
+    }
+
+    # id
+    my $i = <IN>;
+    if (!$i) {
+        $cnt = 0;
+        last;
+    }
+    # Declared unconditionally: "my $x = ... if COND" has undefined behaviour in Perl.
+    my ($i2, $i3, $r2, $r3);
+    $i2 = <MI> if $mate;
+    $i3 = <MI2> if $mate2;
+
+    # read
+    my $r = <IN>;
+    $r2 = <MI> if $mate;
+    $r3 = <MI2> if $mate2;
+
+    if ($fasta) {
+        # only need id and read
+        $i =~ s/^\@/>/;
+        $i2 =~ s/^\@/>/ if $mate;
+        $i3 =~ s/^\@/>/ if $mate2;
+        print O1 "$i$r";
+        print O2 "$i2$r2" if $mate;
+        print O3 "$i3$r3" if $mate2;
+        # Skip the comment and quality lines of every input.  The third input
+        # was not advanced here before, so it drifted out of step with the others.
+        <IN>; <IN>;
+        if ($mate)  { <MI>; <MI>; }
+        if ($mate2) { <MI2>; <MI2>; }
+    } else {
+        # print id and read
+        print O1 "$i$r";
+        print O2 "$i2$r2" if $mate;
+        print O3 "$i3$r3" if $mate2;
+        # copy comment and quality
+        print O1 scalar <IN>;
+        print O2 scalar <MI> if $mate;
+        print O3 scalar <MI2> if $mate2;
+        print O1 scalar <IN>;
+        print O2 scalar <MI> if $mate;
+        print O3 scalar <MI2> if $mate2;
+    }
+    --$cnt;
+}
+
+close IN;
+
+# Returns the usage text as one string ($0 is interpolated); the callers pass
+# the result to die.
+sub usage() {
+ return <<EOF
+usage: $0 (-c <count> | -p <pct>) [-fasta] [-out <prefix>] [-seed <int>] <input-fastq> [<input-2> [<index-3>] ]
+
+Returns <count> number of random entries from the input fastq.
+
+Output is fastq, unless you specify -fasta
+
+If the -out parameter ends in .gz, the result is gzipped in-place.
+
+-p returns a % of total reads, -c returns a fixed count.
+
+SINGLE END:
+
+Outputs to standard output, unless -out <file> is specified.
+
+PAIRED END:
+
+Pass 2 (or 3) files as input, -out is required.
+
+If the paired-end output contains a "%" sign, it is replaced with the 1 & 2 for paired-end.
+
+IE: -o output_%.fastq.gz
+
+Otherwise it's jsut output_1 and output_2
+
+*** If one file is an indexed read, it has to be the 3rd file (for now).
+EOF
+}
+
+# Returns the numerically larger of its two arguments (the second one on a tie).
+sub max {
+    my ($first, $second) = @_;
+    return $first if $first > $second;
+    return $second;
+}
+
diff --git a/razf.c b/razf.c
new file mode 100644
index 0000000..e7499f9
--- /dev/null
+++ b/razf.c
@@ -0,0 +1,853 @@
+/*
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue at gmail.com>, Heng Li <lh3 at sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NO_RAZF
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "razf.h"
+
+
+#if ZLIB_VERNUM < 0x1221
+/*
+ * Stand-in header record, compiled only when ZLIB_VERNUM < 0x1221 (see the
+ * enclosing #if).  NOTE(review): presumably mirrors the gz_header layout of
+ * newer zlib releases so the rest of the file still compiles -- confirm
+ * against razf.h and zlib.h.
+ */
+struct _gz_header_s {
+ int text;
+ uLong time;
+ int xflags;
+ int os;
+ Bytef *extra;
+ uInt extra_len;
+ uInt extra_max;
+ Bytef *name;
+ uInt name_max;
+ Bytef *comment;
+ uInt comm_max;
+ int hcrc;
+ int done;
+};
+#warning "zlib < 1.2.2.1; RAZF writing is disabled."
+#endif
+
+#define DEF_MEM_LEVEL 8
+
+/* Reverse the byte order of a 32-bit value (0x01020304 -> 0x04030201). */
+static inline uint32_t byte_swap_4(uint32_t v){
+    const uint32_t lowest  = v & 0xFFU;
+    const uint32_t low     = (v >> 8) & 0xFFU;
+    const uint32_t high    = (v >> 16) & 0xFFU;
+    const uint32_t highest = v >> 24;
+    return (lowest << 24) | (low << 16) | (high << 8) | highest;
+}
+
+/* Reverse the byte order of a 64-bit value. */
+static inline uint64_t byte_swap_8(uint64_t v){
+    uint64_t reversed = 0;
+    int k;
+    /* peel bytes off the low end of v and push them onto the low end of the result */
+    for(k = 0; k < 8; k++){
+        reversed = (reversed << 8) | (v & 0xFFU);
+        v >>= 8;
+    }
+    return reversed;
+}
+
+/* Return non-zero when the first byte in memory of the int value 1 is not 1,
+ * i.e. when the host does not store the least significant byte first. */
+static inline int is_big_endian(){
+    const int probe = 0x01;
+    const char first_byte = *(const char*)&probe;
+    return first_byte != 0x01;
+}
+
+#ifndef _RZ_READONLY
+/*
+ * Append one entry to the in-memory block index of a file being written.
+ *
+ * rz  - writer whose rz->index is extended
+ * in  - not used by this function
+ * out - compressed offset to record for the new entry
+ *
+ * Offsets are stored in two levels: every RZ_BIN_SIZE-th entry records the
+ * full 64-bit offset in bin_offsets[], and each entry stores in cell_offsets[]
+ * its distance from the bin it belongs to.
+ */
+static void add_zindex(RAZF *rz, int64_t in, int64_t out){
+ if(rz->index->size == rz->index->cap){
+ /* arrays are full: grow to 1.5x + 2 entries (the +2 lets a capacity of 0 grow) */
+ rz->index->cap = rz->index->cap * 1.5 + 2;
+ /* NOTE(review): realloc results are used without a NULL check */
+ rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap);
+ rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1));
+ }
+ /* first entry of a bin: remember the absolute offset as the bin base */
+ if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out;
+ rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE];
+ rz->index->size ++;
+}
+
+/*
+ * Write the block index of rz to file descriptor fd.
+ *
+ * Layout written: entry count (4 bytes), then size/RZ_BIN_SIZE + 1 bin offsets
+ * (8 bytes each), then `size` cell offsets (4 bytes each).  On hosts where
+ * is_big_endian() is false every value is byte-swapped before being written.
+ *
+ * Side effect: on such hosts the swap is done in place, so bin_offsets[] and
+ * cell_offsets[] no longer hold host-order values after this call.
+ * NOTE(review): the write() return values are not checked.
+ */
+static void save_zindex(RAZF *rz, int fd){
+ int32_t i, v32;
+ int is_be;
+ is_be = is_big_endian();
+ if(is_be) write(fd, &rz->index->size, sizeof(int));
+ else {
+ v32 = byte_swap_4((uint32_t)rz->index->size);
+ write(fd, &v32, sizeof(uint32_t));
+ }
+ /* number of bins that cover `size` entries */
+ v32 = rz->index->size / RZ_BIN_SIZE + 1;
+ if(!is_be){
+ for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
+ for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
+ }
+ write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);
+ write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size);
+}
+#endif
+
+/*
+ * Read the block index from the current position of the input into rz->index.
+ *
+ * The input handle is a knetFile* when _USE_KNETFILE is defined and a plain
+ * file descriptor otherwise.  Does nothing when rz->load_index is zero.
+ * Expects the layout produced by save_zindex(): entry count, bin offsets,
+ * cell offsets; values are byte-swapped when is_big_endian() is false.
+ *
+ * NOTE(review): the entry count read from the file is used unchecked to size
+ * both allocations, and neither the read nor the malloc results are tested.
+ */
+#ifdef _USE_KNETFILE
+static void load_zindex(RAZF *rz, knetFile *fp){
+#else
+static void load_zindex(RAZF *rz, int fd){
+#endif
+ int32_t i, v32;
+ int is_be;
+ if(!rz->load_index) return;
+ /* malloc leaves the record uninitialised; size, cap and both arrays are assigned below */
+ if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
+ is_be = is_big_endian();
+#ifdef _USE_KNETFILE
+ knet_read(fp, &rz->index->size, sizeof(int));
+#else
+ read(fd, &rz->index->size, sizeof(int));
+#endif
+ if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size);
+ rz->index->cap = rz->index->size;
+ /* number of bins that cover `size` entries */
+ v32 = rz->index->size / RZ_BIN_SIZE + 1;
+ rz->index->bin_offsets = malloc(sizeof(int64_t) * v32);
+#ifdef _USE_KNETFILE
+ knet_read(fp, rz->index->bin_offsets, sizeof(int64_t) * v32);
+#else
+ read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);
+#endif
+ rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size);
+#ifdef _USE_KNETFILE
+ knet_read(fp, rz->index->cell_offsets, sizeof(int) * rz->index->size);
+#else
+ read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size);
+#endif
+ if(!is_be){
+ for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
+ for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
+ }
+}
+
+#ifdef _RZ_READONLY
+/*
+ * Read-only build (_RZ_READONLY defined): writing is not supported, so this
+ * stub reports the fact on stderr and returns a null handle.  fd is not used.
+ */
+static RAZF* razf_open_w(int fd)
+{
+ fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n");
+ return 0;
+}
+#else
+/*
+ * Create a writer on an already opened file descriptor.
+ *
+ * Allocates the RAZF handle, its zlib stream, both RZ_BUFFER_SIZE buffers and
+ * an empty block index, initialises deflate and registers a gzip header whose
+ * extra field identifies the file as RAZF.  Returns the new handle.
+ * NOTE(review): none of the allocations nor deflateInit2() are checked.
+ */
+static RAZF* razf_open_w(int fd){
+ RAZF *rz;
+#ifdef _WIN32
+ setmode(fd, O_BINARY);
+#endif
+ rz = calloc(1, sizeof(RAZF));
+ rz->mode = 'w';
+#ifdef _USE_KNETFILE
+ rz->x.fpw = fd;
+#else
+ rz->filedes = fd;
+#endif
+ rz->stream = calloc(sizeof(z_stream), 1);
+ rz->inbuf = malloc(RZ_BUFFER_SIZE);
+ rz->outbuf = malloc(RZ_BUFFER_SIZE);
+ rz->index = calloc(sizeof(ZBlockIndex), 1);
+ /* per the zlib manual, adding 16 to windowBits selects a gzip wrapper instead of a zlib one */
+ deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ rz->header = calloc(sizeof(gz_header), 1);
+ rz->header->os = 0x03; //Unix
+ rz->header->text = 0;
+ rz->header->time = 0;
+ /* 7-byte extra field: "RAZF", one obsolete byte, 16-bit block size */
+ rz->header->extra = malloc(7);
+ strncpy((char*)rz->header->extra, "RAZF", 4);
+ rz->header->extra[4] = 1; // obsolete field
+ // block size = RZ_BLOCK_SIZE, Big-Endian
+ rz->header->extra[5] = RZ_BLOCK_SIZE >> 8;
+ rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF;
+ rz->header->extra_len = 7;
+ rz->header->name = rz->header->comment = 0;
+ rz->header->hcrc = 0;
+ deflateSetHeader(rz->stream, rz->header);
+ rz->block_pos = rz->block_off = 0;
+ return rz;
+}
+
+/*
+ * Feed `size` bytes at `data` to deflate and write every completely filled
+ * output buffer to the file.
+ *
+ * rz->out grows by the number of compressed bytes produced; rz->in and
+ * rz->block_off grow by the number of input bytes consumed.  A partially
+ * filled output buffer stays in rz->outbuf for a later call or flush.
+ * NOTE(review): the results of deflate() and write() are not checked.
+ */
+static void _razf_write(RAZF* rz, const void *data, int size){
+ int tout;
+ rz->stream->avail_in = size;
+ rz->stream->next_in = (void*)data;
+ while(1){
+ tout = rz->stream->avail_out;
+ deflate(rz->stream, Z_NO_FLUSH);
+ rz->out += tout - rz->stream->avail_out;
+ /* output space left over: deflate stopped for lack of input, nothing to write yet */
+ if(rz->stream->avail_out) break;
+#ifdef _USE_KNETFILE
+ write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ if(rz->stream->avail_in == 0) break;
+ };
+ rz->in += size - rz->stream->avail_in;
+ rz->block_off += size - rz->stream->avail_in;
+}
+
+/*
+ * Finish the current block: compress whatever is still waiting in rz->inbuf,
+ * write the pending output, then run deflate with Z_FULL_FLUSH until it no
+ * longer fills the output buffer.  Afterwards rz->block_pos is set to the
+ * compressed offset reached (rz->out) and rz->block_off restarts at 0.
+ * NOTE(review): the results of deflate() and write() are not checked.
+ */
+static void razf_flush(RAZF *rz){
+ uint32_t tout;
+ if(rz->buf_len){
+ _razf_write(rz, rz->inbuf, rz->buf_len);
+ rz->buf_off = rz->buf_len = 0;
+ }
+ /* write out what is currently held in outbuf and start with an empty buffer */
+ if(rz->stream->avail_out){
+#ifdef _USE_KNETFILE
+ write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ }
+ while(1){
+ tout = rz->stream->avail_out;
+ deflate(rz->stream, Z_FULL_FLUSH);
+ rz->out += tout - rz->stream->avail_out;
+ /* a completely filled buffer means deflate may have more to emit: write and repeat */
+ if(rz->stream->avail_out == 0){
+#ifdef _USE_KNETFILE
+ write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ } else break;
+ }
+ rz->block_pos = rz->out;
+ rz->block_off = 0;
+}
+
+/*
+ * Terminate the compressed stream: compress the data still waiting in
+ * rz->inbuf, then call deflate with Z_FINISH repeatedly, writing the output
+ * buffer after every call, until a call leaves the buffer completely empty.
+ * NOTE(review): the results of deflate() and write() are not checked.
+ */
+static void razf_end_flush(RAZF *rz){
+ uint32_t tout;
+ if(rz->buf_len){
+ _razf_write(rz, rz->inbuf, rz->buf_len);
+ rz->buf_off = rz->buf_len = 0;
+ }
+ while(1){
+ tout = rz->stream->avail_out;
+ deflate(rz->stream, Z_FINISH);
+ rz->out += tout - rz->stream->avail_out;
+ /* anything in the buffer (pending from earlier or produced just now) is written out */
+ if(rz->stream->avail_out < RZ_BUFFER_SIZE){
+#ifdef _USE_KNETFILE
+ write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#else
+ write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+#endif
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ } else break;
+ }
+}
+
+/*
+ * Stage `size` bytes from `data` in rz->inbuf, handing the buffer to
+ * _razf_write() each time it becomes completely full.
+ *
+ * rz   - writer handle
+ * data - bytes to append
+ * size - number of bytes at data
+ *
+ * Data that does not fill the buffer stays staged (rz->buf_len bytes) until a
+ * later call or a flush compresses it.
+ */
+static void _razf_buffered_write(RAZF *rz, const void *data, int size){
+    /* byte cursor: arithmetic on a void pointer is a compiler extension, not ISO C */
+    const char *src = (const char*)data;
+    int n;
+    while(1){
+        if(rz->buf_len == RZ_BUFFER_SIZE){
+            _razf_write(rz, rz->inbuf, rz->buf_len);
+            rz->buf_len = 0;
+        }
+        if(size + rz->buf_len < RZ_BUFFER_SIZE){
+            /* the remainder fits without filling the buffer: stage it and stop */
+            memcpy((char*)rz->inbuf + rz->buf_len, src, size);
+            rz->buf_len += size;
+            return;
+        }
+        /* top the buffer up to exactly RZ_BUFFER_SIZE; the next pass compresses it */
+        n = RZ_BUFFER_SIZE - rz->buf_len;
+        memcpy((char*)rz->inbuf + rz->buf_len, src, n);
+        size -= n;
+        src += n;
+        rz->buf_len += n;
+    }
+}
+
+/*
+ * Write `size` bytes from `data` to a RAZF writer.
+ *
+ * The input is cut at multiples of RZ_BLOCK_SIZE of uncompressed data: each
+ * time a boundary is reached the block is flushed with razf_flush() and an
+ * index entry is added, so that the file can later be entered at that point.
+ *
+ * Returns the number of bytes accepted, which is always `size`.
+ */
+int razf_write(RAZF* rz, const void *data, int size){
+    /* byte cursor: arithmetic on a void pointer is a compiler extension, not ISO C */
+    const char *src = (const char*)data;
+    int ori_size, n;
+    int64_t next_block;
+    ori_size = size;
+    next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+    while(rz->in + rz->buf_len + size >= next_block){
+        /* bytes still missing to complete the current block */
+        n = next_block - rz->in - rz->buf_len;
+        _razf_buffered_write(rz, src, n);
+        src += n;
+        size -= n;
+        razf_flush(rz);
+        add_zindex(rz, rz->in, rz->out);
+        next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+    }
+    _razf_buffered_write(rz, src, size);
+    return ori_size;
+}
+#endif
+
+/* gzip flag byte */
+#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
+#define COMMENT 0x10 /* bit 4 set: file comment present */
+#define RESERVED 0xE0 /* bits 5..7: reserved */
+
+/*
+ * Parse the gzip member header stored in data[0..size).
+ *
+ * data      - start of the bytes read from the file
+ * size      - number of valid bytes in data
+ * extra_off - out: offset of the extra-field payload inside data
+ * extra_len - out: length of that payload, 0 when the header has no extra field
+ *
+ * Returns the header length in bytes (the offset at which the deflate data
+ * starts), or 0 when the buffer does not hold a usable gzip/deflate header.
+ * The out parameters are only meaningful for a non-zero return.
+ */
+static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){
+    int method, flags, n, len;
+    if(size < 2) return 0;
+    if(data[0] != 0x1f || data[1] != 0x8b) return 0;
+    if(size < 4) return 0;
+    method = data[2];
+    flags = data[3];
+    if(method != Z_DEFLATED || (flags & RESERVED)) return 0;
+    n = 4 + 6; // Skip 6 bytes
+    *extra_off = n + 2;
+    *extra_len = 0;
+    /* The fixed part of the header is 10 bytes.  Reporting a header longer
+     * than the data available would make the caller compute a negative
+     * amount of remaining input, so a shorter buffer is rejected here. */
+    if(size < n) return 0;
+    if(flags & EXTRA_FIELD){
+        if(size < n + 2) return 0;
+        /* 16-bit little-endian payload length */
+        len = ((int)data[n + 1] << 8) | data[n];
+        n += 2;
+        *extra_off = n;
+        /* the whole payload has to lie inside the buffer */
+        if(len > size - n) return 0;
+        n += len;
+        *extra_len = len;
+    }
+    /* NUL-terminated strings; scanning stops at the end of the buffer if no terminator is found */
+    if(flags & ORIG_NAME) while(n < size && data[n++]);
+    if(flags & COMMENT) while(n < size && data[n++]);
+    if(flags & HEAD_CRC){
+        if(n + 2 > size) return 0;
+        n += 2;
+    }
+    return n;
+}
+
+/*
+ * Create a reader on an already opened input (a knetFile* when _USE_KNETFILE
+ * is defined, a file descriptor otherwise).
+ *
+ * The first RZ_BUFFER_SIZE bytes are read and classified:
+ *   - no gzip header (or inflateInit2 fails): FILE_TYPE_PLAIN, the bytes read
+ *     are handed out as they are;
+ *   - gzip header without a matching "RAZF" extra field, or with a different
+ *     block size: FILE_TYPE_GZ;
+ *   - otherwise FILE_TYPE_RZ.  The last 16 bytes of the file are then read as
+ *     two 64-bit values (stored into rz->src_end and rz->end) and, if the
+ *     file can be positioned at rz->end, the block index is loaded from there.
+ *
+ * _load_index is copied to rz->load_index and decides whether load_zindex()
+ * actually reads the index.  Returns the new handle.
+ * NOTE(review): allocation and read results are not checked; in particular a
+ * negative read count `n` would reach memcpy() in the plain-file branch.
+ */
+#ifdef _USE_KNETFILE
+static RAZF* razf_open_r(knetFile *fp, int _load_index){
+#else
+static RAZF* razf_open_r(int fd, int _load_index){
+#endif
+ RAZF *rz;
+ int ext_off, ext_len;
+ int n, is_be, ret;
+ int64_t end;
+ unsigned char c[] = "RAZF";
+ rz = calloc(1, sizeof(RAZF));
+ rz->mode = 'r';
+#ifdef _USE_KNETFILE
+ rz->x.fpr = fp;
+#else
+#ifdef _WIN32
+ setmode(fd, O_BINARY);
+#endif
+ rz->filedes = fd;
+#endif
+ rz->stream = calloc(sizeof(z_stream), 1);
+ rz->inbuf = malloc(RZ_BUFFER_SIZE);
+ rz->outbuf = malloc(RZ_BUFFER_SIZE);
+ /* "unknown" sentinel for both sizes until a trailer is read */
+ rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL;
+#ifdef _USE_KNETFILE
+ n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
+#else
+ n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+#endif
+ ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len);
+ if(ret == 0){
+ /* also entered by goto when inflateInit2() fails below */
+ PLAIN_FILE:
+ rz->in = n;
+ rz->file_type = FILE_TYPE_PLAIN;
+ memcpy(rz->outbuf, rz->inbuf, n);
+ rz->buf_len = n;
+ free(rz->stream);
+ rz->stream = NULL;
+ return rz;
+ }
+ rz->header_size = ret;
+ /* per the zlib manual a negative windowBits selects raw deflate; the gzip header was parsed above */
+ ret = inflateInit2(rz->stream, -WINDOW_BITS);
+ if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
+ rz->stream->avail_in = n - rz->header_size;
+ rz->stream->next_in = rz->inbuf + rz->header_size;
+ rz->stream->avail_out = RZ_BUFFER_SIZE;
+ rz->stream->next_out = rz->outbuf;
+ rz->file_type = FILE_TYPE_GZ;
+ rz->in = rz->header_size;
+ rz->block_pos = rz->header_size;
+ rz->next_block_pos = rz->header_size;
+ rz->block_off = 0;
+ /* no RAZF signature in the extra field: keep treating the input as ordinary gzip */
+ if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz;
+ if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){
+ fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
+ return rz;
+ }
+ rz->load_index = _load_index;
+ rz->file_type = FILE_TYPE_RZ;
+#ifdef _USE_KNETFILE
+ if(knet_seek(fp, -16, SEEK_END) == -1){
+#else
+ if(lseek(fd, -16, SEEK_END) == -1){
+#endif
+ /* also entered by goto when the trailer values turn out to be unusable */
+ UNSEEKABLE:
+ rz->seekable = 0;
+ rz->index = NULL;
+ rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
+ } else {
+ is_be = is_big_endian();
+ rz->seekable = 1;
+ /* trailer: two 64-bit values, byte-swapped on hosts where is_big_endian() is false */
+#ifdef _USE_KNETFILE
+ knet_read(fp, &end, sizeof(int64_t));
+#else
+ read(fd, &end, sizeof(int64_t));
+#endif
+ if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end);
+ else rz->src_end = end;
+
+#ifdef _USE_KNETFILE
+ knet_read(fp, &end, sizeof(int64_t));
+#else
+ read(fd, &end, sizeof(int64_t));
+#endif
+ if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
+ else rz->end = end;
+ /* the first read went past rz->end: do not hand the excess bytes to inflate */
+ if(n > rz->end){
+ rz->stream->avail_in -= n - rz->end;
+ n = rz->end;
+ }
+ if(rz->end > rz->src_end){
+#ifdef _USE_KNETFILE
+ knet_seek(fp, rz->in, SEEK_SET);
+#else
+ lseek(fd, rz->in, SEEK_SET);
+#endif
+ goto UNSEEKABLE;
+ }
+#ifdef _USE_KNETFILE
+ knet_seek(fp, rz->end, SEEK_SET);
+ if(knet_tell(fp) != rz->end){
+ knet_seek(fp, rz->in, SEEK_SET);
+#else
+ if(lseek(fd, rz->end, SEEK_SET) != rz->end){
+ lseek(fd, rz->in, SEEK_SET);
+#endif
+ goto UNSEEKABLE;
+ }
+ /* positioned at rz->end: read the index, then return to just after the bytes already buffered */
+#ifdef _USE_KNETFILE
+ load_zindex(rz, fp);
+ knet_seek(fp, n, SEEK_SET);
+#else
+ load_zindex(rz, fd);
+ lseek(fd, n, SEEK_SET);
+#endif
+ }
+ return rz;
+}
+
+#ifdef _USE_KNETFILE
+/*
+ * knetfile build: only writing is available on a raw descriptor.  A mode
+ * containing 'r' prints a notice and yields NULL; a mode containing 'w'
+ * returns a writer; any other mode yields NULL.
+ */
+RAZF* razf_dopen(int fd, const char *mode){
+    if(strchr(mode, 'r') != NULL){
+        fprintf(stderr,"[razf_dopen] implement me\n");
+        return NULL;
+    }
+    return (strchr(mode, 'w') != NULL) ? razf_open_w(fd) : NULL;
+}
+
+/* knetfile build: not implemented; prints a notice and always returns NULL. */
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+    fprintf(stderr,"[razf_dopen2] implement me\n");
+    return NULL;
+}
+#else
+/*
+ * Wrap an open descriptor.  A mode containing 'r' gives a reader that loads
+ * the block index, otherwise a mode containing 'w' gives a writer; any other
+ * mode yields NULL.
+ */
+RAZF* razf_dopen(int fd, const char *mode){
+    if(strchr(mode, 'r') != NULL) return razf_open_r(fd, 1);
+    if(strchr(mode, 'w') != NULL) return razf_open_w(fd);
+    return NULL;
+}
+
+/* Same as razf_dopen(), except that a reader is created without loading the block index. */
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+    if(strchr(mode, 'r') != NULL) return razf_open_r(fd, 0);
+    if(strchr(mode, 'w') != NULL) return razf_open_w(fd);
+    return NULL;
+}
+#endif
+
+/*
+ * Open `filename` and wrap it in a RAZF handle.
+ *
+ * mode        - a string containing "r" opens for reading, otherwise one
+ *               containing "w" creates/truncates the file for writing
+ * _load_index - forwarded to razf_open_r() for read mode
+ *
+ * Returns the handle, or NULL when the mode is neither of the two or the file
+ * cannot be opened.
+ */
+static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
+ int fd;
+ RAZF *rz;
+ if(strstr(mode, "r")){
+#ifdef _USE_KNETFILE
+ /* this knetFile* shadows the int fd declared above for the rest of the read branch */
+ knetFile *fd = knet_open(filename, "r");
+ if (fd == 0) {
+ fprintf(stderr, "[_razf_open] fail to open %s\n", filename);
+ return NULL;
+ }
+#else
+#ifdef _WIN32
+ fd = open(filename, O_RDONLY | O_BINARY);
+#else
+ fd = open(filename, O_RDONLY);
+#endif
+#endif
+ if(fd < 0) return NULL;
+ rz = razf_open_r(fd, _load_index);
+ } else if(strstr(mode, "w")){
+#ifdef _WIN32
+ fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
+#else
+ fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
+#endif
+ if(fd < 0) return NULL;
+ rz = razf_open_w(fd);
+ } else return NULL;
+ return rz;
+}
+
+/* Open `filename` in the given mode; a reader is asked to load the block index (_load_index = 1). */
+RAZF* razf_open(const char *filename, const char *mode){
+ return _razf_open(filename, mode, 1);
+}
+
+/* Open `filename` in the given mode; a reader is created without loading the block index (_load_index = 0). */
+RAZF* razf_open2(const char *filename, const char *mode){
+ return _razf_open(filename, mode, 0);
+}
+
+/*
+ * Report the sizes recorded for a file opened for reading.
+ *
+ * u_size - out: rz->src_end for an RZ file, the file length for a plain file
+ * c_size - out: rz->end for an RZ file, the file length for a plain file
+ *
+ * Returns 1 when both values were stored.  Returns 0 (outputs untouched) when
+ * the handle is not a reader, the file is ordinary gzip, the position of a
+ * plain file cannot be queried, or src_end equals end for an RZ file (which
+ * is the case when both still hold the "unknown" sentinel).
+ */
+int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){
+ int64_t n;
+ if(rz->mode != 'r' && rz->mode != 'R') return 0;
+ switch(rz->file_type){
+ case FILE_TYPE_PLAIN:
+ /* length not known yet: seek to the end to measure it, then restore the position */
+ if(rz->end == 0x7fffffffffffffffLL){
+#ifdef _USE_KNETFILE
+ if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0;
+ n = knet_tell(rz->x.fpr);
+ knet_seek(rz->x.fpr, 0, SEEK_END);
+ rz->end = knet_tell(rz->x.fpr);
+ knet_seek(rz->x.fpr, n, SEEK_SET);
+#else
+ if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0;
+ rz->end = lseek(rz->filedes, 0, SEEK_END);
+ lseek(rz->filedes, n, SEEK_SET);
+#endif
+ }
+ *u_size = *c_size = rz->end;
+ return 1;
+ case FILE_TYPE_GZ:
+ return 0;
+ case FILE_TYPE_RZ:
+ if(rz->src_end == rz->end) return 0;
+ *u_size = rz->src_end;
+ *c_size = rz->end;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * Produce up to `size` bytes of output into `data`.
+ *
+ * Plain files are read directly (a 0-byte read sets rz->z_eof).  Otherwise
+ * compressed input is pulled into rz->inbuf, never reading past rz->end, and
+ * inflated.  Inflation stops early at the end of a deflate block that is not
+ * the last one: rz->buf_flush is set and rz->next_block_pos records the
+ * compressed offset reached, so the caller can update its block position.
+ *
+ * Returns the number of bytes stored in data; 0 once z_eof or z_err is set.
+ * NOTE(review): a failing read() in the compressed path is assigned to
+ * avail_in unchecked, and in the plain path is returned as a negative count.
+ */
+static int _razf_read(RAZF* rz, void *data, int size){
+ int ret, tin;
+ if(rz->z_eof || rz->z_err) return 0;
+ if (rz->file_type == FILE_TYPE_PLAIN) {
+#ifdef _USE_KNETFILE
+ ret = knet_read(rz->x.fpr, data, size);
+#else
+ ret = read(rz->filedes, data, size);
+#endif
+ if (ret == 0) rz->z_eof = 1;
+ return ret;
+ }
+ rz->stream->avail_out = size;
+ rz->stream->next_out = data;
+ while(rz->stream->avail_out){
+ if(rz->stream->avail_in == 0){
+ /* refill the input buffer, limited to the bytes that precede rz->end */
+ if(rz->in >= rz->end){ rz->z_eof = 1; break; }
+ if(rz->end - rz->in < RZ_BUFFER_SIZE){
+#ifdef _USE_KNETFILE
+ rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, rz->end -rz->in);
+#else
+ rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in);
+#endif
+ } else {
+#ifdef _USE_KNETFILE
+ rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
+#else
+ rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+#endif
+ }
+ if(rz->stream->avail_in == 0){
+ rz->z_eof = 1;
+ break;
+ }
+ rz->stream->next_in = rz->inbuf;
+ }
+ tin = rz->stream->avail_in;
+ ret = inflate(rz->stream, Z_BLOCK);
+ /* rz->in tracks the compressed bytes consumed so far */
+ rz->in += tin - rz->stream->avail_in;
+ if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
+ fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? rz->stream->msg : "", __FILE__, __LINE__);
+ rz->z_err = 1;
+ break;
+ }
+ if(ret == Z_STREAM_END){
+ rz->z_eof = 1;
+ break;
+ }
+ /* per the zlib manual: bit 128 of data_type = stopped right after a block, bit 64 = that block was the last one */
+ if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){
+ rz->buf_flush = 1;
+ rz->next_block_pos = rz->in;
+ break;
+ }
+ }
+ return size - rz->stream->avail_out;
+}
+
+/*
+ * Read up to `size` uncompressed bytes into `data`.
+ *
+ * Bytes are served from rz->outbuf, which is refilled through _razf_read()
+ * whenever it runs empty.  rz->block_off counts the bytes delivered since the
+ * start of the current block; when a refill ended on a block boundary
+ * (rz->buf_flush) the block position moves to rz->next_block_pos as soon as
+ * that buffer has been used up.
+ *
+ * Returns the number of bytes stored, which is less than `size` only at end
+ * of data or after a decoding/read error.
+ */
+int razf_read(RAZF *rz, void *data, int size){
+    /* byte cursor: arithmetic on a void pointer is a compiler extension, not ISO C */
+    char *dst = (char*)data;
+    int ori_size = size;
+    while(size > 0){
+        if(rz->buf_len > 0){
+            if(size < rz->buf_len){
+                /* request is satisfied from the middle of the buffer */
+                memcpy(dst, (char*)rz->outbuf + rz->buf_off, size);
+                rz->buf_off += size;
+                rz->buf_len -= size;
+                rz->block_off += size;
+                size = 0;
+                break;
+            } else {
+                /* hand over everything that is buffered */
+                memcpy(dst, (char*)rz->outbuf + rz->buf_off, rz->buf_len);
+                dst += rz->buf_len;
+                size -= rz->buf_len;
+                rz->block_off += rz->buf_len;
+                rz->buf_off = 0;
+                rz->buf_len = 0;
+                if(rz->buf_flush){
+                    rz->block_pos = rz->next_block_pos;
+                    rz->block_off = 0;
+                    rz->buf_flush = 0;
+                }
+            }
+        } else if(rz->buf_flush){
+            rz->block_pos = rz->next_block_pos;
+            rz->block_off = 0;
+            rz->buf_flush = 0;
+        }
+        if(rz->buf_flush) continue;
+        rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+        /* a failed read on a plain file comes back negative: nothing was buffered */
+        if(rz->buf_len < 0){
+            rz->buf_len = 0;
+            break;
+        }
+        /* Stop on an error as well as on end of data.  _razf_read() returns 0
+         * for ever once z_err is set, so testing z_eof alone would never
+         * leave this loop on a corrupt stream. */
+        if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break;
+    }
+    rz->out += ori_size - size;
+    return ori_size - size;
+}
+
+/*
+ * Advance the read position by up to `size` uncompressed bytes without
+ * copying them anywhere.  Position bookkeeping (rz->out, rz->block_pos,
+ * rz->block_off) is kept exactly as razf_read() keeps it.
+ *
+ * Returns the number of bytes skipped, which is less than `size` only at end
+ * of data or after a decoding/read error.
+ */
+int razf_skip(RAZF* rz, int size){
+    int ori_size = size;
+    while(size > 0){
+        if(rz->buf_len > 0){
+            if(size < rz->buf_len){
+                rz->buf_off += size;
+                rz->buf_len -= size;
+                rz->block_off += size;
+                size = 0;
+                break;
+            } else {
+                size -= rz->buf_len;
+                /* account for the skipped bytes before buf_len is cleared,
+                 * otherwise block_off is advanced by zero */
+                rz->block_off += rz->buf_len;
+                rz->buf_off = 0;
+                rz->buf_len = 0;
+                if(rz->buf_flush){
+                    rz->block_pos = rz->next_block_pos;
+                    rz->block_off = 0;
+                    rz->buf_flush = 0;
+                }
+            }
+        } else if(rz->buf_flush){
+            rz->block_pos = rz->next_block_pos;
+            rz->block_off = 0;
+            rz->buf_flush = 0;
+        }
+        if(rz->buf_flush) continue;
+        rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+        /* a failed read on a plain file comes back negative: nothing was buffered */
+        if(rz->buf_len < 0){
+            rz->buf_len = 0;
+            break;
+        }
+        /* The refill that reports end of data can still carry bytes (the final
+         * deflate block), so only stop once the buffer is empty as well. */
+        if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break;
+    }
+    rz->out += ori_size - size;
+    return ori_size - size;
+}
+
+/*
+ * Restart decoding at a known position.
+ *
+ * in  - compressed file offset to seek to
+ * out - uncompressed offset that corresponds to it
+ *
+ * Seeks the input, resets the inflate state, discards buffered input and
+ * output, clears the eof/error flags and makes `in` the current block start.
+ * NOTE(review): the result of the seek is not checked.
+ */
+static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){
+#ifdef _USE_KNETFILE
+ knet_seek(rz->x.fpr, in, SEEK_SET);
+#else
+ lseek(rz->filedes, in, SEEK_SET);
+#endif
+ rz->in = in;
+ rz->out = out;
+ rz->block_pos = in;
+ rz->next_block_pos = in;
+ rz->block_off = 0;
+ rz->buf_flush = 0;
+ rz->z_eof = rz->z_err = 0;
+ inflateReset(rz->stream);
+ rz->stream->avail_in = 0;
+ rz->buf_off = rz->buf_len = 0;
+}
+
+/*
+ * Position the reader by block: go to the block that starts at compressed
+ * offset block_start, then move block_offset uncompressed bytes into it.
+ *
+ * For a plain file the two values are simply added and used as a file offset;
+ * the resulting offset is returned.  Otherwise the return value is
+ * rz->block_off after positioning.
+ * NOTE(review): the reset path passes 0 as the uncompressed offset, so rz->out
+ * restarts from 0 there rather than reflecting the absolute position.
+ */
+int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){
+ int64_t pos;
+ rz->z_eof = 0;
+ if(rz->file_type == FILE_TYPE_PLAIN){
+ rz->buf_off = rz->buf_len = 0;
+ pos = block_start + block_offset;
+#ifdef _USE_KNETFILE
+ knet_seek(rz->x.fpr, pos, SEEK_SET);
+ pos = knet_tell(rz->x.fpr);
+#else
+ pos = lseek(rz->filedes, pos, SEEK_SET);
+#endif
+ rz->out = rz->in = pos;
+ return pos;
+ }
+ /* target lies ahead in the block being read: skip forward instead of restarting */
+ if(block_start == rz->block_pos && block_offset >= rz->block_off) {
+ block_offset -= rz->block_off;
+ goto SKIP; // Needn't reset inflate
+ }
+ if(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start
+ _razf_reset_read(rz, block_start, 0);
+ SKIP:
+ if(block_offset) razf_skip(rz, block_offset);
+ return rz->block_off;
+}
+
+/*
+ * Move the read position to an uncompressed offset.
+ *
+ * pos   - offset, interpreted according to `where`
+ * where - SEEK_SET, SEEK_CUR (relative to rz->out) or SEEK_END (relative to
+ *         rz->src_end)
+ *
+ * Plain files are repositioned directly.  Ordinary gzip input can only move
+ * forward.  RZ files use the block index to restart decoding at the closest
+ * preceding block and skip the rest; without a usable index they can move
+ * forward, or back into the first block only.
+ *
+ * Returns the uncompressed position actually reached.  A request that cannot
+ * be honoured leaves the position where it was.
+ */
+int64_t razf_seek(RAZF* rz, int64_t pos, int where){
+    int64_t idx;
+    int64_t seek_pos, new_out;
+    rz->z_eof = 0;
+    if (where == SEEK_CUR) pos += rz->out;
+    else if (where == SEEK_END) pos += rz->src_end;
+    if(rz->file_type == FILE_TYPE_PLAIN){
+#ifdef _USE_KNETFILE
+        knet_seek(rz->x.fpr, pos, SEEK_SET);
+        seek_pos = knet_tell(rz->x.fpr);
+#else
+        seek_pos = lseek(rz->filedes, pos, SEEK_SET);
+#endif
+        rz->buf_off = rz->buf_len = 0;
+        rz->out = rz->in = seek_pos;
+        return seek_pos;
+    } else if(rz->file_type == FILE_TYPE_GZ){
+        if(pos >= rz->out) goto SKIP;
+        return rz->out;
+    }
+    if(pos == rz->out) return pos;
+    if(pos > rz->src_end) return rz->out;
+    if(!rz->seekable || !rz->load_index){
+        if(pos >= rz->out) goto SKIP;
+    }
+    idx = pos / RZ_BLOCK_SIZE - 1;
+    /* Any block but the first needs an index entry.  The index is absent when
+     * it was not loaded or the file is not seekable; dereferencing it then
+     * would be a NULL access, so refuse the seek like the gzip case does. */
+    if(idx >= 0 && (rz->index == NULL || idx >= rz->index->size)) return rz->out;
+    seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+    new_out = (idx + 1) * RZ_BLOCK_SIZE;
+    /* moving forward inside the block already being read: no restart needed */
+    if(pos > rz->out && new_out <= rz->out) goto SKIP;
+    _razf_reset_read(rz, seek_pos, new_out);
+ SKIP:
+    /* razf_skip() takes an int; cover longer distances in bounded steps
+     * instead of truncating the 64-bit difference */
+    while(rz->out < pos){
+        int64_t left = pos - rz->out;
+        int step = (left > 0x40000000) ? 0x40000000 : (int)left;
+        if(razf_skip(rz, step) != step) break;
+    }
+    return rz->out;
+}
+
+/*
+ * Return the current position as one packed value: rz->block_pos in the upper
+ * bits (shifted left by 16) and the low 16 bits of rz->block_off below it.
+ * razf_seek2() takes such a value apart again.
+ */
+uint64_t razf_tell2(RAZF *rz)
+{
+ /* disabled consistency check of block_pos/block_off against the index */
+ /*
+ if (rz->load_index) {
+ int64_t idx, seek_pos;
+ idx = rz->out / RZ_BLOCK_SIZE - 1;
+ seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+ if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off)
+ fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n",
+ (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off);
+ }
+ */
+ return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff);
+}
+
+/*
+ * Go to a packed position: the low 16 bits of voffset are the offset inside
+ * the block, the remaining upper bits the block start.  Only SEEK_SET is
+ * supported; any other `where` returns -1.  Otherwise returns what
+ * razf_jump() returns.
+ */
+int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
+{
+ if (where != SEEK_SET) return -1;
+ return razf_jump(rz, voffset>>16, voffset&0xffff);
+}
+
+/*
+ * Close a RAZF handle and release everything it owns, including the
+ * underlying file.
+ *
+ * For a writer (unless built with _RZ_READONLY) the compressed stream is
+ * finished first, then the block index is appended, followed by a 16-byte
+ * trailer: rz->in and rz->out as 64-bit values, byte-swapped on hosts where
+ * is_big_endian() is false.
+ * NOTE(review): the write() and close() results are not checked.
+ */
+void razf_close(RAZF *rz){
+ if(rz->mode == 'w'){
+#ifndef _RZ_READONLY
+ razf_end_flush(rz);
+ deflateEnd(rz->stream);
+#ifdef _USE_KNETFILE
+ save_zindex(rz, rz->x.fpw);
+ if(is_big_endian()){
+ write(rz->x.fpw, &rz->in, sizeof(int64_t));
+ write(rz->x.fpw, &rz->out, sizeof(int64_t));
+ } else {
+ uint64_t v64 = byte_swap_8((uint64_t)rz->in);
+ write(rz->x.fpw, &v64, sizeof(int64_t));
+ v64 = byte_swap_8((uint64_t)rz->out);
+ write(rz->x.fpw, &v64, sizeof(int64_t));
+ }
+#else
+ save_zindex(rz, rz->filedes);
+ if(is_big_endian()){
+ write(rz->filedes, &rz->in, sizeof(int64_t));
+ write(rz->filedes, &rz->out, sizeof(int64_t));
+ } else {
+ uint64_t v64 = byte_swap_8((uint64_t)rz->in);
+ write(rz->filedes, &v64, sizeof(int64_t));
+ v64 = byte_swap_8((uint64_t)rz->out);
+ write(rz->filedes, &v64, sizeof(int64_t));
+ }
+#endif
+#endif
+ } else if(rz->mode == 'r'){
+ /* stream is NULL for plain files */
+ if(rz->stream) inflateEnd(rz->stream);
+ }
+ if(rz->inbuf) free(rz->inbuf);
+ if(rz->outbuf) free(rz->outbuf);
+ if(rz->header){
+ free(rz->header->extra);
+ free(rz->header->name);
+ free(rz->header->comment);
+ free(rz->header);
+ }
+ if(rz->index){
+ free(rz->index->bin_offsets);
+ free(rz->index->cell_offsets);
+ free(rz->index);
+ }
+ free(rz->stream);
+#ifdef _USE_KNETFILE
+ if (rz->mode == 'r')
+ knet_close(rz->x.fpr);
+ if (rz->mode == 'w')
+ close(rz->x.fpw);
+#else
+ close(rz->filedes);
+#endif
+ free(rz);
+}
+
+#endif
diff --git a/razip.c b/razip.c
new file mode 100644
index 0000000..825e732
--- /dev/null
+++ b/razip.c
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include "razf.h"
+
+#define WINDOW_SIZE 4096
+
+/* Print the command-line help to standard output.  Always returns 0. */
+static int razf_main_usage()
+{
+    static const char *const help_text[] = {
+        "\n",
+        "Usage: razip [options] [file] ...\n\n",
+        "Options: -c write on standard output, keep original files unchanged\n",
+        " -d decompress\n",
+        " -l list compressed file contents\n",
+        " -b INT decompress at INT position in the uncompressed file\n",
+        " -s INT decompress INT bytes in the uncompressed file\n",
+        " -h give this help\n",
+        "\n"
+    };
+    size_t line;
+    for (line = 0; line < sizeof(help_text) / sizeof(help_text[0]); line++)
+        fputs(help_text[line], stdout);
+    return 0;
+}
+
+/*
+ * Open `fn` for writing and return its file descriptor.
+ *
+ * fn        - path of the output file
+ * is_forced - non-zero: overwrite an existing file without asking
+ *
+ * Without is_forced an existing file is only overwritten after the user
+ * answers the prompt with 'y' or 'Y'.  Any other answer, or no answer at all,
+ * ends the program with status 1, as does a file that cannot be opened.
+ */
+static int write_open(const char *fn, int is_forced)
+{
+    int fd = -1;
+    char c;
+    if (!is_forced) {
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
+            printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+            /* the prompt has no newline, so push it out before waiting for input */
+            fflush(stdout);
+            /* a failed read (e.g. stdin at EOF) leaves c unset and counts as "no" */
+            if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+                printf("razip: not overwritten\n");
+                exit(1);
+            }
+        }
+    }
+    if (fd < 0) {
+        if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
+            fprintf(stderr, "razip: %s: Fail to write\n", fn);
+            exit(1);
+        }
+    }
+    return fd;
+}
+
+/*
+ * razip entry point.
+ *
+ * Default: compress the named file to <file>.rz, or stdin/the file to stdout
+ * with -c.  -d decompresses (optionally only the region given by -b/-s), -l
+ * lists the sizes recorded in an rz file.  When the output goes to a file the
+ * source file is removed afterwards, but only if the whole operation
+ * succeeded.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int main(int argc, char **argv)
+{
+    int c, compress, pstdout, is_forced;
+    RAZF *rz;
+    void *buffer;
+    long start, end, size;
+
+    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+    while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){
+        switch(c){
+            case 'h': return razf_main_usage();
+            case 'd': compress = 0; break;
+            case 'c': pstdout = 1; break;
+            case 'l': compress = 2; break;
+            case 'b': start = atol(optarg); break;
+            case 's': size = atol(optarg); break;
+            case 'f': is_forced = 1; break;
+        }
+    }
+    if (size >= 0) end = start + size;
+    if(end >= 0 && end < start){
+        fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+        return 1;
+    }
+    if(compress == 1){
+        int f_src, f_dst = -1;
+        if(argc > optind){
+            if((f_src = open(argv[optind], O_RDONLY)) < 0){
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if(pstdout){
+                f_dst = fileno(stdout);
+            } else {
+                /* room for the source name, the ".rz" suffix and the terminator;
+                 * the size must come from strlen(), not from sizeof of that expression */
+                char *name = malloc(strlen(argv[optind]) + 5);
+                if (name == NULL) {
+                    fprintf(stderr, "razip: out of memory\n");
+                    close(f_src);
+                    return 1;
+                }
+                strcpy(name, argv[optind]);
+                strcat(name, ".rz");
+                f_dst = write_open(name, is_forced);
+                free(name);
+                if (f_dst < 0) {
+                    close(f_src);
+                    return 1;
+                }
+            }
+        } else if(pstdout){
+            f_src = fileno(stdin);
+            f_dst = fileno(stdout);
+        } else return razf_main_usage();
+        rz = razf_dopen(f_dst, "w");
+        if (rz == NULL) {
+            fprintf(stderr, "razip: cannot create compressed output\n");
+            close(f_src);
+            return 1;
+        }
+        buffer = malloc(WINDOW_SIZE);
+        if (buffer == NULL) {
+            fprintf(stderr, "razip: out of memory\n");
+            razf_close(rz);
+            close(f_src);
+            return 1;
+        }
+        while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
+        razf_close(rz); // f_dst will be closed here
+        free(buffer);
+        close(f_src);
+        if (c < 0) {
+            /* the output is incomplete, so the source must stay in place */
+            fprintf(stderr, "razip: error while reading input; source file kept\n");
+            return 1;
+        }
+        if (argc > optind && !pstdout) unlink(argv[optind]);
+        return 0;
+    } else {
+        if(argc <= optind) return razf_main_usage();
+        if(compress == 2){
+            rz = razf_open(argv[optind], "r");
+            if (rz == NULL) {
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if(rz->file_type == FILE_TYPE_RZ) {
+                printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name");
+                printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end,
+                    argv[optind]);
+            } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]);
+        } else {
+            int f_dst, failed = 0;
+            size_t len = strlen(argv[optind]);
+            if (!pstdout) {
+                /* the name has to END in ".rz"; looking for the first ".rz"
+                 * anywhere in it rejects names such as "a.rz.rz" */
+                if (len < 3 || strcmp(argv[optind] + len - 3, ".rz") != 0) {
+                    printf("razip: %s: unknown suffix -- ignored\n", argv[optind]);
+                    return 1;
+                }
+            }
+            /* open the input before creating the output, so that a missing
+             * input does not leave an empty output file behind */
+            rz = razf_open(argv[optind], "r");
+            if (rz == NULL) {
+                fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                return 1;
+            }
+            if (!pstdout) {
+                char *name = strdup(argv[optind]);
+                if (name == NULL) {
+                    fprintf(stderr, "razip: out of memory\n");
+                    razf_close(rz);
+                    return 1;
+                }
+                name[len - 3] = '\0';
+                f_dst = write_open(name, is_forced);
+                free(name);
+            } else f_dst = fileno(stdout);
+            buffer = malloc(WINDOW_SIZE);
+            if (buffer == NULL) {
+                fprintf(stderr, "razip: out of memory\n");
+                if (!pstdout) close(f_dst);
+                razf_close(rz);
+                return 1;
+            }
+            razf_seek(rz, start, SEEK_SET);
+            while(1){
+                if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE);
+                else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+                if(c <= 0) break;
+                start += c;
+                if (write(f_dst, buffer, c) != c) {
+                    fprintf(stderr, "razip: error while writing output; source file kept\n");
+                    failed = 1;
+                    break;
+                }
+                if(end >= 0 && start >= end) break;
+            }
+            free(buffer);
+            if (!pstdout) {
+                close(f_dst);
+                /* only remove the source once its contents were written out */
+                if (!failed) unlink(argv[optind]);
+            }
+            if (failed) {
+                razf_close(rz);
+                return 1;
+            }
+        }
+        razf_close(rz);
+        return 0;
+    }
+}
+
diff --git a/sam-stats.cpp b/sam-stats.cpp
new file mode 100644
index 0000000..9f0cd7d
--- /dev/null
+++ b/sam-stats.cpp
@@ -0,0 +1,1121 @@
+/*
+# Copyright (c) 2011 Erik Aronesty
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+
+#include <string>
+#include <google/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <google/dense_hash_map> // or sparse_hash_set, dense_hash_map, ...
+
+#include <samtools/sam.h> // samtools api
+
+#include "fastq-lib.h"
+
+// Version string printed by usage() and in the "version" line of every report.
+const char * VERSION = "1.38";
+
+// Numeric revision parsed out of the keyword string: the integer following the ':'.
+#define SVNREV atoi(strchr("$LastChangedRevision: 681 $", ':')+1)
+
+using namespace std;
+
+void usage(FILE *f);
+
+// Largest mapping quality the vmapq bucket in sstats is sized for.
+#define MAX_MAPQ 300
+// this factor is based on a quick empirical look at a few bam files....
+#define VFACTOR 1.5
+
+//#define max(a,b) (a>b?a:b)
+//#define min(a,b) (a<b?a:b)
+// Zero-fill an object in place.
+#define meminit(l) (memset(&l,0,sizeof(l)))
+// printf-style message to stderr, emitted only when the global `debug` is non-zero.
+// NOTE(review): expands to a bare `if`, so it is unsafe directly before an `else`.
+#define debugout(s,...) if (debug) fprintf(stderr,s,##__VA_ARGS__)
+#undef warn
+// printf-style message to stderr that also bumps the global error counter `errs`.
+#define warn(s,...) ((++errs), fprintf(stderr,s,##__VA_ARGS__))
+// Sample standard deviation from a count, a sum and a sum of squares.
+// The divisor is cnt*(cnt-1), so the result is not finite for cnt <= 1.
+#define stdev(cnt, sum, ssq) sqrt((((double)cnt)*ssq-pow((double)sum,2)) / ((double)cnt*((double)cnt-1)))
+
+// Interpolated quantile over any container offering size() and operator[].
+template <class vtype>
+ double quantile(const vtype &vec, double p);
+
+// Overload for vectors; intermediate values keep the element type.
+template <class itype>
+ double quantile(const vector<itype> &vec, double p);
+
+// printf-style formatting into a std::string.
+std::string string_format(const std::string &fmt, ...);
+
+// Debug verbosity; each -d on the command line increments it.
+int debug=0;
+// Count of warnings issued through warn(); main() exits non-zero when it is > 0.
+int errs=0;
+extern int optind;
+// Number of bins in the per-reference ascii signature (option -S).
+int histnum=30;
+// Not referenced anywhere in the code visible here.
+bool isbwa=false;
+// Set to 1 by -r / -R: per-base coverage and skew are tracked per reference.
+int rnamode = 0;
+// Set by -z: an input with zero entries is reported instead of being skipped with a warning.
+bool allow_no_reads = false;
+
+// from http://programerror.com/2009/10/iterative-calculation-of-lies-er-stats/
+class cRunningStats
+{
+public:
+    // Start empty: zero samples, every moment at zero.
+    cRunningStats() : m_n(0.0), m_m1(0.0), m_m2(0.0), m_m3(0.0), m_m4(0.0) {}
+
+    // Fold one sample into the running moments (single pass, no samples stored).
+    void Push(double x)
+    {
+        m_n++;
+        const double delta = (x - m_m1);
+        const double step = delta / m_n;
+        const double step_sq = step * step;
+        // Highest moment first: each update reads the lower moments
+        // before they are changed for this sample.
+        m_m4 += delta * step_sq * step * ((m_n - 1) * ((m_n * m_n) - 3 * m_n + 3)) +
+                6 * step_sq * m_m2 - 4 * step * m_m3;
+        m_m3 += delta * step_sq * ((m_n - 1) * (m_n - 2)) - 3 * step * m_m2;
+        m_m2 += delta * step * (m_n - 1);
+        m_m1 += step;
+    }
+
+    // Mean of the samples pushed so far.
+    double Mean() { return m_m1; }
+
+    // Sample variance (n-1 divisor); 0 with fewer than two samples.
+    double Variance()
+    {
+        if (m_n > 1.0)
+            return (m_m2 / (m_n - 1.0));
+        return 0.0;
+    }
+
+    // Square root of Variance().
+    double StdDeviation() { return sqrt(Variance()); }
+
+    // Standard error of the mean; 0 with fewer than two samples.
+    double StdError()
+    {
+        if (m_n > 1.0)
+            return sqrt(Variance() / m_n);
+        return 0.0;
+    }
+
+    // Skewness and kurtosis are ratios of moments; both divide by the second
+    // moment and are therefore not finite when all samples are equal.
+    double Skewness() { return sqrt(m_n) * m_m3 / pow(m_m2, 1.5); }
+    double Kurtosis() { return m_n * m_m4 / (m_m2 * m_m2); }
+
+private:
+    double m_n; // count
+    double m_m1; // mean
+    double m_m2; // second moment
+    double m_m3; // third moment
+    double m_m4; // fourth moment
+};
+
+/// if we use this a lot may want to make it variable size
+class scoverage {
+public:
+    // Counters start at zero; the histogram gets histnum+2 zeroed bins
+    // (bin `histnum` is used by dostats() for positions past the reference end).
+    scoverage() : mapb(0), mapr(0), reflen(0), dist(histnum+2) {}
+
+    long long int mapb;     // mapped bases on this reference
+    long int mapr;          // mapped reads on this reference (counted in rna mode only)
+    cRunningStats spos;     // running stats over covered positions (rna mode only)
+    int reflen;             // reference length from the header, 0 when unknown
+    vector<int> dist;       // coverage histogram along the reference
+};
+
+// sorted integer bucket ... good for ram with small max size, slow to access
+class ibucket {
+public:
+    int tot;                  // number of values pushed so far
+    std::vector<int> dat;     // dat[v] == how many times value v was pushed
+
+    // Create a bucket able to count values in the range 0..max inclusive.
+    ibucket(int max) {dat.resize(max+1);tot=0;}
+
+    // Number of values stored (with multiplicity), not the number of distinct values.
+    int size() const {return tot;}
+
+    // Return the n-th smallest stored value (0-based), found by walking the
+    // counts; cost is linear in the value range.
+    int operator[] (int n) const {
+        assert(n < size());
+        for (size_t i=0;i<dat.size();++i) {
+            if (n < dat[i]) {
+                return (int) i;
+            }
+            n-=dat[i];
+        }
+        // Reached only when n is out of range and asserts are compiled out.
+        // The original fell off the end of a non-void function here, which is
+        // undefined behaviour; answer with the largest representable value.
+        return dat.empty() ? 0 : (int) dat.size() - 1;
+    }
+
+    // Count one occurrence of v; v must lie within 0..max.
+    void push(int v) {
+        assert(v >= 0 && (size_t) v < dat.size());
+        ++dat[v];
+        ++tot;
+    }
+};
+
+// One read parked in the paired-end table until its mate shows up.
+class fqent {
+    public:
+    // Give `bits` a defined value: the hash map default-constructs and copies
+    // entries, which would otherwise read an indeterminate int.
+    fqent() : bits(0) {}
+    int bits;           // mate flag of the parked read (dostats stores bits&0x40)
+    std::string r;      // read sequence
+    std::string q;      // quality string
+};
+
+// Accumulates every statistic for one SAM/BAM input; main() builds a fresh one per file.
+class sstats {
+public:
+ ibucket vmapq; // all map qualities
+ // Zero all counters in `dat` and register the sentinel keys on the google hash maps.
+ // NOTE(review): "-" is reserved as the empty key of covr and the deleted key of petab,
+ // so a reference or read literally named "-" would collide with it -- confirm against
+ // the sparsehash documentation.
+ sstats() : vmapq(MAX_MAPQ) {
+ memset((void*)&dat,0,sizeof(dat));
+ covr.set_empty_key("-");
+ petab.set_deleted_key("-");
+ }
+ ~sstats() {
+ covr.clear();
+ }
+ // Plain counters; kept in one anonymous struct so the constructor can memset them.
+ struct {
+ int n, mapn, mapzero; // # of entries, # of mapped entries, # of mappings skipped because pos<=0
+ int lenmin, lenmax; double lensum, lenssq; // read length stats
+ double mapsum, mapssq; // map quality sum/ssq
+ double nmnz, nmsum; // # of mismatched reads, sum of mismatch lengths
+ long long int nbase; // bases walked for the quality/base-composition stats
+ // NOTE(review): qualmin starts at 0 from the memset and dostats() only lowers it
+ // when a quality is below the current value, so it appears never to move -- confirm.
+ int qualmax, qualmin; // num bases samples, min/max qual
+ double qualsum, qualssq; // sum quals, sum-squared qual
+ int nrev, nfor; // rev reads, for reads
+ double tmapb; // number of mapped bases
+ long long int basecnt[5]; // per-base counts indexed by T_A..T_N
+ int del, ins; // length total dels/ins found
+ bool pe; // paired-end ? 0 or 1
+ int disc; // mate mapped to a different reference
+ int disc_pos; // same reference but |insert size| > 50000
+ int dupmax; // max dups found
+ } dat;
+ vector<int> visize; // all insert sizes
+ google::dense_hash_map<std::string, scoverage> covr; // # mapped per ref seq
+ google::sparse_hash_map<std::string, int> dups; // alignments by read-id (not necessary for some pipes)
+ google::sparse_hash_map<std::string, fqent> petab; // peread table
+
+ // file-format neutral ... called per read... warning seq/qual are not necessarily null-terminated
+ void dostats(string name, int rlen, int bits, const string &ref, int pos, int mapq, const string &materef, int nmate, const string &seq, const char *qual, int nm, int del, int ins);
+
+ // read a bam/sam file and call dostats over and over
+ // Both return false when the input is unusable.
+ bool parse_bam(const char *in);
+ bool parse_sam(FILE *f);
+};
+
+// Indexes into sstats::dat.basecnt; build_basemap() maps characters onto them.
+#define T_A 0
+#define T_C 1
+#define T_G 2
+#define T_T 3
+#define T_N 4
+
+void build_basemap();
+
+// Not referenced anywhere in the code visible here.
+int dupreads = 1000000;
+// Per-reference lines are printed only when at most this many references have mapped bases (-A/-r/-R raise it).
+int max_chr = 1000;
+// Non-zero when alignments are counted per read id (-D, or implied by --fastq).
+bool trackdup=0;
+// Fastq outputs opened by main() in --fastq mode and written by sstats::dostats().
+FILE *sefq = NULL;
+FILE *pefq1 = NULL;
+FILE *pefq2 = NULL;
+// Character code -> T_* index, filled by build_basemap().
+int basemap[256];
+// Entry point: parse options, then for every input (or stdin when none is
+// given) collect statistics with an sstats object and print the report.
+// Returns 1 when any warning was issued, 0 otherwise.
+int main(int argc, char **argv) {
+    const char *ext = NULL;
+    bool multi=0, newonly=0, inbam=0;
+    int fq_out=0;
+    const char *rnafile = NULL;
+    // getopt_long() returns int; keeping it in a char makes the != -1 test
+    // never true on platforms where plain char is unsigned.
+    int c;
+    optind = 0;
+    struct option long_options[] = {
+        {"fastq", no_argument, NULL, 'o'},
+        {0,0,0,0},
+    };
+    int long_index=0;
+    const char *prefix = NULL;
+
+    // NOTE(review): 'O' is handled below but is absent from the option string,
+    // and 'o' is reachable only through --fastq.
+    while ( (c = getopt_long(argc, argv, "?BzArR:Ddx:MhS:", long_options, &long_index)) != -1) {
+        switch (c) {
+            case 'd': ++debug; break;               // increment debug level
+            case 'D': trackdup=1; break;            // was ++ on a bool
+            case 'B': inbam=1; break;
+            case 'A': max_chr=1000000; break;       // max chrom
+            case 'R': rnafile=optarg;               // pass through
+            case 'r': max_chr=1000000; rnamode=1; if (histnum < 60) histnum=60; break;
+            case 'O': prefix=optarg; break;
+            case 'S': histnum=atoi(optarg); break;
+            case 'x': ext=optarg; break;
+            case 'M': newonly=1; break;
+            case 'z': allow_no_reads = true; break;
+            case 'o': fq_out=1; trackdup=1; break;  // output suff
+            case 'h': usage(stdout); return 0;
+            case '?':
+                if (!optopt) {
+                    usage(stdout); return 0;
+                } else if (optopt && strchr("ox", optopt))
+                    fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+                else if (isprint(optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
+                usage(stderr);
+                return 1;
+        }
+    }
+
+    // no input named on the command line: behave as if "-" (stdin) was given
+    const char *stdv[3] = {argv[0],"-",NULL};
+    if (!argv[optind]) {
+        argc=2;
+        argv = (char **) stdv;
+        optind=1;
+    }
+
+    multi = (argc-optind-1) > 0;        // more than 1 input?
+    if (multi && !ext)
+        ext = "stats";                  // force serial processed extension-mode
+
+    build_basemap();                    // precompute the base->integer (A->0, C->1, ...etc) lookup table
+
+    // ext may still be NULL here; never hand NULL to %s
+    debugout("argc:%d, argv[1]:%s, multi:%d, ext:%s\n", argc,argv[optind],multi,ext ? ext : "(null)");
+
+    const char *p;
+    // for each input file
+    for (;optind < argc;++optind) {
+        sstats s;
+        const char *in = argv[optind];
+        FILE *f;
+        FILE *o=NULL;
+        FILE *rnao=NULL;
+        bool needpclose = 0;
+
+        // decide input format
+        string out;         // output file name (stats), built up below
+        string fqbase;      // base name of the fastq outputs, before the stats extension is appended
+
+        if (!strcmp(in,"-")) {
+            // read sam/bam from stdin
+            if (ext||fq_out) {
+                warn("Can't use file extension with stdin\n");
+                continue;
+            }
+            f = stdin;
+            o = stdout;
+        } else {
+            if ((p = strrchr(in,'.')) && !strcmp(p, ".gz")) {
+                // extension mode... output to file minus .gz
+                // Computed first: `in` may be redirected to "-" below, after
+                // which p-in would no longer be a valid length.
+                if (ext||fq_out)
+                    out=string(in, p-in);
+
+                // maybe this is a gzipped sam file...
+                // NOTE(review): the file name is pasted into a shell command; a name
+                // containing a single quote breaks the quoting.
+                string cmd = string_format("gunzip -c '%s'", in);
+                f = popen(cmd.c_str(), "r");
+                needpclose=1;
+                if (f) {
+                    if (!inbam) {
+                        // guess file format with 1 char (int, so EOF stays distinguishable)
+                        int probe=getc(f);
+                        if (probe==EOF) {
+                            warn("Can't unzip %s\n", in);
+                            pclose(f);
+                            continue;
+                        }
+                        ungetc(probe,f);
+                        if (probe==31) {
+                            // bam file... reopen to reset stream... can't pass directly
+                            pclose(f);      // release the first pipe instead of leaking it
+                            f = popen(cmd.c_str(), "r");
+                            if (!f) {
+                                warn("Can't unzip %s: %s\n", in, strerror(errno));
+                                continue;
+                            }
+                            inbam=1;
+                        }
+                    }
+
+                    if (inbam) {
+                        // why did you gzip a bam... weird?
+                        if (dup2(fileno(f),0) == -1) {
+                            warn("Can't dup2 STDIN\n");
+                            pclose(f);
+                            continue;
+                        }
+                        in = "-";
+                    }
+                } else {
+                    warn("Can't unzip %s: %s\n", in, strerror(errno));
+                    continue;
+                }
+            } else {
+                f = fopen(in, "r");
+                if (!f) {
+                    warn("Can't open %s: %s\n", in, strerror(errno));
+                    continue;
+                }
+                // extension mode... output to file
+                if (ext||fq_out)
+                    out=in;
+            }
+            if (fq_out) {
+                // remember the base name: `out` gets the stats extension appended below
+                fqbase=out;
+                sefq=fopen((fqbase+".fq").c_str(),"w");
+                pefq1=fopen((fqbase+".fq1").c_str(),"w");
+                pefq2=fopen((fqbase+".fq2").c_str(),"w");
+            }
+            if (ext) {
+                ( out += '.') += ext;
+                o=fopen(out.c_str(), "w");
+                if (!o) {
+                    warn("Can't write %s: %s\n", out.c_str(), strerror(errno));
+                    continue;
+                }
+            } else
+                o=stdout;
+        }
+
+        // more guessing
+        debugout("file:%s, f: %lx\n", in, (long int) f);
+        int magic;
+        if (!inbam) {
+            // guess file format
+            magic=getc(f);
+            if (magic != EOF) ungetc(magic,f);
+            if (magic==31 && !strcmp(in,"-")) {
+                // if bamtools api allowed me to pass a stream, this wouldn't be an issue....
+                warn("Specify -B to read a bam file from standard input\n");
+                continue;
+            }
+        } else
+            magic = 31;     // 31 == bam
+
+        if (rnafile) {
+            rnao=fopen(rnafile,"w");
+            if (!rnao) {
+                warn("Can't write %s: %s\n", rnafile, strerror(errno));
+                return 1;
+            }
+        } else {
+            rnao=o;
+        }
+
+        // parse sam or bam as needed
+        if (magic != 31) {
+            // (could be an uncompressed bam... but can't magic in 1 char)
+            if (!s.parse_sam(f)) {
+                if (needpclose) pclose(f); else fclose(f);
+                warn("Invalid or corrupt sam file %s\n", in);
+                continue;
+            }
+        } else {
+            if (!s.parse_bam(in)) {
+                if (needpclose) pclose(f); else fclose(f);
+                warn("Invalid or corrupt bam file %s\n", in);
+                continue;
+            }
+        }
+        int ret;
+        if (needpclose) ret=pclose(f); else ret=fclose(f);
+        if (ret!=0) {
+            warn("Error closing '%s': %s\n", in, strerror(errno));
+            continue;
+        }
+
+        if (fq_out) {
+            // drop whichever fastq flavour does not match the library type
+            if(sefq && s.dat.pe) {
+                fclose(sefq);
+                unlink((fqbase+".fq").c_str());
+            }
+            if (pefq1 && !s.dat.pe) {
+                fclose(pefq1);
+                fclose(pefq2);
+                unlink((fqbase+".fq1").c_str());
+                unlink((fqbase+".fq2").c_str());
+            }
+        }
+
+        // sort sstats
+        sort(s.visize.begin(), s.visize.end());
+
+        int phred = s.dat.qualmin < 64 ? 33 : 64;
+        if (!s.dat.n && ! allow_no_reads) {
+            warn("No reads in %s\n", in);
+            continue;
+        }
+        fprintf(o, "reads\t%d\n", s.dat.n);
+        fprintf(o, "version\t%s.%d\n", VERSION, SVNREV);
+
+        // mapped reads is the number of reads that mapped at least once (either mated or not)
+        if (s.dat.mapn > 0) {
+            if (trackdup && s.dat.dupmax > (s.dat.pe+1)) {
+                google::sparse_hash_map<string,int>::iterator it = s.dups.begin();
+                int amb = 0;
+                int sing = 0;
+                while(it!=s.dups.end()) {
+                    // *not* making the distinction between 2 singleton mappings and 1 paired here
+                    if (it->second > (s.dat.pe+1)) {
+                        ++amb;
+                    }
+                    if (it->second == 1 && s.dat.pe) {
+                        ++sing;
+                    }
+                    ++it;
+                }
+                int mapped = (int) s.dups.size()*(s.dat.pe+1)-sing;
+
+                fprintf(o,"mapped reads\t%d\n", mapped);
+                if (amb > 0) {
+                    int unmapped=s.dat.n-s.dat.mapn;
+                    fprintf(o,"pct align\t%.6f\n", 100.0*((double)mapped/(double)(mapped+unmapped)));
+                    fprintf(o,"ambiguous\t%d\n", amb*(s.dat.pe+1));
+                    fprintf(o,"pct ambiguous\t%.6f\n", 100.0*((double)amb/(double)s.dups.size()));
+                    fprintf(o,"max dup align\t%.d\n", s.dat.dupmax-s.dat.pe);
+                } else {
+                    // no ambiguous mappings... simple
+                    fprintf(o, "pct align\t%.6f\n", 100.0*(double)s.dat.mapn/(double)s.dat.n);
+                }
+                if (sing)
+                    fprintf(o,"singleton mappings\t%.d\n", sing);
+                // number of total mappings
+                fprintf(o, "total mappings\t%d\n", s.dat.mapn);
+            } else {
+                // dup-id's not tracked
+                fprintf(o, "mapped reads\t%d\n", s.dat.mapn);
+                fprintf(o, "pct align\t%.6f\n", 100.0*(double)s.dat.mapn/(double)s.dat.n);
+                // todo: add support for bwa's multiple alignment tag
+                // fprintf(o, "total mappings\t%d\n", s.dat.mapn);
+            }
+        } else {
+            fprintf(o, "mapped reads\t%d\n", s.dat.mapn);
+        }
+
+        if (s.dat.mapzero > 0) {
+            fprintf(o, "skipped mappings\t%d\n", s.dat.mapzero);
+        }
+
+        fprintf(o, "mapped bases\t%.0f\n", s.dat.tmapb);
+        if (s.dat.pe) {
+            fprintf(o, "library\tpaired-end\n");
+        }
+        if (s.dat.disc > 0) {
+            fprintf(o, "discordant mates\t%d\n", s.dat.disc);
+        }
+        if (s.dat.disc_pos > 0) {
+            fprintf(o, "distant mates\t%d\n", s.dat.disc_pos);
+        }
+
+        if (s.dat.mapn > 0) {
+            if (s.dat.mapn > 100) {
+                // at least 100 mappings to call a meaningful "percentage"
+                fprintf(o, "pct forward\t%.3f\n", 100*(s.dat.nfor/(double)(s.dat.nfor+s.dat.nrev)));
+            }
+
+            fprintf(o, "phred\t%d\n", phred);
+            fprintf(o, "forward\t%d\n", s.dat.nfor);
+            fprintf(o, "reverse\t%d\n", s.dat.nrev);
+            if (s.dat.lenmax != s.dat.lenmin) {
+                fprintf(o, "len max\t%d\n", s.dat.lenmax);
+                fprintf(o, "len mean\t%.4f\n", s.dat.lensum/s.dat.mapn);
+                fprintf(o, "len stdev\t%.4f\n", stdev(s.dat.mapn, s.dat.lensum, s.dat.lenssq));
+            } else {
+                fprintf(o, "len max\t%d\n", s.dat.lenmax);
+            }
+            fprintf(o, "mapq mean\t%.4f\n", s.dat.mapsum/s.dat.mapn);
+            fprintf(o, "mapq stdev\t%.4f\n", stdev(s.dat.mapn, s.dat.mapsum, s.dat.mapssq));
+
+            fprintf(o, "mapq Q1\t%.2f\n", quantile(s.vmapq,.25));
+            fprintf(o, "mapq median\t%.2f\n", quantile(s.vmapq,.50));
+            fprintf(o, "mapq Q3\t%.2f\n", quantile(s.vmapq,.75));
+
+            if (s.dat.lensum > 0) {
+                fprintf(o, "snp rate\t%.6f\n", s.dat.nmsum/s.dat.lensum);
+                if (s.dat.ins >0 ) fprintf(o, "ins rate\t%.6f\n", s.dat.ins/s.dat.lensum);
+                if (s.dat.del >0 ) fprintf(o, "del rate\t%.6f\n", s.dat.del/s.dat.lensum);
+                fprintf(o, "pct mismatch\t%.4f\n", 100.0*((double)s.dat.nmnz/s.dat.mapn));
+            }
+
+            if (s.visize.size() > 0) {
+                // mean/stdev are taken over the 10th..90th percentile only
+                double p10 = quantile(s.visize, .10);
+                double p90 = quantile(s.visize, .90);
+                double matsum=0, matssq=0;
+                int matc = 0;
+                size_t i;
+                for(i=0;i<s.visize.size();++i) {
+                    int v = s.visize[i];
+                    if (v >= p10 && v <= p90) {
+                        ++matc;
+                        matsum+=v;
+                        matssq+=v*v;
+                    }
+                }
+                fprintf(o, "insert mean\t%.4f\n", matsum/matc);
+                if (matc > 1) {
+                    fprintf(o, "insert stdev\t%.4f\n", stdev(matc, matsum, matssq));
+                    fprintf(o, "insert Q1\t%.2f\n", quantile(s.visize, .25));
+                    fprintf(o, "insert median\t%.2f\n", quantile(s.visize, .50));
+                    fprintf(o, "insert Q3\t%.2f\n", quantile(s.visize, .75));
+                }
+            }
+
+            if (s.dat.nbase >0) {
+                fprintf(o,"base qual mean\t%.4f\n", (s.dat.qualsum/s.dat.nbase)-phred);
+                fprintf(o,"base qual stdev\t%.4f\n", stdev(s.dat.nbase, s.dat.qualsum, s.dat.qualssq));
+                fprintf(o,"%%A\t%.4f\n", 100.0*((double)s.dat.basecnt[T_A]/(double)s.dat.nbase));
+                fprintf(o,"%%C\t%.4f\n", 100.0*((double)s.dat.basecnt[T_C]/(double)s.dat.nbase));
+                fprintf(o,"%%G\t%.4f\n", 100.0*((double)s.dat.basecnt[T_G]/(double)s.dat.nbase));
+                fprintf(o,"%%T\t%.4f\n", 100.0*((double)s.dat.basecnt[T_T]/(double)s.dat.nbase));
+                if (s.dat.basecnt[T_N] > 0) {
+                    fprintf(o,"%%N\t%.4f\n", 100.0*((double)s.dat.basecnt[T_N]/(double)s.dat.nbase));
+                }
+            }
+            // how many ref seqs have mapped bases?
+            int mseq=0;
+            google::dense_hash_map<string,scoverage>::iterator it = s.covr.begin();
+            vector<string> vtmp;
+            bool haverlen = 0;
+            while (it != s.covr.end()) {
+                if (it->second.mapb > 0) {
+                    ++mseq;                                         // number of mapped refseqs
+                    if (mseq <= max_chr) vtmp.push_back(it->first); // don't bother if too many chrs
+                    if (it->second.reflen > 0) haverlen = 1;
+                }
+                ++it;
+            }
+            // don't print per-seq percentages if size is huge, or is 1
+            if ((haverlen || mseq > 1) && mseq <= max_chr) {        // worth reporting
+                // sort the id's
+                sort(vtmp.begin(),vtmp.end());
+                vector<string>::iterator vit=vtmp.begin();
+                double logb=log(2);
+                vector<double> vcovrvar;
+                vector<double> vcovr;
+                vector<double> vskew;
+                // for each chromosome or reference sequence...
+                while (vit != vtmp.end()) {
+                    scoverage &v = s.covr[*vit];                    // coverage vector
+                    if (v.reflen && histnum > 0) {                  // user asked for histogram
+                        string sig;
+                        int d; double logd, lsum=0, lssq=0;
+
+                        for (d=0;d<histnum;++d) {                   // log counts for each portion of the histogram
+                            logd = log(1+v.dist[d])/logb;
+                            lsum+=logd;
+                            lssq+=logd*logd;
+                            sig += ('0' + (int) logd);
+                        }
+                        if (rnamode) {
+                            // variability of coverage
+                            double cv = stdev(histnum, lsum, lssq)/(lsum/histnum);
+                            // percent coverage estimated using histogram... maybe track real coverage some day, for now this is fine
+                            double covr = 0;
+                            for (d=0;d<histnum;++d) {
+                                // VFAC = % greater than 1 that a bin must be to be considered 100%
+                                if (v.dist[d] > VFACTOR*v.reflen/histnum) {
+                                    ++covr;                         // 100% covered this bin
+                                } else {
+                                    // calc bases/(factor * size of bin)
+                                    covr += ((double)v.dist[d] / ((double)VFACTOR*v.reflen/histnum));
+                                }
+                            }
+                            covr /= (double) histnum;
+                            covr = min(100.0*((double)v.mapb/v.reflen),100.0*covr);
+                            // when dealing with "position skewness", you need to anchor things
+                            v.spos.Push(v.reflen);
+                            v.spos.Push(1);
+                            double skew = -v.spos.Skewness();
+                            // if there's some coverage
+                            if (v.mapr > 0) {
+                                if (v.mapr > 10) {
+                                    // summary stats
+                                    vcovr.push_back(covr);          // look at variation
+                                    vcovrvar.push_back(cv);         // look at variation
+                                    vskew.push_back(skew);          // and skew
+                                }
+                                if (rnao) {     // "rna mode" = more detailed output of coverage and skewness of coverage
+                                    fprintf(rnao,"%s\t%d\t%ld\t%.2f\t%.4f\t%.4f\t%s\n", vit->c_str(), v.reflen, v.mapr, covr, skew, cv, sig.c_str());
+                                }
+                            }
+                        } else if (max_chr < 100) {     // normal dna mode, just print percent alignment to each
+                            fprintf(o,"%%%s\t%.2f\t%s\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum), sig.c_str());
+                        } else {
+                            fprintf(o,"%%%s\t%.6f\t%s\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum), sig.c_str());
+                        }
+                    } else {
+                        if (max_chr < 100) {
+                            fprintf(o,"%%%s\t%.2f\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum));
+                        } else {
+                            fprintf(o,"%%%s\t%.6f\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum));
+                        }
+                    }
+                    ++vit;
+                }
+                if (rnamode) {
+                    sort(vcovr.begin(), vcovr.end());
+                    sort(vcovrvar.begin(), vcovrvar.end());
+                    sort(vskew.begin(), vskew.end());
+                    double medcovrvar = quantile(vcovrvar,.5);
+                    double medcovr = quantile(vcovr,.5);
+                    double medskew = quantile(vskew,.5);
+                    fprintf(o,"median skew\t%.2f\n", medskew);
+                    fprintf(o,"median coverage cv\t%.2f\n", medcovrvar);
+                    fprintf(o,"median coverage\t%.2f\n", medcovr);
+                }
+            }
+            if (s.covr.size() > 1) {
+                fprintf(o,"num ref seqs\t%d\n", (int) s.covr.size());
+                fprintf(o,"num ref aligned\t%d\n", (int) mseq);
+            }
+        } else {
+            if (s.covr.size() > 1) {
+                fprintf(o,"num ref seqs\t%d\n", (int) s.covr.size());
+            }
+        }
+
+        // this input is finished: release the per-file report streams
+        if (rnao && rnao != o) fclose(rnao);
+        if (o && o != stdout) fclose(o);
+    }
+    return errs ? 1 : 0;
+}
+
+#define S_ID 0
+#define S_BITS 1
+#define S_NMO 2
+#define S_POS 3
+#define S_MAPQ 4
+#define S_CIG 5
+#define S_MATEREF 6
+#define S_MATE 8
+#define S_READ 9
+#define S_QUAL 10
+#define S_TAG 11
+
+// Fold one alignment record into the running statistics.
+//   name     read id (text after the first space is ignored for dup tracking)
+//   rlen     read length
+//   bits     SAM flag field
+//   ref      reference name, empty when unknown
+//   pos      1-based position; <= 0 means "not usable"
+//   mapq     mapping quality
+//   materef  mate reference name ("=", "*" or empty when same/unknown)
+//   nmate    insert size as reported by the aligner
+//   seq      read bases
+//   qual     quality characters; NOT guaranteed to be NUL-terminated
+//   nm       NM tag value (mismatches + inserted + deleted bases)
+//   del/ins  deleted / inserted bases summed from the CIGAR
+void sstats::dostats(string name, int rlen, int bits, const string &ref, int pos, int mapq, const string &materef, int nmate, const string &seq, const char *qual, int nm, int del, int ins) {
+
+    ++dat.n;
+
+    if (bits & 0x04) return;            // bits say ... query was not mapped
+
+    if (pos<=0) {
+        ++dat.mapzero;                  // quantify weird errors
+        return;                         // not mapped well enough to count
+    }
+
+    ++dat.mapn;                         // mapped query
+
+    // TODO: build a histogram of read lengths using the integer bucket
+
+    // read length min/max
+    if (rlen > dat.lenmax) dat.lenmax = rlen;
+    if ((rlen < dat.lenmin) || dat.lenmin==0) dat.lenmin = rlen;
+
+    // read length sum/ssq (square taken in double: rlen*rlen overflows int for long reads)
+    dat.lensum += rlen;
+    dat.lenssq += (double)rlen*rlen;
+
+    // TODO: allow for alternate paired-end layouts besides Illumina's
+
+    // Strand is reported from the fragment's point of view: the second read of
+    // a pair counts as the opposite strand of its own alignment.
+    // NOTE(review): reads without the 0x40 flag (e.g. single-end data) therefore
+    // count as reverse when aligned forward -- confirm that this is intended.
+    if (bits & 16) {
+        if (bits & 0x40)                // first read in the pair
+            ++dat.nrev;                 // reverse
+        else
+            ++dat.nfor;                 // second read? actually was a forward alignment
+    } else {
+        if (bits & 0x40)                // first read in the pair
+            ++dat.nfor;
+        else
+            ++dat.nrev;
+    }
+
+    // mapping quality mean/stdev
+    dat.mapsum += mapq;
+    dat.mapssq += mapq*mapq;
+
+    // mapping quality histogram; clamp so that a malformed value cannot trip
+    // the bucket's range assert (or write out of bounds without asserts)
+    vmapq.push(mapq < 0 ? 0 : (mapq > MAX_MAPQ ? MAX_MAPQ : mapq));
+
+    // TODO: NM histogram maybe?
+
+    // number of mismatches
+    if (nm > 0) {
+        // nm is snp+ins+del... which is silly
+        dat.nmnz += 1;                  // how many read are not perfect matches?
+        dat.nmsum += nm-del-ins;        // mismatch sum
+    }
+    dat.del+=del;                       // deletion sum
+    dat.ins+=ins;                       // insert sum
+
+    // if we know about the reference sequence
+    if (ref.length()) {
+        scoverage *sc = &(covr[ref]);
+        if (sc) {                                           // and we have ram for coverage
+            sc->mapb+=rlen;                                 // total up mapped bases in that ref
+            if (rnamode) {                                  // more detailed
+                int i;
+                sc->mapr+=1;
+                for (i=0;i<rlen;++i) {                      // walk along read
+                    sc->spos.Push(pos+i);                   // per-position stats
+                }
+                if (histnum > 0 && sc->reflen > 0) {        // if we're making a histogram
+                    for (i=0;i<rlen;++i) {                  // walk along read
+                        int x = histnum * ((double)(pos+i) / sc->reflen);   // find the bucket this base is in
+                        if (x < histnum) {
+                            sc->dist[x]+=1;                 // add 1 to that bucket
+                        } else {
+                            // out of bounds.... what to do?
+                            sc->dist[histnum] += 1;         // out of bounds bases (fall off the edge) = extra bucket
+                        }
+                    }
+                }
+            } else if (histnum > 0 && sc->reflen > 0) {     // lightweight... don't deal with each base, ok because CHRs are big
+                int x = histnum * ((double)pos / sc->reflen);
+                if (debug > 1) {
+                    warn("chr: %s, hn: %d, pos: %d, rl: %d, x: %x\n", ref.c_str(), histnum, pos, sc->reflen, x);
+                }
+                if (x < histnum) {
+                    sc->dist[x]+=rlen;
+                } else {
+                    // out of bounds.... what to do?
+                    sc->dist[histnum] +=rlen;
+                }
+            }
+        }
+    }
+    // total mapped bases += read length
+    dat.tmapb+=rlen;
+    if (nmate>0) {
+        // insert size histogram
+        visize.push_back(nmate);
+        dat.pe=1;
+    }
+
+    // mate reference chromosome is not the same as my own?
+    if (materef.size() && (materef != "=" && materef != "*" && materef != ref)) {
+        // this is a discordant read
+        dat.disc++;
+    } else {
+        // mate reference chromosome is far (>50kb) from my own?
+        if (abs(nmate) > 50000) {
+            // this is discordant-by position
+            dat.disc_pos++;
+        }
+    }
+
+    // Usable quality characters: never more than the sequence length, and never
+    // past a terminating NUL (a SAM record may carry "*" instead of qualities).
+    const size_t qlen = strnlen(qual, seq.length());
+
+    // walk along sequence, add qualities to overall min/max/mean/stdev
+    size_t i;
+    for (i=0;i<seq.length();++i) {
+        if (i < qlen) {
+            if (qual[i]>dat.qualmax) dat.qualmax=qual[i];
+            if (qual[i]<dat.qualmin) dat.qualmin=qual[i];
+            dat.qualsum+=qual[i];
+            dat.qualssq+=qual[i]*qual[i];
+        }
+        // also count bases (index as unsigned: plain char may be negative)
+        ++dat.basecnt[basemap[(unsigned char) seq[i]]];
+        // total number of bases counted (this should be the same as tmapb??? get rid of it???)
+        ++dat.nbase;
+    }
+
+    // TODO: we should be able to use the "non primary" bit field
+    // need to test to see if this works for all aligners
+    // then have a mode that only report stats for primary alignments... for example, and no need for this
+    // expensive, giant hash table
+
+    // duplicate tracking turned on?
+    if (trackdup) {
+        size_t p;
+        // illumina mode... check for a space in the name, and ignore stuff after it
+        if ((p = name.find_first_of(' '))!=string::npos)
+            name.resize(p);
+
+        // count dups for that id
+        int x=++dups[name];
+
+        // keep track of max dups
+        if (x>dat.dupmax)
+            dat.dupmax=x;
+
+        // bounded copy: printing `qual` with %s would run past the record in bam mode
+        const string qstr(qual, qlen);
+
+        // fastq-output mode...
+        if (sefq) {
+            // if the data isn't paired end or if we're not sure yet
+            if (!dat.pe || dat.mapn < 1000) {
+                // output a single end fq
+                fprintf(sefq,"@%s\n%s\n+\n%s\n",name.c_str(), seq.c_str(), qstr.c_str());
+            }
+        }
+
+        // if we're outputting paired-end fastq's and if there's not a lot of dups
+        if (pefq1 && x < 4 && (dat.pe || dat.mapn < 1000)) {
+            fqent fq;
+            google::sparse_hash_map<string,fqent>::iterator it=petab.find(name);
+            // find my mate?
+            if (it == petab.end()) {
+                // no, add me
+                fq.r=seq;
+                fq.q=qstr;
+                fq.bits=bits&0x40;      // mate flag
+                petab[name]=fq;
+            } else if (it->second.bits != (bits&0x40)) {
+                // yes? remove me
+                // (compare mate flag with mate flag: the stored value is already
+                // masked, so comparing it against the full flag word matched
+                // almost any second alignment of the same read)
+                fq=it->second;
+                fprintf(pefq1,"@%s 1\n%s\n+\n%s\n",name.c_str(), fq.r.c_str(), fq.q.c_str());
+                fprintf(pefq2,"@%s 2\n%s\n+\n%s\n",name.c_str(), seq.c_str(), qstr.c_str());
+                petab.erase(it);
+            }
+        }
+    }
+}
+
+// parse a sam file... maybe let samtools do this, and then handle stats in "bam mode"... faster for sure
+// Read a text SAM stream line by line: @SQ header lines register reference
+// lengths, every other non-header line is split on tabs and handed to dostats().
+// Returns false as soon as a record is structurally unusable.
+bool sstats::parse_sam(FILE *f) {
+    line l; meminit(l);
+    int lineno=0;
+    int warnings=0;
+    while (read_line(f, l)>0) {
+        ++lineno;
+        char *sp;
+        if (l.s[0]=='@') {
+            if (!strncmp(l.s,"@SQ\t",4)) {
+                // pick SN: (name) and LN: (length) in whichever order they appear
+                char *t=strtok_r(l.s, "\t", &sp);
+                string sname; int slen=0;
+                while(t) {
+                    if (!strncmp(t,"SN:",3)) {
+                        sname=&(t[3]);
+                        if (slen)
+                            break;
+                    } else if (!strncmp(t,"LN:",3)) {
+                        slen=atoi(&t[3]);
+                        if (sname.length())
+                            break;
+                    }
+                    t=strtok_r(NULL, "\t", &sp);
+                }
+                covr[sname].reflen=slen;
+            }
+            continue;
+        }
+
+        // split the record; never store more fields than the array holds
+        char *d[100]; meminit(d);
+        const int maxfields = (int) (sizeof(d)/sizeof(d[0]));
+        int n =0;
+        char *t=strtok_r(l.s, "\t", &sp);
+        while(t && n < maxfields) {
+            d[n++]=t;
+            t=strtok_r(NULL, "\t", &sp);
+        }
+
+        // every mandatory column up to QUAL is dereferenced below
+        if (n <= S_QUAL) {
+            if (warnings < 5) {
+                warn("Line %d, too few fields for a sam record\n", lineno);
+                ++warnings;
+            }
+            // invalid sam
+            return false;
+        }
+
+        if (!isdigit((unsigned char) d[S_BITS][0])
+            || !isdigit((unsigned char) d[S_POS][0])
+        ) {
+            if (warnings < 5) {
+                warn("Line %d, missing bits/position information\n", lineno);
+                ++warnings;
+            }
+            // invalid sam
+            return false;
+        }
+
+        int nm=0;
+        int i;
+        // get # mismatches
+        for (i=S_TAG;i<n;++i){
+            if (d[i] && !strncasecmp(d[i],"NM:i:",5)) {
+                nm=atoi(&d[i][5]);
+            }
+        }
+
+        int ins = 0, del = 0;
+        char *p=d[S_CIG];
+        // sum the cig
+        while (*p) {
+            int oplen=strtod(p, &sp);
+            if (sp==p) {
+                break;
+            }
+            if (!*sp) {
+                break;          // length without an operator: stop before stepping past the NUL
+            }
+            if (*sp == 'I')
+                ins+=oplen;
+            else if (*sp == 'D')
+                del+=oplen;
+            p=sp+1;
+        }
+
+        // force unmapped to position negative one
+        if (d[S_CIG][0] == '*') d[S_POS] = (char *) "-1";
+
+        // as-if it were a bam...
+        // dostats() takes (..., nm, del, ins): the two counts used to be passed
+        // the other way round, which swapped the reported ins/del rates.
+        dostats(d[S_ID],strlen(d[S_READ]),atoi(d[S_BITS]),d[S_NMO],atoi(d[S_POS]),atoi(d[S_MAPQ]),d[S_MATEREF],atoi(d[S_MATE]),d[S_READ],d[S_QUAL],nm, del, ins);
+    }
+    return true;
+}
+
+// let samtools parse the bam
+// Let samtools parse the bam named `in` ("-" = standard input) and feed every
+// record to dostats(). Returns false when the file cannot be opened, has no
+// header, or is corrupt; a truncated file still returns true but bumps `errs`.
+bool sstats::parse_bam(const char *in) {
+    samfile_t *fp;
+    if (!(fp=samopen(in, "rb", NULL))) {
+        warn("Error reading '%s': %s\n", in, strerror(errno));
+        return false;
+    }
+    // samopen() hands "-" the process's stdin descriptor, which the caller
+    // still owns and closes; only a named file is ours to close.
+    const bool ownfile = strcmp(in, "-") != 0;
+
+    if (!fp->header) {
+        // target names are looked up through the header for every record
+        if (ownfile) samclose(fp);
+        return false;
+    }
+    int i;
+    for (i = 0; i < fp->header->n_targets; ++i) {
+        covr[fp->header->target_name[i]].reflen=fp->header->target_len[i];
+    }
+
+    bam1_t *al=bam_init1();
+    int ret=0;
+    while ( (ret=samread(fp, al)) > 0 ) {
+        uint32_t *cig = bam1_cigar(al);
+        char *name = bam1_qname(al);
+        int len = al->core.l_qseq;
+        uint8_t *tag=bam_aux_get(al, "NM");     // NM tag
+        int nm = tag ? bam_aux2i(tag) : 0;
+        int ins=0, del=0;
+
+        // count inserts and deletions
+        for (i=0;i<(int)al->core.n_cigar;++i) {
+            int op = cig[i] & BAM_CIGAR_MASK;
+            if (op == BAM_CINS) {
+                ins+=(cig[i] >> BAM_CIGAR_SHIFT);
+            } else if (op == BAM_CDEL) {
+                del+=(cig[i] >> BAM_CIGAR_SHIFT);
+            }
+        }
+
+        // crappy cigar?
+        if (al->core.n_cigar == 0)
+            al->core.pos=-1;    // not really a match if there's no cigar string... this deals with bwa's issue
+
+        char *qual = (char *) bam1_qual(al);    // qual string
+        uint8_t * bamseq = bam1_seq(al);        // sequence string
+        string seq; seq.resize(len);            // ok... really make it a string
+        for (i=0;i<len;++i) {
+            seq[i] = bam_nt16_rev_table[bam1_seqi(bamseq, i)];
+            qual[i] += 33;                      // shift in place to the ascii scale used for sam text
+        }
+
+        // now do stats
+        // dostats() takes (..., nm, del, ins): the two counts used to be passed
+        // the other way round, which swapped the reported ins/del rates.
+        dostats(name,len,al->core.flag,al->core.tid>=0?fp->header->target_name[al->core.tid]:"",al->core.pos+1,al->core.qual, al->core.mtid>=0?fp->header->target_name[al->core.mtid]:"", al->core.isize, seq, qual, nm, del, ins);
+    }
+
+    // the record buffer and (for named files) the reader were never released
+    bam_destroy1(al);
+    if (ownfile) samclose(fp);
+
+    if (ret < -2) {
+        // no stats .. corrupt file
+        return false;
+    }
+    if (ret < -1) {
+        ++errs;
+        // truncated file, output stats, but return error code
+        return true;
+    }
+    return true;
+}
+
+// Print the help text to `f`; the template's two conversions receive the
+// version string and the numeric revision.
+void usage(FILE *f) {
+    static const char help[] =
+"Usage: sam-stats [options] [file1] [file2...filen]\n"
+"Version: %s.%d\n"
+"\n"
+"Produces lots of easily digested statistics for the files listed\n"
+"\n"
+"Options (default in parens):\n"
+"\n"
+"-D Keep track of multiple alignments\n"
+"-O PREFIX Output prefix enabling extended output (see below)\n"
+"-R FIL Coverage/RNA output (coverage, 3' bias, etc, implies -A)\n"
+"-A Report all chr sigs, even if there are more than 1000\n"
+"-b INT Number of reads to sample for per-base stats (1M)\n"
+"-S INT Size of ascii-signature (30)\n"
+"-x FIL File extension for handling multiple files (stats)\n"
+"-M Only overwrite if newer (requires -x, or multiple files)\n"
+"-B Input is bam, don't bother looking at magic\n"
+"-z Don't fail when zero entries in sam\n"
+"\n"
+"OUTPUT:\n"
+"\n"
+"If one file is specified, then the output is to standard out. If\n"
+"multiple files are specified, or if the -x option is supplied,\n"
+"the output file is <filename>.<ext>. Default extension is 'stats'.\n"
+"\n"
+"Complete Stats:\n"
+"\n"
+" <STATS> : mean, max, stdev, median, Q1 (25 percentile), Q3\n"
+" reads : # of entries in the sam file, might not be # reads\n"
+" phred : phred scale used\n"
+" bsize : # reads used for qual stats\n"
+" mapped reads : number of aligned reads (unique probe id sequences)\n"
+" mapped bases : total of the lengths of the aligned reads\n"
+" forward : number of forward-aligned reads\n"
+" reverse : number of reverse-aligned reads\n"
+" snp rate : mismatched bases / total bases (snv rate)\n"
+" ins rate : insert bases / total bases\n"
+" del rate : deleted bases / total bases\n"
+" pct mismatch : percent of reads that have mismatches\n"
+" pct align : percent of reads that aligned\n"
+" len <STATS> : read length stats, ignored if fixed-length\n"
+" mapq <STATS> : stats for mapping qualities\n"
+" insert <STATS> : stats for insert sizes\n"
+" %%<CHR> : percentage of mapped bases per chr, followed by a signature\n"
+"\n"
+"Subsampled stats (1M reads max):\n"
+" base qual <STATS> : stats for base qualities\n"
+" %%A,%%T,%%C,%%G : base percentages\n"
+"\n"
+"Meaning of the per-chromosome signature:\n"
+" A ascii-histogram of mapped reads by chromosome position.\n"
+" It is only output if the original SAM/BAM has a header. The values\n"
+" are the log2 of the # of mapped reads at each position + ascii '0'.\n"
+"\n"
+"Extended output mode produces a set of files:\n"
+" .stats : primary output\n"
+" .fastx : fastx-toolkit compatible output\n"
+" .rcov : per-reference counts & coverage\n"
+" .xdist : mismatch distribution\n"
+" .ldist : length distribution (if applicable)\n"
+" .mqdist : mapping quality distribution\n"
+"\n"
+    ;
+    fprintf(f, help, VERSION, SVNREV);
+}
+
+// printf-style formatting into a std::string. The buffer starts at 100 bytes
+// and grows until the formatted text fits; the returned string holds exactly
+// the formatted characters.
+// NOTE(review): `fmt` is a reference and is also the argument named in
+// va_start, which the C++ standard leaves undefined; fixing that means changing
+// the declared signature, so it is left as is here.
+std::string string_format(const std::string &fmt, ...) {
+    int size=100;
+    std::string str;
+    va_list ap;
+    while (1) {
+        str.resize(size);
+        va_start(ap, fmt);
+        // write through the string's own storage rather than casting away
+        // the const of c_str()
+        int n = vsnprintf(&str[0], size, fmt.c_str(), ap);
+        va_end(ap);
+        if (n > -1 && n < size) {
+            // drop the unused tail: the result used to keep the full buffer
+            // length, padded with NUL characters
+            str.resize(n);
+            return str;
+        }
+        if (n > -1)
+            size=n+1;       // exact size is known
+        else
+            size*=2;        // older libc: just try a bigger buffer
+    }
+}
+
+// R-compatible quantile code : TODO convert to template
+
+// Interpolated quantile of a sorted, indexable container holding integers.
+// p is the fraction (0..1); an empty container yields 0.
+template <class vtype>
+double quantile(const vtype &vec, double p) {
+    const int count = vec.size();
+    if (count == 0) return 0;
+    const double where = ((double)count-1)*p;   // fractional index of the quantile
+    const int lo = (int) where;
+    const int base=vec[lo];
+    if (!(where > (double)lo)) {
+        // landed exactly on an element
+        return base;
+    }
+    // between two elements: interpolate linearly
+    return (base + (where-lo) * (vec[lo+1] - base));
+}
+
+// Interpolated quantile of a sorted vector; same rule as the generic version,
+// but the picked element keeps the vector's element type.
+template <class itype>
+double quantile(const std::vector<itype> &vec, double p) {
+    const int count = vec.size();
+    if (count == 0) return 0;
+    const double where = ((double)count-1)*p;   // fractional index of the quantile
+    const int lo = (int) where;
+    const itype base=vec[lo];
+    if (!(where > (double)lo)) {
+        // landed exactly on an element
+        return base;
+    }
+    // between two elements: interpolate linearly
+    return (base + (where-lo) * (vec[lo+1] - base));
+}
+
+// Fill the global character -> base-index table: A/C/G/T in either case map
+// to their own index, every other character code maps to T_N.
+void build_basemap() {
+    int code;
+    for (code=0;code<256;++code)
+        basemap[code]=T_N;
+
+    basemap['A']=T_A; basemap['a']=T_A;
+    basemap['C']=T_C; basemap['c']=T_C;
+    basemap['G']=T_G; basemap['g']=T_G;
+    basemap['T']=T_T; basemap['t']=T_T;
+}
+
+
diff --git a/sam.c b/sam.c
new file mode 100644
index 0000000..fa11df6
--- /dev/null
+++ b/sam.c
@@ -0,0 +1,186 @@
+#include <string.h>
+#include <unistd.h>
+#include "faidx.h"
+#include "sam.h"
+
+#define TYPE_BAM 1
+#define TYPE_READ 2
+
+// Deep-copy a BAM header: every field is copied by value first, then the
+// text buffer and the target name/length arrays are re-allocated and
+// duplicated. The hash, dict and rg2lib pointers are reset to 0 in the copy.
+// h0 must not be NULL.
+bam_header_t *bam_header_dup(const bam_header_t *h0)
+{
+    bam_header_t *copy = bam_header_init();
+    int t = 0;
+    *copy = *h0;
+    copy->hash = copy->dict = copy->rg2lib = 0;
+    // one extra zeroed byte keeps the text NUL-terminated
+    copy->text = (char*)calloc(copy->l_text + 1, 1);
+    memcpy(copy->text, h0->text, copy->l_text);
+    copy->target_len = (uint32_t*)calloc(copy->n_targets, 4);
+    copy->target_name = (char**)calloc(copy->n_targets, sizeof(void*));
+    while (t < copy->n_targets) {
+        copy->target_len[t] = h0->target_len[t];
+        copy->target_name[t] = strdup(h0->target_name[t]);
+        ++t;
+    }
+    return copy;
+}
+// Append len bytes of text to header->text and keep it NUL-terminated.
+// Does nothing when text is NULL. The buffer is reallocated only when the
+// kroundup32-rounded size needed after the append exceeds the rounded
+// current size.
+static void append_header_text(bam_header_t *header, char* text, int len)
+{
+    int have, need;
+    if (text == 0) return;
+    have = header->l_text + 1;
+    need = header->l_text + len + 1; // 1 byte null
+    kroundup32(have);
+    kroundup32(need);
+    if (need > have)
+        header->text = (char*)realloc(header->text, need);
+    // strncpy, not strcpy: exactly len bytes are taken from text
+    strncpy(header->text + header->l_text, text, len);
+    header->l_text += len;
+    header->text[header->l_text] = 0;
+}
+
+// Enable multi-threaded BGZF on a BAM file opened for writing.
+// Returns 0 on success, -1 when fp is NULL, is not a BAM handle, or was
+// opened for reading.
+int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
+{
+    if (fp == 0) return -1;
+    if (!(fp->type & TYPE_BAM) || (fp->type & TYPE_READ)) return -1;
+    bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
+    return 0;
+}
+
+// Open a SAM/BAM file for reading or writing.
+//
+// fn   : file name, or "-" for stdin (read) / stdout (write).
+// mode : 'r' or 'w', plus optional flags: 'b' binary (BAM); for BAM output a
+//        digit sets the compression level and 'u' forces level 0; for text
+//        output 'h' writes the header and 'x'/'X' select hex/string flags.
+// aux  : read+text : optional name passed to sam_header_read2(), used when
+//                    the SAM file has no @SQ lines;
+//        write     : the bam_header_t to duplicate (required).
+//
+// Returns a handle to be released with samclose(), or 0 on failure. Every
+// failure path releases whatever was acquired before the failure.
+samfile_t *samopen(const char *fn, const char *mode, const void *aux)
+{
+    samfile_t *fp;
+    fp = (samfile_t*)calloc(1, sizeof(samfile_t));
+    if (fp == 0) return 0;
+    if (strchr(mode, 'r')) { // read
+        fp->type |= TYPE_READ;
+        if (strchr(mode, 'b')) { // binary
+            fp->type |= TYPE_BAM;
+            fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+            if (fp->x.bam == 0) goto open_err_ret;
+            fp->header = bam_header_read(fp->x.bam);
+        } else { // text
+            fp->x.tamr = sam_open(fn);
+            if (fp->x.tamr == 0) goto open_err_ret;
+            fp->header = sam_header_read(fp->x.tamr);
+            if (fp->header->n_targets == 0) { // no @SQ fields
+                if (aux) { // check if aux is present
+                    bam_header_t *textheader = fp->header;
+                    fp->header = sam_header_read2((const char*)aux);
+                    if (fp->header == 0) {
+                        // do not leak the header already read nor the open reader
+                        bam_header_destroy(textheader);
+                        sam_close(fp->x.tamr);
+                        goto open_err_ret;
+                    }
+                    append_header_text(fp->header, textheader->text, textheader->l_text);
+                    bam_header_destroy(textheader);
+                }
+                if (fp->header->n_targets == 0 && bam_verbose >= 1)
+                    fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
+            } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
+        }
+    } else if (strchr(mode, 'w')) { // write
+        if (aux == 0) goto open_err_ret; // a header is mandatory for writing
+        fp->header = bam_header_dup((const bam_header_t*)aux);
+        if (strchr(mode, 'b')) { // binary
+            char bmode[3];
+            int i, compress_level = -1;
+            for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
+            if (mode[i]) compress_level = mode[i] - '0';
+            if (strchr(mode, 'u')) compress_level = 0;
+            bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
+            fp->type |= TYPE_BAM;
+            fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
+            if (fp->x.bam == 0) goto open_err_ret;
+            bam_header_write(fp->x.bam, fp->header);
+        } else { // text
+            // open file
+            fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
+            if (fp->x.tamw == 0) goto open_err_ret;
+            if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
+            else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
+            else fp->type |= BAM_OFDEC<<2;
+            // write header
+            if (strchr(mode, 'h')) {
+                int i;
+                bam_header_t *alt;
+                // parse the header text
+                alt = bam_header_init();
+                alt->l_text = fp->header->l_text; alt->text = fp->header->text;
+                sam_header_parse(alt);
+                // detach the borrowed text so destroying alt does not free it
+                alt->l_text = 0; alt->text = 0;
+                // check if there are @SQ lines in the header
+                fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
+                if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
+                    if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
+                        fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
+                } else { // then dump ->target_{name,len}
+                    for (i = 0; i < fp->header->n_targets; ++i)
+                        // target_len is unsigned 32-bit, so print it as such
+                        fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%u\n", fp->header->target_name[i], (unsigned)fp->header->target_len[i]);
+                }
+                bam_header_destroy(alt);
+            }
+        }
+    }
+    return fp;
+
+open_err_ret:
+    // the header is only non-NULL here on write-mode failures
+    if (fp->header) bam_header_destroy(fp->header);
+    free(fp);
+    return 0;
+}
+
+// Release a handle returned by samopen(): destroys the header if present,
+// closes the underlying stream according to the handle type, then frees fp.
+// A NULL fp is ignored.
+void samclose(samfile_t *fp)
+{
+    if (!fp) return;
+    if (fp->header) bam_header_destroy(fp->header);
+    if (fp->type & TYPE_BAM) {
+        bam_close(fp->x.bam);           // BAM, reading or writing
+    } else if (fp->type & TYPE_READ) {
+        sam_close(fp->x.tamr);          // text SAM opened for reading
+    } else {
+        fclose(fp->x.tamw);             // text SAM opened for writing
+    }
+    free(fp);
+}
+
+// Read the next alignment into b. Returns -1 when fp is NULL or was not
+// opened for reading; otherwise the result of bam_read1() / sam_read1().
+int samread(samfile_t *fp, bam1_t *b)
+{
+    if (fp == 0) return -1;
+    if ((fp->type & TYPE_READ) == 0) return -1; // not open for reading
+    return (fp->type & TYPE_BAM)
+        ? bam_read1(fp->x.bam, b)
+        : sam_read1(fp->x.tamr, fp->header, b);
+}
+
+// Write one alignment. Returns -1 when fp is NULL or was opened for reading.
+// For BAM output returns the result of bam_write1(); for text output returns
+// the length of the formatted line plus one for the newline.
+int samwrite(samfile_t *fp, const bam1_t *b)
+{
+    char *text;
+    int written;
+    if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
+    if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b);
+    // bits 2-3 of type carry the output flag format chosen in samopen()
+    text = bam_format1_core(fp->header, b, fp->type>>2&3);
+    written = strlen(text);
+    fputs(text, fp->x.tamw);
+    fputc('\n', fp->x.tamw);
+    free(text);
+    return written + 1;
+}
+
+// Feed every alignment readable from fp into a pileup buffer that invokes
+// func(..., func_data). A final push of 0 is made once samread() stops
+// returning non-negative values. Always returns 0.
+int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
+{
+    bam1_t *rec = bam_init1();
+    bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+    bam_plbuf_set_mask(plbuf, mask);
+    for (;;) {
+        if (samread(fp, rec) < 0) break;
+        bam_plbuf_push(rec, plbuf);
+    }
+    bam_plbuf_push(0, plbuf);
+    bam_plbuf_destroy(plbuf);
+    bam_destroy1(rec);
+    return 0;
+}
+
+// Return a newly allocated "<fn_ref>.fai" path (caller frees).
+// If that index is not readable and fn_ref is, the index is built with
+// fai_build(); 0 is returned only when fn_ref is NULL or the build fails.
+// When fn_ref itself is unreadable an error is printed and the path is
+// still returned.
+char *samfaipath(const char *fn_ref)
+{
+    char *fai_path;
+    if (fn_ref == 0) return 0;
+    fai_path = calloc(strlen(fn_ref) + 5, 1); // ".fai" + NUL
+    strcpy(fai_path, fn_ref);
+    strcat(fai_path, ".fai");
+    if (access(fai_path, R_OK) != -1)
+        return fai_path;               // index already readable
+    if (access(fn_ref, R_OK) == -1) {
+        fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
+        return fai_path;
+    }
+    if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
+    if (fai_build(fn_ref) == -1) {
+        fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
+        free(fai_path);
+        fai_path = 0;
+    }
+    return fai_path;
+}
diff --git a/sam_header.c b/sam_header.c
new file mode 100644
index 0000000..a1b5181
--- /dev/null
+++ b/sam_header.c
@@ -0,0 +1,772 @@
+#include "sam_header.h"
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, const char *)
+
+// Singly linked list node carrying an opaque payload. The same node type is
+// used both for the list of header lines and for each line's list of tags.
+struct _HeaderList
+{
+ struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
+ struct _HeaderList *next; // following node, NULL at the end of the list
+ void *data; // payload; the list code never interprets it
+};
+typedef struct _HeaderList list_t;
+// A parsed header is simply the root node of a list of header lines.
+typedef list_t HeaderDict;
+
+// One tag of a header line: a two-character key (not NUL-terminated) and a
+// heap-allocated, NUL-terminated value string.
+typedef struct
+{
+ char key[2];
+ char *value;
+}
+HeaderTag;
+
+// One header line: the two-character record type that follows '@' (not
+// NUL-terminated) and the list of its tags (list payloads are HeaderTag*).
+typedef struct
+{
+ char type[2];
+ list_t *tags;
+}
+HeaderLine;
+
+// NULL-terminated tag tables per record type. Prefix o_ = optional tags,
+// r_ = required tags, u_ = tags whose value identifies the line uniquely.
+const char *o_hd_tags[] = {"SO","GO",NULL};
+const char *r_hd_tags[] = {"VN",NULL};
+
+const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
+const char *r_sq_tags[] = {"SN","LN",NULL};
+const char *u_sq_tags[] = {"SN",NULL};
+
+const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
+const char *r_rg_tags[] = {"ID",NULL};
+const char *u_rg_tags[] = {"ID",NULL};
+
+const char *o_pg_tags[] = {"VN","CL",NULL};
+const char *r_pg_tags[] = {"ID",NULL};
+
+// Known record types. The three tables below are parallel to this array:
+// they are indexed with the position tag_exists() returns for a type.
+// "CO" has neither required nor optional tags, which the code uses to
+// recognise free-form comment lines.
+const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
+const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
+const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
+const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
+
+
+// printf-style diagnostic written to stderr.
+static void debug(const char *format, ...)
+{
+    va_list args;
+
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+}
+
+// Compiled out: kept only for reference.
+#if 0
+// Replaced by list_append_to_end
+// Would push a new node in front of root and return it as the new root;
+// note that it never sets the node's 'last' field.
+static list_t *list_prepend(list_t *root, void *data)
+{
+ list_t *l = malloc(sizeof(list_t));
+ l->next = root;
+ l->data = data;
+ return l;
+}
+#endif
+
+// Relies on the root->last being correct. Do not use with the other list_*
+// routines unless they are fixed to modify root->last as well.
+// Append data in constant time using the tail pointer cached in root->last.
+// Returns the list root (the new node itself when root is NULL).
+static list_t *list_append_to_end(list_t *root, void *data)
+{
+    list_t *node = malloc(sizeof(list_t));
+    node->data = data;
+    node->next = NULL;
+    node->last = node;      // a one-node list is its own tail
+
+    if ( root )
+    {
+        root->last->next = node;
+        root->last = node;
+        return root;
+    }
+    return node;
+}
+
+// Append data by walking to the end of the list (linear time).
+// Returns the list root (the new node itself when root is NULL).
+//
+// The node's 'last' field is initialised and root->last is kept pointing at
+// the tail, so a list grown here can also be extended safely with
+// list_append_to_end().
+static list_t *list_append(list_t *root, void *data)
+{
+    list_t *node = malloc(sizeof(list_t));
+    list_t *tail;
+
+    node->data = data;
+    node->next = NULL;
+    node->last = node;
+
+    if ( !root )
+        return node;
+
+    // walk rather than trust root->last: the list may have been built by
+    // code that never maintained it
+    tail = root;
+    while (tail->next)
+        tail = tail->next;
+    tail->next = node;
+    root->last = node;
+    return root;
+}
+
+// Free every node of the list. Payloads (node->data) are not touched.
+static void list_free(list_t *root)
+{
+    list_t *following;
+    for ( ; root != NULL; root = following)
+    {
+        following = root->next;     // read before the node is released
+        free(root);
+    }
+}
+
+
+
+// Look for a tag "XY" in a predefined const char *[] array.
+// Find the two-character tag in a NULL-terminated table of tag names.
+// Only the first two characters are compared. Returns the index of the
+// match, or -1 when the table is NULL or has no such entry.
+static int tag_exists(const char *tag, const char **tags)
+{
+    int idx;
+    if ( tags == NULL ) return -1;
+    for (idx = 0; tags[idx] != NULL; idx++)
+    {
+        const char *cand = tags[idx];
+        if ( cand[0]==tag[0] && cand[1]==tag[1] ) return idx;
+    }
+    return -1;
+}
+
+
+
+// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
+// or NULL if everything has been read. The lineptr should be freed by the caller. The
+// newline character is stripped.
+// Copy the next line of text into *lineptr (grown as needed, *n tracks its
+// capacity) without the line terminator, and return a pointer to the start
+// of the following line. Returns NULL once text is exhausted, and also when
+// memory runs out. "\n", "\r\n" and a lone "\r" are all accepted as line
+// terminators. The caller frees *lineptr.
+static const char *nextline(char **lineptr, size_t *n, const char *text)
+{
+    size_t len;
+    const char *to = text;
+
+    if ( !*to ) return NULL;
+
+    while ( *to && *to!='\n' && *to!='\r' ) to++;
+    len = (size_t)(to - text) + 1;      // line bytes + terminating NUL
+
+    // Advance the pointer for the next call. A lone '\r' must be consumed
+    // too, otherwise every later call would return the same position.
+    if ( *to=='\n' ) to++;
+    else if ( *to=='\r' ) to += ( *(to+1)=='\n' ) ? 2 : 1;
+
+    if ( !*lineptr || *n<len )
+    {
+        // keep the old buffer on failure so the caller can still free it
+        char *grown = realloc(*lineptr, len);
+        if ( !grown ) {
+            debug("[nextline] Insufficient memory!\n");
+            return 0;
+        }
+        *lineptr = grown;
+        *n = len;
+    }
+
+    memcpy(*lineptr,text,len-1);
+    (*lineptr)[len-1] = 0;
+
+    return to;
+}
+
+// name points to "XY", value_from points to the first character of the value string and
+// value_to points to the last character of the value string.
+// Allocate a tag whose key is name[0..1] and whose value is a copy of the
+// inclusive range [value_from, value_to]. An inverted range (value_to before
+// value_from) yields an empty value instead of a negative copy length.
+static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
+{
+    HeaderTag *tag = malloc(sizeof(HeaderTag));
+    int len = value_to-value_from+1;
+    if ( len<0 ) len = 0;
+
+    tag->key[0] = name[0];
+    tag->key[1] = name[1];
+    tag->value = malloc(len+1);
+    // copy exactly the value; the byte after value_to is not ours to read
+    if ( len ) memcpy(tag->value,value_from,len);
+    tag->value[len] = 0;
+    return tag;
+}
+
+// Return the first tag of hline whose two-character key equals key[0..1],
+// or NULL when the line has no such tag.
+static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
+{
+    list_t *node;
+    for (node = hline->tags; node != NULL; node = node->next)
+    {
+        HeaderTag *cand = node->data;
+        if ( cand->key[0]!=key[0] ) continue;
+        if ( cand->key[1]==key[1] ) return cand;
+    }
+    return NULL;
+}
+
+
+// Return codes:
+// 0 .. different types or unique tags differ or conflicting tags, cannot be merged
+// 1 .. all tags identical -> no need to merge, drop one
+// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
+// 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
+// Compare two header lines for merging purposes.
+// Besides the codes 0..3, -1 is returned when the (shared) record type is
+// not one of the entries of types[].
+static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
+{
+ HeaderTag *t1, *t2;
+
+ // Lines of different record types are never comparable.
+ if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
+ return 0;
+
+ int itype = tag_exists(hline1->type,types);
+ if ( itype==-1 ) {
+ debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
+ return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
+ }
+
+ // Only the first entry of the unique-tag table is consulted.
+ if ( unique_tags[itype] )
+ {
+ t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
+ t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
+ if ( !t1 || !t2 ) // this should never happen, the unique tags are required
+ return 2;
+
+ if ( strcmp(t1->value,t2->value) )
+ return 0; // the unique tags differ, cannot be merged
+ }
+ // Types with neither required nor optional tags: the first tag holds the
+ // whole text. NOTE(review): dereferences ->tags without a NULL check, so
+ // a line of this kind with no tags would crash here.
+ if ( !required_tags[itype] && !optional_tags[itype] )
+ {
+ t1 = hline1->tags->data;
+ t2 = hline2->tags->data;
+ if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
+ return 0;
+ }
+
+ // Pass 1: required tags. 'missing' records that a tag is present on only
+ // one of the two lines.
+ int missing=0, itag=0;
+ while ( required_tags[itype] && required_tags[itype][itag] )
+ {
+ t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
+ t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
+ if ( !t1 && !t2 )
+ return 2; // this should never happen
+ else if ( !t1 || !t2 )
+ missing = 1; // there is some tag missing in one of the hlines
+ else if ( strcmp(t1->value,t2->value) )
+ {
+ if ( unique_tags[itype] )
+ return 2; // the lines have a matching unique tag but have a conflicting tag
+
+ return 0; // the lines contain conflicting tags, cannot be merged
+ }
+ itag++;
+ }
+ // Pass 2: optional tags; a tag absent from both lines is simply skipped.
+ itag = 0;
+ while ( optional_tags[itype] && optional_tags[itype][itag] )
+ {
+ t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
+ t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
+ if ( !t1 && !t2 )
+ {
+ itag++;
+ continue;
+ }
+ if ( !t1 || !t2 )
+ missing = 1; // there is some tag missing in one of the hlines
+ else if ( strcmp(t1->value,t2->value) )
+ {
+ if ( unique_tags[itype] )
+ return 2; // the lines have a matching unique tag but have a conflicting tag
+
+ return 0; // the lines contain conflicting tags, cannot be merged
+ }
+ itag++;
+ }
+ if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged
+ return 1;
+}
+
+
+// Deep-copy a header line: the type, and every tag with a strdup'ed value,
+// in the same order.
+static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
+{
+    const list_t *node;
+    HeaderLine *copy = malloc(sizeof(HeaderLine));
+
+    copy->tags = NULL;
+    copy->type[0] = hline->type[0];
+    copy->type[1] = hline->type[1];
+
+    for (node = hline->tags; node != NULL; node = node->next)
+    {
+        const HeaderTag *src = node->data;
+        HeaderTag *dup = malloc(sizeof(HeaderTag));
+
+        dup->key[0] = src->key[0];
+        dup->key[1] = src->key[1];
+        dup->value = strdup(src->value);
+        copy->tags = list_append(copy->tags, dup);
+    }
+    return copy;
+}
+
+// Copy into out_hline every tag of tmpl_hline that out_hline does not have
+// yet; tags already present are left untouched. Returns 0 when the record
+// types differ (nothing is done), 1 otherwise.
+static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
+{
+    const list_t *node;
+
+    if ( out_hline->type[0]!=tmpl_hline->type[0] ) return 0;
+    if ( out_hline->type[1]!=tmpl_hline->type[1] ) return 0;
+
+    for (node = tmpl_hline->tags; node != NULL; node = node->next)
+    {
+        HeaderTag *src = node->data;
+        HeaderTag *added;
+
+        if ( header_line_has_tag(out_hline, src->key) )
+            continue;
+
+        added = malloc(sizeof(HeaderTag));
+        added->key[0] = src->key[0];
+        added->key[1] = src->key[1];
+        added->value = strdup(src->value);
+        out_hline->tags = list_append(out_hline->tags,added);
+    }
+    return 1;
+}
+
+
+// Defined further down in this file; needed here to release a partially
+// built line on the error paths.
+static void sam_header_line_free(HeaderLine *hline);
+
+// Parse one header line ("@XY<TAB>K1:value<TAB>K2:value...") into a
+// HeaderLine. Fields must be separated by exactly one tab.
+//
+// Record types that have neither required nor optional tags (i.e. "CO") are
+// free-form: everything after the first tab, tabs included, becomes the
+// value of a single tag whose key is two spaces.
+//
+// Returns the new line, or 0 (after printing a diagnostic) on malformed
+// input; nothing is leaked in that case.
+static HeaderLine *sam_header_line_parse(const char *headerLine)
+{
+    HeaderLine *hline;
+    HeaderTag *tag;
+    const char *from, *to;
+    int itype, free_form;
+    from = headerLine;
+
+    if ( *from != '@' ) {
+        debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+        return 0;
+    }
+    to = ++from;
+
+    while (*to && *to!='\t') to++;
+    if ( to-from != 2 ) {
+        debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
+        return 0;
+    }
+
+    hline = malloc(sizeof(HeaderLine));
+    hline->type[0] = from[0];
+    hline->type[1] = from[1];
+    hline->tags = NULL;
+
+    // An unknown type (-1) must not be used as a table index; such lines are
+    // parsed as ordinary tagged lines.
+    itype = tag_exists(hline->type, types);
+    free_form = itype!=-1 && !required_tags[itype] && !optional_tags[itype];
+
+    from = to;
+    while (*to && *to=='\t') to++;
+    if ( to-from != 1 ) {
+        debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+        sam_header_line_free(hline);
+        return 0;
+    }
+    from = to;
+    while (*from)
+    {
+        while (*to && *to!='\t') to++;
+
+        if ( free_form )
+        {
+            // CO is a special case, it can contain anything, including tabs
+            if ( *to ) { to++; continue; }
+            // two-space key: the marker the printing code tests for
+            tag = new_tag("  ",from,to-1);
+        }
+        else
+        {
+            // a tag needs at least the three characters of "XY:"
+            if ( to-from < 3 ) {
+                debug("[sam_header_line_parse] expected 'XY:value' on line [%s]\n", headerLine);
+                sam_header_line_free(hline);
+                return 0;
+            }
+            tag = new_tag(from,from+3,to-1);
+        }
+
+        if ( header_line_has_tag(hline,tag->key) )
+            debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
+        hline->tags = list_append(hline->tags, tag);
+
+        from = to;
+        while (*to && *to=='\t') to++;
+        if ( *to && to-from != 1 ) {
+            debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+            sam_header_line_free(hline);
+            return 0;
+        }
+
+        from = to;
+    }
+    return hline;
+}
+
+
+// Must be of an existing type, all tags must be recognised and all required tags must be present
+// Validate a parsed header line. Returns 1 when the record type is known
+// and all of its required tags are present, 0 otherwise (with a diagnostic).
+// Unrecognised tags only produce a diagnostic, and only when neither key
+// character is lower case (lower-case keys are user-defined).
+static int sam_header_line_validate(HeaderLine *hline)
+{
+    list_t *tags;
+    HeaderTag *tag;
+    int itype, itag;
+
+    // Is the type correct?
+    itype = tag_exists(hline->type, types);
+    if ( itype==-1 )
+    {
+        debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
+        return 0;
+    }
+
+    // Has all required tags?
+    itag = 0;
+    while ( required_tags[itype] && required_tags[itype][itag] )
+    {
+        if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
+        {
+            debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
+                hline->type[0],hline->type[1]);
+            return 0;
+        }
+        itag++;
+    }
+
+    // Are all tags recognised?
+    for (tags = hline->tags; tags; tags = tags->next)
+    {
+        tag = tags->data;
+        // tag_exists() reports "not found" as -1; index 0 is a valid hit
+        if ( tag_exists(tag->key,required_tags[itype])!=-1 ) continue;
+        if ( tag_exists(tag->key,optional_tags[itype])!=-1 ) continue;
+
+        // Lower case tags are user-defined values.
+        if ( islower((unsigned char)tag->key[0]) || islower((unsigned char)tag->key[1]) )
+            continue;
+
+        // Neither is lower case, but tag was not recognized. Even unknown
+        // tags are allowed - for forward compatibility with new attributes.
+        debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
+    }
+
+    return 1;
+}
+
+
+// Write a header line to fp as "@XY" followed by tab-separated tags and a
+// newline. A tag whose key is two spaces is printed as its bare value,
+// without the "XY:" prefix.
+static void print_header_line(FILE *fp, HeaderLine *hline)
+{
+    list_t *node;
+
+    fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
+    for (node = hline->tags; node != NULL; node = node->next)
+    {
+        HeaderTag *tag = node->data;
+        int keyed = ( tag->key[0]!=' ' || tag->key[1]!=' ' );
+
+        fprintf(fp, "\t");
+        if ( keyed )
+            fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
+        fprintf(fp, "%s", tag->value);
+    }
+    fprintf(fp,"\n");
+}
+
+
+// Free a header line: each tag's value and the tag itself, then the list
+// nodes, then the line.
+static void sam_header_line_free(HeaderLine *hline)
+{
+    list_t *node;
+    for (node = hline->tags; node != NULL; node = node->next)
+    {
+        HeaderTag *tag = node->data;
+        free(tag->value);
+        free(tag);
+    }
+    list_free(hline->tags);
+    free(hline);
+}
+
+// Free a parsed header (the value returned by sam_header_parse2 and
+// friends): every line with its tags, then the list nodes. A NULL header is
+// accepted.
+void sam_header_free(void *_header)
+{
+    HeaderDict *header = (HeaderDict*)_header;
+    list_t *node;
+    for (node = header; node != NULL; node = node->next)
+        sam_header_line_free(node->data);
+    list_free(header);
+}
+
+// Deep-copy a parsed header, line by line, preserving order.
+// Returns NULL for an empty (NULL) header.
+HeaderDict *sam_header_clone(const HeaderDict *dict)
+{
+    HeaderDict *copy = NULL;
+    const list_t *node;
+    for (node = dict; node != NULL; node = node->next)
+    {
+        HeaderLine *src = node->data;
+        copy = list_append(copy, sam_header_line_clone(src));
+    }
+    return copy;
+}
+
+// Returns a newly allocated string
+// Serialise a parsed header back to text, one "@XY\t..." line per header
+// line. Returns a newly allocated, NUL-terminated string (caller frees), or
+// NULL if the allocation fails. A NULL header yields an empty string.
+char *sam_header_write(const void *_header)
+{
+    const HeaderDict *header = (const HeaderDict*)_header;
+    char *out = NULL;
+    int len=0, nout=0;
+    const list_t *hlines;
+    const list_t *tags;
+
+    // Pass 1: calculate the exact length of the string to allocate
+    for (hlines = header; hlines; hlines = hlines->next)
+    {
+        const HeaderLine *hline = hlines->data;
+        len += 4; // @XY and \n
+
+        for (tags = hline->tags; tags; tags = tags->next)
+        {
+            const HeaderTag *tag = tags->data;
+            len += strlen(tag->value) + 1; // \t
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                len += 3; // XY: (the value itself is already counted)
+        }
+    }
+
+    out = malloc(len+1);
+    if ( !out ) return NULL;
+
+    // Pass 2: emit the text
+    for (hlines = header; hlines; hlines = hlines->next)
+    {
+        const HeaderLine *hline = hlines->data;
+
+        nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
+
+        for (tags = hline->tags; tags; tags = tags->next)
+        {
+            const HeaderTag *tag = tags->data;
+            nout += sprintf(out+nout,"\t");
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
+            nout += sprintf(out+nout,"%s", tag->value);
+        }
+        nout += sprintf(out+nout,"\n");
+    }
+    out[nout] = 0; // terminate at what was written (also covers the empty header)
+    return out;
+}
+
+// Parse a whole header text into a list of header lines.
+// Returns the list, or NULL when headerText is NULL, contains no lines, or
+// any line fails to parse (everything parsed so far is freed in that case).
+// Per-line validation is currently switched off (tovalidate is 0).
+void *sam_header_parse2(const char *headerText)
+{
+    list_t *lines = NULL;
+    char *linebuf = NULL;
+    size_t linecap = 0;
+    const char *cursor;
+    int tovalidate = 0;
+
+    if ( headerText == NULL )
+        return 0;
+
+    for (cursor = headerText; (cursor = nextline(&linebuf, &linecap, cursor)) != NULL; )
+    {
+        HeaderLine *hline = sam_header_line_parse(linebuf);
+        if ( !hline || (tovalidate && !sam_header_line_validate(hline)) )
+        {
+            if (hline) sam_header_line_free(hline);
+            sam_header_free(lines);
+            if ( linebuf ) free(linebuf);
+            return NULL;
+        }
+        // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
+        lines = list_append_to_end(lines, hline);
+    }
+    if ( linebuf ) free(linebuf);
+
+    return lines;
+}
+
+// Build a string->string hash table from the header lines of the given
+// type: for each such line having both key_tag and value_tag, the key tag's
+// value maps to the value tag's value. Strings are borrowed from the header,
+// not copied. A later duplicate key overwrites the earlier entry after a
+// diagnostic. Always returns a table (empty when _dict is NULL).
+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
+{
+    khash_t(str) *tbl = kh_init(str);
+    const list_t *node;
+    int absent;
+
+    if (_dict == 0) return tbl; // return an empty (not null) hash table
+    for (node = (const HeaderDict*)_dict; node; node = node->next)
+    {
+        HeaderLine *hline = node->data;
+        HeaderTag *key, *value;
+        khiter_t k;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        if ( !key || !value )
+            continue;
+
+        k = kh_get(str, tbl, key->value);
+        if ( k != kh_end(tbl) )
+            debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
+        k = kh_put(str, tbl, key->value, &absent);
+        kh_value(tbl, k) = value->value;
+    }
+    return tbl;
+}
+
+// Collect, in header order, the value of key_tag from every line of the
+// given type that has it. Returns a realloc-grown array of borrowed strings
+// (NULL when nothing matched) and stores the element count in *_n.
+char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
+{
+    const list_t *node = (const HeaderDict*)_dict;
+    char **values = 0;
+    int cap = 0, count = 0;
+
+    *_n = 0;
+    for ( ; node; node = node->next)
+    {
+        HeaderLine *hline = node->data;
+        HeaderTag *key;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        if ( !key )
+            continue;
+
+        if (count == cap) {
+            cap = cap? cap<<1 : 4;     // start at 4 slots, then double
+            values = realloc(values, cap * sizeof(void*));
+        }
+        values[count++] = key->value;
+    }
+    *_n = count;
+    return values;
+}
+
+// Iterator over (key, value) pairs of header lines of the given type.
+// Starting at list position iter, finds the next line of that type carrying
+// BOTH key_tag and value_tag, stores their (borrowed) values in *_key and
+// *_value, and returns the position to pass to the next call. Returns NULL
+// when no further matching line exists; *_key / *_value are then untouched.
+// Lines missing either tag are skipped.
+void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)
+{
+    list_t *l;
+
+    for (l = iter; l; l = l->next)
+    {
+        HeaderLine *hline = l->data;
+        HeaderTag *key, *value;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        // both are dereferenced below, so either one missing disqualifies the line
+        if ( !key || !value )
+            continue;
+
+        *_key = key->value;
+        *_value = value->value;
+        return l->next;
+    }
+    return NULL;
+}
+
+// Look key up in a table built by sam_header2tbl().
+// Returns the stored value, or 0 when the key is absent.
+const char *sam_tbl_get(void *h, const char *key)
+{
+    khash_t(str) *tbl = (khash_t(str)*)h;
+    khint_t slot = kh_get(str, tbl, key);
+    if ( slot == kh_end(tbl) )
+        return 0;
+    return kh_val(tbl, slot);
+}
+
+// Number of entries in a table built by sam_header2tbl(); 0 for a NULL table.
+int sam_tbl_size(void *h)
+{
+    khash_t(str) *tbl = (khash_t(str)*)h;
+    if ( !h )
+        return 0;
+    return kh_size(tbl);
+}
+
+// Destroy a table built by sam_header2tbl(). The strings it referenced are
+// not freed here.
+void sam_tbl_destroy(void *h)
+{
+    khash_t(str) *doomed = (khash_t(str)*)h;
+
+    kh_destroy(str, doomed);
+}
+
+// Merge n (>= 2) parsed headers into a newly allocated one.
+// The result starts as a clone of _dicts[0]; each line of the remaining
+// headers is compared with the lines collected so far and is
+//   - dropped when an identical line already exists,
+//   - folded into an existing line when the two only complement each other,
+//   - appended when nothing comparable exists.
+// Returns NULL when n < 2 or when two lines conflict; in the latter case
+// both lines are printed to stderr and the partial result is freed.
+void *sam_header_merge(int n, const void **_dicts)
+{
+    const HeaderDict **dicts = (const HeaderDict**)_dicts;
+    HeaderDict *out_dict;
+    int idict, status;
+
+    if ( n<2 ) return NULL;
+
+    out_dict = sam_header_clone(dicts[0]);
+
+    for (idict=1; idict<n; idict++)
+    {
+        const list_t *tmpl_hlines;
+
+        for (tmpl_hlines = dicts[idict]; tmpl_hlines; tmpl_hlines = tmpl_hlines->next)
+        {
+            list_t *out_hlines = out_dict;
+            int inserted = 0;
+            while ( out_hlines )
+            {
+                status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
+                if ( status==0 )
+                {
+                    // not comparable with this line, try the next one
+                    out_hlines = out_hlines->next;
+                    continue;
+                }
+
+                if ( status==2 )
+                {
+                    print_header_line(stderr,tmpl_hlines->data);
+                    print_header_line(stderr,out_hlines->data);
+                    debug("Conflicting lines, cannot merge the headers.\n");
+                    sam_header_free(out_dict); // do not leak the partial result
+                    return 0;
+                }
+                if ( status==3 )
+                    sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
+
+                // any other status: the line is considered already represented
+                inserted = 1;
+                break;
+            }
+            if ( !inserted )
+                out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
+        }
+    }
+
+    return out_dict;
+}
+
+
diff --git a/tidx/fastq-lib.cpp b/tidx/fastq-lib.cpp
new file mode 120000
index 0000000..0551761
--- /dev/null
+++ b/tidx/fastq-lib.cpp
@@ -0,0 +1 @@
+../fastq-lib.cpp
\ No newline at end of file
diff --git a/tidx/fastq-lib.h b/tidx/fastq-lib.h
new file mode 120000
index 0000000..de4dedc
--- /dev/null
+++ b/tidx/fastq-lib.h
@@ -0,0 +1 @@
+../fastq-lib.h
\ No newline at end of file
diff --git a/tidx/tidx-lib.cpp b/tidx/tidx-lib.cpp
new file mode 100644
index 0000000..3199516
--- /dev/null
+++ b/tidx/tidx-lib.cpp
@@ -0,0 +1,436 @@
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string>
+#include <vector>
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <sparsehash/dense_hash_map>
+
+#include "fastq-lib.h"
+#include "utils.h"
+#include "tidx.h"
+
+void usage(FILE *f);
+
+using namespace std;
+using namespace google;
+
+double xtime();
+
+// Strict ordering of annotations by their start coordinate (for std::sort).
+bool annot_comp (const annot &a, const annot &b) {
+    return b.beg > a.beg;
+}
+
+// Insert all elements of rhs at the end of lhs.
+template <typename L, typename R>
+void append(L& lhs, R const& rhs) {
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+}
+// Insert all elements of rhs at the front of lhs, keeping rhs's order.
+template <typename L, typename R>
+void prepend(L& lhs, R const& rhs) {
+    lhs.insert(lhs.begin(), rhs.begin(), rhs.end());
+}
+
+// Value serializer handed to dense_hash_map::serialize()/unserialize() for
+// the string -> vector<annot> map. Record layout, in native byte order and
+// native type widths:
+//   unsigned char   key length, followed by that many key bytes
+//   unsigned long   number of annotations, then for each annotation:
+//     beg, end        (their declared types)
+//     unsigned short  number of positions, followed by each position
+struct string_annot_serializer {
+    // Writer. Returns false as soon as a write fails.
+    bool operator()(FILE* fp, const std::pair<const string&, const vector<annot> >& value) const {
+
+        {
+            assert(value.first.length() <= UCHAR_MAX);
+            const unsigned char klen = value.first.length();
+            if (fwrite(&klen, sizeof(klen), 1, fp) != 1)
+                return false;
+            // a zero-sized item would make fwrite return 0, so skip empty keys
+            if (klen && fwrite(value.first.data(), klen, 1, fp) != 1)
+                return false;
+        }
+
+        {
+            const vector<annot>&van=value.second;
+            const unsigned long count = van.size();
+            if (fwrite(&count, sizeof(count), 1, fp) != 1)
+                return false;
+            for (unsigned long i=0;i<count;++i) {
+                if (fwrite(&van[i].beg, sizeof(van[i].beg), 1, fp) != 1)
+                    return false;
+                if (fwrite(&van[i].end, sizeof(van[i].end), 1, fp) != 1)
+                    return false;
+                assert(van[i].pos.size() <= USHRT_MAX);
+                const unsigned short npos = van[i].pos.size();
+                if (fwrite(&npos, sizeof(npos), 1, fp) != 1)
+                    return false;
+                for (unsigned int j=0;j<npos;++j) {
+                    if (fwrite(&van[i].pos[j], sizeof(van[i].pos[j]), 1, fp) != 1)
+                        return false;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    // Reader. Returns false as soon as a read fails.
+    bool operator()(FILE* fp, std::pair<const string, vector<annot> >* value) const {
+
+        {
+            unsigned char klen; // all strings are <= 255 chars long
+            if (fread(&klen, sizeof(klen), 1, fp) != 1)
+                return false;
+
+            string buf(klen, '\0');
+            if (klen && fread(&buf[0], klen, 1, fp) != 1)
+                return false;
+
+            // necessary to "new" the key, which must be const except during
+            // "unserialization"; the api shouldn't foist this on the user ...
+            // should be behind the scenes
+            string * ncs = const_cast<string *>(&value->first);
+            new(ncs) string(buf.data(), (size_t)klen);
+        }
+
+        {
+            vector<annot> &van=value->second;
+            unsigned long count;
+            if (fread(&count, sizeof(count), 1, fp) != 1)
+                return false;
+            van.resize(count);
+            for (unsigned long i=0;i<count;++i) {
+                if (fread(&van[i].beg, sizeof(van[i].beg), 1, fp) != 1)
+                    return false;
+                // size taken from 'end' itself, mirroring the writer
+                if (fread(&van[i].end, sizeof(van[i].end), 1, fp) != 1)
+                    return false;
+                unsigned short npos;
+                if (fread(&npos, sizeof(npos), 1, fp) != 1)
+                    return false;
+                van[i].pos.resize(npos);
+                for (unsigned int j=0;j<npos;++j) {
+                    if (fread(&van[i].pos[j], sizeof(van[i].pos[j]), 1, fp) != 1)
+                        return false;
+                }
+            }
+        }
+
+        return true;
+    }
+};
+
+
+// Strip one trailing "\n" and then one trailing "\r" from the line in
+// place, shortening l.n accordingly. Empty lines are left untouched.
+void chomp_line(struct line &l) {
+    // the length checks keep l.s[l.n-1] from reading before the buffer
+    if (l.n > 0 && l.s[l.n-1] == '\n') l.s[--l.n]='\0'; // chomp
+    if (l.n > 0 && l.s[l.n-1] == '\r') l.s[--l.n]='\0'; // chomp
+}
+
+vector <long int> empty_vector;
+// Find the annotation on chromosome chr whose [beg, end] range contains pos
+// and return its list of file offsets. Returns a reference to the shared
+// empty vector when the chromosome is unknown or nothing covers pos.
+const vector<long int> &tidx::lookup(const char *chr, int pos) {
+    dense_hash_map<string,vector<annot> >::iterator it=map.find(chr);
+    if (it == map.end()) return empty_vector;
+    vector<annot> &va = it->second;
+    if (debug) fprintf(stderr,"lookup: %s:%d -> %d\n", chr, pos, (int) va.size());
+    int b=0, t=va.size(), c=0;
+    // binary search on the start coordinate
+    while (t>b) {
+        c=(t+b)/2;
+        if (pos == va[c].beg)
+            break;
+        else if (pos < va[c].beg)
+            t=c-1;
+        else {
+            if (pos <= va[c].end) {
+                return va[c].pos;
+            }
+            b=c+1;
+        }
+    }
+
+    if (t == b)
+        c = t;
+    // the search can finish one past the last element (and c is 0 for an
+    // empty vector): never probe outside the vector
+    if (c < 0 || c >= (int) va.size())
+        return empty_vector;
+    if (pos >= va[c].beg && pos <= va[c].end) {
+        return va[c].pos;
+    }
+    return empty_vector;
+}
+
+// Range query: collect the file offsets of annotations on chr overlapping
+// [beg, end]. Returns an empty vector when the chromosome is unknown.
+vector<long int> tidx::lookup_r(const char *chr, int beg, int end) {
+ dense_hash_map<string,vector<annot> >::iterator it=map.find(chr);
+ if (it == map.end()) return empty_vector;
+ vector<annot> &va = it->second;
+ if (debug) fprintf(stderr,"lookup_r: %s:%d.%d -> %d\n", chr, beg, end, (int) va.size());
+ // Binary search on the start coordinate for an entry at or containing beg.
+ int b=0, t=va.size(), c=0;
+ while (t>b) {
+ c=(t+b)/2;
+ if (beg == va[c].beg)
+ break;
+ else if (beg < va[c].beg)
+ t=c-1;
+ else if (beg > va[c].beg) {
+ if (beg <= va[c].end)
+ break;
+ b=c+1;
+ }
+ }
+ if (t == b)
+ c = t;
+ // Walk forward from c while entries keep overlapping the query range.
+ // NOTE(review): the walk stops at the first non-overlapping entry, so if
+ // the search lands on an entry that ends before beg nothing is returned
+ // even if a later entry overlaps - confirm this is intended.
+ vector<long int> res;
+ while (c<va.size() && end >= va[c].beg && beg <= va[c].end) {
+ append(res,va[c].pos);
+ ++c;
+ }
+ return res;
+}
+
+// Text form of a point lookup: for every file offset found for chr:pos,
+// seek the annotation file there, read one line, strip its line ending and
+// append msep followed by the line to the result. The annotation file is
+// opened on first use and kept open in fh.
+string tidx::lookup(const char *chr, int pos, const char *msep) {
+    const vector<long int> &offsets = lookup(chr, pos);
+    string joined;
+    if (fh == NULL) {
+        fh = fopen(path.c_str(), "rb");
+        if (fh == NULL)
+            fail("%s:%s\n",path.c_str(),strerror(errno));
+    }
+    struct line rec; meminit(rec);
+    for (size_t k = 0; k < offsets.size(); ++k) {
+        fseek(fh, offsets[k], 0);
+        read_line(fh, rec);
+        chomp_line(rec);
+        joined += msep;
+        joined += string(rec.s, rec.n);
+    }
+    free_line(&rec);
+    return joined;
+}
+
+// Text form of a range lookup: for every file offset found for
+// chr:beg-end, seek the annotation file there, read one line, strip its
+// line ending and append msep followed by the line to the result. The
+// annotation file is opened on first use and kept open in fh.
+string tidx::lookup_r(const char *chr, int beg, int end, const char *msep) {
+    const vector<long int> &offsets = lookup_r(chr, beg, end);
+    string joined;
+    if (fh == NULL) {
+        fh = fopen(path.c_str(), "rb");
+        if (fh == NULL)
+            fail("%s:%s\n",path.c_str(),strerror(errno));
+    }
+    struct line rec; meminit(rec);
+    for (size_t k = 0; k < offsets.size(); ++k) {
+        fseek(fh, offsets[k], 0);
+        read_line(fh, rec);
+        chomp_line(rec);
+        joined += msep;
+        joined += string(rec.s, rec.n);
+    }
+    free_line(&rec);
+    return joined;
+}
+
+string api_ret = "";
+// C-string flavour of lookup(chr, pos, msep). The result lives in the
+// file-level api_ret string, so it is overwritten by the next lookup_c /
+// lookup_cr call.
+const char *tidx::lookup_c(const char *chr, int pos, const char *msep) {
+    string found = lookup(chr, pos, msep);
+    api_ret.swap(found);
+    return api_ret.c_str();
+}
+
+// C-string flavour of lookup_r(chr, beg, end, msep). The result lives in
+// the file-level api_ret string, so it is overwritten by the next lookup_c /
+// lookup_cr call.
+const char *tidx::lookup_cr(const char *chr, int beg, int end, const char *msep) {
+    string found = lookup_r(chr, beg, end, msep);
+    api_ret.swap(found);
+    return api_ret.c_str();
+}
+
+// Load the index stored in "<in>.tidx" (decompressed through a
+// "gunzip -c" pipe) into the map and remember in as the annotation path.
+// Returns false when the pipe cannot be started or the index cannot be
+// unserialized.
+// NOTE(review): in is pasted into a shell command unquoted; file names
+// containing shell metacharacters will misbehave.
+bool tidx::read(const char *in) {
+    string uin = string_format("gunzip -c %s.tidx", in);
+
+    if (debug) fprintf(stderr, "read %s\n", in);
+    FILE *fun=popen(uin.c_str(),"r");
+    if (!fun) {
+        return false;
+    }
+    const bool loaded = map.unserialize(string_annot_serializer(), fun);
+    pclose(fun);    // reap gunzip and release the stream
+    path=in;
+    return loaded;
+}
+
+// Reset the object to its initial state: "-" becomes the hash map's
+// empty-key marker, no annotation file is open, debug output is off.
+void tidx::init() {
+    map.set_empty_key("-");
+    fh = NULL;
+    debug = false;
+}
+
+// Write a tab-separated listing of the index to fh: a "#file" line with the
+// annotation path, then one row per annotation with chromosome, beg, end,
+// number of file offsets and the first offset. An annotation without any
+// offset is listed with a count of 0 and -1 as its first offset.
+void tidx::dump(FILE *fh) {
+    fprintf(fh,"#file\t%s\n",path.c_str());
+    dense_hash_map<string,vector<annot> >::iterator it;
+    for (it = map.begin(); it != map.end(); ++it) {
+        vector<annot> &van = it->second;
+        for (size_t i=0;i<van.size();++i) {
+            // pos may be empty; pos[0] must not be read in that case
+            const long first = van[i].pos.empty() ? -1L : (long) van[i].pos[0];
+            fprintf(fh, "%s\t%d\t%d\t%ld\t%ld\n", it->first.c_str(), van[i].beg, van[i].end, (long) van[i].pos.size(), first);
+        }
+    }
+}
+
+// fun part
+//
+// Build the index for text file `in` and write it, gzip-compressed, to
+// "<in>.tidx".
+//
+//   sep            field separator(s) handed to split()
+//   nchr/nbeg/nend zero-based column numbers of group, begin and end;
+//                  nend == -1 means "use the begin column"
+//   skip_i         number of leading lines to skip
+//   skip_c         lines starting with this character are skipped as well
+//   sub_e          when true the end column is decremented by one
+//
+// Every data line becomes an annot holding [beg,end] and the byte offset of
+// the line. Per group the annots are sorted and then split/merged so that the
+// stored fragments do not overlap; each fragment carries the offsets of all
+// lines that cover it. Malformed lines and I/O failures end in fail().
+//
+// Fixes relative to the previous version: the input file is closed, the
+// insert-position scan uses && (the old bitwise & read van[j] with
+// j == van.size()), pvan can no longer be dereferenced uninitialised, loop
+// bounds no longer rely on unsigned size()-1, and a failed serialize/gzip is
+// reported instead of leaving a corrupt index behind.
+void tidx::build(const char *in, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c, bool sub_e) {
+    FILE *fin=fopen(in,"r");
+
+    if (!fin)
+        fail("%s:%s\n",in,strerror(errno));
+
+    if (nend == -1)
+        nend = nbeg;
+
+    string out = string_format("gzip -c > %s.tidx", in);
+    FILE *fout=popen(out.c_str(),"w");
+    if (!fout)
+        fail("%s:%s\n", out.c_str(),strerror(errno));
+
+    double xst = xtime();
+
+    struct line l; meminit(l);
+    int nlast = max(max(nbeg,nend),nchr);
+    int nl = 0;
+
+    string p_chr = "%";
+    path=in;
+    vector<annot> *pvan = NULL;
+    long tpos = ftell(fin);
+    // read in the annotation file
+    if (debug) fprintf(stderr, "reading %s (%d, %d, %d)\n", in, nchr, nbeg, nend);
+    while (read_line(fin, l)>0) {
+        ++nl;
+        if (skip_i > 0 || *l.s==skip_c) {
+            --skip_i;
+        } else {
+            chomp_line(l);
+            vector<char *> v = split(l.s, sep);
+            if (nlast >= v.size()) {
+                fail("error, file %s, line %d: missing info\n", in, nl);
+            }
+            annot a;
+            a.beg=atoi(v[nbeg]);
+            a.end=atoi(v[nend]);
+            if (sub_e) --a.end;
+            if (a.beg > a.end) {
+                fail("error, file %s, line %d: beg > end : %d > %d\n", in, nl, a.beg, a.end);
+            }
+            // tpos is the offset at which this line started
+            a.pos.push_back(tpos);
+            // speed up: only hit the hash map when the group changes
+            // (!pvan also covers a first group that equals the "%" sentinel)
+            if (!pvan || strcmp(v[nchr], p_chr.c_str())) {
+                pvan = &map[v[nchr]];
+                p_chr=v[nchr];
+            }
+            pvan->push_back(a);
+        }
+        tpos = ftell(fin);
+    }
+    fclose(fin);
+
+    dense_hash_map<string,vector<annot> >::iterator it;
+
+    free_line(&l);
+
+    // for each chromosome
+    it = map.begin();
+    while (it != map.end()) {
+        vector<annot> &van = it->second;
+        // sort the annotation file by beginning of region
+        sort(van.begin(), van.end(), annot_comp);
+        if (debug) fprintf(stderr, "frag %s : %ld ->", it->first.c_str(), (long) van.size());
+        // van grows while we iterate (new fragments are inserted behind i),
+        // so the bound is re-evaluated every round
+        for (size_t i = 0; i + 1 < van.size(); ++i) {
+            if (van[i].beg >= van[i+1].beg && van[i].end == van[i+1].end) {
+                // exact match
+                if (debug) fprintf(stderr, " [dup %d]", van[i].beg);
+                // merge annotations
+                prepend(van[i+1].pos,van[i].pos);
+                // skip next... (empty pos won't be serialized)
+                assert(van[i].beg == van[i+1].beg);
+                van[i].pos.clear();
+            } else if (van[i].end >= van[i+1].beg) {
+                if (debug) fprintf(stderr, " [ovr %d-%d:%ld ]", van[i].beg, van[i].end, van[i].pos[0]);
+                // overlap next
+                int new_st;
+                int new_en;
+
+                // offsets that belong to the leftover fragment created below
+                vector<long> new_ro = van[i].pos;
+
+                if (van[i].end < van[i+1].end) {
+                    // contained within next, so new frag starting after i stop
+                    new_st = van[i].end + 1;
+                    new_en = van[i+1].end;
+                    new_ro = van[i+1].pos;            // that only contains the other
+                    van[i+1].end=van[i].end;          // shorten next to my end
+                    append(van[i+1].pos,van[i].pos);  // and now the other contains me
+                } else {
+                    // passes next, so next contains all of me
+                    new_st = van[i+1].end+1;          // new frag is after the end of next
+                    new_en = van[i].end;
+                    append(van[i+1].pos,van[i].pos);
+                }
+
+                van[i].end=van[i+1].beg-1; // shorten my end to less than the next's start
+
+                if (debug) fprintf(stderr, " [i:%d:%d:%ld]", van[i].beg, van[i].end, van[i].pos[0]);
+                if (debug) fprintf(stderr, " [i+1:%d:%d:%ld]", van[i+1].beg, van[i+1].end, van[i+1].pos[0]);
+
+                if (new_en >= new_st) { // is this a real one?
+                    if (debug) fprintf(stderr, " [n:%d:%d:%ld]", new_st, new_en, new_ro[0]);
+
+                    // figure out where it goes (shouldn't be far); && keeps
+                    // van[j] from being read once j reaches the end
+                    size_t j = i+2;
+                    while (j < van.size() && new_st > van[j].beg) {
+                        ++j;
+                    }
+
+                    annot a;
+                    a.beg=new_st;
+                    a.end=new_en;
+                    a.pos=new_ro;
+                    // (slow... use linked list, turn to array later for storage/bin search?)
+                    van.insert(van.begin()+j, a); // insert into the annot array
+                }
+            }
+        }
+        // compact: drop fragments that were emptied or turned inside-out
+        size_t keep = 0;
+        for (size_t k = 0; k < van.size(); ++k) {
+            if (van[k].pos.size() != 0 && van[k].beg <= van[k].end) {
+                if (k != keep)
+                    van[keep]=van[k];
+                ++keep;
+            }
+        }
+        if (keep != van.size()) {
+            if (debug) fprintf(stderr, "(rm %ld) ", (long) (van.size()-keep));
+            van.resize(keep);
+        }
+        if (debug) fprintf(stderr, " %ld\n", (long) van.size());
+        ++it;
+    }
+
+    double xen = xtime();
+    double speed = xen-xst;
+
+    if (debug) fprintf(stderr, "compiled in %g secs\n", speed);
+
+    bool written = map.serialize(string_annot_serializer(), fout);
+    // pclose waits for gzip; a non-zero status means the index is incomplete
+    if (pclose(fout) != 0 || !written)
+        fail("%s: failed writing index\n", out.c_str());
+
+    // read the index back (timed when debugging)
+    xst = xtime();
+    tidx tmap(in);
+    xen = xtime();
+    speed = xen-xst;
+    if (debug) fprintf(stderr, "read in %g secs\n", speed);
+    path=in;
+}
+
+// Current wall-clock time from gettimeofday(), as seconds plus a fractional
+// microsecond part.
+double xtime() {
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    const double whole = (double) now.tv_sec;
+    const double micro = (double) now.tv_usec;
+    return whole + micro/1000000.0;
+}
+
+// for the api
+// Free-function front end to tidx::build(): builds "<file>.tidx" with a
+// throw-away tidx object and returns nothing.
+void tidx_build(const char *file, const char *sep, int chr, int beg, int end, int skip_i, char skip_c, bool sub_e) {
+    tidx scratch;
+    tidx *builder = &scratch;
+    builder->build(file, sep, chr, beg, end, skip_i, skip_c, sub_e);
+}
diff --git a/tidx/tidx.cpp b/tidx/tidx.cpp
new file mode 100644
index 0000000..2de896f
--- /dev/null
+++ b/tidx/tidx.cpp
@@ -0,0 +1,220 @@
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string>
+#include <vector>
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <sparsehash/dense_hash_map>
+
+#include "fastq-lib.h"
+#include "utils.h"
+#include "tidx.h"
+
+void usage(FILE *f);
+
+using namespace std;
+using namespace google;
+
+// tidx command line driver.
+//
+// Modes (exactly one is expected): -B build an index for every -i file,
+// -D dump the loaded indexes, -p CHR:POS look up one point, -a FILE annotate
+// every line of FILE ("-" = stdin) with the matches from all -i indexes.
+// Column options are 1-based on the command line and converted to 0-based
+// before use.
+//
+// Exit status: 0 on success; for -p, 0 if something was found and 1
+// otherwise; 1 on usage errors.
+//
+// Fixes: the getopt() result is kept in an int (with an unsigned plain char
+// it never compared equal to -1), -n really suppresses the echo of input
+// lines, the "requires an argument" hint lists the options that actually
+// take one, and isdigit() gets an unsigned char value.
+int main (int argc, char **argv) {
+    bool debug = false;
+    bool echo = true;               // input lines are echoed unless -n is given
+    bool build = false;
+    vector<const char *> vin;
+    const char *ain= NULL;
+    const char *sep = "\t";
+    const char *msep = "^";
+    const char *trim = "chr";       // set by -T; not used anywhere in this function
+    char *point = NULL;
+    int nchr = 1, nbeg = 2, nend = 3;
+    char skip_c = '#';
+    bool sub_e = false;
+    int skip_i = 0;
+    bool dump = false;
+
+    int c;
+    while ( (c = getopt (argc, argv, "Dlhdt:r:c:b:T:e:p:i:s:a:nB")) != -1) {
+        switch (c) {
+            case 'd':
+                debug=true; break;
+            case 'D':
+                dump=true; break;
+            case 'n':
+                echo=false; break;
+            case 'l':
+                sub_e=true; break;
+            case 'B':
+                build = true; break;
+            case 'h':
+                usage(stdout); exit(0);
+            case 't':
+                sep = optarg; break;
+            case 'p':
+                point = optarg; break;
+            case 's':
+                // a number means "skip N rows", anything else is the comment char
+                if (isdigit((unsigned char) *optarg))
+                    skip_i = atoi(optarg);
+                else
+                    skip_c = *optarg;
+                break;
+            case 'r':
+                msep = optarg; break;
+            case 'T':
+                trim = optarg; break;
+            case 'c':
+                nchr = atoi(optarg); break;
+            case 'i':
+                vin.push_back(optarg); break;
+            case 'a':
+                ain = optarg; break;
+            case 'b':
+                nbeg = atoi(optarg); break;
+            case 'e':
+                nend = atoi(optarg); break;
+            case '?':
+                // every option followed by ':' in the getopt string above
+                if (optopt && strchr("trcbTepisa", optopt))
+                    fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+                else if (isprint(optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr,
+                        "Unknown option character `\\x%x'.\n",
+                        optopt);
+                usage(stderr);
+                return 1;
+        }
+    }
+
+    if (!vin.size()) {
+        // with no arguments at all only the usage text is shown
+        if (argc>1) {
+            warn("Error: at least one -i index file is required\n");
+        }
+        usage(stderr);
+        exit(1);
+    }
+
+    if (! build && ! ain && !point && !dump) {
+        fail("Error: one of -D -B, -p or -a is required\n");
+    }
+
+    // NOTE(review): reported as an error but execution continues; kept as is
+    if ((!!build + !!ain + !!point + !!dump) > 1) {
+        warn("Error: only one of -B, -p or -a is allowed\n");
+    }
+
+    // command line columns are 1-based
+    --nchr; --nbeg; --nend;
+
+    if ( build ) {
+        for (size_t f_i=0;f_i<vin.size();++f_i) {
+            tidx x;
+            if (debug)
+                x.debug=true;
+            x.build(vin[f_i], sep, nchr, nbeg, nend, skip_i, skip_c, sub_e);
+        }
+    } else {
+        struct line l; meminit(l);
+        int nl = 0;
+        size_t f_i;
+        vector<tidx *>vmap; vmap.resize(vin.size());
+        for (f_i=0;f_i<vin.size();++f_i) {
+            vmap[f_i]=new tidx(vin[f_i]);
+            if (debug)
+                vmap[f_i]->debug=true;
+            if (dump) {
+                vmap[f_i]->dump(stdout);
+            }
+        }
+        if (dump) {
+            exit(0);
+        }
+
+        if ( point ) {
+            char * p = strchr(point, ':');
+            if (!p) {
+                fail("Error: -p requires chr:pos argument\n");
+            }
+            *p++ = '\0';
+            long pos = atol(p);
+            int found = 0;
+            for (f_i=0;f_i<vin.size();++f_i) {
+                string tmp = vmap[f_i]->lookup(point, pos, msep);
+                if (tmp.size()) {
+                    ++found;
+                    // the first hit is printed without its first character
+                    fputs(tmp.c_str()+(found==1),stdout); // echo
+                }
+            }
+            if (found) fputc('\n',stdout);
+            return !found;
+        } else {
+            FILE *fin = !strcmp(ain,"-") ? stdin : fopen(ain, "r");
+            if (!fin)
+                fail("error '%s':%s", ain,strerror(errno));
+
+            while (read_line(fin, l)>0) {
+                ++nl;
+
+                chomp_line(l);
+
+                // must happen before split(), which modifies the buffer
+                if (echo)
+                    fputs(l.s,stdout); // echo
+
+                vector<char *> v = split(l.s, sep); // todo, only get the keys desired, don't destroy
+
+                string res;
+                if (v.size() > nchr && v.size() > nbeg) {
+                    for (f_i=0;f_i<vin.size();++f_i) {
+                        string tmp = vmap[f_i]->lookup(v[nchr], atol(v[nbeg]), msep);
+                        if (tmp.size()) {
+                            res = res + tmp;
+                        }
+                    }
+                }
+                fputs(res.c_str(),stdout); // echo
+                fputc('\n',stdout);
+            }
+            free_line(&l);
+        }
+
+        for (f_i=0;f_i<vin.size();++f_i)
+            delete vmap[f_i];
+    }
+    return 0;
+}
+
+// Print the tidx help text to `f` (main() passes stdout for -h and stderr on
+// errors). The text is one string literal; the //-prefixed lines inside the
+// fputs() call are commented out and are not part of the output.
+void usage(FILE *f) {
+fputs(
+"Usage: tidx [options] -i IFILE [-i IFILE2...] -a AFILE\n"
+" or: tidx [options] -B -i IFILE\n"
+"\n"
+"Fragments and merges overlapping regions in an file with start-stop values.\n"
+"Creating a simple, fast, compressed index\n"
+"\n"
+"Also can load that index, and search AFILE for intersecting lines\n"
+"\n"
+"If the 'group' column is zero, no grouping will be used\n"
+"\n"
+"If just -b is present during a search, then only that column\n"
+"is searched.\n"
+"\n"
+"If both -b and -e are present during a search, then all regions\n"
+"that overlap will be returned.\n"
+"\n"
+"Options and (defaults):\n"
+"\n"
+"-i IFILE Text file to index (can specify more than one)\n"
+"-B Build index, don't annotate\n"
+"-a FILE Read text file and annotate\n"
+"-p CHR:POS Lookup a single point (slow!)\n"
+"-r STRING Annotation response separator (^)\n"
+"-t CHAR(s) Field separator (TAB)\n"
+"-c INT Group by (chromosome) column (1)\n"
+"-b INT Begin region column (2) (or position for annot)\n"
+"-e INT End region column (3)\n"
+"-s INT or CHAR Skip rows starting with CHAR (#), or skip INT rows\n"
+"-l Less than end, not less than or equal-to\n"
+"-n Don't echo input lines\n"
+//"-d Verbose debug output\n"
+//"-D Dump input table (debug)\n"
+//"-p CHR:POS Single point lookup (debug, slow)\n"
+"\n"
+ ,f);
+}
+
diff --git a/tidx/tidx.h b/tidx/tidx.h
new file mode 100644
index 0000000..15a24ee
--- /dev/null
+++ b/tidx/tidx.h
@@ -0,0 +1,43 @@
+#include <string>
+#include <vector>
+#include <sparsehash/dense_hash_map>
+
+// One indexed region: the range beg..end plus the byte offsets of the
+// text-file lines that belong to it. There is no constructor, so beg and end
+// are indeterminate until assigned.
+class annot {
+public:
+ int beg;
+ int end;
+ // file offsets (as produced by ftell) of the matching lines
+ std::vector<long> pos;
+};
+
+// Index over a delimited text file: maps a group name (e.g. chromosome) to a
+// vector of annot fragments.
+//
+// NOTE(review): the class has no destructor; fh starts as NULL in init() and
+// nothing in this declaration closes it - confirm whether a FILE opened by
+// the lookup code is meant to stay open for the life of the process.
+class tidx {
+ // handle on the indexed text file; NULL until something opens it
+ FILE *fh;
+ // common constructor body (debug off, fh NULL, hash map empty key set)
+ void init();
+public:
+ // when true, progress information is written to stderr
+ bool debug;
+ tidx() {init();};
+ // loads "<path>.tidx" right away; the bool result of read() is discarded
+ tidx(const char *path) {init(); read(path);};
+
+ // name of the text file this index belongs to
+ std::string path;
+ google::dense_hash_map<std::string,std::vector<annot> > map;
+
+ void dump(FILE *stream);
+ bool read(const char *path);
+ void build(const char *path, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c, bool sub_e);
+
+ // point lookup: file offsets, or the matching lines joined with msep
+ // (presumably - the bodies are not visible here; verify against tidx-lib.cpp)
+ const std::vector <long int> & lookup(const char *chr, int pos);
+ std::string lookup(const char *chr, int pos, const char *msep);
+
+// range lookup
+ std::vector <long int> lookup_r(const char *chr, int beg, int end);
+ std::string lookup_r(const char *chr, int beg, int end, const char *msep);
+
+// const char * return value
+ const char * lookup_c(const char *chr, int pos, const char *msep);
+ const char * lookup_cr(const char *chr, int beg, int end, const char *msep);
+};
+
+void chomp_line(struct line &l);
+
+// build, with no return value, for API use
+void tidx_build(const char *path, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c, bool sub_e);
+
diff --git a/tidx/utils.cpp b/tidx/utils.cpp
new file mode 120000
index 0000000..451eef0
--- /dev/null
+++ b/tidx/utils.cpp
@@ -0,0 +1 @@
+../utils.cpp
\ No newline at end of file
diff --git a/tidx/utils.h b/tidx/utils.h
new file mode 120000
index 0000000..6cd5d4f
--- /dev/null
+++ b/tidx/utils.h
@@ -0,0 +1 @@
+../utils.h
\ No newline at end of file
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..ceac58e
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,5 @@
+#include <string>
+#include <vector>
+
+std::string string_format(const std::string &fmt, ...);
+std::vector<char *> split(char* str, const char* delim);
diff --git a/varcall.cpp b/varcall.cpp
new file mode 100644
index 0000000..72ec202
--- /dev/null
+++ b/varcall.cpp
@@ -0,0 +1,1744 @@
+/*
+Copyright (c) 2012 Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <math.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <stdarg.h>
+
+#include <gsl/gsl_randist.h>
+
+#include <sys/stat.h>
+
+#include <string>
+#include <queue>
+#include <list>
+
+#include <google/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <google/dense_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include "tidx/tidx.h"
+
+#include "fastq-lib.h"
+
+#define SVNREV atoi(strchr("$Revision: 632 $", ':')+1)
+const char * VERSION = "0.9";
+
+#define MIN_READ_LEN 20
+#define DEFAULT_LOCII 1000000
+
+using namespace std;
+using namespace google;
+
+void usage(FILE *f);
+
+// #define DEBUG 1
+
+#define meminit(l) (memset(&l,0,sizeof(l)))
+#ifdef DEBUG
+ #define debug(s,...) fprintf(stderr,s,##__VA_ARGS__)
+#else
+ #define debug(s,...)
+#endif
+#undef warn
+// Count the message in the global `errs`, then print it to stderr. Written as
+// one parenthesised comma expression so that `if (x) warn(...);` guards both
+// the counter and the fprintf (the old two-statement form guarded only ++errs).
+#define warn(s,...) (++errs, fprintf(stderr,s,##__VA_ARGS__))
+#define die(s,...) (fprintf(stderr,s,##__VA_ARGS__), exit(1))
+#define stat_out(s,...) fprintf(stat_fout,s,##__VA_ARGS__)
+#define stdev(cnt, sum, ssq) sqrt((((double)cnt)*ssq-pow((double)sum,2)) / ((double)cnt*((double)cnt-1)))
+#define log10(x) (log(x)/log(10))
+
+double quantile(const std::vector<int> &vec, double p);
+double quantile(const std::vector<double> &vec, double p);
+double pnorm(double x);
+double qnorm(double x);
+int rand_round(double x);
+
+// basic utils
+std::vector<char *> split(char* str, const char* delim);
+std::string string_format(const std::string &fmt, ...);
+void to_upper(const std::string str);
+void rename_tmp(std::string f);
+
+int errs=0;
+extern int optind;
+
+// Per-locus noise sample: a depth plus three floating point statistics.
+// Both constructors now initialise every member (the default constructor
+// used to leave qnoise and mnqual indeterminate).
+class Noise {
+public:
+    Noise() : noise(0), qnoise(0), depth(0), mnqual(0) {}
+    // d = depth, n = noise, q = qnoise, mq = mnqual
+    Noise(int d, double n, double q, double mq) : noise(n), qnoise(q), depth(d), mnqual(mq) {}
+    double noise;
+    double qnoise;
+    int depth;
+    double mnqual;
+};
+
+double quantile_depth(const vector<Noise> &vec, double p);
+
+// Sort predicate: orders Noise entries by depth, deepest first.
+bool noisebydepth (const Noise &a, const Noise &b) {
+    return b.depth < a.depth;
+}
+
+// Plain record with public members and no constructor, so every field is
+// indeterminate until assigned.
+// NOTE(review): the meaning of f, b, q, m and p is not visible from this
+// declaration - confirm against the code that fills them in.
+class PileupEnt {
+public:
+ bool is_rev;
+ bool is_start;
+ bool f;
+ const char *b;
+ int q;
+ int m;
+ int p;
+};
+
+// Counters for one candidate call at a position. All numeric members start
+// at zero and `seqs` starts empty.
+class vcall {
+public:
+    vcall() {base='\0'; mn_qual=mq0=fwd=rev=qual=is_ref=qual_ssq=mq_sum=mq_ssq=tail_rev=tail_fwd=0;}
+    char base;
+    bool is_ref;
+    int qual, fwd, rev, mq0, mn_qual, qual_ssq, mq_sum, mq_ssq, tail_rev, tail_fwd;
+    vector <string> seqs;
+    // forward plus reverse count
+    int depth() const {return fwd+rev;}
+    // integer sqrt(mq_ssq / depth); 0 when depth is 0 (was a division by zero)
+    int mq_rms() const {return rms_of(mq_ssq);}
+    // integer sqrt(qual_ssq / depth); 0 when depth is 0 (was a division by zero)
+    int qual_rms() const {return rms_of(qual_ssq);}
+private:
+    // shared helper: same integer division and truncation as before
+    int rms_of(int ssq) const {
+        const int d = depth();
+        if (d == 0)
+            return 0;
+        return (int) sqrt((double) (ssq/d));
+    }
+};
+
+// A call selected for output: points at the underlying vcall and carries the
+// indel summary (max_idl_seq / max_idl_cnt) and padj (starts at 1).
+class vfinal {
+public:
+    vfinal(vcall &c) {max_idl_cnt=0; padj=1; pcall = &c;};
+    // member-wise copy; now returns *this (flowing off the end of a
+    // non-void function was undefined behaviour)
+    vfinal & operator=(vfinal const&x) {
+        max_idl_seq=x.max_idl_seq;
+        max_idl_cnt=x.max_idl_cnt;
+        padj=x.padj;
+        pcall=x.pcall;
+        return *this;
+    }
+    vcall *pcall;          // not owned
+    string max_idl_seq;
+    int max_idl_cnt;
+    double padj;
+    // true once an indel count has been recorded
+    bool is_indel() {return max_idl_cnt > 0;};
+};
+
+// Sort predicate: calls with the larger depth come first.
+bool hitolocall (const vcall &i,const vcall &j) {
+    return j.depth() < i.depth();
+}
+
+// Sort predicate: reference calls before non-reference calls; within the same
+// kind, larger depth first.
+bool sortreffirst (const vfinal &i,const vfinal &j) {
+    const bool i_ref = i.pcall->is_ref;
+    const bool j_ref = j.pcall->is_ref;
+    if (i_ref != j_ref)
+        return i_ref;
+    return i.pcall->depth() > j.pcall->depth();
+}
+
+// One read: its mapping quality (0 by default) and its sequence text
+// (empty by default).
+class Read {
+public:
+    int MapQ;
+    string Seq;
+    Read() : MapQ(0) {}
+};
+
+// Read bookkeeping shared between pileup lines: a running total of read
+// lengths plus two read containers.
+class PileupReads {
+public:
+    // TotReadLen / ReadBin.size(), or MIN_READ_LEN while ReadBin is empty.
+    // NOTE(review): the division is carried out on integers before the
+    // conversion to double, so the "mean" is truncated - confirm intended.
+    double MeanReadLen() {
+        if (ReadBin.size())
+            return TotReadLen/ReadBin.size();
+        return MIN_READ_LEN;
+    }
+    int TotReadLen;
+    deque<Read> ReadBin;
+    list<Read> ReadList;
+    PileupReads() : TotReadLen(0) {}
+};
+
+// Parsed form of one pileup line: position, reference base, depth, the
+// per-base calls and counters for what was filtered out.
+class PileupSummary {
+public:
+    string Chr;
+    int Pos;
+    char Base;
+    int Depth;
+    int TotQual;
+    int NumReads;
+    vector<vcall> Calls;
+
+    int SkipN;
+    int SkipDupReads;
+    int SkipMinMapq;
+    int SkipMinQual;
+    int MaxDepthByPos;
+    int RepeatCount;
+    char RepeatBase;
+
+    // parse one tab-separated pileup line (defined out of line)
+    PileupSummary(char *line, PileupReads &reads);
+    // "no data" placeholder: Pos is -1 and Base is '\0' as before; the
+    // remaining scalars are now zeroed instead of being left indeterminate
+    PileupSummary() :
+        Pos(-1), Base('\0'), Depth(0), TotQual(0), NumReads(0),
+        SkipN(0), SkipDupReads(0), SkipMinMapq(0), SkipMinQual(0),
+        MaxDepthByPos(0), RepeatCount(0), RepeatBase('\0') {}
+};
+
+// Abstract consumer of pileup lines: Parse() turns a raw line into a
+// PileupSummary and hands it to Visit(); Finish() is called once after the
+// last line.
+class PileupVisitor {
+    public:
+    char InputType;        // '\0' until set ('B' or 'P' by parse_bams)
+
+    string AnnotFile;      // path to file
+    tidx AnnotDex;         // start/stop index file
+    char AnnotType;        // b (bed) or g (gtf - preferred)
+
+    PileupReads Reads;
+    // AnnotType is now initialised as well (it used to be indeterminate)
+    PileupVisitor() {InputType ='\0'; AnnotType ='\0';}
+    PileupVisitor(const char *a) {InputType ='\0'; AnnotType ='\0'; LoadIndex(a);}
+    // the class is used polymorphically, so destruction through a base
+    // pointer must reach the derived destructor
+    virtual ~PileupVisitor() {}
+    void Parse(char *dat) {PileupSummary p(dat, Reads); Visit(p);};
+    void LoadIndex(const char *a);
+    virtual void Visit(PileupSummary &dat)=0;
+    virtual void Finish()=0;
+};
+
+// Visitor used in statistics mode: collects per-locus Noise samples and
+// running totals. Finish() has nothing to do.
+class VarStatVisitor : public PileupVisitor {
+    public:
+    VarStatVisitor() : PileupVisitor(), tot_depth(0), tot_locii(0), num_reads(0) {}
+
+    void Visit(PileupSummary &dat);
+    void Finish() {}
+
+    public:
+    double tot_depth;
+    int tot_locii;
+    int num_reads;
+    vector<Noise> stats;
+    vector<Noise> ins_stats;
+    vector<Noise> del_stats;
+};
+
+// Visitor used in variant-calling mode. Keeps a private window of recent
+// summaries (size limit WinMax, set by main) and counts the calls it made.
+class VarCallVisitor : public PileupVisitor {
+
+    deque<PileupSummary> Win;
+    void VisitX(PileupSummary &dat);
+
+    public:
+    int WinMax;
+    VarCallVisitor() :
+        PileupVisitor(), WinMax(0), SkippedDepth(0), Locii(0), Hets(0), Homs(0) {}
+
+    void Visit(PileupSummary &dat);
+    void Finish();
+
+    int SkippedDepth;
+    int Locii;
+    int Hets;
+    int Homs;
+};
+
+// True when `file` can be stat()ed and its size is greater than zero.
+bool hasdata(const string &file) {
+    struct stat info;
+    const bool reachable = (stat(file.c_str(), &info) == 0);
+    return reachable && info.st_size > 0;
+}
+
+
+// Tunable thresholds shared by the whole program; main() overrides several of
+// them from command line options or from a stats file.
+int minsampdepth=20;
+double pct_depth=0;
+double pct_qdepth=0;
+double global_error_rate=0;
+// derived from global_error_rate in main()
+double max_phred;
+// -1 means "not set yet"; main() replaces it with DEFAULT_LOCII
+int total_locii=-1;
+double pct_balance=0;
+// -x CHR:POS debugging target
+char *debug_xchr=NULL;
+int debug_xpos=0;
+int min_depth=1;
+int min_mapq=0;
+int min_qual=3;
+int repeat_filter=7;
+double artifact_filter=1;
+int min_adepth=2;
+// NOTE(review): declared int but initialised with .6, so the stored value is
+// 0 - confirm whether this was meant to be a double
+int read_tail_pct=.6;
+int read_tail_len=4;
+int min_idepth=3;
+int no_baq=0;
+double zygosity=.5; // set to .1 for 1 10% admixture, or even .05 for het/admix
+
+void parse_bams(PileupVisitor &v, int in_n, char **in, const char *ref);
+
+FILE *noise_f=NULL, *var_f = NULL, *varsum_f = NULL, *tgt_f = NULL, *tgtsum_f = NULL, *vcf_f = NULL, *eav_f=NULL;
+
+double alpha=.05;
+int phred=33;
+double phi(double x);
+
+// fopen() wrapper that never returns NULL: on failure the reason is reported
+// through warn() and the process exits with status 1.
+FILE *openordie(const char *path, const char *mode) {
+    FILE *handle = fopen(path, mode);
+    if (handle) {
+        return handle;
+    }
+    warn("Can't open-%s %s: %s\n", mode, path, strerror(errno));
+    exit(1);
+}
+
+// varcall entry point.
+//
+// Exactly one of -s (statistics only) or -v (variant calling) must be given.
+// Inputs are the remaining arguments (bam files or a single pileup); with no
+// input argument "-" (stdin) is used. With -o PREFIX the outputs are written
+// to PREFIX.*.tmp files that are renamed once calling has finished; otherwise
+// variants go to stdout and the summary to stderr. -S FILE reads defaults
+// from a previously written stats file.
+//
+// Fixes: the getopt_long() result is kept in an int (an unsigned plain char
+// never compares equal to -1), "-x chr:" with an empty position is rejected
+// (the old check tested a pointer that could not be NULL), the -S file and
+// its line buffer are released, a noise file opened for -o is closed before
+// -N replaces it, and ins/del noise vectors are only indexed inside their
+// bounds.
+int main(int argc, char **argv) {
+    int c;
+    const char *noiseout=NULL;
+    const char *ref=NULL;
+    optind = 0;
+    int umindepth=0;
+    int uminadepth=0;
+    int uminidepth=0;
+    double upctqdepth=0;
+    int do_stats=0;
+    int do_varcall=0;
+
+    char *out_prefix = NULL;
+    char *target_annot = NULL;
+    char *read_stats = NULL;
+
+    // NOTE(review): 'A' is handled in the switch but is not part of the
+    // option string, so target_annot can currently never be set
+    while ( (c = getopt_long(argc, argv, "?svVBhe:m:N:x:f:p:a:g:q:Q:i:o:D:R:b:L:S:",NULL,NULL)) != -1) {
+        switch (c) {
+            case 'h': usage(stdout); return 0;
+            case 'm': umindepth=atoi(optarg); break;
+            case 'q': min_qual=atoi(optarg); break;
+            case 'o': out_prefix=optarg; break;
+            case 'Q': min_mapq=atoi(optarg); break;
+            case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); exit(0); break;
+            case 'R': repeat_filter=atoi(optarg); break;
+            case 'A': target_annot=optarg; break;
+            case 'a': uminadepth=atoi(optarg);break;
+            case 'D': artifact_filter=atof(optarg);break;
+            case 'i': uminidepth=atoi(optarg);break;
+            case 'x': {
+                debug_xchr=optarg;
+                char *p=strrchr(debug_xchr, ':');
+                if (!p) die("Invalid param for -x");
+                *p='\0';
+                ++p;
+                if (!*p) die("Invalid param for -x, need pos");
+                debug_xpos=atoi(p);
+                break;
+            }
+            case 'b': pct_balance=atof(optarg)/100.0; break;
+            case 'B': no_baq=1; break;
+            case 'p': upctqdepth=atof(optarg); break;
+            case 'e': alpha=atof(optarg); break;
+            case 'g': global_error_rate=atof(optarg); break;
+            case 'L': total_locii=atoi(optarg); break;
+            case 'f': ref=optarg; break;
+            case 'N': noiseout=optarg; break;
+            case 's': do_stats=1; break;
+            case 'S': read_stats=optarg; break;
+            case 'v': do_varcall=1; break;
+            case '?':
+                if (!optopt) {
+                    usage(stdout); return 0;
+                } else if (optopt && strchr("ox", optopt))
+                    fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+                else if (isprint(optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
+                usage(stderr);
+                return 1;
+        }
+    }
+
+
+    // neither mode or both modes requested
+    if ((!do_stats && !do_varcall) || (do_stats && do_varcall)) {
+        warn("Specify -s for stats only, or -v to do variant calling\n\n");
+        usage(stderr);
+        return 1;
+    }
+
+    if (out_prefix) {
+        if (!do_varcall) {
+            warn("Specify -o with -v only\n\n");
+            usage(stderr);
+            return 1;
+        }
+
+        var_f = openordie(string_format("%s.var.tmp", out_prefix).c_str(), "w");
+        vcf_f = openordie(string_format("%s.vcf.tmp", out_prefix).c_str(), "w");
+        eav_f = openordie(string_format("%s.eav.tmp", out_prefix).c_str(), "w");
+        noise_f = openordie(string_format("%s.noise.tmp", out_prefix).c_str(), "w");
+        varsum_f = openordie(string_format("%s.varsum.tmp", out_prefix).c_str(), "w");
+        if (target_annot) {
+            tgt_f = openordie(string_format("%s.tgt.tmp", out_prefix).c_str(), "w");
+            tgtsum_f = openordie(string_format("%s.tgtsum.tmp", out_prefix).c_str(), "w");
+        }
+    } else {
+        var_f = stdout;
+        varsum_f = stderr;
+    }
+
+    if (umindepth > minsampdepth) {
+        minsampdepth=umindepth;
+    }
+
+    if (noiseout) {
+        // -N wins over the PREFIX.noise.tmp stream opened above
+        if (noise_f)
+            fclose(noise_f);
+        noise_f = fopen(noiseout, "w");
+        if (!noise_f) {
+            warn("Can't write %s: %s\n", noiseout, strerror(errno));
+            exit(1);
+        }
+    }
+
+    // set argv to '-' if stdin
+    const char *stdv[3] = {argv[0],"-",NULL};
+    if (!argv[optind]) {
+        argc=2;
+        argv = (char **) stdv;
+        optind=1;
+    }
+
+    char **in=&argv[optind];
+    int in_n = argc-optind;
+
+    // not really random
+    srand(1);
+
+    // NOTE(review): computed before -S may fill in global_error_rate below
+    max_phred = -log10(global_error_rate)*10;
+
+    if (do_stats) {
+        FILE *stat_fout=stdout; // stats to stdout
+
+        if (do_varcall) // unless varcalling at the same time
+            stat_fout=stderr;
+
+        VarStatVisitor vstat;
+
+        parse_bams(vstat, in_n, in, ref);
+
+        stat_out("version\tvarcall-%s.%d\n", VERSION, SVNREV);
+        stat_out("min depth\t%d\n", minsampdepth);
+        stat_out("alpha\t%f\n", alpha);
+
+        if (vstat.stats.size()) {
+            // sort by depth descending
+            sort(vstat.stats.begin(), vstat.stats.end(), noisebydepth);
+
+            // flip 3 and 1 because sorted in descending order for sampling (above)
+            double depth_q3=quantile_depth(vstat.stats, .25);
+            double depth_q2=quantile_depth(vstat.stats, .50);
+            double depth_q1=quantile_depth(vstat.stats, .75);
+            double depth_qx=quantile_depth(vstat.stats, .95);
+
+            // number of locii to compute error rate
+            int ncnt=min(100000,vstat.stats.size());
+
+            int i;
+            double nsum=0, nssq=0, dsum=0, dmin=vstat.stats[0].depth, qnsum=0, qnssq=0, qualsum=0;
+
+            double ins_nsum=0, ins_nssq=0, del_nsum=0, del_nssq=0;
+            for (i=0;i<ncnt;++i) {
+                if (vstat.stats[i].depth < depth_q1) {
+                    continue;
+                }
+                nsum+=vstat.stats[i].noise;
+                nssq+=vstat.stats[i].noise*vstat.stats[i].noise;
+                dsum+=vstat.stats[i].depth;
+                qnsum+=vstat.stats[i].qnoise;
+                qnssq+=vstat.stats[i].qnoise*vstat.stats[i].qnoise;
+                qualsum+=vstat.stats[i].mnqual;
+                if (vstat.stats[i].depth < dmin) dmin = vstat.stats[i].depth;
+                // the indel vectors are not guaranteed here to be as long as stats
+                if ((size_t) i < vstat.ins_stats.size()) {
+                    ins_nsum+=vstat.ins_stats[i].noise;
+                    ins_nssq+=vstat.ins_stats[i].noise*vstat.ins_stats[i].noise;
+                }
+                if ((size_t) i < vstat.del_stats.size()) {
+                    del_nsum+=vstat.del_stats[i].noise;
+                    del_nssq+=vstat.del_stats[i].noise*vstat.del_stats[i].noise;
+                }
+            }
+
+            double noise_mean =nsum/ncnt;
+            double noise_dev = stdev(ncnt, nsum, nssq);
+            double qnoise_mean =qnsum/ncnt;
+            double qnoise_dev = stdev(ncnt, qnsum, qnssq);
+            double qual_mean = qualsum/ncnt;
+            double ins_noise_mean =ins_nsum/ncnt;
+            double ins_noise_dev = stdev(ncnt, ins_nsum, ins_nssq);
+            double del_noise_mean =del_nsum/ncnt;
+            double del_noise_dev = stdev(ncnt, del_nsum, del_nssq);
+
+            stat_out("qual mean\t%.4f\n", qual_mean);
+            stat_out("noise mean\t%.6f\n", noise_mean);
+            stat_out("noise dev\t%.6f\n", noise_dev);
+            stat_out("qnoise mean\t%.6f\n", qnoise_mean);
+            stat_out("qnoise dev\t%.6f\n", qnoise_dev);
+            stat_out("ins freq\t%.6f\n", ins_noise_mean);
+            stat_out("ins freq dev\t%.6f\n", ins_noise_dev);
+            stat_out("del freq\t%.6f\n", del_noise_mean);
+            stat_out("del freq dev\t%.6f\n", del_noise_dev);
+
+            if (qnoise_mean >= noise_mean ) {
+                stat_out("error\tpoor quality estimates\n");
+            }
+
+            stat_out("noise depth mean\t%.4f\n", dsum/ncnt);
+            stat_out("noise depth min\t%.4f\n", dmin);
+            stat_out("noise cnt\t%d\n", ncnt);
+
+            stat_out("depth q1\t%.4f\n", depth_q1);
+            stat_out("depth median\t%.4f\n", depth_q2);
+            stat_out("depth q3\t%.4f\n", depth_q3);
+
+            dsum=0;
+            for (i=0;i<vstat.stats.size();++i) {
+                dsum+=vstat.stats[i].depth;
+            }
+
+            int locii_gtmin=0;
+            for (i=0;i<vstat.stats.size();++i) {
+                if (vstat.stats[i].depth >= min_depth) {
+                    ++locii_gtmin;
+                }
+            }
+            stat_out("locii >= min depth\t%d\n", locii_gtmin);
+            stat_out("locii\t%d\n", vstat.tot_locii);
+
+            double stdevfrommean=-qnorm((alpha/locii_gtmin)/2);
+            stat_out("qnorm adj\t%f\n", stdevfrommean);
+
+            pct_qdepth=qnoise_mean+qnoise_dev*stdevfrommean;
+            stat_out("min pct qual\t%.4f\n", 100*pct_qdepth);
+        }
+    }
+
+    if (read_stats){
+        FILE * f = fopen(read_stats, "r");
+        if (!f) {
+            warn("File %s does not exist, quitting\n", read_stats);
+            exit(1);
+        }
+        line l; meminit(l);
+        char *val;
+        // each line is "name<TAB>value"; command line values take precedence
+        while(read_line(f, l)>0) {
+            if ((val=strchr(l.s, '\t'))) {
+                *val='\0'; ++val;
+                if (!strcasecmp(l.s, "min depth")) {
+                    if (umindepth && umindepth > atoi(val)) {
+                        fprintf(varsum_f,"warning\tsampling depth was less than variation depth\n");
+                    }
+                    if (!umindepth) umindepth=atoi(val);
+                } else if (!strcasecmp(l.s, "min pct qual")) {
+                    if (upctqdepth<=0) upctqdepth=atof(val);
+                } else if (!strcasecmp(l.s, "noise mean")) {
+                    if (global_error_rate<=0) global_error_rate=atof(val);
+                } else if (!strcasecmp(l.s, "locii >= min depth")) {
+                    if (total_locii<0) total_locii=atoi(val);
+                } else if (!strcasecmp(l.s, "alpha")) {
+                    if (alpha<=0) alpha=atof(val);
+                }
+            }
+        }
+        free_line(&l);
+        fclose(f);
+    }
+
+    if (total_locii<0) total_locii=DEFAULT_LOCII;
+    if (total_locii==0) total_locii=1; // no adjustment
+
+    if (eav_f) {
+        fprintf(eav_f,"chr\tpos\tref\tdepth\tnum_states\ttop_consensus\ttop_freq\tvar_base\tvar_depth\tvar_qual\tvar_strands\tforward_strands\treverse_strands\t%cval\n",total_locii>1?'e':'p');
+    }
+
+    if (do_varcall) {
+        if (umindepth) min_depth=umindepth;
+        if (upctqdepth > 0) pct_qdepth=(double)upctqdepth/100;
+        if (uminadepth) min_adepth=uminadepth;
+        if (uminidepth) min_idepth=uminidepth;
+
+        if (!min_depth || (!pct_depth && !pct_qdepth)) {
+            fprintf(varsum_f,"warning\toutputting all variations, no minimum depths specified\n");
+        }
+
+        fprintf(varsum_f,"version\tvarcall-%s.%d\n", VERSION, SVNREV);
+        fprintf(varsum_f,"min depth\t%d\n", min_depth);
+        fprintf(varsum_f,"min call depth\t%d\n", min_adepth);
+        fprintf(varsum_f,"alpha\t%f\n", alpha);
+        fprintf(varsum_f,"min pct qual\t%d\n", (int)(100*pct_qdepth));
+
+        fprintf(varsum_f,"min balance\t%d\n", (int)(100*pct_balance));
+        fprintf(varsum_f,"artifact filter\t%f\n", artifact_filter);
+        fprintf(varsum_f,"min qual\t%d\n", min_qual);
+        fprintf(varsum_f,"min map qual\t%d\n", min_mapq);
+        fprintf(varsum_f,"error rate\t%f\n", global_error_rate);
+        fprintf(varsum_f,"locii used for adjustment\t%d\n", total_locii);
+
+        VarCallVisitor vcall;
+
+        if (repeat_filter > 0) {
+            fprintf(varsum_f,"homopolymer filter\t%d\n", repeat_filter);
+            vcall.WinMax=repeat_filter+repeat_filter+3;
+        } else {
+            vcall.WinMax=5;
+        }
+
+        if (vcf_f) {
+            // print VCF header
+            fprintf(vcf_f, "%s\n", "##fileformat=VCFv4.1");
+        }
+
+        parse_bams(vcall, in_n, in, ref);
+
+        if (vcall.InputType == 'B') {
+            fprintf(varsum_f,"baq correct\t%s\n", (no_baq?"no":"yes"));
+        }
+        fprintf(varsum_f,"locii\t%d\n", vcall.Locii);
+        fprintf(varsum_f,"hom calls\t%d\n", vcall.Homs);
+        fprintf(varsum_f,"het calls\t%d\n", vcall.Hets);
+        fprintf(varsum_f,"locii below depth\t%d\n", vcall.SkippedDepth);
+
+        if (out_prefix) {
+            // close everything first, then drop the .tmp suffixes
+            fclose(var_f);
+            fclose(vcf_f);
+            fclose(eav_f);
+            fclose(noise_f);
+            fclose(varsum_f);
+            if (target_annot) {
+                fclose(tgt_f);
+                fclose(tgtsum_f);
+            }
+            rename_tmp(string_format("%s.var.tmp", out_prefix));
+            rename_tmp(string_format("%s.vcf.tmp", out_prefix));
+            rename_tmp(string_format("%s.eav.tmp", out_prefix));
+            rename_tmp(string_format("%s.noise.tmp", out_prefix));
+            rename_tmp(string_format("%s.varsum.tmp", out_prefix));
+            if (target_annot) {
+                rename_tmp(string_format("%s.tgt.tmp", out_prefix));
+                rename_tmp(string_format("%s.tgtsum.tmp", out_prefix));
+            }
+        }
+    }
+    return 0;
+}
+
+// Rename "<name>.tmp" to "<name>".
+//
+// The last ".tmp" in `f` is removed to form the new name. A path without
+// ".tmp" is left alone; previously the always-true `pos >= 0` test let
+// replace() run with npos, which throws std::out_of_range.
+void rename_tmp(std::string f) {
+    const size_t pos = f.rfind(".tmp");
+    if (pos == std::string::npos)
+        return;
+    std::string notmp = f;
+    notmp.erase(pos, 4);
+    rename(f.c_str(),notmp.c_str());
+}
+
+// normal distribution
+// Approximate quantile of the standard normal distribution using a rational
+// function of t = sqrt(log(1/p^2)). Returns exactly 0 for q == .5.
+double qnorm(double q) {
+    if (q == .5) {
+        return 0;
+    }
+
+    // work on the complementary probability
+    const double upper = 1.0 - q;
+
+    // fold into the lower half
+    double p;
+    if (upper > 0.0 && upper < 0.5) {
+        p = upper;
+    } else {
+        p = 1.0 - upper;
+    }
+    const double t = sqrt(log(1.0 / pow(p, 2.0)));
+
+    // numerator coefficients
+    const double c0 = 2.515517, c1 = 0.802853, c2 = 0.010328;
+    // denominator coefficients
+    const double d1 = 1.432788, d2 = 0.189269, d3 = 0.001308;
+
+    const double numer = c0 + c1 * t + c2 * pow(t, 2.0);
+    const double denom = 1.0 + d1 * t + d2 * pow(t, 2.0) + d3 * pow(t, 3.0);
+    const double x = t - numer / denom;
+
+    // values folded from the other half get their sign flipped
+    return (upper > .5) ? x * -1.0 : x;
+}
+
+// Approximate cumulative distribution function of the standard normal, built
+// on the polynomial erf approximation labelled "A&S formula 7.1.26".
+double pnorm(double x)
+{
+    // polynomial coefficients, highest order first (a5 .. a1)
+    const double coef[5] = { 1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592 };
+    const double p = 0.3275911;
+
+    // remember the sign, then work with |x|/sqrt(2)
+    const int sign = (x < 0) ? -1 : 1;
+    x = fabs(x)/sqrt(2.0);
+
+    // A&S formula 7.1.26, evaluated by Horner's rule
+    const double t = 1.0/(1.0 + p*x);
+    double poly = coef[0];
+    for (int k = 1; k < 5; ++k) {
+        poly = poly*t + coef[k];
+    }
+    const double y = 1.0 - poly*t*exp(-x*x);
+
+    return 0.5*(1.0 + sign*y);
+}
+
+// Feed every pileup line of the inputs to visitor `v`, then call v.Finish().
+//
+//   in_n / in  input file names; either all end in ".bam" or there is exactly
+//              one pileup ("-" for stdin, ".gz" is read through gunzip)
+//   ref        reference fasta, required for bam input (a missing .fai is
+//              created with `samtools faidx`)
+//
+// Bam input is piped through `samtools mpileup`. v.InputType is set to 'B'
+// or 'P'. Any fatal condition (no input, mixed input types, unreadable file,
+// empty pileup) is reported through warn() followed by exit(1).
+//
+// Changes: the line buffer is released, a non-zero exit status of the helper
+// command is reported instead of being dropped, and the inner loop no longer
+// shadows `i`.
+void parse_bams(PileupVisitor &v, int in_n, char **in, const char *ref) {
+
+    if (!in_n) {
+        warn("No input files, quitting\n");
+        exit(1);
+    }
+
+    int i, bam_n=0;
+    for (i=0;i<in_n;++i) {
+        if (!strcmp(fext(in[i]), ".bam")) {
+            ++bam_n;
+        }
+    }
+
+    if (bam_n != in_n) {
+        if (bam_n > 0) {
+            warn("Can't mix bams and other input files\n");
+            exit(1);
+        } else {
+            if (in_n > 1) {
+                warn("Can't handle multiple pileups... TODO\n");
+                exit(1);
+            } else {
+                warn("input\t%d pileup\n", in_n);
+                v.InputType='P';
+            }
+        }
+    } else {
+        warn("input\t%d bam\n", bam_n);
+        v.InputType='B';
+    }
+
+    int is_popen = 0;
+    FILE *fin;
+
+    if (bam_n) {
+        if (!ref) {
+            warn("Need a reference file (-f) parameter, try -h for help\n");
+            exit(1);
+        }
+
+        if (!hasdata(string(ref)+".fai")) {
+            int ret=system(string_format("samtools faidx '%s'", ref).c_str());
+            if (ret) {
+                warn("Need a %s.fai file, run samtools faidx\n", ref);
+                exit(1);
+            }
+        }
+
+
+        const char *nobaq = no_baq ? "-B" : "";
+
+        string mpil_cmd = string_format("samtools mpileup -Q 0 -d 100000 %s -f '%s'", nobaq, ref);
+
+        // NOTE(review): names are wrapped in single quotes only; a name that
+        // itself contains a single quote would break the command line
+        for (i=0;i<in_n;++i) {
+            mpil_cmd += " '";
+            mpil_cmd += in[i];
+            mpil_cmd += "' ";
+        }
+
+        warn("command\t%s\n", mpil_cmd.c_str());
+
+        fin = popen(mpil_cmd.c_str(), "r");
+        if (!fin)
+            exit(1);
+
+        is_popen = 1;
+    } else {
+        if (!strcmp(in[0], "-")) {
+            fin=stdin;
+        } else {
+            if (!strcmp(fext(in[0]), ".gz")) {
+                string gunz = string_format("gunzip -c '%s'", in[0]);
+                fin = popen(gunz.c_str(), "r");
+                is_popen = 1;
+            } else {
+                fin = fopen(in[0], "r");
+            }
+            if (!fin) {
+                warn("%s: %s", in[0], strerror(errno));
+                exit(1);
+            }
+        }
+    }
+
+    line l; meminit(l);
+    int cnt=0;
+    if (fin) {
+        while(read_line(fin, l)>0) {
+            // chr 2 G 6 ^9,^+.^*,^2,^&.^&, &.'&*- 9+*2&& 166,552,643,201,299,321
+            v.Parse(l.s);
+            ++cnt;
+        }
+        v.Finish();
+
+        if (is_popen) {
+            // pclose hands back the exit status of samtools/gunzip
+            int status = pclose(fin);
+            if (status != 0) {
+                warn("warning\tinput command exited with status %d\n", status);
+            }
+        } else {
+            fclose(fin);
+        }
+    }
+    free_line(&l);
+
+    if (cnt == 0) {
+        warn("No data in pileup, quitting\n");
+        exit(1);
+    }
+}
+
+#define T_A 0
+#define T_C 1
+#define T_G 2
+#define T_T 3
+#define T_SDEL 4
+#define T_NDEL 5
+#define T_INS 6
+#define T_N 7
+#define b2i(c) ((c)=='A'?0:(c)=='a'?0:(c)=='C'?1:(c)=='c'?1:(c)=='G'?2:(c)=='g'?2:(c)=='T'?3:(c)=='t'?3:(c)=='*'?4:(c)=='-'?5:(c)=='+'?6:7)
+// Inverse of b2i(): table index -> base/event character, '?' for anything
+// outside 0..6. The argument is parenthesised so that expressions such as
+// i2b(x & 3) compare the whole expression rather than just its last operand.
+#define i2b(i) ((i)==0?'A':(i)==1?'C':(i)==2?'G':(i)==3?'T':(i)==4?'*':(i)==5?'-':(i)==6?'+':'?')
+
+// Sort predicate for ints: larger values first.
+bool hitoloint (int i,int j) {
+    return j < i;
+}
+
+int track_readlen[10000];
+
+
+// Report an unparsable read-base column and terminate; shared by the parse checks below.
+static void pileup_parse_failed(const char *bases) {
+    warn("Failed to parse pileup %s\n", bases);
+    exit(1);
+}
+
+// Build the per-locus summary from one tab-separated pileup line.
+//
+// line : mutable pileup record (chr, pos, ref base, depth, read bases, base
+//        qualities); it is tokenised in place by split().
+// rds  : reads currently spanning the locus (ReadList) and the bin of recently
+//        finished reads (ReadBin / TotReadLen). Both are updated as reads start
+//        ('^') and end ('$') on this line.
+//
+// On return Calls holds one slot per call kind (see b2i), Depth and TotQual are
+// recomputed from the calls that passed the filters (slots 0..4 only), and the
+// Skip* counters say how many read bases each filter removed.
+//
+// Exits the process when the line has fewer than 6 fields, fewer quality
+// characters than the stated depth, or read-base text that cannot be parsed.
+PileupSummary::PileupSummary(char *line, PileupReads &rds) {
+
+    vector<char *> d=split(line, "\t");
+
+    if (d.size() < 6) {
+        warn("Can't read pileup : %d fields, need 6 columns\n", (int) d.size());
+        exit(1);
+    }
+
+    const char * p_qual=d[5];
+
+    Chr=d[0];
+    Pos=atoi(d[1]);
+    Base=*(d[2]);
+    Depth = atoi(d[3]);
+    SkipDupReads = 0;
+    SkipN = 0;
+    SkipMinQual = 0;
+    SkipMinMapq = 0;
+    MaxDepthByPos = 0;
+    RepeatCount = 0;
+    RepeatBase = '\0';
+    NumReads = 0;
+
+    // one quality character is consumed per read below; a short column would be read past its end
+    if ((int) strlen(p_qual) < Depth) {
+        warn("Can't read pileup : %d quality values, depth is %d\n", (int) strlen(p_qual), Depth);
+        exit(1);
+    }
+
+    int i;
+    vector<int> depthbypos;     // depthbypos[k]: reads at this locus that are on their k-th base (1-based)
+
+    const char *cur_p = d[4];
+
+    list<Read>::iterator read_i = rds.ReadList.begin();
+
+    int eor=0;                  // reads that ended ('$') on this line
+    for (i=0;i<Depth;++i) {
+        bool sor=0;             // this column starts a new read
+
+        if (*cur_p == '^') {
+            sor=1;
+            ++cur_p;
+            Read x;
+            x.MapQ = *cur_p-phred;
+            if (*cur_p)
+                ++cur_p;        // never step over the terminator of a truncated column
+            if (read_i != rds.ReadList.end()) {
+                ++read_i;
+            }
+            read_i=rds.ReadList.insert(read_i,x);
+        }
+
+        if (!*cur_p)
+            pileup_parse_failed(d[4]);  // fewer read bases than the stated depth
+
+        if (read_i == rds.ReadList.end()) {
+            warn("warning\tread start without '^', partial pileup: '%s'\n", cur_p);
+            Read x;
+            x.MapQ = -1;
+            read_i=rds.ReadList.insert(read_i,x);
+        }
+
+        int pia = read_i->Seq.length()+1;   // 1-based position of this base within its read
+        if (pia >= depthbypos.size()) {
+            depthbypos.resize(pia+1);
+        }
+        depthbypos[pia]++;
+
+
+        if (sor)
+            ++NumReads;
+
+        char q = p_qual[i]-phred;   // qual char
+        char mq = read_i->MapQ;
+        char o = *cur_p;            // orig call
+        char c = toupper(o);        // uppercase/ref
+        bool is_ref = 0;
+
+        if (o == '.' || o == ',') {
+            c = Base;               // ref instead
+            is_ref = 1;
+        }
+
+        if (o == '>' || o == '<') {
+            c = 'N';                // no call
+            is_ref = 1;
+        }
+
+        bool skip = 0;
+
+        // probably should not be adding anything here... but the old code added 1 and floored... new code adds .5 and rounds... which is comparable
+        // really.. should just be adding zero, the reason the old code had it was because of a lack of max()
+        if (c == 'N') {
+            ++SkipN;
+            skip=1;
+        } else if (artifact_filter > 0 && (depthbypos[pia] > max(1,rand_round(0.5+artifact_filter * (Depth/rds.MeanReadLen()))))) {
+            ++SkipDupReads;
+            skip=1;
+        } else if (mq < min_mapq) {
+            ++SkipMinMapq;
+            skip=1;
+        } else if (q < min_qual) {
+            ++SkipMinQual;
+            skip=1;
+        } else {
+            int j = b2i(c);
+            if (j >= Calls.size()) {
+                // grow the call table and label every new slot with its display character
+                int was = Calls.size();
+                Calls.resize(j+1);
+                int t; for (t=was;t<=j;++t) {
+                    Calls[t].base=i2b(t);
+                }
+            }
+            if (is_ref)
+                Calls[j].is_ref = 1;
+
+            // lower-case calls and ',' are counted as reverse strand, everything else as forward
+            if ( o == ',' || o == 'a' || o == 'c' || o == 't' || o == 'g' ) {
+                ++Calls[j].rev;
+            } else if ( c != 'N' ) {
+                ++Calls[j].fwd;
+            }
+
+            Calls[j].qual+=q;
+            Calls[j].mn_qual+=min(mq,q);
+            Calls[j].mq_ssq+=mq*mq;
+            Calls[j].mq_sum+=mq;
+            Calls[j].qual_ssq+=q*q;
+/*
+            if (pia <= read_tail_len || (rds.MeanReadLen()-pia) <= read_tail_len) {
+                if ( o == ',' || o == 'a' || o == 'c' || o == 't' || o == 'g' ) {
+                    ++Calls[j].tail_rev;
+                } else {
+                    ++Calls[j].tail_fwd;
+                }
+            }
+*/
+
+            if (vcf_f) {
+                if (mq == 0)
+                    Calls[j].mq0++;
+            }
+        }
+
+        if (c == '-' || c == '+') {
+            warn("invalid pileup, at '%s', indel not attached to read?\n", cur_p);
+        } else {
+            if (c != '*')
+                read_i->Seq += c;
+            ++cur_p;
+        }
+
+        if (*cur_p == '+' || *cur_p == '-') {
+            // indel attached to the base just read: sign, decimal length, then that many bases
+            c = *cur_p;
+            char *end_p;
+            int len = strtol(++cur_p, &end_p, 10);
+            if (len < 0 || strlen(end_p) < (size_t) len)
+                pileup_parse_failed(d[4]);  // indel text runs past the end of the column
+            string ins_seq(end_p, len);
+            // uppercase in place; to_upper() takes its argument by value, so it cannot be relied on to modify ins_seq
+            for (string::size_type k=0;k<ins_seq.size();++k)
+                ins_seq[k]=toupper((unsigned char) ins_seq[k]);
+            read_i->Seq += ins_seq;
+            if (!skip) {
+                int j = b2i(c);
+                if (j >= Calls.size()) {
+                    int was = Calls.size();
+                    Calls.resize(j+1);
+                    int t; for (t=was;t<=j;++t) {
+                        Calls[t].base=i2b(t);
+                    }
+                }
+                if ( o == ',' || o == 'a' || o == 'c' || o == 't' || o == 'g' ) {
+                    ++Calls[j].rev;
+                } else {
+                    ++Calls[j].fwd;
+                }
+                Calls[j].qual+=q;
+                Calls[j].mn_qual+=min(q, mq);
+                Calls[j].qual_ssq+=q*q;
+                Calls[j].mq_ssq+=mq*mq;
+                Calls[j].mq_sum+=mq;
+                Calls[j].seqs.push_back(ins_seq);
+            }
+            cur_p=end_p+len;
+        }
+
+        if (*cur_p == '$') {
+            // read ends here: move it to the bin of finished reads and drop it from the live list
+            if (read_i->MapQ > -1) {
+                rds.TotReadLen+=read_i->Seq.size();
+                rds.ReadBin.push_back(*read_i);
+                if (rds.ReadBin.size() > min(1000,Depth*2)) {
+                    // subtract the length of the read being dropped *before* popping it,
+                    // otherwise the length of the next (still binned) read is removed instead
+                    rds.TotReadLen-=rds.ReadBin.front().Seq.size();
+                    rds.ReadBin.pop_front();
+                }
+            }
+            read_i=rds.ReadList.erase(read_i);  // erase() already returns the next read
+            ++cur_p;
+            ++eor;
+        } else {
+            ++read_i;
+        }
+    }
+
+    if ((Depth-eor) != rds.ReadList.size()) {
+        warn("warning\tdepth is %d, but read list is: %d\n", Depth, (int) rds.ReadList.size());
+    }
+
+    if (*cur_p == '-' || *cur_p == '+') {
+        char *end_p;
+        int len = strtol(++cur_p, &end_p, 10);
+        if (len < 0 || strlen(end_p) < (size_t) len)
+            pileup_parse_failed(d[4]);
+        // keep this
+        string idl(end_p, len);
+        cur_p=end_p+len;
+    }
+
+    if (*cur_p) {
+        warn("Failed to parse pileup %s\n", d[4]);
+        exit(1);
+    }
+
+    for (i=0;i<depthbypos.size();++i) {
+        if (depthbypos[i] > MaxDepthByPos) {
+            MaxDepthByPos = depthbypos[i];
+        }
+    }
+
+    Depth=0;
+    for (i=0;i<5 && i < Calls.size();++i) {     // total depth (exclude inserts for tot depth, otherwise they are double-counted)
+        Depth+=Calls[i].depth();
+    }
+
+
+    TotQual=0;
+    for (i=0;i<5 && i < Calls.size();++i) {     // total quality over the same slots as Depth
+        TotQual+=Calls[i].qual;
+    }
+}
+
+// Scratch placeholder pushed through the window by VarCallVisitor::Visit:
+// Base '-' fills a gap between nearby positions, Base '@' pads or flushes the window.
+PileupSummary JunkSummary;
+
+// Feed one locus into the sliding window (Win, at most WinMax entries) and run
+// VisitX on the entry at the window midpoint.
+//
+// With WinMax < 3 there is no window and p goes straight to VisitX. Otherwise
+// position gaps are bridged with '-' placeholders (small gaps) or the window is
+// flushed with '@' placeholders (large gaps or positions going backwards). For
+// the midpoint entry the homopolymer run length of its neighbours is stored in
+// RepeatCount/RepeatBase, and the quality of a deletion call is rescaled from
+// the '*' call at the following position.
+void VarCallVisitor::Visit(PileupSummary &p) {
+    if (WinMax < 3) {
+        // no real window ... just go straight
+        VisitX(p);
+        return;
+    }
+
+    if (p.Base != '-' && p.Base != '@') {
+        if (Win.size() && (Win.back().Pos != (p.Pos - 1) )) {
+            if (Win.back().Pos < p.Pos && ((p.Pos - Win.back().Pos) <= (WinMax/2))) {
+                while (Win.back().Pos < (p.Pos - 1)) {
+                    // visit/pop, add a placeholder
+                    JunkSummary.Base = '-';
+                    JunkSummary.Pos = Win.back().Pos + 1;
+                    Visit(JunkSummary);
+                }
+            } else {
+                while (Win.size() && Win[WinMax/2].Base != '@') {
+                    // visit/pop, but don't add anything, until it's empty
+                    JunkSummary.Base = '@';
+                    JunkSummary.Pos = 0;
+                    Visit(JunkSummary);
+                }
+            }
+        }
+    }
+
+    // initialize the window with nothing, if it's not full
+    while (Win.size() < WinMax) {
+        JunkSummary.Base = '@';
+        JunkSummary.Pos = 0;
+        Win.push_back(JunkSummary);
+    }
+
+    Win.push_back(p);
+
+    //debug("Visit: %d\n", p.Pos);
+
+    if (Win.size() > WinMax)    // queue too big? pop
+        Win.pop_front();
+
+    int i;
+    int lrc=0,rrc=0;            // left repeat count, right repeat count
+    char lrb='\0', rrb='\0';    // left/right repeat base; initialised so they are never read unset
+    int vx;
+
+    if (Win.size() < WinMax) {  // small window? look at leading edge only
+        return;
+    } else {
+        vx = WinMax/2;          // larger window? look at midpoint
+    }
+
+    if (Win[vx].Base == '-' || Win[vx].Base == '@')
+        return;
+
+    // A neighbour exists whenever vx-1 / vx+1 are valid indices. The run-length
+    // loops below already cope with there being nothing beyond the neighbour.
+    if (vx >= 1) {              // look left
+        lrb = Win[vx-1].Base;
+        for (i=vx-2; i >= 0; --i) {     // increment repeat count
+            if (Win[i].Base == lrb)
+                ++lrc;
+            else
+                break;
+        }
+    }
+    if (vx+1 < (int) Win.size()) {      // look right
+        rrb = Win[vx+1].Base;
+        for (i=vx+2; i < Win.size(); ++i) {
+            if (Win[i].Base == rrb)
+                ++rrc;
+            else
+                break;
+        }
+    }
+
+    // repeat counts are now 1-based, not 0-based
+    ++lrc;
+    ++rrc;
+
+    // maximum repeat count and associated base
+    if (lrb == rrb ) {
+        Win[vx].RepeatCount = lrc+rrc;
+        Win[vx].RepeatBase = lrb;
+    } else if (lrc > rrc) {
+        Win[vx].RepeatCount = lrc;
+        Win[vx].RepeatBase = lrb;
+    } else {
+        Win[vx].RepeatCount = rrc;
+        Win[vx].RepeatBase = rrb;
+    }
+
+    if (debug_xpos) {
+        if (Win[vx].Pos == debug_xpos && !strcmp(debug_xchr,Win[vx].Chr.data())) {
+            fprintf(stderr,"xpos-window\t");
+            for (i=0;i<Win.size();++i) {
+                fprintf(stderr,"%c", Win[i].Base);
+            }
+            fprintf(stderr,"\n");
+        }
+    }
+
+    if (vx < Win.size()-1) {
+        int dminus = b2i('-');
+        int dstar = b2i('*');
+
+        if (Win[vx].Calls.size() > dminus && Win[vx].Calls[dminus].depth() > 0) {
+            if (Win[vx+1].Calls.size() > dstar && Win[vx+1].Calls[dstar].depth() > 0) {
+                // baq adjustment works at the 'star' not at the 'indel', so adjust qual using the next locus
+                // (skipped when the deletion's rms quality is zero: the ratio would not be finite)
+                if (Win[vx].Calls[dminus].qual_rms() != 0) {
+                    double adj=Win[vx+1].Calls[dstar].qual_rms()/(double)Win[vx].Calls[dminus].qual_rms();
+                    if (debug_xpos) {
+                        if (Win[vx].Pos == debug_xpos && !strcmp(debug_xchr,Win[vx].Chr.data())) {
+                            fprintf(stderr,"xpos-adj-qual\t%d to %d (%f)\n", Win[vx].Calls[dminus].qual_rms(),Win[vx+1].Calls[dstar].qual_rms(), adj);
+                        }
+                    }
+                    Win[vx].Calls[dminus].qual *= adj;
+                    Win[vx].Calls[dminus].qual_ssq *= adj;
+                }
+            } else {
+                // deletion with no matching '*' at the next locus: discard the call
+                vcall none;
+                if (debug_xpos) {
+                    if (Win[vx].Pos == debug_xpos && !strcmp(debug_xchr,Win[vx].Chr.data())) {
+                        fprintf(stderr,"xpos-skip-del-qual\t%d\n", Win[vx].Calls[dminus].depth());
+                    }
+                }
+                Win[vx].Calls[dminus] = none;
+            }
+        }
+    }
+
+    VisitX(Win[vx]);
+}
+
+// Flush the window at end of input: run VisitX on every entry after the
+// midpoint, i.e. the entries Visit() has not yet reached.
+// Gap ('-') and padding ('@') placeholders are skipped, as Visit() does for the
+// midpoint entry, so they are not counted as loci.
+void VarCallVisitor::Finish() {
+    // finish out the rest of the pileup, with the existing window
+    for (int vx = WinMax/2+1; vx < (int) Win.size(); ++vx) {
+        if (Win[vx].Base == '-' || Win[vx].Base == '@')
+            continue;
+        ///debug("Finish: %d\n", Win[vx].Pos);
+        VisitX(Win[vx]);
+    }
+}
+
+// Call variants at one locus and write them to whichever outputs are open
+// (var_f, vcf_f, eav_f).
+//
+// Steps:
+//  1. In debug_xpos mode ignore every locus except debug_xchr:debug_xpos.
+//  2. Skip (and count in SkippedDepth) loci whose filtered depth is below min_depth.
+//  3. Drop the N slot, sort the calls with hitolocall, then test each call
+//     against the depth, quality-share, strand-balance, indel, repeat and alpha
+//     (Poisson p-value) filters. Calls that pass are collected in final_calls.
+//  4. If a non-reference call survived (or in debug mode), print the locus.
+//
+// p is modified: Calls is resized and sorted, and strand counts / quality sums
+// of individual calls are adjusted by the balance and insert corrections.
+// In debug_xpos mode the process exits after the requested locus is printed.
+void VarCallVisitor::VisitX(PileupSummary &p) {
+    //debug("VisitX: %d\n", p.Pos);
+
+    if (debug_xpos) {
+        if (p.Pos != debug_xpos)
+            return;
+        if (strcmp(debug_xchr,p.Chr.data()))
+            return;
+    }
+
+    if (p.Depth < min_depth) {
+        if (debug_xpos) {
+            fprintf(stderr,"xpos-skip-depth\t%d < %d\n",p.Depth, min_depth);
+            fprintf(stderr,"xpos-skip-dup\t%d\n",p.SkipDupReads);
+            fprintf(stderr,"xpos-skip-n\t%d\n",p.SkipN);
+            fprintf(stderr,"xpos-skip-mapq\t%d\n",p.SkipMinMapq);
+            fprintf(stderr,"xpos-skip-qual\t%d\n",p.SkipMinQual);
+        }
+        ++SkippedDepth;
+        return;
+    }
+
+    // remember insert strand counts before sorting moves slot 6 elsewhere
+    int ins_fwd = p.Calls.size() > 6 ? p.Calls[6].fwd : 0;
+    int ins_rev = p.Calls.size() > 6 ? p.Calls[6].rev : 0;
+
+    int i;
+    if (p.Calls.size() > 6)
+        p.Calls.resize(7);      // toss N's before sort
+
+    sort(p.Calls.begin(), p.Calls.end(), hitolocall);
+
+    int need_out = -1;          // index of the first call that requires this locus to be printed
+    int skipped_balance=0;
+    int skipped_alpha=0;
+    int skipped_indel=0;
+    int skipped_depth=0;
+    int skipped_repeat=0;
+
+    vector<vfinal> final_calls;
+    for (i=0;i<p.Calls.size();++i) {        // all calls
+
+        double pct = (double) p.Calls[i].depth()/p.Depth;
+        double qpct = (double) p.Calls[i].qual/p.TotQual;
+
+        if (!p.Calls[i].base)
+            continue;
+
+        if (!p.Calls[i].depth())
+            continue;
+
+        double bpct = (double) min(p.Calls[i].fwd,p.Calls[i].rev)/p.Calls[i].depth();
+
+        if (pct > pct_depth && qpct >= pct_qdepth && (p.Calls[i].depth() >= min_adepth)) {
+            if (bpct < pct_balance) {
+                // strand imbalance: trim the over-represented strand until the balance threshold is met
+                int fwd_adj=0, rev_adj=0;
+                // f=b*(f+r); r=f/b-f; adj=r-(f/b-f)
+                if (p.Calls[i].fwd < p.Calls[i].rev) {
+                    rev_adj = (int) p.Calls[i].rev - ( p.Calls[i].fwd/pct_balance - p.Calls[i].fwd );
+                } else {
+                    fwd_adj = (int) p.Calls[i].fwd - ( p.Calls[i].rev/pct_balance - p.Calls[i].rev );
+                }
+                if (fwd_adj + rev_adj > 1 && bpct > 0) {
+                    // adjust call down
+                    p.Calls[i].qual -= (rev_adj+fwd_adj)*(p.Calls[i].qual/p.Calls[i].depth());
+                    p.Calls[i].mq_sum -= (rev_adj+fwd_adj)*(p.Calls[i].mq_sum/p.Calls[i].depth());
+                    p.Calls[i].qual_ssq -= (rev_adj+fwd_adj)*(p.Calls[i].qual_ssq/p.Calls[i].depth());
+                    p.Calls[i].mq_ssq -= (rev_adj+fwd_adj)*(p.Calls[i].mq_ssq/p.Calls[i].depth());
+                    p.Calls[i].rev -= rev_adj;
+                    p.Calls[i].fwd -= fwd_adj;
+                    skipped_balance+=rev_adj+fwd_adj;
+
+                    // fixed bpct
+                    bpct = (double) min(p.Calls[i].fwd,p.Calls[i].rev)/p.Calls[i].depth();
+                } else {
+                    // it's junk anyway
+                }
+
+                // fix depths after adjustment!
+                pct = (double) p.Calls[i].depth()/p.Depth;
+                qpct = (double) p.Calls[i].qual/p.TotQual;
+            }
+        }
+
+        if (pct > pct_depth && qpct >= pct_qdepth && (p.Calls[i].depth() >= min_adepth)) {
+            // balance is meaningless at low depths
+            if ((bpct >= pct_balance) || (p.Calls[i].depth()<4)) {
+                if (p.Calls[i].base == '+' || p.Calls[i].base == '-') {
+                    // yuk ... time to think about a possible indel call
+                    if (p.Calls[i].depth() >= min_idepth) {
+                        // should really pick more than 1
+                        // but need to allow "similar" indels to pile up
+                        // should group into distinct bins, using some homology thing
+                        // sort the indel sequences, then take the most frequent one (maxs seen maxc times)
+                        sort(p.Calls[i].seqs.begin(), p.Calls[i].seqs.end());
+                        string prev, maxs;
+                        int pcnt=0, maxc=0, j;
+                        for (j=0;j<p.Calls[i].seqs.size();++j) {
+                            if (prev == p.Calls[i].seqs[j]) {
+                                ++pcnt;
+                            } else {
+                                if (pcnt > maxc) {
+                                    maxs=prev;
+                                    maxc=pcnt;
+                                }
+                                prev=p.Calls[i].seqs[j];
+                                pcnt=1;
+                            }
+                        }
+                        if (pcnt > maxc) {
+                            maxs=prev;
+                            maxc=pcnt;
+                        }
+                        if (maxc >= min_idepth && maxc >= min_adepth) {
+                            // only calls 1 indel at a given position
+                            if ((repeat_filter == 0) || (p.RepeatCount < repeat_filter)) {
+                                // maybe use rms here... see if it helps
+                                double mean_qual = p.Calls[i].qual/(double)p.Calls[i].depth();
+                                double err_rate = mean_qual < max_phred ? pow(10,-mean_qual/10.0) : global_error_rate;
+                                // expected number of non-reference = error_rate*depth
+                                double pval=(p.Depth*err_rate==0)?0:gsl_ran_poisson_pdf(p.Calls[i].depth(), p.Depth*err_rate);
+                                double padj=total_locii ? pval*total_locii : pval;     // multiple-testing adjustment
+
+                                if (padj <= alpha) {
+                                    vfinal final(p.Calls[i]);
+
+                                    double mq_padj=max(total_locii*pow(10,-p.Calls[i].mq_sum/10.0),padj);  // never report pval as better than the total mapping quality
+                                    if (debug_xpos) fprintf(stderr,"xpos-debug-pval\tbase:%c, err:%g, pval:%g, padj:%g, mq_padj:%g, mq_sum:%d\n", p.Calls[i].base, err_rate, pval, padj, mq_padj, p.Calls[i].mq_sum);
+
+                                    if (mq_padj > 1) mq_padj=1;
+
+                                    if (need_out == -1)
+                                        need_out = i;
+
+                                    final.padj=mq_padj;
+                                    final.max_idl_cnt=maxc;
+                                    final.max_idl_seq=maxs;
+                                    final_calls.push_back(final);
+                                } else {
+                                    skipped_alpha+=p.Calls[i].depth();
+                                }
+                                // implicitly skip all the ohter indel calls at the same locus
+                                skipped_indel+=p.Calls[i].depth()-maxc;
+                            } else {
+                                skipped_repeat+=p.Calls[i].depth();
+                            }
+                        } else {
+                            skipped_indel+=p.Calls[i].depth();
+                        }
+                    } else {
+                        skipped_indel+=p.Calls[i].depth();
+                    }
+                } else {
+                    if (p.Calls[i].base == '*' && (
+                                ((repeat_filter > 0) && (p.RepeatCount >= repeat_filter)) ||
+                                (p.Calls[i].depth() < min_idepth)
+                                )) {
+                        skipped_indel+=p.Calls[i].depth();
+                    } else {
+                        // subtract inserts from reference .. perhaps > 0 is correct here....
+                        if (p.Calls[i].is_ref && (ins_rev+ins_fwd) > max(min_idepth,min_adepth)) {
+                            p.Calls[i].fwd-=ins_fwd;
+                            p.Calls[i].rev-=ins_rev;
+                        }
+
+                        double mean_qual = p.Calls[i].qual/(double)p.Calls[i].depth();
+
+                        if (p.Calls[i].depth() >= min_adepth && p.Calls[i].depth() > 0) {
+                            double err_rate = mean_qual < max_phred ? pow(10,-mean_qual/10.0) : global_error_rate;
+                            // expected number of non-reference bases at this position is error_rate*depth
+                            double pval=(p.Depth*err_rate==0)?0:gsl_ran_poisson_pdf(p.Calls[i].depth(), p.Depth*err_rate);
+                            double padj=total_locii ? pval*total_locii : pval;     // multiple-testing adjustment
+
+                            if (padj <= alpha) {
+                                double mq_padj=max(total_locii*pow(10,-p.Calls[i].mq_sum/10.0),padj);  // never report as better than the mapping quality
+
+                                if (mq_padj > 1) mq_padj=1;
+
+                                if (debug_xpos) fprintf(stderr,"xpos-debug-pval\tbase:%c, err:%g, pval:%g, padj:%g, mq_padj:%g, mq_sum:%d\n", p.Calls[i].base, err_rate, pval, padj, mq_padj, p.Calls[i].mq_sum);
+
+                                // a reference call alone does not make the locus worth printing
+                                if (!p.Calls[i].is_ref || debug_xpos) {
+                                    if (need_out == -1)
+                                        need_out = i;
+                                }
+                                vfinal final(p.Calls[i]);
+                                final.padj=mq_padj;
+                                final_calls.push_back(final);
+                            } else {
+                                skipped_alpha+=p.Calls[i].depth();
+                            }
+                        }
+                    }
+                }
+            } else {
+                skipped_balance+=p.Calls[i].depth();
+            }
+        } else {
+            // depth is too low now.... technically you can just add all the rest of the calls to skipped_depth without checking
+            skipped_depth+=p.Calls[i].depth();
+        }
+    }
+
+    ++Locii;
+
+    if (need_out>=0||debug_xpos) {
+
+        // put the reference call first when it is the second of the surviving calls
+        if (final_calls.size() > 1){
+            if(final_calls[1].pcall->is_ref) {
+                vfinal tmp=final_calls[1];
+                final_calls[1]=final_calls[0];
+                final_calls[0]=tmp;
+            }
+        }
+
+        int total_call_depth=0;
+        for (i=0;i<final_calls.size();++i) {
+            total_call_depth+=final_calls[i].pcall->depth();
+        }
+        double pct_allele = 0;
+        if (need_out >=0) {
+            // more than 1 call at this position = Het
+            if (final_calls.size() > 1) {
+                if (final_calls[0].pcall->is_ref) {
+                    pct_allele = 100.0 * final_calls[1].pcall->depth() / (double) total_call_depth;
+                } else {
+                    // no reference seen... but still het?
+                    pct_allele = 100.0 * final_calls[0].pcall->depth() / (double) total_call_depth;
+                }
+                ++Hets;
+            } else {
+                pct_allele = 100.0 * final_calls[0].pcall->depth() / (double) total_call_depth;
+                ++Homs;
+            }
+        }
+
+        if (var_f) {
+            string pil;
+            for (i=0;i<final_calls.size();++i) {
+                vfinal &f=final_calls[i];
+                if (f.is_indel()) {
+                    pil += string_format("\t%c%s:%d,%d,%.1e",f.pcall->base,f.max_idl_seq.c_str(),f.max_idl_cnt,f.pcall->qual/f.pcall->depth(),f.padj);
+                } else {
+                    pil += string_format("\t%c:%d,%d,%.1e",f.pcall->base,f.pcall->depth(),f.pcall->qual/f.pcall->depth(),f.padj);
+                }
+            }
+            fprintf(var_f,"%s\t%d\t%c\t%d\t%d\t%2.2f%s\n",p.Chr.c_str(), p.Pos, p.Base, p.Depth, skipped_alpha+skipped_depth+skipped_balance+p.SkipN+p.SkipDupReads+p.SkipMinMapq+p.SkipMinQual, pct_allele, pil.c_str());
+        }
+
+        if (vcf_f) {
+
+            for (i=0;i<final_calls.size();++i) {
+                vfinal &f=final_calls[i];
+                // phred-scaled adjusted p-value, capped at 40
+                int qual = f.padj>0?min(40,10*(-log10(f.padj))):40;
+
+                if (f.is_indel()) {
+                    string base;
+                    string alt;
+                    if (f.pcall->base =='-') {
+                        base = p.Base + f.max_idl_seq;
+                        alt = p.Base;
+                    } else {
+                        base = p.Base;
+                        alt = p.Base + f.max_idl_seq;
+                    }
+                    double freq_allele = f.max_idl_cnt / (double) p.Depth;
+                    fprintf(vcf_f,"%s\t%d\t.\t%s\t%s\t%2d\tPASS\tMQ=%d;BQ=%d;DP=%d;AF=%2.2f\n",
+                            p.Chr.c_str(), p.Pos, base.c_str(), alt.c_str(), qual,
+                            (int) f.pcall->mq_rms(),
+                            (int) f.pcall->qual_rms(),
+                            total_call_depth,
+                            freq_allele);
+                } else {
+                    char alt = f.pcall->base;
+                    if (f.pcall->is_ref)
+                        alt = '.';
+                    double freq_allele = f.pcall->depth() / (double) p.Depth;
+                    fprintf(vcf_f,"%s\t%d\t.\t%c\t%c\t%d\tPASS\tMQ=%d;BQ=%d;DP=%d;AF=%2.2f\n",
+                            p.Chr.c_str(), p.Pos, p.Base, alt, qual,
+                            (int) f.pcall->mq_rms(),
+                            (int) f.pcall->qual_rms(),
+                            total_call_depth,
+                            freq_allele);
+                }
+            }
+        }
+
+        if (eav_f) {
+            // columns: chr pos ref depth num_states top_consensus top_freq var_base var_depth var_qual var_strands forward_strands reverse_strands padj
+            string top_cons, var_base, var_depth, var_qual, var_strands, forward, reverse;
+
+            // in debug mode this point is reached even when no call survived, so
+            // final_calls may be empty: report 1 (nothing significant) instead of indexing it
+            float padj = final_calls.empty() ? 1 : final_calls[0].padj;
+            if (final_calls.size() > 1 && final_calls[0].pcall->is_ref) {
+                padj=final_calls[1].padj;
+            }
+            for (i=0;i<final_calls.size();++i) {
+                vfinal &f=final_calls[i];
+                if (i < 2) {
+                    if (i > 0) top_cons += "/";
+                    top_cons += f.pcall->base;
+                }
+                if (i > 0) var_base += "/";
+                var_base += f.pcall->base;
+                if (f.is_indel()) {
+                    if (i < 2) {
+                        top_cons += f.max_idl_seq;
+                    }
+                    var_base += f.max_idl_seq;
+                }
+                if (i > 0) var_depth+= ";";
+                var_depth+= string_format("%d",f.pcall->depth());
+                if (i > 0) var_qual+= ";";
+                var_qual+= string_format("%d",f.pcall->qual_rms());
+                if (i > 0) var_strands+= ";";
+                var_strands+= string_format("%d",(f.pcall->fwd>0)+(f.pcall->rev>0));
+                if (i > 0) forward += ";";
+                forward+= string_format("%d",f.pcall->fwd);
+                if (i > 0) reverse += ";";
+                reverse+= string_format("%d",f.pcall->rev);
+            }
+            fprintf(eav_f,"%s\t%d\t%c\t%d\t%d\t%s\t%2.2f\t%s\t%s\t%s\t%s\t%s\t%s\t%.1e\n",p.Chr.c_str(), p.Pos, p.Base, p.Depth, (int) final_calls.size(),top_cons.c_str(), pct_allele, var_base.c_str(), var_depth.c_str(), var_qual.c_str(), var_strands.c_str(), forward.c_str(), reverse.c_str(), padj);
+        }
+
+        if (debug_xpos) {
+            fprintf(stderr,"xpos-skip-dup\t%d\n",p.SkipDupReads);
+            fprintf(stderr,"xpos-skip-mapq\t%d\n",p.SkipMinMapq);
+            fprintf(stderr,"xpos-skip-qual\t%d\n",p.SkipMinQual);
+            fprintf(stderr,"xpos-skip-bal\t%d\n",skipped_balance);
+            fprintf(stderr,"xpos-skip-depth\t%d\n",skipped_depth);
+            fprintf(stderr,"xpos-skip-indel\t%d\n",skipped_indel);
+            fprintf(stderr,"xpos-skip-repeat\t%d\n",skipped_repeat);
+            fprintf(stderr,"xpos-skip-alpha\t%d\n",skipped_alpha);
+            if (repeat_filter > 0) {
+                fprintf(stderr,"repeat-count\t%d\n",p.RepeatCount);
+                fprintf(stderr,"repeat-filter\t%d\n",repeat_filter);
+                fprintf(stderr,"repeat-base\t%c\n",p.RepeatBase);
+            }
+            // the requested position has been reported: nothing else to do
+            exit(0);
+        }
+    }
+}
+
+
+// Detect the annotation file type from its first line and load (or build) its index.
+//
+// path : tab-separated annotation file. The first line must have at least 9
+//        fields. A strand character ('+' or '-') in field 7 marks it as GTF
+//        (AnnotType 'g'), otherwise one in field 6 marks it as BED ('b').
+//
+// Exits the process if the file cannot be opened or its type cannot be determined.
+void PileupVisitor::LoadIndex(const char *path) {
+    FILE *f = fopen(path,"r");
+    if (!f) {
+        warn("Can't open %s : %s\n", path, strerror(errno));
+        exit(1);
+    }
+
+    AnnotType = '\0';
+    line l; meminit(l);
+    // only the first line is inspected
+    if (read_line(f, l)>0) {
+        vector<char *> d=split(l.s, "\t");
+        if (d.size() < 9) {
+            warn("File must be a GTF or a BED: '%s'\n", path);
+            exit(1);
+        }
+        // both strand tests for GTF must look at field 7 (d[6]); the GTF test takes precedence
+        if (*d[6]=='+' || *d[6]=='-') {
+            AnnotType = 'g';
+        } else if (*d[5]=='+' || *d[5]=='-') {
+            AnnotType = 'b';
+        }
+    }
+    // the stream was only needed for sniffing; read()/build() below work from the path
+    fclose(f);
+
+    if (!AnnotType) {
+        warn("File must be a GTF or a BED: '%s'\n", path);
+        exit(1);
+    }
+
+    if (!AnnotDex.read(path)) {
+        // void build(const char *path, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c);
+        AnnotDex.build(path, "\t", 0, 1, 2, 0, '#', 1);
+    }
+}
+
+// Accumulate per-locus noise statistics.
+//
+// Every locus is counted in tot_locii; loci below minsampdepth contribute
+// nothing else. For the others one Noise record is appended to each of stats
+// (third most frequent base call), ins_stats and del_stats, and tot_depth /
+// num_reads are updated. p.Calls is truncated to its first five slots and sorted.
+void VarStatVisitor::Visit(PileupSummary &p) {
+    tot_locii += 1;
+
+    if (p.Depth < minsampdepth)
+        return;
+
+    // insert and deletions have their own, separate noise levels
+    const bool have_slot1 = p.Calls.size() > 1;
+
+    int ins_depth = 0;
+    int ins_qual = 0;
+    if (p.Calls.size() > 6) {
+        ins_depth = p.Calls[6].depth();
+        ins_qual = p.Calls[6].qual;
+    }
+    double ins_noise = 0;
+    double ins_qnoise = 0;
+    if (have_slot1 && p.Calls[1].depth() > ins_depth && ins_depth > 0) {
+        ins_noise = (double) ins_depth/p.Depth;
+        ins_qnoise = (double) ins_qual/p.TotQual;
+    }
+
+    int del_depth = 0;
+    int del_qual = 0;
+    if (p.Calls.size() > 5) {
+        del_depth = p.Calls[5].depth();
+        del_qual = p.Calls[5].qual;
+    }
+    double del_noise = 0;
+    double del_qnoise = 0;
+    if (have_slot1 && p.Calls[1].depth() > del_depth && del_depth > 0) {
+        del_noise = (double) del_depth/p.Depth;
+        del_qnoise = (double) del_qual/p.TotQual;
+    }
+
+    // snp's are "noise" if there are 3 alleles at a given position
+    if (p.Calls.size() > 5)
+        p.Calls.resize(5);      // toss N's and inserts before sort
+
+    sort(p.Calls.begin(), p.Calls.end(), hitolocall);
+
+    // after sorting, slot 2 is the third call; its share of the locus is the noise estimate
+    double noise = 0;
+    double qnoise = 0;
+    char pbase = '.';
+    if (p.Calls.size() > 2) {
+        noise = (double) p.Calls[2].depth()/p.Depth;
+        qnoise = (double) p.Calls[2].qual/p.TotQual;
+        pbase = p.Calls[2].base;
+    }
+
+    double mnqual = (double)p.TotQual/p.Depth;
+
+    if (noise_f) {
+        // the format has four conversions; the trailing mnqual argument is not printed
+        fprintf(noise_f,"%d\t%c\t%f\t%f\n", p.Depth, pbase, noise, qnoise, mnqual);
+    }
+
+    tot_depth += p.Depth;
+    num_reads += p.NumReads;
+    stats.push_back(Noise(p.Depth, noise, qnoise, mnqual));
+    ins_stats.push_back(Noise(p.Depth, ins_noise, ins_qnoise, mnqual));
+    del_stats.push_back(Noise(p.Depth, del_noise, del_qnoise, mnqual));
+}
+
+
+// Write the command-line help for varcall to stream f.
+// The text is a printf format: %s receives VERSION and %d receives SVNREV.
+void usage(FILE *f) {
+    static const char help_text[] =
+"Usage: varcall <-s|-v> <-f REF> [options] bam1 [bam2...]\n"
+"Version: %s.%d (BETA)\n"
+"\n"
+"Either outputs summry stats for the list of files, or performs variant calling\n"
+"\n"
+"Options (later options override earlier):\n"
+"\n"
+"-s Calculate statistics\n"
+"-v Calculate variants bases on supplied parameters (see -S)\n"
+"-f Reference fasta (required if using bams, ignored otherwise)\n"
+"-m Min locii depth (0)\n"
+"-a Min allele depth (0)\n"
+"-p Min allele pct by quality (0)\n"
+"-q Min qual (3)\n"
+"-Q Min mapping quality (0)\n"
+"-b Min pct balance (strand/total) (0)\n"
+"-D FLOAT Max duplicate read fraction (depth/length per position) (1)\n"
+"-B Turn off BAQ correction (false)\n"
+"-R Homopolymer repeat indel filtering (8)\n"
+"-e FLOAT Alpha filter to use, requires -l or -S (.05)\n"
+"-g FLOAT Global minimum error rate (default: assume phred is ok)\n"
+"-l INT Number of locii in total pileup used for bonferroni (1 mil)\n"
+"-x CHR:POS Output this pos only, then quit\n"
+"-N FIL Output noise stats to FIL\n"
+"-S FIL Read in statistics and params from a previous run with -s (do this!)\n"
+"-A ANNOT Calculate in-target stats using the annotation file (requires -o)\n"
+"-o PREFIX Output prefix (note: overlaps with -N)\n"
+"\n"
+"Input files\n"
+"\n"
+"Files must be sorted bam files with bai index files available. Alternatively,\n"
+"a single pileup file can be supplied.\n"
+"\n"
+"Output files\n"
+"\n"
+"Varcalls go to stdout. Stats go to stdout, or stderr if varcalling too\n"
+"\n"
+"If an output prefix is used, files are created as follows:\n"
+" PREFIX.var Variant calls in tab delimited 'varcall' format\n"
+" PREFIX.eav Variant calls in tab delimited 'ea-var' format\n"
+" PREFIX.vcf Variant calls, in vcf format\n"
+" PREFIX.varsum Summary of variant calls\n"
+" PREFIX.tgt On-target stats detail\n"
+" PREFIX.tgtsum Summary of on-target stats\n"
+" PREFIX.noise Noise stats detail\n"
+"\n"
+"Stats Output:\n"
+"\n"
+"Contains mean, median, quartile information for depth, base quality, read len,\n"
+"mapping quality, indel levels. Also estimates parameters suitable for\n"
+"variant calls, and can be passed directly to this program for variant calls\n"
+"\n"
+"Filtering Details:\n"
+"\n"
+    ;
+    fprintf(f, help_text, VERSION, SVNREV);
+}
+
+// printf-style formatting into a std::string.
+// Starts with a 100-byte buffer and retries with a larger one until the
+// formatted text fits. When vsnprintf reports the required length the buffer
+// grows to exactly that, otherwise it doubles.
+// NOTE(review): fmt is a reference and is the argument named in va_start, which
+// the C++ standard leaves undefined; changing that would alter the signature.
+std::string string_format(const std::string &fmt, ...) {
+    std::string out;
+    int cap = 100;
+    for (;;) {
+        out.resize(cap);
+        va_list args;
+        va_start(args, fmt);
+        const int written = vsnprintf(&out[0], cap, fmt.c_str(), args);
+        va_end(args);
+        if (written > -1 && written < cap) {
+            out.resize(written);    // trim to the formatted length
+            return out;
+        }
+        cap = (written > -1) ? written + 1 : cap * 2;
+    }
+}
+
+// Uppercase every character of str by writing through a cast of str.data().
+// NOTE(review): str is received by value (a const copy), so whether the
+// caller's string is affected depends on the std::string implementation sharing
+// storage between copies -- confirm that callers do not rely on it.
+void to_upper(const std::string str) {
+    char *chars = (char *)(void *)str.data();
+    const int len = str.size();
+    for (int k = 0; k < len; ++k) {
+        chars[k] = toupper(chars[k]);
+    }
+}
+
+// returns quantile depth
+// Quantile p of the depth field across vec, with linear interpolation between
+// the two neighbouring elements. vec must be non-empty (asserted).
+// Presumably vec is already ordered by depth and p lies in [0,1] -- confirm against callers.
+double quantile_depth(const std::vector<Noise> &vec, double p) {
+    const int count = vec.size();
+    assert(count > 0);
+    const double rank = ((double)count-1)*p;   // fractional index of the quantile
+    const int lo = (int) rank;
+    int v=vec[lo].depth;
+    if (!(rank > (double)lo)) {
+        return v;                              // rank falls exactly on an element
+    }
+    return (v + (rank-lo) * (vec[lo+1].depth - v));
+}
+
+// Quantile p of an int vector, with linear interpolation between the two
+// neighbouring elements.
+// Presumably vec is sorted ascending and p lies in [0,1] -- confirm against callers.
+// No check is made for an empty vector.
+double quantile(const std::vector<int> &vec, double p) {
+    const int count = vec.size();
+    const double rank = ((double)count-1)*p;   // fractional index of the quantile
+    const int lo = (int) rank;
+    const int v = vec[lo];
+    if (!(rank > (double)lo)) {
+        return v;                              // rank falls exactly on an element
+    }
+    return (v + (rank-lo) * (vec[lo+1] - v));
+}
+
+// Quantile p of a double vector, with linear interpolation between the two
+// neighbouring elements.
+// Presumably vec is sorted ascending and p lies in [0,1] -- confirm against callers.
+// No check is made for an empty vector.
+double quantile(const std::vector<double> &vec, double p) {
+    int l = vec.size();
+    double t = ((double)l-1)*p;     // fractional index of the quantile
+    int it = (int) t;
+    double v=vec[it];
+    if (t > (double)it) {
+        // interpolate by the fractional part of the index (not by p itself),
+        // matching the int overload and quantile_depth()
+        return (v + (t-it) * (vec[it+1] - v));
+    } else {
+        return v;
+    }
+}
+
+// Tokenise str in place with strtok() and return pointers to the tokens.
+// str is modified (delimiters are overwritten with terminators), runs of
+// delimiters yield no empty tokens, and the returned pointers refer into str.
+std::vector<char *> split(char* str,const char* delim)
+{
+    std::vector<char *> fields;
+    for (char *tok = strtok(str,delim); tok != NULL; tok = strtok(NULL,delim)) {
+        fields.push_back(tok);
+    }
+    return fields;
+}
+
+// Randomised rounding: returns floor(x), plus 1 with probability equal to the
+// fractional part of x, so whole numbers are returned unchanged.
+// rand() is scaled to [0,1) before the comparison; comparing the raw integer
+// against a fraction would round up almost every time.
+int rand_round(double x) {
+    double whole = floor(x);
+    double frac = x - whole;                        // in [0,1)
+    double draw = rand() / ((double) RAND_MAX + 1.0);  // uniform in [0,1)
+    return (int) whole + ((draw < frac) ? 1 : 0);
+}
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ea-utils.git
More information about the debian-med-commit mailing list