[med-svn] [SCM] bwa branch, master, updated. debian/0.6.1-1-22-g71fd912
Charles Plessy
plessy at debian.org
Thu Mar 7 05:47:00 UTC 2013
The following commit has been merged in the master branch:
commit 947c8141c58d61196b274d19fc9273ebe688ce2e
Author: Charles Plessy <plessy at debian.org>
Date: Thu Mar 7 14:25:22 2013 +0900
Imported Upstream version 0.7.0
diff --git a/Makefile b/Makefile
index 6f388f2..eab4198 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,14 @@
CC= gcc
-CXX= g++
CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS)
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
-LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \
- bwaseqio.o bwase.o kstring.o
-AOBJS= QSufSort.o bwt_gen.o \
- is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
- bwape.o cs2nt.o \
+LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
+AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
+ is.o bwtindex.o bwape.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
-PROG= bwa
+PROG= bwa bwamem-lite
INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
@@ -26,19 +23,29 @@ SUBDIRS= .
all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
- $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
+ $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
+
+bwamem-lite:libbwa.a example.o
+ $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
-bwa.o:bwa.h
+ksw.o:ksw.h
+kstring.o:kstring.h
+utils.o:utils.h ksort.h kseq.h
+bntseq.o:bntseq.h
+bwt.o:bwt.h utils.h
+bwa.o:bwa.h bwt.h bntseq.h
+bwamem.o:ksw.h kbtree.h ksort.h kvec.h kstring.h utils.h bwamem.h
+bwamem_pair.o:ksw.h kvec.h kstring.h utils.h bwamem.h
QSufSort.o:QSufSort.h
+bwt_gen.o:QSufSort.h
+
+fastmap.o:bwt.h bwamem.h
-bwt.o:bwt.h
-bwtio.o:bwt.h
bwtaln.o:bwt.h bwtaln.h kseq.h
-bntseq.o:bntseq.h
bwtgap.o:bwtgap.h bwtaln.h bwt.h
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
diff --git a/NEWS b/NEWS
index d68c693..35202f1 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,56 @@
+Beta Release 0.7.0 (28 February, 2013)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query
+sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap
+algorithm and extends seeds with banded affine-gap-penalty dynamic programming
+(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or
+longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA
+and BWA-SW and is more accurate. It also supports split alignments like BWA-SW
+and may optionally output multiple hits like BWA. BWA-MEM does not guarantee
+to find hits within a certain edit distance, but BWA is not efficient for such a
+task given longer reads anyway, and the edit-distance criterion is arguably
+not as important in long-read alignment.
+
+In addition to the algorithmic improvements, BWA-MEM also implements a few
+handy practical features:
+
+ 1. BWA-MEM automatically switches between local and glocal (global wrt reads;
+ local wrt reference) alignment. It reports the end-to-end glocal alignment
+ if the glocal alignment is not much worse than the optimal local alignment.
+ Glocal alignment reduces reference bias.
+
+ 2. BWA-MEM automatically infers pair orientation from a batch of single-end
+ alignments. It allows more than one orientation if there are sufficient
+ supporting reads. This feature has not been tested on reads from an Illumina
+ jumping library yet. (EXPERIMENTAL)
+
+ 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
+ is possible to convert a name-sorted BAM to an interleaved fastq on the fly
+ and feed the data stream to BWA-MEM for mapping.
+
+ 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which
+ helps to transfer individual read annotations to the output.
+
+ 5. BWA-MEM supports more advanced piping. Users can now run:
+ (bwa mem ref.fa '<bzcat r1.fq.bz2' '<bzcat r2.fq.bz2') to map bzip'd read
+ files without relying on bash features.
+
+ 6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c'
+ program in the source code directory implements a full single-end mapper in
+ 50 lines of code.
+
+The BWA-MEM algorithm is in the beta phase. It is not yet advised for
+production use. However, once the implementation becomes stable after a
+few release cycles, existing BWA users are encouraged to migrate to BWA-MEM
+for 76bp or longer Illumina reads and for long query sequences. The original BWA
+short-read algorithm will not deliver satisfactory results for 150bp+ Illumina
+reads. A change of mappers will be necessary sooner or later.
+
+(0.7.0 beta: 28 February 2013, r313)
+
+
+
Release 0.6.2 (19 June, 2012)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
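NEWS item 6 above advertises a small mapping API, and the Makefile now builds the new `bwamem-lite' target from example.o against libbwa.a. The following is a minimal, hedged sketch of the index-loading half of that API, using only declarations visible in this patch (bwaidx_t, bwa_idx_load, BWA_IDX_ALL, bwa_print_sam_hdr, bwa_idx_destroy); the per-read alignment calls live in bwamem.h, which is not part of this excerpt, so they are only indicated by a comment. It would be compiled and linked the way the Makefile links example.o, i.e. against libbwa.a plus -lm -lz -lpthread.

    /* demo.c -- illustrative sketch, not part of the upstream sources */
    #include <stdio.h>
    #include "bwa.h"

    int main(int argc, char *argv[])
    {
        bwaidx_t *idx;
        if (argc < 2) {
            fprintf(stderr, "Usage: demo <idx.prefix>\n");
            return 1;
        }
        idx = bwa_idx_load(argv[1], BWA_IDX_ALL); /* loads .bwt/.sa, .ann/.amb and .pac */
        if (idx == 0) return 1;                   /* bwa_idx_load already printed the error */
        bwa_print_sam_hdr(idx->bns, 0);           /* @SQ lines only; no @RG line here */
        /* per-read alignment via the bwamem.h API (not shown in this patch) would go here */
        bwa_idx_destroy(idx);
        return 0;
    }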
diff --git a/QSufSort.c b/QSufSort.c
index e437ac3..36c5a51 100644
--- a/QSufSort.c
+++ b/QSufSort.c
@@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin
qsint_t i, j;
qsint_t s, negatedSortedGroupLength;
qsint_t numSymbolAggregated;
- qsint_t maxNumInputSymbol;
qsint_t numSortedPos = 1;
qsint_t newAlphabetSize;
- maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
-
if (!skipTransform) {
/* bucketing possible*/
newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
diff --git a/bntseq.c b/bntseq.c
index adcd2d7..972837e 100644
--- a/bntseq.c
+++ b/bntseq.c
@@ -35,7 +35,7 @@
#include "utils.h"
#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -288,21 +288,26 @@ int bwa_fa2pac(int argc, char *argv[])
return 0;
}
+int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
+{
+ int left, mid, right;
+ if (pos_f >= bns->l_pac) return -1;
+ left = 0; mid = 0; right = bns->n_seqs;
+ while (left < right) { // binary search
+ mid = (left + right) >> 1;
+ if (pos_f >= bns->anns[mid].offset) {
+ if (mid == bns->n_seqs - 1) break;
+ if (pos_f < bns->anns[mid+1].offset) break; // bracketed
+ left = mid + 1;
+ } else right = mid;
+ }
+ return mid;
+}
+
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
{
int left, mid, right, nn;
- if (ref_id) {
- left = 0; mid = 0; right = bns->n_seqs;
- while (left < right) {
- mid = (left + right) >> 1;
- if (pos_f >= bns->anns[mid].offset) {
- if (mid == bns->n_seqs - 1) break;
- if (pos_f < bns->anns[mid+1].offset) break; // bracketed
- left = mid + 1;
- } else right = mid;
- }
- *ref_id = mid;
- }
+ if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
left = 0; right = bns->n_holes; nn = 0;
while (left < right) {
mid = (left + right) >> 1;
@@ -321,3 +326,26 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
}
return nn;
}
+
+uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
+{
+ uint8_t *seq = 0;
+ if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap
+ if (end > l_pac<<1) end = l_pac<<1;
+ if (beg < 0) beg = 0;
+ if (beg >= l_pac || end <= l_pac) {
+ int64_t k, l = 0;
+ *len = end - beg;
+ seq = malloc(end - beg);
+ if (beg >= l_pac) { // reverse strand
+ int64_t beg_f = (l_pac<<1) - 1 - end;
+ int64_t end_f = (l_pac<<1) - 1 - beg;
+ for (k = end_f; k > beg_f; --k)
+ seq[l++] = 3 - _get_pac(pac, k);
+ } else { // forward strand
+ for (k = beg; k < end; ++k)
+ seq[l++] = _get_pac(pac, k);
+ }
+ } else *len = 0; // if bridging the forward-reverse boundary, return nothing
+ return seq;
+}
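The two helpers added to bntseq.c follow the usual BWA packed-coordinate convention: positions in [0, l_pac) lie on the forward strand, positions in [l_pac, 2*l_pac) on the reverse strand, and bns_get_seq() refuses ranges that bridge the boundary. A hedged sketch of how a caller might combine them; the function name and arguments are illustrative only, and it assumes a bntseq_t and the 2-bit packed sequence are already loaded (e.g. via bwa_idx_load() in bwa.c).

    /* sketch only: print the reference name and up to `len' bases at a
     * forward-strand packed position `pos' */
    #include <stdio.h>
    #include <stdlib.h>
    #include "bntseq.h"

    void dump_ref_window(const bntseq_t *bns, const uint8_t *pac, int64_t pos, int len)
    {
        int64_t got, k;
        int rid;
        uint8_t *seq;
        if (pos < 0 || pos >= bns->l_pac) return;       /* forward strand only in this sketch */
        rid = bns_pos2rid(bns, pos);                    /* binary search over bns->anns[] */
        seq = bns_get_seq(bns->l_pac, pac, pos, pos + len, &got);
        if (rid < 0 || got == 0) { free(seq); return; } /* out of range, or bridging fwd/rev */
        printf("%s:%lld ", bns->anns[rid].name, (long long)(pos - bns->anns[rid].offset + 1));
        for (k = 0; k < got; ++k) putchar("ACGTN"[seq[k]]);
        putchar('\n');
        free(seq);                                      /* bns_get_seq() allocated the buffer */
    }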
diff --git a/bntseq.h b/bntseq.h
index 843db64..4061438 100644
--- a/bntseq.h
+++ b/bntseq.h
@@ -29,6 +29,7 @@
#define BWT_BNTSEQ_H
#include <stdint.h>
+#include <stdio.h>
#include <zlib.h>
#ifndef BWA_UBYTE
@@ -71,7 +72,9 @@ extern "C" {
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
void bns_destroy(bntseq_t *bns);
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
+ int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
+ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
#ifdef __cplusplus
}
diff --git a/bwa.1 b/bwa.1
index 66bc9a2..45b9921 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,47 +1,45 @@
-.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools"
+.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
.SH SYNOPSIS
.PP
-bwa index -a bwtsw database.fasta
+bwa index ref.fa
.PP
-bwa aln database.fasta short_read.fastq > aln_sa.sai
+bwa mem ref.fa reads.fq > aln-se.sam
.PP
-bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
+bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
.PP
-bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
+bwa aln ref.fa short_read.fq > aln_sa.sai
.PP
-bwa bwasw database.fasta long_read.fastq > aln.sam
+bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
+.PP
+bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
+.PP
+bwa bwasw ref.fa long_read.fq > aln.sam
.SH DESCRIPTION
.PP
-BWA is a fast light-weighted tool that aligns relatively short sequences
-(queries) to a sequence database (targe), such as the human reference
-genome. It implements two different algorithms, both based on
-Burrows-Wheeler Transform (BWT). The first algorithm is designed for
-short queries up to ~150bp with low error rate (<3%). It does gapped
-global alignment w.r.t. queries, supports paired-end reads, and is one
-of the fastest short read alignment algorithms to date while also
-visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
-reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like
-alignment to find high-scoring local hits and split hits. On
-low-error short queries, BWA-SW is a little slower and less accurate than the
-first algorithm, but on long queries, it is better.
-.PP
-For both algorithms, the database file in the FASTA format must be
-first indexed with the
-.B `index'
-command, which typically takes a few hours for a 3GB genome. The first algorithm is
-implemented via the
-.B `aln'
-command, which finds the suffix array (SA) coordinates of good hits of
-each individual read, and the
-.B `samse/sampe'
-command, which converts SA coordinates to chromosomal coordinate and
-pairs reads (for `sampe'). The second algorithm is invoked by the
-.B `bwasw'
-command. It works for single-end reads only.
+BWA is a software package for mapping low-divergent sequences against a large
+reference genome, such as the human genome. It consists of three algorithms:
+BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
+sequence reads up to 100bp, while the other two are for longer sequences
+ranging from 70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as
+long-read
+support and split alignment, but BWA-MEM, which is the latest, is generally
+recommended for high-quality queries as it is faster and more accurate.
+BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
+reads.
+
+For all the algorithms, BWA first needs to construct the FM-index for
+the reference genome (the
+.B index
+command). Alignment algorithms are invoked with different sub-commands:
+.BR aln / samse / sampe
+for BWA-backtrack,
+.B bwasw
+for BWA-SW and
+.B mem
+for the BWA-MEM algorithm.
.SH COMMANDS AND OPTIONS
.TP
@@ -53,9 +51,6 @@ Index database sequences in the FASTA format.
.B OPTIONS:
.RS
.TP 10
-.B -c
-Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x)
-.TP
.BI -p \ STR
Prefix of the output database [same as db filename]
.TP
@@ -77,6 +72,175 @@ genome.
.RE
.TP
+.B mem
+.B bwa mem
+.RB [ -aCHMpP ]
+.RB [ -t
+.IR nThreads ]
+.RB [ -k
+.IR minSeedLen ]
+.RB [ -w
+.IR bandWidth ]
+.RB [ -r
+.IR seedSplitRatio ]
+.RB [ -c
+.IR maxOcc ]
+.RB [ -A
+.IR matchScore ]
+.RB [ -B
+.IR mmPenalty ]
+.RB [ -O
+.IR gapOpenPen ]
+.RB [ -E
+.IR gapExtPen ]
+.RB [ -L
+.IR clipPen ]
+.RB [ -U
+.IR unpairPen ]
+.RB [ -R
+.IR RGline ]
+.RB [ -v
+.IR verboseLevel ]
+.I db.prefix
+.I reads.fq
+.RI [ mates.fq ]
+
+Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
+algorithm works by seeding alignments with maximal exact matches (MEMs) and
+then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
+
+If
+.I mates.fq
+file is absent and option
+.B -p
+is not set, this command regards input reads as single-end. If
+.I mates.fq
+is present, this command assumes the
+.IR i -th
+read in
+.I reads.fq
+and the
+.IR i -th
+read in
+.I mates.fq
+constitute a read pair. If
+.B -p
+is used, the command assumes the
+.RI 2 i -th
+and the
+.RI (2 i +1)-th
+read in
+.I reads.fq
+constitute a read pair (such an input file is said to be interleaved). In this case,
+.I mates.fq
+is ignored. In the paired-end mode, the
+.B mem
+command will infer the read orientation and the insert size distribution from a
+batch of reads.
+
+The BWA-MEM algorithm performs local alignment. It may produce multiple primary
+alignments for different parts of a query sequence. This is a crucial feature
+for long sequences. However, some tools such as Picard's markDuplicates do
+not work with split alignments. One may consider using option
+.B -M
+to flag shorter split hits as secondary.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -t \ INT
+Number of threads [1]
+.TP
+.BI -k \ INT
+Minimum seed length. Matches shorter than
+.I INT
+will be missed. The alignment speed is usually insensitive to this value unless
+it significantly deviates from 20. [19]
+.TP
+.BI -w \ INT
+Band width. Essentially, gaps longer than
+.I INT
+will not be found. Note that the maximum gap length is also affected by the
+scoring matrix and the hit length, not solely determined by this option. [100]
+.TP
+.BI -r \ FLOAT
+Trigger re-seeding for a MEM longer than
+.IR minSeedLen * FLOAT .
+This is a key heuristic parameter for tuning the performance. A larger value
+yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
+.TP
+.BI -c \ INT
+Discard a MEM if it has more than
+.I INT
+occurrences in the genome. This is an insensitive parameter. [10000]
+.TP
+.B -P
+In the paired-end mode, perform SW to rescue missing hits only but do not try to find
+hits that fit a proper pair.
+.TP
+.BI -A \ INT
+Matching score. [1]
+.TP
+.BI -B \ INT
+Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
+.TP
+.BI -O \ INT
+Gap open penalty. [6]
+.TP
+.BI -E \ INT
+Gap extension penalty. A gap of length k costs O + k*E (i.e.
+.B -O
+is for opening a zero-length gap). [1]
+.TP
+.BI -L \ INT
+Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
+score reaching the end of query. If this score is larger than the best SW score
+minus the clipping penalty, clipping will not be applied. Note that in this
+case, the SAM AS tag reports the best SW score; clipping penalty is not
+deducted. [5]
+.TP
+.BI -U \ INT
+Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
+.RI scoreRead1+scoreRead2- INT
+and scores a paired one as scoreRead1+scoreRead2-insertPenalty. It compares these
+two scores to determine whether we should force pairing. [9]
+.TP
+.B -p
+Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.
+.TP
+.BI -R \ STR
+Complete read group header line. '\\t' can be used in
+.I STR
+and will be converted to a TAB in the output SAM. The read group ID will be
+attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
+[null]
+.TP
+.B -a
+Output all found alignments for single-end or unpaired paired-end reads. These
+alignments will be flagged as secondary alignments.
+.TP
+.B -C
+Append the FASTA/Q comment to the SAM output. This option can be used to
+transfer read meta information (e.g. barcode) to the SAM output. Note that the
+FASTA/Q comment (the string after a space in the header line) must conform to the SAM
+spec (e.g. BC:Z:CGTAC). Malformed comments lead to incorrect SAM output.
+.TP
+.B -H
+Use hard clipping 'H' in the SAM output. This option may dramatically reduce
+the redundancy of output when mapping long contig or BAC sequences.
+.TP
+.B -M
+Mark shorter split hits as secondary (for Picard compatibility).
+.TP
+.BI -v \ INT
+Control the verbosity of the output. This option has not been fully
+supported throughout BWA. Ideally, a value of 0 disables all output to
+stderr; 1 outputs errors only; 2 warnings and errors; 3 all normal
+messages; and 4 or higher enables debugging. When this option takes the value
+4, the output is not SAM. [3]
+.RE
+
+.TP
.B aln
bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc]
@@ -482,24 +646,6 @@ Pairing is slower for shorter reads. This is mainly because shorter
reads have more spurious hits and converting SA coordinates to
chromosomal coordinates are very costly.
-.SH NOTES ON LONG-READ ALIGNMENT
-.PP
-Command
-.B bwasw
-is designed for long-read alignment. BWA-SW essentially aligns the trie
-of the reference genome against the directed acyclic word graph (DAWG) of a
-read to find seeds not highly repetitive in the genome, and then performs a
-standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called
-the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the
-top Z reference suffix intervals that match the vertex. BWA-SW is more accurate
-if the resultant alignment is supported by more seeds, and therefore BWA-SW
-usually performs better on long queries or queries with low divergence to the
-reference genome.
-
-BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads
-mainly because it gives better gapped alignment. For paired-end reads, it is yet
-to know whether BWA-short or BWA-SW yield overall better results.
-
.SH CHANGES IN BWA-0.6
.PP
Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
@@ -534,16 +680,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes
from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
libraries are distributed under the MIT license.
.PP
-If you use the short-read alignment component, please cite the following
+If you use the BWA-backtrack algorithm, please cite the following
paper:
.PP
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
.PP
-If you use the long-read component (BWA-SW), please cite:
+If you use the BWA-SW algorithm, please cite:
.PP
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
+.PP
+If you use the fastmap component of BWA, please cite:
+.PP
+Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de
+novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178]
+.PP
+The BWA-MEM algorithm has not been published yet.
.SH HISTORY
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
@@ -569,3 +722,11 @@ short-read aligners are being implemented.
The BWA-SW algorithm is a new component of BWA. It was conceived in
November 2008 and implemented ten months later.
+
+The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
+matches (SMEMs), which was first published with the fermi assembler paper
+in 2012. I first implemented the basic SMEM algorithm in the
+.B fastmap
+command for an experiment and then extended the basic algorithm and added the
+extension part in February 2013 to make BWA-MEM a fully featured mapper.
+
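The -B entry above quotes the relation between the mismatch penalty and the query divergence it is tuned for, error ~ 0.75 * exp[-log(4) * B/A]. Purely as a sanity check of that formula (this is not upstream code), the documented defaults -A 1 -B 4 work out to about 0.0029, i.e. roughly 0.3% divergence:

    /* evaluates the error-rate relation quoted for bwa mem -B */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double A = 1.0, B = 4.0;                    /* the documented defaults for -A and -B */
        double err = 0.75 * exp(-log(4.0) * B / A); /* = 0.75 * 4^-(B/A) */
        printf("implied divergence for -A %.0f -B %.0f: %.4f\n", A, B, err); /* ~0.0029 */
        return 0;
    }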
diff --git a/bwa.c b/bwa.c
index 8e99f18..beea6d1 100644
--- a/bwa.c
+++ b/bwa.c
@@ -1,272 +1,324 @@
-#include <stdlib.h>
#include <string.h>
#include <stdio.h>
-#include <math.h>
-#include "bwa.h"
-#include "bwt.h"
-#include "bwtgap.h"
+#include <zlib.h>
+#include <assert.h>
#include "bntseq.h"
+#include "bwa.h"
+#include "ksw.h"
+#include "utils.h"
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-extern unsigned char nst_nt4_table[256];
-extern void seq_reverse(int len, uint8_t *seq, int is_comp);
-
-bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 };
+int bwa_verbose = 3;
+char bwa_rg_id[256];
-struct bwa_idx_t {
- bwt_t *bwt;
- bntseq_t *bns;
- uint8_t *pac;
-};
+/************************
+ * Batch FASTA/Q reader *
+ ************************/
-struct bwa_buf_t {
- int max_buf;
- bwa_pestat_t pes;
- gap_stack_t *stack;
- gap_opt_t *opt;
- int *diff_tab;
- uint8_t *buf;
- int *logn;
-};
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
-bwa_idx_t *bwa_idx_load(const char *prefix)
+static inline void trim_readno(kstring_t *s)
{
- bwa_idx_t *p;
- int l;
- char *str;
- l = strlen(prefix);
- p = calloc(1, sizeof(bwa_idx_t));
- str = malloc(l + 10);
- strcpy(str, prefix);
- p->bns = bns_restore(str);
- strcpy(str + l, ".bwt");
- p->bwt = bwt_restore_bwt(str);
- str[l] = 0;
- strcpy(str + l, ".sa");
- bwt_restore_sa(str, p->bwt);
- free(str);
- p->pac = calloc(p->bns->l_pac/4+1, 1);
- fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac);
- fclose(p->bns->fp_pac);
- p->bns->fp_pac = 0;
- return p;
+ if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
+ s->l -= 2, s->s[s->l] = 0;
}
-void bwa_idx_destroy(bwa_idx_t *p)
-{
- bns_destroy(p->bns);
- bwt_destroy(p->bwt);
- free(p->pac);
- free(p);
+static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
+{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
+ s->name = strdup(ks->name.s);
+ s->comment = ks->comment.l? strdup(ks->comment.s) : 0;
+ s->seq = strdup(ks->seq.s);
+ s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
+ s->l_seq = strlen(s->seq);
}
-bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
+bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
{
- extern gap_opt_t *gap_init_opt(void);
- extern int bwa_cal_maxdiff(int l, double err, double thres);
- int i;
- bwa_buf_t *p;
- p = malloc(sizeof(bwa_buf_t));
- p->stack = gap_init_stack2(max_score);
- p->opt = gap_init_opt();
- p->opt->s_gapo = opt->s_gapo;
- p->opt->s_gape = opt->s_gape;
- p->opt->max_diff = opt->max_diff;
- p->opt->max_gapo = opt->max_gapo;
- p->opt->max_gape = opt->max_gape;
- p->opt->seed_len = opt->seed_len;
- p->opt->max_seed_diff = opt->max_seed_diff;
- p->opt->fnr = opt->fnr;
- p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int));
- for (i = 1; i < BWA_MAX_QUERY_LEN; ++i)
- p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
- p->logn = calloc(256, sizeof(int));
- for (i = 1; i != 256; ++i)
- p->logn[i] = (int)(4.343 * log(i) + 0.499);
- return p;
+ kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
+ int size = 0, m, n;
+ bseq1_t *seqs;
+ m = n = 0; seqs = 0;
+ while (kseq_read(ks) >= 0) {
+ if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
+ fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
+ break;
+ }
+ if (n >= m) {
+ m = m? m<<1 : 256;
+ seqs = realloc(seqs, m * sizeof(bseq1_t));
+ }
+ trim_readno(&ks->name);
+ kseq2bseq1(ks, &seqs[n]);
+ size += seqs[n++].l_seq;
+ if (ks2) {
+ trim_readno(&ks2->name);
+ kseq2bseq1(ks2, &seqs[n]);
+ size += seqs[n++].l_seq;
+ }
+ if (size >= chunk_size) break;
+ }
+ if (size == 0) { // test if the 2nd file is finished
+ if (ks2 && kseq_read(ks2) >= 0)
+ fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
+ }
+ *n_ = n;
+ return seqs;
}
-void bwa_buf_destroy(bwa_buf_t *p)
-{
- gap_destroy_stack(p->stack);
- free(p->diff_tab); free(p->logn); free(p->opt);
- free(p);
-}
+/*****************
+ * CIGAR related *
+ *****************/
-bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
+// Generate CIGAR when the alignment end points are known
+uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
- extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
- int i, seq_len, buf_len;
- bwt_width_t *w, *seed_w;
- uint8_t *s;
- gap_opt_t opt2 = *buf->opt;
- bwa_sai_t sai;
-
- seq_len = strlen(seq);
- // estimate the buffer length
- buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
- if (buf_len > buf->max_buf) {
- buf->max_buf = buf_len;
- kroundup32(buf->max_buf);
- buf->buf = realloc(buf->buf, buf->max_buf);
+ uint32_t *cigar = 0;
+ uint8_t tmp, *rseq;
+ int i;
+ int64_t rlen;
+ *n_cigar = 0; *NM = -1;
+ if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
+ rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
+ if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
+ if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
+ for (i = 0; i < l_query>>1; ++i)
+ tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+ for (i = 0; i < rlen>>1; ++i)
+ tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
}
- memset(buf->buf, 0, buf_len);
- seed_w = (bwt_width_t*)buf->buf;
- w = seed_w + buf->opt->seed_len;
- s = (uint8_t*)(w + seq_len + 1);
- if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
- // copy the sequence
- for (i = 0; i < seq_len; ++i)
- s[i] = nst_nt4_table[(int)seq[i]];
- seq_reverse(seq_len, s, 0);
- // mapping
- bwt_cal_width(idx->bwt, seq_len, s, w);
- if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
- if (seq_len > buf->opt->seed_len)
- bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
- for (i = 0; i < seq_len; ++i) // complement; I forgot why...
- s[i] = s[i] > 3? 4 : 3 - s[i];
- sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
- return sai;
-}
-
-static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
-{
- uint64_t x = pos, z;
- int k, y = 0;
- *n_mm = *n_gaps = 0;
- for (k = 0; k < n_cigar; ++k) {
- int l = cigar[k]>>4;
- int op = cigar[k]&0xf;
- if (op == 0) { // match/mismatch
- for (z = 0; z < l && x + z < l_pac; ++z) {
- int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
- if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm);
- }
+ if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
+ cigar = malloc(4);
+ cigar[0] = l_query<<4 | 0;
+ *n_cigar = 1;
+ for (i = 0, *score = 0; i < l_query; ++i)
+ *score += mat[rseq[i]*5 + query[i]];
+ } else {
+ int w, max_gap, min_w;
+ //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
+ //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+ // set the band-width
+ max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
+ max_gap = max_gap > 1? max_gap : 1;
+ w = (max_gap + abs(rlen - l_query) + 1) >> 1;
+ w = w < w_? w : w_;
+ min_w = abs(rlen - l_query) + 3;
+ w = w > min_w? w : min_w;
+ // NW alignment
+ *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
+ }
+ {// compute NM
+ int k, x, y, n_mm = 0, n_gap = 0;
+ for (k = 0, x = y = 0; k < *n_cigar; ++k) {
+ int op = cigar[k]&0xf;
+ int len = cigar[k]>>4;
+ if (op == 0) { // match
+ for (i = 0; i < len; ++i)
+ if (query[x + i] != rseq[y + i]) ++n_mm;
+ x += len; y += len;
+ } else if (op == 1) x += len, n_gap += len;
+ else if (op == 2) y += len, n_gap += len;
}
- if (op == 1 || op == 2) (*n_gaps) += l;
- if (op == 0 || op == 2) x += l;
- if (op == 0 || op == 1 || op == 4) y += l;
+ *NM = n_mm + n_gap;
}
+ if (rb >= l_pac) // reverse back query
+ for (i = 0; i < l_query>>1; ++i)
+ tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+
+ret_gen_cigar:
+ free(rseq);
+ return cigar;
}
-void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln)
+int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
- extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
- extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
- int strand, seq_len, i, n_gap, n_mm;
- uint64_t pos3, pac_pos;
- uint8_t *s[2];
-
- memset(aln, 0, sizeof(bwa_aln_t));
- seq_len = strlen(seq);
- if (seq_len<<1 > buf->max_buf) {
- buf->max_buf = seq_len<<1;
- kroundup32(buf->max_buf);
- buf->buf = realloc(buf->buf, buf->max_buf);
+ int ib, ie, is_rev;
+ int64_t fb, fe, mid = -1;
+ if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary
+ *qb = *qe = *rb = *re = -1;
+ return -1; // unable to fix
+ } else {
+ fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
+ ib = bns_pos2rid(bns, fb);
+ if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
+ fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
+ ie = bns_pos2rid(bns, fe);
+ if (ie - ib > 1) { // bridge three or more references
+ *qb = *qe = *rb = *re = -1;
+ return -2; // unable to fix
+ } else {
+ int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
+ mid = is_rev? *re - l : *rb + l;
+ }
}
- s[0] = buf->buf;
- s[1] = s[0] + seq_len;
- for (i = 0; i < seq_len; ++i)
- s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]];
- seq_reverse(seq_len, s[1], 1);
- pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
- if (strand) aln->flag |= 16;
- if (n_gaps) { // only for gapped alignment
- int n_cigar;
- bwa_cigar_t *cigar16;
- cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
- aln->n_cigar = n_cigar;
- aln->cigar = malloc(n_cigar * 4);
- for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
- int op = cigar16[i]>>14;
- int len = cigar16[i]&0x3fff;
- if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
- aln->cigar[i] = len<<4 | op;
- if (op == 0 || op == 2) pos3 += len;
+ if (mid >= 0) {
+ int i, score, n_cigar, y, NM;
+ uint32_t *cigar;
+ int64_t x;
+ cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
+ for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
+ int op = cigar[i]&0xf, len = cigar[i]>>4;
+ if (op == 0) {
+ if (x <= mid && mid < x + len) {
+ if (mid - *rb > *re - mid) { // the first part is longer
+ if (x == mid) { // need to check the previous operation
+ assert(i); // mid != *rb should always stand
+ if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
+ else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
+ else abort(); // should not be here
+ } else *qe = y + (mid - x), *re = mid;
+ } else *qb = y + (mid - x), *rb = mid;
+ break;
+ } else x += len, y += len;
+ } else if (op == 1) { // insertion
+ y += len;
+ } else if (op == 2) { // deletion
+ if (x <= mid && mid < x + len) {
+ if (mid - *rb > *re - mid) *qe = y, *re = x;
+ else *qb = y, *rb = x + len;
+ break;
+ } else x += len;
+ } else abort(); // should not be here
}
- free(cigar16);
- } else { // ungapped
- aln->n_cigar = 1;
- aln->cigar = malloc(4);
- aln->cigar[0] = seq_len<<4 | 0;
- pos3 = pac_pos + seq_len;
+ free(cigar);
}
- aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id);
- aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset;
- if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence
- aln->flag |= 4; // read unmapped
- compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap);
- aln->n_mm = n_mm;
- aln->n_gap = n_gap;
+ return 1;
}
-/************************
- * Single-end alignment *
- ************************/
+/*********************
+ * Full index reader *
+ *********************/
-bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar)
+char *bwa_idx_infer_prefix(const char *hint)
{
- bwa_one_t *one;
- int best, cnt, i, seq_len;
-
- seq_len = strlen(seq);
- one = calloc(1, sizeof(bwa_one_t));
- one->sai = bwa_sai(idx, buf, seq);
- if (one->sai.n == 0) return one;
- // count number of hits; randomly select one alignment
- best = one->sai.sai[0].score;
- for (i = cnt = 0; i < one->sai.n; ++i) {
- bwa_sai1_t *p = &one->sai.sai[i];
- if (p->score > best) break;
- if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
- one->which = p;
- one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
+ char *prefix;
+ int l_hint;
+ FILE *fp;
+ l_hint = strlen(hint);
+ prefix = malloc(l_hint + 3 + 4 + 1);
+ strcpy(prefix, hint);
+ strcpy(prefix + l_hint, ".64.bwt");
+ if ((fp = fopen(prefix, "rb")) != 0) {
+ fclose(fp);
+ prefix[l_hint + 3] = 0;
+ return prefix;
+ } else {
+ strcpy(prefix + l_hint, ".bwt");
+ if ((fp = fopen(prefix, "rb")) == 0) {
+ free(prefix);
+ return 0;
+ } else {
+ fclose(fp);
+ prefix[l_hint] = 0;
+ return prefix;
}
- cnt += p->l - p->k + 1;
}
- one->c1 = cnt;
- for (; i < one->sai.n; ++i)
- cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1;
- one->c2 = cnt - one->c1;
- // estimate single-end mapping quality
- one->mapQs = -1;
- if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible?
- else if (one->c1 > 1) one->mapQs = 0;
- else {
- int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape;
- if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25;
- else if (one->c2 == 0) one->mapQs = 37;
+}
+
+bwt_t *bwa_idx_load_bwt(const char *hint)
+{
+ char *tmp, *prefix;
+ bwt_t *bwt;
+ prefix = bwa_idx_infer_prefix(hint);
+ if (prefix == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
+ return 0;
+ }
+ tmp = calloc(strlen(prefix) + 5, 1);
+ strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
+ bwt = bwt_restore_bwt(tmp);
+ strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
+ bwt_restore_sa(tmp, bwt);
+ free(tmp); free(prefix);
+ return bwt;
+}
+
+bwaidx_t *bwa_idx_load(const char *hint, int which)
+{
+ bwaidx_t *idx;
+ char *prefix;
+ prefix = bwa_idx_infer_prefix(hint);
+ if (prefix == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
+ return 0;
}
- if (one->mapQs < 0) {
- cnt = (one->c2 >= 255)? 255 : one->c2;
- one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt];
+ idx = calloc(1, sizeof(bwaidx_t));
+ if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
+ if (which & BWA_IDX_BNS) {
+ idx->bns = bns_restore(prefix);
+ if (which & BWA_IDX_PAC) {
+ idx->pac = calloc(idx->bns->l_pac/4+1, 1);
+ fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
+ fclose(idx->bns->fp_pac);
+ idx->bns->fp_pac = 0;
+ }
}
- one->mapQ = one->mapQs;
- // compute CIGAR on request
- one->one.ref_id = -1;
- if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one);
- return one;
+ free(prefix);
+ return idx;
}
-void bwa_one_destroy(bwa_one_t *one)
+void bwa_idx_destroy(bwaidx_t *idx)
{
- free(one->sai.sai);
- free(one->one.cigar);
- free(one);
+ if (idx == 0) return;
+ if (idx->bwt) bwt_destroy(idx->bwt);
+ if (idx->bns) bns_destroy(idx->bns);
+ if (idx->pac) free(idx->pac);
+ free(idx);
}
-/************************
- * Paired-end alignment *
- ************************/
+/***********************
+ * SAM header routines *
+ ***********************/
+
+void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
+{
+ int i;
+ for (i = 0; i < bns->n_seqs; ++i)
+ err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
+ if (rg_line) err_printf("%s\n", rg_line);
+}
-void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2])
+static char *bwa_escape(char *s)
{
+ char *p, *q;
+ for (p = q = s; *p; ++p) {
+ if (*p == '\\') {
+ ++p;
+ if (*p == 't') *q++ = '\t';
+ else if (*p == 'n') *q++ = '\n';
+ else if (*p == 'r') *q++ = '\r';
+ else if (*p == '\\') *q++ = '\\';
+ } else *q++ = *p;
+ }
+ *q = '\0';
+ return s;
}
-void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2])
+char *bwa_set_rg(const char *s)
{
+ char *p, *q, *r, *rg_line = 0;
+ memset(bwa_rg_id, 0, 256);
+ if (strstr(s, "@RG") != s) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
+ goto err_set_rg;
+ }
+ rg_line = strdup(s);
+ bwa_escape(rg_line);
+ if ((p = strstr(rg_line, "\tID:")) == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
+ goto err_set_rg;
+ }
+ p += 4;
+ for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
+ if (q - p + 1 > 256) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
+ goto err_set_rg;
+ }
+ for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
+ *r++ = *q;
+ return rg_line;
+
+err_set_rg:
+ free(rg_line);
+ return 0;
}
+
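Two of the routines added to bwa.c above are meant to be used together: bwa_set_rg() validates the @RG line given to the new mem -R option, un-escapes '\t' and records the ID in bwa_rg_id, and bwa_print_sam_hdr() then emits the @SQ lines followed by that @RG line. A hedged usage sketch; the wrapper function and the example read group are made up for illustration, and note the doubled backslashes needed in C source for what the shell would pass as '@RG\tID:foo\tSM:bar'.

    /* sketch: build a SAM header carrying a read-group line */
    #include <stdio.h>
    #include <stdlib.h>
    #include "bwa.h"

    int print_hdr_with_rg(const bntseq_t *bns)
    {
        char *rg_line = bwa_set_rg("@RG\\tID:foo\\tSM:bar"); /* as typed on the command line */
        if (rg_line == 0) return -1;          /* malformed @RG line; error already printed */
        bwa_print_sam_hdr(bns, rg_line);      /* @SQ lines, then the @RG line */
        fprintf(stderr, "[demo] read group ID: %s\n", bwa_rg_id); /* filled by bwa_set_rg() */
        free(rg_line);
        return 0;
    }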
diff --git a/bwa.h b/bwa.h
index e8172da..81d40e0 100644
--- a/bwa.h
+++ b/bwa.h
@@ -2,103 +2,45 @@
#define BWA_H_
#include <stdint.h>
+#include "bntseq.h"
+#include "bwt.h"
-#define BWA_DEF_MAX_SCORE 2048
-#define BWA_MAX_QUERY_LEN 1024
+#define BWA_IDX_BWT 0x1
+#define BWA_IDX_BNS 0x2
+#define BWA_IDX_PAC 0x4
+#define BWA_IDX_ALL 0x7
-// BWA index
-struct bwa_idx_t;
-typedef struct bwa_idx_t bwa_idx_t;
-
-// Buffer for BWA alignment
-struct bwa_buf_t;
-typedef struct bwa_buf_t bwa_buf_t;
-
-// BWA alignment options
-typedef struct {
- int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3
- int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
- int seed_len, max_seed_diff; // seed length and max differences allowed in the seed
- float fnr; // parameter for automatic length-adjusted max differences
-} bwa_opt_t;
-
-// default BWA alignment options
-extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
-
-// an interval hit in the SA coordinate; basic unit in .sai files
typedef struct {
- uint32_t n_mm:16, n_gapo:8, n_gape:8;
- int score;
- uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
-} bwa_sai1_t;
+ bwt_t *bwt; // FM-index
+ bntseq_t *bns; // information on the reference sequences
+ uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
+} bwaidx_t;
-// all interval hits in the SA coordinate
typedef struct {
- int n; // number of interval hits
- bwa_sai1_t *sai;
-} bwa_sai_t;
+ int l_seq;
+ char *name, *comment, *seq, *qual, *sam;
+} bseq1_t;
-// an alignment
-typedef struct {
- uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
- int32_t ref_id; // referece sequence index (the first seq is indexed by 0)
- uint32_t offset; // coordinate on the reference; zero-based
- uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag
- uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations
-} bwa_aln_t;
-
-typedef struct {
- int mapQs, mapQ, c1, c2;
- uint64_t sa;
- bwa_sai1_t *which;
- bwa_sai_t sai;
- bwa_aln_t one;
-} bwa_one_t;
-
-typedef struct {
- double avg, std, ap_prior;
- uint64_t low, high, high_bayesian;
-} bwa_pestat_t;
+extern int bwa_verbose;
+extern char bwa_rg_id[256];
#ifdef __cplusplus
extern "C" {
#endif
- // load a BWA index
- bwa_idx_t *bwa_idx_load(const char *prefix);
- void bwa_idx_destroy(bwa_idx_t *p);
-
- // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
- bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
- void bwa_buf_destroy(bwa_buf_t *p);
+ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
- /**
- * Find all the SA intervals
- *
- * @param idx BWA index; multiple threads can share the same index
- * @param buf BWA alignment buffer; each thread should have its own buffer
- * @param seq NULL terminated C string, consisting of A/C/G/T/N only
- *
- * @return SA intervals seq is matched to
- */
- bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
+ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
+ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
- /**
- * Construct an alignment in the base-pair coordinate
- *
- * @param idx BWA index
- * @param buf BWA alignment buffer
- * @param seq NULL terinated C string
- * @param sa Suffix array value
- * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape
- *
- * @return An alignment
- */
- void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln);
+ char *bwa_idx_infer_prefix(const char *hint);
+ bwt_t *bwa_idx_load_bwt(const char *hint);
- bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar);
+ bwaidx_t *bwa_idx_load(const char *hint, int which);
+ void bwa_idx_destroy(bwaidx_t *idx);
- void bwa_one_destroy(bwa_one_t *one);
+ void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
+ char *bwa_set_rg(const char *s);
#ifdef __cplusplus
}
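bseq_read(), declared above, is the batch FASTA/Q reader behind the new mem command: each call drains roughly chunk_size bases from one or two kseq streams and returns them as an array of bseq1_t. A hedged sketch of driving it directly; it uses KSEQ_DECLARE exactly as the new bwa.c does and therefore assumes the kseq instantiation already contained in libbwa.a (utils.o depends on kseq.h per the Makefile), and the 10 Mbp chunk size is an arbitrary illustrative value.

    /* sketch: count reads and bases in batches via bseq_read() */
    #include <stdio.h>
    #include <stdlib.h>
    #include <zlib.h>
    #include "bwa.h"
    #include "kseq.h"
    KSEQ_DECLARE(gzFile)

    int main(int argc, char *argv[])
    {
        gzFile fp;
        kseq_t *ks;
        long n_reads = 0, n_bases = 0;
        if (argc < 2) return 1;
        fp = gzopen(argv[1], "r");
        ks = kseq_init(fp);
        for (;;) {
            int i, n;
            bseq1_t *seqs = bseq_read(10000000, &n, ks, 0); /* single-end: no second stream */
            if (n == 0) { free(seqs); break; }
            for (i = 0; i < n; ++i) {
                n_reads += 1; n_bases += seqs[i].l_seq;
                free(seqs[i].name); free(seqs[i].comment);
                free(seqs[i].seq);  free(seqs[i].qual);
            }
            free(seqs);
        }
        kseq_destroy(ks);
        gzclose(fp);
        printf("%ld reads, %ld bases\n", n_reads, n_bases);
        return 0;
    }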
diff --git a/bwa.txt b/bwa.txt
new file mode 100644
index 0000000..d32ad96
--- /dev/null
+++ b/bwa.txt
@@ -0,0 +1,607 @@
+bwa(1) Bioinformatics tools bwa(1)
+
+
+
+NAME
+ bwa - Burrows-Wheeler Alignment Tool
+
+SYNOPSIS
+ bwa index ref.fa
+
+ bwa mem ref.fa reads.fq > aln-se.sam
+
+ bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
+
+ bwa aln ref.fa short_read.fq > aln_sa.sai
+
+ bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
+
+ bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
+
+ bwa bwasw ref.fa long_read.fq > aln.sam
+
+
+DESCRIPTION
+ BWA is a software package for mapping low-divergent sequences against a
+ large reference genome, such as the human genome. It consists of three
+ algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is
+ designed for Illumina sequence reads up to 100bp, while the other two are
+ for longer sequences ranging from 70bp to 1Mbp. BWA-MEM and BWA-SW share
+ similar features such as long-read support and split alignment, but
+ BWA-MEM, which is the latest, is generally recommended for high-quality
+ queries as it is faster and more accurate. BWA-MEM also has better
+ performance than BWA-backtrack for 70-100bp Illumina reads.
+
+ For all the algorithms, BWA first needs to construct the FM-index for
+ the reference genome (the index command). Alignment algorithms are
+ invoked with different sub-commands: aln/samse/sampe for BWA-backtrack,
+ bwasw for BWA-SW and mem for the BWA-MEM algorithm.
+
+
+COMMANDS AND OPTIONS
+ index bwa index [-p prefix] [-a algoType] <in.db.fasta>
+
+ Index database sequences in the FASTA format.
+
+ OPTIONS:
+
+ -p STR Prefix of the output database [same as db filename]
+
+ -a STR Algorithm for constructing BWT index. Available
+ options are:
+
+ is IS linear-time algorithm for constructing suf-
+ fix array. It requires 5.37N memory where N is
+ the size of the database. IS is moderately
+ fast, but does not work with database larger
+ than 2GB. IS is the default algorithm due to
+ its simplicity. The current codes for IS algo-
+ rithm are reimplemented by Yuta Mori.
+
+ bwtsw Algorithm implemented in BWT-SW. This method
+ works with the whole human genome.
+
+
+ mem bwa mem [-aCHMpP] [-t nThreads] [-k minSeedLen] [-w bandWidth]
+ [-r seedSplitRatio] [-c maxOcc] [-A matchScore] [-B mmPenalty]
+ [-O gapOpenPen] [-E gapExtPen] [-L clipPen] [-U unpairPen] [-R
+ RGline] [-v verboseLevel] db.prefix reads.fq [mates.fq]
+
+ Align 70bp-1Mbp query sequences with the BWA-MEM algorithm.
+ Briefly, the algorithm works by seeding alignments with maximal
+ exact matches (MEMs) and then extending seeds with the affine-
+ gap Smith-Waterman algorithm (SW).
+
+ If mates.fq file is absent and option -p is not set, this com-
+ mand regards input reads as single-end. If mates.fq is present,
+ this command assumes the i-th read in reads.fq and the i-th read
+ in mates.fq constitute a read pair. If -p is used, the command
+ assumes the 2i-th and the (2i+1)-th read in reads.fq constitute
+ a read pair (such an input file is said to be interleaved). In this
+ case, mates.fq is ignored. In the paired-end mode, the mem com-
+ mand will infer the read orientation and the insert size distri-
+ bution from a batch of reads.
+
+ The BWA-MEM algorithm performs local alignment. It may produce
+ multiple primary alignments for different parts of a query
+ sequence. This is a crucial feature for long sequences. However,
+ some tools such as Picard's markDuplicates do not work with
+ split alignments. One may consider using option -M to flag
+ shorter split hits as secondary.
+
+ OPTIONS:
+
+ -t INT Number of threads [1]
+
+ -k INT Minimum seed length. Matches shorter than INT will be
+ missed. The alignment speed is usually insensitive to
+ this value unless it significantly deviates from 20. [19]
+
+ -w INT Band width. Essentially, gaps longer than INT will not
+ be found. Note that the maximum gap length is also
+ affected by the scoring matrix and the hit length, not
+ solely determined by this option. [100]
+
+ -r FLOAT Trigger re-seeding for a MEM longer than min-
+ SeedLen*FLOAT. This is a key heuristic parameter for
+ tuning the performance. Larger value yields fewer
+ seeds, which leads to faster alignment speed but lower
+ accuracy. [1.5]
+
+ -c INT Discard a MEM if it has more than INT occurrences in the
+ genome. This is an insensitive parameter. [10000]
+
+ -P In the paired-end mode, perform SW to rescue missing
+ hits only but do not try to find hits that fit a
+ proper pair.
+
+ -A INT Matching score. [1]
+
+ -B INT Mismatch penalty. The sequence error rate is approxi-
+ mately: {.75 * exp[-log(4) * B/A]}. [4]
+
+ -O INT Gap open penalty. [6]
+
+ -E INT Gap extension penalty. A gap of length k costs O + k*E
+ (i.e. -O is for opening a zero-length gap). [1]
+
+ -L INT Clipping penalty. When performing SW extension, BWA-
+ MEM keeps track of the best score reaching the end of
+ query. If this score is larger than the best SW score
+ minus the clipping penalty, clipping will not be
+ applied. Note that in this case, the SAM AS tag
+ reports the best SW score; clipping penalty is not
+ deducted. [5]
+
+ -U INT Penalty for an unpaired read pair. BWA-MEM scores an
+ unpaired read pair as scoreRead1+scoreRead2-INT and
+ scores a paired one as scoreRead1+scoreRead2-insert-
+ Penalty. It compares these two scores to determine
+ whether we should force pairing. [9]
+
+ -p Assume the first input query file is interleaved
+ paired-end FASTA/Q. See the command description for
+ details.
+
+ -R STR Complete read group header line. '\t' can be used in
+ STR and will be converted to a TAB in the output SAM.
+ The read group ID will be attached to every read in
+ the output. An example is '@RG\tID:foo\tSM:bar'.
+ [null]
+
+ -a Output all found alignments for single-end or unpaired
+ paired-end reads. These alignments will be flagged as
+ secondary alignments.
+
+ -C Append the FASTA/Q comment to the SAM output. This
+ option can be used to transfer read meta information
+ (e.g. barcode) to the SAM output. Note that the
+ FASTA/Q comment (the string after a space in the
+ header line) must conform to the SAM spec (e.g.
+ BC:Z:CGTAC). Malformed comments lead to incorrect
+ SAM output.
+
+ -H Use hard clipping 'H' in the SAM output. This option
+ may dramatically reduce the redundancy of output when
+ mapping long contig or BAC sequences.
+
+ -M Mark shorter split hits as secondary (for Picard com-
+ patibility).
+
+ -v INT Control the verbosity of the output. This option
+ has not been fully supported throughout BWA. Ideally,
+ a value of 0 disables all output to stderr; 1
+ outputs errors only; 2 warnings and errors;
+ 3 all normal messages; and 4 or higher enables debugging.
+ When this option takes the value 4, the output is not SAM.
+ [3]
+
+
+ aln bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
+ nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M
+ misMsc] [-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta>
+ <in.query.fq> > <out.sai>
+
+ Find the SA coordinates of the input reads. Maximum maxSeedDiff
+ differences are allowed in the first seedLen subsequence and
+ maximum maxDiff differences are allowed in the whole sequence.
+
+ OPTIONS:
+
+ -n NUM Maximum edit distance if the value is INT, or the
+ fraction of missing alignments given 2% uniform base
+ error rate if FLOAT. In the latter case, the maximum
+ edit distance is automatically chosen for different
+ read lengths. [0.04]
+
+ -o INT Maximum number of gap opens [1]
+
+ -e INT Maximum number of gap extensions, -1 for k-difference
+ mode (disallowing long gaps) [-1]
+
+ -d INT Disallow a long deletion within INT bp towards the
+ 3'-end [16]
+
+ -i INT Disallow an indel within INT bp towards the ends [5]
+
+ -l INT Take the first INT subsequence as seed. If INT is
+ larger than the query sequence, seeding will be dis-
+ abled. For long reads, this option typically ranges
+ from 25 to 35 for `-k 2'. [inf]
+
+ -k INT Maximum edit distance in the seed [2]
+
+ -t INT Number of threads (multi-threading mode) [1]
+
+ -M INT Mismatch penalty. BWA will not search for suboptimal
+ hits with a score lower than (bestScore-misMsc). [3]
+
+ -O INT Gap open penalty [11]
+
+ -E INT Gap extension penalty [4]
+
+ -R INT Proceed with suboptimal alignments if there are no
+ more than INT equally best hits. This option only
+ affects paired-end mapping. Increasing this threshold
+ helps to improve the pairing accuracy at the cost of
+ speed, especially for short reads (~32bp).
+
+ -c Reverse query but not complement it, which is required
+ for alignment in the color space. (Disabled since
+ 0.6.x)
+
+ -N Disable iterative search. All hits with no more than
+ maxDiff differences will be found. This mode is much
+ slower than the default.
+
+ -q INT Parameter for read trimming. BWA trims a read down to
+ argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l
+ is the original read length. [0]
+
+ -I The input is in the Illumina 1.3+ read format (quality
+ equals ASCII-64).
+
+ -B INT Length of barcode starting from the 5'-end. When INT
+ is positive, the barcode of each read will be trimmed
+ before mapping and will be written at the BC SAM tag.
+ For paired-end reads, the barcode from both ends are
+ concatenated. [0]
+
+ -b Specify that the input read sequence file is in the BAM for-
+ mat. For paired-end data, two ends in a pair must be
+ grouped together and options -1 or -2 are usually
+ applied to specify which end should be mapped. Typical
+ command lines for mapping pair-end data in the BAM
+ format are:
+
+ bwa aln ref.fa -b1 reads.bam > 1.sai
+ bwa aln ref.fa -b2 reads.bam > 2.sai
+ bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam >
+ aln.sam
+
+ -0 When -b is specified, only use single-end reads in
+ mapping.
+
+ -1 When -b is specified, only use the first read in a
+ read pair in mapping (skip single-end reads and the
+ second reads).
+
+ -2 When -b is specified, only use the second read in a
+ read pair in mapping.
+
+
+ samse bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
+
+ Generate alignments in the SAM format given single-end reads.
+ Repetitive hits will be randomly chosen.
+
+ OPTIONS:
+
+ -n INT Maximum number of alignments to output in the XA tag
+ for reads paired properly. If a read has more than INT
+ hits, the XA tag will not be written. [3]
+
+ -r STR Specify the read group in a format like
+ `@RG\tID:foo\tSM:bar'. [null]
+
+
+ sampe bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N max-
+ HitDis] [-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq>
+ > <out.sam>
+
+ Generate alignments in the SAM format given paired-end reads.
+ Repetitive read pairs will be placed randomly.
+
+ OPTIONS:
+
+ -a INT Maximum insert size for a read pair to be considered
+ being mapped properly. Since 0.4.5, this option is only
+ used when there are not enough good alignments to infer
+ the distribution of insert sizes. [500]
+
+ -o INT Maximum occurrences of a read for pairing. A read with
+ more occurrences will be treated as a single-end read.
+ Reducing this parameter helps faster pairing. [100000]
+
+ -P Load the entire FM-index into memory to reduce disk
+ operations (base-space reads only). With this option, at
+ least 1.25N bytes of memory are required, where N is the
+ length of the genome.
+
+ -n INT Maximum number of alignments to output in the XA tag for
+ reads paired properly. If a read has more than INT hits,
+ the XA tag will not be written. [3]
+
+ -N INT Maximum number of alignments to output in the XA tag for
+ discordant read pairs (excluding singletons). If a
+ read has more than INT hits, the XA tag will not be
+ written. [10]
+
+ -r STR Specify the read group in a format like
+ `@RG\tID:foo\tSM:bar'. [null]
+
+
+ bwasw bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r
+ gapExtPen] [-t nThreads] [-w bandWidth] [-T thres] [-s hspIntv]
+ [-z zBest] [-N nHspRev] [-c thresCoef] <in.db.fasta> <in.fq>
+ [mate.fq]
+
+ Align query sequences in the in.fq file. When mate.fq is
+ present, perform paired-end alignment. The paired-end mode only
+ works for reads from Illumina short-insert libraries. In the paired-
+ end mode, BWA-SW may still output split alignments but they are
+ all marked as not properly paired; the mate positions will not
+ be written if the mate has multiple local hits.
+
+ OPTIONS:
+
+ -a INT Score of a match [1]
+
+ -b INT Mismatch penalty [3]
+
+ -q INT Gap open penalty [5]
+
+ -r INT Gap extension penalty. The penalty for a contiguous
+ gap of size k is q+k*r. [2]
+
+ -t INT Number of threads in the multi-threading mode [1]
+
+ -w INT Band width in the banded alignment [33]
+
+ -T INT Minimum score threshold divided by a [37]
+
+ -c FLOAT Coefficient for threshold adjustment according to
+ query length. Given an l-long query, the threshold for
+ a hit to be retained is a*max{T,c*log(l)}. [5.5]
+
+ -z INT Z-best heuristics. Higher -z increases accuracy at the
+ cost of speed. [1]
+
+ -s INT Maximum SA interval size for initiating a seed. Higher
+ -s increases accuracy at the cost of speed. [3]
+
+ -N INT Minimum number of seeds supporting the resultant
+ alignment to skip reverse alignment. [5]
+
+
+SAM ALIGNMENT FORMAT
+ The output of the `aln' command is binary and designed for BWA use
+ only. BWA outputs the final alignment in the SAM (Sequence Align-
+ ment/Map) format. Each line consists of:
+
+
+ +----+-------+----------------------------------------------------------+
+ |Col | Field | Description |
+ +----+-------+----------------------------------------------------------+
+ | 1 | QNAME | Query (pair) NAME |
+ | 2 | FLAG | bitwise FLAG |
+ | 3 | RNAME | Reference sequence NAME |
+ | 4 | POS | 1-based leftmost POSition/coordinate of clipped sequence |
+ | 5 | MAPQ | MAPping Quality (Phred-scaled) |
+ | 6 | CIGAR | extended CIGAR string |
+ | 7 | MRNM | Mate Reference sequence NaMe (`=' if same as RNAME) |
+ | 8 | MPOS | 1-based Mate POSition |
+ | 9 | ISIZE | Inferred insert SIZE |
+ |10 | SEQ | query SEQuence on the same strand as the reference |
+ |11 | QUAL | query QUALity (ASCII-33 gives the Phred base quality) |
+ |12 | OPT | variable OPTional fields in the format TAG:VTYPE:VALUE |
+ +----+-------+----------------------------------------------------------+
+
+ Each bit in the FLAG field is defined as:
+
+
+ +----+--------+---------------------------------------+
+ |Chr | Flag | Description |
+ +----+--------+---------------------------------------+
+ | p | 0x0001 | the read is paired in sequencing |
+ | P | 0x0002 | the read is mapped in a proper pair |
+ | u | 0x0004 | the query sequence itself is unmapped |
+ | U | 0x0008 | the mate is unmapped |
+ | r | 0x0010 | strand of the query (1 for reverse) |
+ | R | 0x0020 | strand of the mate |
+ | 1 | 0x0040 | the read is the first read in a pair |
+ | 2 | 0x0080 | the read is the second read in a pair |
+ | s | 0x0100 | the alignment is not primary |
+ | f | 0x0200 | QC failure |
+ | d | 0x0400 | optical or PCR duplicate |
+ +----+--------+---------------------------------------+
+
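+       For example, a downstream program might test these bits directly; the
+       following is an illustrative C fragment (not part of BWA), where flag
+       holds the value from column 2 of a SAM line:
+
+           #include <stdio.h>
+           int main(void)
+           {
+               int flag = 77; /* 0x1|0x4|0x8|0x40: paired, query unmapped, mate unmapped, first in pair */
+               if (flag & 0x4)  puts("query unmapped");
+               if (flag & 0x10) puts("query on the reverse strand");
+               if ((flag & 0x1) && (flag & 0x8)) puts("paired, but the mate is unmapped");
+               return 0;
+           }
+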
+       Please check <http://samtools.sourceforge.net> for the format
+       specification and the tools for post-processing the alignment.
+
+ BWA generates the following optional fields. Tags starting with `X' are
+ specific to BWA.
+
+
+ +----+------------------------------------------------+
+ |Tag | Meaning |
+ +----+------------------------------------------------+
+ |NM | Edit distance |
+ |MD | Mismatching positions/bases |
+ |AS | Alignment score |
+ |BC | Barcode sequence |
+ +----+------------------------------------------------+
+ |X0 | Number of best hits |
+ |X1 | Number of suboptimal hits found by BWA |
+        |XN  | Number of ambiguous bases in the reference     |
+ |XM | Number of mismatches in the alignment |
+ |XO | Number of gap opens |
+        |XG  | Number of gap extensions                       |
+ |XT | Type: Unique/Repeat/N/Mate-sw |
+ |XA | Alternative hits; format: (chr,pos,CIGAR,NM;)* |
+ +----+------------------------------------------------+
+ |XS | Suboptimal alignment score |
+ |XF | Support from forward/reverse alignment |
+ |XE | Number of supporting seeds |
+ +----+------------------------------------------------+
+
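+       For example, a hypothetical read with two alternative hits might carry
+       a tag like `XA:Z:chr3,+123456,100M,1;chr7,-987654,95M5S,2;', one
+       chr,pos,CIGAR,NM record per hit, each terminated by a semicolon (the
+       strand is shown as a sign on the position).
+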
+       Note that XO and XG are generated by the BWT search, while the CIGAR
+       string is generated by Smith-Waterman alignment. These two tags may
+       therefore be inconsistent with the CIGAR string. This is not a bug.
+
+
+NOTES ON SHORT-READ ALIGNMENT
+ Alignment Accuracy
+       When seeding is disabled, BWA guarantees to find an alignment contain-
+       ing at most maxDiff differences, including maxGapO gap opens, none of
+       which occur within nIndelEnd bp of either end of the query. Longer
+       gaps may be found if maxGapE is positive, but BWA is not guaranteed to
+       find all such hits. When seeding is enabled, BWA further requires that
+       the first seedLen bases of the query contain no more than maxSeedDiff
+       differences.
+
+       When gapped alignment is disabled, BWA is expected to generate the same
+       alignment as Eland version 1, the Illumina alignment program. However,
+       as BWA changes `N' in the database sequence to random nucleotides, hits
+       to these random sequences will also be counted. As a consequence, BWA
+       may mark a unique hit as a repeat if the random sequences happen to be
+       identical to sequences that should be unique in the database.
+
+       By default, if the best hit is not highly repetitive (controlled by
+       -R), BWA also finds all hits containing one more mismatch; otherwise,
+       BWA finds only the equally best hits. Base quality is NOT considered in
+       evaluating hits. In the paired-end mode, BWA pairs all hits it finds.
+       It further performs Smith-Waterman alignment for unmapped reads to res-
+       cue reads with a high error rate, and for high-quality anomalous pairs
+       to fix potential alignment errors.
+
+
+ Estimating Insert Size Distribution
+       BWA estimates the insert size distribution per 256*1024 read pairs. It
+       first collects pairs of reads with both ends mapped with a single-end
+       mapping quality of 20 or higher and then calculates the median (Q2) and
+       the lower and upper quartiles (Q1 and Q3). It estimates the mean and
+       the variance of the insert size distribution from pairs whose insert
+       sizes are within the interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum
+       distance x for a pair to be considered properly paired (SAM flag 0x2)
+       is calculated by solving the equation Phi((x-mu)/sigma)=x/L*p0, where
+       mu is the mean, sigma is the standard deviation of the insert size
+       distribution, L is the length of the genome, p0 is the prior of an
+       anomalous pair and Phi() is the standard normal cumulative distribution
+       function. For mapping Illumina short-insert reads to the human genome,
+       x is about 6-7 sigma away from the mean. Quartiles, mean, variance and
+       x will be printed to the standard error output.
+
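+       The following is a simplified sketch in C of the estimate described
+       above; it is illustrative only and is not BWA's actual implementation
+       (which also handles the four read-pair orientations separately). It
+       assumes the insert sizes of confidently mapped pairs have already been
+       collected into an array:
+
+           #include <math.h>
+           #include <stdlib.h>
+
+           static int cmp_int(const void *a, const void *b)
+           { return *(const int*)a - *(const int*)b; }
+
+           /* estimate mean/std of the insert size from n collected insert sizes */
+           void infer_isize(int *isize, int n, double *mean, double *std)
+           {
+               int i, cnt = 0, q1, q3;
+               double sum = 0., sum2 = 0., low, high;
+               qsort(isize, n, sizeof(int), cmp_int);
+               q1 = isize[n/4]; q3 = isize[3*n/4];   /* lower and upper quartiles */
+               low  = q1 - 2. * (q3 - q1);           /* [Q1-2(Q3-Q1), Q3+2(Q3-Q1)] */
+               high = q3 + 2. * (q3 - q1);
+               for (i = 0; i < n; ++i)
+                   if (isize[i] >= low && isize[i] <= high)
+                       sum += isize[i], sum2 += (double)isize[i] * isize[i], ++cnt;
+               *mean = sum / cnt;
+               *std  = sqrt(sum2 / cnt - *mean * *mean); /* square root of the variance */
+           }
+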
+
+ Memory Requirement
+       With the bwtsw algorithm, 5GB of memory is required to index the com-
+       plete human genome. For short reads, the aln command uses ~3.2GB of
+       memory and the sampe command uses ~5.4GB.
+
+
+ Speed
+       Indexing the human genome takes 3 hours with the bwtsw algorithm.
+       Indexing smaller genomes with the IS algorithm is faster, but requires
+       more memory.
+
+       The speed of alignment is largely determined by the error rate of the
+       query sequences (r). Firstly, BWA runs much faster for near-perfect
+       hits than for hits with many differences, and it stops searching for a
+       hit with l+2 differences if an l-difference hit is found. This means
+       BWA will be very slow if r is high, because in this case BWA has to
+       visit hits with many differences and looking for these hits is expen-
+       sive. Secondly, the underlying alignment algorithm makes the speed
+       sensitive to [k log(N)/m], where k is the maximum number of allowed
+       differences, N the size of the database and m the length of a query. In
+       practice, we choose k w.r.t. r and therefore r is the leading factor. I
+       would not recommend using BWA on data with r>0.02.
+
+ Pairing is slower for shorter reads. This is mainly because shorter
+ reads have more spurious hits and converting SA coordinates to chromo-
+       somal coordinates is very costly.
+
+
+CHANGES IN BWA-0.6
+ Since version 0.6, BWA has been able to work with a reference genome
+ longer than 4GB. This feature makes it possible to integrate the for-
+ ward and reverse complemented genome in one FM-index, which speeds up
+ both BWA-short and BWA-SW. As a tradeoff, BWA uses more memory because
+       it has to keep all positions and ranks in 64-bit integers, twice as
+       large as the 32-bit integers used in the previous versions.
+
+ The latest BWA-SW also works for paired-end reads longer than 100bp. In
+ comparison to BWA-short, BWA-SW tends to be more accurate for highly
+       unique reads and more robust to relatively long INDELs and structural
+ variants. Nonetheless, BWA-short usually has higher power to distin-
+ guish the optimal hit from many suboptimal hits. The choice of the map-
+ ping algorithm may depend on the application.
+
+
+SEE ALSO
+ BWA website <http://bio-bwa.sourceforge.net>, Samtools website
+ <http://samtools.sourceforge.net>
+
+
+AUTHOR
+       Heng Li at the Sanger Institute wrote the key source code and inte-
+       grated the following code for BWT construction: bwtsw
+       <http://i.cs.hku.hk/~ckwong3/bwtsw/>, implemented by Chi-Kwong Wong at
+       the University of Hong Kong, and IS
+       <http://yuta.256.googlepages.com/sais>, originally proposed by Nong Ge
+       <http://www.cs.sysu.edu.cn/nong/> at Sun Yat-Sen University and
+       implemented by Yuta Mori.
+
+
+LICENSE AND CITATION
+       The full BWA package is distributed under GPLv3 as it uses source code
+       from BWT-SW, which is covered by GPL. The sorting, hash table, BWT and
+       IS libraries are distributed under the MIT license.
+
+ If you use the BWA-backtrack algorithm, please cite the following
+ paper:
+
+ Li H. and Durbin R. (2009) Fast and accurate short read alignment with
+ Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID:
+ 19451168]
+
+ If you use the BWA-SW algorithm, please cite:
+
+ Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
+ Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID:
+ 20080505]
+
+ If you use the fastmap component of BWA, please cite:
+
+ Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-
+ genome de novo assembly. Bioinformatics, 28, 1838-1844. [PMID:
+ 22569178]
+
+ The BWA-MEM algorithm has not been published yet.
+
+
+HISTORY
+       BWA is largely influenced by BWT-SW. It uses source code from BWT-SW
+       and mimics its binary file formats; BWA-SW resembles BWT-SW in several
+       ways. The initial idea about BWT-based alignment also came from the
+       group who developed BWT-SW. At the same time, BWA is different enough
+       from BWT-SW. The short-read alignment algorithm bears no similarity to
+       the Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW,
+       it introduces heuristics that can hardly be applied to the original
+       algorithm. In all, BWA does not guarantee to find all local hits as
+       BWT-SW is designed to do, but it is much faster than BWT-SW on both
+       short and long query sequences.
+
+       I started to write the first piece of code on 24 May 2008 and got the
+       initial stable version on 02 June 2008. During this period, I learned
+       that Professor Tak-Wah Lam, the first author of the BWT-SW paper, was
+       collaborating with Beijing Genomics Institute on SOAP2, the successor
+       to SOAP (Short Oligonucleotide Analysis Package). SOAP2 came out in
+       November 2008. According to the SourceForge download page, the third
+       BWT-based short-read aligner, Bowtie, was first released in August
+       2008. At the time of writing this manual, at least three more
+       BWT-based short-read aligners are being implemented.
+
+ The BWA-SW algorithm is a new component of BWA. It was conceived in
+ November 2008 and implemented ten months later.
+
+ The BWA-MEM algorithm is based on an algorithm finding super-maximal
+ exact matches (SMEMs), which was first published with the fermi assem-
+ bler paper in 2012. I first implemented the basic SMEM algorithm in the
+ fastmap command for an experiment and then extended the basic algorithm
+       and added the extension part in February 2013 to make BWA-MEM a fully
+ featured mapper.
+
+
+
+
+bwa-0.7.0                      27 February 2013                         bwa(1)
diff --git a/bwamem.c b/bwamem.c
new file mode 100644
index 0000000..52dc7fb
--- /dev/null
+++ b/bwamem.c
@@ -0,0 +1,848 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+
+#include "kstring.h"
+#include "bwamem.h"
+#include "bntseq.h"
+#include "ksw.h"
+#include "kvec.h"
+#include "ksort.h"
+#include "utils.h"
+
+/* Theory on probability and scoring *ungapped* alignment
+ *
+ * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution
+ * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate
+ *
+ * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x
+ *
+ * If the matching score is x and mismatch penalty is -y, we can compute error rate e:
+ * e = .75 * exp[-log(4) * y/x]
+ *
+ * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)}
+ * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)
+ *
+ * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale:
+ * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)
+ *
+ *
+ * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)
+ * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)
+ *
+ * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)
+ */
+
+mem_opt_t *mem_opt_init()
+{
+ mem_opt_t *o;
+ o = calloc(1, sizeof(mem_opt_t));
+ o->flag = 0;
+ o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
+ o->pen_unpaired = 9;
+ o->pen_clip = 5;
+ o->min_seed_len = 19;
+ o->split_width = 10;
+ o->max_occ = 10000;
+ o->max_chain_gap = 10000;
+ o->max_ins = 10000;
+ o->mask_level = 0.50;
+ o->chain_drop_ratio = 0.50;
+ o->split_factor = 1.5;
+ o->chunk_size = 10000000;
+ o->n_threads = 1;
+ o->max_matesw = 100;
+ mem_fill_scmat(o->a, o->b, o->mat);
+ return o;
+}
+
+void mem_fill_scmat(int a, int b, int8_t mat[25])
+{
+ int i, j, k;
+ for (i = k = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ mat[k++] = i == j? a : -b;
+ mat[k++] = 0; // ambiguous base
+ }
+ for (j = 0; j < 5; ++j) mat[k++] = 0;
+}
+
+/***************************
+ * SMEM iterator interface *
+ ***************************/
+
+struct __smem_i {
+ const bwt_t *bwt;
+ const uint8_t *query;
+ int start, len;
+ bwtintv_v *matches; // matches; to be returned by smem_next()
+ bwtintv_v *sub; // sub-matches inside the longest match; temporary
+ bwtintv_v *tmpvec[2]; // temporary arrays
+};
+
+smem_i *smem_itr_init(const bwt_t *bwt)
+{
+ smem_i *itr;
+ itr = calloc(1, sizeof(smem_i));
+ itr->bwt = bwt;
+ itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
+ itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
+ itr->matches = calloc(1, sizeof(bwtintv_v));
+ itr->sub = calloc(1, sizeof(bwtintv_v));
+ return itr;
+}
+
+void smem_itr_destroy(smem_i *itr)
+{
+ free(itr->tmpvec[0]->a); free(itr->tmpvec[0]);
+ free(itr->tmpvec[1]->a); free(itr->tmpvec[1]);
+ free(itr->matches->a); free(itr->matches);
+ free(itr->sub->a); free(itr->sub);
+ free(itr);
+}
+
+void smem_set_query(smem_i *itr, int len, const uint8_t *query)
+{
+ itr->query = query;
+ itr->start = 0;
+ itr->len = len;
+}
+
+const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
+{
+ int i, max, max_i, ori_start;
+ itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
+ if (itr->start >= itr->len || itr->start < 0) return 0;
+ while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
+ if (itr->start == itr->len) return 0;
+ ori_start = itr->start;
+ itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
+ if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
+ for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
+ bwtintv_t *p = &itr->matches->a[i];
+ int len = (uint32_t)p->info - (p->info>>32);
+ if (max < len) max = len, max_i = i;
+ }
+ if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long
+ int j;
+ bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging
+ bwtintv_t *p = &itr->matches->a[max_i];
+ bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
+ i = j = 0; a->n = 0;
+ while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
+ int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info);
+ int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info);
+ if (xi < xj) {
+ kv_push(bwtintv_t, *a, itr->matches->a[i]);
+ ++i;
+ } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) {
+ kv_push(bwtintv_t, *a, itr->sub->a[j]);
+ ++j;
+ } else ++j;
+ }
+ for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]);
+ for (; j < itr->sub->n; ++j)
+ if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start)
+ kv_push(bwtintv_t, *a, itr->sub->a[j]);
+ kv_copy(bwtintv_t, *itr->matches, *a);
+ }
+ return itr->matches;
+}
+
+/********************************
+ * Chaining while finding SMEMs *
+ ********************************/
+
+typedef struct {
+ int64_t rbeg;
+ int32_t qbeg, len;
+} mem_seed_t;
+
+typedef struct {
+ int n, m;
+ int64_t pos;
+ mem_seed_t *seeds;
+} mem_chain_t;
+
+typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
+
+#include "kbtree.h"
+
+#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
+KBTREE_INIT(chn, mem_chain_t, chain_cmp)
+
+static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
+{
+ int64_t qend, rend, x, y;
+ const mem_seed_t *last = &c->seeds[c->n-1];
+ qend = last->qbeg + last->len;
+ rend = last->rbeg + last->len;
+ if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
+ return 1; // contained seed; do nothing
+	x = p->qbeg - last->qbeg; // always non-negative
+ y = p->rbeg - last->rbeg;
+ if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
+ if (c->n == c->m) {
+ c->m <<= 1;
+ c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
+ }
+ c->seeds[c->n++] = *p;
+ return 1;
+ }
+ return 0; // request to add a new chain
+}
+
+static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
+{
+ const bwtintv_v *a;
+ int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
+ split_len = split_len < itr->len? split_len : itr->len;
+ while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
+ int i;
+ for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
+ bwtintv_t *p = &a->a[i];
+ int slen = (uint32_t)p->info - (p->info>>32); // seed length
+ int64_t k;
+ if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
+ for (k = 0; k < p->x[2]; ++k) {
+ mem_chain_t tmp, *lower, *upper;
+ mem_seed_t s;
+ int to_add = 0;
+ s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
+ s.qbeg = p->info>>32;
+ s.len = slen;
+ if (kb_size(tree)) {
+ kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
+ if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1;
+ } else to_add = 1;
+ if (to_add) { // add the seed as a new chain
+ tmp.n = 1; tmp.m = 4;
+ tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
+ tmp.seeds[0] = s;
+ kb_putp(chn, tree, &tmp);
+ }
+ }
+ }
+ }
+}
+
+void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
+{
+ int i, j;
+ for (i = 0; i < chn->n; ++i) {
+ mem_chain_t *p = &chn->a[i];
+ printf("%d", p->n);
+ for (j = 0; j < p->n; ++j) {
+ bwtint_t pos;
+ int is_rev, ref_id;
+ pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
+ if (is_rev) pos -= p->seeds[j].len - 1;
+ bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id);
+ printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
+ }
+ putchar('\n');
+ }
+}
+
+mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
+{
+ mem_chain_v chain;
+ smem_i *itr;
+ kbtree_t(chn) *tree;
+
+ kv_init(chain);
+ if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
+ tree = kb_init(chn, KB_DEFAULT_SIZE);
+ itr = smem_itr_init(bwt);
+ smem_set_query(itr, len, seq);
+ mem_insert_seed(opt, tree, itr);
+
+ kv_resize(mem_chain_t, chain, kb_size(tree));
+
+ #define traverse_func(p_) (chain.a[chain.n++] = *(p_))
+ __kb_traverse(mem_chain_t, tree, traverse_func);
+ #undef traverse_func
+
+ smem_itr_destroy(itr);
+ kb_destroy(chn, tree);
+ return chain;
+}
+
+/********************
+ * Filtering chains *
+ ********************/
+
+typedef struct {
+ int beg, end, w;
+ void *p, *p2;
+} flt_aux_t;
+
+#define flt_lt(a, b) ((a).w > (b).w)
+KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
+
+int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
+{
+ flt_aux_t *a;
+ int i, j, n;
+ if (n_chn <= 1) return n_chn; // no need to filter
+ a = malloc(sizeof(flt_aux_t) * n_chn);
+ for (i = 0; i < n_chn; ++i) {
+ mem_chain_t *c = &chains[i];
+ int64_t end;
+ int w = 0, tmp;
+ for (j = 0, end = 0; j < c->n; ++j) {
+ const mem_seed_t *s = &c->seeds[j];
+ if (s->qbeg >= end) w += s->len;
+ else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
+ end = end > s->qbeg + s->len? end : s->qbeg + s->len;
+ }
+ tmp = w;
+ for (j = 0, end = 0; j < c->n; ++j) {
+ const mem_seed_t *s = &c->seeds[j];
+ if (s->rbeg >= end) w += s->len;
+ else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
+ end = end > s->qbeg + s->len? end : s->qbeg + s->len;
+ }
+ w = w < tmp? w : tmp;
+ a[i].beg = c->seeds[0].qbeg;
+ a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len;
+ a[i].w = w; a[i].p = c; a[i].p2 = 0;
+ }
+ ks_introsort(mem_flt, n_chn, a);
+ { // reorder chains such that the best chain appears first
+ mem_chain_t *swap;
+ swap = malloc(sizeof(mem_chain_t) * n_chn);
+ for (i = 0; i < n_chn; ++i) {
+ swap[i] = *((mem_chain_t*)a[i].p);
+ a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
+ }
+ memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
+ free(swap);
+ }
+ for (i = 1, n = 1; i < n_chn; ++i) {
+ for (j = 0; j < n; ++j) {
+ int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg;
+ int e_min = a[j].end < a[i].end? a[j].end : a[i].end;
+ if (e_min > b_max) { // have overlap
+ int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg;
+ if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
+ if (a[j].p2 == 0) a[j].p2 = a[i].p;
+ if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
+ break;
+ }
+ }
+ }
+ if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
+ }
+ for (i = 0; i < n; ++i) { // mark chains to be kept
+ mem_chain_t *c = (mem_chain_t*)a[i].p;
+ if (c->n > 0) c->n = -c->n;
+ c = (mem_chain_t*)a[i].p2;
+ if (c && c->n > 0) c->n = -c->n;
+ }
+ free(a);
+ for (i = 0; i < n_chn; ++i) { // free discarded chains
+ mem_chain_t *c = &chains[i];
+ if (c->n >= 0) {
+ free(c->seeds);
+ c->n = c->m = 0;
+ } else c->n = -c->n;
+ }
+ for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains
+ if (chains[i].n > 0) {
+ if (n != i) chains[n++] = chains[i];
+ else ++n;
+ }
+ }
+ return n;
+}
+
+/******************************
+ * De-overlap single-end hits *
+ ******************************/
+
+#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
+KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
+
+int mem_sort_and_dedup(int n, mem_alnreg_t *a)
+{
+ int m, i;
+ if (n <= 1) return n;
+ ks_introsort(mem_ars, n, a);
+ for (i = 1; i < n; ++i) { // mark identical hits
+ if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb)
+ a[i].qe = a[i].qb;
+ }
+ for (i = 1, m = 1; i < n; ++i) // exclude identical hits
+ if (a[i].qe > a[i].qb) {
+ if (m != i) a[m++] = a[i];
+ else ++m;
+ }
+ return m;
+}
+
+void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
+{ // similar to the loop in mem_chain_flt()
+ int i, k, tmp;
+ kvec_t(int) z;
+ if (n == 0) return;
+ kv_init(z);
+ for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1;
+ tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+ kv_push(int, z, 0);
+ for (i = 1; i < n; ++i) {
+ for (k = 0; k < z.n; ++k) {
+ int j = z.a[k];
+ int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
+ int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
+ if (e_min > b_max) { // have overlap
+ int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
+ if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
+ if (a[j].sub == 0) a[j].sub = a[i].score;
+ if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
+ break;
+ }
+ }
+ }
+ if (k == z.n) kv_push(int, z, i);
+ else a[i].secondary = z.a[k];
+ }
+ free(z.a);
+}
+
+/****************************************
+ * Construct the alignment from a chain *
+ ****************************************/
+
+static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
+{
+ int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
+ l = l > 1? l : 1;
+ return l < opt->w<<1? l : opt->w<<1;
+}
+
+void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
+{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
+ int i, k;
+ int64_t rlen, rmax[2], tmp, max = 0;
+ const mem_seed_t *s;
+ uint8_t *rseq = 0;
+ uint64_t *srt;
+
+ if (c->n == 0) return;
+ // get the max possible span
+ rmax[0] = l_pac<<1; rmax[1] = 0;
+ for (i = 0; i < c->n; ++i) {
+ int64_t b, e;
+ const mem_seed_t *t = &c->seeds[i];
+ b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
+ e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
+ rmax[0] = rmax[0] < b? rmax[0] : b;
+ rmax[1] = rmax[1] > e? rmax[1] : e;
+ if (t->len > max) max = t->len;
+ }
+ rmax[0] = rmax[0] > 0? rmax[0] : 0;
+ rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
+ if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
+ if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac;
+ else rmax[0] = l_pac;
+ }
+ // retrieve the reference sequence
+ rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
+ if (rlen != rmax[1] - rmax[0]) return;
+
+ srt = malloc(c->n * 8);
+ for (i = 0; i < c->n; ++i)
+ srt[i] = (uint64_t)c->seeds[i].len<<32 | i;
+ ks_introsort_64(c->n, srt);
+
+ for (k = c->n - 1; k >= 0; --k) {
+ mem_alnreg_t *a;
+ s = &c->seeds[(uint32_t)srt[k]];
+
+ for (i = 0; i < av->n; ++i) { // test whether extension has been made before
+ mem_alnreg_t *p = &av->a[i];
+ int64_t rd;
+ int qd, w, max_gap;
+ if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
+ // qd: distance ahead of the seed on query; rd: on reference
+ qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
+ max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
+ w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
+ if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
+ // similar to the previous four lines, but this time we look at the region behind
+ qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
+ max_gap = cal_max_gap(opt, qd < rd? qd : rd);
+ w = max_gap < opt->w? max_gap : opt->w;
+ if (qd - rd < w && rd - qd < w) break;
+ }
+ if (i < av->n) continue;
+
+ a = kv_pushp(mem_alnreg_t, *av);
+ memset(a, 0, sizeof(mem_alnreg_t));
+
+ if (s->qbeg) { // left extension
+ uint8_t *rs, *qs;
+ int qle, tle, gtle, gscore;
+ qs = malloc(s->qbeg);
+ for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
+ tmp = s->rbeg - rmax[0];
+ rs = malloc(tmp);
+ for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
+			a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, &gtle, &gscore);
+ // check whether we prefer to reach the end of the query
+ if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
+ else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
+ free(qs); free(rs);
+ } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
+
+ if (s->qbeg + s->len != l_query) { // right extension
+ int qle, tle, qe, re, gtle, gscore;
+ qe = s->qbeg + s->len;
+ re = s->rbeg + s->len - rmax[0];
+			a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, &gtle, &gscore);
+ // similar to the above
+ if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
+ else a->qe = l_query, a->re = rmax[0] + re + gtle;
+ } else a->qe = l_query, a->re = s->rbeg + s->len;
+ if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
+
+ // compute seedcov
+ for (i = 0, a->seedcov = 0; i < c->n; ++i) {
+ const mem_seed_t *t = &c->seeds[i];
+ if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
+ a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
+ }
+ }
+ free(srt); free(rseq);
+}
+
+/*****************************
+ * Basic hit->SAM conversion *
+ *****************************/
+
+static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
+{
+ int w;
+ if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps
+ w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.);
+ if (w < abs(l1 - l2)) w = abs(l1 - l2);
+ return w;
+}
+
+void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
+{
+#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
+ int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1;
+ uint32_t *cigar = 0;
+ int64_t pos;
+ bwahit_t ptmp, *p = &ptmp;
+
+ if (!p_) { // in this case, generate an unmapped alignment
+ memset(&ptmp, 0, sizeof(bwahit_t));
+ ptmp.rb = ptmp.re = -1;
+ } else ptmp = *p_;
+ p->flag |= m? 1 : 0; // is paired in sequencing
+ p->flag |= !is_mapped(p)? 4 : 0; // is mapped
+ p->flag |= m && !is_mapped(m)? 8 : 0; // is mate mapped
+ if (m && !is_mapped(p) && is_mapped(m)) {
+ p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq;
+ copy_mate = 1;
+ }
+ p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand
+ p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand
+ kputs(s->name, str); kputc('\t', str);
+ if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate
+		int sam_flag = p->flag&0xff; // the flag that will be output to SAM; it is not always the same as p->flag
+ if (p->flag&0x10000) sam_flag |= 0x100;
+ if (!copy_mate) {
+ int w2;
+ w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r);
+ w2 = w2 < w? w2 : w;
+ cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
+ p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
+ } else n_cigar = 0, cigar = 0;
+ pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
+ bns_cnt_ambi(bns, pos, p->re - p->rb, &rid);
+ kputw(sam_flag, str); kputc('\t', str);
+ kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str);
+ kputw(p->qual, str); kputc('\t', str);
+ if (n_cigar) {
+ int i, clip5, clip3;
+ clip5 = is_rev? s->l_seq - p->qe : p->qb;
+ clip3 = is_rev? p->qb : s->l_seq - p->qe;
+ if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); }
+ for (i = 0; i < n_cigar; ++i) {
+ kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str);
+ }
+ if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); }
+ } else kputc('*', str);
+ } else { // no coordinate
+ kputw(p->flag, str);
+ kputs("\t*\t0\t0\t*", str);
+ rid = -1;
+ }
+ if (m && is_mapped(m)) { // then print mate pos and isize
+ pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev);
+ bns_cnt_ambi(bns, pos, m->re - m->rb, &mid);
+ kputc('\t', str);
+ if (mid == rid) kputc('=', str);
+ else kputs(bns->anns[mid].name, str);
+ kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str);
+ kputc('\t', str);
+ if (mid == rid) {
+ int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb;
+ int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb;
+ kputw(p0 - p1 + (p0 > p1? 1 : -1), str);
+ } else kputw(0, str);
+ kputc('\t', str);
+ } else kputsn("\t*\t0\t0\t", 7, str);
+ if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL
+ kputsn("*\t*", 3, str);
+ } else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand
+ int i, qb = 0, qe = s->l_seq;
+ if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe;
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
+ kputc('\t', str);
+ if (s->qual) { // printf qual
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
+ str->s[str->l] = 0;
+ } else kputc('*', str);
+ } else { // the reverse strand
+ int i, qb = 0, qe = s->l_seq;
+ if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe;
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
+ kputc('\t', str);
+ if (s->qual) { // printf qual
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
+ str->s[str->l] = 0;
+ } else kputc('*', str);
+ }
+ if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); }
+ if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
+ if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
+ if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
+ if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+ kputc('\n', str);
+ free(cigar);
+#undef is_mapped
+}
+
+/************************
+ * Integrated interface *
+ ************************/
+
+int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
+{
+ int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a;
+ double identity;
+ sub = a->csub > sub? a->csub : sub;
+ if (sub >= a->score) return 0;
+ l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb;
+ mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0;
+ identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
+ mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
+ if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
+ if (mapq > 60) mapq = 60;
+ if (mapq < 0) mapq = 0;
+ return mapq;
+}
+
+void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h)
+{
+ h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe;
+ h->score = a->score;
+ h->sub = a->secondary >= 0? -1 : a->sub > a->csub? a->sub : a->csub;
+ h->qual = 0; // quality unset
+ h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set
+}
+
+void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m)
+{
+ int k;
+ kstring_t str;
+ str.l = str.m = 0; str.s = 0;
+ if (a->n > 0) {
+ int mapq0 = -1;
+ for (k = 0; k < a->n; ++k) {
+ bwahit_t h;
+ mem_alnreg_t *p = &a->a[k];
+ if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
+ if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
+ mem_alnreg2hit(p, &h);
+ bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re);
+ h.flag |= extra_flag;
+ if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard)
+ h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p);
+ if (k == 0) mapq0 = h.qual;
+ else if (h.qual > mapq0) h.qual = mapq0;
+ bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
+ }
+ } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m);
+ s->sam = str.s;
+}
+
+mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
+{
+ int i;
+ mem_chain_v chn;
+ mem_alnreg_v regs;
+
+ for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
+ seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
+
+ chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq);
+ chn.n = mem_chain_flt(opt, chn.n, chn.a);
+ if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
+
+ kv_init(regs);
+ for (i = 0; i < chn.n; ++i) {
+ mem_chain_t *p = &chn.a[i];
+		mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
+ free(chn.a[i].seeds);
+ }
+ free(chn.a);
+ regs.n = mem_sort_and_dedup(regs.n, regs.a);
+ return regs;
+}
+
+mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
+{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
+ mem_alnreg_v ar;
+ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
+ mem_mark_primary_se(opt, ar.n, ar.a);
+ return ar;
+}
+
+// This routine is only used for the API purpose
+mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar)
+{
+ mem_aln_t a;
+ int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
+ int64_t pos, rb = ar->rb, re = ar->re;
+ memset(&a, 0, sizeof(mem_aln_t));
+ a.mapq = mem_approx_mapq_se(opt, ar);
+ bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
+ w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
+ w2 = w2 < opt->w? w2 : opt->w;
+ a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
+ a.NM = NM;
+ pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
+ a.is_rev = is_rev;
+ if (qb != 0 || qe != l_query) { // add clipping to CIGAR
+ int clip5, clip3;
+ clip5 = is_rev? l_query - qe : qb;
+ clip3 = is_rev? qb : l_query - qe;
+ a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
+ if (clip5) {
+ memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
+ a.cigar[0] = clip5<<4|3;
+ ++a.n_cigar;
+ }
+ if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3;
+ }
+ a.rid = bns_pos2rid(bns, pos);
+ a.pos = pos - bns->anns[a.rid].offset;
+ return a;
+}
+
+typedef struct {
+ int start, step, n;
+ const mem_opt_t *opt;
+ const bwt_t *bwt;
+ const bntseq_t *bns;
+ const uint8_t *pac;
+ const mem_pestat_t *pes;
+ bseq1_t *seqs;
+ mem_alnreg_v *regs;
+} worker_t;
+
+static void *worker1(void *data)
+{
+ worker_t *w = (worker_t*)data;
+ int i;
+ if (!(w->opt->flag&MEM_F_PE)) {
+ for (i = w->start; i < w->n; i += w->step)
+ w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
+	} else { // for PE, align the two ends in the same thread; if a thread got mostly 2nd reads (often of worse quality), threads could run at different speeds
+ for (i = w->start; i < w->n>>1; i += w->step) {
+ w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
+ w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
+ }
+ }
+ return 0;
+}
+
+static void *worker2(void *data)
+{
+ extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
+ worker_t *w = (worker_t*)data;
+ int i;
+ if (!(w->opt->flag&MEM_F_PE)) {
+ for (i = w->start; i < w->n; i += w->step) {
+ mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a);
+ mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
+ free(w->regs[i].a);
+ }
+ } else {
+ int n = 0;
+ for (i = w->start; i < w->n>>1; i += w->step) { // not implemented yet
+ n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]);
+ free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
+ }
+ fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n);
+ }
+ return 0;
+}
+
+void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
+{
+ int i;
+ worker_t *w;
+ mem_alnreg_v *regs;
+ mem_pestat_t pes[4];
+
+ w = calloc(opt->n_threads, sizeof(worker_t));
+ regs = malloc(n * sizeof(mem_alnreg_v));
+ for (i = 0; i < opt->n_threads; ++i) {
+ worker_t *p = &w[i];
+ p->start = i; p->step = opt->n_threads; p->n = n;
+ p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac;
+ p->seqs = seqs; p->regs = regs;
+ p->pes = &pes[0];
+ }
+#ifdef HAVE_PTHREAD
+ if (opt->n_threads == 1) {
+ worker1(w);
+ if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+ worker2(w);
+ } else {
+ pthread_t *tid;
+ tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
+ for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]);
+ for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
+ if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+ for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]);
+ for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
+ free(tid);
+ }
+#else
+ worker1(w);
+ if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+ worker2(w);
+#endif
+ for (i = 0; i < n; ++i) {
+ fputs(seqs[i].sam, stdout);
+ free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
+ }
+ free(regs); free(w);
+}
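A worked instance of the scoring note at the top of bwamem.c above: with the
defaults set in mem_opt_init() (match score a=1, mismatch penalty b=4), the
implied per-base error rate is e = .75 * exp[-log(4) * 4/1] = .75/256, roughly
0.3%, so the default matrix is tuned for low-divergence queries; and with
x=a=1 the Phred-scaled read quality reduces to Q(seq) = 6.02 * (l - S), where
S is the alignment score and l the number of aligned bases.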
diff --git a/bwamem.h b/bwamem.h
new file mode 100644
index 0000000..c2f124c
--- /dev/null
+++ b/bwamem.h
@@ -0,0 +1,145 @@
+#ifndef BWAMEM_H_
+#define BWAMEM_H_
+
+#include "bwt.h"
+#include "bntseq.h"
+#include "bwa.h"
+
+#define MEM_MAPQ_COEF 30.0
+#define MEM_MAPQ_MAX 60
+
+struct __smem_i;
+typedef struct __smem_i smem_i;
+
+#define MEM_F_HARDCLIP 0x1
+#define MEM_F_PE 0x2
+#define MEM_F_NOPAIRING 0x4
+#define MEM_F_ALL 0x8
+#define MEM_F_NO_MULTI 0x10
+
+typedef struct {
+ int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
+ int pen_unpaired; // phred-scaled penalty for unpaired reads
+ int pen_clip; // clipping penalty. This score is not deducted from the DP score.
+ int w; // band width
+
+ int flag; // see MEM_F_* macros
+ int min_seed_len; // minimum seed length
+ float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
+	int split_width;  // split into a seed if its occurrence is smaller than this value
+	int max_occ;      // skip a seed if its occurrence is larger than this value
+ int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
+ int n_threads; // number of threads
+ int chunk_size; // process chunk_size-bp sequences in a batch
+ float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
+ float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
+ int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
+ int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
+ int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
+} mem_opt_t;
+
+typedef struct {
+ int64_t rb, re; // [rb,re): reference sequence in the alignment
+ int qb, qe; // [qb,qe): query sequence in the alignment
+ int score; // best SW score
+ int sub; // 2nd best SW score
+ int csub; // SW score of a tandem hit
+ int sub_n; // approximate number of suboptimal hits
+	int seedcov;      // length of regions covered by seeds
+ int secondary; // index of the parent hit shadowing the current hit; <0 if primary
+} mem_alnreg_t;
+
+typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
+
+typedef struct {
+ int low, high, failed;
+ double avg, std;
+} mem_pestat_t;
+
+typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
+ int64_t rb, re;
+ int qb, qe, flag, qual;
+ // optional info
+ int score, sub;
+} bwahit_t;
+
+typedef struct { // This struct is only used for the convenience of API.
+ int rid;
+ int pos;
+ uint32_t is_rev:1, mapq:8, NM:23;
+ int n_cigar;
+ uint32_t *cigar;
+} mem_aln_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ smem_i *smem_itr_init(const bwt_t *bwt);
+ void smem_itr_destroy(smem_i *itr);
+ void smem_set_query(smem_i *itr, int len, const uint8_t *query);
+ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width);
+
+ mem_opt_t *mem_opt_init(void);
+ void mem_fill_scmat(int a, int b, int8_t mat[25]);
+
+ /**
+ * Align a batch of sequences and generate the alignments in the SAM format
+ *
+	 * This routine requires $seqs[i].{l_seq,seq,name} and writes $seqs[i].sam.
+ * Note that $seqs[i].sam may consist of several SAM lines if the
+ * corresponding sequence has multiple primary hits.
+ *
+ * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
+ * sequences must be interleaved: $n must be an even number and the 2i-th
+ * sequence and the (2i+1)-th sequence constitute a read pair. In this
+ * mode, there should be enough (typically >50) unique pairs for the
+ * routine to infer the orientation and insert size.
+ *
+ * @param opt alignment parameters
+ * @param bwt FM-index of the reference sequence
+ * @param bns Information of the reference
+ * @param pac 2-bit encoded reference
+ * @param n number of query sequences
+ * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
+ */
+ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
+
+ /**
+ * Find the aligned regions for one query sequence
+ *
+ * Note that this routine does not generate CIGAR. CIGAR should be
+ * generated later by bwa_gen_cigar() defined in bwa.c.
+ *
+ * @param opt alignment parameters
+ * @param bwt FM-index of the reference sequence
+ * @param bns Information of the reference
+ * @param pac 2-bit encoded reference
+ * @param l_seq length of query sequence
+ * @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied
+ *
+ * @return list of aligned regions.
+ */
+ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
+
+ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar);
+
+ /**
+ * Infer the insert size distribution from interleaved alignment regions
+ *
+ * This function can be called after mem_align1(), as long as paired-end
+ * reads are properly interleaved.
+ *
+ * @param opt alignment parameters
+ * @param l_pac length of concatenated reference sequence
+ * @param n number of query sequences; must be an even number
+ * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
+ * @param pes inferred insert size distribution (output)
+ */
+ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bwamem_pair.c b/bwamem_pair.c
new file mode 100644
index 0000000..9ff12b3
--- /dev/null
+++ b/bwamem_pair.c
@@ -0,0 +1,314 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include "kstring.h"
+#include "bwamem.h"
+#include "kvec.h"
+#include "utils.h"
+#include "ksw.h"
+
+#define MIN_RATIO 0.8
+#define MIN_DIR_CNT 10
+#define MIN_DIR_RATIO 0.05
+#define OUTLIER_BOUND 2.0
+#define MAPPING_BOUND 3.0
+#define MAX_STDDEV 4.0
+
+static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
+{
+ int64_t p2;
+ int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac);
+ p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand
+ *dist = p2 > b1? p2 - b1 : b1 - p2;
+ return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3);
+}
+
+static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
+{
+ int j;
+ for (j = 1; j < r->n; ++j) { // choose unique alignment
+ int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;
+ int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;
+ if (e_min > b_max) { // have overlap
+ int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;
+ if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap
+ }
+ }
+ return j < r->n? r->a[j].score : opt->min_seed_len * opt->a;
+}
+
+void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
+{
+ int i, d, max;
+ uint64_v isize[4];
+ memset(pes, 0, 4 * sizeof(mem_pestat_t));
+ memset(isize, 0, sizeof(kvec_t(int)) * 4);
+ for (i = 0; i < n>>1; ++i) {
+ int dir;
+ int64_t is;
+ mem_alnreg_v *r[2];
+		r[0] = (mem_alnreg_v*)&regs[i<<1|0];
+		r[1] = (mem_alnreg_v*)&regs[i<<1|1];
+ if (r[0]->n == 0 || r[1]->n == 0) continue;
+ if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
+ if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
+ dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
+ if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
+ }
+ if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
+ for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
+ mem_pestat_t *r = &pes[d];
+ uint64_v *q = &isize[d];
+ int p25, p50, p75, x;
+ if (q->n < MIN_DIR_CNT) {
+ fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+ r->failed = 1;
+ continue;
+ } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+ ks_introsort_64(q->n, q->a);
+ p25 = q->a[(int)(.25 * q->n + .499)];
+ p50 = q->a[(int)(.50 * q->n + .499)];
+ p75 = q->a[(int)(.75 * q->n + .499)];
+ r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
+ if (r->low < 1) r->low = 1;
+ r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
+ fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
+ fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
+ for (i = x = 0, r->avg = 0; i < q->n; ++i)
+ if (q->a[i] >= r->low && q->a[i] <= r->high)
+ r->avg += q->a[i], ++x;
+ r->avg /= x;
+ for (i = 0, r->std = 0; i < q->n; ++i)
+ if (q->a[i] >= r->low && q->a[i] <= r->high)
+ r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
+ r->std = sqrt(r->std / x);
+ fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
+ r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
+ r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
+ if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
+ if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
+ if (r->low < 1) r->low = 1;
+ fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
+ free(q->a);
+ }
+ for (d = 0, max = 0; d < 4; ++d)
+ max = max > isize[d].n? max : isize[d].n;
+ for (d = 0; d < 4; ++d)
+ if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
+ pes[d].failed = 1;
+ fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+ }
+}
+
+int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
+{
+ int i, r, skip[4], n = 0;
+ for (r = 0; r < 4; ++r)
+ skip[r] = pes[r].failed? 1 : 0;
+	for (i = 0; i < ma->n; ++i) { // check which orientation has been found
+ int64_t dist;
+ r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
+ if (dist >= pes[r].low && dist <= pes[r].high)
+ skip[r] = 1;
+ }
+	if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // a consistent pair exists; no need to perform SW
+ for (r = 0; r < 4; ++r) {
+ int is_rev, is_larger;
+ uint8_t *seq, *rev = 0, *ref;
+ int64_t rb, re, len;
+ if (skip[r]) continue;
+ is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
+ is_larger = !(r>>1); // whether the mate has larger coordinate
+ if (is_rev) {
+ rev = malloc(l_ms); // this is the reverse complement of $ms
+ for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
+ seq = rev;
+ } else seq = (uint8_t*)ms;
+ if (!is_rev) {
+ rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
+ re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
+ } else {
+ rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
+ re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
+ }
+ if (rb < 0) rb = 0;
+ if (re > l_pac<<1) re = l_pac<<1;
+ ref = bns_get_seq(l_pac, pac, rb, re, &len);
+ if (len == re - rb) { // no funny things happening
+ kswr_t aln;
+ mem_alnreg_t b;
+ int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len;
+ aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0);
+ memset(&b, 0, sizeof(mem_alnreg_t));
+ if (aln.score >= opt->min_seed_len) {
+ b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
+ b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
+ b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
+ b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1;
+ b.score = aln.score;
+ b.csub = aln.score2;
+ b.secondary = -1;
+ b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
+// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
+ kv_push(mem_alnreg_t, *ma, b); // make room for a new element
+ // move b s.t. ma is sorted
+ for (i = 0; i < ma->n - 1; ++i) // find the insertion point
+ if (ma->a[i].score < b.score) break;
+ tmp = i;
+ for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+ ma->a[i] = b;
+ }
+ ++n;
+ }
+ if (rev) free(rev);
+ free(ref);
+ }
+ return n;
+}
+
+int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
+{
+ extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
+ pair64_v v, u;
+ int r, i, k, y[4], ret; // y[] keeps the last hit
+ kv_init(v); kv_init(u);
+ for (r = 0; r < 2; ++r) { // loop through read number
+ for (i = 0; i < a[r].n; ++i) {
+ pair64_t key;
+ mem_alnreg_t *e = &a[r].a[i];
+ key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
+ key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r;
+ kv_push(pair64_t, v, key);
+ }
+ }
+ ks_introsort_128(v.n, v.a);
+ y[0] = y[1] = y[2] = y[3] = -1;
+ //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
+ for (i = 0; i < v.n; ++i) {
+ for (r = 0; r < 2; ++r) { // loop through direction
+ int dir = r<<1 | (v.a[i].y>>1&1), which;
+ if (pes[dir].failed) continue; // invalid orientation
+ which = r<<1 | ((v.a[i].y&1)^1);
+ if (y[which] < 0) continue; // no previous hits
+ for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
+ int64_t dist;
+ int q;
+ double ns;
+ pair64_t *p;
+ if ((v.a[k].y&3) != which) continue;
+ dist = (int64_t)v.a[i].x - v.a[k].x;
+ //printf("%d: %lld\n", k, dist);
+ if (dist > pes[dir].high) break;
+ if (dist < pes[dir].low) continue;
+ ns = (dist - pes[dir].avg) / pes[dir].std;
+ q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4)
+ if (q < 0) q = 0;
+ p = kv_pushp(pair64_t, u);
+ p->y = (uint64_t)k<<32 | i;
+ p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
+ //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
+ }
+ }
+ y[v.a[i].y&3] = i;
+ }
+ if (u.n) { // found at least one proper pair
+ int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+ ks_introsort_128(u.n, u.a);
+ i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
+ z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
+ z[v.a[k].y&1] = v.a[k].y<<32>>34;
+ ret = u.a[u.n-1].x >> 32;
+ *sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
+ for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
+ if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
+ } else ret = 0, *sub = 0, *n_sub = 0;
+ free(u.a); free(v.a);
+ return ret;
+}
+
+int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
+{
+ extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a);
+ extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m);
+ extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
+ extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
+ extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m);
+
+ int n = 0, i, j, z[2], o, subo, n_sub;
+ kstring_t str;
+ mem_alnreg_v b[2];
+ bwahit_t h[2];
+
+ str.l = str.m = 0; str.s = 0;
+ // perform SW for the best alignment
+ kv_init(b[0]); kv_init(b[1]);
+ for (i = 0; i < 2; ++i)
+ for (j = 0; j < a[i].n; ++j)
+ if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
+ kv_push(mem_alnreg_t, b[i], a[i].a[j]);
+ for (i = 0; i < 2; ++i)
+ for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
+ n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
+ free(b[0].a); free(b[1].a);
+ mem_mark_primary_se(opt, a[0].n, a[0].a);
+ mem_mark_primary_se(opt, a[1].n, a[1].a);
+ if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
+ // pairing single-end hits
+ if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {
+ int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2];
+ // check if an end has multiple hits even after mate-SW
+ for (i = 0; i < 2; ++i) {
+ for (j = 1; j < a[i].n; ++j)
+ if (a[i].a[j].secondary < 0) break;
+ is_multi[i] = j < a[i].n? 1 : 0;
+ }
+ if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
+ // compute mapQ for the best SE hit
+ score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
+ //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
+ subo = subo > score_un? subo : score_un;
+ q_pe = (o - subo) * 6;
+ if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
+ if (q_pe < 0) q_pe = 0;
+ if (q_pe > 60) q_pe = 60;
+ // the following assumes no split hits
+ if (o > score_un) { // paired alignment is preferred
+ mem_alnreg_t *c[2];
+ c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];
+ for (i = 0; i < 2; ++i) {
+ if (c[i]->secondary >= 0)
+ c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
+ q_se[i] = mem_approx_mapq_se(opt, c[i]);
+ }
+ q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;
+ q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
+ extra_flag |= 2;
+ // cap at the tandem repeat score
+ q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6;
+ q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? q_se[1] : (c[1]->score - c[1]->csub) * 6;
+ } else { // the unpaired alignment is preferred
+ z[0] = z[1] = 0;
+ q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
+ q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
+ }
+ mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag;
+ bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re);
+ mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag;
+ bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re);
+ bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0;
+ bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s;
+ } else goto no_pairing;
+ return n;
+
+no_pairing:
+ for (i = 0; i < 2; ++i) {
+ if (a[i].n) {
+ mem_alnreg2hit(&a[i].a[0], &h[i]);
+ bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
+ } else h[i].rb = h[i].re = -1;
+ }
+ mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
+ mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);
+ return n;
+}
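
Aside on the pairing score computed in mem_pair() above: the outer distance of a candidate pair is turned into a z-score against the estimated insert-size distribution, 2*erfc(|z|/sqrt(2)) is the two-sided tail probability of a distance at least that extreme under a normal model, and its natural log scaled by 1/ln(4) (the 0.721 constant) is added to the two single-end scores. The caller mem_sam_pe() then derives q_pe from the gap between the best pair score and the better of the runner-up pair and the unpaired alternative, (o - subo) * 6, capped at 60. A minimal standalone sketch of the formula, with a hypothetical helper name and made-up numbers (not part of the patch), compiled with -lm:

#include <math.h>
#include <stdio.h>

#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
#endif

/* Illustration of the mem_pair() pair-score formula:
 * q = s1 + s2 + log(2*erfc(|z|/sqrt(2)))/log(4), rounded and floored at 0. */
static int pair_score(int s1, int s2, double dist, double avg, double std)
{
	double z = (dist - avg) / std;                 /* z-score of the observed outer distance */
	double p = 2. * erfc(fabs(z) * M_SQRT1_2);     /* two-sided normal tail probability */
	int q = (int)(s1 + s2 + .721 * log(p) + .499); /* .721 = 1/log(4) */
	return q > 0? q : 0;
}

int main(void)
{
	printf("%d\n", pair_score(60, 60, 500., 500., 50.)); /* at the mean: ~s1+s2 */
	printf("%d\n", pair_score(60, 60, 650., 500., 50.)); /* 3 s.d. away: a few points lower */
	return 0;
}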
diff --git a/bwape.c b/bwape.c
index 779670f..0b2b8d6 100644
--- a/bwape.c
+++ b/bwape.c
@@ -10,6 +10,7 @@
#include "utils.h"
#include "stdaln.h"
#include "bwase.h"
+#include "bwa.h"
typedef struct {
int n;
@@ -21,24 +22,15 @@ typedef struct {
bwtint_t low, high, high_bayesian;
} isize_info_t;
-typedef struct {
- uint64_t x, y;
-} b128_t;
-
-#define b128_lt(a, b) ((a).x < (b).x)
#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
#define b128_hash(a) ((uint32_t)(a).x)
#include "khash.h"
-KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq)
-
-#include "ksort.h"
-KSORT_INIT(b128, b128_t, b128_lt)
-KSORT_INIT_GENERIC(uint64_t)
+KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
typedef struct {
- kvec_t(b128_t) arr;
- kvec_t(b128_t) pos[2];
+ pair64_v arr;
+ pair64_v pos[2];
kvec_t(bwt_aln1_t) aln[2];
} pe_data_t;
@@ -69,19 +61,6 @@ pe_opt_t *bwa_init_pe_opt()
po->ap_prior = 1e-5;
return po;
}
-
-static inline uint64_t hash_64(uint64_t key)
-{
- key += ~(key << 32);
- key ^= (key >> 22);
- key += ~(key << 13);
- key ^= (key >> 8);
- key += (key << 3);
- key ^= (key >> 15);
- key += ~(key << 27);
- key ^= (key >> 31);
- return key;
-}
/*
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
{
@@ -120,7 +99,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double
free(isizes);
return -1;
}
- ks_introsort(uint64_t, tot, isizes);
+ ks_introsort_64(tot, isizes);
p25 = isizes[(int)(tot*0.25 + 0.5)];
p50 = isizes[(int)(tot*0.50 + 0.5)];
p75 = isizes[(int)(tot*0.75 + 0.5)];
@@ -170,7 +149,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
{
int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
uint64_t o_score, subo_score;
- b128_t last_pos[2][2], o_pos[2];
+ pair64_t last_pos[2][2], o_pos[2];
max_len = p[0]->full_len;
if (max_len < p[1]->full_len) max_len = p[1]->full_len;
if (low_bound < max_len) low_bound = max_len;
@@ -206,11 +185,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
o_score = subo_score = (uint64_t)-1;
o_n = subo_n = 0;
- ks_introsort(b128, d->arr.n, d->arr.a);
+ ks_introsort_128(d->arr.n, d->arr.a);
for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
if (opt->type == BWA_PET_STD) {
for (i = 0; i < d->arr.n; ++i) {
- b128_t x = d->arr.a[i];
+ pair64_t x = d->arr.a[i];
int strand = x.y>>1&1;
if (strand == 1) { // reverse strand, then check
int y = 1 - (x.y&1);
@@ -221,19 +200,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
last_pos[x.y&1][1] = x;
}
}
- } else if (opt->type == BWA_PET_SOLID) {
- for (i = 0; i < d->arr.n; ++i) {
- b128_t x = d->arr.a[i];
- int strand = x.y>>1&1;
- if ((strand^x.y)&1) { // push
- int y = 1 - (x.y&1);
- __pairing_aux(last_pos[y][1], x);
- __pairing_aux(last_pos[y][0], x);
- } else { // check
- last_pos[x.y&1][0] = last_pos[x.y&1][1];
- last_pos[x.y&1][1] = x;
- }
- }
} else {
fprintf(stderr, "[paring] not implemented yet!\n");
exit(1);
@@ -345,7 +311,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
{ // only when both ends mapped
- b128_t x;
+ pair64_t x;
int j, k;
long long n_occ[2];
for (j = 0; j < 2; ++j) {
@@ -360,7 +326,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
bwt_aln1_t *r = d->aln[j].a + k;
bwtint_t l;
if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
- b128_t key;
+ pair64_t key;
int ret;
key.x = r->k; key.y = r->l;
khint_t iter = kh_put(b128, g_hash, key, &ret);
@@ -377,14 +343,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
x.x = kh_val(g_hash, iter).a[l]>>1;
x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
- kv_push(b128_t, d->arr, x);
+ kv_push(pair64_t, d->arr, x);
}
} else { // then calculate on the fly
for (l = r->k; l <= r->l; ++l) {
int strand;
x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
x.y = k<<2 | strand<<1 | j;
- kv_push(b128_t, d->arr, x);
+ kv_push(pair64_t, d->arr, x);
}
}
}
@@ -576,11 +542,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
++n_tot[is_singleton];
cigar[0] = cigar[1] = 0;
n_cigar[0] = n_cigar[1] = 0;
- if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
+ if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
ubyte_t *seq;
if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
- if (popt->type == BWA_PET_STD) {
+ { // note that popt->type == BWA_PET_STD is always true here; in older versions, there was a branch for color-space FF/RR reads
if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
seq = p[k]->rseq;
@@ -589,17 +555,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
seq = p[k]->seq;
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will be reversed back shortly
}
- } else { // BWA_PET_SOLID
- if (p[1-k]->strand == 0) { // R3-F3 pairing
- if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
- else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
- seq = p[k]->rseq;
- seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
- } else { // F3-R3 pairing
- if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
- else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
- seq = p[k]->seq;
- }
}
// perform SW alignment
cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
@@ -656,14 +611,14 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
return pacseq;
}
-void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
+void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
{
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, j, n_seqs, tot_seqs = 0;
bwa_seq_t *seqs[2];
bwa_seqio_t *ks[2];
clock_t t;
- bntseq_t *bns, *ntbns = 0;
+ bntseq_t *bns;
FILE *fp_sa[2];
gap_opt_t opt, opt0;
khint_t iter;
@@ -688,10 +643,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
opt0 = opt;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
- if (!(opt.mode & BWA_MODE_COMPREAD)) {
- popt->type = BWA_PET_SOLID;
- ntbns = bwa_open_nt(prefix);
- } else { // for Illumina alignment only
+ { // for Illumina alignment only
if (popt->is_preload) {
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
@@ -702,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
}
// core loop
- bwa_print_sam_SQ(bns);
+ bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_PG();
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
int cnt_chg;
@@ -724,7 +676,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
for (j = 0; j < 2; ++j)
- bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
+ bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
if (pac == 0) free(pacseq);
@@ -749,7 +701,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
// destroy
bns_destroy(bns);
- if (ntbns) bns_destroy(ntbns);
for (i = 0; i < 2; ++i) {
bwa_seq_close(ks[i]);
fclose(fp_sa[i]);
@@ -764,21 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
int bwa_sai2sam_pe(int argc, char *argv[])
{
- extern char *bwa_rg_line, *bwa_rg_id;
- extern int bwa_set_rg(const char *s);
- extern char *bwa_infer_prefix(const char *hint);
int c;
pe_opt_t *popt;
- char *prefix;
+ char *prefix, *rg_line = 0;
popt = bwa_init_pe_opt();
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
switch (c) {
case 'r':
- if (bwa_set_rg(optarg) < 0) {
- fprintf(stderr, "[%s] malformated @RG line\n", __func__);
- return 1;
- }
+ if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
break;
case 'a': popt->max_isize = atoi(optarg); break;
case 'o': popt->max_occ = atoi(optarg); break;
@@ -812,13 +757,11 @@ int bwa_sai2sam_pe(int argc, char *argv[])
fprintf(stderr, "\n");
return 1;
}
- if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
+ if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
- free(bwa_rg_line); free(bwa_rg_id);
return 0;
}
- bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt);
- free(bwa_rg_line); free(bwa_rg_id); free(prefix);
- free(popt);
+ bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
+ free(prefix); free(popt);
return 0;
}
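
On the infer_isize() hunk above: after the switch to ks_introsort_64(), the routine still estimates the insert-size quartiles by indexing into the sorted array of observed sizes. A rough standalone sketch of that quartile step, using plain qsort() in place of ksort and an illustrative IQR-based outlier bound (the 2.0 multiplier is an assumption of this sketch, not necessarily the constant BWA uses):

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

/* Plain qsort comparator standing in for ks_introsort_64() in this sketch. */
static int cmp_u64(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t*)a, y = *(const uint64_t*)b;
	return x < y? -1 : x > y? 1 : 0;
}

/* Illustrative only: quartiles of a list of insert sizes plus IQR-based bounds. */
static void isize_quartiles(uint64_t *isizes, int tot)
{
	uint64_t p25, p50, p75;
	double low, high;
	qsort(isizes, tot, sizeof(uint64_t), cmp_u64);
	p25 = isizes[(int)(tot*0.25 + 0.5)];
	p50 = isizes[(int)(tot*0.50 + 0.5)];
	p75 = isizes[(int)(tot*0.75 + 0.5)];
	low  = p25 - 2.0 * (p75 - p25);
	high = p75 + 2.0 * (p75 - p25);
	printf("p25=%llu p50=%llu p75=%llu low=%.0f high=%.0f\n",
	       (unsigned long long)p25, (unsigned long long)p50,
	       (unsigned long long)p75, low, high);
}

int main(void)
{
	uint64_t v[] = {180, 190, 200, 205, 210, 215, 220, 230, 240, 900};
	isize_quartiles(v, 10);
	return 0;
}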
diff --git a/bwase.c b/bwase.c
index 35744e7..2dd783b 100644
--- a/bwase.c
+++ b/bwase.c
@@ -10,9 +10,9 @@
#include "bntseq.h"
#include "utils.h"
#include "kstring.h"
+#include "bwa.h"
int g_log_n[256];
-char *bwa_rg_line, *bwa_rg_id;
void bwa_print_sam_PG();
@@ -71,8 +71,8 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
}
rest -= q->l - q->k + 1;
} else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
- int j, i, k;
- for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
+ int j, i;
+ for (j = rest, i = q->l - q->k + 1; j > 0; --j) {
double p = 1.0, x = drand48();
while (x < p) p -= p * j / (i--);
s->multi[z].pos = q->l - i;
@@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s)
s->len = s->full_len;
}
-void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
+void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
{
- ubyte_t *pacseq, *ntpac = 0;
+ ubyte_t *pacseq;
int i, j;
kstring_t *str;
- if (ntbns) { // in color space
- ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
- rewind(ntbns->fp_pac);
- fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
- }
-
if (!_pacseq) {
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
rewind(bns->fp_pac);
@@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
}
-#if 0
- if (ntbns) { // in color space
- for (i = 0; i < n_seqs; ++i) {
- bwa_seq_t *s = seqs + i;
- bwa_cs2nt_core(s, bns->l_pac, ntpac);
- for (j = 0; j < s->n_multi; ++j) {
- bwt_multi1_t *q = s->multi + j;
- int n_cigar;
- if (q->gap == 0) continue;
- free(q->cigar);
- q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
- (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
- q->n_cigar = n_cigar;
- }
- if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
- free(s->cigar);
- s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
- (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
- }
- }
- }
-#endif
// generate MD tag
str = (kstring_t*)calloc(1, sizeof(kstring_t));
for (i = 0; i != n_seqs; ++i) {
@@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
if (s->type != BWA_TYPE_NO_MATCH) {
int nm;
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
- bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
+ bns->l_pac, pacseq, str, &nm);
s->nm = nm;
}
}
free(str->s); free(str);
// correct for trimmed reads
- if (!ntbns) // trimming is only enabled for Illumina reads
- for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
+ for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
if (!_pacseq) free(pacseq);
- free(ntpac);
}
int64_t pos_end(const bwa_seq_t *p)
@@ -442,11 +412,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
// print mate coordinate
if (mate && mate->type != BWA_TYPE_NO_MATCH) {
- int m_seqid, m_is_N;
+ int m_seqid;
long long isize;
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
// redundant calculation here, but should not matter too much
- m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
+ bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
@@ -464,7 +434,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
err_printf("%s", p->qual);
} else err_printf("*");
- if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
+ if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
if (p->type != BWA_TYPE_NO_MATCH) {
@@ -512,74 +482,20 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
err_printf("%s", p->qual);
} else err_printf("*");
- if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
+ if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
putchar('\n');
}
}
-bntseq_t *bwa_open_nt(const char *prefix)
-{
- bntseq_t *ntbns;
- char *str;
- str = (char*)calloc(strlen(prefix) + 10, 1);
- strcat(strcpy(str, prefix), ".nt");
- ntbns = bns_restore(str);
- free(str);
- return ntbns;
-}
-
-void bwa_print_sam_SQ(const bntseq_t *bns)
-{
- int i;
- for (i = 0; i < bns->n_seqs; ++i)
- err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
- if (bwa_rg_line) err_printf("%s\n", bwa_rg_line);
-}
-
void bwase_initialize()
{
int i;
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
}
-char *bwa_escape(char *s)
-{
- char *p, *q;
- for (p = q = s; *p; ++p) {
- if (*p == '\\') {
- ++p;
- if (*p == 't') *q++ = '\t';
- else if (*p == 'n') *q++ = '\n';
- else if (*p == 'r') *q++ = '\r';
- else if (*p == '\\') *q++ = '\\';
- } else *q++ = *p;
- }
- *q = '\0';
- return s;
-}
-
-int bwa_set_rg(const char *s)
-{
- char *p, *q, *r;
- if (strstr(s, "@RG") != s) return -1;
- if (bwa_rg_line) free(bwa_rg_line);
- if (bwa_rg_id) free(bwa_rg_id);
- bwa_rg_line = strdup(s);
- bwa_rg_id = 0;
- bwa_escape(bwa_rg_line);
- p = strstr(bwa_rg_line, "\tID:");
- if (p == 0) return -1;
- p += 4;
- for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
- bwa_rg_id = calloc(q - p + 1, 1);
- for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
- *r++ = *q;
- return 0;
-}
-
-void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
+void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
{
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, n_seqs, tot_seqs = 0, m_aln;
@@ -587,7 +503,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
bwa_seq_t *seqs;
bwa_seqio_t *ks;
clock_t t;
- bntseq_t *bns, *ntbns = 0;
+ bntseq_t *bns;
FILE *fp_sa;
gap_opt_t opt;
@@ -599,9 +515,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
m_aln = 0;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
- if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
- ntbns = bwa_open_nt(prefix);
- bwa_print_sam_SQ(bns);
+ bwa_print_sam_hdr(bns, rg_line);
//bwa_print_sam_PG();
// set ks
ks = bwa_open_reads(opt.mode, fn_fa);
@@ -628,7 +542,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
- bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
+ bwa_refine_gapped(bns, n_seqs, seqs, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] print alignments... ");
@@ -642,7 +556,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
// destroy
bwa_seq_close(ks);
- if (ntbns) bns_destroy(ntbns);
bns_destroy(bns);
fclose(fp_sa);
free(aln);
@@ -650,17 +563,13 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
int bwa_sai2sam_se(int argc, char *argv[])
{
- extern char *bwa_infer_prefix(const char *hint);
int c, n_occ = 3;
- char *prefix;
+ char *prefix, *rg_line = 0;
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
switch (c) {
case 'h': break;
case 'r':
- if (bwa_set_rg(optarg) < 0) {
- fprintf(stderr, "[%s] malformated @RG line\n", __func__);
- return 1;
- }
+ if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
break;
case 'n': n_occ = atoi(optarg); break;
case 'f': xreopen(optarg, "w", stdout); break;
@@ -672,12 +581,10 @@ int bwa_sai2sam_se(int argc, char *argv[])
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
return 1;
}
- if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
+ if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
- free(bwa_rg_line); free(bwa_rg_id);
return 0;
}
- bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ);
- free(bwa_rg_line); free(bwa_rg_id);
+ bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
return 0;
}
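
A small note on the g_log_n table kept in bwase_initialize() above, and on the matching 4.343*log(n_sub+1) term in mem_pair(): 4.343 is approximately 10/ln(10), so the table holds round(10*log10(i)), i.e. the number of (sub)optimal hits expressed on the Phred scale. A throwaway check, compiled with -lm:

#include <math.h>
#include <stdio.h>

int main(void)
{
	int i, g_log_n[256];
	g_log_n[0] = 0;
	for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); /* as in bwase_initialize() */
	for (i = 1; i <= 4; ++i)
		printf("g_log_n[%d] = %d (10*log10(%d) = %.2f)\n", i, g_log_n[i], i, 10. * log10(i));
	return 0;
}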
diff --git a/bwase.h b/bwase.h
index f8e9b0a..26a9f68 100644
--- a/bwase.h
+++ b/bwase.h
@@ -14,7 +14,7 @@ extern "C" {
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
// Refine the approximate position of the sequence to an actual placement for the sequence.
- void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
// Backfill certain alignment properties mainly centering around number of matches.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
// Calculate the end position of a read given a certain sequence.
diff --git a/bwaseqio.c b/bwaseqio.c
index e22d4cd..c1e9f97 100644
--- a/bwaseqio.c
+++ b/bwaseqio.c
@@ -5,7 +5,7 @@
#include "bamlite.h"
#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
extern unsigned char nst_nt4_table[256];
static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
diff --git a/bwt.c b/bwt.c
index fcc141e..4ee9ea8 100644
--- a/bwt.c
+++ b/bwt.c
@@ -45,6 +45,14 @@ void bwt_gen_cnt_table(bwt_t *bwt)
}
}
+static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
+{
+ bwtint_t x = k - (k > bwt->primary);
+ x = bwt_B0(bwt, x);
+ x = bwt->L2[x] + bwt_occ(bwt, k, x);
+ return k == bwt->primary? 0 : x;
+}
+
// bwt->bwt and bwt->occ must be precalculated
void bwt_cal_sa(bwt_t *bwt, int intv)
{
@@ -95,23 +103,22 @@ static inline int __occ_aux(uint64_t y, int c)
return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
}
-inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
+bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
{
- bwtint_t n, l, j;
- uint32_t *p;
+ bwtint_t n;
+ uint32_t *p, *end;
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
if (k == (bwtint_t)(-1)) return 0;
- if (k >= bwt->primary) --k; // because $ is not in bwt
+ k -= (k >= bwt->primary); // because $ is not in bwt
// retrieve Occ at k/OCC_INTERVAL
n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
p += sizeof(bwtint_t); // jump to the start of the first BWT cell
// calculate Occ up to the last k/32
- j = k >> 5 << 5;
- for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
- n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
+ end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
+ for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
// calculate Occ
n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
@@ -121,7 +128,7 @@ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
}
// an analogy to bwt_occ() but more efficient, requiring k <= l
-inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
+void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
{
bwtint_t _k, _l;
_k = (k >= bwt->primary)? k-1 : k;
@@ -158,52 +165,53 @@ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint
((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
+ (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
-inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
+void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
{
- bwtint_t l, j, x;
- uint32_t *p;
+ bwtint_t x;
+ uint32_t *p, tmp, *end;
if (k == (bwtint_t)(-1)) {
memset(cnt, 0, 4 * sizeof(bwtint_t));
return;
}
- if (k >= bwt->primary) --k; // because $ is not in bwt
+ k -= (k >= bwt->primary); // because $ is not in bwt
p = bwt_occ_intv(bwt, k);
memcpy(cnt, p, 4 * sizeof(bwtint_t));
- p += sizeof(bwtint_t);
- j = k >> 4 << 4;
- for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
- x += __occ_aux4(bwt, *p);
- x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
+ p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
+ end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
+ for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
+ tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
+ x += __occ_aux4(bwt, tmp) - (~k&15);
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
}
// an analogy to bwt_occ4() but more efficient, requiring k <= l
-inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
+void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
{
bwtint_t _k, _l;
- _k = (k >= bwt->primary)? k-1 : k;
- _l = (l >= bwt->primary)? l-1 : l;
- if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
+ _k = k - (k >= bwt->primary);
+ _l = l - (l >= bwt->primary);
+ if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
bwt_occ4(bwt, k, cntk);
bwt_occ4(bwt, l, cntl);
} else {
- bwtint_t i, j, x, y;
- uint32_t *p;
- if (k >= bwt->primary) --k; // because $ is not in bwt
- if (l >= bwt->primary) --l;
+ bwtint_t x, y;
+ uint32_t *p, tmp, *endk, *endl;
+ k -= (k >= bwt->primary); // because $ is not in bwt
+ l -= (l >= bwt->primary);
p = bwt_occ_intv(bwt, k);
memcpy(cntk, p, 4 * sizeof(bwtint_t));
- p += sizeof(bwtint_t);
+ p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
// prepare cntk[]
- j = k >> 4 << 4;
- for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
- x += __occ_aux4(bwt, *p);
+ endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
+ endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
+ for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
y = x;
- x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
+ tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
+ x += __occ_aux4(bwt, tmp) - (~k&15);
// calculate cntl[] and finalize cntk[]
- j = l >> 4 << 4;
- for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
- y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
+ for (; p < endl; ++p) y += __occ_aux4(bwt, *p);
+ tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
+ y += __occ_aux4(bwt, tmp) - (~l&15);
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
@@ -277,7 +285,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
-int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
@@ -285,45 +293,45 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
mem->n = 0;
if (q[x] > 3) return x + 1;
+ if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
kv_init(a[0]); kv_init(a[1]);
- prev = tmpvec[0]? tmpvec[0] : &a[0];
- curr = tmpvec[1]? tmpvec[1] : &a[1];
- bwt_set_intv(bwt, q[x], ik);
+ prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
+ curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
+ bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
ik.info = x + 1;
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
- if (q[i] < 4) {
- c = 3 - q[i];
+ if (q[i] < 4) { // an A/C/G/T base
+ c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
- if (ok[c].x[2] != ik.x[2]) // change of the interval size
+ if (ok[c].x[2] != ik.x[2]) { // change of the interval size
kv_push(bwtintv_t, *curr, ik);
- if (ok[c].x[2] == 0) break; // cannot be extended
+ if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
+ }
ik = ok[c]; ik.info = i + 1;
} else { // an ambiguous base
kv_push(bwtintv_t, *curr, ik);
- break; // cannot be extended; in this case, i<len always stands
+ break; // always terminate extension at an ambiguous base; in this case, i<len always stands
}
}
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
- bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
+ bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
ret = curr->a[0].info; // this will be the returned value
swap = curr; curr = prev; prev = swap;
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
- if (q[i] > 3) break;
- c = i < 0? 0 : q[i];
+ c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
bwt_extend(bwt, p, ok, 1);
- if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
- if (curr->n == 0) { // curr->n to make sure there is no longer matches
+ if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
+ if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
kv_push(bwtintv_t, *mem, ik);
}
} // otherwise the match is contained in another longer match
- }
- if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) {
+ } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
ok[c].info = p->info;
kv_push(bwtintv_t, *curr, ok[c]);
}
@@ -333,7 +341,83 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
}
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
- if (tmpvec[0] == 0) free(a[0].a);
- if (tmpvec[1] == 0) free(a[1].a);
+ if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
+ if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
return ret;
}
+
+/*************************
+ * Read/write BWT and SA *
+ *************************/
+
+void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
+{
+ FILE *fp;
+ fp = xopen(fn, "wb");
+ fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
+ fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
+ fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
+ fclose(fp);
+}
+
+void bwt_dump_sa(const char *fn, const bwt_t *bwt)
+{
+ FILE *fp;
+ fp = xopen(fn, "wb");
+ fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
+ fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
+ fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
+ fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
+ fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
+ fclose(fp);
+}
+
+void bwt_restore_sa(const char *fn, bwt_t *bwt)
+{
+ char skipped[256];
+ FILE *fp;
+ bwtint_t primary;
+
+ fp = xopen(fn, "rb");
+ fread(&primary, sizeof(bwtint_t), 1, fp);
+ xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
+ fread(skipped, sizeof(bwtint_t), 4, fp); // skip
+ fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
+ fread(&primary, sizeof(bwtint_t), 1, fp);
+ xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
+
+ bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
+ bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
+ bwt->sa[0] = -1;
+
+ fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
+ fclose(fp);
+}
+
+bwt_t *bwt_restore_bwt(const char *fn)
+{
+ bwt_t *bwt;
+ FILE *fp;
+
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+ fp = xopen(fn, "rb");
+ fseek(fp, 0, SEEK_END);
+ bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
+ bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
+ fseek(fp, 0, SEEK_SET);
+ fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
+ fread(bwt->L2+1, sizeof(bwtint_t), 4, fp);
+ fread(bwt->bwt, 4, bwt->bwt_size, fp);
+ bwt->seq_len = bwt->L2[4];
+ fclose(fp);
+ bwt_gen_cnt_table(bwt);
+
+ return bwt;
+}
+
+void bwt_destroy(bwt_t *bwt)
+{
+ if (bwt == 0) return;
+ free(bwt->sa); free(bwt->bwt);
+ free(bwt);
+}
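
A sanity check on the pointer arithmetic introduced in the rewritten bwt_occ()/bwt_occ4(): because OCC_INTERVAL is now defined as 1<<OCC_INTV_SHIFT, masking with ~OCC_INTV_MASK gives the same interval start as the old k/OCC_INTERVAL*OCC_INTERVAL expression, and the new 'end' pointer simply counts the full 32-base blocks between that start and k (the >>4 variant in bwt_occ4() does the same for 16-base words). A standalone assertion-based sketch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define OCC_INTV_SHIFT 7
#define OCC_INTERVAL   (1LL<<OCC_INTV_SHIFT)
#define OCC_INTV_MASK  (OCC_INTERVAL - 1)

int main(void)
{
	uint64_t k;
	for (k = 0; k < 100000; ++k) {
		/* interval start: old division form vs. new mask form */
		assert((k & ~OCC_INTV_MASK) == k / OCC_INTERVAL * OCC_INTERVAL);
		/* number of full 32-base blocks between the interval start and k,
		 * as used for the 'end' pointer in the rewritten bwt_occ() */
		assert((k>>5) - ((k & ~OCC_INTV_MASK)>>5) == (k - (k & ~OCC_INTV_MASK)) / 32);
	}
	puts("ok");
	return 0;
}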
diff --git a/bwt.h b/bwt.h
index 5823f82..e7b0f97 100644
--- a/bwt.h
+++ b/bwt.h
@@ -30,8 +30,10 @@
#include <stdint.h>
-// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line
-#define OCC_INTERVAL 0x80
+// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80
+#define OCC_INTV_SHIFT 7
+#define OCC_INTERVAL (1LL<<OCC_INTV_SHIFT)
+#define OCC_INTV_MASK (OCC_INTERVAL - 1)
#ifndef BWA_UBYTE
#define BWA_UBYTE
@@ -74,13 +76,6 @@ typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
* called bwt_B0 instead of bwt_B */
#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
-// inverse Psi function
-#define bwt_invPsi(bwt, k) \
- (((k) == (bwt)->primary)? 0 : \
- ((k) < (bwt)->primary)? \
- (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \
- : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
-
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
#ifdef __cplusplus
@@ -121,7 +116,9 @@ extern "C" {
* Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
* Return the end of the longest exact match starting from _x_.
*/
- int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+
+ // SMEM iterator interface
#ifdef __cplusplus
}
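
The prototype above gains a min_intv argument, and the matching bwt.c hunk also makes it legal to pass tmpvec as NULL. A hedged usage sketch under those two observations; print_smems() is a hypothetical helper, and the packing of the query interval into bwtintv_t::info (start in the high 32 bits, end in the low 32) is read off the bwt_smem1() body above:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "bwt.h"   /* assumes the bwt.h from this commit is on the include path */

/* Hypothetical helper: report all SMEMs of an encoded query (0..3 = A/C/G/T, >3 = N)
 * that occur in at least min_intv positions of the index. */
static void print_smems(const bwt_t *bwt, int len, const uint8_t *q, int min_intv)
{
	int x = 0;
	size_t i;
	bwtintv_v mem;
	mem.n = mem.m = 0; mem.a = 0;
	while (x < len) {
		x = bwt_smem1(bwt, len, q, x, min_intv, &mem, 0); /* 0: let it manage its own temp vectors */
		for (i = 0; i < mem.n; ++i) {
			bwtintv_t *p = &mem.a[i];
			printf("SMEM [%d,%d) with %ld occurrence(s)\n",
			       (int)(p->info>>32), (int)(uint32_t)p->info, (long)p->x[2]);
		}
	}
	free(mem.a);
}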
diff --git a/bwt_lite.c b/bwt_lite.c
index dd411e1..902e0fc 100644
--- a/bwt_lite.c
+++ b/bwt_lite.c
@@ -65,7 +65,7 @@ inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
return n;
}
-inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
+void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
{
uint32_t x, b;
if (k == (uint32_t)(-1)) {
@@ -80,7 +80,7 @@ inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
x -= 15 - (k&15);
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
}
-inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
+void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
{
bwtl_occ4(bwt, k, cntk);
bwtl_occ4(bwt, l, cntl);
diff --git a/bwtaln.c b/bwtaln.c
index efc7f66..96d4026 100644
--- a/bwtaln.c
+++ b/bwtaln.c
@@ -11,6 +11,7 @@
#include "bwtaln.h"
#include "bwtgap.h"
#include "utils.h"
+#include "bwa.h"
#ifdef HAVE_PTHREAD
#include <pthread.h>
@@ -219,32 +220,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
bwa_seq_close(ks);
}
-char *bwa_infer_prefix(const char *hint)
-{
- char *prefix;
- int l_hint;
- FILE *fp;
- l_hint = strlen(hint);
- prefix = malloc(l_hint + 3 + 4 + 1);
- strcpy(prefix, hint);
- strcpy(prefix + l_hint, ".64.bwt");
- if ((fp = fopen(prefix, "rb")) != 0) {
- fclose(fp);
- prefix[l_hint + 3] = 0;
- return prefix;
- } else {
- strcpy(prefix + l_hint, ".bwt");
- if ((fp = fopen(prefix, "rb")) == 0) {
- free(prefix);
- return 0;
- } else {
- fclose(fp);
- prefix[l_hint] = 0;
- return prefix;
- }
- }
-}
-
int bwa_aln(int argc, char *argv[])
{
int c, opte = -1;
@@ -252,7 +227,7 @@ int bwa_aln(int argc, char *argv[])
char *prefix;
opt = gap_init_opt();
- while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
+ while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
switch (c) {
case 'n':
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
@@ -272,7 +247,6 @@ int bwa_aln(int argc, char *argv[])
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
case 'R': opt->max_top2 = atoi(optarg); break;
case 'q': opt->trim_qual = atoi(optarg); break;
- case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
case 'f': xreopen(optarg, "wb", stdout); break;
case 'b': opt->mode |= BWA_MODE_BAM; break;
@@ -310,7 +284,6 @@ int bwa_aln(int argc, char *argv[])
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
fprintf(stderr, " -B INT length of barcode\n");
-// fprintf(stderr, " -c input sequences are in the color space\n");
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
@@ -330,7 +303,7 @@ int bwa_aln(int argc, char *argv[])
k = l;
}
}
- if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
+ if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
free(opt);
return 0;
diff --git a/bwtaln.h b/bwtaln.h
index 39eaf4b..412cc04 100644
--- a/bwtaln.h
+++ b/bwtaln.h
@@ -107,7 +107,6 @@ typedef struct {
} gap_opt_t;
#define BWA_PET_STD 1
-#define BWA_PET_SOLID 2
typedef struct {
int max_isize, force_isize;
diff --git a/bwtindex.c b/bwtindex.c
index 938e982..298153d 100644
--- a/bwtindex.c
+++ b/bwtindex.c
@@ -36,17 +36,160 @@
#include "main.h"
#include "utils.h"
-bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
-void bwa_pac_rev_core(const char *fn, const char *fn_rev);
+#ifdef _DIVBWT
+#include "divsufsort.h"
+#endif
-int bwa_index(int argc, char *argv[])
+int is_bwt(ubyte_t *T, int n);
+
+int64_t bwa_seq_len(const char *fn_pac)
+{
+ FILE *fp;
+ int64_t pac_len;
+ ubyte_t c;
+ fp = xopen(fn_pac, "rb");
+ fseek(fp, -1, SEEK_END);
+ pac_len = ftell(fp);
+ fread(&c, 1, 1, fp);
+ fclose(fp);
+ return (pac_len - 1) * 4 + (int)c;
+}
+
+bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
+{
+ bwt_t *bwt;
+ ubyte_t *buf, *buf2;
+ int i, pac_size;
+ FILE *fp;
+
+ // initialization
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+ bwt->seq_len = bwa_seq_len(fn_pac);
+ bwt->bwt_size = (bwt->seq_len + 15) >> 4;
+ fp = xopen(fn_pac, "rb");
+
+ // prepare sequence
+ pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
+ buf2 = (ubyte_t*)calloc(pac_size, 1);
+ fread(buf2, 1, pac_size, fp);
+ fclose(fp);
+ memset(bwt->L2, 0, 5 * 4);
+ buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
+ for (i = 0; i < bwt->seq_len; ++i) {
+ buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
+ ++bwt->L2[1+buf[i]];
+ }
+ for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
+ free(buf2);
+
+ // Burrows-Wheeler Transform
+ if (use_is) {
+ bwt->primary = is_bwt(buf, bwt->seq_len);
+ } else {
+#ifdef _DIVBWT
+ bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
+#else
+ err_fatal_simple("libdivsufsort is not compiled in.");
+#endif
+ }
+ bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
+ for (i = 0; i < bwt->seq_len; ++i)
+ bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
+ free(buf);
+ return bwt;
+}
+
+int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
+{
+ bwt_t *bwt;
+ int c, use_is = 1;
+ while ((c = getopt(argc, argv, "d")) >= 0) {
+ switch (c) {
+ case 'd': use_is = 0; break;
+ default: return 1;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
+ return 1;
+ }
+ bwt = bwt_pac2bwt(argv[optind], use_is);
+ bwt_dump_bwt(argv[optind+1], bwt);
+ bwt_destroy(bwt);
+ return 0;
+}
+
+#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
+
+void bwt_bwtupdate_core(bwt_t *bwt)
{
+ bwtint_t i, k, c[4], n_occ;
+ uint32_t *buf;
+
+ n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
+ bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
+ buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
+ c[0] = c[1] = c[2] = c[3] = 0;
+ for (i = k = 0; i < bwt->seq_len; ++i) {
+ if (i % OCC_INTERVAL == 0) {
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+ k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
+ }
+ if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
+ ++c[bwt_B00(bwt, i)];
+ }
+ // the last element
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+ xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
+ // update bwt
+ free(bwt->bwt); bwt->bwt = buf;
+}
+
+int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
+{
+ bwt_t *bwt;
+ if (argc < 2) {
+ fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
+ return 1;
+ }
+ bwt = bwt_restore_bwt(argv[1]);
+ bwt_bwtupdate_core(bwt);
+ bwt_dump_bwt(argv[1], bwt);
+ bwt_destroy(bwt);
+ return 0;
+}
+
+int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
+{
+ bwt_t *bwt;
+ int c, sa_intv = 32;
+ while ((c = getopt(argc, argv, "i:")) >= 0) {
+ switch (c) {
+ case 'i': sa_intv = atoi(optarg); break;
+ default: return 1;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
+ return 1;
+ }
+ bwt = bwt_restore_bwt(argv[optind]);
+ bwt_cal_sa(bwt, sa_intv);
+ bwt_dump_sa(argv[optind+1], bwt);
+ bwt_destroy(bwt);
+ return 0;
+}
+
+int bwa_index(int argc, char *argv[]) // the "index" command
+{
+ extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
+
char *prefix = 0, *str, *str2, *str3;
- int c, algo_type = 0, is_color = 0, is_64 = 0;
+ int c, algo_type = 0, is_64 = 0;
clock_t t;
int64_t l_pac;
- while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
+ while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "div") == 0) algo_type = 1;
@@ -55,7 +198,6 @@ int bwa_index(int argc, char *argv[])
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
break;
case 'p': prefix = strdup(optarg); break;
- case 'c': is_color = 1; break;
case '6': is_64 = 1; break;
default: return 1;
}
@@ -67,7 +209,6 @@ int bwa_index(int argc, char *argv[])
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
-// fprintf(stderr, " -c build color-space index\n");
fprintf(stderr, "\n");
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
@@ -83,29 +224,13 @@ int bwa_index(int argc, char *argv[])
str2 = (char*)calloc(strlen(prefix) + 10, 1);
str3 = (char*)calloc(strlen(prefix) + 10, 1);
- if (is_color == 0) { // nucleotide indexing
+ { // nucleotide indexing
gzFile fp = xzopen(argv[optind], "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
- } else { // color indexing
- gzFile fp = xzopen(argv[optind], "r");
- strcat(strcpy(str, prefix), ".nt");
- t = clock();
- fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
- l_pac = bns_fasta2bntseq(fp, str, 0);
- fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
- gzclose(fp);
- {
- char *tmp_argv[3];
- tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
- t = clock();
- fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
- bwa_pac2cspac(3, tmp_argv);
- fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
- }
}
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
{
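
On the interleaving done by bwt_bwtupdate_core() above: every OCC_INTERVAL (=128) bases, a block of four Occ counters is spliced in front of the packed BWT words, plus one trailing block, so the buffer grows by n_occ*sizeof(bwtint_t) 32-bit words (four counters of sizeof(bwtint_t) bytes occupy exactly sizeof(bwtint_t) uint32 words). A small size-bookkeeping sketch; the 64-bit bwtint_t and the 3 Gbp length are assumptions for illustration only:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t bwtint_t;   /* assumed width for this sketch */
#define OCC_INTERVAL 0x80

int main(void)
{
	bwtint_t seq_len = 3000000000ULL;                      /* illustrative 3 Gbp sequence */
	bwtint_t plain   = (seq_len + 15) >> 4;                /* packed BWT, in uint32 words */
	bwtint_t n_occ   = (seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
	bwtint_t updated = plain + n_occ * sizeof(bwtint_t);   /* 4 counters = sizeof(bwtint_t) words */
	printf("plain: %llu words (%.2f GB), with Occ blocks: %llu words (%.2f GB)\n",
	       (unsigned long long)plain, plain * 4.0 / 1e9,
	       (unsigned long long)updated, updated * 4.0 / 1e9);
	return 0;
}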
diff --git a/bwtio.c b/bwtio.c
deleted file mode 100644
index 7508609..0000000
--- a/bwtio.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "bwt.h"
-#include "utils.h"
-
-void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
-{
- FILE *fp;
- fp = xopen(fn, "wb");
- fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
- fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
- fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
- fclose(fp);
-}
-
-void bwt_dump_sa(const char *fn, const bwt_t *bwt)
-{
- FILE *fp;
- fp = xopen(fn, "wb");
- fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
- fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
- fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
- fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
- fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
- fclose(fp);
-}
-
-void bwt_restore_sa(const char *fn, bwt_t *bwt)
-{
- char skipped[256];
- FILE *fp;
- bwtint_t primary;
-
- fp = xopen(fn, "rb");
- fread(&primary, sizeof(bwtint_t), 1, fp);
- xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
- fread(skipped, sizeof(bwtint_t), 4, fp); // skip
- fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
- fread(&primary, sizeof(bwtint_t), 1, fp);
- xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
-
- bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
- bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
- bwt->sa[0] = -1;
-
- fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
- fclose(fp);
-}
-
-bwt_t *bwt_restore_bwt(const char *fn)
-{
- bwt_t *bwt;
- FILE *fp;
-
- bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
- fp = xopen(fn, "rb");
- fseek(fp, 0, SEEK_END);
- bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
- bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
- fseek(fp, 0, SEEK_SET);
- fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
- fread(bwt->L2+1, sizeof(bwtint_t), 4, fp);
- fread(bwt->bwt, 4, bwt->bwt_size, fp);
- bwt->seq_len = bwt->L2[4];
- fclose(fp);
- bwt_gen_cnt_table(bwt);
-
- return bwt;
-}
-
-void bwt_destroy(bwt_t *bwt)
-{
- if (bwt == 0) return;
- free(bwt->sa); free(bwt->bwt);
- free(bwt);
-}
diff --git a/bwtmisc.c b/bwtmisc.c
deleted file mode 100644
index c35d684..0000000
--- a/bwtmisc.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3 at sanger.ac.uk> */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include "bntseq.h"
-#include "utils.h"
-#include "main.h"
-#include "bwt.h"
-
-#ifdef _DIVBWT
-#include "divsufsort.h"
-#endif
-
-int is_bwt(ubyte_t *T, int n);
-
-int64_t bwa_seq_len(const char *fn_pac)
-{
- FILE *fp;
- int64_t pac_len;
- ubyte_t c;
- fp = xopen(fn_pac, "rb");
- fseek(fp, -1, SEEK_END);
- pac_len = ftell(fp);
- fread(&c, 1, 1, fp);
- fclose(fp);
- return (pac_len - 1) * 4 + (int)c;
-}
-
-bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
-{
- bwt_t *bwt;
- ubyte_t *buf, *buf2;
- int i, pac_size;
- FILE *fp;
-
- // initialization
- bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
- bwt->seq_len = bwa_seq_len(fn_pac);
- bwt->bwt_size = (bwt->seq_len + 15) >> 4;
- fp = xopen(fn_pac, "rb");
-
- // prepare sequence
- pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
- buf2 = (ubyte_t*)calloc(pac_size, 1);
- fread(buf2, 1, pac_size, fp);
- fclose(fp);
- memset(bwt->L2, 0, 5 * 4);
- buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
- for (i = 0; i < bwt->seq_len; ++i) {
- buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
- ++bwt->L2[1+buf[i]];
- }
- for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
- free(buf2);
-
- // Burrows-Wheeler Transform
- if (use_is) {
- bwt->primary = is_bwt(buf, bwt->seq_len);
- } else {
-#ifdef _DIVBWT
- bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
-#else
- err_fatal_simple("libdivsufsort is not compiled in.");
-#endif
- }
- bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
- for (i = 0; i < bwt->seq_len; ++i)
- bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
- free(buf);
- return bwt;
-}
-
-int bwa_pac2bwt(int argc, char *argv[])
-{
- bwt_t *bwt;
- int c, use_is = 1;
- while ((c = getopt(argc, argv, "d")) >= 0) {
- switch (c) {
- case 'd': use_is = 0; break;
- default: return 1;
- }
- }
- if (optind + 2 > argc) {
- fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
- return 1;
- }
- bwt = bwt_pac2bwt(argv[optind], use_is);
- bwt_dump_bwt(argv[optind+1], bwt);
- bwt_destroy(bwt);
- return 0;
-}
-
-#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
-
-void bwt_bwtupdate_core(bwt_t *bwt)
-{
- bwtint_t i, k, c[4], n_occ;
- uint32_t *buf;
-
- n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
- bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
- buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
- c[0] = c[1] = c[2] = c[3] = 0;
- for (i = k = 0; i < bwt->seq_len; ++i) {
- if (i % OCC_INTERVAL == 0) {
- memcpy(buf + k, c, sizeof(bwtint_t) * 4);
- k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
- }
- if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
- ++c[bwt_B00(bwt, i)];
- }
- // the last element
- memcpy(buf + k, c, sizeof(bwtint_t) * 4);
- xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
- // update bwt
- free(bwt->bwt); bwt->bwt = buf;
-}
-
-int bwa_bwtupdate(int argc, char *argv[])
-{
- bwt_t *bwt;
- if (argc < 2) {
- fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
- return 1;
- }
- bwt = bwt_restore_bwt(argv[1]);
- bwt_bwtupdate_core(bwt);
- bwt_dump_bwt(argv[1], bwt);
- bwt_destroy(bwt);
- return 0;
-}
-
-const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
-
-/* this function is not memory efficient, but this will make life easier
- Ideally we should also change .amb files as one 'N' in the nucleotide
- sequence leads to two ambiguous colors. I may do this later... */
-uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
-{
- uint8_t *pac, *cspac;
- bwtint_t i;
- int c1, c2;
- pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
- cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
- fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
- rewind(bns->fp_pac);
- c1 = pac[0]>>6; cspac[0] = c1<<6;
- for (i = 1; i < bns->l_pac; ++i) {
- c2 = pac[i>>2] >> (~i&3)*2 & 3;
- cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
- c1 = c2;
- }
- free(pac);
- return cspac;
-}
-
-int bwa_pac2cspac(int argc, char *argv[])
-{
- bntseq_t *bns;
- uint8_t *cspac, ct;
- char *str;
- FILE *fp;
-
- if (argc < 3) {
- fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
- return 1;
- }
- bns = bns_restore(argv[1]);
- cspac = bwa_pac2cspac_core(bns);
- bns_dump(bns, argv[2]);
- // now write cspac
- str = (char*)calloc(strlen(argv[2]) + 5, 1);
- strcat(strcpy(str, argv[2]), ".pac");
- fp = xopen(str, "wb");
- fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
- ct = bns->l_pac % 4;
- fwrite(&ct, 1, 1, fp);
- fclose(fp);
- bns_destroy(bns);
- free(cspac);
- return 0;
-}
-
-int bwa_bwt2sa(int argc, char *argv[])
-{
- bwt_t *bwt;
- int c, sa_intv = 32;
- while ((c = getopt(argc, argv, "i:")) >= 0) {
- switch (c) {
- case 'i': sa_intv = atoi(optarg); break;
- default: return 1;
- }
- }
- if (optind + 2 > argc) {
- fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
- return 1;
- }
- bwt = bwt_restore_bwt(argv[optind]);
- bwt_cal_sa(bwt, sa_intv);
- bwt_dump_sa(argv[optind+1], bwt);
- bwt_destroy(bwt);
- return 0;
-}
diff --git a/bwtsw2.h b/bwtsw2.h
index 0a1b860..0ec9676 100644
--- a/bwtsw2.h
+++ b/bwtsw2.h
@@ -12,8 +12,8 @@
#define BSW2_FLAG_RESCUED 0x800
typedef struct {
- int skip_sw:16, hard_clip:16;
- int a, b, q, r, t, qr, bw, max_ins;
+ int skip_sw:8, cpy_cmt:8, hard_clip:16;
+ int a, b, q, r, t, qr, bw, max_ins, max_chain_gap;
int z, is, t_seeds, multi_2nd;
float mask_level, coef;
int n_threads, chunk_size;
@@ -45,7 +45,7 @@ typedef struct {
typedef struct {
int l, tid;
- char *name, *seq, *qual, *sam;
+ char *name, *seq, *qual, *sam, *comment;
} bsw2seq1_t;
#ifdef __cplusplus
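
The new comment field added above (presumably filled from the read comment by the bseq_read()-based loop in the bwtsw2_aux.c hunk further down) is only copied into the SAM output when it already looks like a SAM optional field; the check in the print_hits() hunk below tests the 3rd and 5th characters for colons. A toy restatement of that test, with a hypothetical helper name:

#include <stdio.h>
#include <string.h>

/* Accept "TAG:TYPE:VALUE" style comments such as "BC:Z:ACGT",
 * reject e.g. Illumina "1:N:0:ATCACG" comments. Illustrative only. */
static int looks_like_sam_tag(const char *comment)
{
	size_t l = comment? strlen(comment) : 0;
	return l >= 6 && comment[2] == ':' && comment[4] == ':';
}

int main(void)
{
	printf("%d %d\n", looks_like_sam_tag("BC:Z:ACGT"), looks_like_sam_tag("1:N:0:ATCACG"));
	return 0;
}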
diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c
index 710051d..bc12d20 100644
--- a/bwtsw2_aux.c
+++ b/bwtsw2_aux.c
@@ -13,9 +13,10 @@
#include "bwtsw2.h"
#include "stdaln.h"
#include "kstring.h"
+#include "bwa.h"
#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
#include "ksort.h"
#define __left_lt(a, b) ((a).end > (b).end)
@@ -54,6 +55,8 @@ bsw2opt_t *bsw2_init_opt()
o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0;
o->mask_level = 0.50f; o->coef = 5.5f;
o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000;
+ o->max_chain_gap = 10000;
+ o->cpy_cmt = 0;
return o;
}
@@ -184,14 +187,14 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
bsw2aux_t *q = b->aux + i;
uint8_t *query;
bwtint_t k;
- int score, path_len, beg, end;
+ int path_len, beg, end;
if (p->l) continue;
beg = (p->flag & 0x10)? lq - p->end : p->beg;
end = (p->flag & 0x10)? lq - p->beg : p->end;
query = seq[(p->flag & 0x10)? 1 : 0] + beg;
for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
- score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
+ aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar);
#if 0
if (name && score != p->G) { // debugging only
@@ -227,7 +230,7 @@ void bsw2_debug_hits(const bwtsw2_t *b)
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
if (p->G > 0)
- printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev);
+ printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev);
}
}
@@ -286,12 +289,13 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8
}
}
b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
- bsw2_chain_filter(opt, l, b);
+ bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained
for (k = 0; k < 2; ++k) {
bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem);
merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
bsw2_resolve_duphits(0, 0, bb[k][0], 0);
bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem);
+ bsw2_resolve_duphits(0, 0, bb[k][0], 0);
b[k] = bb[k][0];
free(bb[k]);
}
@@ -549,7 +553,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
else kputc(ks->qual[j], &str);
}
- } else ksprintf(&str, "\t*");
+ } else kputs("\t*", &str);
// print optional tags
ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm);
if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn);
@@ -557,6 +561,12 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
if (p->flag&BSW2_FLAG_MATESW) type |= 1;
if (p->flag&BSW2_FLAG_TANDEM) type |= 2;
if (type) ksprintf(&str, "\tXT:i:%d", type);
+ if (opt->cpy_cmt && ks->comment) {
+ int l = strlen(ks->comment);
+ if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') {
+ kputc('\t', &str); kputs(ks->comment, &str);
+ }
+ }
kputc('\n', &str);
}
ks->sam = str.s;
@@ -747,23 +757,14 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
_seq->n = 0;
}
-static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p)
-{
- p->tid = -1;
- p->l = ks->seq.l;
- p->name = strdup(ks->name.s);
- p->seq = strdup(ks->seq.s);
- p->qual = ks->qual.l? strdup(ks->qual.s) : 0;
- p->sam = 0;
-}
-
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
{
gzFile fp, fp2;
kseq_t *ks, *ks2;
- int l, size = 0, is_pe = 0;
+ int l, is_pe = 0, i, n;
uint8_t *pac;
bsw2seq_t *_seq;
+ bseq1_t *bseq;
pac = calloc(bns->l_pac/4+1, 1);
if (pac == 0) {
@@ -781,34 +782,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
ks2 = kseq_init(fp2);
is_pe = 1;
} else fp2 = 0, ks2 = 0, is_pe = 0;
- while (kseq_read(ks) >= 0) {
- if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/')
- ks->name.l -= 2, ks->name.s[ks->name.l] = 0;
- if (_seq->n == _seq->max) {
- _seq->max = _seq->max? _seq->max<<1 : 1024;
+ while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
+ int size = 0;
+ if (n > _seq->max) {
+ _seq->max = n;
+ kroundup32(_seq->max);
_seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
}
- kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]);
- size += ks->seq.l;
- if (ks2) {
- if (kseq_read(ks2) >= 0) {
- if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/')
- ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0;
- kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge
- size += ks->seq.l;
- } else {
- fprintf(stderr, "[%s] The second query file has fewer reads. Switched to the single-end mode for the following batches.\n", __func__);
- is_pe = 0;
- }
- }
- if (size > opt->chunk_size * opt->n_threads) {
- fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
- process_seqs(_seq, opt, bns, pac, target, is_pe);
- size = 0;
+ _seq->n = n;
+ for (i = 0; i < n; ++i) {
+ bseq1_t *b = &bseq[i];
+ bsw2seq1_t *p = &_seq->seq[i];
+ p->tid = -1; p->l = b->l_seq;
+ p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0;
+ size += p->l;
}
+ fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
+ free(bseq);
+ process_seqs(_seq, opt, bns, pac, target, is_pe);
}
- fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
- process_seqs(_seq, opt, bns, pac, target, is_pe);
// free
free(pac);
free(_seq->seq); free(_seq);
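
The new -C flag only forwards a read's FASTA/Q comment into the SAM line when the comment already looks like a SAM optional field, i.e. a two-character tag and a one-character type separated by colons ("XX:Y:value"), which is why the code above checks for colons at offsets 2 and 4. A minimal sketch of the same test in isolation (the helper name is made up for illustration):

#include <string.h>

/* Sketch: accept comments such as "BC:Z:ACGTACGT" that can be copied
 * verbatim as a SAM optional field; reject free-form descriptions. */
static int comment_is_sam_tag(const char *comment)
{
	int l = comment? (int)strlen(comment) : 0;
	return l >= 6 && comment[2] == ':' && comment[4] == ':';
}
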
diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c
index c734657..381d0b7 100644
--- a/bwtsw2_chain.c
+++ b/bwtsw2_chain.c
@@ -23,15 +23,15 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t
hsaip_t *q = chain + k;
int x = p->qbeg - q->qbeg; // always positive
int y = p->tbeg - q->tbeg;
- if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) {
+ if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained
if (p->qend > q->qend) q->qend = p->qend;
if (p->tend > q->tend) q->tend = p->tend;
++q->chain;
p->chain = shift + k;
break;
- }
+ } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains
}
- if (k < 0) {
+ if (k < 0) { // not added to any previous chains
chain[m] = *p;
chain[m].chain = 1;
chain[m].idx = p->chain = shift + m;
@@ -44,7 +44,7 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t
void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
{
hsaip_t *z[2], *chain[2];
- int i, j, k, n[2], m[2];
+ int i, j, k, n[2], m[2], thres = opt->t_seeds * 2;
char *flag;
// initialization
n[0] = b[0]->n; n[1] = b[1]->n;
@@ -71,6 +71,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
int tmp = p->qbeg;
p->qbeg = len - p->qend; p->qend = len - tmp;
}
+ //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend);
// filtering
flag = calloc(m[0] + m[1], 1);
ks_introsort(hsaip, m[0] + m[1], chain[0]);
@@ -79,7 +80,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
for (j = 0; j < k; ++j) {
hsaip_t *q = chain[0] + j;
if (flag[q->idx]) continue;
- if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) {
+ if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) {
flag[p->idx] = 1;
break;
}
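
The chaining test above now also honours the new -G option: a seed joins an existing chain only if it starts after the chain on both the query and the target, neither gap reaches max_chain_gap, and the diagonal shift stays within the band width; once a chain already holds more than 2*t_seeds seeds, earlier chains are no longer scanned. A stand-alone sketch of the predicate (hypothetical helper, same arithmetic as the chaining() loop):

/* Sketch: may seed (qbeg,tbeg) extend a chain starting at (c_qbeg,c_tbeg)?
 * max_gap plays the role of opt->max_chain_gap and bw of opt->bw. */
static int can_chain(int qbeg, int tbeg, int c_qbeg, int c_tbeg, int max_gap, int bw)
{
	int x = qbeg - c_qbeg;                       /* query gap; positive by construction */
	int y = tbeg - c_tbeg;                       /* target gap */
	if (y <= 0) return 0;                        /* must also advance on the target */
	if (x >= max_gap || y >= max_gap) return 0;  /* new -G limit */
	return x - y <= bw && y - x <= bw;           /* stay inside the band */
}
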
diff --git a/bwtsw2_main.c b/bwtsw2_main.c
index 50355fe..ab126f2 100644
--- a/bwtsw2_main.c
+++ b/bwtsw2_main.c
@@ -6,19 +6,17 @@
#include "bwt.h"
#include "bwtsw2.h"
#include "utils.h"
+#include "bwa.h"
int bwa_bwtsw2(int argc, char *argv[])
{
- extern char *bwa_infer_prefix(const char *hint);
bsw2opt_t *opt;
- bwt_t *target;
- char buf[1024], *prefix;
- bntseq_t *bns;
+ bwaidx_t *idx;
int c;
opt = bsw2_init_opt();
srand48(11);
- while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:S")) >= 0) {
+ while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) {
switch (c) {
case 'q': opt->q = atoi(optarg); break;
case 'r': opt->r = atoi(optarg); break;
@@ -37,6 +35,8 @@ int bwa_bwtsw2(int argc, char *argv[])
case 'f': xreopen(optarg, "w", stdout); break;
case 'I': opt->max_ins = atoi(optarg); break;
case 'S': opt->skip_sw = 1; break;
+ case 'C': opt->cpy_cmt = 1; break;
+ case 'G': opt->max_chain_gap = atoi(optarg); break;
}
}
opt->qr = opt->q + opt->r;
@@ -54,6 +54,7 @@ int bwa_bwtsw2(int argc, char *argv[])
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -f FILE file to output results to instead of stdout\n");
fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n");
+ fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n");
fprintf(stderr, " -M mark multi-part alignments as secondary\n");
fprintf(stderr, " -S skip Smith-Waterman read pairing\n");
fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins);
@@ -62,7 +63,8 @@ int bwa_bwtsw2(int argc, char *argv[])
fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
fprintf(stderr, " -z INT Z-best [%d]\n", opt->z);
fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is);
- fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds);
+ fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds);
+ fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap);
fprintf(stderr, "\n");
fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n");
fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n");
@@ -77,19 +79,10 @@ int bwa_bwtsw2(int argc, char *argv[])
opt->t *= opt->a;
opt->coef *= opt->a;
- if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
- fprintf(stderr, "[%s] fail to locate the index\n", __func__);
- return 0;
- }
- strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt"));
- strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target);
- bns = bns_restore(prefix);
-
- bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
-
- bns_destroy(bns);
- bwt_destroy(target);
- free(opt); free(prefix);
+ if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0;
+ bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
+ bwa_idx_destroy(idx);
+ free(opt);
return 0;
}
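
bwasw now loads its index through the shared bwa_idx_load()/bwa_idx_destroy() pair instead of restoring the .bwt, .sa and annotation files by hand, the same pattern main_mem and example.c use. A minimal sketch of that pattern (the wrapper function is illustrative only):

#include "bwa.h"

/* Sketch: load only the index components a caller needs, then release them. */
static int run_with_index(const char *prefix)
{
	bwaidx_t *idx = bwa_idx_load(prefix, BWA_IDX_BWT|BWA_IDX_BNS);
	if (idx == 0) return 1;            /* index missing or unreadable */
	/* ... work with idx->bwt and idx->bns here ... */
	bwa_idx_destroy(idx);
	return 0;
}
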
diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c
index a6f4d80..cf29087 100644
--- a/bwtsw2_pair.c
+++ b/bwtsw2_pair.c
@@ -6,6 +6,7 @@
#include "bntseq.h"
#include "bwtsw2.h"
#include "kstring.h"
+#include "utils.h"
#ifndef _NO_SSE2
#include "ksw.h"
#else
@@ -24,7 +25,6 @@ typedef struct {
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
{
- extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
int i, k, x, p25, p50, p75, tmp, max_len = 0;
uint64_t *isize;
bsw2pestat_t r;
@@ -44,7 +44,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
isize[k++] = l;
}
- ks_introsort_uint64_t(k, isize);
+ ks_introsort_64(k, isize);
p25 = isize[(int)(.25 * k + .499)];
p50 = isize[(int)(.50 * k + .499)];
p75 = isize[(int)(.75 * k + .499)];
@@ -74,9 +74,9 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
r.low = tmp > max_len? tmp : max_len;
if (r.low < 1) r.low = 1;
r.high = (int)(p75 + 3. * (p75 - p25) + .499);
- if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499);
+ if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499);
r.low = tmp > max_len? tmp : max_len;
- if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499);
+ if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499);
ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
free(isize);
return r;
@@ -127,35 +127,24 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
seq[i] = nst_nt4_table[(int)mseq[i]];
}
#ifndef _NO_SSE2
- {
- ksw_query_t *q;
- ksw_aux_t aux[2];
- // forward Smith-Waterman
- aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
- q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
- ksw_sse2(q, end - beg, ref, &aux[0]);
- free(q);
- if (aux[0].score < opt->t) {
- free(seq);
- return;
- }
- ++aux[0].qe; ++aux[0].te;
- // reverse Smith-Waterman
- seq_reverse(aux[0].qe, seq, 0);
- seq_reverse(aux[0].te, ref, 0);
- q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat);
- ksw_sse2(q, aux[0].te, ref, &aux[1]);
- free(q);
- ++aux[1].qe; ++aux[1].te;
- // write output
- a->G = aux[0].score;
- a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
+ { // FIXME!!! The following block has not been tested since the update of the ksw library
+ int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
+ kswr_t aln;
+ aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
+ a->G = aln.score;
+ a->G2 = aln.score2;
+ if (a->G < opt->t) a->G = 0;
if (a->G2 < opt->t) a->G2 = 0;
if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
- a->k = beg + (aux[0].te - aux[1].te);
- a->len = aux[1].te;
- a->beg = aux[0].qe - aux[1].qe;
- a->end = aux[0].qe;
+ a->k = beg + aln.tb;
+ a->len = aln.te - aln.tb + 1;
+ a->beg = aln.qb;
+ a->end = aln.qe + 1;
+ /*
+ printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n');
+ printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n');
+ printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
+ */
}
#else
{
@@ -168,6 +157,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2);
if (a->G < opt->t) a->G = 0;
if (a->G2 < opt->t) a->G2 = 0;
+ if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
a->k = beg + path[0].i - 1;
a->len = path[1].i - path[0].i + 1;
a->beg = path[0].j - 1;
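
Mate rescue now makes a single ksw_align() call and reads both coordinates and the sub-optimal score from the returned kswr_t, replacing the forward-plus-reverse ksw_sse2() passes (the block is marked FIXME because it has not been re-tested since the ksw update). A reduced sketch of the call; the wrapper name and parameters are placeholders, the flags mirror the code above:

#include <stdint.h>
#include "ksw.h"

/* Sketch: KSW_XSTART requests start coordinates, KSW_XSUBO the best sub-optimal
 * score, KSW_XBYTE selects the 8-bit kernel when scores stay small, and the low
 * bits of xtra carry the minimum score threshold. qb/qe and tb/te come back
 * 0-based and inclusive, hence the "+1" conversions in bsw2_pair1(). */
static kswr_t mate_sw(int qlen, uint8_t *query, int tlen, uint8_t *target,
                      const int8_t *mat, int gapo, int gape, int min_score)
{
	int xtra = KSW_XSUBO | KSW_XSTART | (qlen * mat[0] < 250? KSW_XBYTE : 0) | min_score;
	return ksw_align(qlen, query, tlen, target, 5, mat, gapo, gape, xtra, 0);
}
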
diff --git a/cs2nt.c b/cs2nt.c
deleted file mode 100644
index dfbce60..0000000
--- a/cs2nt.c
+++ /dev/null
@@ -1,191 +0,0 @@
-#include <string.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include "bwtaln.h"
-#include "stdaln.h"
-
-/*
- Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
- decode as ATTGAC(RBGOG), there are one color change and one nt change;
- if we decode as ATTAAC(RBRBG), there are two color changes.
-
- In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
- as the penalty; otherwise, we will use color quality as the
- penalty. This means we always prefer two consistent color changes over
- a nt change, but if a color has high quality, we may prefer one nt
- change.
-
- In the above example, the penalties of the two types of decoding are
- q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
- otherwise the second. Note that no matter what we choose, the fourth
- base will get a low nt quality.
- */
-
-#define COLOR_MM 19
-#define NUCL_MM 25
-
-static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
-
-/*
- {A,C,G,T,N} -> {0,1,2,3,4}
- nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
- cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
- nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
- btarray[0..4*size]: backtrack array (working space)
- */
-void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
-{
- int h[8], curr, last;
- int x, y, xmin, hmin, k;
-
- // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
-
- // recursion: initial value
- if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
- else {
- for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
- h[nt_ref[0]] = 0;
- }
- // recursion: main loop
- curr = 1; last = 0;
- for (k = 1; k <= size; ++k) {
- for (x = 0; x != 4; ++x) {
- int min = 0x7fffffff, ymin = 0;
- for (y = 0; y != 4; ++y) {
- int s = h[last<<2|y];
- if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
- s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
- if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
- if (s < min) {
- min = s; ymin = y;
- }
- }
- h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
- }
- last = curr; curr = 1 - curr; // swap
- }
- // back trace
- hmin = 0x7fffffff; xmin = 0;
- for (x = 0; x != 4; ++x) {
- if (h[last<<2|x] < hmin) {
- hmin = h[last<<2|x]; xmin = x;
- }
- }
- nt_read[size] = xmin;
- for (k = size - 1; k >= 0; --k)
- nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
-}
-/*
- nt_read[0..size]: nucleotide read sequence: 0/1/2/3
- cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
- tarray[0..size*2-1]: temporary array
- */
-uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
-{
- int k, c1, c2;
- uint8_t *t2array = tarray + size;
- // get the color sequence of nt_read
- c1 = nt_read[0];
- for (k = 1; k <= size; ++k) {
- c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
- tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
- c1 = c2;
- }
- for (k = 1; k != size; ++k) {
- int q = 0;
- if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
- q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
- } else if (tarray[k-1] == cs_read[k-1]>>6) {
- q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
- } else if (tarray[k] == cs_read[k]>>6) {
- q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
- } // else, q = 0
- if (q < 0) q = 0;
- if (q > 60) q = 60;
- t2array[k] = nt_read[k]<<6 | q;
- if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
- }
- return t2array + 1; // of size-2
-}
-
-// this function will be called when p->seq has been reversed by refine_gapped()
-void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
-{
- uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
- int i, len;
- uint8_t *seq;
-
- // set temporary arrays
- if (p->type == BWA_TYPE_NO_MATCH) return;
- len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
- ta = (uint8_t*)malloc(len * 7);
- nt_ref = ta;
- cs_read = nt_ref + len;
- nt_read = cs_read + len;
- btarray = nt_read + len;
- tarray = nt_read + len;
-
-#define __gen_csbase(_cs, _i, _seq) do { \
- int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
- if (q > 60) q = 60; \
- if (_seq[_i] > 3) q = 63; \
- (_cs) = _seq[_i]<<6 | q; \
- } while (0)
-
- // generate len, nt_ref[] and cs_read
- seq = p->strand? p->rseq : p->seq;
- nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
- if (p->cigar == 0) { // no gap or clipping
- len = p->len;
- for (i = 0; i < p->len; ++i) {
- __gen_csbase(cs_read[i], i, seq);
- nt_ref[i+1] = bns_pac(pac, p->pos + i);
- }
- } else {
- int k, z;
- bwtint_t x, y;
- x = p->pos; y = 0;
- for (k = z = 0; k < p->n_cigar; ++k) {
- int l = __cigar_len(p->cigar[k]);
- if (__cigar_op(p->cigar[k]) == FROM_M) {
- for (i = 0; i < l; ++i, ++x, ++y) {
- __gen_csbase(cs_read[z], y, seq);
- nt_ref[z+1] = bns_pac(pac, x);
- ++z;
- }
- } else if (__cigar_op(p->cigar[k]) == FROM_I) {
- for (i = 0; i < l; ++i, ++y) {
- __gen_csbase(cs_read[z], y, seq);
- nt_ref[z+1] = 4;
- ++z;
- }
- } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
- else x += l;
- }
- len = z;
- }
-
- cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
- new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
-
- // update p
- p->len = p->full_len = len - 1;
- for (i = 0; i < p->len; ++i) {
- if ((new_nt_read[i]&0x3f) == 63) {
- p->qual[i] = 33; seq[i] = 4;
- } else {
- p->qual[i] = (new_nt_read[i]&0x3f) + 33;
- seq[i] = new_nt_read[i]>>6;
- }
- }
- p->qual[p->len] = seq[p->len] = 0;
- if (p->strand) {
- memcpy(p->seq, seq, p->len);
- seq_reverse(p->len, p->seq, 1);
- seq_reverse(p->len, p->qual, 0);
- } else {
- memcpy(p->rseq, seq, p->len);
- seq_reverse(p->len, p->rseq, 1);
- }
- free(ta);
-}
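
Removing cs2nt.c drops the SOLiD color-space decoder from this path. For reference, the deleted DP scored each transition as sketched below: a disagreeing color costs at least COLOR_MM (19), or the color quality when that is higher, disagreeing with a known reference base adds NUCL_MM (25), and colors read as 'N' (quality 63) contribute no color penalty. The helper name is illustrative only:

/* Sketch of the per-transition cost used by the removed cs2nt_DP(). */
#define COLOR_MM 19
#define NUCL_MM  25

static int transition_cost(int color_qual, int color_is_n, int color_mismatch, int base_mismatch)
{
	int s = 0;
	if (!color_is_n && color_mismatch)
		s += color_qual < COLOR_MM? COLOR_MM : color_qual;
	if (base_mismatch) s += NUCL_MM;
	return s;
}
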
diff --git a/example.c b/example.c
new file mode 100644
index 0000000..6564cbd
--- /dev/null
+++ b/example.c
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include <zlib.h>
+#include <string.h>
+#include <assert.h>
+#include "bwamem.h"
+#include "kseq.h" // for the FASTA/Q parser
+KSEQ_DECLARE(gzFile)
+
+int main(int argc, char *argv[])
+{
+ bwaidx_t *idx;
+ gzFile fp;
+ kseq_t *ks;
+ mem_opt_t *opt;
+
+ if (argc < 3) {
+ fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
+ return 1;
+ }
+
+ idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
+ assert(idx);
+ fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+ assert(fp);
+ ks = kseq_init(fp); // initialize the FASTA/Q parser
+ opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
+
+ while (kseq_read(ks) >= 0) { // read one sequence
+ mem_alnreg_v ar;
+ int i, k;
+ ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
+ for (i = 0; i < ar.n; ++i) { // traverse each hit
+ mem_aln_t a;
+ if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
+ a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
+ // print alignment
+ printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
+ for (k = 0; k < a.n_cigar; ++k) // print CIGAR
+ printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
+ printf("\t%d\n", a.NM); // print edit distance
+ free(a.cigar); // don't forget to deallocate CIGAR
+ }
+ free(ar.a); // and deallocate the hit list
+ }
+
+ free(opt);
+ kseq_destroy(ks);
+ gzclose(fp);
+ bwa_idx_destroy(idx);
+ return 0;
+}
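
As its usage line says, the example is run as "bwamem-lite <idx.base> <reads.fq>", with "-" accepted for reading the FASTQ from stdin. mem_align1() maps one read at a time against the loaded index; the batched, multi-threaded path used by the mem command instead reads chunks with bseq_read() and hands them to mem_process_seqs(), as main_mem() below does.
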
diff --git a/fastmap.c b/fastmap.c
index 4d7a675..56cfb01 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -2,91 +2,155 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
-#include "bntseq.h"
-#include "bwt.h"
+#include "bwa.h"
+#include "bwamem.h"
#include "kvec.h"
+#include "utils.h"
#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
extern unsigned char nst_nt4_table[256];
-typedef struct {
- const bwt_t *bwt;
- const uint8_t *query;
- int start, len;
- bwtintv_v *tmpvec[2], *matches;
-} smem_i;
+void *kopen(const char *fn, int *_fd);
+int kclose(void *a);
-smem_i *smem_iter_init(const bwt_t *bwt)
+int main_mem(int argc, char *argv[])
{
- smem_i *iter;
- iter = calloc(1, sizeof(smem_i));
- iter->bwt = bwt;
- iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
- iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
- iter->matches = calloc(1, sizeof(bwtintv_v));
- return iter;
-}
+ mem_opt_t *opt;
+ int fd, fd2, i, c, n, copy_comment = 0;
+ gzFile fp, fp2 = 0;
+ kseq_t *ks, *ks2 = 0;
+ bseq1_t *seqs;
+ bwaidx_t *idx;
+ char *rg_line = 0;
+ void *ko = 0, *ko2 = 0;
-void smem_iter_destroy(smem_i *iter)
-{
- free(iter->tmpvec[0]->a);
- free(iter->tmpvec[1]->a);
- free(iter->matches->a);
- free(iter);
-}
+ opt = mem_opt_init();
+ while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
+ if (c == 'k') opt->min_seed_len = atoi(optarg);
+ else if (c == 'w') opt->w = atoi(optarg);
+ else if (c == 'A') opt->a = atoi(optarg);
+ else if (c == 'B') opt->b = atoi(optarg);
+ else if (c == 'O') opt->q = atoi(optarg);
+ else if (c == 'E') opt->r = atoi(optarg);
+ else if (c == 'L') opt->pen_clip = atoi(optarg);
+ else if (c == 'U') opt->pen_unpaired = atoi(optarg);
+ else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
+ else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
+ else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
+ else if (c == 'a') opt->flag |= MEM_F_ALL;
+ else if (c == 'p') opt->flag |= MEM_F_PE;
+ else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
+ else if (c == 'c') opt->max_occ = atoi(optarg);
+ else if (c == 'v') bwa_verbose = atoi(optarg);
+ else if (c == 'r') opt->split_factor = atof(optarg);
+ else if (c == 'C') copy_comment = 1;
+ else if (c == 'R') {
+ if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
+ } else if (c == 's') opt->split_width = atoi(optarg);
+ }
+ if (opt->n_threads < 1) opt->n_threads = 1;
+ if (optind + 1 >= argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
+ fprintf(stderr, "Algorithm options:\n\n");
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
+ fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
+ fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
+ fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
+ fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
+ fprintf(stderr, " -P skip pairing; perform mate SW only\n");
+ fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
+ fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
+ fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
+ fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
+ fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip);
+ fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
+ fprintf(stderr, "\nInput/output options:\n\n");
+ fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
+ fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
+ fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
+ fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
+ fprintf(stderr, " -H hard clipping\n");
+ fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
+ fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
+ fprintf(stderr, "\n");
+ free(opt);
+ return 1;
+ }
-void smem_set_query(smem_i *iter, int len, const uint8_t *query)
-{
- iter->query = query;
- iter->start = 0;
- iter->len = len;
-}
+ mem_fill_scmat(opt->a, opt->b, opt->mat);
+ if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+ bwa_print_sam_hdr(idx->bns, rg_line);
-int smem_next(smem_i *iter)
-{
- iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0;
- if (iter->start >= iter->len || iter->start < 0) return -1;
- while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases
- if (iter->start == iter->len) return -1;
- iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec);
- return iter->start;
+ ko = kopen(argv[optind + 1], &fd);
+ fp = gzdopen(fd, "r");
+ ks = kseq_init(fp);
+ if (optind + 2 < argc) {
+ if (opt->flag&MEM_F_PE) {
+ if (bwa_verbose >= 2)
+ fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
+ } else {
+ ko2 = kopen(argv[optind + 2], &fd2);
+ fp2 = gzdopen(fd2, "r");
+ ks2 = kseq_init(fp2);
+ opt->flag |= MEM_F_PE;
+ }
+ }
+ while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
+ int64_t size = 0;
+ if (!copy_comment)
+ for (i = 0; i < n; ++i) {
+ free(seqs[i].comment); seqs[i].comment = 0;
+ }
+ for (i = 0; i < n; ++i) size += seqs[i].l_seq;
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
+ mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs);
+ free(seqs);
+ }
+
+ free(opt);
+ bwa_idx_destroy(idx);
+ kseq_destroy(ks);
+ gzclose(fp); kclose(ko);
+ if (ks2) {
+ kseq_destroy(ks2);
+ gzclose(fp2); kclose(ko2);
+ }
+ return 0;
}
int main_fastmap(int argc, char *argv[])
{
- int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
+ int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0;
kseq_t *seq;
bwtint_t k;
gzFile fp;
- bwt_t *bwt;
- bntseq_t *bns;
- smem_i *iter;
+ smem_i *itr;
+ const bwtintv_v *a;
+ bwaidx_t *idx;
- while ((c = getopt(argc, argv, "w:l:s")) >= 0) {
+ while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
switch (c) {
- case 's': print_seq = 1; break;
+ case 's': split_width = atoi(optarg); break;
+ case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
}
}
if (optind + 1 >= argc) {
- fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
+ fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", split_width, min_len, min_iwidth);
return 1;
}
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
- { // load the packed sequences, BWT and SA
- char *tmp = calloc(strlen(argv[optind]) + 5, 1);
- strcat(strcpy(tmp, argv[optind]), ".bwt");
- bwt = bwt_restore_bwt(tmp);
- strcat(strcpy(tmp, argv[optind]), ".sa");
- bwt_restore_sa(tmp, bwt);
- free(tmp);
- bns = bns_restore(argv[optind]);
- }
- iter = smem_iter_init(bwt);
+ idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS);
+ itr = smem_itr_init(idx->bwt);
while (kseq_read(seq) >= 0) {
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
@@ -95,10 +159,10 @@ int main_fastmap(int argc, char *argv[])
} else putchar('\n');
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
- smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s);
- while (smem_next(iter) > 0) {
- for (i = 0; i < iter->matches->n; ++i) {
- bwtintv_t *p = &iter->matches->a[i];
+ smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
+ while ((a = smem_next(itr, min_len<<1, split_width)) != 0) {
+ for (i = 0; i < a->n; ++i) {
+ bwtintv_t *p = &a->a[i];
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
if (p->x[2] <= min_iwidth) {
@@ -106,10 +170,10 @@ int main_fastmap(int argc, char *argv[])
bwtint_t pos;
int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32);
- pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
+ pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
if (is_rev) pos -= len - 1;
- bns_cnt_ambi(bns, pos, len, &ref_id);
- printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
+ bns_cnt_ambi(idx->bns, pos, len, &ref_id);
+ printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
}
} else fputs("\t*", stdout);
putchar('\n');
@@ -118,9 +182,8 @@ int main_fastmap(int argc, char *argv[])
puts("//");
}
- smem_iter_destroy(iter);
- bns_destroy(bns);
- bwt_destroy(bwt);
+ smem_itr_destroy(itr);
+ bwa_idx_destroy(idx);
kseq_destroy(seq);
gzclose(fp);
return 0;
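
main_fastmap now drives the shared SMEM iterator instead of carrying a private copy of it: smem_next() returns a const bwtintv_v* per batch and NULL once the query is exhausted, each interval packing the query start/end into p->info and the size of the SA interval into p->x[2]. A minimal sketch of the loop over one 2-bit-encoded query (the function name is made up for illustration):

#include <stdio.h>
#include <stdint.h>
#include "bwt.h"
#include "bwamem.h"

/* Sketch: enumerate super-maximal exact matches of an encoded query. */
static void print_smems(const bwt_t *bwt, int l_query, const uint8_t *query, int min_len, int split_width)
{
	const bwtintv_v *a;
	int i;
	smem_i *itr = smem_itr_init(bwt);
	smem_set_query(itr, l_query, query);
	while ((a = smem_next(itr, min_len<<1, split_width)) != 0) {
		for (i = 0; i < a->n; ++i) {
			const bwtintv_t *p = &a->a[i];
			/* upper 32 bits of info: query start; lower 32 bits: query end; x[2]: #occurrences */
			printf("EM\t%u\t%u\t%ld\n", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
		}
	}
	smem_itr_destroy(itr);
}
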
diff --git a/kbtree.h b/kbtree.h
new file mode 100644
index 0000000..5ed5330
--- /dev/null
+++ b/kbtree.h
@@ -0,0 +1,384 @@
+/*-
+ * Copyright 1997-1999, 2001, John-Mark Gurney.
+ * 2008-2009, Attractive Chaos <attractor at live.co.uk>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __AC_KBTREE_H
+#define __AC_KBTREE_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+typedef struct {
+ int32_t is_internal:1, n:31;
+} kbnode_t;
+
+#define __KB_KEY(type, x) ((type*)((char*)x + 4))
+#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr))
+
+#define __KB_TREE_T(name) \
+ typedef struct { \
+ kbnode_t *root; \
+ int off_key, off_ptr, ilen, elen; \
+ int n, t; \
+ int n_keys, n_nodes; \
+ } kbtree_##name##_t;
+
+#define __KB_INIT(name, key_t) \
+ kbtree_##name##_t *kb_init_##name(int size) \
+ { \
+ kbtree_##name##_t *b; \
+ b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \
+ b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
+ if (b->t < 2) { \
+ free(b); return 0; \
+ } \
+ b->n = 2 * b->t - 1; \
+ b->off_ptr = 4 + b->n * sizeof(key_t); \
+ b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
+ b->elen = (b->off_ptr + 3) >> 2 << 2; \
+ b->root = (kbnode_t*)calloc(1, b->ilen); \
+ ++b->n_nodes; \
+ return b; \
+ }
+
+#define __kb_destroy(b) do { \
+ int i, max = 8; \
+ kbnode_t *x, **top, **stack = 0; \
+ if (b) { \
+ top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \
+ *top++ = (b)->root; \
+ while (top != stack) { \
+ x = *--top; \
+ if (x->is_internal == 0) { free(x); continue; } \
+ for (i = 0; i <= x->n; ++i) \
+ if (__KB_PTR(b, x)[i]) { \
+ if (top - stack == max) { \
+ max <<= 1; \
+ stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \
+ top = stack + (max>>1); \
+ } \
+ *top++ = __KB_PTR(b, x)[i]; \
+ } \
+ free(x); \
+ } \
+ } \
+ free(b); free(stack); \
+ } while (0)
+
+#define __kb_get_first(key_t, b, ret) do { \
+ kbnode_t *__x = (b)->root; \
+ while (__KB_PTR(b, __x)[0] != 0) \
+ __x = __KB_PTR(b, __x)[0]; \
+ (ret) = __KB_KEY(key_t, __x)[0]; \
+ } while (0)
+
+#define __KB_GET_AUX0(name, key_t, __cmp) \
+ static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+ { \
+ int tr, *rr, begin, end, n = x->n >> 1; \
+ if (x->n == 0) return -1; \
+ if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \
+ begin = 0; end = n; \
+ } else { begin = n; end = x->n - 1; } \
+ rr = r? r : &tr; \
+ n = end; \
+ while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \
+ return n; \
+ }
+
+#define __KB_GET_AUX1(name, key_t, __cmp) \
+ static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+ { \
+ int tr, *rr, begin = 0, end = x->n; \
+ if (x->n == 0) return -1; \
+ rr = r? r : &tr; \
+ while (begin < end) { \
+ int mid = (begin + end) >> 1; \
+ if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \
+ else end = mid; \
+ } \
+ if (begin == x->n) { *rr = 1; return x->n - 1; } \
+ if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \
+ return begin; \
+ }
+
+#define __KB_GET(name, key_t) \
+ static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+ { \
+ int i, r = 0; \
+ kbnode_t *x = b->root; \
+ while (x) { \
+ i = __kb_getp_aux_##name(x, k, &r); \
+ if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \
+ if (x->is_internal == 0) return 0; \
+ x = __KB_PTR(b, x)[i + 1]; \
+ } \
+ return 0; \
+ } \
+ static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
+ { \
+ return kb_getp_##name(b, &k); \
+ }
+
+#define __KB_INTERVAL(name, key_t) \
+ static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \
+ { \
+ int i, r = 0; \
+ kbnode_t *x = b->root; \
+ *lower = *upper = 0; \
+ while (x) { \
+ i = __kb_getp_aux_##name(x, k, &r); \
+ if (i >= 0 && r == 0) { \
+ *lower = *upper = &__KB_KEY(key_t, x)[i]; \
+ return; \
+ } \
+ if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \
+ if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \
+ if (x->is_internal == 0) return; \
+ x = __KB_PTR(b, x)[i + 1]; \
+ } \
+ } \
+ static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
+ { \
+ kb_intervalp_##name(b, &k, lower, upper); \
+ }
+
+#define __KB_PUT(name, key_t, __cmp) \
+ /* x must be an internal node */ \
+ static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
+ { \
+ kbnode_t *z; \
+ z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \
+ ++b->n_nodes; \
+ z->is_internal = y->is_internal; \
+ z->n = b->t - 1; \
+ memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \
+ if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
+ y->n = b->t - 1; \
+ memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
+ __KB_PTR(b, x)[i + 1] = z; \
+ memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
+ __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \
+ ++x->n; \
+ } \
+ static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
+ { \
+ int i = x->n - 1; \
+ if (x->is_internal == 0) { \
+ i = __kb_getp_aux_##name(x, k, 0); \
+ if (i != x->n - 1) \
+ memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ __KB_KEY(key_t, x)[i + 1] = *k; \
+ ++x->n; \
+ } else { \
+ i = __kb_getp_aux_##name(x, k, 0) + 1; \
+ if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \
+ __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \
+ if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \
+ } \
+ __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \
+ } \
+ } \
+ static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+ { \
+ kbnode_t *r, *s; \
+ ++b->n_keys; \
+ r = b->root; \
+ if (r->n == 2 * b->t - 1) { \
+ ++b->n_nodes; \
+ s = (kbnode_t*)calloc(1, b->ilen); \
+ b->root = s; s->is_internal = 1; s->n = 0; \
+ __KB_PTR(b, s)[0] = r; \
+ __kb_split_##name(b, s, 0, r); \
+ r = s; \
+ } \
+ __kb_putp_aux_##name(b, r, k); \
+ } \
+ static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
+ { \
+ kb_putp_##name(b, &k); \
+ }
+
+
+#define __KB_DEL(name, key_t) \
+ static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
+ { \
+ int yn, zn, i, r = 0; \
+ kbnode_t *xp, *y, *z; \
+ key_t kp; \
+ if (x == 0) return *k; \
+ if (s) { /* s can only be 0, 1 or 2 */ \
+ r = x->is_internal == 0? 0 : s == 1? 1 : -1; \
+ i = s == 1? x->n - 1 : -1; \
+ } else i = __kb_getp_aux_##name(x, k, &r); \
+ if (x->is_internal == 0) { \
+ if (s == 2) ++i; \
+ kp = __KB_KEY(key_t, x)[i]; \
+ memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ --x->n; \
+ return kp; \
+ } \
+ if (r == 0) { \
+ if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \
+ xp = __KB_PTR(b, x)[i]; \
+ kp = __KB_KEY(key_t, x)[i]; \
+ __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
+ return kp; \
+ } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \
+ xp = __KB_PTR(b, x)[i + 1]; \
+ kp = __KB_KEY(key_t, x)[i]; \
+ __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
+ return kp; \
+ } else if (yn == b->t - 1 && zn == b->t - 1) { \
+ y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \
+ __KB_KEY(key_t, y)[y->n++] = *k; \
+ memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
+ if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
+ y->n += z->n; \
+ memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+ --x->n; \
+ free(z); \
+ return __kb_delp_aux_##name(b, y, k, s); \
+ } \
+ } \
+ ++i; \
+ if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \
+ if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \
+ memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
+ if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+ __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \
+ __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
+ if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
+ --y->n; ++xp->n; \
+ } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \
+ __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
+ __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \
+ if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
+ --y->n; \
+ memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
+ if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
+ } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \
+ __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \
+ memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
+ if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+ y->n += xp->n; \
+ memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
+ memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
+ --x->n; \
+ free(xp); \
+ xp = y; \
+ } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \
+ __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
+ memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \
+ if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
+ xp->n += y->n; \
+ memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+ --x->n; \
+ free(y); \
+ } \
+ } \
+ return __kb_delp_aux_##name(b, xp, k, s); \
+ } \
+ static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+ { \
+ kbnode_t *x; \
+ key_t ret; \
+ ret = __kb_delp_aux_##name(b, b->root, k, 0); \
+ --b->n_keys; \
+ if (b->root->n == 0 && b->root->is_internal) { \
+ --b->n_nodes; \
+ x = b->root; \
+ b->root = __KB_PTR(b, x)[0]; \
+ free(x); \
+ } \
+ return ret; \
+ } \
+ static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
+ { \
+ return kb_delp_##name(b, &k); \
+ }
+
+typedef struct {
+ kbnode_t *x;
+ int i;
+} __kbstack_t;
+
+#define __kb_traverse(key_t, b, __func) do { \
+ int __kmax = 8; \
+ __kbstack_t *__kstack, *__kp; \
+ __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
+ __kp->x = (b)->root; __kp->i = 0; \
+ for (;;) { \
+ while (__kp->x && __kp->i <= __kp->x->n) { \
+ if (__kp - __kstack == __kmax - 1) { \
+ __kmax <<= 1; \
+ __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \
+ __kp = __kstack + (__kmax>>1) - 1; \
+ } \
+ (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
+ ++__kp; \
+ } \
+ --__kp; \
+ if (__kp >= __kstack) { \
+ if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
+ ++__kp->i; \
+ } else break; \
+ } \
+ free(__kstack); \
+ } while (0)
+
+#define KBTREE_INIT(name, key_t, __cmp) \
+ __KB_TREE_T(name) \
+ __KB_INIT(name, key_t) \
+ __KB_GET_AUX1(name, key_t, __cmp) \
+ __KB_GET(name, key_t) \
+ __KB_INTERVAL(name, key_t) \
+ __KB_PUT(name, key_t, __cmp) \
+ __KB_DEL(name, key_t)
+
+#define KB_DEFAULT_SIZE 512
+
+#define kbtree_t(name) kbtree_##name##_t
+#define kb_init(name, s) kb_init_##name(s)
+#define kb_destroy(name, b) __kb_destroy(b)
+#define kb_get(name, b, k) kb_get_##name(b, k)
+#define kb_put(name, b, k) kb_put_##name(b, k)
+#define kb_del(name, b, k) kb_del_##name(b, k)
+#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
+#define kb_getp(name, b, k) kb_getp_##name(b, k)
+#define kb_putp(name, b, k) kb_putp_##name(b, k)
+#define kb_delp(name, b, k) kb_delp_##name(b, k)
+#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
+
+#define kb_size(b) ((b)->n_keys)
+
+#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
+#define kb_str_cmp(a, b) strcmp(a, b)
+
+#endif
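
kbtree.h is the macro-generated B-tree that the new BWA-MEM code (bwamem.c) uses for seed chaining; like khash.h, a single KBTREE_INIT() line instantiates a whole typed tree. A minimal usage sketch (tree name and key type chosen arbitrarily; kb_get() returns a pointer into the tree or NULL):

#include <stdio.h>
#include "kbtree.h"

KBTREE_INIT(i32, int, kb_generic_cmp)   /* B-tree keyed on plain ints */

static void kbtree_demo(void)
{
	int k, *p;
	kbtree_t(i32) *b = kb_init(i32, KB_DEFAULT_SIZE);
	for (k = 0; k < 100; ++k) kb_put(i32, b, k * 7 % 100);
	p = kb_get(i32, b, 42);              /* NULL if the key is absent */
	printf("%d keys stored; 42 is %s\n", kb_size(b), p? "present" : "absent");
	kb_destroy(i32, b);
}
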
diff --git a/khash.h b/khash.h
index de6be6d..2422044 100644
--- a/khash.h
+++ b/khash.h
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2008, 2009 by attractor <attractor at live.co.uk>
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@@ -33,7 +33,6 @@ int main() {
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
- if (!ret) kh_del(32, h, k);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
@@ -47,6 +46,29 @@ int main() {
*/
/*
+ 2011-12-29 (0.2.7):
+
+ * Minor code clean up; no actual effect.
+
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thanks to Zilong Tan for the suggestion. Reference:
+
+ - http://code.google.com/p/ulib/
+ - http://nothings.org/computer/judy/
+
+ * Optionally allow the use of linear probing, which usually has better
+ performance for random input. Double hashing is still the default as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow declaring global functions.
+
2009-09-26 (0.2.4):
* Improve portability
@@ -86,11 +108,9 @@ int main() {
@header
Generic hash table library.
-
- @copyright Heng Li
*/
-#define AC_VERSION_KHASH_H "0.2.4"
+#define AC_VERSION_KHASH_H "0.2.6"
#include <stdlib.h>
#include <string.h>
@@ -111,24 +131,14 @@ typedef unsigned long long khint64_t;
#endif
#ifdef _MSC_VER
-#define inline __inline
+#define kh_inline __inline
+#else
+#define kh_inline inline
#endif
typedef khint32_t khint_t;
typedef khint_t khiter_t;
-#define __ac_HASH_PRIME_SIZE 32
-static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
-{
- 0ul, 3ul, 11ul, 23ul, 53ul,
- 97ul, 193ul, 389ul, 769ul, 1543ul,
- 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
- 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
- 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
- 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
- 3221225473ul, 4294967291ul
-};
-
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
@@ -137,88 +147,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+#ifdef KHASH_LINEAR
+#define __ac_inc(k, m) 1
+#else
+#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
+#endif
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
static const double __ac_HASH_UPPER = 0.77;
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- typedef struct { \
- khint_t n_buckets, size, n_occupied, upper_bound; \
- khint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- static inline kh_##name##_t *kh_init_##name() { \
- return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
+ extern kh_##name##_t *kh_init_##name(void); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
} \
- static inline void kh_destroy_##name(kh_##name##_t *h) \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
{ \
if (h) { \
- free(h->keys); free(h->flags); \
- free(h->vals); \
- free(h); \
+ kfree((void *)h->keys); kfree(h->flags); \
+ kfree((void *)h->vals); \
+ kfree(h); \
} \
} \
- static inline void kh_clear_##name(kh_##name##_t *h) \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
- memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
- static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
- khint_t inc, k, i, last; \
- k = __hash_func(key); i = k % h->n_buckets; \
- inc = 1 + k % (h->n_buckets - 1); last = i; \
+ khint_t inc, k, i, last, mask; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
- else i += inc; \
+ i = (i + inc) & mask; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} \
- static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
- { \
+ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
- khint_t t = __ac_HASH_PRIME_SIZE - 1; \
- while (__ac_prime_list[t] > new_n_buckets) --t; \
- new_n_buckets = __ac_prime_list[t+1]; \
- if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
- else { \
- new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
- memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
- if (h->n_buckets < new_n_buckets) { \
- h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) \
- h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (!new_flags) return -1; \
+ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (!new_keys) return -1; \
+ h->keys = new_keys; \
+ if (kh_is_map) { \
+ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ if (!new_vals) return -1; \
+ h->vals = new_vals; \
+ } \
+ } /* otherwise shrink */ \
} \
} \
- if (j) { \
+ if (j) { /* rehashing is needed */ \
for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \
khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
if (kh_is_map) val = h->vals[j]; \
__ac_set_isdel_true(h->flags, j); \
- while (1) { \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khint_t inc, k, i; \
k = __hash_func(key); \
- i = k % new_n_buckets; \
- inc = 1 + k % (new_n_buckets - 1); \
- while (!__ac_isempty(new_flags, i)) { \
- if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
- else i += inc; \
- } \
+ i = k & new_mask; \
+ inc = __ac_inc(k, new_mask); \
+ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
__ac_set_isempty_false(new_flags, i); \
- if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
- __ac_set_isdel_true(h->flags, i); \
- } else { \
+ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
h->keys[i] = key; \
if (kh_is_map) h->vals[i] = val; \
break; \
@@ -226,35 +276,39 @@ static const double __ac_HASH_UPPER = 0.77;
} \
} \
} \
- if (h->n_buckets > new_n_buckets) { \
- h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) \
- h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
} \
- free(h->flags); \
+ kfree(h->flags); /* free the working space */ \
h->flags = new_flags; \
h->n_buckets = new_n_buckets; \
h->n_occupied = h->size; \
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \
+ return 0; \
} \
- static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khint_t x; \
- if (h->n_occupied >= h->upper_bound) { \
- if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
- else kh_resize_##name(h, h->n_buckets + 1); \
- } \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) { \
+ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } /* TODO: implement automatic shrinking; resize() already supports shrinking */ \
{ \
- khint_t inc, k, i, site, last; \
- x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
- if (__ac_isempty(h->flags, i)) x = i; \
+ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \
- inc = 1 + k % (h->n_buckets - 1); last = i; \
+ inc = __ac_inc(k, mask); last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
- if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
- else i += inc; \
+ i = (i + inc) & mask; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
@@ -263,20 +317,20 @@ static const double __ac_HASH_UPPER = 0.77;
} \
} \
} \
- if (__ac_isempty(h->flags, x)) { \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; ++h->n_occupied; \
*ret = 1; \
- } else if (__ac_isdel(h->flags, x)) { \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; \
*ret = 2; \
- } else *ret = 0; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
return x; \
} \
- static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
@@ -284,6 +338,17 @@ static const double __ac_HASH_UPPER = 0.77;
} \
}
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
/* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function
@@ -311,10 +376,10 @@ static const double __ac_HASH_UPPER = 0.77;
@param s Pointer to a null terminated string
@return The hash value
*/
-static inline khint_t __ac_X31_hash_string(const char *s)
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
- khint_t h = *s;
- if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+ khint_t h = (khint_t)*s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
return h;
}
/*! @function
@@ -328,9 +393,21 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k))
+
/* --- END OF HASH FUNCTIONS --- */
-/* Other necessary macros... */
+/* Other convenient macros... */
/*!
@abstract Type of the hash table.
@@ -396,7 +473,6 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/
#define kh_del(name, h, k) kh_del_##name(h, k)
-
/*! @function
@abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*]
@@ -455,6 +531,34 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/
#define kh_n_buckets(h) ((h)->n_buckets)
+/*! @function
+ @abstract Iterate over the entries in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param kvar Variable to which key will be assigned
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (kvar) = kh_key(h,__i); \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/*! @function
+ @abstract Iterate over the values in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
/* More convenient interfaces */
/*! @function
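A short sketch of the new iteration macros (again my illustration, not from the patch; the s2i map type is the hypothetical one from the earlier sketch):

#include <stdio.h>
#include "khash.h"
KHASH_MAP_INIT_STR(s2i, int)

static void dump_counts(khash_t(s2i) *h)
{
    const char *key;
    int val;
    kh_foreach(h, key, val, printf("%s\t%d\n", key, val));  /* skips empty and deleted buckets */
}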
diff --git a/kopen.c b/kopen.c
new file mode 100644
index 0000000..8887932
--- /dev/null
+++ b/kopen.c
@@ -0,0 +1,345 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#ifdef _WIN32
+#define _KO_NO_NET
+#endif
+
+#ifndef _KO_NO_NET
+static int socket_wait(int fd, int is_read)
+{
+ fd_set fds, *fdr = 0, *fdw = 0;
+ struct timeval tv;
+ int ret;
+ tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
+ FD_ZERO(&fds);
+ FD_SET(fd, &fds);
+ if (is_read) fdr = &fds;
+ else fdw = &fds;
+ ret = select(fd+1, fdr, fdw, 0, &tv);
+ if (ret == -1) perror("select");
+ return ret;
+}
+
+static int socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
+
+ int on = 1, fd;
+ struct linger lng = { 0, 0 };
+ struct addrinfo hints, *res = 0;
+ memset(&hints, 0, sizeof(struct addrinfo));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
+ if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+ if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+ if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+ freeaddrinfo(res);
+ return fd;
+#undef __err_connect
+}
+
+static int http_open(const char *fn)
+{
+ char *p, *proxy, *q, *http_host, *host, *port, *path, *buf;
+ int fd, ret, l;
+
+ /* parse URL; adapted from khttp_parse_url() in knetfile.c */
+ if (strstr(fn, "http://") != fn) return 0;
+ // set ->http_host
+ for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+ l = p - fn - 7;
+ http_host = calloc(l + 1, 1);
+ strncpy(http_host, fn + 7, l);
+ http_host[l] = 0;
+ for (q = http_host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0;
+ // get http_proxy
+ proxy = getenv("http_proxy");
+ // set host, port and path
+ if (proxy == 0) {
+ host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
+ port = strdup(*q? q : "80");
+ path = strdup(*p? p : "/");
+ } else {
+ host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+ for (q = host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0;
+ port = strdup(*q? q : "80");
+ path = strdup(fn);
+ }
+
+ /* connect; adapted from khttp_connect() in knetfile.c */
+ l = 0;
+ fd = socket_connect(host, port);
+ buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
+ l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", path, http_host);
+ l += sprintf(buf + l, "\r\n");
+ write(fd, buf, l);
+ l = 0;
+ while (read(fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
+ if (buf[l] == '\n' && l >= 3)
+ if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+ ++l;
+ }
+ buf[l] = 0;
+ if (l < 14) { // premature header
+ close(fd);
+ fd = -1;
+ }
+ ret = strtol(buf + 8, &p, 0); // HTTP return code
+ if (ret != 200) {
+ close(fd);
+ fd = -1;
+ }
+ free(buf); free(http_host); free(host); free(port); free(path);
+ return fd;
+}
+
+typedef struct {
+ int max_response, ctrl_fd;
+ char *response;
+} ftpaux_t;
+
+static int kftp_get_response(ftpaux_t *aux)
+{
+ unsigned char c;
+ int n = 0;
+ char *p;
+ if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
+ while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
+ if (n >= aux->max_response) {
+ aux->max_response = aux->max_response? aux->max_response<<1 : 256;
+ aux->response = realloc(aux->response, aux->max_response);
+ }
+ aux->response[n++] = c;
+ if (c == '\n') {
+ if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2])
+ && aux->response[3] != '-') break;
+ n = 0;
+ continue;
+ }
+ }
+ if (n < 2) return -1;
+ aux->response[n-2] = 0;
+ return strtol(aux->response, &p, 0);
+}
+
+static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
+{
+ if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+ write(aux->ctrl_fd, cmd, strlen(cmd));
+ return is_get? kftp_get_response(aux) : 0;
+}
+
+static int ftp_open(const char *fn)
+{
+ char *p, *host = 0, *port = 0, *retr = 0;
+ char host2[80], port2[10];
+ int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4];
+ ftpaux_t aux;
+
+ /* parse URL */
+ if (strstr(fn, "ftp://") != fn) return 0;
+ for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+ if (*p != '/') return 0;
+ l = p - fn - 6;
+ port = strdup("21");
+ host = calloc(l + 1, 1);
+ strncpy(host, fn + 6, l);
+ retr = calloc(strlen(p) + 8, 1);
+ sprintf(retr, "RETR %s\r\n", p);
+
+ /* connect to ctrl */
+ memset(&aux, 0, sizeof(ftpaux_t));
+ aux.ctrl_fd = socket_connect(host, port);
+ if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
+
+ /* connect to the data stream */
+ kftp_get_response(&aux);
+ kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
+ kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
+ kftp_send_cmd(&aux, "TYPE I\r\n", 1);
+ kftp_send_cmd(&aux, "PASV\r\n", 1);
+ for (p = aux.response; *p && *p != '('; ++p);
+ if (*p != '(') goto ftp_open_end;
+ ++p;
+ sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
+ memcpy(pasv_ip, v, 4 * sizeof(int));
+ pasv_port = (v[4]<<8&0xff00) + v[5];
+ kftp_send_cmd(&aux, retr, 0);
+ sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
+ sprintf(port2, "%d", pasv_port);
+ fd = socket_connect(host2, port2);
+ if (fd == -1) goto ftp_open_end;
+ ret = kftp_get_response(&aux);
+ if (ret != 150) {
+ close(fd);
+ fd = -1;
+ }
+ close(aux.ctrl_fd);
+
+ftp_open_end:
+ free(host); free(port); free(retr); free(aux.response);
+ return fd;
+}
+#endif /* !defined(_KO_NO_NET) */
+
+static char **cmd2argv(const char *cmd)
+{
+ int i, beg, end, argc;
+ char **argv, *str;
+ end = strlen(cmd);
+ for (i = end - 1; i >= 0; --i)
+ if (!isspace(cmd[i])) break;
+ end = i + 1;
+ for (beg = 0; beg < end; ++beg)
+ if (!isspace(cmd[beg])) break;
+ if (beg == end) return 0;
+ for (i = beg + 1, argc = 0; i < end; ++i)
+ if (isspace(cmd[i]) && !isspace(cmd[i-1]))
+ ++argc;
+ argv = (char**)calloc(argc + 2, sizeof(void*));
+ argv[0] = str = (char*)calloc(end - beg + 1, 1);
+ strncpy(argv[0], cmd + beg, end - beg);
+ for (i = argc = 1; i < end - beg; ++i)
+ if (isspace(str[i])) str[i] = 0;
+ else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
+ return argv;
+}
+
+#define KO_STDIN 1
+#define KO_FILE 2
+#define KO_PIPE 3
+#define KO_HTTP 4
+#define KO_FTP 5
+
+typedef struct {
+ int type, fd;
+ pid_t pid;
+} koaux_t;
+
+void *kopen(const char *fn, int *_fd)
+{
+ koaux_t *aux = 0;
+ *_fd = -1;
+ if (strstr(fn, "http://") == fn) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_HTTP;
+ aux->fd = http_open(fn);
+ } else if (strstr(fn, "ftp://") == fn) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_FTP;
+ aux->fd = ftp_open(fn);
+ } else if (strcmp(fn, "-") == 0) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_STDIN;
+ aux->fd = STDIN_FILENO;
+ } else {
+ const char *p, *q;
+ for (p = fn; *p; ++p)
+ if (!isspace(*p)) break;
+ if (*p == '<') { // pipe open
+ int need_shell, pfd[2];
+ pid_t pid;
+ // a simple check to see if we need to invoke a shell; not always working
+ for (q = p + 1; *q; ++q)
+ if (ispunct(*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
+ break;
+ need_shell = (*q != 0);
+ pipe(pfd);
+ pid = vfork();
+ if (pid == -1) { /* vfork() error */
+ close(pfd[0]); close(pfd[1]);
+ return 0;
+ }
+ if (pid == 0) { /* the child process */
+ char **argv; /* FIXME: I do not know if this will lead to a memory leak */
+ close(pfd[0]);
+ dup2(pfd[1], STDOUT_FILENO);
+ close(pfd[1]);
+ if (!need_shell) {
+ argv = cmd2argv(p + 1);
+ execvp(argv[0], argv);
+ free(argv[0]); free(argv);
+ } else execl("/bin/sh", "sh", "-c", p + 1, NULL);
+ exit(1);
+ } else { /* parent process */
+ close(pfd[1]);
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_PIPE;
+ aux->fd = pfd[0];
+ aux->pid = pid;
+ }
+ } else {
+#ifdef _WIN32
+ *_fd = open(fn, O_RDONLY | O_BINARY);
+#else
+ *_fd = open(fn, O_RDONLY);
+#endif
+ if (*_fd) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_FILE;
+ aux->fd = *_fd;
+ }
+ }
+ }
+ *_fd = aux->fd;
+ return aux;
+}
+
+int kclose(void *a)
+{
+ koaux_t *aux = (koaux_t*)a;
+ if (aux->type == KO_PIPE) {
+ int status;
+ pid_t pid;
+ pid = waitpid(aux->pid, &status, WNOHANG);
+ if (pid != aux->pid) kill(aux->pid, 15);
+ }
+ free(aux);
+ return 0;
+}
+
+#ifdef _KO_MAIN
+#define BUF_SIZE 0x10000
+int main(int argc, char *argv[])
+{
+ void *x;
+ int l, fd;
+ unsigned char buf[BUF_SIZE];
+ FILE *fp;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: kopen <file>\n");
+ return 1;
+ }
+ x = kopen(argv[1], &fd);
+ fp = fdopen(fd, "r");
+ if (fp == 0) {
+ fprintf(stderr, "ERROR: fail to open the input\n");
+ return 1;
+ }
+ do {
+ if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0)
+ fwrite(buf, 1, l, stdout);
+ } while (l == BUF_SIZE);
+ fclose(fp);
+ kclose(x);
+ return 0;
+}
+#endif
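kopen() hands back an opaque handle plus a plain file descriptor, so callers can layer whatever reader they like on top. A hedged sketch (mine, not in the patch), assuming the descriptor is wrapped with zlib's gzdopen():

#include <zlib.h>
void *kopen(const char *fn, int *_fd);
int kclose(void *a);

static gzFile open_source(const char *src, void **ko)
{
    int fd;
    /* src may be a path, "-" for stdin, "http://...", "ftp://..." or a "<command" pipe */
    *ko = kopen(src, &fd);
    if (*ko == 0 || fd < 0) return 0;
    return gzdopen(fd, "r");   /* gzclose() later closes fd; kclose() reaps a pipe child */
}

/* typical use:
     void *ko; gzFile fp = open_source("<gzip -dc reads.fq.gz", &ko);
     ... gzread(fp, buf, len) ...
     gzclose(fp); kclose(ko);                                          */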
diff --git a/kseq.h b/kseq.h
index ad8937c..a5cec7c 100644
--- a/kseq.h
+++ b/kseq.h
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2008, by Heng Li <lh3 at sanger.ac.uk>
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@@ -23,6 +23,8 @@
SOFTWARE.
*/
+/* Last Modified: 05MAR2012 */
+
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
@@ -30,9 +32,14 @@
#include <string.h>
#include <stdlib.h>
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
+
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
- char *buf; \
+ unsigned char *buf; \
int begin, end, is_eof; \
type_t f; \
} kstream_t;
@@ -45,7 +52,7 @@
{ \
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
ks->f = f; \
- ks->buf = (char*)malloc(__bufsize); \
+ ks->buf = (unsigned char*)malloc(__bufsize); \
return ks; \
} \
static inline void ks_destroy(kstream_t *ks) \
@@ -82,10 +89,10 @@ typedef struct __kstring_t {
#endif
#define __KS_GETUNTIL(__read, __bufsize) \
- static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
if (dret) *dret = 0; \
- str->l = 0; \
+ str->l = append? str->l : 0; \
if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
@@ -97,14 +104,20 @@ typedef struct __kstring_t {
if (ks->end == 0) break; \
} else break; \
} \
- if (delimiter) { \
+ if (delimiter == KS_SEP_LINE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == '\n') break; \
+ } else if (delimiter > KS_SEP_MAX) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == delimiter) break; \
- } else { \
+ } else if (delimiter == KS_SEP_SPACE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i])) break; \
- } \
- if (str->m - str->l < i - ks->begin + 1) { \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
str->m = str->l + (i - ks->begin) + 1; \
kroundup32(str->m); \
str->s = (char*)realloc(str->s, str->m); \
@@ -117,9 +130,15 @@ typedef struct __kstring_t {
break; \
} \
} \
+ if (str->s == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
str->s[str->l] = '\0'; \
return str->l; \
- }
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
#define KSTREAM_INIT(type_t, __read, __bufsize) \
__KS_TYPE(type_t) \
@@ -127,19 +146,16 @@ typedef struct __kstring_t {
__KS_GETC(__read, __bufsize) \
__KS_GETUNTIL(__read, __bufsize)
-#define __KSEQ_BASIC(type_t) \
- static inline kseq_t *kseq_init(type_t fd) \
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) \
+ SCOPE kseq_t *kseq_init(type_t fd) \
{ \
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
s->f = ks_init(fd); \
return s; \
} \
- static inline void kseq_rewind(kseq_t *ks) \
- { \
- ks->last_char = 0; \
- ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
- } \
- static inline void kseq_destroy(kseq_t *ks) \
+ SCOPE void kseq_destroy(kseq_t *ks) \
{ \
if (!ks) return; \
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
@@ -152,44 +168,46 @@ typedef struct __kstring_t {
-1 end-of-file
-2 truncated quality string
*/
-#define __KSEQ_READ \
- static int kseq_read(kseq_t *seq) \
- { \
- int c; \
- kstream_t *ks = seq->f; \
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
- if (c == -1) return -1; /* end of file */ \
- seq->last_char = c; \
- } /* the first header char has been read */ \
- seq->comment.l = seq->seq.l = seq->qual.l = 0; \
- if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
- if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
- if (isgraph(c)) { /* printable non-space character */ \
- if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
- seq->seq.m = seq->seq.l + 2; \
- kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
- } \
- seq->seq.s[seq->seq.l++] = (char)c; \
- } \
- } \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
- if (c != '+') return seq->seq.l; /* FASTA */ \
- if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
- seq->qual.m = seq->seq.m; \
- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
- } \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
- if (c == -1) return -2; /* we should not stop here */ \
- while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
- if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
- seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
seq->last_char = 0; /* we have not come to the next header line */ \
- if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
- return seq->seq.l; \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
}
#define __KSEQ_TYPE(type_t) \
@@ -199,10 +217,19 @@ typedef struct __kstring_t {
kstream_t *f; \
} kseq_t;
-#define KSEQ_INIT(type_t, __read) \
- KSTREAM_INIT(type_t, __read, 4096) \
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 16384) \
__KSEQ_TYPE(type_t) \
- __KSEQ_BASIC(type_t) \
- __KSEQ_READ
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
#endif
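For reference, the canonical way to instantiate the reader above (a sketch, not part of the patch), using the gzFile/gzread pairing that bwa itself uses; KSEQ_INIT2 with an empty SCOPE plus KSEQ_DECLARE in a header gives the same functions with external linkage:

#include <stdio.h>
#include <zlib.h>
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)

int main(int argc, char *argv[])
{
    gzFile fp;
    kseq_t *seq;
    if (argc < 2) return 1;
    fp = gzopen(argv[1], "r");
    seq = kseq_init(fp);
    while (kseq_read(seq) >= 0)   /* -1: end of file; -2: truncated quality string */
        printf("%s\t%ld\n", seq->name.s, (long)seq->seq.l);
    kseq_destroy(seq);
    gzclose(fp);
    return 0;
}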
diff --git a/ksort.h b/ksort.h
index 52812e1..ad66a17 100644
--- a/ksort.h
+++ b/ksort.h
@@ -139,7 +139,7 @@ typedef struct {
tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
} \
} \
- inline void __ks_insertsort_##name(type_t *s, type_t *t) \
+ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
{ \
type_t *i, *j, swap_tmp; \
for (i = s + 1; i < t; ++i) \
diff --git a/kstring.h b/kstring.h
index 398901f..81d7d60 100644
--- a/kstring.h
+++ b/kstring.h
@@ -16,19 +16,33 @@ typedef struct __kstring_t {
} kstring_t;
#endif
-static inline int kputs(const char *p, kstring_t *s)
+static inline void ks_resize(kstring_t *s, size_t size)
+{
+ if (s->m < size) {
+ s->m = size;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+}
+
+static inline int kputsn(const char *p, int l, kstring_t *s)
{
- int l = strlen(p);
if (s->l + l + 1 >= s->m) {
s->m = s->l + l + 2;
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
}
- strcpy(s->s + s->l, p);
+ memcpy(s->s + s->l, p, l);
s->l += l;
+ s->s[s->l] = 0;
return l;
}
+static inline int kputs(const char *p, kstring_t *s)
+{
+ return kputsn(p, strlen(p), s);
+}
+
static inline int kputc(int c, kstring_t *s)
{
if (s->l + 1 >= s->m) {
@@ -41,6 +55,40 @@ static inline int kputc(int c, kstring_t *s)
return c;
}
+static inline int kputw(int c, kstring_t *s)
+{
+ char buf[16];
+ int l, x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (c < 0) buf[l++] = '-';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+ s->s[s->l] = 0;
+ return 0;
+}
+
+static inline int kputuw(unsigned c, kstring_t *s)
+{
+ char buf[16];
+ int l, i;
+ unsigned x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+ s->s[s->l] = 0;
+ return 0;
+}
+
int ksprintf(kstring_t *s, const char *fmt, ...);
#endif
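A brief sketch of how the helpers above compose (illustrative only; the field values are invented):

#include <stdio.h>
#include <stdlib.h>
#include "kstring.h"

int main(void)
{
    kstring_t str = {0, 0, 0};    /* l, m, s; the buffer grows on demand */
    kputs("read1", &str); kputc('\t', &str);
    kputw(-42, &str);     kputc('\t', &str);   /* signed decimal */
    kputuw(60u, &str);                         /* unsigned decimal */
    puts(str.s);                  /* always NUL-terminated; str.l holds the length */
    free(str.s);
    return 0;
}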
diff --git a/ksw.c b/ksw.c
index bd29e96..b97fed5 100644
--- a/ksw.c
+++ b/ksw.c
@@ -23,7 +23,6 @@
SOFTWARE.
*/
-#ifndef _NO_SSE2
#include <stdlib.h>
#include <stdint.h>
#include <emmintrin.h>
@@ -37,22 +36,35 @@
#define UNLIKELY(x) (x)
#endif
-struct _ksw_query_t {
+const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
+
+struct _kswq_t {
int qlen, slen;
uint8_t shift, mdiff, max, size;
__m128i *qp, *H0, *H1, *E, *Hmax;
};
-ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
+/**
+ * Initialize the query data structure
+ *
+ * @param size Number of bytes used to store a score; valid values are 1 or 2
+ * @param qlen Length of the query sequence
+ * @param query Query sequence
+ * @param m Size of the alphabet
+ * @param mat Scoring matrix in a one-dimension array
+ *
+ * @return Query data structure
+ */
+kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
- ksw_query_t *q;
+ kswq_t *q;
int slen, a, tmp, p;
size = size > 1? 2 : 1;
p = 8 * (3 - size); // # values per __m128i
slen = (qlen + p - 1) / p; // segmented length
- q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
- q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory
+ q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+ q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
@@ -91,11 +103,12 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in
return q;
}
-int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
+kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
{
- int slen, i, m_b, n_b, te = -1, gmax = 0;
+ int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
+ kswr_t r;
#define __max_16(ret, xx) do { \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
@@ -106,10 +119,13 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
} while (0)
// initialization
+ r = g_defr;
+ minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+ endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
- gapoe = _mm_set1_epi8(a->gapo + a->gape);
- gape = _mm_set1_epi8(a->gape);
+ gapoe = _mm_set1_epi8(_gapo + _gape);
+ gape = _mm_set1_epi8(_gape);
shift = _mm_set1_epi8(q->shift);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
@@ -165,11 +181,11 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
end_loop16:
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
__max_16(imax, max); // imax is the maximum number in max
- if (imax >= a->T) { // write the b array; this condition adds branching unfornately
+ if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
- b = realloc(b, 8 * m_b);
+ b = (uint64_t*)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
@@ -178,34 +194,38 @@ end_loop16:
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
- if (gmax + q->shift >= 255) break;
+ if (gmax + q->shift >= 255 || gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
}
- a->score = gmax; a->te = te;
- { // get a->qe, the end of query match; find the 2nd best score
+ r.score = gmax + q->shift < 255? gmax : 255;
+ r.te = te;
+ if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
int max = -1, low, high, qlen = slen * 16;
uint8_t *t = (uint8_t*)Hmax;
- for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
- if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen;
+ for (i = 0; i < qlen; ++i, ++t)
+ if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
//printf("%d,%d\n", max, gmax);
- i = (a->score + q->max - 1) / q->max;
- low = te - i; high = te + i;
- for (i = 0, a->score2 = 0; i < n_b; ++i) {
- int e = (int32_t)b[i];
- if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
- a->score2 = b[i]>>32, a->te2 = e;
+ if (b) {
+ i = (r.score + q->max - 1) / q->max;
+ low = te - i; high = te + i;
+ for (i = 0; i < n_b; ++i) {
+ int e = (int32_t)b[i];
+ if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+ r.score2 = b[i]>>32, r.te2 = e;
+ }
}
}
free(b);
- return a->score + q->shift >= 255? 255 : a->score;
+ return r;
}
-int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
+kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
{
- int slen, i, m_b, n_b, te = -1, gmax = 0;
+ int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+ kswr_t r;
#define __max_8(ret, xx) do { \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
@@ -215,10 +235,13 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
} while (0)
// initialization
+ r = g_defr;
+ minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+ endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
- gapoe = _mm_set1_epi16(a->gapo + a->gape);
- gape = _mm_set1_epi16(a->gape);
+ gapoe = _mm_set1_epi16(_gapo + _gape);
+ gape = _mm_set1_epi16(_gape);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
@@ -260,11 +283,11 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
}
end_loop8:
__max_8(imax, max);
- if (imax >= a->T) {
+ if (imax >= minsc) {
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
- b = realloc(b, 8 * m_b);
+ b = (uint64_t*)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
@@ -273,31 +296,244 @@ end_loop8:
gmax = imax; te = i;
for (j = 0; LIKELY(j < slen); ++j)
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+ if (gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S;
}
- a->score = gmax; a->te = te;
+ r.score = gmax; r.te = te;
{
int max = -1, low, high, qlen = slen * 8;
uint16_t *t = (uint16_t*)Hmax;
- for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
- if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen;
- i = (a->score + q->max - 1) / q->max;
- low = te - i; high = te + i;
- for (i = 0, a->score2 = 0; i < n_b; ++i) {
- int e = (int32_t)b[i];
- if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
- a->score2 = b[i]>>32, a->te2 = e;
+ for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
+ if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
+ if (b) {
+ i = (r.score + q->max - 1) / q->max;
+ low = te - i; high = te + i;
+ for (i = 0; i < n_b; ++i) {
+ int e = (int32_t)b[i];
+ if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+ r.score2 = b[i]>>32, r.te2 = e;
+ }
}
}
free(b);
- return a->score;
+ return r;
}
-int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a)
+static void revseq(int l, uint8_t *s)
{
- if (q->size == 1) return ksw_sse2_16(q, tlen, target, a);
- else return ksw_sse2_8(q, tlen, target, a);
+ int i, t;
+ for (i = 0; i < l>>1; ++i)
+ t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
+}
+
+kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+{
+ int size;
+ kswq_t *q;
+ kswr_t r, rr;
+ kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
+
+ q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
+ if (qry && *qry == 0) *qry = q;
+ func = q->size == 2? ksw_i16 : ksw_u8;
+ size = q->size;
+ r = func(q, tlen, target, gapo, gape, xtra);
+ if (qry == 0) free(q);
+ if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
+ revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
+ q = ksw_qinit(size, r.qe + 1, query, m, mat);
+ rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score);
+ revseq(r.qe + 1, query); revseq(r.te + 1, target);
+ free(q);
+ if (r.score == rr.score)
+ r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
+ return r;
+}
+
+/********************
+ *** SW extension ***
+ ********************/
+
+typedef struct {
+ int32_t h, e;
+} eh_t;
+
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore)
+{
+ eh_t *eh; // score array
+ int8_t *qp; // query profile
+ int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
+ if (h0 < 0) h0 = 0;
+ // allocate memory
+ qp = malloc(qlen * m);
+ eh = calloc(qlen + 1, 8);
+ // generate the query profile
+ for (k = i = 0; k < m; ++k) {
+ const int8_t *p = &mat[k * m];
+ for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+ }
+ // fill the first row
+ eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
+ for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
+ eh[j].h = eh[j-1].h - gape;
+ // adjust $w if it is too large
+ k = m * m;
+ for (i = 0, max = 0; i < k; ++i) // get the max score
+ max = max > mat[i]? max : mat[i];
+ max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
+ max_gap = max_gap > 1? max_gap : 1;
+ w = w < max_gap? w : max_gap;
+ // DP loop
+ max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
+ beg = 0, end = qlen;
+ for (i = 0; LIKELY(i < tlen); ++i) {
+ int f = 0, h1, m = 0, mj = -1;
+ int8_t *q = &qp[target[i] * qlen];
+ // compute the first column
+ h1 = h0 - (gapo + gape * (i + 1));
+ if (h1 < 0) h1 = 0;
+ // apply the band and the constraint (if provided)
+ if (beg < i - w) beg = i - w;
+ if (end > i + w + 1) end = i + w + 1;
+ if (end > qlen) end = qlen;
+ for (j = beg; LIKELY(j < end); ++j) {
+ // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
+ // Similar to SSE2-SW, cells are computed in the following order:
+ // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
+ // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
+ // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
+ eh_t *p = &eh[j];
+ int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
+ p->h = h1; // set H(i,j-1) for the next row
+ h += q[j];
+ h = h > e? h : e;
+ h = h > f? h : f;
+ h1 = h; // save H(i,j) to h1 for the next column
+ mj = m > h? mj : j;
+ m = m > h? m : h; // m is stored at eh[mj+1]
+ h -= gapoe;
+ h = h > 0? h : 0;
+ e -= gape;
+ e = e > h? e : h; // computed E(i+1,j)
+ p->e = e; // save E(i+1,j) for the next row
+ f -= gape;
+ f = f > h? f : h; // computed F(i,j+1)
+ }
+ eh[end].h = h1; eh[end].e = 0;
+ if (j == qlen) {
+ max_ie = gscore > h1? max_ie : i;
+ gscore = gscore > h1? gscore : h1;
+ }
+ if (m == 0) break;
+ if (m > max) max = m, max_i = i, max_j = mj;
+ // update beg and end for the next round
+ for (j = mj; j >= beg && eh[j].h; --j);
+ beg = j + 1;
+ for (j = mj + 2; j <= end && eh[j].h; ++j);
+ end = j;
+ //beg = 0; end = qlen; // uncomment this line for debugging
+ }
+ free(eh); free(qp);
+ if (_qle) *_qle = max_j + 1;
+ if (_tle) *_tle = max_i + 1;
+ if (_gtle) *_gtle = max_ie + 1;
+ if (_gscore) *_gscore = gscore;
+ return max;
+}
+
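To make the intended calling pattern concrete, a hedged sketch (mine, with made-up seed variables) of extending to the right of an exact seed that already scores h0 = a * l_seed, using the prototype declared in ksw.h; the gap penalties, band width and the clip-versus-end-to-end test are purely illustrative:

#include <stdint.h>
#include "ksw.h"

/* query/target are 0..4 encoded as in main() below; mat is a 5x5 matrix, a the match score */
static void extend_right(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
                         const int8_t *mat, int a, int l_seed)
{
    int qle, tle, gtle, gscore;
    int sc = ksw_extend(qlen, query, tlen, target, 5, mat, 5, 2, 100, a * l_seed,
                        &qle, &tle, &gtle, &gscore);
    if (gscore >= 0 && gscore >= sc) {
        /* the whole query aligns: end-to-end hit over query[0,qlen) and target[0,gtle) */
    } else {
        /* clip the query: local hit over query[0,qle) and target[0,tle) scoring sc */
    }
}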
+/********************
+ * Global alignment *
+ ********************/
+
+#define MINUS_INF -0x40000000
+
+static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
+{
+ if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) {
+ if (*n_cigar == *m_cigar) {
+ *m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
+ cigar = realloc(cigar, (*m_cigar) << 2);
+ }
+ cigar[(*n_cigar)++] = len<<4 | op;
+ } else cigar[(*n_cigar)-1] += len<<4;
+ return cigar;
+}
+
+int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+{
+ eh_t *eh;
+ int8_t *qp; // query profile
+ int i, j, k, gapoe = gapo + gape, score, n_col;
+ uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
+ if (n_cigar_) *n_cigar_ = 0;
+ // allocate memory
+ n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
+ z = malloc(n_col * tlen);
+ qp = malloc(qlen * m);
+ eh = calloc(qlen + 1, 8);
+ // generate the query profile
+ for (k = i = 0; k < m; ++k) {
+ const int8_t *p = &mat[k * m];
+ for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+ }
+ // fill the first row
+ eh[0].h = 0; eh[0].e = MINUS_INF;
+ for (j = 1; j <= qlen && j <= w; ++j)
+ eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
+ for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
+ // DP loop
+ for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
+ int32_t f = MINUS_INF, h1, beg, end;
+ int8_t *q = &qp[target[i] * qlen];
+ uint8_t *zi = &z[i * n_col];
+ beg = i > w? i - w : 0;
+ end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
+ h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
+ for (j = beg; LIKELY(j < end); ++j) {
+ // This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
+ // 1) not checking h>0; 2) recording direction for backtracking
+ eh_t *p = &eh[j];
+ int32_t h = p->h, e = p->e;
+ uint8_t d; // direction
+ p->h = h1;
+ h += q[j];
+ d = h >= e? 0 : 1;
+ h = h >= e? h : e;
+ d = h >= f? d : 2;
+ h = h >= f? h : f;
+ h1 = h;
+ h -= gapoe;
+ e -= gape;
+ d |= e > h? 1<<2 : 0;
+ e = e > h? e : h;
+ p->e = e;
+ f -= gape;
+ d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
+ f = f > h? f : h;
+ zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
+ }
+ eh[end].h = h1; eh[end].e = MINUS_INF;
+ }
+ score = eh[qlen].h;
+ if (n_cigar_ && cigar_) { // backtrack
+ int n_cigar = 0, m_cigar = 0, which = 0;
+ uint32_t *cigar = 0, tmp;
+ i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+ while (i >= 0 && k >= 0) {
+ which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
+ if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
+ else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
+ else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
+ }
+ if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
+ if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
+ for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
+ tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
+ *n_cigar_ = n_cigar, *cigar_ = cigar;
+ }
+ free(eh); free(qp); free(z);
+ return score;
}
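The cigar array filled in above uses BAM packing (length<<4 | op, with op 0/1/2 meaning M/I/D); a small decoding sketch, not part of the patch and with hypothetical inputs in the usage comment:

#include <stdio.h>
#include <stdint.h>

static void print_cigar(int n_cigar, const uint32_t *cigar)
{
    int i;
    for (i = 0; i < n_cigar; ++i)
        printf("%u%c", (unsigned)(cigar[i] >> 4), "MID"[cigar[i] & 0xf]);
    putchar('\n');
}
/* typical use:
     int n_cigar = 0; uint32_t *cigar = 0;
     int score = ksw_global(qlen, q, tlen, t, 5, mat, 5, 2, 50, &n_cigar, &cigar);
     print_cigar(n_cigar, cigar); free(cigar);                                     */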
/*******************************************
@@ -333,30 +569,33 @@ unsigned char seq_nt4_table[256] = {
int main(int argc, char *argv[])
{
- int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2;
+ int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
int8_t mat[25];
- ksw_aux_t a;
+ int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
+ uint8_t *rseq = 0;
gzFile fpt, fpq;
kseq_t *kst, *ksq;
+
// parse command line
- a.gapo = 5; a.gape = 2; a.T = 10;
- while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) {
+ while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
switch (c) {
case 'a': sa = atoi(optarg); break;
case 'b': sb = atoi(optarg); break;
- case 'q': a.gapo = atoi(optarg); break;
- case 'r': a.gape = atoi(optarg); break;
- case 't': a.T = atoi(optarg); break;
+ case 'q': gapo = atoi(optarg); break;
+ case 'r': gape = atoi(optarg); break;
+ case 't': minsc = atoi(optarg); break;
case 'f': forward_only = 1; break;
- case 's': size = atoi(optarg); break;
+ case '1': xtra |= KSW_XBYTE; break;
}
}
if (optind + 2 > argc) {
- fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] <target.fa> <query.fa>\n", size, sa, sb, a.gapo, a.gape);
+ fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
return 1;
}
+ if (minsc > 0xffff) minsc = 0xffff;
+ xtra |= KSW_XSUBO | minsc;
// initialize scoring matrix
- for (i = k = 0; i < 5; ++i) {
+ for (i = k = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
mat[k++] = i == j? sa : -sb;
mat[k++] = 0; // ambiguous base
@@ -367,35 +606,34 @@ int main(int argc, char *argv[])
fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
// all-pair alignment
while (kseq_read(ksq) > 0) {
- ksw_query_t *q[2];
- for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
- q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
+ kswq_t *q[2] = {0, 0};
+ kswr_t r;
+ for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
if (!forward_only) { // reverse
- for (i = 0; i < ksq->seq.l/2; ++i) {
- int t = ksq->seq.s[i];
- ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i];
- ksq->seq.s[ksq->seq.l-1-i] = t;
+ if ((int)ksq->seq.m > max_rseq) {
+ max_rseq = ksq->seq.m;
+ rseq = (uint8_t*)realloc(rseq, max_rseq);
}
- for (i = 0; i < ksq->seq.l; ++i)
- ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
- q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
- } else q[1] = 0;
+ for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
+ rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
+ }
gzrewind(fpt); kseq_rewind(kst);
while (kseq_read(kst) > 0) {
- int s;
- for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
- s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a);
- printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
- if (q[1]) {
- s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a);
- printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
+ for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
+ r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
+ if (r.score >= minsc)
+ printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
+ if (rseq) {
+ r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
+ if (r.score >= minsc)
+ printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
}
}
free(q[0]); free(q[1]);
}
+ free(rseq);
kseq_destroy(kst); gzclose(fpt);
kseq_destroy(ksq); gzclose(fpq);
return 0;
}
-#endif // _KSW_MAIN
-#endif // _NO_SSE2
+#endif
diff --git a/ksw.h b/ksw.h
index d93d6a9..d2975de 100644
--- a/ksw.h
+++ b/ksw.h
@@ -1,51 +1,108 @@
#ifndef __AC_KSW_H
#define __AC_KSW_H
-struct _ksw_query_t;
-typedef struct _ksw_query_t ksw_query_t;
+#include <stdint.h>
+
+#define KSW_XBYTE 0x10000
+#define KSW_XSTOP 0x20000
+#define KSW_XSUBO 0x40000
+#define KSW_XSTART 0x80000
+
+struct _kswq_t;
+typedef struct _kswq_t kswq_t;
typedef struct {
- // input
- unsigned gapo, gape; // the first gap costs gapo+gape
- unsigned T; // threshold
- // output
- int score, te, qe, score2, te2;
-} ksw_aux_t;
+ int score; // best score
+ int te, qe; // target end and query end
+ int score2, te2; // second best score and ending position on the target
+ int tb, qb; // target start and query start
+} kswr_t;
#ifdef __cplusplus
extern "C" {
#endif
/**
- * Initialize the query data structure
+ * Aligning two sequences
+ *
+ * @param qlen length of the query sequence (typically <tlen)
+ * @param query query sequence with 0 <= query[i] < m
+ * @param tlen length of the target sequence
+ * @param target target sequence
+ * @param m number of residue types
+ * @param mat m*m scoring matrix in one-dimension array
+ * @param gapo gap open penalty; a gap of length l costs "-(gapo+l*gape)"
+ * @param gape gap extension penalty
+ * @param xtra extra information (see below)
+ * @param qry query profile (see below)
+ *
+ * @return alignment information in a struct; unset fields are set to -1
+ *
+ * When xtra==0, ksw_align() uses a signed two-byte integer to store a
+ * score and only finds the best score and the end positions. The 2nd best
+ * score or the start positions are not attempted. The default behavior can
+ * be tuned by setting KSW_X* flags:
+ *
+ * KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
+ * kswr_t::score will be set to 255
+ *
+ * KSW_XSUBO: track the 2nd best score and the ending position on the
+ * target if the 2nd best is higher than (xtra&0xffff)
+ *
+ * KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
*
- * @param size Number of bytes used to store a score; valid valures are 1 or 2
- * @param qlen Length of the query sequence
- * @param query Query sequence
- * @param m Size of the alphabet
- * @param mat Scoring matrix in a one-dimension array
+ * KSW_XSTART: find the start positions
*
- * @return Query data structure
+ * When *qry==NULL, ksw_align() will compute and allocate the query profile
+ * and when the function returns, *qry will point to the profile, which can
+ * be deallocated simply by free(). If one query is aligned against multiple
+ * target sequences, *qry should be set to NULL during the first call and
+ * freed after the last call. Note that qry can equal 0. In this case, the
+ * query profile will be deallocated in ksw_align().
*/
- ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); // to free, simply call free()
+ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
/**
- * Compute the maximum local score for queries initialized with ksw_qinit(1, ...)
+ * Banded global alignment
*
- * @param q Query data structure returned by ksw_qinit(1, ...)
- * @param tlen Length of the target sequence
- * @param target Target sequence
- * @param a Auxiliary data structure (see ksw.h)
+ * @param qlen query length
+ * @param query query sequence with 0 <= query[i] < m
+ * @param tlen target length
+ * @param target target sequence with 0 <= target[i] < m
+ * @param m number of residue types
+ * @param mat m*m scoring matrix in one-dimension array
+ * @param gapo gap open penalty; a gap of length l costs "-(gapo+l*gape)"
+ * @param gape gap extension penalty
+ * @param w band width
+ * @param n_cigar (out) number of CIGAR elements
+ * @param cigar (out) BAM-encoded CIGAR; caller needs to deallocate with free()
*
- * @return The maximum local score; if the returned value equals 255, the SW may not be finished
+ * @return score of the alignment
*/
- int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
+ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
- /** Compute the maximum local score for queries initialized with ksw_qinit(2, ...) */
- int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
-
- /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
- int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
+ /**
+ * Extend alignment
+ *
+ * The routine aligns $query and $target, assuming their upstream sequences,
+ * which are not provided, have been aligned with score $h0. In return,
+ * region [0,*qle) on the query and [0,*tle) on the target sequences are
+ * aligned together. If *gscore>=0, *gscore keeps the best score such that
+ * the entire query sequence is aligned; *gtle keeps the position on the
+ * target where *gscore is achieved. Returning *gscore and *gtle helps the
+ * caller to decide whether an end-to-end hit or a partial hit is preferred.
+ *
+ * The first 9 parameters are identical to those in ksw_global()
+ *
+ * @param h0 alignment score of upstream sequences
+ * @param _qle (out) length of the query in the alignment
+ * @param _tle (out) length of the target in the alignment
+ * @param _gtle (out) length of the target if query is fully aligned
+ * @param _gscore (out) score of the best end-to-end alignment; negative if not found
+ *
+ * @return best semi-local alignment score
+ */
+ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore);
#ifdef __cplusplus
}
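A condensed sketch of the documented calling convention (my illustration; the thresholds and the 5x5 matrix mirror ksw.c's test main()): one query profile is built on the first call and reused across targets, with start coordinates and a sub-optimal score above 30 requested via the xtra flags:

#include <stdint.h>
#include <stdlib.h>
#include "ksw.h"

/* q and the targets are 0..4 encoded; mat is a 5x5 scoring matrix */
static void align_one_vs_many(int qlen, uint8_t *q, int n, int *tlen, uint8_t **tgt, const int8_t *mat)
{
    kswq_t *qp = 0;                    /* profile computed on the first call, reused afterwards */
    int i, xtra = KSW_XSTART | KSW_XSUBO | 30;
    for (i = 0; i < n; ++i) {
        kswr_t r = ksw_align(qlen, q, tlen[i], tgt[i], 5, mat, 5, 2, xtra, &qp);
        if (r.score2 > 0) { /* a secondary hit ends at r.te2 */ }
        /* primary hit: query [r.qb, r.qe], target [r.tb, r.te], score r.score */
    }
    free(qp);                          /* the profile is a single malloc'd block */
}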
diff --git a/kvec.h b/kvec.h
index 57204d6..9c9ca6e 100644
--- a/kvec.h
+++ b/kvec.h
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2008, by Attractive Chaos <attractivechaos at aol.co.uk>
+ Copyright (c) 2008, by Attractive Chaos <attractor at live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@@ -76,15 +76,15 @@ int main() {
(v).a[(v).n++] = (x); \
} while (0)
-#define kv_pushp(type, v) (((v).n == (v).m)? \
+#define kv_pushp(type, v) ((((v).n == (v).m)? \
((v).m = ((v).m? (v).m<<1 : 2), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
- : 0), ((v).a + ((v).n++))
+ : 0), &(v).a[(v).n++])
-#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
+#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
- : (v).n <= (size_t)(i)? (v).n = (i) \
- : 0), (v).a[(i)]
+ : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
+ : 0), (v).a[(i)])
#endif
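The two macro fixes above matter for how their results are used: kv_pushp() now yields a pointer to the freshly appended slot and kv_a() records the grown size correctly. A hedged sketch with a made-up element type:

#include "kvec.h"

typedef struct { int x, y; } pt_t;

static void kvec_demo(void)
{
    kvec_t(pt_t) v;
    kvec_t(int) w;
    pt_t *p;
    kv_init(v); kv_init(w);
    p = kv_pushp(pt_t, v);        /* pointer to the new element, valid even after a realloc */
    p->x = 1, p->y = 2;
    kv_a(int, w, 9) = 42;         /* auto-grow; w.n is now 10 (it was left at 9 before the fix) */
    kv_destroy(v); kv_destroy(w);
}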
diff --git a/main.c b/main.c
index 0e7af77..ba60cf7 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.6.2-r126"
+#define PACKAGE_VERSION "0.7.0-r313"
#endif
static int usage()
@@ -20,14 +20,13 @@ static int usage()
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
+ fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, "\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
- fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n");
- fprintf(stderr, " stdsw standard SW/NW alignment\n");
fprintf(stderr, "\n");
return 1;
}
@@ -50,15 +49,13 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
- else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
- else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1);
- else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1);
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
+ else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;
diff --git a/main.h b/main.h
index 026a80b..3e70362 100644
--- a/main.h
+++ b/main.h
@@ -6,7 +6,6 @@ extern "C" {
#endif
int bwa_fa2pac(int argc, char *argv[]);
- int bwa_pac2cspac(int argc, char *argv[]);
int bwa_pac2bwt(int argc, char *argv[]);
int bwa_bwtupdate(int argc, char *argv[]);
int bwa_bwt2sa(int argc, char *argv[]);
@@ -17,11 +16,10 @@ extern "C" {
int bwa_sai2sam_se(int argc, char *argv[]);
int bwa_sai2sam_pe(int argc, char *argv[]);
- int bwa_stdsw(int argc, char *argv[]);
-
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
+ int main_mem(int argc, char *argv[]);
#ifdef __cplusplus
}
diff --git a/simple_dp.c b/simple_dp.c
deleted file mode 100644
index 7c078c2..0000000
--- a/simple_dp.c
+++ /dev/null
@@ -1,162 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <zlib.h>
-#include <stdint.h>
-#include "stdaln.h"
-#include "utils.h"
-
-#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
-
-typedef struct {
- int l;
- unsigned char *s;
- char *n;
-} seq1_t;
-
-typedef struct {
- int n_seqs, m_seqs;
- seq1_t *seqs;
-} seqs_t;
-
-unsigned char aln_rev_table[256] = {
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
- 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
- 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N',
- 'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
- 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
-};
-
-static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0;
-static AlnParam g_aln_param;
-
-static void revseq(int len, uint8_t *seq)
-{
- int i;
- for (i = 0; i < len>>1; ++i) {
- uint8_t tmp = aln_rev_table[seq[len-1-i]];
- seq[len-1-i] = aln_rev_table[seq[i]];
- seq[i] = tmp;
- }
- if (len&1) seq[i] = aln_rev_table[seq[i]];
-}
-
-static seqs_t *load_seqs(const char *fn)
-{
- seqs_t *s;
- seq1_t *p;
- gzFile fp;
- int l;
- kseq_t *seq;
-
- fp = xzopen(fn, "r");
- seq = kseq_init(fp);
- s = (seqs_t*)calloc(1, sizeof(seqs_t));
- s->m_seqs = 256;
- s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
- while ((l = kseq_read(seq)) >= 0) {
- if (s->n_seqs == s->m_seqs) {
- s->m_seqs <<= 1;
- s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
- }
- p = s->seqs + (s->n_seqs++);
- p->l = seq->seq.l;
- p->s = (unsigned char*)malloc(p->l + 1);
- memcpy(p->s, seq->seq.s, p->l);
- p->s[p->l] = 0;
- p->n = strdup((const char*)seq->name.s);
- }
- kseq_destroy(seq);
- gzclose(fp);
- fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
- return s;
-}
-
-static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
-{
- int i;
- for (i = 0; i < ss->n_seqs; ++i) {
- AlnAln *aa;
- seq1_t *p = ss->seqs + i;
- g_aln_param.band_width = l + p->l;
- aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
- if (aa->score >= g_thres || g_is_global) {
- printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
- aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
- // NB: I put the short sequence as the first sequence in SW, an insertion to
- // the reference becomes a deletion from the short sequence. Therefore, I use
- // "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
- for (i = 0; i != aa->n_cigar; ++i)
- printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]);
- printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
- }
- aln_free_AlnAln(aa);
- }
-}
-
-static void aln_seqs(const seqs_t *ss, const char *fn)
-{
- gzFile fp;
- kseq_t *seq;
- int l;
-
- fp = xzopen(fn, "r");
- seq = kseq_init(fp);
- while ((l = kseq_read(seq)) >= 0) {
- if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+');
- if (g_strand&2) {
- revseq(l, (uint8_t*)seq->seq.s);
- aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-');
- }
- }
- kseq_destroy(seq);
- gzclose(fp);
-}
-
-int bwa_stdsw(int argc, char *argv[])
-{
- int c;
- seqs_t *ss;
-
- while ((c = getopt(argc, argv, "gT:frp")) >= 0) {
- switch (c) {
- case 'g': g_is_global = 1; break;
- case 'T': g_thres = atoi(optarg); break;
- case 'f': g_strand |= 1; break;
- case 'r': g_strand |= 2; break;
- case 'p': g_aa = 1; break;
- }
- }
- if (g_strand == 0) g_strand = 3;
- if (g_aa) g_strand = 1;
- if (optind + 1 >= argc) {
- fprintf(stderr, "\nUsage: bwa stdsw [options] <seq1.long.fa> <seq2.short.fa>\n\n");
- fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres);
- fprintf(stderr, " -p protein alignment (suppressing -r)\n");
- fprintf(stderr, " -f forward strand only\n");
- fprintf(stderr, " -r reverse strand only\n");
- fprintf(stderr, " -g global alignment\n\n");
- fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n");
- fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n");
- fprintf(stderr, " sequence.\n\n");
- return 1;
- }
- g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast;
- g_aln_param.gap_end = 0;
- ss = load_seqs(argv[optind]);
- aln_seqs(ss, argv[optind+1]);
- return 0;
-}
diff --git a/solid2fastq.pl b/solid2fastq.pl
deleted file mode 100755
index c60ad81..0000000
--- a/solid2fastq.pl
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/perl -w
-
-# Author: lh3
-# Note: Ideally, this script should be written in C. It is a bit slow at present.
-# Also note that this script is different from the one contained in MAQ.
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-my %opts;
-my $version = '0.1.4';
-my $usage = qq{
-Usage: solid2fastq.pl <in.title> <out.prefix>
-
-Note: <in.title> is the string showed in the `# Title:' line of a
- ".csfasta" read file. Then <in.title>F3.csfasta is read sequence
- file and <in.title>F3_QV.qual is the quality file. If
- <in.title>R3.csfasta is present, this script assumes reads are
- paired; otherwise reads will be regarded as single-end.
-
- The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3
- tag and `2' for F3. Usually you may want to use short <out.prefix>
- to save diskspace. Long <out.prefix> also causes troubles to maq.
-
-};
-
-getopts('', \%opts);
-die($usage) if (@ARGV != 2);
-my ($title, $pre) = @ARGV;
-my (@fhr, @fhw);
-my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual');
-my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0;
-if ($is_paired) { # paired end
- for (0 .. 3) {
- my $fn = "$title$fn_suff[$_]";
- $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
- open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
- }
- open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo
- open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die;
- open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
- my (@df, @dr);
- @df = &read1(1); @dr = &read1(2);
- while (@df && @dr) {
- if ($df[0] eq $dr[0]) { # mate pair
- print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
- @df = &read1(1); @dr = &read1(2);
- } else {
- if ($df[0] le $dr[0]) {
- print {$fhw[2]} $df[1];
- @df = &read1(1);
- } else {
- print {$fhw[2]} $dr[1];
- @dr = &read1(2);
- }
- }
- }
- if (@df) {
- print {$fhw[2]} $df[1];
- while (@df = &read1(1, $fhr[0], $fhr[1])) {
- print {$fhw[2]} $df[1];
- }
- }
- if (@dr) {
- print {$fhw[2]} $dr[1];
- while (@dr = &read1(2, $fhr[2], $fhr[3])) {
- print {$fhw[2]} $dr[1];
- }
- }
- close($fhr[$_]) for (0 .. $#fhr);
- close($fhw[$_]) for (0 .. $#fhw);
-} else { # single end
- for (0 .. 1) {
- my $fn = "$title$fn_suff[$_]";
- $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
- open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
- }
- open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
- my @df;
- while (@df = &read1(1, $fhr[0], $fhr[1])) {
- print {$fhw[2]} $df[1];
- }
- close($fhr[$_]) for (0 .. $#fhr);
- close($fhw[2]);
-}
-
-sub read1 {
- my $i = shift(@_);
- my $j = ($i-1)<<1;
- my ($key, $seq);
- my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
- while (<$fhs>) {
- my $t = <$fhq>;
- if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
- $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
- die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t);
- my $name = "$pre:$1_$2_$3/$i";
- $_ = substr(<$fhs>, 2);
- tr/0123./ACGTN/;
- my $s = $_;
- $_ = <$fhq>;
- s/-1\b/0/eg;
- s/^(\d+)\s*//;
- s/(\d+)\s*/chr($1+33)/eg;
- $seq = qq/\@$name\n$s+\n$_\n/;
- last;
- }
- }
- return defined($seq)? ($key, $seq) : ();
-}
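The deleted solid2fastq.pl converted SOLiD csfasta/qual pairs into FASTQ: it dropped the leading primer base and first color, translated the remaining colors 0/1/2/3/. into the placeholder letters A/C/G/T/N, clamped -1 qualities to 0 and encoded them as Phred+33. A hedged C sketch of that per-read conversion; the function name and the caller-supplied buffers are illustrative, not part of BWA.

#include <stdio.h>

/* Illustrative only: mimic what solid2fastq.pl did to one read.
 * 'colors' is the raw csfasta sequence line (primer base followed by colors);
 * 'quals' are the parsed quality values from the matching .qual line. */
static void solid_read_to_fastq(const char *name, const char *colors,
                                const int *quals, int n_quals, FILE *out)
{
    const char map[5] = { 'A', 'C', 'G', 'T', 'N' };
    int i;
    fprintf(out, "@%s\n", name);
    for (i = 2; colors[i] && colors[i] != '\n'; ++i)   /* drop primer base + first color */
        fputc(colors[i] == '.' ? 'N' : map[colors[i] - '0'], out);
    fputs("\n+\n", out);
    for (i = 1; i < n_quals; ++i) {                    /* first quality goes with the dropped color */
        int q = quals[i] < 0 ? 0 : quals[i];           /* -1 means missing; clamp to 0 */
        fputc(q + 33, out);                            /* Phred+33, as chr($1+33) in the Perl */
    }
    fputc('\n', out);
}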
diff --git a/stdaln.c b/stdaln.c
index eb41882..cd064cf 100644
--- a/stdaln.c
+++ b/stdaln.c
@@ -542,13 +542,12 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2,
int start, end, max_score;
int thres, *suba, *ss;
- int gap_open, gap_ext, b;
+ int gap_open, gap_ext;
int *score_matrix, N_MATRIX_ROW;
/* initialize some align-related parameters. just for compatibility */
gap_open = ap->gap_open;
gap_ext = ap->gap_ext;
- b = ap->band_width;
score_matrix = ap->matrix;
N_MATRIX_ROW = ap->row;
thres = _thres > 0? _thres : -_thres;
@@ -862,7 +861,7 @@ uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len, int G0, uint8_t *_mem)
{
- int q, r, qr, tmp_len;
+ int q, r, qr;
int32_t **s_array, *score_array;
int is_overflow, of_base;
uint32_t *eh;
@@ -889,7 +888,6 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2
s_array[i] = (int32_t*)_p, _p += 4 * len1;
/* initialization */
aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
- tmp_len = len1 + 1;
start = 1; end = 2;
end_i = end_j = 0;
score = 0;
diff --git a/utils.c b/utils.c
index 8c1ad7e..20b09ee 100644
--- a/utils.c
+++ b/utils.c
@@ -35,6 +35,18 @@
#include <sys/time.h>
#include "utils.h"
+#include "ksort.h"
+#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
+KSORT_INIT(128, pair64_t, pair64_lt)
+KSORT_INIT(64, uint64_t, ks_lt_generic)
+
+#include "kseq.h"
+KSEQ_INIT2(, gzFile, gzread)
+
+/********************
+ * System utilities *
+ ********************/
+
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
FILE *fp = 0;
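The KSORT_INIT and KSEQ_INIT2 lines added in this hunk instantiate klib templates inside utils.c: KSORT_INIT(128, pair64_t, pair64_lt) and KSORT_INIT(64, uint64_t, ks_lt_generic) generate the ks_introsort_128() and ks_introsort_64() sorters that utils.h now declares, and KSEQ_INIT2(, gzFile, gzread) generates the kseq FASTA/FASTQ reader with external linkage. A small usage sketch for the sorters; the array contents are made up.

#include <stdio.h>
#include "utils.h" /* pair64_t, ks_introsort_64(), ks_introsort_128() */

int main(void)
{
    /* pair64_lt orders by .x first and breaks ties on .y */
    pair64_t a[3] = { {5, 2}, {1, 9}, {5, 1} };
    ks_introsort_128(3, a);
    /* a[] is now {1,9} {5,1} {5,2} */
    printf("%llu %llu\n", (unsigned long long)a[0].x, (unsigned long long)a[0].y);
    return 0;
}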
@@ -46,6 +58,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
}
return fp;
}
+
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
{
if (freopen(fn, mode, fp) == 0) {
@@ -56,6 +69,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE
}
return fp;
}
+
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
{
gzFile fp;
@@ -67,6 +81,7 @@ gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
}
return fp;
}
+
void err_fatal(const char *header, const char *fmt, ...)
{
va_list args;
@@ -86,68 +101,54 @@ void err_fatal_simple_core(const char *func, const char *msg)
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{
- size_t ret = fwrite(ptr, size, nmemb, stream);
- if (ret != nmemb)
- {
- err_fatal_simple_core("fwrite", strerror(errno));
- }
- return ret;
+ size_t ret = fwrite(ptr, size, nmemb, stream);
+ if (ret != nmemb)
+ err_fatal_simple_core("fwrite", strerror(errno));
+ return ret;
}
int err_printf(const char *format, ...)
{
- va_list arg;
- int done;
-
- va_start(arg, format);
- done = vfprintf(stdout, format, arg);
- int saveErrno = errno;
- va_end(arg);
-
- if (done < 0)
- {
- err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno));
- }
- return done;
+ va_list arg;
+ int done;
+ va_start(arg, format);
+ done = vfprintf(stdout, format, arg);
+ int saveErrno = errno;
+ va_end(arg);
+ if (done < 0) err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno));
+ return done;
}
int err_fprintf(FILE *stream, const char *format, ...)
{
- va_list arg;
- int done;
-
- va_start(arg, format);
- done = vfprintf(stream, format, arg);
- int saveErrno = errno;
- va_end(arg);
-
- if (done < 0)
- {
- err_fatal_simple_core("vfprintf", strerror(saveErrno));
- }
- return done;
+ va_list arg;
+ int done;
+ va_start(arg, format);
+ done = vfprintf(stream, format, arg);
+ int saveErrno = errno;
+ va_end(arg);
+ if (done < 0) err_fatal_simple_core("vfprintf", strerror(saveErrno));
+ return done;
}
int err_fflush(FILE *stream)
{
- int ret = fflush(stream);
- if (ret != 0)
- {
- err_fatal_simple_core("fflush", strerror(errno));
- }
- return ret;
+ int ret = fflush(stream);
+ if (ret != 0) err_fatal_simple_core("fflush", strerror(errno));
+ return ret;
}
int err_fclose(FILE *stream)
{
- int ret = fclose(stream);
- if (ret != 0)
- {
- err_fatal_simple_core("fclose", strerror(errno));
- }
- return ret;
+ int ret = fclose(stream);
+ if (ret != 0) err_fatal_simple_core("fclose", strerror(errno));
+ return ret;
}
+/*********
+ * Timer *
+ *********/
+
double cputime()
{
struct rusage r;
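The err_* wrappers reindented above share one pattern: perform the stdio call and, on failure, abort through err_fatal_simple_core() with strerror(errno), so callers need no per-call error handling. A minimal usage sketch built on the xopen()/err_fwrite() layer declared in utils.h; the file name and payload are made up.

#include <stdio.h>
#include <string.h>
#include "utils.h" /* xopen(), err_fwrite(), err_fflush(), err_fclose() */

int main(void)
{
    const char *msg = "hello\n";          /* made-up payload */
    FILE *fp = xopen("out.txt", "w");     /* aborts with a message if the open fails */
    err_fwrite(msg, 1, strlen(msg), fp);  /* aborts on a short write */
    err_fflush(fp);
    err_fclose(fp);
    return 0;
}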
diff --git a/utils.h b/utils.h
index b6839e9..a3db251 100644
--- a/utils.h
+++ b/utils.h
@@ -28,6 +28,7 @@
#ifndef LH3_UTILS_H
#define LH3_UTILS_H
+#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
@@ -38,14 +39,19 @@
#define ATTRIBUTE(list)
#endif
-
-
#define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg)
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
#define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg)
+typedef struct {
+ uint64_t x, y;
+} pair64_t;
+
+typedef struct { size_t n, m; uint64_t *a; } uint64_v;
+typedef struct { size_t n, m; pair64_t *a; } pair64_v;
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -66,8 +72,24 @@ extern "C" {
double cputime();
double realtime();
+ void ks_introsort_64 (size_t n, uint64_t *a);
+ void ks_introsort_128(size_t n, pair64_t *a);
+
#ifdef __cplusplus
}
#endif
+static inline uint64_t hash_64(uint64_t key)
+{
+ key += ~(key << 32);
+ key ^= (key >> 22);
+ key += ~(key << 13);
+ key ^= (key >> 8);
+ key += (key << 3);
+ key ^= (key >> 15);
+ key += ~(key << 27);
+ key ^= (key >> 31);
+ return key;
+}
+
#endif
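The new static inline hash_64() appears to be the well-known Thomas Wang 64-bit integer mix: it scrambles a 64-bit key so that nearby inputs land on unrelated outputs, which suits bucketing packed positions or read names into a power-of-two hash table. A hedged usage sketch; the table size and keys are made up.

#include <stdio.h>
#include <stdint.h>
#include "utils.h" /* hash_64() */

int main(void)
{
    uint64_t keys[3] = { 0, 1, 2 };
    uint64_t mask = (1ULL << 10) - 1; /* 1024 buckets */
    int i;
    for (i = 0; i < 3; ++i)
        printf("%llu -> bucket %llu\n", (unsigned long long)keys[i],
               (unsigned long long)(hash_64(keys[i]) & mask));
    return 0;
}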
--
Burrows-Wheeler Aligner