[med-svn] [bwa] 02/04: Imported Upstream version 0.7.12
Andreas Tille
tille at debian.org
Mon Feb 9 08:06:52 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository bwa.
commit 41430b90dca9b49a359f5b3070a20b32a118d7f0
Author: Andreas Tille <tille at debian.org>
Date: Tue Jan 27 08:47:32 2015 +0100
Imported Upstream version 0.7.12
---
Makefile | 10 +-
NEWS.md | 72 +++++
README-alt.md | 178 ++++++++++++
README.md | 72 +++--
bntseq.c | 48 +++-
bntseq.h | 1 +
bwa-helper.js | 706 -----------------------------------------------
bwa.1 | 58 +++-
bwa.c | 146 +++++++++-
bwa.h | 16 +-
bwakit/README.md | 115 ++++++++
bwakit/bwa-postalt.js | 524 +++++++++++++++++++++++++++++++++++
bwakit/run-HLA | 20 ++
bwakit/run-bwamem | 186 +++++++++++++
bwakit/run-gen-ref | 39 +++
bwakit/typeHLA-selctg.js | 62 +++++
bwakit/typeHLA.js | 496 +++++++++++++++++++++++++++++++++
bwakit/typeHLA.sh | 49 ++++
bwamem.c | 159 ++++++++---
bwamem.h | 15 +-
bwamem_extra.c | 68 +++--
bwamem_pair.c | 102 ++++---
bwashm.c | 213 ++++++++++++++
bwt.c | 42 ++-
bwt.h | 4 +-
bwt_gen.c | 9 +-
bwtindex.c | 18 +-
example.c | 12 +-
fastmap.c | 251 ++++++++++++-----
kseq.h | 18 +-
kthread.c | 104 ++++++-
main.c | 8 +-
utils.c | 11 +
utils.h | 2 +-
34 files changed, 2846 insertions(+), 988 deletions(-)
diff --git a/Makefile b/Makefile
index 60c2104..1d6ae44 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
-AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
+AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
@@ -14,6 +14,10 @@ INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
+ifeq ($(shell uname -s),Linux)
+ LIBS += -lrt
+endif
+
.SUFFIXES:.c .o .cc
.c.o:
@@ -40,7 +44,7 @@ depend:
QSufSort.o: QSufSort.h
bamlite.o: bamlite.h malloc_wrap.h
-bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h
+bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h
bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
bwamem.o: ksort.h utils.h kbtree.h
@@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
bwase.o: bwa.h ksw.h
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
+bwashm.o: bwa.h bntseq.h bwt.h
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
bwt_gen.o: QSufSort.h malloc_wrap.h
bwt_lite.o: bwt_lite.h malloc_wrap.h
@@ -75,5 +80,4 @@ ksw.o: ksw.h malloc_wrap.h
main.o: kstring.h malloc_wrap.h utils.h
malloc_wrap.o: malloc_wrap.h
pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
-test-extend.o: ksw.h
utils.o: utils.h ksort.h malloc_wrap.h kseq.h
diff --git a/NEWS.md b/NEWS.md
index 33a4760..4692889 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,75 @@
+Release 0.7.12 (28 December 2014)
+---------------------------------
+
+This release fixed a bug in the pair-end mode when ALT contigs are present. It
+leads to undercalling in regions overlapping ALT contigs.
+
+(0.7.12: 28 December 2014, r1039)
+
+
+
+Release 0.7.11 (23 December, 2014)
+----------------------------------
+
+A major change to BWA-MEM is the support of mapping to ALT contigs in addition
+to the primary assembly. Part of the ALT mapping strategy is implemented in
+BWA-MEM and the rest in a postprocessing script for now. Due to the extra
+layer of complexity on generating the reference genome and on the two-step
+mapping, we start to provide a wrapper script and precompiled binaries since
+this release. The package may be more convenient to some specific use cases.
+For general uses, the single BWA binary still works like the old way.
+
+Another major addition to BWA-MEM is HLA typing, which made possible with the
+new ALT mapping strategy. Necessary data and programs are included in the
+binary release. The wrapper script also optionally performs HLA typing when HLA
+genes are included in the reference genome as additional ALT contigs.
+
+Other notable changes to BWA-MEM:
+
+ * Added option `-b` to `bwa index`. This option tunes the batch size used in
+ the construction of BWT. It is advised to use large `-b` for huge reference
+ sequences such as the BLAST *nt* database.
+
+ * Optimized for PacBio data. This includes a change to scoring based on a
+ study done by Aaron Quinlan and a heuristic speedup. Further speedup is
+ possible, but needs more careful investigation.
+
+ * Dropped PacBio read-to-read alignment for now. BWA-MEM is good for finding
+ the best hit, but is not very sensitive to suboptimal hits. Option `-x pbread`
+ is still available, but hidden on the command line. This may be removed in
+ future releases.
+
+ * Added a new pre-setting for Oxford Nanopore 2D reads. LAST is still a little
+ more sensitive on older bacterial data, but bwa-mem is as good on more
+ recent data and is times faster for mapping against mammalian genomes.
+
+ * Added LAST-like seeding. This improves the accuracy for longer reads.
+
+ * Added option `-H` to insert arbitrary header lines.
+
+ * Smarter option `-p`. Given an interleaved FASTQ stream, old bwa-mem identifies
+ the 2i-th and (2i+1)-th reads as a read pair. The new verion identifies
+ adjacent reads with the same read name as a read pair. It is possible to mix
+ single-end and paired-end reads in one FASTQ.
+
+ * Improved parallelization. Old bwa-mem waits for I/O. The new version puts
+ I/O on a separate thread. It performs mapping while reading FASTQ and
+ writing SAM. This saves significant wall-clock time when reading from
+ or writing to a slow Unix pipe.
+
+With the new release, the recommended way to map Illumina reads to GRCh38 is to
+use the bwakit binary package:
+
+ bwa.kit/run-gen-ref hs38DH
+ bwa.kit/bwa index hs38DH.fa
+ bwa.kit/run-bwamem -t8 -H -o out-prefix hs38DH.fa read1.fq.gz read2.fq.gz | sh
+
+Please check bwa.kit/README.md for details and command line options.
+
+(0.7.11: 23 December 2014, r1034)
+
+
+
Release 0.7.10 (13 July, 2014)
------------------------------
diff --git a/README-alt.md b/README-alt.md
new file mode 100644
index 0000000..4f72042
--- /dev/null
+++ b/README-alt.md
@@ -0,0 +1,178 @@
+## For the Impatient
+
+```sh
+# Download bwakit (or from <http://sourceforge.net/projects/bio-bwa/files/bwakit/> manually)
+wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
+ | gzip -dc | tar xf -
+# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
+bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa # create BWA index
+# mapping
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh # skip "|sh" to show command lines
+```
+
+This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA
+genotypes on each gene and `out.hla.all` for other possible HLA genotypes.
+Please check out [bwa/bwakit/README.md][kithelp] for details.
+
+## Background
+
+GRCh38 consists of several components: chromosomal assembly, unlocalized contigs
+(chromosome known but location unknown), unplaced contigs (chromosome unknown)
+and ALT contigs (long clustered variations). The combination of the first three
+components is called the *primary assembly*. It is recommended to use the
+complete primary assembly for all analyses. Using ALT contigs in read mapping is
+tricky.
+
+GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp of the primary
+assembly. However, sequences that are highly diverged from the primary assembly
+only contribute a few million bp. Most subsequences of ALT contigs are nearly
+identical to the primary assembly. If we align sequence reads to GRCh38+ALT
+blindly, we will get many additional reads with zero mapping quality and miss
+variants on them. It is crucial to make mappers aware of ALTs.
+
+BWA-MEM is ALT-aware. It essentially computes mapping quality across the
+non-redundant content of the primary assembly plus the ALT contigs and is free
+of the problem above.
+
+## Methods
+
+### Sequence alignment
+
+As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
+postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT
+contigs are present. The following picture shows an example about how BWA-MEM
+infers mapping quality and reports alignment after step 2:
+
+![](http://lh3lh3.users.sourceforge.net/images/alt-demo.png)
+
+#### Step 1: BWA-MEM mapping
+
+At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
+the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
+depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
+alignments and assigns mapQ following these two rules:
+
+1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of
+ an ALT hit is computed across all hits.
+
+2. If there are no non-ALT hits, the best ALT hit is outputted as the primary
+ alignment. If there are both ALT and non-ALT hits, non-ALT hits will be
+ primary and ALT hits be supplementary (SAM flag 0x800).
+
+In theory, non-ALT alignments from step 1 should be identical to alignments
+against the reference genome with ALT contigs. In practice, the two types of
+alignments may differ in rare cases due to seeding heuristics. When an ALT hit
+is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
+non-ALT hits.
+
+If we don't care about ALT hits, we may skip postprocessing (step 2).
+Nonetheless, postprocessing is recommended as it improves mapQ and gives more
+information about ALT hits.
+
+#### Step 2: Postprocessing
+
+Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
+potential hits reported in the XA tag, lifts ALT hits to the chromosomal
+positions using the ALT-to-ref alignment, groups them based on overlaps between
+their lifted positions, and then re-estimates mapQ across the best scoring hit
+in each group. Being aware of the ALT-to-ref alignment, this script can greatly
+improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also
+writes each hit overlapping the reported hit into a separate SAM line. This
+enables variant calling on each ALT contig independent of others.
+
+### On the completeness of GRCh38+ALT
+
+While GRCh38 is much more complete than GRCh37, it is still missing some true
+human sequences. To make sure every piece of sequence in the reference assembly
+is correct, the [Genome Reference Consortium][grc] (GRC) require each ALT contig
+to have enough support from multiple sources before considering to add it to the
+reference assembly. This careful and sophisticated procedure has left out some
+sequences, one of which is [this example][novel], a 10kb contig assembled from
+CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
+[BLAST][blast] to see where it maps.
+
+For a more complete reference genome, we compiled a new set of decoy sequences
+from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
+The sequences are included in `hs38DH-extra.fa` from the [BWA binary
+package][res].
+
+In addition to decoy, we also put multiple alleles of HLA genes in
+`hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
+version 3.18.0 and are used to collect reads sequenced from these genes.
+
+### HLA typing
+
+HLA genes are known to be associated with many autoimmune diseases, infectious
+diseases and drug responses. They are among the most important genes but are
+rarely studied by WGS projects due to the high sequence divergence between
+HLA genes and the reference genome in these regions.
+
+By including the HLA gene regions in the reference assembly as ALT contigs, we
+are able to effectively identify reads coming from these genes. We also provide
+a pipeline, which is included in the [BWA binary package][res], to type the
+several classic HLA genes. The pipeline is conceptually simple. It de novo
+assembles sequence reads mapped to each gene, aligns exon sequences of each
+allele to the assembled contigs and then finds the pairs of alleles that best
+explain the contigs. In practice, however, the completeness of IMGT/HLA and
+copy-number changes related to these genes are not so straightforward to
+resolve. HLA typing may not always be successful. Users may also consider to use
+other programs for typing such as [Warren et al (2012)][hla4], [Liu et al
+(2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1], though
+most of them are distributed under restrictive licenses.
+
+## Preliminary Evaluation
+
+To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
+unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
+(hs38DH), and called small variants from the alignment. CHM1 is haploid.
+Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The
+true positive (TP) heterozygous calls from NA12878 are approximately equal
+to the difference between NA12878 and CHM1 heterozygous calls. A better assembly
+should yield higher TP and lower FP. The following table shows the numbers for
+these assemblies:
+
+|Assembly|hs37 |hs38 |hs38DH|CHM1_1.1| huref|
+|:------:|------:|------:|------:|------:|------:|
+|FP | 255706| 168068| 142516|307172 | 575634|
+|TP |2142260|2163113|2150844|2167235|2137053|
+
+With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces
+FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
+from hs38 only and found the majority of them are associated with excessive read
+depth, clustered variants or weak alignment. We believe most hs38-only calls are
+problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
+with nearly identical library construction, the difference is ~140k, an order
+of magnitude higher than the difference between hs38 and hs38DH. ALT contigs,
+decoy and HLA genes in hs38DH improve variant calling and enable the analyses of
+ALT contigs and HLA typing at little cost.
+
+## Problems and Future Development
+
+There are some uncertainties about ALT mappings - we are not sure whether they
+help biological discovery and don't know the best way to analyze them. Without
+clear demand from downstream analyses, it is very difficult to design the
+optimal mapping strategy. The current BWA-MEM method is just a start. If it
+turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
+for performance; if not, we may make changes. It is also possible that we might
+make breakthrough on the representation of multiple genomes, in which case, we
+can even get rid of ALT contigs for good.
+
+
+
+[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
+[sb]: https://github.com/GregoryFaust/samblaster
+[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/
+[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc
+[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat
+[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
+[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
+[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/
+[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml
+[hla1]: http://biorxiv.org/content/early/2014/07/08/006973
+[hlalink]: http://www.hladiseaseassociations.com
+[hlatools]: https://www.biostars.org/p/93245/
+[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html
+[hla3]: http://www.biomedcentral.com/1471-2164/15/325
+[hla4]: http://genomemedicine.com/content/4/12/95
+[kithelp]: https://github.com/lh3/bwa/tree/master/bwakit
diff --git a/README.md b/README.md
index dff245e..9ac49bb 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa)
+[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest)
##Getting started
git clone https://github.com/lh3/bwa.git
@@ -30,6 +32,11 @@ SourceForge. After you acquire the source code, simply use `make` to compile
and copy the single executable `bwa` to the destination you want. The only
dependency required to build BWA is [zlib][14].
+Since 0.7.11, precompiled binary for x86\_64-linux is available in [bwakit][17].
+In addition to BWA, this self-consistent package also comes with bwa-associated
+and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs,
+adapter triming, duplicate marking, HLA typing and associated data files.
+
##Seeking helps
The detailed usage is described in the man page available together with the
@@ -63,7 +70,8 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0)
5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref)
-6. [How to map sequences to GRCh38 with ALT contigs?](#h38)
+6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg)
+7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt)
####<a name="type"></a>1. What types of data does BWA work with?
@@ -72,11 +80,11 @@ algorithm and setting may vary. The following list gives the recommended
settings:
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
- contigs up to a few megabases mapped to a close related reference genome:
+ contigs up to a few megabases mapped to a closely related reference genome:
bwa mem ref.fa reads.fq > aln.sam
-* Illumina single-end reads no longer than ~70bp:
+* Illumina single-end reads shorter than ~70bp:
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
@@ -84,24 +92,21 @@ settings:
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
-* Illumina paired-end reads no longer than ~70bp:
+* Illumina paired-end reads shorter than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
- bwa samse ref.fa reads.sai reads.fq > aln-pe.sam
+ bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
-* PacBio subreads to a reference genome:
+* PacBio subreads or Oxford Nanopore reads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam
-
-* PacBio subreads to themselves (the output is not SAM):
-
- bwa mem -x pbread reads.fq reads.fq > overlap.pas
+ bwa mem -x ont2d ref.fa reads.fq > aln.sam
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small.
-As is shown above, with non-default settings, BWA-MEM works with PacBio subreads
-with a sequencing error rate as high as ~15%.
+As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
+reads with a sequencing error rate over 20%.
####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
@@ -130,37 +135,22 @@ case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
as well. BWA-MEM does not have this problem.
-####<a name="h38"></a>6. How to map sequences to GRCh38 with ALT contigs?
-
-BWA-backtrack and BWA-MEM partially support mapping to a reference containing
-ALT contigs that represent alternative alleles highly divergent from the
-reference genome.
-
- # download the K8 executable required by bwa-helper.js
- wget http://sourceforge.net/projects/lh3/files/k8/k8-0.2.1.tar.bz2/download
- tar -jxf k8-0.2.1.tar.bz2
-
- # download the ALT-to-GRCh38 alignment in the SAM format
- wget http://sourceforge.net/projects/bio-bwa/files/hs38.alt.sam.gz/download
-
- # download the GRCh38 sequences with ALT contigs
- wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
+####<a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
- # index and mapping
- bwa index -p hs38a GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
- bwa mem -h50 hs38a reads.fq | ./k8-linux bwa-helper.js genalt hs38.alt.sam.gz > out.sam
+Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT.
+BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please
+see [README-alt.md][18] for details. Briefly, it is recommended to use
+[bwakit][17], the binary release of BWA, for generating the reference genome
+and for mapping.
-Here, option `-h50` asks bwa-mem to output multiple hits in the XA tag if the
-read has 50 or fewer hits. For each SAM line containing the XA tag,
-`bwa-helper.js genalt` decodes the alignments in the XA tag, groups hits lifted
-to the same chromosomal region, adjusts mapping quality and outputs all the
-hits overlapping the reported hit. A read may be mapped to both the primary
-assembly and one or more ALT contigs all with high mapping quality.
+####<a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
-Note that this procedure assumes reads are single-end and may miss hits to
-highly repetitive regions as these hits will not be reported with option
-`-h50`. `bwa-helper.js` is a prototype implementation not recommended for
-production uses.
+If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM
+without post-processing. The alignments produced this way are very close to
+alignments against GRCh38 without ALT contigs. Nonetheless, applying
+post-processing helps to reduce false mappings caused by reads from the
+diverged part of ALT contigs and also enables HLA typing. It is recommended to
+run the post-processing script.
@@ -180,3 +170,5 @@ production uses.
[14]: http://zlib.net/
[15]: https://github.com/lh3/bwa/tree/mem
[16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/
+[17]: http://sourceforge.net/projects/bio-bwa/files/bwakit/
+[18]: https://github.com/lh3/bwa/blob/master/README-alt.md
diff --git a/bntseq.c b/bntseq.c
index eddae84..465e383 100644
--- a/bntseq.c
+++ b/bntseq.c
@@ -37,6 +37,9 @@
#include "kseq.h"
KSEQ_DECLARE(gzFile)
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, int)
+
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
@@ -94,7 +97,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix)
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
{
- char str[1024];
+ char str[8192];
FILE *fp;
const char *fname;
bntseq_t *bns;
@@ -117,14 +120,14 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
if (scanres != 2) goto badread;
p->name = strdup(str);
// read fasta comments
- while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
+ while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
while (c != '\n' && c != EOF) c = fgetc(fp);
if (c == EOF) {
scanres = EOF;
goto badread;
}
*q = 0;
- if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
+ if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
else p->anno = strdup("");
// read the rest
scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
@@ -165,11 +168,41 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
bntseq_t *bns_restore(const char *prefix)
{
- char ann_filename[1024], amb_filename[1024], pac_filename[1024];
+ char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024];
+ FILE *fp;
+ bntseq_t *bns;
strcat(strcpy(ann_filename, prefix), ".ann");
strcat(strcpy(amb_filename, prefix), ".amb");
strcat(strcpy(pac_filename, prefix), ".pac");
- return bns_restore_core(ann_filename, amb_filename, pac_filename);
+ bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
+ if (bns == 0) return 0;
+ if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
+ char str[1024];
+ khash_t(str) *h;
+ int c, i, absent;
+ khint_t k;
+ h = kh_init(str);
+ for (i = 0; i < bns->n_seqs; ++i) {
+ k = kh_put(str, h, bns->anns[i].name, &absent);
+ kh_val(h, k) = i;
+ }
+ i = 0;
+ while ((c = fgetc(fp)) != EOF) {
+ if (c == '\t' || c == '\n' || c == '\r') {
+ str[i] = 0;
+ if (str[0] != '@') {
+ k = kh_get(str, h, str);
+ if (k != kh_end(h))
+ bns->anns[kh_val(h, k)].is_alt = 1;
+ }
+ while (c != '\n' && c != EOF) c = fgetc(fp);
+ i = 0;
+ } else str[i++] = c; // FIXME: potential segfault here
+ }
+ kh_destroy(str, h);
+ fclose(fp);
+ }
+ return bns;
}
void bns_destroy(bntseq_t *bns)
@@ -201,7 +234,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_
}
p = bns->anns + bns->n_seqs;
p->name = strdup((char*)seq->name.s);
- p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
+ p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
p->gi = 0; p->len = seq->seq.l;
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
p->n_ambs = 0;
@@ -333,8 +366,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
{
int is_rev, rid_b, rid_e;
if (rb < bns->l_pac && re > bns->l_pac) return -2;
+ assert(rb <= re);
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
- rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1);
+ rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
return rid_b == rid_e? rid_b : -1;
}
diff --git a/bntseq.h b/bntseq.h
index 6437cf6..03671d6 100644
--- a/bntseq.h
+++ b/bntseq.h
@@ -43,6 +43,7 @@ typedef struct {
int32_t len;
int32_t n_ambs;
uint32_t gi;
+ int32_t is_alt;
char *name, *anno;
} bntann1_t;
diff --git a/bwa-helper.js b/bwa-helper.js
deleted file mode 100644
index 8a234b8..0000000
--- a/bwa-helper.js
+++ /dev/null
@@ -1,706 +0,0 @@
-/*****************************************************************
- * The K8 Javascript interpreter is required to run this script. *
- * *
- * Source code: https://github.com/attractivechaos/k8 *
- * Binary: http://sourceforge.net/projects/lh3/files/k8/ *
- * *
- * Data file used for generating GRCh38 ALT alignments: *
- * *
- * http://sourceforge.net/projects/bio-bwa/files/ *
- *****************************************************************/
-
-/******************
- *** From k8.js ***
- ******************/
-
-var getopt = function(args, ostr) {
- var oli; // option letter list index
- if (typeof(getopt.place) == 'undefined')
- getopt.ind = 0, getopt.arg = null, getopt.place = -1;
- if (getopt.place == -1) { // update scanning pointer
- if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
- getopt.place = -1;
- return null;
- }
- if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
- ++getopt.ind;
- getopt.place = -1;
- return null;
- }
- }
- var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
- if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
- if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
- if (getopt.place < 0) ++getopt.ind;
- return '?';
- }
- if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
- getopt.arg = null;
- if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
- } else { // need an argument
- if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
- getopt.arg = args[getopt.ind].substr(getopt.place);
- else if (args.length <= ++getopt.ind) { // no arg
- getopt.place = -1;
- if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
- return '?';
- } else getopt.arg = args[getopt.ind]; // white space
- getopt.place = -1;
- ++getopt.ind;
- }
- return optopt;
-}
-
-function obj2str(o)
-{
- if (typeof(o) != 'object') {
- return o.toString();
- } else if (o == null) {
- return "null";
- } else if (Array.isArray(o)) {
- var s = "[";
- for (var i = 0; i < o.length; ++i) {
- if (i) s += ',';
- s += obj2str(o[i]);
- }
- return s + "]";
- } else {
- var i = 0, s = "{";
- for (var key in o) {
- if (i++) s += ',';
- s += key + ":";
- s += obj2str(o[key]);
- }
- return s + "}";
- }
-}
-
-Bytes.prototype.reverse = function()
-{
- for (var i = 0; i < this.length>>1; ++i) {
- var tmp = this[i];
- this[i] = this[this.length - i - 1];
- this[this.length - i - 1] = tmp;
- }
-}
-
-Bytes.prototype.revcomp = function()
-{
- if (Bytes.rctab == null) {
- var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
- var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
- Bytes.rctab = [];
- for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0;
- for (var i = 0; i < s1.length; ++i)
- Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
- }
- for (var i = 0; i < this.length>>1; ++i) {
- var tmp = this[this.length - i - 1];
- this[this.length - i - 1] = Bytes.rctab[this[i]];
- this[i] = Bytes.rctab[tmp];
- }
- if (this.length>>1)
- this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
-}
-
-/************************
- *** command markovlp ***
- ************************/
-
-function bwa_markOvlp(args)
-{
- var c, min_aln_ratio = .9, min_ext = 50;
- while ((c = getopt(args, "r:e:")) != null) {
- if (c == 'r') min_aln_ratio = parseFloat(getopt.arg);
- else if (c == 'e') min_ext = parseInt(getopt.arg);
- }
-
- var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
- var buf = new Bytes();
- var dir4 = ['>>', '><', '<>', '<<'];
-
- while (file.readline(buf) >= 0) {
- var t = buf.toString().split("\t")
- for (var i = 0; i < t.length; ++i)
- if (i != 0 && i != 4)
- t[i] = parseInt(t[i]);
- var el, a1, a2, e1, e2, r1, r2; // a: aligned length; e: extended length; r: remaining length
- e2 = a2 = t[7] - t[6];
- if (t[2] < t[3]) { // forward-forward match
- e1 = a1 = t[3] - t[2];
- r1 = t[2] - t[6];
- r2 = (t[5] - t[7]) - (t[1] - t[3]);
- el = r1 > 0? t[6] : t[2];
- el += r2 > 0? t[1] - t[3] : t[5] - t[7];
- } else { // reverse-forward match
- e1 = a1 = t[2] - t[3];
- r1 = (t[1] - t[2]) - t[6];
- r2 = (t[5] - t[7]) - t[3];
- el = r1 > 0? t[6] : t[1] - t[2];
- el += r2 > 0? t[3] : t[5] - t[7];
- }
- e1 += el; e2 += el;
- var type;
- if (a1 / e1 >= min_aln_ratio && a2 / e2 >= min_aln_ratio) {
- if ((r1 >= min_ext && r2 >= min_ext) || (r1 <= -min_ext && r2 <= -min_ext)) { // suffix-prefix match
- var d = t[2] < t[3]? 0 : 2;
- if (r1 < 0) d ^= 3; // reverse the direction
- type = 'O' + dir4[d];
- } else type = 'C' + (e1 / t[1] > e2 / t[5]? 1 : 2);
- } else type = 'I'; // internal local match; not a suffix-prefix match
- //print(t[1], e1, a1, t[5], e2, a2);
- print(type, buf);
- }
-
- buf.destroy();
- file.close();
-}
-
-/***********************
- *** command pas2bed ***
- ***********************/
-
-function bwa_pas2reg(args)
-{
- var file = args.length? new File(args[0]) : new File();
- var buf = new Bytes();
-
- while (file.readline(buf) >= 0) {
- var t = buf.toString().split("\t");
- if (t[0] == t[4]) continue;
- if (parseInt(t[2]) < parseInt(t[3])) print(t[0], t[1], t[2], t[3], t[8]);
- else print(t[0], t[1], t[3], t[2], t[8]);
- print(t[4], t[5], t[6], t[7], t[8]);
- }
-
- buf.destroy();
- file.close();
-}
-
-/*******************
- * command sam2pas *
- *******************/
-
-function bwa_sam2pas(args)
-{
- var file = args.length == 0? new File() : new File(args[0]);
- var buf = new Bytes();
- var seq_dict = {};
-
- while (file.readline(buf) >= 0) {
- var line = buf.toString();
- var m;
- if (/^@SQ/.test(line)) {
- var name = null, len = null;
- if ((m = /\tSN:(\S+)/.exec(line)) != null) name = m[1];
- if ((m = /\tLN:(\S+)/.exec(line)) != null) len = parseInt(m[1]);
- if (name != null && len != null) seq_dict[name] = len;
- }
- if (/^@/.test(line)) continue;
- var t = line.split("\t");
- var pos = parseInt(t[3]) - 1;
- var x = 0, y = 0, i = 0, clip = [0, 0], n_ins = 0, n_del = 0, o_ins = 0, o_del = 0, n_M = 0;
- var re = /(\d+)([MIDSH])/g;
- while ((m = re.exec(t[5])) != null) {
- var l = parseInt(m[1]);
- if (m[2] == 'M') x += l, y += l, n_M += l;
- else if (m[2] == 'I') y += l, n_ins += l, ++o_ins;
- else if (m[2] == 'D') x += l, n_del += l, ++o_del;
- else if (m[2] == 'S' || m[2] == 'H')
- clip[i == 0? 0 : 1] = l;
- ++i;
- }
- var is_rev = (parseInt(t[1]) & 16)? true : false;
- var misc = 'mapQ=' + t[4] + ';';
- var usc = 1;
- if ((m = /\tNM:i:(\d+)/.exec(line)) != null) {
- var NM = parseInt(m[1]);
- var diff = (NM / (n_M + n_ins + n_del)).toFixed(3);
- misc += 'diff=' + diff + ';n_mis=' + (NM - n_del - n_ins) + ';';
- }
- misc += 'n_del='+n_del+';n_ins='+n_ins+';o_del='+o_del+';o_ins='+o_ins + ';';
- if ((m = /\tAS:i:(\d+)/.exec(line)) != null) {
- misc += 'AS='+m[1] + ';';
- usc = (parseInt(m[1]) / (x > y? x : y)).toFixed(3);
- }
- if ((m = /\tXS:i:(\d+)/.exec(line)) != null) misc += 'XS='+m[1] + ';';
- var len = y + clip[0] + clip[1];
- var z = [t[0], len, clip[0], clip[0] + y, t[2], seq_dict[t[2]], pos, pos + x, usc, misc];
- if (parseInt(t[1]) & 16) z[2] = clip[1] + y, z[3] = clip[1];
- print(z.join("\t"));
- }
-
- buf.destroy();
- file.close();
-}
-
-/***********************
- *** command reg2cut ***
- ***********************/
-
-function bwa_reg2cut(args)
-{
- var c, min_usc = 0.5, min_ext = 100, min_len = 5000, cut = 250;
- while ((c = getopt(args, "s:e:l:c:")) != null) {
- if (c == 's') min_usc = parseFloat(getopt.arg);
- else if (c == 'e') min_ext = parseInt(getopt.arg);
- else if (c == 'l') min_len = parseInt(getopt.arg);
- else if (c == 'c') cut = parseInt(getopt.arg);
- }
-
- var file = args.length == getopt.ind? new File () : new File(args[getopt.ind]);
- var buf = new Bytes();
-
- function print_bed() {
- for (var i = 0; i < a.length; ++i) {
- var start = a[i][0] - cut > 0? a[i][0] : 0;
- var end = a[i][1] + cut < last_len? a[i][1] : last_len;
- if (end - start >= min_len) print(last_chr, start, end);
- }
- }
-
- var last_chr = null, last_len = null, max_c_usc = 0, start = 0, end = 0;
- var a = [];
- while (file.readline(buf) >= 0) {
- var t = buf.toString().split("\t");
- t[1] = parseInt(t[1]);
- t[2] = parseInt(t[2]);
- t[3] = parseInt(t[3]);
- t[4] = parseFloat(t[4]);
- var is_contained = t[2] < min_ext && t[1] - t[3] < min_ext? true : false;
- if (t[3] - t[2] < cut<<1) continue;
- t[2] += cut; t[3] -= cut;
- if (t[0] != last_chr) {
- a.push([start, end]);
- if (last_chr != null && max_c_usc < min_usc) print_bed();
- last_chr = t[0]; last_len = t[1]; start = t[2]; end = t[3];
- max_c_usc = is_contained? t[4] : 0;
- a.length = 0;
- } else {
- if (is_contained)
- max_c_usc = max_c_usc > t[4]? max_c_usc : t[4];
- if (t[4] < min_usc) continue;
- if (t[2] > end) {
- a.push([start, end]);
- start = t[2];
- end = end > t[3]? end : t[3];
- } else end = end > t[3]? end : t[3];
- }
- }
- a.push([start, end]);
- if (max_c_usc < min_usc) print_bed(); // the last sequence
-
- buf.destroy();
- file.close();
-}
-
-function bwa_shortname(args)
-{
- var file = args.length? new File(args[0]) : new File();
- var buf = new Bytes();
-
- var re = /(\S+)\/(\d+)_(\d+)((:\d+-\d+)+)/g;
- var re2 = /:(\d+)-(\d+)/g;
- while (file.readline(buf) >= 0) {
- var match, match2;
- var line = buf.toString();
- var x = [];
- while ((match = re.exec(line)) != null) {
- var start = parseInt(match[2]), len = parseInt(match[3]) - start;
- while ((match2 = re2.exec(match[4])) != null) {
- var a = parseInt(match2[1]) - 1;
- var b = parseInt(match2[2]);
- start += a; len = b - a;
- }
- x.push([match[0], match[1] + '/' + start.toString() + '_' + (start+len).toString()]);
- }
- for (var i = 0; i < x.length; ++i)
- line = line.replace(x[i][0], x[i][1]);
- print(line);
- }
-
- buf.destroy();
- file.close();
-}
-
-/*******************
- * Command gff2sam *
- *******************/
-
-function bwa_gff2sam(args)
-{
- if (args.length < 2) {
- print("Usage: k8 bwa-helper.js <aln.gff> <query-length.txt>");
- exit(1);
- }
-
- var file = new File(args[1]);
- var buf = new Bytes();
- var len = {};
-
- while (file.readline(buf) >= 0) {
- var t = buf.toString().split(/\s+/);
- len[t[0]] = parseInt(t[1]);
- }
- file.close();
-
- file = new File(args[0]);
- var re_cigar = /([MID])(\d+)/g;
- var lineno = 0;
- while (file.readline(buf) >= 0) {
- ++lineno;
- var t = buf.toString().split("\t");
- var m = /Target=(\S+)\s+(\d+)\s+(\d+)\s+([+-])/.exec(t[8]);
- if (m == null) {
- warn("WARNING: skipped line "+lineno+" due to the lack of Target.");
- continue;
- }
- var qname = m[1];
- var flag = t[6] == m[4]? 0 : 16;
- var qb = parseInt(m[2]) - 1, qe = parseInt(m[3]), qlen = len[qname];
- if (qlen == null)
- throw Error("Sequence "+qname+" is not present in the query-length.txt");
- var clip5 = qb, clip3 = qlen - qe;
- if (flag&16) clip5 ^= clip3, clip3 ^= clip5, clip5 ^= clip3; // swap
-
- m = /Gap\s*=\s*(([MID]\d+\s*)+)/.exec(t[8]);
- var cigar = clip5? clip5 + 'S' : '';
- var n_ins = 0, n_del = 0, n_match = 0, NM = null;
- if (m) {
- var mc;
- while ((mc = re_cigar.exec(m[1])) != null) {
- var l = parseInt(mc[2]);
- cigar += mc[2] + mc[1];
- if (mc[1] == 'I') n_ins += l;
- else if (mc[1] == 'D') n_del += l;
- else if (mc[1] == 'M') n_match += l;
- }
- if (n_ins + n_match != qe - qb || n_del + n_match != parseInt(t[4]) - parseInt(t[3]) + 1)
- throw Error("Inconsistent CIGAR at line "+lineno);
- } else { // ungapped alignment
- var tb = parseInt(t[3]) - 1, te = parseInt(t[4]);
- if (te - tb != qe - qb) {
- warn("WARNING: line "+lineno+" should contain gaps, but lacks Gap. Skipped.\n");
- } else cigar = (qe - qb) + 'M';
- }
- if (clip3) cigar += clip3 + 'S';
- if ((m = /num_mismatch=(\d+)/.exec(t[8])) != null)
- NM = parseInt(m[1]) + n_ins + n_del;
- var out = [qname, flag, t[0], t[3], 255, cigar, '*', 0, 0, '*', '*'];
- if (NM != null) out.push('NM:i:' + NM);
- print(out.join("\t"));
- }
- buf.destroy();
- file.close();
-}
-
-
-/******************************
- *** Generate ALT alignment ***
- ******************************/
-
-function intv_ovlp(intv, bits) // interval index
-{
- if (typeof bits == "undefined") bits = 13;
- intv.sort(function(a,b) {return a[0]-b[0];});
- // create the index
- var idx = [], max = 0;
- for (var i = 0; i < intv.length; ++i) {
- var b = intv[i][0]>>bits;
- var e = (intv[i][1]-1)>>bits;
- if (b != e) {
- for (var j = b; j <= e; ++j)
- if (idx[j] == null) idx[j] = i;
- } else if (idx[b] == null) idx[b] = i;
- max = max > e? max : e;
- }
- // closure
- return function(_b, _e) {
- var x = _b >> bits;
- if (x > max) return [];
- var off = idx[x];
- if (off == null) {
- var i;
- for (i = ((_e - 1) >> bits) - 1; i >= 0; --i)
- if (idx[i] != null) break;
- off = i < 0? 0 : idx[i];
- }
- var ovlp = [];
- for (var i = off; i < intv.length && intv[i][0] < _e; ++i)
- if (intv[i][1] > _b) ovlp.push(intv[i]);
- return ovlp;
- }
-}
-
-function bwa_genalt(args)
-{
- var re_cigar = /(\d+)([MIDSHN])/g;
-
- function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
- {
- var x = 0, y = 0;
- for (var i = 0; i < cigar.length; ++i) {
- var op = cigar[i][0], len = cigar[i][1];
- if (op == 'M') {
- if (y <= pos && pos < y + len)
- return x + (pos - y);
- x += len, y += len;
- } else if (op == 'D') {
- x += len;
- } else if (op == 'I') {
- if (y <= pos && pos < y + len)
- return x;
- y += len;
- } else if (op == 'S' || op == 'H') {
- if (y <= pos && pos < y + len)
- return -1;
- y += len;
- }
- }
- return -1;
- }
-
- function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5]
- {
- var h = {};
- h.ctg = s[0];
- h.start = parseInt(s[1].substr(1)) - 1;
- h.rev = (s[1].charAt(0) == '-');
- h.cigar = s[2];
- h.NM = parseInt(s[3]);
- var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip;
- l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0;
- while ((m = re_cigar.exec(h.cigar)) != null) {
- var l = parseInt(m[1]);
- if (m[2] == 'M') l_match += l;
- else if (m[2] == 'D') ++n_del, l_del += l;
- else if (m[2] == 'I') ++n_ins, l_ins += l;
- else if (m[2] == 'N') l_skip += l;
- else if (m[2] == 'H' || m[2] == 'S') l_clip += l;
- }
- h.end = h.start + l_match + l_del + l_skip;
- h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins;
- h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499);
- h.l_query = l_match + l_ins + l_clip;
- return h;
- }
-
- var c, opt = { a:1, b:4, o:6, e:1, verbose:3 };
-
- while ((c = getopt(args, 'v:')) != null) {
- if (c == 'v') opt.verbose = parseInt(getopt.arg);
- }
-
- if (args.length == getopt.ind) {
- print("Usage: k8 bwa-helper.js genalt <alt.sam> [aln.sam]");
- exit(1);
- }
-
- var file, buf = new Bytes();
- var aux = new Bytes(); // used for reverse and reverse complement
-
- // read the ALT-to-REF alignment and generate the index
- var intv = {};
- file = new File(args[getopt.ind]);
- while (file.readline(buf) >= 0) {
- var line = buf.toString();
- var t = line.split("\t");
- if (line.charAt(0) == '@') continue;
- var flag = parseInt(t[1]);
- var m, cigar = [], l_qaln = 0, l_qclip = 0;
- while ((m = re_cigar.exec(t[5])) != null) {
- var l = parseInt(m[1]);
- cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
- if (m[2] == 'M' || m[2] == 'I') l_qaln += l;
- else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
- }
- var j = flag&16? cigar.length-1 : 0;
- var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
- if (intv[t[0]] == null) intv[t[0]] = [];
- intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]);
- //print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar);
- }
- file.close();
- // create the interval index
- var idx = {};
- for (var ctg in intv)
- idx[ctg] = intv_ovlp(intv[ctg]);
-
- // process SAM
- file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
- while (file.readline(buf) >= 0) {
- var m, line = buf.toString();
- if (line.charAt(0) == '@' || (m = /\tXA:Z:(\S+)/.exec(line)) == null) { // TODO: this does not work with PE file
- if (opt.verbose < 4) print(line);
- continue;
- }
-
- // parse hits
- var hits = [];
- var XA_strs = m[1].split(";");
- var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
- var t = line.split("\t");
- var flag = parseInt(t[1]);
- hits.push(parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt));
- for (var i = 0; i < XA_strs.length; ++i) // hits in the XA tag
- if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
- hits.push(parse_hit(XA_strs[i].split(","), opt));
-
- // lift mapping positions to coordinates on the primary assembly
- var n_lifted = 0;
- for (var i = 0; i < hits.length; ++i) {
- var h = hits[i];
-
- if (idx[h.ctg] == null) continue;
- var a = idx[h.ctg](h.start, h.end);
- if (a == null || a.length == 0) continue;
-
- // find the approximate position on the primary assembly
- var lifted = [];
- for (var j = 0; j < a.length; ++j) {
- var s, e;
- if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
- s = cigar2pos(a[j][6], h.start);
- e = cigar2pos(a[j][6], h.end - 1) + 1;
- } else {
- s = cigar2pos(a[j][6], a[j][2] - h.end);
- e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
- }
- if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
- s += a[j][5]; e += a[j][5];
- lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
- }
- if (lifted.length) ++n_lifted, hits[i].lifted = lifted;
- }
- if (n_lifted == 0) {
- if (opt.verbose < 4) print(line);
- continue;
- }
-
- // group hits
- for (var i = 0; i < hits.length; ++i) { // set keys for sorting
- if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used
- hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
- else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
- hits[i].i = i; // keep the original index
- }
- hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
- var last_chr = null, end = 0, g = -1;
- for (var i = 0; i < hits.length; ++i) {
- if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
- else if (hits[i].pstart >= end) ++g;
- hits[i].g = g;
- end = end > hits[i].pend? end : hits[i].pend;
- }
- var reported_g = null, reported_i = null;
- for (var i = 0; i < hits.length; ++i)
- if (hits[i].i == 0)
- reported_g = hits[i].g, reported_i = i;
- var n_group0 = 0; // #hits overlapping the reported hit
- for (var i = 0; i < hits.length; ++i)
- if (hits[i].g == reported_g)
- ++n_group0;
- if (n_group0 == 1) { // then keep the reported alignment and mapQ
- if (opt.verbose < 4) print(line);
- continue;
- }
-
- // re-estimate mapQ
- var group_max = [];
- for (var i = 0; i < hits.length; ++i) {
- var g = hits[i].g;
- if (group_max[g] == null || group_max[g][0] < hits[i].score)
- group_max[g] = [hits[i].score, g];
- }
- if (group_max.length > 1)
- group_max.sort(function(x,y) {return y[0]-x[0]});
- var mapQ;
- if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM
- mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
- } else mapQ = 0;
- mapQ = mapQ < 60? mapQ : 60;
- var ori_mapQ = parseInt(t[4]);
- mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
-
- // generate lifted_str
- for (var i = 0; i < hits.length; ++i) {
- if (hits[i].lifted && hits[i].lifted.length) {
- var lifted = hits[i].lifted;
- var u = '';
- for (var j = 0; j < lifted.length; ++j)
- u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
- hits[i].lifted_str = u;
- }
- }
-
- // generate reversed quality and reverse-complemented sequence if necessary
- var rs = null, rq = null; // reversed quality and reverse complement sequence
- var need_rev = false;
- for (var i = 0; i < hits.length; ++i) {
- if (hits[i].g != reported_g || i == reported_i) continue;
- if (hits[i].rev != hits[reported_i].rev)
- need_rev = true;
- }
- if (need_rev) { // reverse and reverse complement
- aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
- aux.set(t[10],0); aux.reverse(); rq = aux.toString();
- }
-
- // print
- t[4] = mapQ;
- t.push("om:i:"+ori_mapQ);
- if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str);
- print(t.join("\t"));
- var cnt = 0;
- for (var i = 0; i < hits.length; ++i) {
- if (opt.verbose >= 5) print(obj2str(hits[i]));
- if (hits[i].g != reported_g || i == reported_i) continue;
- var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
- // update name
- if (flag&0x40) s[0] += "/1";
- if (flag&0x80) s[0] += "/2";
- s[0] += "_" + (++cnt);
- if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]);
- else s.push(rs, rq);
- s.push("NM:i:" + hits[i].NM);
- if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
- print(s.join("\t"));
- }
- }
- file.close();
-
- aux.destroy();
- buf.destroy();
-}
-
-/*********************
- *** Main function ***
- *********************/
-
-function main(args)
-{
- if (args.length == 0) {
- print("\nUsage: k8 bwa-helper.js <command> [arguments]\n");
- print("Commands: genalt generate ALT alignments");
- print(" sam2pas convert SAM to pairwise alignment summary format (PAS)");
- print(" pas2reg extract covered regions");
- print(" reg2cut regions to extract for the 2nd round bwa-mem");
- print(" markovlp identify bi-directional overlaps");
- print(" gff2sam convert GFF3 alignment to SAM");
- print(" shortname shorten sequence name after subseq (PacBio read names only)");
- print("");
- exit(1);
- }
-
- var cmd = args.shift();
- if (cmd == 'sam2pas') bwa_sam2pas(args);
- else if (cmd == 'gff2sam') bwa_gff2sam(args);
- else if (cmd == 'markovlp') bwa_markOvlp(args);
- else if (cmd == 'pas2reg') bwa_pas2reg(args);
- else if (cmd == 'reg2cut') bwa_reg2cut(args);
- else if (cmd == 'genalt') bwa_genalt(args);
- else if (cmd == 'shortname') bwa_shortname(args);
- else warn("Unrecognized command");
-}
-
-main(arguments);
diff --git a/bwa.1 b/bwa.1
index ba8bc9e..e30f382 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "19 May 2014" "bwa-0.7.9-r783" "Bioinformatics tools"
+.TH bwa 1 "23 December 2014" "bwa-0.7.12-r1034" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
@@ -75,7 +75,7 @@ appropriate algorithm will be chosen automatically.
.TP
.B mem
.B bwa mem
-.RB [ -aCHMpP ]
+.RB [ -aCHjMpP ]
.RB [ -t
.IR nThreads ]
.RB [ -k
@@ -88,6 +88,12 @@ appropriate algorithm will be chosen automatically.
.IR seedSplitRatio ]
.RB [ -c
.IR maxOcc ]
+.RB [ -D
+.IR chainShadow ]
+.RB [ -m
+.IR maxMateSW ]
+.RB [ -W
+.IR minSeedMatch ]
.RB [ -A
.IR matchScore ]
.RB [ -B
@@ -102,6 +108,8 @@ appropriate algorithm will be chosen automatically.
.IR unpairPen ]
.RB [ -R
.IR RGline ]
+.RB [ -H
+.IR HDlines ]
.RB [ -v
.IR verboseLevel ]
.I db.prefix
@@ -193,9 +201,28 @@ Discard a MEM if it has more than
.I INT
occurence in the genome. This is an insensitive parameter. [500]
.TP
+.BI -D \ INT
+Drop chains shorter than
+.I FLOAT
+fraction of the longest overlapping chain [0.5]
+.TP
+.BI -m \ INT
+Perform at most
+.I INT
+rounds of mate-SW [50]
+.TP
+.BI -W \ INT
+Drop a chain if the number of bases in seeds is smaller than
+.IR INT .
+This option is primarily used for longer contigs/reads. When positive, it also
+affects seed filtering. [0]
+.TP
.B -P
In the paired-end mode, perform SW to rescue missing hits only but do not try to find
hits that fit a proper pair.
+
+.TP
+.B SCORING OPTIONS:
.TP
.BI -A \ INT
Matching score. [1]
@@ -233,8 +260,9 @@ more aggressive read pair. [17]
.B INPUT/OUTPUT OPTIONS:
.TP
.B -p
-Assume the first input query file is interleaved paired-end FASTA/Q. See the
-command description for details.
+Smart pairing. If two adjacent reads have the same name, they are considered
+to form a read pair. This way, paired-end and single-end reads can be mixed
+in a single FASTA/Q stream.
.TP
.BI -R \ STR
Complete read group header line. '\\t' can be used in
@@ -243,15 +271,30 @@ and will be converted to a TAB in the output SAM. The read group ID will be
attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
[null]
.TP
+.BI -H \ ARG
+If ARG starts with @, it is interpreted as a string and gets inserted into the
+output SAM header; otherwise, ARG is interpreted as a file with all lines
+starting with @ in the file inserted into the SAM header. [null]
+.TP
.BI -T \ INT
Don't output alignment with score lower than
.IR INT .
This option affects output and occasionally SAM flag 2. [30]
.TP
-.BI -h \ INT
+.BI -j
+Treat ALT contigs as part of the primary assembly (i.e. ignore the
+.I db.prefix.alt
+file).
+.TP
+.BI -h \ INT[,INT2]
If a query has not more than
.I INT
-hits with score higher than 80% of the best hit, output them all in the XA tag [5]
+hits with score higher than 80% of the best hit, output them all in the XA tag.
+If
+.I INT2
+is specified, BWA-MEM outputs up to
+.I INT2
+hits if the list contains a hit to an ALT contig. [5,200]
.TP
.B -a
Output all found alignments for single-end or unpaired paired-end reads. These
@@ -571,6 +614,7 @@ R 0x0020 strand of the mate
s 0x0100 the alignment is not primary
f 0x0200 QC failure
d 0x0400 optical or PCR duplicate
+S 0x0800 supplementary alignment
.TE
.PP
@@ -604,8 +648,6 @@ _
XS Suboptimal alignment score
XF Support from forward/reverse alignment
XE Number of supporting seeds
-_
-XP Alt primary hits; format: /(chr,pos,CIGAR,mapQ,NM;)+/
.TE
.PP
diff --git a/bwa.c b/bwa.c
index 43d8a58..f9ce6a1 100644
--- a/bwa.c
+++ b/bwa.c
@@ -7,6 +7,7 @@
#include "ksw.h"
#include "utils.h"
#include "kstring.h"
+#include "kvec.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
@@ -14,6 +15,7 @@
int bwa_verbose = 3;
char bwa_rg_id[256];
+char *bwa_pg;
/************************
* Batch FASTA/Q reader *
@@ -54,10 +56,12 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
}
trim_readno(&ks->name);
kseq2bseq1(ks, &seqs[n]);
+ seqs[n].id = n;
size += seqs[n++].l_seq;
if (ks2) {
trim_readno(&ks2->name);
kseq2bseq1(ks2, &seqs[n]);
+ seqs[n].id = n;
size += seqs[n++].l_seq;
}
if (size >= chunk_size && (n&1) == 0) break;
@@ -70,6 +74,24 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
return seqs;
}
+void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
+{
+ int i, has_last;
+ kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
+ for (i = 1, has_last = 1; i < n; ++i) {
+ if (has_last) {
+ if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
+ kv_push(bseq1_t, a[1], seqs[i-1]);
+ kv_push(bseq1_t, a[1], seqs[i]);
+ has_last = 0;
+ } else kv_push(bseq1_t, a[0], seqs[i-1]);
+ } else has_last = 1;
+ }
+ if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
+ sep[0] = a[0].a, m[0] = a[0].n;
+ sep[1] = a[1].a, m[1] = a[1].n;
+}
+
/*****************
* CIGAR related *
*****************/
@@ -227,7 +249,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
return bwt;
}
-bwaidx_t *bwa_idx_load(const char *hint, int which)
+bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
{
bwaidx_t *idx;
char *prefix;
@@ -239,7 +261,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_BNS) {
+ int i, c;
idx->bns = bns_restore(prefix);
+ for (i = c = 0; i < idx->bns->n_seqs; ++i)
+ if (idx->bns->anns[i].is_alt) ++c;
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
if (which & BWA_IDX_PAC) {
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
@@ -251,27 +278,113 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
return idx;
}
+bwaidx_t *bwa_idx_load(const char *hint, int which)
+{
+ return bwa_idx_load_from_disk(hint, which);
+}
+
void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
- if (idx->bwt) bwt_destroy(idx->bwt);
- if (idx->bns) bns_destroy(idx->bns);
- if (idx->pac) free(idx->pac);
+ if (idx->mem == 0) {
+ if (idx->bwt) bwt_destroy(idx->bwt);
+ if (idx->bns) bns_destroy(idx->bns);
+ if (idx->pac) free(idx->pac);
+ } else {
+ free(idx->bwt); free(idx->bns->anns); free(idx->bns);
+ if (!idx->is_shm) free(idx->mem);
+ }
free(idx);
}
+int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
+{
+ int64_t k = 0, x;
+ int i;
+
+ // generate idx->bwt
+ x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
+ x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
+ x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
+
+ // generate idx->bns and idx->pac
+ x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
+ x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
+ x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
+ for (i = 0; i < idx->bns->n_seqs; ++i) {
+ idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
+ idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
+ }
+ idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
+ assert(k == l_mem);
+
+ idx->l_mem = k; idx->mem = mem;
+ return 0;
+}
+
+int bwa_idx2mem(bwaidx_t *idx)
+{
+ int i;
+ int64_t k, x, tmp;
+ uint8_t *mem;
+
+ // copy idx->bwt
+ x = idx->bwt->bwt_size * 4;
+ mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
+ memmove(mem + sizeof(bwt_t), mem, x);
+ memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
+ x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
+ free(idx->bwt->sa);
+ free(idx->bwt); idx->bwt = 0;
+
+ // copy idx->bns
+ tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
+ for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
+ tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
+ mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
+ x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
+ x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
+ free(idx->bns->ambs);
+ x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
+ for (i = 0; i < idx->bns->n_seqs; ++i) {
+ x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
+ x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
+ free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
+ }
+ free(idx->bns->anns);
+
+ // copy idx->pac
+ x = idx->bns->l_pac/4+1;
+ mem = realloc(mem, k + x);
+ memcpy(mem + k, idx->pac, x); k += x;
+ free(idx->bns); idx->bns = 0;
+ free(idx->pac); idx->pac = 0;
+
+ return bwa_mem2idx(k, mem, idx);
+}
+
/***********************
* SAM header routines *
***********************/
-void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
+void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
{
- int i;
+ int i, n_SQ = 0;
extern char *bwa_pg;
- for (i = 0; i < bns->n_seqs; ++i)
- err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
- if (rg_line) err_printf("%s\n", rg_line);
- err_printf("%s\n", bwa_pg);
+ if (hdr_line) {
+ const char *p = hdr_line;
+ while ((p = strstr(p, "@SQ\t")) != 0) {
+ if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
+ p += 4;
+ }
+ }
+ if (n_SQ == 0) {
+ for (i = 0; i < bns->n_seqs; ++i)
+ err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
+ } else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
+ fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
+ if (hdr_line) err_printf("%s\n", hdr_line);
+ if (bwa_pg) err_printf("%s\n", bwa_pg);
}
static char *bwa_escape(char *s)
@@ -319,3 +432,16 @@ err_set_rg:
return 0;
}
+char *bwa_insert_header(const char *s, char *hdr)
+{
+ int len = 0;
+ if (s == 0 || s[0] != '@') return hdr;
+ if (hdr) {
+ len = strlen(hdr);
+ hdr = realloc(hdr, len + strlen(s) + 2);
+ hdr[len++] = '\n';
+ strcpy(hdr + len, s);
+ } else hdr = strdup(s);
+ bwa_escape(hdr + len);
+ return hdr;
+}
diff --git a/bwa.h b/bwa.h
index bbc2525..1541c1c 100644
--- a/bwa.h
+++ b/bwa.h
@@ -10,14 +10,20 @@
#define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7
+#define BWA_CTL_SIZE 0x10000
+
typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
+
+ int is_shm;
+ int64_t l_mem;
+ uint8_t *mem;
} bwaidx_t;
typedef struct {
- int l_seq;
+ int l_seq, id;
char *name, *comment, *seq, *qual, *sam;
} bseq1_t;
@@ -29,6 +35,7 @@ extern "C" {
#endif
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
+ void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
@@ -37,11 +44,16 @@ extern "C" {
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
+ bwaidx_t *bwa_idx_load_from_shm(const char *hint);
+ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
+ int bwa_idx2mem(bwaidx_t *idx);
+ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
- void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
+ void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
char *bwa_set_rg(const char *s);
+ char *bwa_insert_header(const char *s, char *hdr);
#ifdef __cplusplus
}
diff --git a/bwakit/README.md b/bwakit/README.md
new file mode 100644
index 0000000..b3a4033
--- /dev/null
+++ b/bwakit/README.md
@@ -0,0 +1,115 @@
+## Introduction
+
+Bwakit is a self-consistent installation-free package of scripts and precompiled
+binaries, providing an end-to-end solution to read mapping. In addition to the
+basic mapping functionality implemented in bwa, bwakit is able to generate
+proper human reference genome and to take advantage of ALT contigs, if present,
+to improve read mapping and to perform HLA typing for high-coverage human data.
+It can remap name- or coordinate-sorted BAM with read group and barcode
+information retained. Bwakit also *optionally* trims adapters (via
+[trimadap][ta]), marks duplicates (via [samblaster][sb]) and sorts the final
+alignment (via [samtools][smtl]).
+
+Bwakit has two entry scripts: `run-gen-ref` which downloads and generates human
+reference genomes, and `run-bwamem` which prints mapping command lines on the
+standard output that can be piped to `sh` to execute. The two scripts will call
+other programs or use data in `bwa.kit`. The following shows an example about
+how to use bwakit:
+
+```sh
+# Download the bwa-0.7.11 binary package (download link may change)
+wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
+ | gzip -dc | tar xf -
+# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
+bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa # create BWA index
+# mapping
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh
+```
+
+The last mapping command line will generate the following files:
+
+* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this
+ file, one read may be placed on multiple overlapping ALT contigs at the same
+ time even if the read is mapped better to some contigs than others. This makes
+ it possible to analyze each contig independent of others.
+
+* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes.
+
+* `out.hla.all`: other possible genotypes on the six HLA genes.
+
+* `out.log.*`: bwa-mem, samblaster and HLA typing log files.
+
+Bwakit can be [downloaded here][res]. It is only available to x86_64-linux. The
+scripts in the package are available in the [bwa/bwakit][kit] directory.
+Packaging is done manually for now.
+
+## Limitations
+
+* HLA typing only works for high-coverage human data. The typing accuracy can
+ still be improved. We encourage researchers to develop better HLA typing tools
+ based on the intermediate output of bwakit (for each HLA gene included in the
+ index, bwakit writes all reads matching it in a separate file).
+
+* Duplicate marking only works when all reads from a single paired-end library
+ are provided as the input. This limitation is the necessary tradeoff of fast
+ MarkDuplicate provided by samblaster.
+
+* The adapter trimmer is chosen as it is fast, pipe friendly and does not
+ discard reads. However, it is conservative and suboptimal. If this is a
+ concern, it is recommended to preprocess input reads with a more sophisticated
+ adapter trimmer. We also hope existing trimmers can be modified to operate on
+ an interleaved FASTQ stream. We will replace trimadap once a better trimmer
+ meets our needs.
+
+* Bwakit can be memory demanding depends on the functionality invoked. For 30X
+ human data, bwa-mem takes about 11GB RAM with 32 threads, samblaster uses
+ close to 10GB and BAM shuffling (if the input is sorted BAM) uses several GB.
+ In the current setting, sorting uses about 10GB.
+
+
+## Package Contents
+```
+bwa.kit
+|-- README.md This README file.
+|-- run-bwamem *Entry script* for the entire mapping pipeline.
+|-- bwa *BWA binary*
+|-- k8 Interpretor for *.js scripts.
+|-- bwa-postalt.js Post-process alignments to ALT contigs/decoys/HLA genes.
+|-- htsbox Used by run-bwamem for shuffling BAMs and BAM=>FASTQ.
+|-- samblaster MarkDuplicates for reads from the same library. v0.1.20
+|-- samtools SAMtools for sorting and SAM=>BAM conversion. v1.1
+|-- seqtk For FASTQ manipulation.
+|-- trimadap Trim Illumina PE sequencing adapters.
+|
+|-- run-gen-ref *Entry script* for generating human reference genomes.
+|-- resource-GRCh38 Resources for generating GRCh38
+| |-- hs38DH-extra.fa Decoy and HLA gene sequences. Used by run-gen-ref.
+| `-- hs38DH.fa.alt ALT-to-GRCh38 alignment. Used by run-gen-ref.
+|
+|-- run-HLA HLA typing for sequences extracted by bwa-postalt.js.
+|-- typeHLA.sh Type one HLA-gene. Called by run-HLA.
+|-- typeHLA.js HLA typing from exon-to-contig alignment. Used by typeHLA.sh.
+|-- typeHLA-selctg.js Select contigs overlapping HLA exons. Used by typeHLA.sh.
+|-- fermi2.pl Fermi2 wrapper. Used by typeHLA.sh for de novo assembly.
+|-- fermi2 Fermi2 binary. Used by fermi2.pl.
+|-- ropebwt2 RopeBWT2 binary. Used by fermi2.pl.
+|-- resource-human-HLA Resources for HLA typing
+| |-- HLA-ALT-exons.bed Exonic regions of HLA ALT contigs. Used by typeHLA.sh.
+| |-- HLA-CDS.fa CDS of HLA-{A,B,C,DQA1,DQB1,DRB1} genes from IMGT/HLA-3.18.0.
+| |-- HLA-ALT-type.txt HLA types for each HLA ALT contig. Not used.
+| `-- HLA-ALT-idx BWA indices of each HLA ALT contig. Used by typeHLA.sh
+| `-- (...)
+|
+`-- doc BWA documentations
+ |-- bwa.1 Manpage
+ |-- NEWS.md Release Notes
+ |-- README.md GitHub README page
+ `-- README-alt.md Documentation for ALT mapping
+```
+
+[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
+[sb]: https://github.com/GregoryFaust/samblaster
+[ta]: https://github.com/lh3/seqtk/blob/master/trimadap.c
+[smtl]: http://www.htslib.org
+[kit]: https://github.com/lh3/bwa/tree/master/bwakit
diff --git a/bwakit/bwa-postalt.js b/bwakit/bwa-postalt.js
new file mode 100644
index 0000000..5717045
--- /dev/null
+++ b/bwakit/bwa-postalt.js
@@ -0,0 +1,524 @@
+/*****************************************************************
+ * The K8 Javascript interpreter is required to run this script. *
+ * *
+ * Source code: https://github.com/attractivechaos/k8 *
+ * Binary: http://sourceforge.net/projects/lh3/files/k8/ *
+ * *
+ * Data file used for generating GRCh38 ALT alignments: *
+ * *
+ * http://sourceforge.net/projects/bio-bwa/files/ *
+ *****************************************************************/
+
+/******************
+ *** From k8.js ***
+ ******************/
+
+// Parse command-line options. A BSD getopt() clone in javascript.
+var getopt = function(args, ostr) {
+ var oli; // option letter list index
+ if (typeof(getopt.place) == 'undefined')
+ getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+ if (getopt.place == -1) { // update scanning pointer
+ if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+ getopt.place = -1;
+ return null;
+ }
+ if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+ ++getopt.ind;
+ getopt.place = -1;
+ return null;
+ }
+ }
+ var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+ if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+ if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
+ if (getopt.place < 0) ++getopt.ind;
+ return '?';
+ }
+ if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+ getopt.arg = null;
+ if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+ } else { // need an argument
+ if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+ getopt.arg = args[getopt.ind].substr(getopt.place);
+ else if (args.length <= ++getopt.ind) { // no arg
+ getopt.place = -1;
+ if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+ return '?';
+ } else getopt.arg = args[getopt.ind]; // white space
+ getopt.place = -1;
+ ++getopt.ind;
+ }
+ return optopt;
+}
+
+// reverse a string
+Bytes.prototype.reverse = function()
+{
+ for (var i = 0; i < this.length>>1; ++i) {
+ var tmp = this[i];
+ this[i] = this[this.length - i - 1];
+ this[this.length - i - 1] = tmp;
+ }
+}
+
+// reverse complement a DNA string
+Bytes.prototype.revcomp = function()
+{
+ if (Bytes.rctab == null) {
+ var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
+ var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
+ Bytes.rctab = [];
+ for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0;
+ for (var i = 0; i < s1.length; ++i)
+ Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
+ }
+ for (var i = 0; i < this.length>>1; ++i) {
+ var tmp = this[this.length - i - 1];
+ this[this.length - i - 1] = Bytes.rctab[this[i]];
+ this[i] = Bytes.rctab[tmp];
+ }
+ if (this.length>>1)
+ this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
+}
+
+// create index for a list of intervals for fast interval queries; ported from bedidx.c in samtools
+function intv_ovlp(intv, bits)
+{
+ if (typeof bits == "undefined") bits = 13;
+ intv.sort(function(a,b) {return a[0]-b[0];});
+ // create the index
+ var idx = [], max = 0;
+ for (var i = 0; i < intv.length; ++i) {
+ var b = intv[i][0]>>bits;
+ var e = (intv[i][1]-1)>>bits;
+ if (b != e) {
+ for (var j = b; j <= e; ++j)
+ if (idx[j] == null) idx[j] = i;
+ } else if (idx[b] == null) idx[b] = i;
+ max = max > e? max : e;
+ }
+ // closure
+ return function(_b, _e) {
+ var x = _b >> bits;
+ if (x > max) return [];
+ var off = idx[x];
+ if (off == null) {
+ var i;
+ for (i = ((_e - 1) >> bits) - 1; i >= 0; --i)
+ if (idx[i] != null) break;
+ off = i < 0? 0 : idx[i];
+ }
+ var ovlp = [];
+ for (var i = off; i < intv.length && intv[i][0] < _e; ++i)
+ if (intv[i][1] > _b) ovlp.push(intv[i]);
+ return ovlp;
+ }
+}
+
+var re_cigar = /(\d+)([MIDSHN])/g;
+
+/******************************
+ *** Generate ALT alignment ***
+ ******************************/
+
+// given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
+function cigar2pos(cigar, pos)
+{
+ var x = 0, y = 0;
+ for (var i = 0; i < cigar.length; ++i) {
+ var op = cigar[i][0], len = cigar[i][1];
+ if (op == 'M') {
+ if (y <= pos && pos < y + len)
+ return x + (pos - y);
+ x += len, y += len;
+ } else if (op == 'D') {
+ x += len;
+ } else if (op == 'I') {
+ if (y <= pos && pos < y + len)
+ return x;
+ y += len;
+ } else if (op == 'S' || op == 'H') {
+ if (y <= pos && pos < y + len)
+ return -1;
+ y += len;
+ }
+ }
+ return -1;
+}
+
+// Parse a hit. $s is an array that looks something like ["chr1", "+12345", "100M", 5]
+// Return an object keeping various information about the alignment.
+function parse_hit(s, opt)
+{
+ var h = {};
+ h.ctg = s[0];
+ h.start = parseInt(s[1].substr(1)) - 1;
+ h.rev = (s[1].charAt(0) == '-');
+ h.cigar = s[2];
+ h.NM = parseInt(s[3]);
+ h.hard = false;
+ var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip;
+ l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0;
+ while ((m = re_cigar.exec(h.cigar)) != null) {
+ var l = parseInt(m[1]);
+ if (m[2] == 'M') l_match += l;
+ else if (m[2] == 'D') ++n_del, l_del += l;
+ else if (m[2] == 'I') ++n_ins, l_ins += l;
+ else if (m[2] == 'N') l_skip += l;
+ else if (m[2] == 'H' || m[2] == 'S') {
+ l_clip += l;
+ if (m[2] == 'H') h.hard = true;
+ }
+ }
+ h.end = h.start + l_match + l_del + l_skip;
+ h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins;
+ h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499);
+ h.l_query = l_match + l_ins + l_clip;
+ return h;
+}
+
+function print_buffer(buf2, fp_hla, hla) // output alignments
+{
+ if (buf2.length == 0) return;
+ for (var i = 0; i < buf2.length; ++i)
+ print(buf2[i].join("\t"));
+ if (fp_hla != null) {
+ var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+');
+ for (var x in hla) {
+ if (fp_hla[x] != null);
+ fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n');
+ }
+ }
+}
+
+function collect_hla_hits(idx, ctg, start, end, hla) // collect reads hit to HLA genes
+{
+ var m, ofunc = idx[ctg];
+ if (ofunc == null) return;
+ var ovlp_alt = ofunc(start, end);
+ for (var i = 0; i < ovlp_alt.length; ++i)
+ if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null)
+ hla[m[1]] = true;
+}
+
+function bwa_postalt(args)
+{
+ var version = "r985";
+ var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 };
+
+ while ((c = getopt(args, 'vp:r:')) != null) {
+ if (c == 'p') opt.pre = getopt.arg;
+ else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
+ else if (c == 'v') { print(version); exit(0); }
+ }
+ if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;
+
+ if (args.length == getopt.ind) {
+ print("");
+ print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
+ print("Options: -p STR prefix of output files containting sequences matching HLA genes [null]");
+ print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
+ print(" -v show version number");
+ print("");
+ print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
+ print(" the primary assembly, groups them and then estimates mapQ across groups. If");
+ print(" a non-ALT hit overlaps a lifted ALT hit, its mapping quality is set to the");
+ print(" smaller between its original mapQ and the adjusted mapQ of the ALT hit. If");
+ print(" multiple ALT hits are lifted to the same position, they will yield new SAM");
+ print(" lines with the same mapQ.");
+ print("");
+ exit(1);
+ }
+
+ var aux = new Bytes(); // used for reverse and reverse complement
+ var buf = new Bytes(); // line reading buffer
+
+ // read ALT-to-REF alignment
+ var intv_alt = {}, intv_pri = {}, hla_ctg = {}, is_alt = {}, hla_chr = null;
+ var file = new File(args[getopt.ind]);
+ while (file.readline(buf) >= 0) {
+ var line = buf.toString();
+ if (line.charAt(0) == '@') continue;
+ var t = line.split("\t");
+ if (t.length < 11) continue; // incomplete lines
+ is_alt[t[0]] = true;
+ var pos = parseInt(t[3]) - 1;
+ var flag = parseInt(t[1]);
+ if ((flag&4) || t[2] == '*') continue;
+ var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
+ if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
+ if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
+ ++hla_ctg[m[1]];
+ hla_chr = t[2];
+ }
+ while ((m = re_cigar.exec(t[5])) != null) {
+ var l = parseInt(m[1]);
+ cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
+ if (m[2] == 'M') l_qaln += l, l_tlen += l;
+ else if (m[2] == 'I') l_qaln += l;
+ else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
+ else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
+ }
+ var j = flag&16? cigar.length-1 : 0;
+ var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
+ if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
+ intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
+ if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
+ intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
+ }
+ file.close();
+ var idx_alt = {}, idx_pri = {};
+ for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
+ for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
+
+ // initialize the list of HLA contigs
+ var fp_hla = null;
+ if (opt.pre) {
+ fp_hla = {};
+ for (var h in hla_ctg)
+ fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
+ }
+
+ // process SAM
+ var buf2 = [], hla = {};
+ file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
+ while (file.readline(buf) >= 0) {
+ var m, line = buf.toString();
+
+ if (line.charAt(0) == '@') { // print and then skip the header line
+ print(line);
+ continue;
+ }
+
+ var t = line.split("\t");
+ t[1] = parseInt(t[1]); t[3] = parseInt(t[3]); t[4] = parseInt(t[4]);
+
+ // print bufferred reads
+ if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) {
+ print_buffer(buf2, fp_hla, hla);
+ buf2 = [], hla = {};
+ }
+
+ // skip unmapped lines
+ if (t[1]&4) {
+ buf2.push(t);
+ continue;
+ }
+
+ // parse the reported hit
+ var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
+ var flag = t[1];
+ var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
+ if (t[2] == hla_chr) collect_hla_hits(idx_pri, h.ctg, h.start, h.end, hla);
+
+ if (h.hard) { // the following does not work with hard clipped alignments
+ buf2.push(t);
+ continue;
+ }
+ var hits = [h];
+
+ // parse hits in the XA tag
+ if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
+ var XA_strs = m[1].split(";");
+ for (var i = 0; i < XA_strs.length; ++i)
+ if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
+ hits.push(parse_hit(XA_strs[i].split(","), opt));
+ }
+
+ // check if there are ALT hits
+ var has_alt = false;
+ for (var i = 0; i < hits.length; ++i)
+ if (is_alt[hits[i].ctg] != null) {
+ has_alt = true;
+ break;
+ }
+ if (!has_alt) {
+ buf2.push(t);
+ continue;
+ }
+
+ // lift mapping positions to the primary assembly
+ var n_rpt_lifted = 0, rpt_lifted = null;
+ for (var i = 0; i < hits.length; ++i) {
+ var a, h = hits[i];
+
+ if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
+ continue;
+
+ // find the approximate position on the primary assembly
+ var lifted = [];
+ for (var j = 0; j < a.length; ++j) {
+ var s, e;
+ if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
+ s = cigar2pos(a[j][6], h.start);
+ e = cigar2pos(a[j][6], h.end - 1) + 1;
+ } else {
+ s = cigar2pos(a[j][6], a[j][2] - h.end);
+ e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
+ }
+ if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
+ s += a[j][5]; e += a[j][5];
+ lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
+ if (i == 0) ++n_rpt_lifted;
+ }
+ if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
+ if (lifted.length) hits[i].lifted = lifted;
+ }
+
+ // prepare for hits grouping
+ for (var i = 0; i < hits.length; ++i) { // set keys for sorting
+ if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
+ hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
+ else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
+ hits[i].i = i; // keep the original index
+ }
+
+ // group hits based on the lifted positions on non-ALT sequences
+ if (hits.length > 1) {
+ hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
+ var last_chr = null, end = 0, g = -1;
+ for (var i = 0; i < hits.length; ++i) {
+ if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
+ else if (hits[i].pstart >= end) ++g;
+ hits[i].g = g;
+ end = end > hits[i].pend? end : hits[i].pend;
+ }
+ } else hits[0].g = 0;
+
+ // find the index and group id of the reported hit; find the size of the reported group
+ var reported_g = null, reported_i = null, n_group0 = 0;
+ if (hits.length > 1) {
+ for (var i = 0; i < hits.length; ++i)
+ if (hits[i].i == 0)
+ reported_g = hits[i].g, reported_i = i;
+ for (var i = 0; i < hits.length; ++i)
+ if (hits[i].g == reported_g)
+ ++n_group0;
+ } else {
+ if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
+ buf2.push(t);
+ continue;
+ }
+ reported_g = reported_i = 0, n_group0 = 1;
+ }
+
+ // re-estimate mapping quality if necessary
+ var mapQ, ori_mapQ = t[4];
+ if (n_group0 > 1) {
+ var group_max = [];
+ for (var i = 0; i < hits.length; ++i) {
+ var g = hits[i].g;
+ if (group_max[g] == null || group_max[g][0] < hits[i].score)
+ group_max[g] = [hits[i].score, g];
+ }
+ if (group_max.length > 1)
+ group_max.sort(function(x,y) {return y[0]-x[0]});
+ if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM
+ mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
+ } else mapQ = 0;
+ mapQ = mapQ < 60? mapQ : 60;
+ if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
+ else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
+ } else mapQ = t[4];
+
+ // find out whether the read is overlapping HLA genes
+ if (hits[reported_i].pctg == hla_chr) {
+ var rpt_start = 1<<30, rpt_end = 0;
+ for (var i = 0; i < hits.length; ++i) {
+ var h = hits[i];
+ if (h.g == reported_g) {
+ rpt_start = rpt_start < h.pstart? rpt_start : h.pstart;
+ rpt_end = rpt_end > h.pend ? rpt_end : h.pend;
+ }
+ }
+ collect_hla_hits(idx_pri, hla_chr, rpt_start, rpt_end, hla);
+ }
+
+ // adjust the mapQ of the primary hits
+ if (n_rpt_lifted <= 1) {
+ var l = n_rpt_lifted == 1? rpt_lifted : null;
+ for (var i = 0; i < buf2.length; ++i) {
+ var s = buf2[i], is_ovlp = true;
+ if (l != null) {
+ if (l[0] != s[2]) is_ovlp = false; // different chr
+ else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
+ else {
+ var start = s[3] - 1, end = start;
+ while ((m = re_cigar.exec(t[5])) != null)
+ if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
+ end += parseInt(m[1]);
+ if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
+ }
+ } else is_ovlp = false;
+ // get the "pa" tag if present
+ var om = -1, pa = 10.;
+ for (var j = 11; j < s.length; ++j)
+ if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
+ om = parseInt(m[1]);
+ else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null)
+ pa = parseFloat(m[1]);
+ if (is_ovlp) { // overlapping the lifted hit
+ if (om > 0) s[4] = om;
+ s[4] = s[4] < mapQ? s[4] : mapQ;
+ } else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa
+ if (om < 0) s.push("om:i:" + s[4]);
+ s[4] = 0;
+ }
+ }
+ }
+
+ // generate lifted_str
+ for (var i = 0; i < hits.length; ++i) {
+ if (hits[i].lifted && hits[i].lifted.length) {
+ var u = '', lifted = hits[i].lifted;
+ for (var j = 0; j < lifted.length; ++j)
+ u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
+ hits[i].lifted_str = u;
+ }
+ }
+
+ // stage the reported hit
+ t[4] = mapQ;
+ if (n_group0 > 1) t.push("om:i:"+ori_mapQ);
+ if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str);
+ buf2.push(t);
+
+ // stage the hits generated from the XA tag
+ var cnt = 0, rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence
+ var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? m[1] : null;
+ for (var i = 0; i < hits.length; ++i) {
+ if (hits[i].g != reported_g || i == reported_i) continue;
+ if (idx_alt[hits[i].ctg] == null) continue;
+ var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]];
+ if (t[6] == '=' && s[2] != t[2]) s[6] = t[2];
+ // print sequence/quality and set the rev flag
+ if (hits[i].rev == hits[reported_i].rev) {
+ s.push(t[9], t[10]);
+ s[1] = flag | 0x800;
+ } else { // we need to write the reverse sequence
+ if (rs == null || rq == null) {
+ aux.length = 0;
+ aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
+ aux.set(t[10],0); aux.reverse(); rq = aux.toString();
+ }
+ s.push(rs, rq);
+ s[1] = (flag ^ 0x10) | 0x800;
+ }
+ s.push("NM:i:" + hits[i].NM);
+ if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
+ if (rg != null) s.push(rg);
+ buf2.push(s);
+ }
+ }
+ print_buffer(buf2, fp_hla, hla);
+ file.close();
+ if (fp_hla != null)
+ for (var h in fp_hla)
+ fp_hla[h].close();
+
+ buf.destroy();
+ aux.destroy();
+}
+
+bwa_postalt(arguments);
diff --git a/bwakit/run-HLA b/bwakit/run-HLA
new file mode 100755
index 0000000..4fee16f
--- /dev/null
+++ b/bwakit/run-HLA
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+ctg_opt=""
+if [ $# -gt 1 ] && [ $1 == '-A' ]; then
+ ctg_opt="-A"
+ shift
+fi
+
+if [ $# -eq 0 ]; then
+ echo "Usage: $0 <prefix>"
+ exit 1
+fi
+
+for f in $1.HLA-*.fq; do
+ gene=`echo $f | perl -pe 's/^.*(HLA-[A-Z]+[0-9]*).*fq$/$1/'`
+ echo -e "\n*** Processing gene $gene...\n" >&2
+ `dirname $0`/typeHLA.sh $ctg_opt $1 $gene
+done
+
+ls $1.HLA-*.gt | xargs -i echo grep ^GT {} \| head -1 | sh | sed "s,^GT,$1,"
diff --git a/bwakit/run-bwamem b/bwakit/run-bwamem
new file mode 100755
index 0000000..165f93e
--- /dev/null
+++ b/bwakit/run-bwamem
@@ -0,0 +1,186 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+my %opts = (t=>1);
+getopts("SadskHo:R:x:t:", \%opts);
+
+die('
+Usage: run-bwamem [options] <idxbase> <file1> [file2]
+
+Options: -o STR prefix for output files [inferred from input]
+ -R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null]
+ -x STR read type: pacbio, ont2d or intractg [default]
+ intractg: intra-species contig (kb query, highly similar)
+ pacbio: pacbio subreads (~10kb query, high error rate)
+ ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
+ -t INT number of threads [1]
+
+ -H apply HLA typing
+ -a trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
+ -d mark duplicate (via samblaster)
+ -S for BAM input, don\'t shuffle
+ -s sort the output alignment (via samtools; requring more RAM)
+ -k keep temporary files generated by typeHLA
+
+Examples:
+
+ * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
+
+ run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz
+
+ Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
+ with the quality of input. It is only intended for research purpose, not for diagnostic.
+
+ * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and
+ sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
+ the two types. Specifying -R stops read group transfer.
+
+ run-bwamem -sao prefix hs38DH.fa old-srt.bam
+
+ Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
+ mapping pipeline better. It is conservative and suboptimal. A more sophisticated
+ trimmer is recommended if this becomes a concern.
+
+ * Remap name-grouped BAM and mark duplicates:
+
+ run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam
+
+ Note: streamed duplicate marking requires all reads from a single paired-end library
+ to be aligned at the same time.
+
+Output files:
+
+ {-o}.aln.bam - final alignment
+ {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
+ {-o}.hla.all - additional HLA genotypes consistent with data
+ {-o}.log.* - log files
+
+') if @ARGV < 2;
+
+my $idx = $ARGV[0];
+
+my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
+my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
+$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root);
+die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
+
+die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
+ unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
+
+if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
+ warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
+ @ARGV = 2;
+}
+
+if (defined($opts{p}) && @ARGV >= 3) {
+ warn("WARNING: option -P is ignored as there are two input sequence files.\n");
+ delete $opts{p};
+}
+
+my $prefix;
+if (defined $opts{o}) {
+ $prefix = $opts{o};
+} elsif (@ARGV >= 3) {
+ my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
+ my $i;
+ for ($i = 0; $i < $len; ++$i) {
+ last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
+ }
+ $prefix = substr($ARGV[1], 0, $i) if $i > 0;
+} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
+ $prefix = $1;
+}
+die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);
+
+my $size = 0;
+my $comp_ratio = 3.;
+for my $f (@ARGV[1..$#ARGV]) {
+ my @a = stat($f);
+ my $s = $a[7];
+ die("ERROR: failed to read file $f\n") if !defined($s);
+ $s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
+ $size += int($s) + 1;
+}
+
+my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
+my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
+
+if (defined($opts{x})) {
+ delete($opts{d}); delete($opts{a}); delete $opts{p};
+}
+
+# for BAM input, find @RG header lines
+my @RG_lines = ();
+if ($is_bam && !defined($opts{R})) {
+ my $fh;
+ open($fh, "$root/samtools view -H $ARGV[1] |") || die;
+ while (<$fh>) {
+ chomp;
+ if (/^\@RG\t/) {
+ s/\t/\\t/g;
+ push(@RG_lines, "-H'$_'");
+ }
+ }
+ close($fh);
+}
+
+warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0;
+
+my $cmd = '';
+if ($is_bam) {
+ my $cmd_sam2bam = "cat $ARGV[1] \\\n";
+ my $ntmps = int($size / 4e9) + 1;
+ my $cmd_shuf = !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
+ my $bam2fq_opt = @RG_lines > 0? " -t" : "";
+ my $cmd_bam2fq = " | $root/htsbox bam2fq -O$bam2fq_opt - \\\n";
+ $cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
+} elsif (@ARGV >= 3) {
+ $cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
+} else {
+ $cmd = "cat $ARGV[1] \\\n";
+}
+
+my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
+$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0;
+
+$cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
+$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
+$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
+
+my $has_hla = 0;
+if (-f "$ARGV[0].alt") {
+ my $fh;
+ open($fh, "$ARGV[0].alt") || die;
+ while (<$fh>) {
+ $has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
+ }
+ close($fh);
+ my $hla_pre = $has_hla? "-p $prefix.hla " : "";
+ $cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
+}
+
+my $t_sort = $opts{t} < 4? $opts{t} : 4;
+$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n";
+
+if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
+ $cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
+ $cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
+ $cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
+}
+
+print $cmd;
+
+sub which
+{
+ my $file = shift;
+ my $path = (@_)? shift : $ENV{PATH};
+ return if (!defined($path));
+ foreach my $x (split(":", $path)) {
+ $x =~ s/\/$//;
+ return "$x/$file" if (-x "$x/$file");
+ }
+ return;
+}
diff --git a/bwakit/run-gen-ref b/bwakit/run-gen-ref
new file mode 100755
index 0000000..e129b62
--- /dev/null
+++ b/bwakit/run-gen-ref
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+root=`dirname $0`
+
+url38="ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
+url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
+
+if [ $# -eq 0 ]; then
+ echo "Usage: $0 <hs38|hs38a|hs38DH|hs37|hs37d5>"
+ echo "Analysis sets:"
+ echo " hs38 primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV"
+ echo " hs38a hs38 plus ALT contigs"
+ echo " hs38DH hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)"
+ echo " hs37 primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome"
+ echo " hs37d5 hs37 plus decoy contigs (used by 1000g phase 3)"
+ echo ""
+ echo "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional"
+ echo " sequences and ALT-to-REF mapping included in the bwa.kit package."
+ exit 1;
+fi
+
+if [ $1 == "hs38DH" ]; then
+ (wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38DH-extra.fa) > $1.fa
+ [ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38DH.fa.alt $1.fa.alt
+elif [ $1 == "hs38a" ]; then
+ wget -O- $url38 | gzip -dc > $1.fa
+ [ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38DH.fa.alt > $1.fa.alt
+elif [ $1 == "hs38" ]; then
+ wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa
+elif [ $1 == "hs37d5" ]; then
+ wget -O- $url37d5 | gzip -dc > $1.fa 2>/dev/null
+elif [ $1 == "hs37" ]; then
+ wget -O- $url37d5 | gzip -dc 2>/dev/null | awk '/^>/{f=/>hs37d5/?0:1}f' > $1.fa
+else
+ echo "ERROR: unknown genome build"
+fi
+
+[ ! -f $1.fa.bwt ] && echo -e "\nPlease run 'bwa index $1.fa'...\n"
+
diff --git a/bwakit/typeHLA-selctg.js b/bwakit/typeHLA-selctg.js
new file mode 100644
index 0000000..0e02a65
--- /dev/null
+++ b/bwakit/typeHLA-selctg.js
@@ -0,0 +1,62 @@
+var min_ovlp = 30;
+
+if (arguments.length < 3) {
+ print("Usage: k8 selctg.js <HLA-gene> <HLA-ALT-exons.bed> <ctg-to-ALT.sam> [min_ovlp="+min_ovlp+"]");
+ exit(1);
+}
+
+if (arguments.length >= 4) min_ovlp = parseInt(arguments[3]);
+var gene = arguments[0];
+
+var buf = new Bytes();
+
+var h = {};
+var file = new File(arguments[1]);
+while (file.readline(buf) >= 0) {
+ var t = buf.toString().split("\t");
+ if (t[3] != gene) continue;
+ if (h[t[0]] == null) h[t[0]] = [];
+ h[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
+}
+file.close();
+
+var s = {}, re = /(\d+)([MIDSHN])/g;
+file = new File(arguments[2]);
+while (file.readline(buf) >= 0) {
+ var line = buf.toString();
+ var m, t = line.split("\t");
+ var x = h[t[2]];
+ if (x == null) continue;
+
+ var start = parseInt(t[3]) - 1, end = start;
+ while ((m = re.exec(t[5])) != null) // parse CIGAR to get the end position
+ if (m[2] == 'M' || m[2] == 'D')
+ end += parseInt(m[1]);
+
+ var max_ovlp = 0;
+ for (var i = 0; i < x.length; ++i) {
+ var max_left = x[i][0] > start? x[i][0] : start;
+ var min_rght = x[i][1] < end ? x[i][1] : end;
+ max_ovlp = max_ovlp > min_rght - max_left? max_ovlp : min_rght - max_left;
+ }
+
+ var AS = null, XS = null;
+ if ((m = /AS:i:(\d+)/.exec(line)) != null) AS = parseInt(m[1]);
+ if ((m = /XS:i:(\d+)/.exec(line)) != null) XS = parseInt(m[1]);
+
+ if (s[t[0]] == null) s[t[0]] = [];
+ s[t[0]].push([AS, XS, max_ovlp]);
+}
+file.close();
+
+buf.destroy();
+
+for (var x in s) {
+ var is_rejected = false, y = s[x];
+ y.sort(function(a,b) {return b[0]-a[0]});
+ for (var i = 0; i < y.length && y[i][0] == y[0][0]; ++i)
+ if (y[0][2] < min_ovlp || y[i][0] == y[i][1])
+ is_rejected = true;
+ if (is_rejected) continue;
+ print(x);
+}
diff --git a/bwakit/typeHLA.js b/bwakit/typeHLA.js
new file mode 100644
index 0000000..b265d07
--- /dev/null
+++ b/bwakit/typeHLA.js
@@ -0,0 +1,496 @@
+/*****************************************************************
+ * The K8 Javascript interpreter is required to run this script. *
+ * *
+ * Source code: https://github.com/attractivechaos/k8 *
+ * Binary: http://sourceforge.net/projects/lh3/files/k8/ *
+ *****************************************************************/
+
+var getopt = function(args, ostr) {
+ var oli; // option letter list index
+ if (typeof(getopt.place) == 'undefined')
+ getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+ if (getopt.place == -1) { // update scanning pointer
+ if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+ getopt.place = -1;
+ return null;
+ }
+ if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+ ++getopt.ind;
+ getopt.place = -1;
+ return null;
+ }
+ }
+ var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+ if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+ if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
+ if (getopt.place < 0) ++getopt.ind;
+ return '?';
+ }
+ if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+ getopt.arg = null;
+ if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+ } else { // need an argument
+ if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+ getopt.arg = args[getopt.ind].substr(getopt.place);
+ else if (args.length <= ++getopt.ind) { // no arg
+ getopt.place = -1;
+ if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+ return '?';
+ } else getopt.arg = args[getopt.ind]; // white space
+ getopt.place = -1;
+ ++getopt.ind;
+ }
+ return optopt;
+}
+
+/************************
+ * Command line parsing *
+ ************************/
+
+var ver = "r19";
+var c, thres_len = 50, thres_ratio = .8, thres_nm = 5, thres_frac = .33, dbg = false;
+
+// parse command line options
+while ((c = getopt(arguments, "vdl:n:f:")) != null) {
+ if (c == 'l') thres_len = parseInt(getopt.arg);
+ else if (c == 'n') thres_nm = parseInt(getopt.arg);
+ else if (c == 'd') dbg = true;
+ else if (c == 'f') thres_frac = parseFloat(getopt.arg);
+ else if (c == 'v') { print(ver); exit(0); }
+}
+if (arguments.length == getopt.ind) {
+ print("");
+ print("Usage: k8 typeHLA.js [options] <exon-to-contig.sam>\n");
+ print("Options: -n INT drop a contig if the edit distance to the closest gene is >INT ["+thres_nm+"]");
+ print(" -l INT drop a contig if its match too short ["+thres_len+"]");
+ print(" -f FLOAT drop inconsistent contigs if their length <FLOAT fraction of total length ["+thres_ratio.toFixed(2)+"]");
+ print(" -d output extra info for debugging");
+ print(" -v show version number");
+ print("");
+ print("Note: The output is TAB delimited with each GT line consisting of allele1, allele2,");
+ print(" #mismatches/gaps on primary exons, #mismatches/gaps on other exons and #exons");
+ print(" used in typing. If unusure, use the first GT line as the final genotype.\n");
+ exit(1);
+}
+
+/*********************************
+ * Read gene-to-contig alignment *
+ *********************************/
+
+var file = new File(arguments[getopt.ind]);
+var buf = new Bytes();
+var re_cigar = /(\d+)([MIDSH])/g;
+
+var len = {}, list = [], gcnt = [];
+while (file.readline(buf) >= 0) {
+ var m, mm, line = buf.toString();
+ var t = line.split("\t");
+ var flag = parseInt(t[1]);
+ // SAM header
+ if (t[0].charAt(0) == '@') {
+ if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null)
+ len[mm[1]] = parseInt(m[1]);
+ continue;
+ }
+ // parse gene name and exon number
+ var gene = null, exon = null;
+ if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) {
+ gene = m[1], exon = parseInt(m[2]) - 1;
+ if (gcnt[exon] == null) gcnt[exon] = {};
+ gcnt[exon][gene] = true;
+ }
+ if (gene == null || exon == null || t[2] == '*') continue;
+ // parse clipping and aligned length
+ var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0];
+ while ((m = re_cigar.exec(t[5])) != null) {
+ var l = parseInt(m[1]);
+ if (m[2] == 'M') x += l, te += l;
+ else if (m[2] == 'I') x += l;
+ else if (m[2] == 'D') te += l;
+ else if (m[2] == 'S' || m[2] == 'H') clip[x==0?0:1] = l;
+ }
+ var tl = len[t[2]];
+ var left = ts < clip[0]? ts : clip[0];
+ var right = tl - te < clip[1]? tl - te : clip[1];
+ var qs, qe, ql = clip[0] + x + clip[1];
+ if (flag & 16) qs = clip[1], qe = ql - clip[0];
+ else qs = clip[0], qe = ql - clip[1];
+ var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
+ list.push([t[2], gene, exon, ts, te, nm, left + right, qs, qe, ql]); // left+right should be 0 given a prefix-suffix alignment
+}
+
+buf.destroy();
+file.close();
+
+/**************************************
+ * Prepare data structures for typing *
+ **************************************/
+
+// identify the primary exons, the exons associated with most genes
+var pri_exon = [], n_pri_exons;
+{
+ var cnt = [], max = 0;
+ // count the number of genes per exon and track the max
+ for (var e = 0; e < gcnt.length; ++e) {
+ if (gcnt[e] != null) {
+ var c = 0, h = gcnt[e];
+ for (var x in h) ++c;
+ cnt[e] = c;
+ max = max > c? max : c;
+ } else cnt[e] = 0;
+ }
+ warn("- Number of genes for each exon: [" +cnt.join(",") + "]");
+ // find primary exons
+ var pri_list = [];
+ for (var e = 0; e < cnt.length; ++e) {
+ if (cnt[e] == max) pri_list.push(e + 1);
+ pri_exon[e] = cnt[e] == max? 1 : 0;
+ }
+ warn("- List of primary exon(s): ["+pri_list.join(",")+"]");
+ n_pri_exons = pri_list.length;
+}
+
+// convert strings to integers (for performance)
+var ghash = {}, glist = [], chash = {}, clist = [], elist = [];
+for (var i = 0; i < list.length; ++i) {
+ if (ghash[list[i][1]] == null) {
+ ghash[list[i][1]] = glist.length;
+ glist.push(list[i][1]);
+ }
+ if (chash[list[i][0]] == null) {
+ chash[list[i][0]] = clist.length;
+ clist.push(list[i][0]);
+ }
+ var g = ghash[list[i][1]];
+ if (elist[g] == null) elist[g] = {};
+ elist[g][list[i][2]] = true;
+}
+
+// extract the 3rd and 4th digits
+var gsub = [], gsuf = [];
+for (var i = 0; i < glist.length; ++i) {
+ var m = /^HLA-[^*\s]+\*\d+:(\d+).*([A-Z]?)$/.exec(glist[i]);
+ gsub[i] = parseInt(m[1]);
+ gsuf[i] = /[A-Z]$/.test(glist[i])? 1 : 0;
+}
+
+/*************************************************
+ * Collect genes with perfect matches on primary *
+ *************************************************/
+
+// collect exons with fully covered by perfect match(es)
+var perf_exons = [];
+
+function push_perf_exons(matches, last)
+{
+ matches.sort(function(a, b) { return a[0]-b[0]; });
+ var cov = 0, start = 0, end = 0;
+ for (var i = 0; i < matches.length; ++i) {
+ if (matches[i][3] > 0) continue;
+ if (matches[i][0] <= end)
+ end = end > matches[i][1]? end : matches[i][1];
+ else cov += end - start, start = matches[i][0], end = matches[i][1];
+ }
+ cov += end - start;
+ if (matches[0][2] == cov) {
+ if (perf_exons[last[1]] == null) perf_exons[last[1]] = [];
+ //print(last[0], last[1], ghash[last[0]]);
+ perf_exons[last[1]].push(ghash[last[0]]);
+ }
+}
+
+var last = [null, -1], matches = [];
+for (var i = 0; i < list.length; ++i) {
+ var li = list[i];
+ if (last[0] != li[1] || last[1] != li[2]) {
+ if (matches.length) push_perf_exons(matches, last);
+ matches = [];
+ last = [li[1], li[2]];
+ }
+ matches.push([li[7], li[8], li[9], li[5]+li[6]]);
+}
+if (matches.length) push_perf_exons(matches, last);
+
+// for each gene, count how many primary exons are perfect
+var pg_aux_cnt = {};
+for (var e = 0; e < perf_exons.length; ++e) {
+ if (!pri_exon[e]) continue;
+ var pe = perf_exons[e];
+ var n = pe? pe.length : 0;
+ for (var i = 0; i < n; ++i) {
+ var g = pe[i];
+ if (pg_aux_cnt[g] == null) pg_aux_cnt[g] = 1;
+ else ++pg_aux_cnt[g];
+ }
+}
+
+// find genes with perfect matches on the primary exons
+var perf_genes = [];
+for (var g in pg_aux_cnt)
+ if (pg_aux_cnt[g] == n_pri_exons)
+ perf_genes.push(parseInt(g));
+warn("- Found " +perf_genes.length+ " genes fully covered by perfect matches on the primary exon(s)");
+
+var h_perf_genes = {};
+for (var i = 0; i < perf_genes.length; ++i) {
+ if (dbg) print("PG", glist[perf_genes[i]]);
+ h_perf_genes[perf_genes[i]] = true;
+}
+
+/*******************
+ * Filter hit list *
+ *******************/
+
+// reorganize hits to exons
+function list2exons(list, flt_flag, perf_hash)
+{
+ var exons = [];
+ for (var i = 0; i < list.length; ++i) {
+ var li = list[i], c = chash[li[0]], g = ghash[li[1]];
+ if (flt_flag != null && flt_flag[c] == 1) continue;
+ if (perf_hash != null && !perf_hash[g]) continue;
+ if (exons[li[2]] == null) exons[li[2]] = [];
+ exons[li[2]].push([c, g, li[5] + li[6], li[4] - li[3]]);
+ }
+ return exons;
+}
+
+var exons = list2exons(list), flt_flag = [], ovlp_len = [];
+for (var c = 0; c < clist.length; ++c) flt_flag[c] = ovlp_len[c] = 0;
+for (var e = 0; e < exons.length; ++e) {
+ if (!pri_exon[e]) continue;
+ var ee = exons[e];
+ var max_len = [];
+ for (var c = 0; c < clist.length; ++c) max_len[c] = 0;
+ for (var i = 0; i < ee.length; ++i) {
+ var l = ee[i][3] - ee[i][2];
+ if (l < 1) l = 1;
+ if (max_len[ee[i][0]] < l) max_len[ee[i][0]] = l;
+ }
+ for (var c = 0; c < clist.length; ++c) ovlp_len[c] += max_len[c];
+ for (var i = 0; i < ee.length; ++i)
+ flt_flag[ee[i][0]] |= (!h_perf_genes[ee[i][1]] || ee[i][2])? 1 : 1<<1;
+}
+
+var l_cons = 0, l_incons = 0;
+for (var c = 0; c < clist.length; ++c)
+ if (flt_flag[c]&2) l_cons += ovlp_len[c];
+ else if (flt_flag[c] == 1) l_incons += ovlp_len[c];
+
+warn("- Total length of contigs consistent/inconsistent with perfect genes: " +l_cons+ "/" +l_incons);
+var attempt_perf = (l_incons/(l_cons+l_incons) < thres_frac);
+
+/********************************
+ * Core function for genotyping *
+ ********************************/
+
+function type_gene(perf_mode)
+{
+ if (perf_mode) {
+ var flt_list = [];
+ for (var c = 0; c < clist.length; ++c)
+ if (flt_flag[c] == 1) flt_list.push(clist[c]);
+ warn(" - Filtered " +flt_list.length+ " inconsistent contig(s): [" +flt_list.join(",")+ "]");
+ exons = list2exons(list, flt_flag, h_perf_genes);
+ } else exons = list2exons(list);
+
+ /***********************
+ * Score each genotype *
+ ***********************/
+
+ // initialize genotype scores
+ var pair = [];
+ for (var i = 0; i < glist.length; ++i) {
+ pair[i] = [];
+ for (var j = 0; j <= i; ++j)
+ pair[i][j] = 0;
+ }
+
+ // these two arrays are used to output debugging information
+ var score = [], ctg = [];
+
+ function type_exon(e, gt_list)
+ {
+ function update_pair(x, m, is_pri)
+ {
+ var y, z;
+ y = (x>>14&0xff) + m < 0xff? (x>>14&0xff) + m : 0xff;
+ if (is_pri) z = (x>>22) + m < 0xff? (x>>22) + m : 0xff;
+ else z = x>>22;
+ return z<<22 | y<<14 | ((x&0x3fff) + (1<<6|is_pri));
+ }
+
+ score[e] = []; ctg[e] = [];
+ if (exons[e] == null) return;
+ var ee = exons[e], is_pri = pri_exon[e]? 1 : 0;
+ // find contigs and genes associated with the current exon
+ var ch = {}, gh = {};
+ for (var i = 0; i < ee.length; ++i)
+ if (elist[ee[i][1]][e] != null)
+ ch[ee[i][0]] = true, gh[ee[i][1]] = true;
+ var ga = [], ca = ctg[e];
+ for (var c in ch) ca.push(parseInt(c));
+ for (var g in gh) ga.push(parseInt(g));
+ var named_ca = [];
+ for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]);
+ warn(" - Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])...");
+ // set unmapped entries to high mismatch
+ var sc = score[e];
+ for (var k = 0; k < ga.length; ++k) {
+ var g = ga[k];
+ if (sc[g] == null) sc[g] = [];
+ for (var i = 0; i < ca.length; ++i)
+ sc[g][ca[i]] = 0xff;
+ }
+ // convert representation again and compute max_len[]
+ var max_len = [];
+ for (var i = 0; i < ee.length; ++i) {
+ var c = ee[i][0], g = ee[i][1];
+ if (gh[g] == null || ch[c] == null) continue;
+ sc[g][c] = sc[g][c] < ee[i][2]? sc[g][c] : ee[i][2];
+ if (max_len[c] == null) max_len[c] = 0;
+ max_len[c] = max_len[c] > ee[i][3]? max_len[c] : ee[i][3];
+ }
+ // drop mismapped contigs
+ var max_max_len = 0;
+ for (var k = 0; k < ca.length; ++k)
+ max_max_len = max_max_len > max_len[ca[k]]? max_max_len : max_len[ca[k]];
+ var dropped = [];
+ for (var k = 0; k < ca.length; ++k) {
+ var min = 0x7fffffff, c = ca[k];
+ for (var i = 0; i < ga.length; ++i) {
+ var g = ga[i];
+ min = min < sc[g][c]? min : sc[g][c];
+ }
+ dropped[c] = min > thres_nm? true : false;
+ if (max_len[c] < thres_len && max_len[c] < thres_ratio * max_max_len) dropped[c] = true;
+ if (dropped[c]) warn(" . Dropped low-quality contig " +clist[c]+ " (minNM=" +min+ "; maxLen=" +max_len[c]+ ")");
+ }
+ // fill the pair array
+ if (gt_list == null) {
+ for (var i = 0; i < ga.length; ++i) {
+ var m = 0, gi = ga[i], g1 = sc[gi];
+ // homozygous
+ for (var k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) m += g1[c];
+ }
+ pair[gi][gi] = update_pair(pair[gi][gi], m, is_pri);
+ // heterozygous
+ for (var j = i + 1; j < ga.length; ++j) {
+ var gj = ga[j], g2 = sc[gj], m = 0, a = [0, 0];
+ for (var k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) {
+ m += g1[c] < g2[c]? g1[c] : g2[c];
+ ++a[g1[c]<g2[c]? 0:1];
+ }
+ }
+ if (a[0] == 0 || a[1] == 0) m = 0xff; // if all contigs are assigned to one gene, it is not good
+ if (gi < gj) pair[gj][gi] = update_pair(pair[gj][gi], m, is_pri);
+ else pair[gi][gj] = update_pair(pair[gi][gj], m, is_pri);
+ }
+ }
+ } else {
+ var tmp_pairs = [], min = 0xff;
+ for (var i = 0; i < gt_list.length; ++i) {
+ var gt = gt_list[i], m = 0;
+ var g1 = sc[gt[0]], g2 = sc[gt[1]], a = [0, 0];
+ if (g1 == null || g2 == null) continue;
+ if (gt[0] == gt[1]) {
+ for (var k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) m += g1[c];
+ }
+ } else {
+ var a = [0, 0];
+ for (k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) {
+ m += g1[c] < g2[c]? g1[c] : g2[c];
+ ++a[g1[c]<g2[c]? 0:1];
+ }
+ }
+ if (a[0] == 0 || a[1] == 0) m = 0xff;
+ }
+ tmp_pairs.push([gt[0], gt[1], m]);
+ min = min < m? min : m;
+ }
+ if (min < 0xff) {
+ for (var i = 0; i < tmp_pairs.length; ++i) {
+ var t = tmp_pairs[i];
+ pair[t[0]][t[1]] = update_pair(pair[t[0]][t[1]], t[2], is_pri);
+ }
+ } else warn(" . Skipped exon " +(e+1)+ " as the assembly may be incomplete");
+ }
+ }
+
+ // type primary exons
+ warn(" - Processing primary exon(s)...");
+ for (var e = 0; e < exons.length; ++e)
+ if (pri_exon[e]) type_exon(e);
+
+ // generate the list of best genotypes on primary exons
+ var min_nm_pri = 0x7fffffff;
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons)
+ min_nm_pri = min_nm_pri < pair[i][j]>>22? min_nm_pri : pair[i][j]>>22;
+
+ var gt_list = [];
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>22 == min_nm_pri)
+ gt_list.push([i, j]);
+
+ warn(" - Collected " +gt_list.length+ " top genotypes on the primary exon(s); minimal edit distance: " +min_nm_pri);
+
+ // type other exons
+ warn(" - Processing other exon(s)...");
+ for (var e = 0; e < exons.length; ++e)
+ if (!pri_exon[e]) type_exon(e, gt_list);
+
+ /*****************************
+ * Choose the best genotypes *
+ *****************************/
+
+ // genotyping
+ var min_nm = 0x7fffffff;
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons)
+ min_nm = min_nm < pair[i][j]>>14? min_nm : pair[i][j]>>14;
+
+ var out = [];
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>14 <= min_nm + 1)
+ out.push([pair[i][j]>>14, pair[i][j]>>6&0xff, i, j, (gsuf[i] + gsuf[j])<<16|(gsub[i] + gsub[j])]);
+
+ out.sort(function(a, b) { return a[0]!=b[0]? a[0]-b[0] : a[1]!=b[1]? b[1]-a[1] : a[4]!=b[4]? a[4]-b[4] : a[2]!=b[2]? a[2]-b[2] : a[3]-b[3]});
+
+ return out;
+}
+
+/**********************
+ * Perform genotyping *
+ **********************/
+
+warn("- Typing in the imperfect mode...");
+var rst = type_gene(false);
+if (attempt_perf) {
+ warn("- Typing in the perfect mode...");
+ var rst_perf = type_gene(true);
+ warn("- Imperfect vs perfect mode: [" +(rst[0][0]>>8&0xff)+ "," +(rst[0][0]&0xff)+ "] vs [" +(rst_perf[0][0]>>8&0xff)+ "," +(rst_perf[0][0]&0xff)+ "]");
+ if (rst_perf[0][0] < rst[0][0]) {
+ warn("- Chose the result from the perfect mode");
+ rst = rst_perf;
+ } else warn("- Chose the result from the imperfect mode");
+} else warn("- Perfect mode is not attempted");
+
+/**********
+ * Output *
+ **********/
+
+for (var i = 0; i < rst.length; ++i)
+ print("GT", glist[rst[i][3]], glist[rst[i][2]], rst[i][0]>>8&0xff, rst[i][0]&0xff, rst[i][1]);
diff --git a/bwakit/typeHLA.sh b/bwakit/typeHLA.sh
new file mode 100755
index 0000000..b73100d
--- /dev/null
+++ b/bwakit/typeHLA.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+is_ctg=0
+
+if [ $# -gt 1 ] && [ $1 == '-A' ]; then
+ is_ctg=1
+ shift
+fi
+
+if [ $# -lt 2 ]; then
+ echo "Usage: $0 [-A] <prefix> <gene>"
+ exit 1
+fi
+
+preres="resource-human-HLA"
+root=`dirname $0`
+pre=$1.$2
+touch $pre.gt
+
+if [ ! -s $pre.fq ]; then
+ echo '** Empty input file. Abort!' >&2
+ exit 0
+fi
+
+if [ $is_ctg -eq 0 ]; then
+ echo "** De novo assembling..." >&2
+ len=`$root/seqtk comp $pre.fq | awk '{++x;y+=$2}END{printf("%.0f\n", y/x)}'`
+ $root/fermi2.pl unitig -f $root/fermi2 -r $root/ropebwt2 -t2 -l$len -p $pre.tmp $pre.fq > $pre.tmp.mak
+ make -f $pre.tmp.mak >&2
+ cp $pre.tmp.mag.gz $pre.mag.gz
+else
+ rm -f $pre.tmp.mag.gz
+ ln -s $pre.fq $pre.tmp.mag.gz
+fi
+
+echo "** Selecting contigs overlapping target exons..." >&2
+(ls $root/$preres/HLA-ALT-idx/*.fa.bwt | sed s,.bwt,, | xargs -i $root/bwa mem -t2 -B1 -O1 -E1 {} $pre.tmp.mag.gz 2>/dev/null) | grep -v ^@ | sort -k3,3 -k4,4n | gzip > $pre.tmp.ALT.sam.gz
+$root/k8 $root/typeHLA-selctg.js $2 $root/$preres/HLA-ALT-exons.bed $pre.tmp.ALT.sam.gz | $root/seqtk subseq $pre.tmp.mag.gz - | gzip -1 > $pre.tmp.fq.gz
+
+echo "** Mapping exons to de novo contigs..." >&2
+$root/bwa index -p $pre.tmp $pre.tmp.fq.gz 2>/dev/null
+$root/seqtk comp $root/$preres/HLA-CDS.fa | cut -f1 | grep ^$2 | $root/seqtk subseq $root/$preres/HLA-CDS.fa - | $root/bwa mem -aD.1 -t2 $pre.tmp - 2>/dev/null | gzip -1 > $pre.sam.gz
+
+echo "** Typing..." >&2
+$root/k8 $root/typeHLA.js $pre.sam.gz > $pre.gt
+
+# delete temporary files
+rm -f $pre.tmp.*
+[ $is_ctg -eq 1 ] && rm -f $pre.mag.gz
diff --git a/bwamem.c b/bwamem.c
index e9d9304..b32c166 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -2,6 +2,7 @@
#include <string.h>
#include <stdio.h>
#include <assert.h>
+#include <limits.h>
#include <math.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>
@@ -57,6 +58,9 @@ mem_opt_t *mem_opt_init()
o->zdrop = 100;
o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5;
+
+ o->max_mem_intv = 20;
+
o->min_seed_len = 19;
o->split_width = 10;
o->max_occ = 500;
@@ -68,7 +72,8 @@ mem_opt_t *mem_opt_init()
o->split_factor = 1.5;
o->chunk_size = 10000000;
o->n_threads = 1;
- o->max_hits = 5;
+ o->max_XA_hits = 5;
+ o->max_XA_hits_alt = 200;
o->max_matesw = 50;
o->mask_level_redun = 0.95;
o->min_chain_weight = 0;
@@ -132,9 +137,26 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if (end - start < split_len || p->x[2] > opt->split_width) continue;
bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i)
- if ((a->mem1.a[i].info>>32) - (uint32_t)a->mem1.a[i].info >= opt->min_seed_len)
+ if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
+ // third pass: LAST-like
+ if (opt->max_mem_intv > 0) {
+ x = 0;
+ while (x < len) {
+ if (seq[x] < 4) {
+ if (1) {
+ bwtintv_t m;
+ x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
+ if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
+ } else { // for now, we never come to this block which is slower
+ x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
+ for (i = 0; i < a->mem1.n; ++i)
+ kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
+ }
+ } else ++x;
+ }
+ }
// sort
ks_introsort(mem_intv, a->mem.n, a->mem.a);
}
@@ -151,7 +173,7 @@ typedef struct {
typedef struct {
int n, m, first, rid;
- uint32_t w:30, kept:2;
+ uint32_t w:29, kept:2, is_alt:1;
float frac_rep;
int64_t pos;
mem_seed_t *seeds;
@@ -252,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
bwtintv_t *p = &aux->mem.a[i];
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
int64_t k;
- if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
+ // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
mem_chain_t tmp, *lower, *upper;
@@ -272,6 +294,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
tmp.seeds[0] = s;
tmp.rid = rid;
+ tmp.is_alt = !!bns->anns[rid].is_alt;
kb_putp(chn, tree, &tmp);
}
}
@@ -325,7 +348,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
int j = chains.a[k];
int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]);
- if (e_min > b_max) { // have overlap
+ if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary
int li = chn_end(a[i]) - chn_beg(a[i]);
int lj = chn_end(a[j]) - chn_beg(a[j]);
int min_l = li < lj? li : lj;
@@ -371,9 +394,12 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
-#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))
+#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash))))
KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
+#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
+KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
+
#define PATCH_MAX_R_BW 0.05f
#define PATCH_MIN_SC_RATIO 0.90f
@@ -469,36 +495,73 @@ int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int
return n - 1;
}
-void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
+typedef kvec_t(int) int_v;
+
+static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
{ // similar to the loop in mem_chain_flt()
int i, k, tmp;
- kvec_t(int) z;
- if (n == 0) return;
- kv_init(z);
- for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
- ks_introsort(mem_ars_hash, n, a);
tmp = opt->a + opt->b;
tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
- kv_push(int, z, 0);
+ z->n = 0;
+ kv_push(int, *z, 0);
for (i = 1; i < n; ++i) {
- for (k = 0; k < z.n; ++k) {
- int j = z.a[k];
+ for (k = 0; k < z->n; ++k) {
+ int j = z->a[k];
int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
if (e_min > b_max) { // have overlap
int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
if (a[j].sub == 0) a[j].sub = a[i].score;
- if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
+ if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
+ ++a[j].sub_n;
break;
}
}
}
- if (k == z.n) kv_push(int, z, i);
- else a[i].secondary = z.a[k];
+ if (k == z->n) kv_push(int, *z, i);
+ else a[i].secondary = z->a[k];
+ }
+}
+
+int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
+{
+ int i, n_pri;
+ int_v z = {0,0,0};
+ if (n == 0) return 0;
+ for (i = n_pri = 0; i < n; ++i) {
+ a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
+ if (!a[i].is_alt) ++n_pri;
+ }
+ ks_introsort(mem_ars_hash, n, a);
+ mem_mark_primary_se_core(opt, n, a, &z);
+ for (i = 0; i < n; ++i) {
+ mem_alnreg_t *p = &a[i];
+ p->secondary_all = i; // keep the rank in the first round
+ if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
+ p->alt_sc = a[p->secondary].score;
+ }
+ if (n_pri >= 0 && n_pri < n) {
+ kv_resize(int, z, n);
+ if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
+ for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
+ for (i = 0; i < n; ++i) {
+ if (a[i].secondary >= 0) {
+ a[i].secondary_all = z.a[a[i].secondary];
+ if (a[i].is_alt) a[i].secondary = INT_MAX;
+ } else a[i].secondary_all = -1;
+ }
+ if (n_pri > 0) { // mark primary for hits to the primary assembly only
+ for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
+ mem_mark_primary_se_core(opt, n_pri, a, &z);
+ }
+ } else {
+ for (i = 0; i < n; ++i)
+ a[i].secondary_all = a[i].secondary;
}
free(z.a);
+ return n_pri;
}
/*********************************
@@ -621,12 +684,12 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
// qd: distance ahead of the seed on query; rd: on reference
qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
- w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
+ w = max_gap < p->w? max_gap : p->w; // bounded by the band width
if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
// similar to the previous four lines, but this time we look at the region behind
qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
max_gap = cal_max_gap(opt, qd < rd? qd : rd);
- w = max_gap < opt->w? max_gap : opt->w;
+ w = max_gap < p->w? max_gap : p->w;
if (qd - rd < w && rd - qd < w) break;
}
if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
@@ -753,7 +816,7 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar)
return l;
}
-void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all)
+void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_)
{
int i, l_name;
mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
@@ -782,7 +845,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
if (p->n_cigar) { // aligned
for (i = 0; i < p->n_cigar; ++i) {
int c = p->cigar[i]&0xf;
- if (!softclip_all && (c == 3 || c == 4))
+ if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4))
c = which? 4 : 3; // use hard clipping for supplementary alignments
kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
}
@@ -810,7 +873,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
kputsn("*\t*", 3, str);
} else if (!p->is_rev) { // the forward strand
int i, qb = 0, qe = s->l_seq;
- if (p->n_cigar && which && !softclip_all) { // have cigar && not the primary alignment && not softclip all
+ if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { // have cigar && not the primary alignment && not softclip all
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4;
}
@@ -824,7 +887,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
} else kputc('*', str);
} else { // the reverse strand
int i, qb = 0, qe = s->l_seq;
- if (p->n_cigar && which && !softclip_all) {
+ if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) {
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4;
}
@@ -854,7 +917,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
for (i = 0; i < n; ++i) {
const mem_aln_t *r = &list[i];
int k;
- if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
+ if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
kputs(bns->anns[r->rid].name, str); kputc(',', str);
kputl(r->pos+1, str); kputc(',', str);
kputc("+-"[r->is_rev], str); kputc(',', str);
@@ -866,9 +929,19 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
kputc(';', str);
}
}
+ if (p->alt_sc > 0)
+ ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
}
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+ if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) {
+ int tmp;
+ kputsn("\tXR:Z:", 6, str);
+ tmp = str->l;
+ kputs(bns->anns[p->rid].anno, str);
+ for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE
+ if (str->s[i] == '\t') str->s[i] = ' ';
+ }
kputc('\n', str);
}
@@ -903,42 +976,43 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
}
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
-void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
+void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
{
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str;
kvec_t(mem_aln_t) aa;
- int k;
+ int k, l;
char **XA = 0;
if (!(opt->flag & MEM_F_ALL))
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
kv_init(aa);
str.l = str.m = 0; str.s = 0;
- for (k = 0; k < a->n; ++k) {
+ for (k = l = 0; k < a->n; ++k) {
mem_alnreg_t *p = &a->a[k];
mem_aln_t *q;
if (p->score < opt->T) continue;
- if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
- if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
+ if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue;
+ if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
q = kv_pushp(mem_aln_t, aa);
*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
assert(q->rid >= 0); // this should not happen with the new code
q->XA = XA? XA[k] : 0;
q->flag |= extra_flag; // flag secondary
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
- if (k && p->secondary < 0) // if supplementary
+ if (l && p->secondary < 0) // if supplementary
q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
- if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+ if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+ ++l;
}
if (aa.n == 0) { // no alignments good enough; then write an unaligned record
mem_aln_t t;
t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
t.flag |= extra_flag;
- mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP);
+ mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m);
} else {
for (k = 0; k < aa.n; ++k)
- mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP);
+ mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m);
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
free(aa.a);
}
@@ -981,6 +1055,11 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
}
}
+ for (i = 0; i < regs.n; ++i) {
+ mem_alnreg_t *p = ®s.a[i];
+ if (p->rid >= 0 && bns->anns[p->rid].is_alt)
+ p->is_alt = 1;
+ }
return regs;
}
@@ -1051,6 +1130,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
assert(a.rid == ar->rid);
a.pos = pos - bns->anns[a.rid].offset;
a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
+ a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc;
free(query);
return a;
}
@@ -1092,7 +1172,7 @@ static void worker2(void *data, int i, int tid)
mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);
} else {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
- mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
+ mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
}
free(w->regs[i].a);
} else {
@@ -1106,16 +1186,15 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
{
extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
worker_t w;
- mem_alnreg_v *regs;
mem_pestat_t pes[4];
double ctime, rtime;
int i;
ctime = cputime(); rtime = realtime();
global_bns = bns;
- regs = malloc(n * sizeof(mem_alnreg_v));
+ w.regs = malloc(n * sizeof(mem_alnreg_v));
w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
- w.seqs = seqs; w.regs = regs; w.n_processed = n_processed;
+ w.seqs = seqs; w.n_processed = n_processed;
w.pes = &pes[0];
w.aux = malloc(opt->n_threads * sizeof(smem_aux_t));
for (i = 0; i < opt->n_threads; ++i)
@@ -1126,10 +1205,10 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
free(w.aux);
if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
- else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data
+ else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data
}
kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
- free(regs);
+ free(w.regs);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);
}
diff --git a/bwamem.h b/bwamem.h
index 7b14ca8..8ffe729 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -18,7 +18,9 @@ typedef struct __smem_i smem_i;
#define MEM_F_NO_RESCUE 0x20
#define MEM_F_SELF_OVLP 0x40
#define MEM_F_ALN_REG 0x80
+#define MEM_F_REF_HDR 0x100
#define MEM_F_SOFTCLIP 0x200
+#define MEM_F_SMARTPE 0x400
typedef struct {
int a, b; // match score and mismatch penalty
@@ -29,6 +31,8 @@ typedef struct {
int w; // band width
int zdrop; // Z-dropoff
+ uint64_t max_mem_intv;
+
int T; // output score threshold; only affecting output
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
@@ -48,7 +52,7 @@ typedef struct {
int mapQ_coef_fac;
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
- int max_hits; // if there are max_hits or fewer, output them all
+ int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
@@ -59,13 +63,15 @@ typedef struct {
int score; // best local SW score
int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
int sub; // 2nd best SW score
+ int alt_sc;
int csub; // SW score of a tandem hit
int sub_n; // approximate number of suboptimal hits
int w; // actual band width used in extension
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
+ int secondary_all;
int seedlen0; // length of the starting seed
- int n_comp; // number of sub-alignments chained together
+ int n_comp:30, is_alt:2; // number of sub-alignments chained together
float frac_rep;
uint64_t hash;
} mem_alnreg_t;
@@ -82,12 +88,12 @@ typedef struct { // This struct is only used for the convenience of API.
int64_t pos; // forward strand 5'-end mapping position
int rid; // reference sequence index in bntseq_t; <0 for unmapped
int flag; // extra flag
- uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
+ uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar; // number of CIGAR operations
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
char *XA; // alternative mappings
- int score, sub;
+ int score, sub, alt_sc;
} mem_aln_t;
#ifdef __cplusplus
@@ -97,6 +103,7 @@ extern "C" {
smem_i *smem_itr_init(const bwt_t *bwt);
void smem_itr_destroy(smem_i *itr);
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
+ void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv);
const bwtintv_v *smem_next(smem_i *itr);
mem_opt_t *mem_opt_init(void);
diff --git a/bwamem_extra.c b/bwamem_extra.c
index 59bb68e..02e817a 100644
--- a/bwamem_extra.c
+++ b/bwamem_extra.c
@@ -1,3 +1,4 @@
+#include <limits.h>
#include "bwa.h"
#include "bwamem.h"
#include "bntseq.h"
@@ -11,6 +12,8 @@ struct __smem_i {
const bwt_t *bwt;
const uint8_t *query;
int start, len;
+ int min_intv, max_len;
+ uint64_t max_intv;
bwtintv_v *matches; // matches; to be returned by smem_next()
bwtintv_v *sub; // sub-matches inside the longest match; temporary
bwtintv_v *tmpvec[2]; // temporary arrays
@@ -25,6 +28,9 @@ smem_i *smem_itr_init(const bwt_t *bwt)
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
itr->matches = calloc(1, sizeof(bwtintv_v));
itr->sub = calloc(1, sizeof(bwtintv_v));
+ itr->min_intv = 1;
+ itr->max_len = INT_MAX;
+ itr->max_intv = 0;
return itr;
}
@@ -44,21 +50,22 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
itr->len = len;
}
+void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
+{
+ itr->min_intv = min_intv;
+ itr->max_len = max_len;
+ itr->max_intv = max_intv;
+}
+
const bwtintv_v *smem_next(smem_i *itr)
{
- int i, max, max_i, ori_start;
+ int ori_start;
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
if (itr->start >= itr->len || itr->start < 0) return 0;
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
- itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
- if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
- for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
- bwtintv_t *p = &itr->matches->a[i];
- int len = (uint32_t)p->info - (p->info>>32);
- if (max < len) max = len, max_i = i;
- }
+ itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
return itr->matches;
}
@@ -105,43 +112,54 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
s->sam = str.s;
}
+static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
+{
+ int k = a[i].secondary_all;
+ if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
+ return -1;
+}
+
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
{
- int i, k, *cnt, tot;
- kstring_t *aln = 0;
- char **XA = 0;
+ int i, k, r, *cnt, tot;
+ kstring_t *aln = 0, str = {0,0,0};
+ char **XA = 0, *has_alt;
cnt = calloc(a->n, sizeof(int));
+ has_alt = calloc(a->n, 1);
for (i = 0, tot = 0; i < a->n; ++i) {
- int j = a->a[i].secondary;
- if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
- ++cnt[j], ++tot;
+ r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
+ if (r >= 0) {
+ ++cnt[r], ++tot;
+ if (a->a[i].is_alt) has_alt[r] = 1;
+ }
}
if (tot == 0) goto end_gen_alt;
aln = calloc(a->n, sizeof(kstring_t));
for (i = 0; i < a->n; ++i) {
mem_aln_t t;
- int j = a->a[i].secondary;
- if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
- if (cnt[j] > opt->max_hits) continue;
+ if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
+ if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
- kputs(bns->anns[t.rid].name, &aln[j]);
- kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]);
- kputc(',', &aln[j]);
+ str.l = 0;
+ kputs(bns->anns[t.rid].name, &str);
+ kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str);
+ kputc(',', &str);
for (k = 0; k < t.n_cigar; ++k) {
- kputw(t.cigar[k]>>4, &aln[j]);
- kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]);
+ kputw(t.cigar[k]>>4, &str);
+ kputc("MIDSHN"[t.cigar[k]&0xf], &str);
}
- kputc(',', &aln[j]); kputw(t.NM, &aln[j]);
- kputc(';', &aln[j]);
+ kputc(',', &str); kputw(t.NM, &str);
+ kputc(';', &str);
free(t.cigar);
+ kputsn(str.s, str.l, &aln[r]);
}
XA = calloc(a->n, sizeof(char*));
for (k = 0; k < a->n; ++k)
XA[k] = aln[k].s;
end_gen_alt:
- free(cnt); free(aln);
+ free(has_alt); free(cnt); free(aln); free(str.s);
return XA;
}
diff --git a/bwamem_pair.c b/bwamem_pair.c
index a3aeb80..7950736 100644
--- a/bwamem_pair.c
+++ b/bwamem_pair.c
@@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
if (q->n < MIN_DIR_CNT) {
fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
r->failed = 1;
+ free(q->a);
continue;
} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
ks_introsort_64(q->n, q->a);
@@ -151,6 +152,7 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
memset(&b, 0, sizeof(mem_alnreg_t));
if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
b.rid = a->rid;
+ b.is_alt = a->is_alt;
b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
@@ -177,14 +179,14 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
return n;
}
-int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
+int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
{
pair64_v v, u;
int r, i, k, y[4], ret; // y[] keeps the last hit
int64_t l_pac = bns->l_pac;
kv_init(v); kv_init(u);
for (r = 0; r < 2; ++r) { // loop through read number
- for (i = 0; i < a[r].n; ++i) {
+ for (i = 0; i < n_pri[r]; ++i) {
pair64_t key;
mem_alnreg_t *e = &a[r].a[i];
key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
@@ -240,21 +242,25 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons
return ret;
}
+void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m);
+
#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
{
- extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
+ extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
- extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
- extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all);
+ extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
- int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1;
+ int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
kstring_t str;
- mem_aln_t h[2];
+ mem_aln_t h[2], g[2], aa[2][2];
str.l = str.m = 0; str.s = 0;
+ memset(h, 0, sizeof(mem_aln_t) * 2);
+ memset(g, 0, sizeof(mem_aln_t) * 2);
+ n_aa[0] = n_aa[1] = 0;
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
mem_alnreg_v b[2];
kv_init(b[0]); kv_init(b[1]);
@@ -267,18 +273,18 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
}
- mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
- mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
+ n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
+ n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
// pairing single-end hits
- if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {
+ if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
int is_multi[2], q_pe, score_un, q_se[2];
char **XA[2];
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
- for (j = 1; j < a[i].n; ++j)
+ for (j = 1; j < n_pri[i]; ++j)
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
- is_multi[i] = j < a[i].n? 1 : 0;
+ is_multi[i] = j < n_pri[i]? 1 : 0;
}
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
// compute mapQ for the best SE hit
@@ -310,42 +316,62 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
}
- // suboptimal hits
+ for (i = 0; i < 2; ++i) {
+ int k = a[i].a[z[i]].secondary_all;
+ if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
+ assert(a[i].a[k].secondary_all < 0);
+ for (j = 0; j < a[i].n; ++j)
+ if (a[i].a[j].secondary_all == k || j == k)
+ a[i].a[j].secondary_all = z[i];
+ a[i].a[z[i]].secondary_all = -1;
+ }
+ }
if (!(opt->flag & MEM_F_ALL)) {
- for (i = 0; i < 2; ++i) {
- int k = a[i].a[z[i]].secondary;
- if (k >= 0) { // switch secondary and primary
- assert(a[i].a[k].secondary < 0);
- for (j = 0; j < a[i].n; ++j)
- if (a[i].a[j].secondary == k || j == k)
- a[i].a[j].secondary = z[i];
- a[i].a[z[i]].secondary = -1;
- }
+ for (i = 0; i < 2; ++i)
XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
- }
} else XA[0] = XA[1] = 0;
// write SAM
- h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0;
- h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0;
- mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;
- mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;
- if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
- free(h[0].cigar); // h[0].XA will be freed in the following block
- free(h[1].cigar);
- // free XA
for (i = 0; i < 2; ++i) {
- if (XA[i]) {
- for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
- free(XA[i]);
+ h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
+ h[i].mapq = q_se[i];
+ h[i].flag |= 0x40<<i | extra_flag;
+ h[i].XA = XA[i]? XA[i][z[i]] : 0;
+ aa[i][n_aa[i]++] = h[i];
+ if (n_pri[i] < a[i].n) { // the read has ALT hits
+ mem_alnreg_t *p = &a[i].a[n_pri[i]];
+ if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) continue;
+ g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
+ g[i].flag |= 0x800 | 0x40<<i | extra_flag;
+ g[i].XA = XA[i]? XA[i][n_pri[i]] : 0;
+ aa[i][n_aa[i]++] = g[i];
}
}
+ for (i = 0; i < n_aa[0]; ++i)
+ mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
+ s[0].sam = strdup(str.s); str.l = 0;
+ for (i = 0; i < n_aa[1]; ++i)
+ mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
+ s[1].sam = str.s;
+ if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
+ // free
+ for (i = 0; i < 2; ++i) {
+ free(h[i].cigar); free(g[i].cigar);
+ if (XA[i] == 0) continue;
+ for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
+ free(XA[i]);
+ }
} else goto no_pairing;
return n;
no_pairing:
for (i = 0; i < 2; ++i) {
- if (a[i].n && a[i].a[0].score >= opt->T)
- h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]);
+ int which = -1;
+ if (a[i].n) {
+ if (a[i].a[0].score >= opt->T) which = 0;
+ else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
+ which = n_pri[i];
+ }
+ if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
}
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
@@ -354,8 +380,8 @@ no_pairing:
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
}
- mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
- mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
+ mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
+ mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); free(h[1].cigar);
return n;
diff --git a/bwashm.c b/bwashm.c
new file mode 100644
index 0000000..163f764
--- /dev/null
+++ b/bwashm.c
@@ -0,0 +1,213 @@
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include "bwa.h"
+
+int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
+{
+ const char *name;
+ uint8_t *shm, *shm_idx;
+ uint16_t *cnt;
+ int shmid, to_init = 0, l;
+ char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn;
+
+ if (hint == 0 || hint[0] == 0) return -1;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+
+ if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) {
+ shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
+ to_init = 1;
+ }
+ if (shmid < 0) return -1;
+ ftruncate(shmid, BWA_CTL_SIZE);
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ if (to_init) {
+ memset(shm, 0, BWA_CTL_SIZE);
+ cnt[1] = 4;
+ }
+
+ if (idx->mem == 0) bwa_idx2mem(idx);
+
+ if (tmpfn) {
+ FILE *fp;
+ if ((fp = fopen(tmpfn, "wb")) != 0) {
+ int64_t rest = idx->l_mem;
+ while (rest > 0) {
+ int64_t l = rest < 0x1000000? rest : 0x1000000;
+ rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp);
+ }
+ fclose(fp);
+ free(idx->mem); idx->mem = 0;
+ } else {
+ fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
+ tmpfn = 0;
+ }
+ }
+
+ strcat(strcpy(path, "/bwaidx-"), name);
+ if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
+ shm_unlink(path);
+ perror("shm_open()");
+ return -1;
+ }
+ l = 8 + strlen(name) + 1;
+ if (cnt[1] + l > BWA_CTL_SIZE) return -1;
+ memcpy(shm + cnt[1], &idx->l_mem, 8);
+ memcpy(shm + cnt[1] + 8, name, l - 8);
+ cnt[1] += l; ++cnt[0];
+ ftruncate(shmid, idx->l_mem);
+ shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+ if (tmpfn) {
+ FILE *fp;
+ fp = fopen(tmpfn, "rb");
+ int64_t rest = idx->l_mem;
+ while (rest > 0) {
+ int64_t l = rest < 0x1000000? rest : 0x1000000;
+ rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp);
+ }
+ fclose(fp);
+ unlink(tmpfn);
+ } else {
+ memcpy(shm_idx, idx->mem, idx->l_mem);
+ free(idx->mem);
+ }
+ bwa_mem2idx(idx->l_mem, shm_idx, idx);
+ idx->is_shm = 1;
+ return 0;
+}
+
+bwaidx_t *bwa_idx_load_from_shm(const char *hint)
+{
+ const char *name;
+ uint8_t *shm, *shm_idx;
+ uint16_t *cnt, i;
+ char *p, path[PATH_MAX + 1];
+ int shmid;
+ int64_t l_mem;
+ bwaidx_t *idx;
+
+ if (hint == 0 || hint[0] == 0) return 0;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ if (cnt[0] == 0) return 0;
+ for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
+ memcpy(&l_mem, p, 8); p += 8;
+ if (strcmp(p, name) == 0) break;
+ p += strlen(p) + 1;
+ }
+ if (i == cnt[0]) return 0;
+
+ strcat(strcpy(path, "/bwaidx-"), name);
+ if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0;
+ shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
+ idx = calloc(1, sizeof(bwaidx_t));
+ bwa_mem2idx(l_mem, shm_idx, idx);
+ idx->is_shm = 1;
+ return idx;
+}
+
+int bwa_shm_test(const char *hint)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ const char *name;
+
+ if (hint == 0 || hint[0] == 0) return 0;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ if (strcmp(p + 8, name) == 0) return 1;
+ p += strlen(p) + 9;
+ }
+ return 0;
+}
+
+int bwa_shm_list(void)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ int64_t l_mem;
+ memcpy(&l_mem, p, 8); p += 8;
+ printf("%s\t%ld\n", p, (long)l_mem);
+ p += strlen(p) + 1;
+ }
+ return 0;
+}
+
+int bwa_shm_destroy(void)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ char path[PATH_MAX + 1];
+
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ int64_t l_mem;
+ memcpy(&l_mem, p, 8); p += 8;
+ strcat(strcpy(path, "/bwaidx-"), p);
+ shm_unlink(path);
+ p += strlen(p) + 1;
+ }
+ munmap(shm, BWA_CTL_SIZE);
+ shm_unlink("/bwactl");
+ return 0;
+}
+
+int main_shm(int argc, char *argv[])
+{
+ int c, to_list = 0, to_drop = 0, ret = 0;
+ char *tmpfn = 0;
+ while ((c = getopt(argc, argv, "ldf:")) >= 0) {
+ if (c == 'l') to_list = 1;
+ else if (c == 'd') to_drop = 1;
+ else if (c == 'f') tmpfn = optarg;
+ }
+ if (optind == argc && !to_list && !to_drop) {
+ fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
+ fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
+ fprintf(stderr, " -l list names of indices in shared memory\n");
+ fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
+ return 1;
+ }
+ if (optind < argc && (to_list || to_drop)) {
+ fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__);
+ return 1;
+ }
+ if (optind < argc) {
+ if (bwa_shm_test(argv[optind]) == 0) {
+ bwaidx_t *idx;
+ idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
+ if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
+ fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
+ ret = 1;
+ }
+ bwa_idx_destroy(idx);
+ } else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
+ }
+ if (to_list) bwa_shm_list();
+ if (to_drop) bwa_shm_destroy();
+ return ret;
+}
diff --git a/bwt.c b/bwt.c
index c9bf6a3..9083654 100644
--- a/bwt.c
+++ b/bwt.c
@@ -30,6 +30,7 @@
#include <string.h>
#include <assert.h>
#include <stdint.h>
+#include <limits.h>
#include "utils.h"
#include "bwt.h"
#include "kvec.h"
@@ -284,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
}
-
-int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+// NOTE: $max_intv is not currently used in BWA-MEM
+int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
@@ -301,7 +302,10 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
ik.info = x + 1;
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
- if (q[i] < 4) { // an A/C/G/T base
+ if (ik.x[2] < max_intv) { // an interval small enough
+ kv_push(bwtintv_t, *curr, ik);
+ break;
+ } else if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
@@ -323,8 +327,8 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
- bwt_extend(bwt, p, ok, 1);
- if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
+ if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
+ if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
@@ -346,6 +350,34 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
return ret;
}
+int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+{
+ return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
+}
+
+int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
+{
+ int i, c;
+ bwtintv_t ik, ok[4];
+
+ memset(mem, 0, sizeof(bwtintv_t));
+ if (q[x] > 3) return x + 1;
+ bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
+ for (i = x + 1; i < len; ++i) { // forward search
+ if (q[i] < 4) { // an A/C/G/T base
+ c = 3 - q[i]; // complement of q[i]
+ bwt_extend(bwt, &ik, ok, 0);
+ if (ok[c].x[2] < max_intv && i - x >= min_len) {
+ *mem = ok[c];
+ mem->info = (uint64_t)x<<32 | (i + 1);
+ return i + 1;
+ }
+ ik = ok[c];
+ } else return i + 1;
+ }
+ return len;
+}
+
/*************************
* Read/write BWT and SA *
*************************/
diff --git a/bwt.h b/bwt.h
index d2ff0ac..c71d6b5 100644
--- a/bwt.h
+++ b/bwt.h
@@ -92,6 +92,7 @@ extern "C" {
void bwt_destroy(bwt_t *bwt);
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
+ void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
void bwt_cal_sa(bwt_t *bwt, int intv);
void bwt_bwtupdate_core(bwt_t *bwt);
@@ -118,8 +119,9 @@ extern "C" {
* Return the end of the longest exact match starting from _x_.
*/
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
- // SMEM iterator interface
+ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
#ifdef __cplusplus
}
diff --git a/bwt_gen.c b/bwt_gen.c
index 6139d80..76f28c9 100644
--- a/bwt_gen.c
+++ b/bwt_gen.c
@@ -1598,15 +1598,20 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o
}
}
-void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size)
{
BWTInc *bwtInc;
- bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000);
+ bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size);
printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
BWTIncFree(bwtInc);
}
+void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+{
+ bwt_bwtgen2(fn_pac, fn_bwt, 10000000);
+}
+
int bwt_bwtgen_main(int argc, char *argv[])
{
if (argc < 3) {
diff --git a/bwtindex.c b/bwtindex.c
index 9e3ec15..2bfd667 100644
--- a/bwtindex.c
+++ b/bwtindex.c
@@ -189,11 +189,11 @@ int bwa_index(int argc, char *argv[]) // the "index" command
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
char *prefix = 0, *str, *str2, *str3;
- int c, algo_type = 0, is_64 = 0;
+ int c, algo_type = 0, is_64 = 0, block_size = 10000000;
clock_t t;
int64_t l_pac;
- while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
+ while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "div") == 0) algo_type = 1;
@@ -203,20 +203,26 @@ int bwa_index(int argc, char *argv[]) // the "index" command
break;
case 'p': prefix = strdup(optarg); break;
case '6': is_64 = 1; break;
+ case 'b':
+ block_size = strtol(optarg, &str, 10);
+ if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
+ else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024;
+ else if (*str == 'K' || *str == 'k') block_size *= 1024;
+ break;
default: return 1;
}
}
if (optind + 1 > argc) {
fprintf(stderr, "\n");
- fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] <in.fasta>\n\n");
+ fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
+ fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
fprintf(stderr, "\n");
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
- fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
- fprintf(stderr, " according to the length of the genome.\n\n");
+ fprintf(stderr, " `-a div' do not work not for long genomes.\n\n");
return 1;
}
if (prefix == 0) {
@@ -242,7 +248,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(str2, prefix); strcat(str2, ".bwt");
t = clock();
fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
- if (algo_type == 2) bwt_bwtgen(str, str2);
+ if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
else if (algo_type == 1 || algo_type == 3) {
bwt_t *bwt;
bwt = bwt_pac2bwt(str, algo_type == 3);
diff --git a/example.c b/example.c
index a6c9bdd..4e8494d 100644
--- a/example.c
+++ b/example.c
@@ -7,10 +7,6 @@
#include "kseq.h" // for the FASTA/Q parser
KSEQ_DECLARE(gzFile)
-#ifdef USE_MALLOC_WRAPPERS
-# include "malloc_wrap.h"
-#endif
-
int main(int argc, char *argv[])
{
bwaidx_t *idx;
@@ -47,10 +43,10 @@ int main(int argc, char *argv[])
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
// print alignment
- err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
+ printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
- err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
- err_printf("\t%d\n", a.NM); // print edit distance
+ printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
+ printf("\t%d\n", a.NM); // print edit distance
free(a.cigar); // don't forget to deallocate CIGAR
}
free(ar.a); // and deallocate the hit list
@@ -58,7 +54,7 @@ int main(int argc, char *argv[])
free(opt);
kseq_destroy(ks);
- err_gzclose(fp);
+ gzclose(fp);
bwa_idx_destroy(idx);
return 0;
}
diff --git a/fastmap.c b/fastmap.c
index 479e878..3cca4de 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -3,20 +3,98 @@
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
+#include <limits.h>
#include <ctype.h>
#include <math.h>
#include "bwa.h"
#include "bwamem.h"
#include "kvec.h"
#include "utils.h"
+#include "bntseq.h"
#include "kseq.h"
-#include "utils.h"
KSEQ_DECLARE(gzFile)
extern unsigned char nst_nt4_table[256];
void *kopen(const char *fn, int *_fd);
int kclose(void *a);
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
+
+typedef struct {
+ kseq_t *ks, *ks2;
+ mem_opt_t *opt;
+ mem_pestat_t *pes0;
+ int64_t n_processed;
+ int copy_comment, actual_chunk_size;
+ bwaidx_t *idx;
+} ktp_aux_t;
+
+typedef struct {
+ ktp_aux_t *aux;
+ int n_seqs;
+ bseq1_t *seqs;
+} ktp_data_t;
+
+static void *process(void *shared, int step, void *_data)
+{
+ ktp_aux_t *aux = (ktp_aux_t*)shared;
+ ktp_data_t *data = (ktp_data_t*)_data;
+ int i;
+ if (step == 0) {
+ ktp_data_t *ret;
+ int64_t size = 0;
+ ret = calloc(1, sizeof(ktp_data_t));
+ ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
+ if (ret->seqs == 0) {
+ free(ret);
+ return 0;
+ }
+ if (!aux->copy_comment)
+ for (i = 0; i < ret->n_seqs; ++i) {
+ free(ret->seqs[i].comment);
+ ret->seqs[i].comment = 0;
+ }
+ for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq;
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
+ return ret;
+ } else if (step == 1) {
+ const mem_opt_t *opt = aux->opt;
+ const bwaidx_t *idx = aux->idx;
+ if (opt->flag & MEM_F_SMARTPE) {
+ bseq1_t *sep[2];
+ int n_sep[2];
+ mem_opt_t tmp_opt = *opt;
+ bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
+ if (n_sep[0]) {
+ tmp_opt.flag &= ~MEM_F_PE;
+ mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
+ for (i = 0; i < n_sep[0]; ++i)
+ data->seqs[sep[0][i].id].sam = sep[0][i].sam;
+ }
+ if (n_sep[1]) {
+ tmp_opt.flag |= MEM_F_PE;
+ mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
+ for (i = 0; i < n_sep[1]; ++i)
+ data->seqs[sep[1][i].id].sam = sep[1][i].sam;
+ }
+ free(sep[0]); free(sep[1]);
+ } else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
+ aux->n_processed += data->n_seqs;
+ return data;
+ } else if (step == 2) {
+ for (i = 0; i < data->n_seqs; ++i) {
+ if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
+ free(data->seqs[i].name); free(data->seqs[i].comment);
+ free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam);
+ }
+ free(data->seqs); free(data);
+ return 0;
+ }
+ return 0;
+}
static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
{
@@ -37,24 +115,24 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
int main_mem(int argc, char *argv[])
{
mem_opt_t *opt, opt0;
- int fd, fd2, i, c, n, copy_comment = 0;
+ int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
+ int fixed_chunk_size = -1;
gzFile fp, fp2 = 0;
- kseq_t *ks, *ks2 = 0;
- bseq1_t *seqs;
- bwaidx_t *idx;
- char *p, *rg_line = 0;
+ char *p, *rg_line = 0, *hdr_line = 0;
const char *mode = 0;
void *ko = 0, *ko2 = 0;
- int64_t n_processed = 0;
- mem_pestat_t pes[4], *pes0 = 0;
+ mem_pestat_t pes[4];
+ ktp_aux_t aux;
+ memset(&aux, 0, sizeof(ktp_aux_t));
memset(pes, 0, 4 * sizeof(mem_pestat_t));
for (i = 0; i < 4; ++i) pes[i].failed = 1;
- opt = mem_opt_init();
+ aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
- while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) {
+ while ((c = getopt(argc, argv, "1epaFMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
+ else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
@@ -64,24 +142,34 @@ int main_mem(int argc, char *argv[])
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
else if (c == 'a') opt->flag |= MEM_F_ALL;
- else if (c == 'p') opt->flag |= MEM_F_PE;
+ else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
else if (c == 'F') opt->flag |= MEM_F_ALN_REG;
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
+ else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
else if (c == 'v') bwa_verbose = atoi(optarg);
+ else if (c == 'j') ignore_alt = 1;
else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
- else if (c == 'h') opt->max_hits = atoi(optarg), opt0.max_hits = 1;
else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
- else if (c == 'C') copy_comment = 1;
+ else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
+ else if (c == 'C') aux.copy_comment = 1;
+ else if (c == 'K') fixed_chunk_size = atoi(optarg);
+ else if (c == 'X') opt->mask_level = atof(optarg);
+ else if (c == 'h') {
+ opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
+ opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ opt->max_XA_hits_alt = strtol(p+1, &p, 10);
+ }
else if (c == 'Q') {
opt0.mapQ_coef_len = 1;
opt->mapQ_coef_len = atoi(optarg);
@@ -103,8 +191,24 @@ int main_mem(int argc, char *argv[])
opt->pen_clip3 = strtol(p+1, &p, 10);
} else if (c == 'R') {
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
+ } else if (c == 'H') {
+ if (optarg[0] != '@') {
+ FILE *fp;
+ if ((fp = fopen(optarg, "r")) != 0) {
+ char *buf;
+ buf = calloc(1, 0x10000);
+ while (fgets(buf, 0xffff, fp)) {
+ i = strlen(buf);
+ assert(buf[i-1] == '\n'); // a long line
+ buf[i-1] = 0;
+ hdr_line = bwa_insert_header(buf, hdr_line);
+ }
+ free(buf);
+ fclose(fp);
+ }
+ } else hdr_line = bwa_insert_header(optarg, hdr_line);
} else if (c == 'I') { // specify the insert size distribution
- pes0 = pes;
+ aux.pes0 = pes;
pes[1].failed = 0;
pes[1].avg = strtod(optarg, &p);
pes[1].std = pes[1].avg * .1;
@@ -123,6 +227,12 @@ int main_mem(int argc, char *argv[])
}
else return 1;
}
+
+ if (rg_line) {
+ hdr_line = bwa_insert_header(rg_line, hdr_line);
+ free(rg_line);
+ }
+
if (opt->n_threads < 1) opt->n_threads = 1;
if (optind + 1 >= argc || optind + 3 < argc) {
fprintf(stderr, "\n");
@@ -133,6 +243,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+ fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
@@ -141,24 +252,30 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, " -e discard full-length exact matches\n");
+ fprintf(stderr, "\nScoring options:\n\n");
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
- fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
+ fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n");
- fprintf(stderr, " pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n");
- fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n");
+ fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n");
+ fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n");
+ fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
+// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
fprintf(stderr, "\nInput/output options:\n\n");
- fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
+ fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
+ fprintf(stderr, " -H STR/FILE insert STR to header if it starts with @; or insert lines in FILE [null]\n");
+ fprintf(stderr, " -j treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
fprintf(stderr, "\n");
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
- fprintf(stderr, " -h INT if there are <INT hits with score >80%% of the max score, output all in XA [%d]\n", opt->max_hits);
+ fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt);
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
+ fprintf(stderr, " -V output the reference FASTA header in the XR tag\n");
fprintf(stderr, " -Y use soft clipping for supplementary alignments\n");
fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
@@ -173,23 +290,33 @@ int main_mem(int argc, char *argv[])
}
if (mode) {
- if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
- if (!opt0.a) opt->a = 2, opt0.a = 1;
- update_a(opt, &opt0);
- if (!opt0.o_del) opt->o_del = 2;
+ if (strcmp(mode, "intractg") == 0) {
+ if (!opt0.o_del) opt->o_del = 16;
+ if (!opt0.o_ins) opt->o_ins = 16;
+ if (!opt0.b) opt->b = 9;
+ if (!opt0.pen_clip5) opt->pen_clip5 = 5;
+ if (!opt0.pen_clip3) opt->pen_clip3 = 5;
+ } else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0 || strcmp(mode, "ont2d") == 0) {
+ if (!opt0.o_del) opt->o_del = 1;
if (!opt0.e_del) opt->e_del = 1;
- if (!opt0.o_ins) opt->o_ins = 2;
+ if (!opt0.o_ins) opt->o_ins = 1;
if (!opt0.e_ins) opt->e_ins = 1;
- if (!opt0.b) opt->b = 5;
+ if (!opt0.b) opt->b = 1;
if (opt0.split_factor == 0.) opt->split_factor = 10.;
- if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
- if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
+ if (strcmp(mode, "pbread") == 0) { // pacbio read-to-read setting; NOT working well!
opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG;
+ if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
if (!opt0.max_occ) opt->max_occ = 1000;
if (!opt0.min_seed_len) opt->min_seed_len = 13;
if (!opt0.max_chain_extend) opt->max_chain_extend = 25;
if (opt0.drop_ratio == 0.) opt->drop_ratio = .001;
+ } else if (strcmp(mode, "ont2d") == 0) {
+ if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
+ if (!opt0.min_seed_len) opt->min_seed_len = 14;
+ if (!opt0.pen_clip5) opt->pen_clip5 = 0;
+ if (!opt0.pen_clip3) opt->pen_clip3 = 0;
} else {
+ if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
if (!opt0.min_seed_len) opt->min_seed_len = 17;
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
if (!opt0.pen_clip3) opt->pen_clip3 = 0;
@@ -199,10 +326,16 @@ int main_mem(int argc, char *argv[])
return 1; // FIXME memory leak
}
} else update_a(opt, &opt0);
-// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF
bwa_fill_scmat(opt->a, opt->b, opt->mat);
- if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+ aux.idx = bwa_idx_load_from_shm(argv[optind]);
+ if (aux.idx == 0) {
+ if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+ } else if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
+ if (ignore_alt)
+ for (i = 0; i < aux.idx->bns->n_seqs; ++i)
+ aux.idx->bns->anns[i].is_alt = 0;
ko = kopen(argv[optind + 1], &fd);
if (ko == 0) {
@@ -210,11 +343,11 @@ int main_mem(int argc, char *argv[])
return 1;
}
fp = gzdopen(fd, "r");
- ks = kseq_init(fp);
+ aux.ks = kseq_init(fp);
if (optind + 2 < argc) {
if (opt->flag&MEM_F_PE) {
if (bwa_verbose >= 2)
- fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
+ fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
} else {
ko2 = kopen(argv[optind + 2], &fd2);
if (ko2 == 0) {
@@ -222,41 +355,21 @@ int main_mem(int argc, char *argv[])
return 1;
}
fp2 = gzdopen(fd2, "r");
- ks2 = kseq_init(fp2);
+ aux.ks2 = kseq_init(fp2);
opt->flag |= MEM_F_PE;
}
}
if (!(opt->flag & MEM_F_ALN_REG))
- bwa_print_sam_hdr(idx->bns, rg_line);
- while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
- int64_t size = 0;
- if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
- if (bwa_verbose >= 2)
- fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
- n = n>>1<<1;
- }
- if (!copy_comment)
- for (i = 0; i < n; ++i) {
- free(seqs[i].comment); seqs[i].comment = 0;
- }
- for (i = 0; i < n; ++i) size += seqs[i].l_seq;
- if (bwa_verbose >= 3)
- fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
- mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
- n_processed += n;
- for (i = 0; i < n; ++i) {
- if (seqs[i].sam) err_fputs(seqs[i].sam, stdout);
- free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
- }
- free(seqs);
- }
-
+ bwa_print_sam_hdr(aux.idx->bns, hdr_line);
+ aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
+ kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
+ free(hdr_line);
free(opt);
- bwa_idx_destroy(idx);
- kseq_destroy(ks);
+ bwa_idx_destroy(aux.idx);
+ kseq_destroy(aux.ks);
err_gzclose(fp); kclose(ko);
- if (ks2) {
- kseq_destroy(ks2);
+ if (aux.ks2) {
+ kseq_destroy(aux.ks2);
err_gzclose(fp2); kclose(ko2);
}
return 0;
@@ -264,7 +377,8 @@ int main_mem(int argc, char *argv[])
int main_fastmap(int argc, char *argv[])
{
- int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
+ int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
+ uint64_t max_intv = 0;
kseq_t *seq;
bwtint_t k;
gzFile fp;
@@ -272,16 +386,26 @@ int main_fastmap(int argc, char *argv[])
const bwtintv_v *a;
bwaidx_t *idx;
- while ((c = getopt(argc, argv, "w:l:p")) >= 0) {
+ while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
switch (c) {
case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
+ case 'i': min_intv = atoi(optarg); break;
+ case 'I': max_intv = atol(optarg); break;
+ case 'L': max_len = atoi(optarg); break;
default: return 1;
}
}
if (optind + 1 >= argc) {
- fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
+ fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
+ fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
+ fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
+ fprintf(stderr, " -l INT max MEM length [%d]\n", max_len);
+ fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
+ fprintf(stderr, "\n");
return 1;
}
@@ -289,6 +413,7 @@ int main_fastmap(int argc, char *argv[])
seq = kseq_init(fp);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
itr = smem_itr_init(idx->bwt);
+ smem_config(itr, min_intv, max_len, max_intv);
while (kseq_read(seq) >= 0) {
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
diff --git a/kseq.h b/kseq.h
index 642cd33..f3862c6 100644
--- a/kseq.h
+++ b/kseq.h
@@ -54,9 +54,9 @@
#define __KS_BASIC(type_t, __bufsize) \
static inline kstream_t *ks_init(type_t f) \
{ \
- kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
ks->f = f; \
- ks->buf = (unsigned char*)malloc(__bufsize); \
+ ks->buf = (unsigned char*)malloc(__bufsize); \
return ks; \
} \
static inline void ks_destroy(kstream_t *ks) \
@@ -74,8 +74,7 @@
if (ks->begin >= ks->end) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) return -1; \
+ if (ks->end == 0) { ks->is_eof = 1; return -1;} \
} \
return (int)ks->buf[ks->begin++]; \
}
@@ -95,17 +94,16 @@ typedef struct __kstring_t {
#define __KS_GETUNTIL(__read, __bufsize) \
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
+ int gotany = 0; \
if (dret) *dret = 0; \
str->l = append? str->l : 0; \
- if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) break; \
+ if (ks->end == 0) { ks->is_eof = 1; break; } \
} else break; \
} \
if (delimiter == KS_SEP_LINE) { \
@@ -124,8 +122,9 @@ typedef struct __kstring_t {
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
str->m = str->l + (i - ks->begin) + 1; \
kroundup32(str->m); \
- str->s = (char*)realloc(str->s, str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
} \
+ gotany = 1; \
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
str->l = str->l + (i - ks->begin); \
ks->begin = i + 1; \
@@ -134,6 +133,7 @@ typedef struct __kstring_t {
break; \
} \
} \
+ if (!gotany && ks_eof(ks)) return -1; \
if (str->s == 0) { \
str->m = 1; \
str->s = (char*)calloc(1, 1); \
@@ -155,7 +155,7 @@ typedef struct __kstring_t {
#define __KSEQ_BASIC(SCOPE, type_t) \
SCOPE kseq_t *kseq_init(type_t fd) \
{ \
- kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
s->f = ks_init(fd); \
return s; \
} \
diff --git a/kthread.c b/kthread.c
index a44426b..80f84cb 100644
--- a/kthread.c
+++ b/kthread.c
@@ -1,23 +1,30 @@
#include <pthread.h>
#include <stdlib.h>
+#include <limits.h>
+
+/************
+ * kt_for() *
+ ************/
struct kt_for_t;
typedef struct {
struct kt_for_t *t;
- int i;
+ long i;
} ktf_worker_t;
typedef struct kt_for_t {
- int n_threads, n;
+ int n_threads;
+ long n;
ktf_worker_t *w;
- void (*func)(void*,int,int);
+ void (*func)(void*,long,int);
void *data;
} kt_for_t;
-static inline int steal_work(kt_for_t *t)
+static inline long steal_work(kt_for_t *t)
{
- int i, k, min = 0x7fffffff, min_i = -1;
+ int i, min_i = -1;
+ long k, min = LONG_MAX;
for (i = 0; i < t->n_threads; ++i)
if (min > t->w[i].i) min = t->w[i].i, min_i = i;
k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
@@ -27,7 +34,7 @@ static inline int steal_work(kt_for_t *t)
static void *ktf_worker(void *data)
{
ktf_worker_t *w = (ktf_worker_t*)data;
- int i;
+ long i;
for (;;) {
i = __sync_fetch_and_add(&w->i, w->t->n_threads);
if (i >= w->t->n) break;
@@ -38,7 +45,7 @@ static void *ktf_worker(void *data)
pthread_exit(0);
}
-void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n)
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
{
int i;
kt_for_t t;
@@ -51,3 +58,86 @@ void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n)
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
}
+
+/*****************
+ * kt_pipeline() *
+ *****************/
+
+struct ktp_t;
+
+typedef struct {
+ struct ktp_t *pl;
+ int step, running;
+ void *data;
+} ktp_worker_t;
+
+typedef struct ktp_t {
+ void *shared;
+ void *(*func)(void*, int, void*);
+ int n_workers, n_steps;
+ ktp_worker_t *workers;
+ pthread_mutex_t mutex;
+ pthread_cond_t cv;
+} ktp_t;
+
+static void *ktp_worker(void *data)
+{
+ ktp_worker_t *w = (ktp_worker_t*)data;
+ ktp_t *p = w->pl;
+ while (w->step < p->n_steps) {
+ // test whether we can kick off the job with this worker
+ pthread_mutex_lock(&p->mutex);
+ for (;;) {
+ int i;
+ // test whether another worker is doing the same step
+ for (i = 0; i < p->n_workers; ++i) {
+ if (w == &p->workers[i]) continue; // ignore itself
+ if (p->workers[i].running && p->workers[i].step == w->step)
+ break;
+ }
+ if (i == p->n_workers) break; // no other workers doing w->step; then this worker will
+ pthread_cond_wait(&p->cv, &p->mutex);
+ }
+ w->running = 1;
+ pthread_mutex_unlock(&p->mutex);
+
+ // working on w->step
+ w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
+
+ // update step and let other workers know
+ pthread_mutex_lock(&p->mutex);
+ w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
+ w->running = 0;
+ pthread_cond_broadcast(&p->cv);
+ pthread_mutex_unlock(&p->mutex);
+ }
+ pthread_exit(0);
+}
+
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
+{
+ ktp_t aux;
+ pthread_t *tid;
+ int i;
+
+ if (n_threads < 1) n_threads = 1;
+ aux.n_workers = n_threads;
+ aux.n_steps = n_steps;
+ aux.func = func;
+ aux.shared = shared_data;
+ pthread_mutex_init(&aux.mutex, 0);
+ pthread_cond_init(&aux.cv, 0);
+
+ aux.workers = alloca(n_threads * sizeof(ktp_worker_t));
+ for (i = 0; i < n_threads; ++i) {
+ ktp_worker_t *w = &aux.workers[i];
+ w->step = w->running = 0; w->pl = &aux; w->data = 0;
+ }
+
+ tid = alloca(n_threads * sizeof(pthread_t));
+ for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
+ for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+
+ pthread_mutex_destroy(&aux.mutex);
+ pthread_cond_destroy(&aux.cv);
+}
diff --git a/main.c b/main.c
index b873b75..9f776d4 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.10-r789"
+#define PACKAGE_VERSION "0.7.12-r1039"
#endif
int bwa_fa2pac(int argc, char *argv[]);
@@ -22,11 +22,10 @@ int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]);
+int main_shm(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]);
-char *bwa_pg;
-
static int usage()
{
fprintf(stderr, "\n");
@@ -43,6 +42,7 @@ static int usage()
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, "\n");
+ fprintf(stderr, " shm manage indices in shared memory\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
@@ -59,6 +59,7 @@ static int usage()
int main(int argc, char *argv[])
{
+ extern char *bwa_pg;
int i, ret;
double t_real;
kstring_t pg = {0,0,0};
@@ -81,6 +82,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
+ else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
diff --git a/utils.c b/utils.c
index 00be7f0..2983261 100644
--- a/utils.c
+++ b/utils.c
@@ -219,6 +219,17 @@ int err_fputs(const char *s, FILE *stream)
return ret;
}
+int err_puts(const char *s)
+{
+ int ret = puts(s);
+ if (EOF == ret)
+ {
+ _err_fatal_simple("puts", strerror(errno));
+ }
+
+ return ret;
+}
+
int err_fflush(FILE *stream)
{
int ret = fflush(stream);
diff --git a/utils.h b/utils.h
index 5ef6ac4..11966b8 100644
--- a/utils.h
+++ b/utils.h
@@ -80,7 +80,7 @@ extern "C" {
int err_fputc(int c, FILE *stream);
#define err_putchar(C) err_fputc((C), stdout)
int err_fputs(const char *s, FILE *stream);
-#define err_puts(S) err_fputs((S), stdout)
+ int err_puts(const char *s);
int err_fflush(FILE *stream);
int err_fclose(FILE *stream);
int err_gzclose(gzFile file);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bwa.git
More information about the debian-med-commit
mailing list