[med-svn] [Git][med-team/bwa-mem2][master] 8 commits: tidy changelog
Michael R. Crusoe
gitlab at salsa.debian.org
Mon Mar 8 15:25:05 GMT 2021
Michael R. Crusoe pushed to branch master at Debian Med / bwa-mem2
Commits:
73e6f44a by Michael R. Crusoe at 2021-03-08T15:41:12+01:00
tidy changelog
- - - - -
1fafa249 by Michael R. Crusoe at 2021-03-08T15:51:21+01:00
debian/watch: fix target URL pattern
- - - - -
944b4a3d by Michael R. Crusoe at 2021-03-08T15:51:37+01:00
New upstream version 2.2
- - - - -
8cc24166 by Michael R. Crusoe at 2021-03-08T15:51:37+01:00
routine-update: New upstream version
- - - - -
af1a6ff7 by Michael R. Crusoe at 2021-03-08T15:51:38+01:00
Update upstream source from tag 'upstream/2.2'
Update to upstream version '2.2'
with Debian dir 641b535d170a3c4220914da80bf42481634eb5ca
- - - - -
97e79292 by Michael R. Crusoe at 2021-03-08T15:51:38+01:00
routine-update: Standards-Version: 4.5.1
- - - - -
dacaf997 by Michael R. Crusoe at 2021-03-08T15:56:07+01:00
freshen patches
- - - - -
a7f38675 by Michael R. Crusoe at 2021-03-08T15:57:47+01:00
clean up changelog
- - - - -
28 changed files:
- Makefile
- README.md
- debian/changelog
- debian/control
- debian/copyright
- debian/patches/hardening
- debian/patches/series
- debian/patches/simde
- − debian/patches/spelling
- debian/watch
- src/FMI_search.cpp
- src/FMI_search.h
- src/bandedSWA.cpp
- src/bwamem.cpp
- src/bwamem_pair.cpp
- src/kopen.cpp
- src/kseq.h
- src/kstring.h
- src/kswv.cpp
- src/kthread.cpp
- src/macro.h
- src/main.cpp
- + src/memcpy_bwamem.cpp
- + src/memcpy_bwamem.h
- src/profiling.cpp
- src/profiling.h
- src/runsimd.cpp
- test/fmi_test.cpp
Changes:
=====================================
Makefile
=====================================
@@ -28,6 +28,10 @@
## Heng Li <hli at jimmy.harvard.edu>
##*****************************************************************************************/
+ifneq ($(portable),)
+ STATIC_GCC=-static-libgcc -static-libstdc++
+endif
+
EXE= bwa-mem2
#CXX= icpc
ifeq ($(CXX), icpc)
@@ -35,24 +39,36 @@ ifeq ($(CXX), icpc)
else ifeq ($(CXX), g++)
CC=gcc
endif
-ARCH_FLAGS= -msse4.1
+ARCH_FLAGS= -msse -msse2 -msse3 -mssse3 -msse4.1
MEM_FLAGS= -DSAIS=1
-CPPFLAGS= -DENABLE_PREFETCH -DV17=1 $(MEM_FLAGS)
+CPPFLAGS+= -DENABLE_PREFETCH -DV17=1 -DMATE_SORT=0 $(MEM_FLAGS)
INCLUDES= -Isrc -Iext/safestringlib/include
-LIBS= -lpthread -lm -lz -L. -lbwa -Lext/safestringlib -lsafestring
-OBJS= src/fastmap.o src/bwtindex.o src/utils.o src/kthread.o \
+LIBS= -lpthread -lm -lz -L. -lbwa -Lext/safestringlib -lsafestring $(STATIC_GCC)
+OBJS= src/fastmap.o src/bwtindex.o src/utils.o src/memcpy_bwamem.o src/kthread.o \
src/kstring.o src/ksw.o src/bntseq.o src/bwamem.o src/profiling.o src/bandedSWA.o \
src/FMI_search.o src/read_index_ele.o src/bwamem_pair.o src/kswv.o src/bwa.o \
src/bwamem_extra.o src/kopen.o
BWA_LIB= libbwa.a
SAFE_STR_LIB= ext/safestringlib/libsafestring.a
-ifneq ($(portable),)
- LIBS+=-static-libgcc -static-libstdc++
-endif
-
-ifeq ($(arch),sse)
+ifeq ($(arch),sse41)
+ ifeq ($(CXX), icpc)
ARCH_FLAGS=-msse4.1
+ else
+ ARCH_FLAGS=-msse -msse2 -msse3 -mssse3 -msse4.1
+ endif
+else ifeq ($(arch),sse42)
+ ifeq ($(CXX), icpc)
+ ARCH_FLAGS=-msse4.2
+ else
+ ARCH_FLAGS=-msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2
+ endif
+else ifeq ($(arch),avx)
+ ifeq ($(CXX), icpc)
+ ARCH_FLAGS=-mavx ##-xAVX
+ else
+ ARCH_FLAGS=-mavx
+ endif
else ifeq ($(arch),avx2)
ifeq ($(CXX), icpc)
ARCH_FLAGS=-march=core-avx2 #-xCORE-AVX2
@@ -70,9 +86,11 @@ else ifeq ($(arch),native)
else ifneq ($(arch),)
# To provide a different architecture flag like -march=core-avx2.
ARCH_FLAGS=$(arch)
+else
+myall:multi
endif
-CXXFLAGS= -g -O3 -fpermissive $(ARCH_FLAGS) #-Wall ##-xSSE2
+CXXFLAGS+= -g -O3 -fpermissive $(ARCH_FLAGS) #-Wall ##-xSSE2
.PHONY:all clean depend multi
.SUFFIXES:.cpp .o
@@ -84,24 +102,29 @@ all:$(EXE)
multi:
rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
- $(MAKE) arch=sse EXE=bwa-mem2.sse41 CXX=$(CXX) all
+ $(MAKE) arch=sse41 EXE=bwa-mem2.sse41 CXX=$(CXX) all
+ rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
+ $(MAKE) arch=sse42 EXE=bwa-mem2.sse42 CXX=$(CXX) all
+ rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
+ $(MAKE) arch=avx EXE=bwa-mem2.avx CXX=$(CXX) all
rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
$(MAKE) arch=avx2 EXE=bwa-mem2.avx2 CXX=$(CXX) all
rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
$(MAKE) arch=avx512 EXE=bwa-mem2.avx512bw CXX=$(CXX) all
- $(CXX) -Wall -O3 src/runsimd.cpp -Iext/safestringlib/include -Lext/safestringlib/ -lsafestring -o bwa-mem2
+ $(CXX) -Wall -O3 src/runsimd.cpp -Iext/safestringlib/include -Lext/safestringlib/ -lsafestring $(STATIC_GCC) -o bwa-mem2
+
$(EXE):$(BWA_LIB) $(SAFE_STR_LIB) src/main.o
- $(CXX) $(CXXFLAGS) src/main.o $(BWA_LIB) $(LIBS) -o $@
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) src/main.o $(BWA_LIB) $(LIBS) -o $@
$(BWA_LIB):$(OBJS)
ar rcs $(BWA_LIB) $(OBJS)
$(SAFE_STR_LIB):
- cd ext/safestringlib/ && make clean && make CC=$(CC) directories libsafestring.a
+ cd ext/safestringlib/ && $(MAKE) clean && $(MAKE) CC=$(CC) directories libsafestring.a
clean:
- rm -fr src/*.o $(BWA_LIB) $(EXE) bwa-mem2.sse41 bwa-mem2.avx2 bwa-mem2.avx512bw
+ rm -fr src/*.o $(BWA_LIB) $(EXE) bwa-mem2.sse41 bwa-mem2.sse42 bwa-mem2.avx bwa-mem2.avx2 bwa-mem2.avx512bw
cd ext/safestringlib/ && $(MAKE) clean
depend:
@@ -147,3 +170,4 @@ src/profiling.o: src/macro.h
src/read_index_ele.o: src/read_index_ele.h src/utils.h src/bntseq.h
src/read_index_ele.o: src/macro.h
src/utils.o: src/utils.h src/ksort.h src/kseq.h
+src/memcpy_bwamem.o: src/memcpy_bwamem.h
=====================================
README.md
=====================================
@@ -1,6 +1,11 @@
+[![GitHub Downloads](https://img.shields.io/github/downloads/bwa-mem2/bwa-mem2/total?label=GitHub%20Downloads)](https://github.com/bwa-mem2/bwa-mem2/releases)
+[![BioConda Install](https://img.shields.io/conda/dn/bioconda/bwa-mem2?label=BioConda%20Installs)](https://anaconda.org/bioconda/bwa-mem2)
+
## Important Information
-***Index structure has changed since commit 6743183. Rebuild the Index if you are using a later commit.***
+***We are happy to announce that the index size on disk is down by 8 times and in memory by 4 times due to moving to only one type of FM-index (2bit.64 instead of 2bit.64 and 8bit.32) and 8x compression of suffix array. For example, for human genome, index size on disk is down to ~10GB from ~80GB and memory footprint is down to ~10GB from ~40GB.***
+***There is a substantial reduction in index IO time due to the reduction and hardly any performance impact on read mapping.***
+***Due to this change in index structure (in commit #4b59796, 10th October 2020), you will need to rebuild the index.***
***Added MC flag in the output sam file in commit a591e22. Output should match original bwa-mem version 0.7.17.***
@@ -49,7 +54,7 @@ the most efficient implementation based on the SIMD instruction set available
on the running machine. Precompiled binaries were generated on a CentOS6
machine using the following command line:
```sh
-make CXX=icpc multi
+make CXX=icpc
```
[bwa]: https://github.com/lh3/bwa
@@ -74,8 +79,8 @@ Where <prefix> is the prefix specified when creating the index or the path to th
## Performance
-Datasets:
-
+Datasets:
+Reference Genome: human_g1k_v37.fasta
Alias | Dataset source | No. of reads | Read length
--------- | --------- | --------- | ---------
@@ -92,12 +97,47 @@ OS: CentOS Linux release 7.6.1810
Memory: 100GB
+We followed the steps below to collect the performance results:
+A. Data download steps:
+1. Download SRA toolkit from https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software#header-global
+2. tar xfzv sratoolkit.2.10.5-centos_linux64.tar.gz
+3. Download D2: sratoolkit.2.10.5-centos_linux64/bin/fastq-dump --split-files SRR7733443
+4. Download D3: sratoolkit.2.10.5-centos_linux64/bin/fastq-dump --split-files SRR9932168
+5. Download D4: sratoolkit.2.10.5-centos_linux64/bin/fastq-dump --split-files SRX6999918
+
+
+
+B. Alignment steps:
+1. git clone https://github.com/bwa-mem2/bwa-mem2.git
+2. cd bwa-mem2
+3. ```make CXX=icpc``` (using intel C/C++ compiler)
+or ```make``` (using gcc compiler)
+4. ./bwa-mem2 index <ref.fa>
+5. ./bwa-mem2 mem [-t <#threads>] <ref.fa> <in_1.fastq> [<in_2.fastq>] > <output.sam>
+
+For example, in our double socket (56 threads each) and double numa compute node, we used the following command line to align D2 to human_g1k_v37.fasta reference genome.
+```
+numactl -m 0 -C 0-27,56-83 ./bwa-mem2 index human_g1k_v37.fasta
+numactl -m 0 -C 0-27,56-83 ./bwa-mem2 mem -t 56 human_g1k_v37.fasta SRR7733443_1.fastq SRR7733443_2.fastq > d3_align.sam
+```
+
<p align="center">
<img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-1.png" height="400"/a></br>
<img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-2.png" height="400"/a></br>
<img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-3.png" height="400"/a></br>
<img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-4.png" height="400"/a></br>
-</p>
+</p>
+
+## bwa-mem2 seeding speedup with Enumerated Radix Trees (Code in ert branch)
+
+The ert branch of bwa-mem2 repository contains codebase of enuerated radix tree based acceleration of bwa-mem2. The ert code is built on the top of bwa-mem2 (thanks to the hard work by @arun-sub).
+The following are the highlights of the ert based bwa-mem2 tool:
+1. Exact same output as bwa-mem(2)
+2. The tool has two additional flags to enable the use of ert solution (for index creation and mapping), else it runs in vanilla bwa-mem2 mode
+3. It uses 1 additional flag to create ert index (different from bwa-mem2 index) and 1 additional flag for using that ert index (please see the readme of ert branch)
+4. The ert solution is 10% - 30% faster (tested on above machine configuration) in comparison to vanilla bwa-mem2 -- users are adviced to use option `-K 1000000` to see the speedups
+5. The memory foot print of the ert index is ~60GB
+6. The code is present in ert branch: https://github.com/bwa-mem2/bwa-mem2/tree/ert
## Citation
=====================================
debian/changelog
=====================================
@@ -1,11 +1,5 @@
-bwa-mem2 (2.0-2) UNRELEASED; urgency=medium
-
- * Add autopkgtests.
-
- -- Michael R. Crusoe <crusoe at debian.org> Thu, 23 Jul 2020 09:44:17 +0200
-
-bwa-mem2 (2.0-1) unstable; urgency=medium
+bwa-mem2 (2.2-1) UNRELEASED; urgency=medium
* Initial release. (Closes: #950607)
- -- Michael R. Crusoe <crusoe at debian.org> Thu, 09 Jul 2020 14:31:25 +0200
+ -- Michael R. Crusoe <crusoe at debian.org> Mon, 08 Mar 2021 15:51:37 +0100
=====================================
debian/control
=====================================
@@ -8,7 +8,7 @@ Build-Depends: debhelper-compat (= 13),
zlib1g-dev,
libsimde-dev,
help2man
-Standards-Version: 4.5.0
+Standards-Version: 4.5.1
Vcs-Browser: https://salsa.debian.org/med-team/bwa-mem2
Vcs-Git: https://salsa.debian.org/med-team/bwa-mem2.git
Homepage: https://github.com/bwa-mem2/bwa-mem2
=====================================
debian/copyright
=====================================
@@ -2,6 +2,7 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: BWA-MEM2
Upstream-Contact: Heng Li <lh3 at me.com>
Source: https://github.com/bwa-mem2/bwa-mem2/
+Files-Excluded: .git
Files: *
Copyright: 2019 Vasimuddin Md <vasimuddin.md at intel.com>
=====================================
debian/patches/hardening
=====================================
@@ -3,43 +3,11 @@ Description: Enable proper flag appending so that Debian can harden the build.
Forwarded: https://github.com/bwa-mem2/bwa-mem2/pull/69
--- bwa-mem2.orig/Makefile
+++ bwa-mem2/Makefile
-@@ -37,7 +37,8 @@
- endif
- ARCH_FLAGS= -msse4.1
- MEM_FLAGS= -DSAIS=1
--CPPFLAGS= -DENABLE_PREFETCH -DV17=1 $(MEM_FLAGS)
-+ORIG_CPPFLAGS:=$(CPPFLAGS)
-+CPPFLAGS+= -DENABLE_PREFETCH -DV17=1 $(MEM_FLAGS)
- INCLUDES= -Isrc -Iext/safestringlib/include
- LIBS= -lpthread -lm -lz -L. -lbwa -Lext/safestringlib -lsafestring
- OBJS= src/fastmap.o src/bwtindex.o src/utils.o src/kthread.o \
-@@ -71,14 +72,14 @@
- # To provide a different architecture flag like -march=core-avx2.
- ARCH_FLAGS=$(arch)
- endif
--
--CXXFLAGS= -g -O3 -fpermissive $(ARCH_FLAGS) #-Wall ##-xSSE2
-+ORIG_CXXFLAGS:=$(CXXFLAGS)
-+CXXFLAGS += -g -O3 -fpermissive $(ARCH_FLAGS) #-Wall ##-xSSE2
+@@ -115,7 +115,7 @@
- .PHONY:all clean depend multi
- .SUFFIXES:.cpp .o
-
- .cpp.o:
-- $(CXX) -c $(CXXFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@
-+ $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(INCLUDES) $< -o $@
-
- all:$(EXE)
-
-@@ -89,10 +90,10 @@
- $(MAKE) arch=avx2 EXE=bwa-mem2.avx2 CXX=$(CXX) all
- rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
- $(MAKE) arch=avx512 EXE=bwa-mem2.avx512bw CXX=$(CXX) all
-- $(CXX) -Wall -O3 src/runsimd.cpp -Iext/safestringlib/include -Lext/safestringlib/ -lsafestring -o bwa-mem2
-+ $(CXX) $(ORIG_CXXFLAGS) $(ORIG_CPPFLAGS) $(LDFLAGS) -Wall -O3 src/runsimd.cpp -Iext/safestringlib/include -Lext/safestringlib/ -lsafestring -o bwa-mem2
$(EXE):$(BWA_LIB) $(SAFE_STR_LIB) src/main.o
-- $(CXX) $(CXXFLAGS) src/main.o $(BWA_LIB) $(LIBS) -o $@
+- $(CXX) $(CXXFLAGS) $(LDFLAGS) src/main.o $(BWA_LIB) $(LIBS) -o $@
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) src/main.o $(BWA_LIB) $(LIBS) -o $@
$(BWA_LIB):$(OBJS)
=====================================
debian/patches/series
=====================================
@@ -1,3 +1,2 @@
hardening
-spelling
simde
=====================================
debian/patches/simde
=====================================
@@ -173,7 +173,7 @@ Forwarded: https://github.com/bwa-mem2/bwa-mem2/pull/84
: "0" (func_id), "2" (subfunc_id));
--- bwa-mem2.orig/src/bwamem.cpp
+++ bwa-mem2/src/bwamem.cpp
-@@ -1861,7 +1861,7 @@
+@@ -1946,7 +1946,7 @@
{
int32_t i;
@@ -184,38 +184,28 @@ Forwarded: https://github.com/bwa-mem2/bwa-mem2/pull/84
__m512i zero512 = _mm512_setzero_si512();
--- bwa-mem2.orig/src/main.cpp
+++ bwa-mem2/src/main.cpp
-@@ -81,10 +81,13 @@
- #if ((!__AVX512BW__) & (__AVX2__))
- fprintf(stderr, "Executing in AVX2 mode!!\n");
- #endif
--#if ((!__AVX512BW__) && (!__AVX2__) && (__SSE2__))
-+#if ((!__AVX512BW__) && (!__AVX2__) && (__SSE4_1__))
- fprintf(stderr, "Executing in SSE4.1 mode!!\n");
- #endif
--#if ((!__AVX512BW__) && (!__AVX2__) && (!__SSE2__))
-+#if ((!__AVX512BW__) && (!__AVX2__) && (!__SSE4_1__) && (__SSE2__))
-+ fprintf(stderr, "Executing in SSE2 mode!!\n");
-+#endif
-+#if ((!__AVX512BW__) && (!__AVX2__) && (!__SSE4_1__) && (!__SSE2__))
- fprintf(stderr, "Executing in Scalar mode!!\n");
- #endif
- fprintf(stderr, "-----------------------------\n");
+@@ -85,6 +85,8 @@
+ fprintf(stderr, "Executing in SSE4.2 mode!!\n");
+ #elif __SSE4_1__
+ fprintf(stderr, "Executing in SSE4.1 mode!!\n");
++#elif __SSE2__
++ fprintf(stderr, "Executing in SSE2!!\n");
+ #endif
+ fprintf(stderr, "-----------------------------\n");
+
--- bwa-mem2.orig/src/bandedSWA.cpp
+++ bwa-mem2/src/bandedSWA.cpp
-@@ -4120,16 +4120,6 @@
- return;
- }
+@@ -4148,13 +4148,6 @@
--/********************************************************************************/
--/* SSE2 - 8 bit version */
+ /********************************************************************************/
+ /* SSE2 - 8 bit version */
-#ifndef __SSE4_1__
-static inline __m128i _mm_blendv_epi8 (__m128i x, __m128i y, __m128i mask)
-{
-- // Replace bit in x with bit in y when matching bit in mask is set:
-- return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y));
+- // Replace bit in x with bit in y when matching bit in mask is set:
+- return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y));
-}
-#endif
--
- #define ZSCORE8(i4_128, y4_128) \
- { \
- __m128i tmpi = _mm_sub_epi8(i4_128, x128); \
+
+ #define ZSCORE8(i4_128, y4_128) \
+ { \
=====================================
debian/patches/spelling deleted
=====================================
@@ -1,14 +0,0 @@
-From: Michael R. Crusoe <crusoe at debian.org>
-Subject: fix a typo
-Forwarded: https://github.com/bwa-mem2/bwa-mem2/pull/83
---- bwa-mem2.orig/src/profiling.cpp
-+++ bwa-mem2/src/profiling.cpp
-@@ -57,7 +57,7 @@
- uint64_t max, min;
- double avg;
- fprintf(stderr, "No. of OMP threads: %d\n", nthreads);
-- fprintf(stderr, "Processor is runnig @%lf MHz\n", proc_freq*1.0/1e6);
-+ fprintf(stderr, "Processor is running @%lf MHz\n", proc_freq*1.0/1e6);
- fprintf(stderr, "Runtime profile:\n");
-
- fprintf(stderr, "\n\tTime taken for main_mem function: %0.2lf sec\n\n",
=====================================
debian/watch
=====================================
@@ -1,3 +1,3 @@
version=4
-opts=filenamemangle=s/.+\/v?(\d\S+)\.tar\.gz/bwa-mem2-$1\.tar\.gz/,uversionmangle=s/pre/~pre/ \
- https://github.com/@PACKAGE@/@PACKAGE@/releases .*/download/v at ANY_VERSION@/Source_code_including_submodule at ARCHIVE_EXT@
+opts="filenamemangle=s%(?:.*?)Source_code_including_submodules\.tar\.gz%@PACKAGE at -$1.tar.gz%,uversionmangle=s/pre/~pre/" \
+ https://github.com/@PACKAGE@/@PACKAGE@/releases .*/download/v at ANY_VERSION@/Source_code_including_submodules at ARCHIVE_EXT@
=====================================
src/FMI_search.cpp
=====================================
@@ -30,10 +30,12 @@ Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at i
#include <stdio.h>
#include "sais.h"
#include "FMI_search.h"
+#include "memcpy_bwamem.h"
+#include "profiling.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-#include "safe_mem_lib.h"
#include "safe_str_lib.h"
#ifdef __cplusplus
}
@@ -50,9 +52,7 @@ FMI_search::FMI_search(const char *fname)
sa_ls_word = NULL;
sa_ms_byte = NULL;
cp_occ = NULL;
-#if ((__AVX2__))
- c_bcast_array = NULL;
-#endif
+ one_hot_mask_array = NULL;
}
FMI_search::~FMI_search()
@@ -63,10 +63,8 @@ FMI_search::~FMI_search()
_mm_free(sa_ls_word);
if(cp_occ)
_mm_free(cp_occ);
-#if ((__AVX2__))
- if(c_bcast_array)
- _mm_free(c_bcast_array);
-#endif
+ if(one_hot_mask_array)
+ _mm_free(one_hot_mask_array);
}
int64_t FMI_search::pac_seq_len(const char *fn_pac)
@@ -143,14 +141,15 @@ void FMI_search::pac2nt(const char *fn_pac, std::string &reference_seq)
free(buf2);
}
-int FMI_search::build_fm_index_avx(const char *ref_file_name, char *binary_seq, int64_t ref_seq_len, int64_t *sa_bwt, int64_t *count) {
+int FMI_search::build_fm_index(const char *ref_file_name, char *binary_seq, int64_t ref_seq_len, int64_t *sa_bwt, int64_t *count) {
printf("ref_seq_len = %ld\n", ref_seq_len);
fflush(stdout);
char outname[PATH_MAX];
+
strcpy_s(outname, PATH_MAX, ref_file_name);
- strcat_s(outname, PATH_MAX, CP_FILENAME_SUFFIX_AVX);
- //sprintf(outname, "%s.bwt.8bit.%d", ref_file_name, CP_BLOCK_SIZE_AVX);
+ strcat_s(outname, PATH_MAX, CP_FILENAME_SUFFIX);
+ //sprintf(outname, "%s.bwt.2bit.%d", ref_file_name, CP_BLOCK_SIZE);
std::fstream outstream (outname, std::ios::out | std::ios::binary);
outstream.seekg(0);
@@ -165,18 +164,19 @@ int FMI_search::build_fm_index_avx(const char *ref_file_name, char *binary_seq,
outstream.write((char*)count, 5 * sizeof(int64_t));
int64_t i;
- int64_t ref_seq_len_aligned = ((ref_seq_len + CP_BLOCK_SIZE_AVX - 1) / CP_BLOCK_SIZE_AVX) * CP_BLOCK_SIZE_AVX;
+ int64_t ref_seq_len_aligned = ((ref_seq_len + CP_BLOCK_SIZE - 1) / CP_BLOCK_SIZE) * CP_BLOCK_SIZE;
int64_t size = ref_seq_len_aligned * sizeof(uint8_t);
bwt = (uint8_t *)_mm_malloc(size, 64);
- index_alloc += size;
assert_not_null(bwt, size, index_alloc);
+ int64_t sentinel_index = -1;
for(i=0; i< ref_seq_len; i++)
{
if(sa_bwt[i] == 0)
{
bwt[i] = 4;
printf("BWT[%ld] = 4\n", i);
+ sentinel_index = i;
}
else
{
@@ -201,188 +201,82 @@ int FMI_search::build_fm_index_avx(const char *ref_file_name, char *binary_seq,
bwt[i] = DUMMY_CHAR;
- printf("CP_SHIFT = %d, CP_MASK = %d\n", CP_SHIFT_AVX, CP_MASK_AVX);
- printf("sizeof CP_OCC = %ld\n", sizeof(CP_OCC_AVX));
+ printf("CP_SHIFT = %d, CP_MASK = %d\n", CP_SHIFT, CP_MASK);
+ printf("sizeof CP_OCC = %ld\n", sizeof(CP_OCC));
fflush(stdout);
// create checkpointed occ
- int64_t cp_occ_size = (ref_seq_len >> CP_SHIFT_AVX) + 1;
- CP_OCC_AVX *cp_occ = NULL;
+ int64_t cp_occ_size = (ref_seq_len >> CP_SHIFT) + 1;
+ CP_OCC *cp_occ = NULL;
- size = cp_occ_size * sizeof(CP_OCC_AVX);
- cp_occ = (CP_OCC_AVX *)_mm_malloc(size, 64);
- index_alloc += size;
+ size = cp_occ_size * sizeof(CP_OCC);
+ cp_occ = (CP_OCC *)_mm_malloc(size, 64);
assert_not_null(cp_occ, size, index_alloc);
- memset(cp_occ, 0, cp_occ_size * sizeof(CP_OCC_AVX));
+ memset(cp_occ, 0, cp_occ_size * sizeof(CP_OCC));
int64_t cp_count[16];
memset(cp_count, 0, 16 * sizeof(int64_t));
for(i = 0; i < ref_seq_len; i++)
{
- if((i & CP_MASK_AVX) == 0)
+ if((i & CP_MASK) == 0)
{
- CP_OCC_AVX cpo;
+ CP_OCC cpo;
cpo.cp_count[0] = cp_count[0];
cpo.cp_count[1] = cp_count[1];
cpo.cp_count[2] = cp_count[2];
cpo.cp_count[3] = cp_count[3];
- memcpy_s(cpo.bwt_str, CP_BLOCK_SIZE_AVX * sizeof(uint8_t), bwt + i, CP_BLOCK_SIZE_AVX * sizeof(uint8_t));
- cp_occ[i >> CP_SHIFT_AVX] = cpo;
+ int32_t j;
+ cpo.one_hot_bwt_str[0] = 0;
+ cpo.one_hot_bwt_str[1] = 0;
+ cpo.one_hot_bwt_str[2] = 0;
+ cpo.one_hot_bwt_str[3] = 0;
+
+ for(j = 0; j < CP_BLOCK_SIZE; j++)
+ {
+ cpo.one_hot_bwt_str[0] = cpo.one_hot_bwt_str[0] << 1;
+ cpo.one_hot_bwt_str[1] = cpo.one_hot_bwt_str[1] << 1;
+ cpo.one_hot_bwt_str[2] = cpo.one_hot_bwt_str[2] << 1;
+ cpo.one_hot_bwt_str[3] = cpo.one_hot_bwt_str[3] << 1;
+ uint8_t c = bwt[i + j];
+ //printf("c = %d\n", c);
+ if(c < 4)
+ {
+ cpo.one_hot_bwt_str[c] += 1;
+ }
+ }
+
+ cp_occ[i >> CP_SHIFT] = cpo;
}
cp_count[bwt[i]]++;
}
- outstream.write((char*)cp_occ, cp_occ_size * sizeof(CP_OCC_AVX));
+ outstream.write((char*)cp_occ, cp_occ_size * sizeof(CP_OCC));
_mm_free(cp_occ);
_mm_free(bwt);
- index_alloc -= (ref_seq_len_aligned * sizeof(uint8_t) + cp_occ_size * sizeof(CP_OCC_AVX));
- size = ref_seq_len * sizeof(uint32_t);
+ #if SA_COMPRESSION
+
+ size = ((ref_seq_len >> SA_COMPX)+ 1) * sizeof(uint32_t);
uint32_t *sa_ls_word = (uint32_t *)_mm_malloc(size, 64);
- index_alloc += size;
assert_not_null(sa_ls_word, size, index_alloc);
- size = ref_seq_len * sizeof(int8_t);
+ size = ((ref_seq_len >> SA_COMPX) + 1) * sizeof(int8_t);
int8_t *sa_ms_byte = (int8_t *)_mm_malloc(size, 64);
- index_alloc += size;
assert_not_null(sa_ms_byte, size, index_alloc);
+ int64_t pos = 0;
for(i = 0; i < ref_seq_len; i++)
{
- sa_ls_word[i] = sa_bwt[i] & 0xffffffff;
- sa_ms_byte[i] = (sa_bwt[i] >> 32) & 0xff;
- }
- outstream.write((char*)sa_ms_byte, ref_seq_len * sizeof(int8_t));
- outstream.write((char*)sa_ls_word, ref_seq_len * sizeof(uint32_t));
- outstream.close();
- printf("max_occ_ind = %ld\n", i >> CP_SHIFT_AVX);
- fflush(stdout);
-
- _mm_free(sa_ms_byte);
- _mm_free(sa_ls_word);
- return 0;
-}
-
-int FMI_search::build_fm_index_scalar(const char *ref_file_name, char *binary_seq, int64_t ref_seq_len, int64_t *sa_bwt, int64_t *count) {
- printf("ref_seq_len = %ld\n", ref_seq_len);
- fflush(stdout);
-
- char outname[PATH_MAX];
-
- strcpy_s(outname, PATH_MAX, ref_file_name);
- strcat_s(outname, PATH_MAX, CP_FILENAME_SUFFIX_SCALAR);
- //sprintf(outname, "%s.bwt.2bit.%d", ref_file_name, CP_BLOCK_SIZE_SCALAR);
-
- std::fstream outstream (outname, std::ios::out | std::ios::binary);
- outstream.seekg(0);
-
- printf("count = %ld, %ld, %ld, %ld, %ld\n", count[0], count[1], count[2], count[3], count[4]);
- fflush(stdout);
-
- uint8_t *bwt;
-
- ref_seq_len++;
- outstream.write((char *)(&ref_seq_len), 1 * sizeof(int64_t));
- outstream.write((char*)count, 5 * sizeof(int64_t));
-
- int64_t i;
- int64_t ref_seq_len_aligned = ((ref_seq_len + CP_BLOCK_SIZE_SCALAR - 1) / CP_BLOCK_SIZE_SCALAR) * CP_BLOCK_SIZE_SCALAR;
- int64_t size = ref_seq_len_aligned * sizeof(uint8_t);
- bwt = (uint8_t *)_mm_malloc(size, 64);
- assert_not_null(bwt, size, index_alloc);
-
- for(i=0; i< ref_seq_len; i++)
- {
- if(sa_bwt[i] == 0)
- {
- bwt[i] = 4;
- printf("BWT[%ld] = 4\n", i);
- }
- else
- {
- char c = binary_seq[sa_bwt[i]-1];
- switch(c)
- {
- case 0: bwt[i] = 0;
- break;
- case 1: bwt[i] = 1;
- break;
- case 2: bwt[i] = 2;
- break;
- case 3: bwt[i] = 3;
- break;
- default:
- fprintf(stderr, "ERROR! i = %ld, c = %c\n", i, c);
- exit(EXIT_FAILURE);
- }
- }
- }
- for(i = ref_seq_len; i < ref_seq_len_aligned; i++)
- bwt[i] = DUMMY_CHAR;
-
-
- printf("CP_SHIFT = %d, CP_MASK = %d\n", CP_SHIFT_SCALAR, CP_MASK_SCALAR);
- printf("sizeof CP_OCC = %ld\n", sizeof(CP_OCC_SCALAR));
- fflush(stdout);
- // create checkpointed occ
- int64_t cp_occ_size = (ref_seq_len >> CP_SHIFT_SCALAR) + 1;
- CP_OCC_SCALAR *cp_occ = NULL;
-
- size = cp_occ_size * sizeof(CP_OCC_SCALAR);
- cp_occ = (CP_OCC_SCALAR *)_mm_malloc(size, 64);
- assert_not_null(cp_occ, size, index_alloc);
- memset(cp_occ, 0, cp_occ_size * sizeof(CP_OCC_SCALAR));
- int64_t cp_count[16];
-
- memset(cp_count, 0, 16 * sizeof(int64_t));
- for(i = 0; i < ref_seq_len; i++)
- {
- if((i & CP_MASK_SCALAR) == 0)
+ if ((i & SA_COMPX_MASK) == 0)
{
- CP_OCC_SCALAR cpo;
- cpo.cp_count[0] = cp_count[0];
- cpo.cp_count[1] = cp_count[1];
- cpo.cp_count[2] = cp_count[2];
- cpo.cp_count[3] = cp_count[3];
-
- BIT_DATA_TYPE bwt_str_bit0 = 0;
- BIT_DATA_TYPE bwt_str_bit1 = 0;
- BIT_DATA_TYPE dollar_mask = 0;
- int32_t j;
- for(j = 0; j < CP_BLOCK_SIZE_SCALAR; j++)
- {
- uint8_t c = bwt[i + j];
- if((c == 4) || (c == DUMMY_CHAR))
- {
- dollar_mask <<= 1;
- dollar_mask += 1;
- c = 0;
- }
- else if(c > 3)
- {
- fprintf(stderr, "ERROR! [%ld, %d] c = %u\n", (long)i, j, c);
- exit(EXIT_FAILURE);
- }
- else
- {
- dollar_mask <<= 1;
- dollar_mask += 0;
- }
- bwt_str_bit0 = bwt_str_bit0 << 1;
- bwt_str_bit0 += (c & 1);
- bwt_str_bit1 = bwt_str_bit1 << 1;
- bwt_str_bit1 += ((c >> 1) & 1);
- }
- cpo.bwt_str_bit0 = bwt_str_bit0;
- cpo.bwt_str_bit1 = bwt_str_bit1;
- cpo.dollar_mask = dollar_mask;
-
- memset(cpo.pad, 0, PADDING_SCALAR);
- cp_occ[i >> CP_SHIFT_SCALAR] = cpo;
+ sa_ls_word[pos] = sa_bwt[i] & 0xffffffff;
+ sa_ms_byte[pos] = (sa_bwt[i] >> 32) & 0xff;
+ pos++;
}
- cp_count[bwt[i]]++;
}
- outstream.write((char*)cp_occ, cp_occ_size * sizeof(CP_OCC_SCALAR));
- _mm_free(cp_occ);
- _mm_free(bwt);
-
+ fprintf(stderr, "pos: %d, ref_seq_len__: %ld\n", pos, ref_seq_len >> SA_COMPX);
+ outstream.write((char*)sa_ms_byte, ((ref_seq_len >> SA_COMPX) + 1) * sizeof(int8_t));
+ outstream.write((char*)sa_ls_word, ((ref_seq_len >> SA_COMPX) + 1) * sizeof(uint32_t));
+
+ #else
+
size = ref_seq_len * sizeof(uint32_t);
uint32_t *sa_ls_word = (uint32_t *)_mm_malloc(size, 64);
assert_not_null(sa_ls_word, size, index_alloc);
@@ -396,8 +290,12 @@ int FMI_search::build_fm_index_scalar(const char *ref_file_name, char *binary_se
}
outstream.write((char*)sa_ms_byte, ref_seq_len * sizeof(int8_t));
outstream.write((char*)sa_ls_word, ref_seq_len * sizeof(uint32_t));
+
+ #endif
+
+ outstream.write((char *)(&sentinel_index), 1 * sizeof(int64_t));
outstream.close();
- printf("max_occ_ind = %ld\n", i >> CP_SHIFT_SCALAR);
+ printf("max_occ_ind = %ld\n", i >> CP_SHIFT);
fflush(stdout);
_mm_free(sa_ms_byte);
@@ -473,11 +371,11 @@ int FMI_search::build_index() {
//status = saisxx<const char *, int64_t *, int64_t>(reference_seq.c_str(), suffix_array + 1, pac_len, 4);
status = saisxx(reference_seq.c_str(), suffix_array + 1, pac_len);
suffix_array[0] = pac_len;
- fprintf(stderr, "build index ticks = %llu\n", __rdtsc() - startTick);
+ fprintf(stderr, "build suffix-array ticks = %llu\n", __rdtsc() - startTick);
startTick = __rdtsc();
- build_fm_index_avx(prefix, binary_ref_seq, pac_len, suffix_array, count);
- build_fm_index_scalar(prefix, binary_ref_seq, pac_len, suffix_array, count);
+ build_fm_index(prefix, binary_ref_seq, pac_len, suffix_array, count);
+ fprintf(stderr, "build fm-index ticks = %llu\n", __rdtsc() - startTick);
_mm_free(binary_ref_seq);
_mm_free(suffix_array);
return 0;
@@ -485,17 +383,22 @@ int FMI_search::build_index() {
void FMI_search::load_index()
{
+ one_hot_mask_array = (uint64_t *)_mm_malloc(64 * sizeof(uint64_t), 64);
+ one_hot_mask_array[0] = 0;
+ uint64_t base = 0x8000000000000000L;
+ one_hot_mask_array[1] = base;
+ int64_t i = 0;
+ for(i = 2; i < 64; i++)
+ {
+ one_hot_mask_array[i] = (one_hot_mask_array[i - 1] >> 1) | base;
+ }
+
char *ref_file_name = file_name;
//beCalls = 0;
char cp_file_name[PATH_MAX];
strcpy_s(cp_file_name, PATH_MAX, ref_file_name);
strcat_s(cp_file_name, PATH_MAX, CP_FILENAME_SUFFIX);
-// assert(strnlen(ref_file_name, 1000) + 12 < 1000);
-//#if ((!__AVX2__))
-// sprintf(cp_file_name, "%s.bwt.2bit.%d", ref_file_name, CP_BLOCK_SIZE_SCALAR);
-//#else
-// sprintf(cp_file_name, "%s.bwt.8bit.%d", ref_file_name, CP_BLOCK_SIZE_AVX);
-//#endif
+
// Read the BWT and FM index of the reference sequence
FILE *cpstream = NULL;
cpstream = fopen(cp_file_name,"rb");
@@ -531,19 +434,50 @@ void FMI_search::load_index()
{
count[ii] = count[ii] + 1;
}
+
+ #if SA_COMPRESSION
+
+ int64_t reference_seq_len_ = (reference_seq_len >> SA_COMPX) + 1;
+ sa_ms_byte = (int8_t *)_mm_malloc(reference_seq_len_ * sizeof(int8_t), 64);
+ sa_ls_word = (uint32_t *)_mm_malloc(reference_seq_len_ * sizeof(uint32_t), 64);
+ err_fread_noeof(sa_ms_byte, sizeof(int8_t), reference_seq_len_, cpstream);
+ err_fread_noeof(sa_ls_word, sizeof(uint32_t), reference_seq_len_, cpstream);
+
+ #else
+
sa_ms_byte = (int8_t *)_mm_malloc(reference_seq_len * sizeof(int8_t), 64);
sa_ls_word = (uint32_t *)_mm_malloc(reference_seq_len * sizeof(uint32_t), 64);
err_fread_noeof(sa_ms_byte, sizeof(int8_t), reference_seq_len, cpstream);
err_fread_noeof(sa_ls_word, sizeof(uint32_t), reference_seq_len, cpstream);
- fclose(cpstream);
+
+ #endif
sentinel_index = -1;
+ #if SA_COMPRESSION
+ err_fread_noeof(&sentinel_index, sizeof(int64_t), 1, cpstream);
+ fprintf(stderr, "* sentinel-index: %ld\n", sentinel_index);
+ #endif
+ fclose(cpstream);
+
int64_t x;
+ #if !SA_COMPRESSION
for(x = 0; x < reference_seq_len; x++)
{
- if(get_sa_entry(x) == 0)
+ // fprintf(stderr, "x: %ld\n", x);
+ #if SA_COMPRESSION
+ if(get_sa_entry_compressed(x) == 0) {
sentinel_index = x;
+ break;
+ }
+ #else
+ if(get_sa_entry(x) == 0) {
+ sentinel_index = x;
+ break;
+ }
+ #endif
}
+ fprintf(stderr, "\nsentinel_index: %ld\n", x);
+ #endif
fprintf(stderr, "* Count:\n");
for(x = 0; x < 5; x++)
@@ -556,26 +490,6 @@ void FMI_search::load_index()
ref_file_name);
bwa_idx_load_ele(ref_file_name, BWA_IDX_ALL);
-#if ((!__AVX2__))
- base_mask[0][0] = 0;
- base_mask[0][1] = 0;
- base_mask[1][0] = 0xffffffffffffffffL;
- base_mask[1][1] = 0;
- base_mask[2][0] = 0;
- base_mask[2][1] = 0xffffffffffffffffL;
- base_mask[3][0] = 0xffffffffffffffffL;
- base_mask[3][1] = 0xffffffffffffffffL;
-#else
- c_bcast_array = (uint8_t *)_mm_malloc(256 * sizeof(uint8_t), 64);
- for(ii = 0; ii < 4; ii++)
- {
- int32_t j;
- for(j = 0; j < 64; j++)
- {
- c_bcast_array[ii * 64 + j] = ii;
- }
- }
-#endif
fprintf(stderr, "* Done reading Index!!\n");
}
@@ -1118,15 +1032,8 @@ SMEM FMI_search::backwardExt(SMEM smem, uint8_t a)
{
int64_t sp = (int64_t)(smem.k);
int64_t ep = (int64_t)(smem.k) + (int64_t)(smem.s);
-#if ((!__AVX2__))
- GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, bwt_str_bit0_sp, bwt_str_bit1_sp, bit0_cmp_sp, bit1_cmp_sp, mismatch_mask_sp);
- GET_OCC(ep, b, occ_id_ep, y_ep, occ_ep, bwt_str_bit0_ep, bwt_str_bit1_ep, bit0_cmp_ep, bit1_cmp_ep, mismatch_mask_ep);
-#else
- __m256i b256;
- b256 = _mm256_load_si256((const __m256i *)(c_bcast_array + b * 64));
- GET_OCC(sp, b, b256, occ_id_sp, y_sp, occ_sp, bwt_str_sp, bwt_sp_vec, mask_sp_vec, mask_sp);
- GET_OCC(ep, b, b256, occ_id_ep, y_ep, occ_ep, bwt_str_ep, bwt_ep_vec, mask_ep_vec, mask_ep);
-#endif
+ GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, one_hot_bwt_str_c_sp, match_mask_sp);
+ GET_OCC(ep, b, occ_id_ep, y_ep, occ_ep, one_hot_bwt_str_c_ep, match_mask_ep);
k[b] = count[b] + occ_sp;
s[b] = occ_ep - occ_sp;
}
@@ -1191,3 +1098,278 @@ void FMI_search::get_sa_entries(SMEM *smemArray, int64_t *coordArray, int32_t *c
totalCoordCount += c;
}
}
+
+// sa_compression
+int64_t FMI_search::get_sa_entry_compressed(int64_t pos, int tid)
+{
+ if ((pos & SA_COMPX_MASK) == 0) {
+
+ #if SA_COMPRESSION
+ int64_t sa_entry = sa_ms_byte[pos >> SA_COMPX];
+ #else
+ int64_t sa_entry = sa_ms_byte[pos]; // simulation
+ #endif
+
+ sa_entry = sa_entry << 32;
+
+ #if SA_COMPRESSION
+ sa_entry = sa_entry + sa_ls_word[pos >> SA_COMPX];
+ #else
+ sa_entry = sa_entry + sa_ls_word[pos]; // simulation
+ #endif
+
+ return sa_entry;
+ }
+ else {
+ // tprof[MEM_CHAIN][tid] ++;
+ int64_t offset = 0;
+ int64_t sp = pos;
+ while(true)
+ {
+ int64_t occ_id_pp_ = sp >> CP_SHIFT;
+ int64_t y_pp_ = CP_BLOCK_SIZE - (sp & CP_MASK) - 1;
+ uint64_t *one_hot_bwt_str = cp_occ[occ_id_pp_].one_hot_bwt_str;
+ uint8_t b;
+
+ if((one_hot_bwt_str[0] >> y_pp_) & 1)
+ b = 0;
+ else if((one_hot_bwt_str[1] >> y_pp_) & 1)
+ b = 1;
+ else if((one_hot_bwt_str[2] >> y_pp_) & 1)
+ b = 2;
+ else if((one_hot_bwt_str[3] >> y_pp_) & 1)
+ b = 3;
+ else
+ b = 4;
+
+ if (b == 4) {
+ return offset;
+ }
+
+ GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, one_hot_bwt_str_c_sp, match_mask_sp);
+
+ sp = count[b] + occ_sp;
+
+ offset ++;
+ // tprof[ALIGN1][tid] ++;
+ if ((sp & SA_COMPX_MASK) == 0) break;
+ }
+ // assert((reference_seq_len >> SA_COMPX) - 1 >= (sp >> SA_COMPX));
+ #if SA_COMPRESSION
+ int64_t sa_entry = sa_ms_byte[sp >> SA_COMPX];
+ #else
+ int64_t sa_entry = sa_ms_byte[sp]; // simultion
+ #endif
+
+ sa_entry = sa_entry << 32;
+
+ #if SA_COMPRESSION
+ sa_entry = sa_entry + sa_ls_word[sp >> SA_COMPX];
+ #else
+ sa_entry = sa_entry + sa_ls_word[sp]; // simulation
+ #endif
+
+ sa_entry += offset;
+ return sa_entry;
+ }
+}
+
+void FMI_search::get_sa_entries(SMEM *smemArray, int64_t *coordArray, int32_t *coordCountArray, uint32_t count, int32_t max_occ, int tid)
+{
+
+ uint32_t i;
+ int32_t totalCoordCount = 0;
+ for(i = 0; i < count; i++)
+ {
+ int32_t c = 0;
+ SMEM smem = smemArray[i];
+ int64_t hi = smem.k + smem.s;
+ int64_t step = (smem.s > max_occ) ? smem.s / max_occ : 1;
+ int64_t j;
+ for(j = smem.k; (j < hi) && (c < max_occ); j+=step, c++)
+ {
+ int64_t pos = j;
+ int64_t sa_entry = get_sa_entry_compressed(pos, tid);
+ coordArray[totalCoordCount + c] = sa_entry;
+ }
+ // coordCountArray[i] = c;
+ *coordCountArray += c;
+ totalCoordCount += c;
+ }
+}
+
+// SA_COPMRESSION w/ PREFETCH
+int64_t FMI_search::call_one_step(int64_t pos, int64_t &sa_entry, int64_t &offset)
+{
+ if ((pos & SA_COMPX_MASK) == 0) {
+ sa_entry = sa_ms_byte[pos >> SA_COMPX];
+ sa_entry = sa_entry << 32;
+ sa_entry = sa_entry + sa_ls_word[pos >> SA_COMPX];
+ // return sa_entry;
+ return 1;
+ }
+ else {
+ // int64_t offset = 0;
+ int64_t sp = pos;
+
+ int64_t occ_id_pp_ = sp >> CP_SHIFT;
+ int64_t y_pp_ = CP_BLOCK_SIZE - (sp & CP_MASK) - 1;
+ uint64_t *one_hot_bwt_str = cp_occ[occ_id_pp_].one_hot_bwt_str;
+ uint8_t b;
+
+ if((one_hot_bwt_str[0] >> y_pp_) & 1)
+ b = 0;
+ else if((one_hot_bwt_str[1] >> y_pp_) & 1)
+ b = 1;
+ else if((one_hot_bwt_str[2] >> y_pp_) & 1)
+ b = 2;
+ else if((one_hot_bwt_str[3] >> y_pp_) & 1)
+ b = 3;
+ else
+ b = 4;
+ if (b == 4) {
+ sa_entry = 0;
+ return 1;
+ }
+
+ GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, one_hot_bwt_str_c_sp, match_mask_sp);
+
+ sp = count[b] + occ_sp;
+
+ offset ++;
+ if ((sp & SA_COMPX_MASK) == 0) {
+
+ sa_entry = sa_ms_byte[sp >> SA_COMPX];
+ sa_entry = sa_entry << 32;
+ sa_entry = sa_entry + sa_ls_word[sp >> SA_COMPX];
+
+ sa_entry += offset;
+ // return sa_entry;
+ return 1;
+ }
+ else {
+ sa_entry = sp;
+ return 0;
+ }
+ } // else
+}
+
+void FMI_search::get_sa_entries_prefetch(SMEM *smemArray, int64_t *coordArray,
+ int64_t *coordCountArray, int64_t count,
+ const int32_t max_occ, int tid, int64_t &id_)
+{
+
+ // uint32_t i;
+ int32_t totalCoordCount = 0;
+ int32_t mem_lim = 0, id = 0;
+
+ for(int i = 0; i < count; i++)
+ {
+ int32_t c = 0;
+ SMEM smem = smemArray[i];
+ mem_lim += smem.s;
+ }
+
+ int64_t *pos_ar = (int64_t *) _mm_malloc( mem_lim * sizeof(int64_t), 64);
+ int64_t *map_ar = (int64_t *) _mm_malloc( mem_lim * sizeof(int64_t), 64);
+
+ for(int i = 0; i < count; i++)
+ {
+ int32_t c = 0;
+ SMEM smem = smemArray[i];
+ int64_t hi = smem.k + smem.s;
+ int64_t step = (smem.s > max_occ) ? smem.s / max_occ : 1;
+ int64_t j;
+ for(j = smem.k; (j < hi) && (c < max_occ); j+=step, c++)
+ {
+ int64_t pos = j;
+ pos_ar[id] = pos;
+ map_ar[id++] = totalCoordCount + c;
+ // int64_t sa_entry = get_sa_entry_compressed(pos, tid);
+ // coordArray[totalCoordCount + c] = sa_entry;
+ }
+ //coordCountArray[i] = c;
+ *coordCountArray += c;
+ totalCoordCount += c;
+ }
+
+ id_ += id;
+
+ const int32_t sa_batch_size = 20;
+ int64_t working_set[sa_batch_size], map_pos[sa_batch_size];;
+ int64_t offset[sa_batch_size] = {-1};
+
+ int i = 0, j = 0;
+ while(i<id && j<sa_batch_size)
+ {
+ int64_t pos = pos_ar[i];
+ working_set[j] = pos;
+ map_pos[j] = map_ar[i];
+ offset[j] = 0;
+
+ if (pos & SA_COMPX_MASK == 0) {
+ _mm_prefetch(&sa_ms_byte[pos >> SA_COMPX], _MM_HINT_T0);
+ _mm_prefetch(&sa_ls_word[pos >> SA_COMPX], _MM_HINT_T0);
+ }
+ else {
+ int64_t occ_id_pp_ = pos >> CP_SHIFT;
+ _mm_prefetch(&cp_occ[occ_id_pp_], _MM_HINT_T0);
+ }
+ i++;
+ j++;
+ }
+
+ int lim = j, all_quit = 0;
+ while (all_quit < id)
+ {
+
+ for (int k=0; k<lim; k++)
+ {
+ int64_t sp = 0, pos = 0;
+ bool quit;
+ if (offset[k] >= 0) {
+ quit = call_one_step(working_set[k], sp, offset[k]);
+ }
+ else
+ continue;
+
+ if (quit) {
+ coordArray[map_pos[k]] = sp;
+ all_quit ++;
+
+ if (i < id)
+ {
+ pos = pos_ar[i];
+ working_set[k] = pos;
+ map_pos[k] = map_ar[i++];
+ offset[k] = 0;
+
+ if (pos & SA_COMPX_MASK == 0) {
+ _mm_prefetch(&sa_ms_byte[pos >> SA_COMPX], _MM_HINT_T0);
+ _mm_prefetch(&sa_ls_word[pos >> SA_COMPX], _MM_HINT_T0);
+ }
+ else {
+ int64_t occ_id_pp_ = pos >> CP_SHIFT;
+ _mm_prefetch(&cp_occ[occ_id_pp_], _MM_HINT_T0);
+ }
+ }
+ else
+ offset[k] = -1;
+ }
+ else {
+ working_set[k] = sp;
+ if (sp & SA_COMPX_MASK == 0) {
+ _mm_prefetch(&sa_ms_byte[sp >> SA_COMPX], _MM_HINT_T0);
+ _mm_prefetch(&sa_ls_word[sp >> SA_COMPX], _MM_HINT_T0);
+ }
+ else {
+ int64_t occ_id_pp_ = sp >> CP_SHIFT;
+ _mm_prefetch(&cp_occ[occ_id_pp_], _MM_HINT_T0);
+ }
+ }
+ }
+ }
+
+ _mm_free(pos_ar);
+ _mm_free(map_ar);
+}
=====================================
src/FMI_search.h
=====================================
@@ -46,38 +46,16 @@ Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at i
#define assert_not_null(x, size, cur_alloc) \
if (x == NULL) { fprintf(stderr, "Allocation of %0.2lf GB for " #x " failed.\nCurrent Allocation = %0.2lf GB\n", size * 1.0 /(1024*1024*1024), cur_alloc * 1.0 /(1024*1024*1024)); exit(EXIT_FAILURE); }
-#define CP_BLOCK_SIZE_SCALAR 64
-#define CP_FILENAME_SUFFIX_SCALAR ".bwt.2bit.64"
-#define CP_MASK_SCALAR 63
-#define CP_SHIFT_SCALAR 6
-#define BIT_DATA_TYPE uint64_t
-#define PADDING_SCALAR 8
-
-#define CP_BLOCK_SIZE_AVX 32
-#define CP_FILENAME_SUFFIX_AVX ".bwt.8bit.32"
-#define CP_MASK_AVX 31
-#define CP_SHIFT_AVX 5
+#define CP_BLOCK_SIZE 64
+#define CP_FILENAME_SUFFIX ".bwt.2bit.64"
+#define CP_MASK 63
+#define CP_SHIFT 6
typedef struct checkpoint_occ_scalar
{
- BIT_DATA_TYPE bwt_str_bit0;
- BIT_DATA_TYPE bwt_str_bit1;
- BIT_DATA_TYPE dollar_mask;
int64_t cp_count[4];
- uint8_t pad[PADDING_SCALAR];
-}CP_OCC_SCALAR;
-
-typedef struct checkpoint_occ_avx
-{
- uint8_t bwt_str[CP_BLOCK_SIZE_AVX];
- int64_t cp_count[4];
-}CP_OCC_AVX;
-
-#if ((!__AVX2__))
-
-typedef CP_OCC_SCALAR CP_OCC;
-#define CP_SHIFT CP_SHIFT_SCALAR
-#define CP_FILENAME_SUFFIX CP_FILENAME_SUFFIX_SCALAR
+ uint64_t one_hot_bwt_str[4];
+}CP_OCC;
#if defined(__clang__) || defined(__GNUC__)
static inline int _mm_countbits_64(unsigned long x) {
@@ -86,46 +64,13 @@ static inline int _mm_countbits_64(unsigned long x) {
#endif
#define \
-GET_OCC(pp, c, occ_id_pp, y_pp, occ_pp, bwt_str_bit0_pp, bwt_str_bit1_pp, bit0_cmp_pp, bit1_cmp_pp, mismatch_mask_pp) \
- int64_t occ_id_pp = pp >> CP_SHIFT_SCALAR; \
- int64_t y_pp = pp & CP_MASK_SCALAR; \
+GET_OCC(pp, c, occ_id_pp, y_pp, occ_pp, one_hot_bwt_str_c_pp, match_mask_pp) \
+ int64_t occ_id_pp = pp >> CP_SHIFT; \
+ int64_t y_pp = pp & CP_MASK; \
int64_t occ_pp = cp_occ[occ_id_pp].cp_count[c]; \
- if(y_pp > 0) \
- { \
- BIT_DATA_TYPE bwt_str_bit0_pp = cp_occ[occ_id_pp].bwt_str_bit0; \
- BIT_DATA_TYPE bwt_str_bit1_pp = cp_occ[occ_id_pp].bwt_str_bit1; \
- BIT_DATA_TYPE bit0_cmp_pp = bwt_str_bit0_pp ^ base_mask[c][0]; \
- BIT_DATA_TYPE bit1_cmp_pp = bwt_str_bit1_pp ^ base_mask[c][1]; \
- uint64_t mismatch_mask_pp = bit0_cmp_pp | bit1_cmp_pp | cp_occ[occ_id_pp].dollar_mask; \
- mismatch_mask_pp = mismatch_mask_pp >> (CP_BLOCK_SIZE_SCALAR - y_pp); \
- occ_pp += y_pp - _mm_countbits_64(mismatch_mask_pp); \
- }
-
-#else
-
-typedef CP_OCC_AVX CP_OCC;
-#define CP_SHIFT CP_SHIFT_AVX
-#define CP_FILENAME_SUFFIX CP_FILENAME_SUFFIX_AVX
-
-#if defined(__clang__) || defined(__GNUC__)
-static inline int _mm_countbits_32(unsigned x) {
- return __builtin_popcount(x);
-}
-#endif
-
-#define \
-GET_OCC(pp, c, c256, occ_id_pp, y_pp, occ_pp, bwt_str_pp, bwt_pp_vec, mask_pp_vec, mask_pp) \
- int64_t occ_id_pp = pp >> CP_SHIFT_AVX; \
- int64_t y_pp = pp & CP_MASK_AVX; \
- int64_t occ_pp = cp_occ[occ_id_pp].cp_count[c]; \
- uint8_t *bwt_str_pp = cp_occ[occ_id_pp].bwt_str; \
- __m256i bwt_pp_vec = _mm256_load_si256((const __m256i *)(bwt_str_pp)); \
- __m256i mask_pp_vec = _mm256_cmpeq_epi8(bwt_pp_vec, c256); \
- uint64_t mask_pp = _mm256_movemask_epi8(mask_pp_vec); \
- mask_pp = mask_pp << (32 - y_pp); \
- occ_pp += _mm_countbits_32(mask_pp);
-
-#endif
+ uint64_t one_hot_bwt_str_c_pp = cp_occ[occ_id_pp].one_hot_bwt_str[c]; \
+ uint64_t match_mask_pp = one_hot_bwt_str_c_pp & one_hot_mask_array[y_pp]; \
+ occ_pp += _mm_countbits_64(match_mask_pp);
typedef struct smem_struct
{
@@ -207,7 +152,18 @@ class FMI_search: public indexEle
int32_t *coordCountArray,
uint32_t count,
int32_t max_occ);
-
+ int64_t get_sa_entry_compressed(int64_t pos, int tid=0);
+ void get_sa_entries(SMEM *smemArray,
+ int64_t *coordArray,
+ int32_t *coordCountArray,
+ uint32_t count,
+ int32_t max_occ,
+ int tid);
+ int64_t call_one_step(int64_t pos, int64_t &sa_entry, int64_t &offset);
+ void get_sa_entries_prefetch(SMEM *smemArray, int64_t *coordArray,
+ int64_t *coordCountArray, int64_t count,
+ const int32_t max_occ, int tid, int64_t &id_);
+
int64_t reference_seq_len;
int64_t sentinel_index;
private:
@@ -218,21 +174,12 @@ private:
int8_t *sa_ms_byte;
CP_OCC *cp_occ;
-#if ((!__AVX2__))
- BIT_DATA_TYPE base_mask[4][2];
-#else
- uint8_t *c_bcast_array;
-#endif
+ uint64_t *one_hot_mask_array;
int64_t pac_seq_len(const char *fn_pac);
void pac2nt(const char *fn_pac,
std::string &reference_seq);
- int build_fm_index_avx(const char *ref_file_name,
- char *binary_seq,
- int64_t ref_seq_len,
- int64_t *sa_bwt,
- int64_t *count);
- int build_fm_index_scalar(const char *ref_file_name,
+ int build_fm_index(const char *ref_file_name,
char *binary_seq,
int64_t ref_seq_len,
int64_t *sa_bwt,
=====================================
src/bandedSWA.cpp
=====================================
The diff for this file was not included because it is too large.
=====================================
src/bwamem.cpp
=====================================
@@ -30,13 +30,7 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
#include "bwamem.h"
#include "FMI_search.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "safe_mem_lib.h"
-#ifdef __cplusplus
-}
-#endif
+#include "memcpy_bwamem.h"
//----------------
extern uint64_t tprof[LIM_R][LIM_C];
@@ -164,6 +158,17 @@ KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
+#if MATE_SORT
+void sort_alnreg_re(int n, mem_alnreg_t* a) {
+ ks_introsort(mem_ars2, n, a);
+}
+
+void sort_alnreg_score(int n, mem_alnreg_t* a) {
+ ks_introsort(mem_ars, n, a);
+}
+
+#endif
+
#define PATCH_MAX_R_BW 0.05f
#define PATCH_MIN_SC_RATIO 0.90f
@@ -230,6 +235,60 @@ int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
#define MEM_SEEDSW_COEF 0.05f
int stat;
+#if MATE_SORT
+int mem_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+ const uint8_t *pac, uint8_t *query, int n,
+ mem_alnreg_t *a)
+{
+ int m, i, j;
+ if (n <= 1) return n;
+ for (i = 0; i < n; ++i) a[i].n_comp = 1;
+
+ for (i = 1; i < n; ++i)
+ {
+ mem_alnreg_t *p = &a[i];
+ if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap)
+ continue; // then no need to go into the loop below
+
+ for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) {
+ mem_alnreg_t *q = &a[j];
+ int64_t or_, oq, mr, mq;
+ int score, w;
+ if (q->qe == q->qb) continue; // a[j] has been excluded
+ or_ = q->re - p->rb; // overlap length on the reference
+ oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
+ mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
+ mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
+ if (or_ > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
+ if (p->score < q->score)
+ {
+ p->qe = p->qb;
+ break;
+ }
+ else q->qe = q->qb;
+ }
+ else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p
+ p->n_comp += q->n_comp + 1;
+ p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov;
+ p->sub = p->sub > q->sub? p->sub : q->sub;
+ p->csub = p->csub > q->csub? p->csub : q->csub;
+ p->qb = q->qb, p->rb = q->rb;
+ p->truesc = p->score = score;
+ p->w = w;
+ q->qb = q->qe;
+ }
+ }
+ }
+ for (i = 0, m = 0; i < n; ++i) // exclude identical hits
+ if (a[i].qe > a[i].qb) {
+ if (m != i) a[m++] = a[i];
+ else ++m;
+ }
+ n = m;
+ return m;
+}
+#endif
+
int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
const uint8_t *pac, uint8_t *query, int n,
mem_alnreg_t *a)
@@ -323,7 +382,7 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c,
c->m <<= 1;
if (pm == SEEDS_PER_CHAIN) { // re-new memory
if ((auxSeedBuf = (mem_seed_t *) calloc(c->m, sizeof(mem_seed_t))) == NULL) { fprintf(stderr, "ERROR: out of memory auxSeedBuf\n"); exit(1); }
- memcpy_s((char*) (auxSeedBuf), c->m * sizeof(mem_seed_t), c->seeds, c->n * sizeof(mem_seed_t));
+ memcpy_bwamem((char*) (auxSeedBuf), c->m * sizeof(mem_seed_t), c->seeds, c->n * sizeof(mem_seed_t), __FILE__, __LINE__);
c->seeds = auxSeedBuf;
tprof[PE13][tid]++;
} else { // new memory
@@ -688,7 +747,8 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt,
int num[nseq];
memset(num, 0, nseq*sizeof(int));
- int64_t *sa_coord = (int64_t *) _mm_malloc(sizeof(int64_t) * opt->max_occ, 64);
+ int smem_buf_size = 6000;
+ int64_t *sa_coord = (int64_t *) _mm_malloc(sizeof(int64_t) * opt->max_occ * smem_buf_size, 64);
int64_t seedBufCount = 0;
for (int l=0; l<nseq; l++)
@@ -726,6 +786,24 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt,
} while (pos < num_smem - 1 && matchArray[pos].rid == matchArray[pos + 1].rid);
l_rep += e - b;
+ // bwt_sa
+ // assert(pos - smem_ptr + 1 < 6000);
+ if (pos - smem_ptr + 1 >= smem_buf_size)
+ {
+ int csize = smem_buf_size;
+ smem_buf_size *= 2;
+ sa_coord = (int64_t *) _mm_realloc(sa_coord, csize, opt->max_occ * smem_buf_size,
+ sizeof(int64_t));
+ assert(sa_coord != NULL);
+ }
+ int64_t id = 0, cnt_ = 0, mypos = 0;
+ #if SA_COMPRESSION
+ uint64_t tim = __rdtsc();
+ fmi->get_sa_entries_prefetch(&matchArray[smem_ptr], sa_coord, &cnt_,
+ pos - smem_ptr + 1, opt->max_occ, tid, id); // sa compressed prefetch
+ tprof[MEM_SA][tid] += __rdtsc() - tim;
+ #endif
+
for (i = smem_ptr; i <= pos; i++)
{
SMEM *p = &matchArray[i];
@@ -734,19 +812,26 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt,
int64_t k;
step = p->s > opt->max_occ? p->s / opt->max_occ : 1;
- // uint64_t tim = __rdtsc();
- int cnt = 0;
+ int cnt = 0;
+ #if !SA_COMPRESSION
+ uint64_t tim = __rdtsc();
fmi->get_sa_entries(p, sa_coord, &cnt, 1, opt->max_occ);
- cnt = 0;
- // tprof[MEM_SA][tid] += __rdtsc() - tim;
+ tprof[MEM_SA][tid] += __rdtsc() - tim;
+ #endif
+ cnt = 0;
for (k = count = 0; k < p->s && count < opt->max_occ; k += step, ++count)
{
mem_chain_t tmp, *lower, *upper;
mem_seed_t s;
int rid, to_add = 0;
-
+
+ #if SA_COMPRESSION
+ s.rbeg = tmp.pos = sa_coord[mypos++];
+ #else
s.rbeg = tmp.pos = sa_coord[cnt++];
+ #endif
+
s.qbeg = p->m;
s.score= s.len = slen;
if (s.rbeg < 0 || s.len < 0)
@@ -1202,7 +1287,7 @@ void mem_process_seqs(mem_opt_t *opt,
// PAIRED_END
if (opt->flag & MEM_F_PE) { // infer insert sizes if not provided
if (pes0)
- memcpy_s(pes, 4 * sizeof(mem_pestat_t), pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size
+ memcpy_bwamem(pes, 4 * sizeof(mem_pestat_t), pes0, 4 * sizeof(mem_pestat_t), __FILE__, __LINE__); // if pes0 != NULL, set the insert-size
// distribution as pes0
else {
fprintf(stderr, "[0000] Inferring insert size distribution of PE reads from data, "
@@ -1674,7 +1759,7 @@ void* _mm_realloc(void *ptr, int64_t csize, int64_t nsize, int16_t dsize) {
}
void *nptr = _mm_malloc(nsize * dsize, 64);
assert(nptr != NULL);
- memcpy_s(nptr, nsize * dsize, ptr, csize);
+ memcpy_bwamem(nptr, nsize * dsize, ptr, csize, __FILE__, __LINE__);
_mm_free(ptr);
return nptr;
=====================================
src/bwamem_pair.cpp
=====================================
@@ -154,7 +154,13 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns,
{
extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
-
+ #if MATE_SORT
+ extern int mem_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+ const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+ extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+ extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+ #endif
+
//int tid = omp_get_thread_num();
int64_t l_pac = bns->l_pac;
int i, r, skip[4], n = 0, rid = -1;
@@ -220,17 +226,56 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns,
b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
kv_push(mem_alnreg_t, *ma, b); // make room for a new element
+
+ #if !MATE_SORT
// move b s.t. ma is sorted
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
if (ma->a[i].score < b.score) break;
tmp = i;
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
ma->a[i] = b;
+
+ #else
+
+ int resort = 0;
+ for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+ if (ma->a[i].re == b.re) {
+ resort = 1;
+ break;
+ }
+ if (ma->a[i].re > b.re) {
+ break;
+ }
+ }
+ if (resort) {
+ // Don't know where to put this alignment. So let the scores decide
+ sort_alnreg_score(ma->n - 1, ma->a);
+ for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+ if (ma->a[i].score < b.score) {
+ break;
+ }
+ }
+ tmp = i;
+ for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+ ma->a[i] = b;
+ // Now we can sort based on end position
+ sort_alnreg_re(ma->n, ma->a);
+ }
+ else {
+ tmp = i;
+ for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+ ma->a[i] = b;
+ }
+ #endif
tprof[PE26][0] ++;
}
++n;
}
+ #if !MATE_SORT
if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+ #else
+ if (n) ma->n = mem_dedup_patch(opt, 0, 0, 0, ma->n, ma->a); // sam_improvements
+ #endif
if (rev) free(rev);
free(ref);
}
@@ -313,7 +358,14 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns,
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
-
+
+ #if MATE_SORT
+ extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+ extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+ extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+ const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+ #endif
+
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
kstring_t str;
mem_aln_t h[2], g[2], aa[2][2];
@@ -332,12 +384,32 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns,
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
- for (i = 0; i < 2; ++i)
+ #if MATE_SORT
+ for (i = 0; i < 2; ++i) {
+ sort_alnreg_re(a[!i].n, a[!i].a);
+ int val = 0, swcount = 0;
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
int val = mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
n += val;
+ swcount += val;
+ }
+ if (swcount > 0) {
+ mem_alnreg_v* ma = &a[!i];
+ ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+ }
+ else {
+ sort_alnreg_score(a[!i].n, a[!i].a);
}
+ }
+
+ #else
+ for (i = 0; i < 2; ++i)
+ for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
+ int val = mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
+ n += val;
+ }
+ #endif
free(b[0].a); free(b[1].a);
}
@@ -571,9 +643,14 @@ int mem_sam_pe_batch(const mem_opt_t *opt, mem_cache *mmc,
kswv *pwsw = new kswv(opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, -1*opt->b, nthreads,
maxRefLen, maxQerLen);
+ // Shift 16-bit
+ for (int i=0; i<pcnt-pcnt8; i++)
+ seqPairArray[pcnt + MAX_LINE_LEN - 1 - i] = seqPairArray[pcnt-i-1];
+
#if __AVX512BW__
pwsw->getScores8(seqPairArray, seqBufRef, seqBufQer, aln, pcnt8, nthreads, 0);
- pwsw->getScores16(seqPairArray + pcnt8, seqBufRef, seqBufQer, aln, pcnt-pcnt8, nthreads, 0);
+ pwsw->getScores16(seqPairArray + pcnt8 + MAX_LINE_LEN, seqBufRef, seqBufQer,
+ aln, pcnt-pcnt8, nthreads, 0);
#else
fprintf(stderr, "Error: This should not have happened!! \nPlease look in to AVX512 macros\n");
exit(EXIT_FAILURE);
@@ -581,9 +658,11 @@ int mem_sam_pe_batch(const mem_opt_t *opt, mem_cache *mmc,
// Post-processing
int pos = 0, pos8 = 0, pos16 = 0;
- for (int i=0; i<pcnt; i++) {
- kswr_t r = aln[i];
+ for (int i=0; i<pcnt8; i++)
+ {
SeqPair sp = seqPairArray[i];
+ int ind = sp.regid;
+ kswr_t r = aln[ind];
int xtra = sp.h0;
if ((xtra & KSW_XSTART) == 0 || ((xtra & KSW_XSUBO) && r.score < (xtra & 0xffff))) continue;
@@ -592,19 +671,34 @@ int mem_sam_pe_batch(const mem_opt_t *opt, mem_cache *mmc,
uint8_t *qs = seqBufQer + sp.idq;
uint8_t *rs = seqBufRef + sp.idr;
revseq(r.qe + 1, qs); revseq(r.te + 1, rs);
- seqPairArray[pos++] = sp;
+ seqPairArray[pos++] = sp;
+ pos8 ++;
+ }
+
+ int id = pcnt8 + MAX_LINE_LEN;
+ for (int i=0; i<pcnt-pcnt8; i++)
+ {
+ SeqPair sp = seqPairArray[i + id];
+ int ind = sp.regid;
+ kswr_t r = aln[ind];
+ int xtra = sp.h0;
+ if ((xtra & KSW_XSTART) == 0 || ((xtra & KSW_XSUBO) && r.score < (xtra & 0xffff))) continue;
- if (i < pcnt8) pos8 ++;
- else pos16 ++;
+ sp.h0 = KSW_XSTOP | r.score;
+ sp.len2 = r.qe + 1;
+ uint8_t *qs = seqBufQer + sp.idq;
+ uint8_t *rs = seqBufRef + sp.idr;
+ revseq(r.qe + 1, qs); revseq(r.te + 1, rs);
+ seqPairArray[pos++] = sp;
+ pos16 ++;
}
int pcnt2 = pos;
assert(pos8 + pos16 == pcnt2);
#if __AVX512BW__
- pwsw->getScores8(seqPairArray, seqBufRef, seqBufQer, aln, pos8, nthreads, 1);
pwsw->getScores16(seqPairArray + pos8, seqBufRef, seqBufQer, aln, pos16, nthreads, 1);
- // pwsw->kswvScalarWrapper(seqPairArray, seqBufRef, seqBufQer, aln, pos, nthreads, 0); //debug
+ pwsw->getScores8(seqPairArray, seqBufRef, seqBufQer, aln, pos8, nthreads, 1);
#else
fprintf(stderr, "Error: This should not have happened!! \nPlease look in to AVX512 macros\n");
exit(EXIT_FAILURE);
@@ -628,7 +722,13 @@ int mem_sam_pe_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
const mem_alnreg_v *a, int l_query, const char *query);
-
+ #if MATE_SORT
+ extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+ extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+ extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+ const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+ #endif
+
int32_t *gar = (int32_t*) mmc->seqPairArrayAux[tid];
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
@@ -653,7 +753,29 @@ int mem_sam_pe_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
a[0].a[l].flg = 0;
for (int l=0; l<a[1].n; l++)
a[1].a[l].flg = 0;
-
+
+ #if MATE_SORT
+ for (i = 0; i < 2; ++i) {
+ sort_alnreg_re(a[!i].n, a[!i].a);
+ int val = 0, swcount = 0;
+ for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
+ val = mem_matesw_batch_post(opt, bns, pac, pes, &b[i].a[j],
+ s[!i].l_seq, (uint8_t*)s[!i].seq,
+ &a[!i], myaln, gcnt, gar, mmc);
+ n += val;
+ swcount += val;
+ // ncnt++;
+ gcnt += 4;
+ }
+ if (swcount > 0) {
+ mem_alnreg_v* ma = &a[!i];
+ ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+ }
+ else {
+ sort_alnreg_score(a[!i].n, a[!i].a);
+ }
+ }
+ #else
for (i = 0; i < 2; ++i) {
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
int val = mem_matesw_batch_post(opt, bns, pac, pes, &b[i].a[j],
@@ -664,7 +786,7 @@ int mem_sam_pe_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
gcnt += 4;
}
}
-
+ #endif
free(b[0].a); free(b[1].a);
}
@@ -978,9 +1100,15 @@ int mem_matesw_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
{
extern int mem_sort_dedup_patch_rev(const mem_opt_t *opt, const bntseq_t *bns,
const uint8_t *pac, uint8_t *query, int n,
- mem_alnreg_t *a);
+ mem_alnreg_t *a);
extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+ #if MATE_SORT
+ extern int mem_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+ const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+ extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+ extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+ #endif
int64_t l_pac = bns->l_pac;
int i, r, skip[4], n = 0, rid = -1;
@@ -1062,6 +1190,8 @@ int mem_matesw_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
kv_push(mem_alnreg_t, *ma, b); // make room for a new element
+ #if !MATE_SORT
+
// move b s.t. ma is sorted
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
if (ma->a[i].score < b.score) break;
@@ -1069,11 +1199,48 @@ int mem_matesw_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
ma->a[i] = b;
+ #else
+ int resort = 0;
+ // move b s.t. ma is sorted
+ for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+ if (ma->a[i].re == b.re) {
+ resort = 1;
+ break;
+ }
+ if (ma->a[i].re > b.re) {
+ break;
+ }
+ }
+ if (resort) {
+ // Don't know where to put this alignment. So let the scores decide
+ sort_alnreg_score(ma->n - 1, ma->a);
+ for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+ if (ma->a[i].score < b.score) {
+ break;
+ }
+ }
+ tmp = i;
+ for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+ ma->a[i] = b;
+ // Now we can sort based on end position
+ sort_alnreg_re(ma->n, ma->a);
+ }
+ else {
+ tmp = i;
+ for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+ ma->a[i] = b;
+ }
+ #endif
tprof[PE26][0] ++;
}
++n;
}
- if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+ #if !MATE_SORT
+ if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+ #else
+ if (n) ma->n = mem_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+ #endif
+
if (rev) free(rev);
free(ref);
}
=====================================
src/kopen.cpp
=====================================
@@ -48,10 +48,10 @@
# include "malloc_wrap.h"
#endif
+#include "memcpy_bwamem.h"
#ifdef __cplusplus
extern "C" {
#endif
-#include "safe_mem_lib.h"
#include "safe_str_lib.h"
#ifdef __cplusplus
}
@@ -254,7 +254,7 @@ static int ftp_open(const char *fn)
if (*p != '(') goto ftp_open_end;
++p;
sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
- memcpy_s(pasv_ip, 4 * sizeof(int), v, 4 * sizeof(int));
+ memcpy_bwamem(pasv_ip, 4 * sizeof(int), v, 4 * sizeof(int), __FILE__, __LINE__);
pasv_port = (v[4]<<8&0xff00) + v[5];
kftp_send_cmd(&aux, retr, 0);
sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
=====================================
src/kseq.h
=====================================
@@ -35,13 +35,7 @@
#include <string.h>
#include <stdlib.h>
#include <assert.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "safe_mem_lib.h"
-#ifdef __cplusplus
-}
-#endif
+#include "memcpy_bwamem.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
@@ -138,7 +132,7 @@ typedef struct __kstring_t {
str->s = (char*)realloc(str->s, str->m); \
} \
gotany = 1; \
- memcpy_s(str->s + str->l, str->m - str->l, ks->buf + ks->begin, i - ks->begin); \
+ memcpy_bwamem(str->s + str->l, str->m - str->l, ks->buf + ks->begin, i - ks->begin, __FILE__, __LINE__); \
str->l = str->l + (i - ks->begin); \
ks->begin = i + 1; \
if (i < ks->end) { \
=====================================
src/kstring.h
=====================================
@@ -32,13 +32,7 @@
#include <stdlib.h>
#include <string.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "safe_mem_lib.h"
-#ifdef __cplusplus
-}
-#endif
+#include "memcpy_bwamem.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
@@ -72,7 +66,7 @@ static inline int kputsn(const char *p, int l, kstring_t *s)
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
}
- memcpy_s(s->s + s->l, s->m - s->l, p, l);
+ memcpy_bwamem(s->s + s->l, s->m - s->l, p, l, __FILE__, __LINE__);
s->l += l;
s->s[s->l] = 0;
return l;
=====================================
src/kswv.cpp
=====================================
@@ -565,7 +565,7 @@ int kswv::kswv512_u8(uint8_t seq1SoA[],
_mm512_store_si512((__m512i *) qe, qe512);
int live = 0;
- for (int l=0; l<SIMD_WIDTH8; l++) {
+ for (int l=0; l<SIMD_WIDTH8 && (po_ind + l) < numPairs; l++) {
int ind = po_ind + l;
int16_t *te;
if (i < SIMD_WIDTH16) te = te1;
@@ -679,16 +679,17 @@ int kswv::kswv512_u8(uint8_t seq1SoA[],
int16_t temp4[SIMD_WIDTH8] __attribute((aligned(64)));
_mm512_store_si512((__m512i *) temp, max512);
_mm512_store_si512((__m512i *) temp4, te512);
- _mm512_store_si512((__m512i *) (temp4 + SIMD_WIDTH16), te512_);
- for (int i=0; i<SIMD_WIDTH8; i++)
+ _mm512_store_si512((__m512i *) (temp4 + SIMD_WIDTH16), te512_);
+
+ for (int i=0; i<SIMD_WIDTH8 && (po_ind + i) < numPairs; i++)
{
int ind = po_ind + i;
int16_t *te2;
- if (i < SIMD_WIDTH16) te2 = temp4;
- else te2 = temp4;
+ // if (i < SIMD_WIDTH16) te2 = temp4;
+ // else te2 = temp4;
+ te2 = temp4;
#if !MAINY
ind = p[i].regid; // index of corr. aln
- assert(ind == po_ind + i);
#endif
if (qe[i]) {
aln[ind].score2 = (temp[i] == 0? (int)-1: (uint8_t) temp[i]);
@@ -1122,11 +1123,10 @@ int kswv::kswv512_16(int16_t seq1SoA[],
_mm512_store_si512((__m512i *) te, te512);
_mm512_store_si512((__m512i *) qe, qe512);
- for (int l=0; l<SIMD_WIDTH16; l++) {
+ for (int l=0; l<SIMD_WIDTH16 && (po_ind + l) < numPairs; l++) {
int ind = po_ind + l;
#if !MAINY
ind = p[l].regid; // index of corr. aln
- // assert(ind == po_ind + l);
if (phase) {
if (aln[ind].score == score[l]) {
aln[ind].tb = aln[ind].te - te[l];
@@ -1199,12 +1199,11 @@ int kswv::kswv512_16(int16_t seq1SoA[],
// _mm512_store_si512((__m512i *) temp, max512_);
_mm512_store_si512((__m512i *) temp1, max512);
_mm512_store_si512((__m512i *) temp2, te512);
-
- for (int i=0; i<SIMD_WIDTH16; i++) {
+
+ for (int i=0; i<SIMD_WIDTH16 && (po_ind + i) < numPairs; i++) {
int ind = po_ind + i;
#if !MAINY
ind = p[i].regid; // index of corr. aln
- assert(ind == po_ind + i);
#endif
aln[ind].score2 = temp1[i];
aln[ind].te2 = temp2[i];
=====================================
src/kthread.cpp
=====================================
@@ -65,13 +65,14 @@ static void *ktf_worker(void *data)
int st = i * BATCH_SIZE;
if (st >= w->t->n) break;
int ed = (i + 1) * BATCH_SIZE < w->t->n? (i + 1) * BATCH_SIZE : w->t->n;
- w->t->func(w->t->data, st, ed-st, tid);
+ // w->t->func(w->t->data, st, ed-st, tid);
+ w->t->func(w->t->data, st, ed-st, w - w->t->w);
}
while ((i = steal_work(w->t)) >= 0) {
int st = i * BATCH_SIZE;
int ed = (i + 1) * BATCH_SIZE < w->t->n? (i + 1) * BATCH_SIZE : w->t->n;
- w->t->func(w->t->data, st, ed-st, tid);
+ w->t->func(w->t->data, st, ed-st, w - w->t->w);
}
pthread_exit(0);
}
=====================================
src/macro.h
=====================================
@@ -58,11 +58,12 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
#define ALIGN_OFF 1
#define MAX_THREADS 256
-
-
#define LIM_R 128
#define LIM_C 128
+#define SA_COMPRESSION 1
+#define SA_COMPX 03 // (= power of 2)
+#define SA_COMPX_MASK 0x7 // 0x7 or 0x3 or 0x1
/*** Runtime profiling macros ***/
#define INDEX 0
=====================================
src/main.cpp
=====================================
@@ -1,122 +1,126 @@
-/*************************************************************************************
- The MIT License
-
- BWA-MEM2 (Sequence alignment using Burrows-Wheeler Transform),
- Copyright (C) 2019 Intel Corporation, Heng Li.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
-Contacts: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at intel.com>;
- Heng Li <hli at jimmy.harvard.edu>
-*****************************************************************************************/
-
-// ----------------------------------
-#include "main.h"
-
-#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "2.0pre2"
-#endif
-
-
-// ----------------------------------
-uint64_t proc_freq, tprof[LIM_R][LIM_C], prof[LIM_R];
-// ----------------------------------
-
-int usage()
-{
- fprintf(stderr, "Usage: bwa-mem2 <command> <arguments>\n");
- fprintf(stderr, "Commands:\n");
- fprintf(stderr, " index create index\n");
- fprintf(stderr, " mem alignment\n");
- fprintf(stderr, " version print version number\n");
- return 1;
-}
-
-int main(int argc, char* argv[])
-{
-
- // ---------------------------------
- uint64_t tim = __rdtsc();
- sleep(1);
- proc_freq = __rdtsc() - tim;
-
- int ret = -1;
- if (argc < 2) return usage();
-
- if (strcmp(argv[1], "index") == 0)
- {
- uint64_t tim = __rdtsc();
- ret = bwa_index(argc-1, argv+1);
- fprintf(stderr, "Total time taken: %0.4lf\n", (__rdtsc() - tim)*1.0/proc_freq);
- return ret;
- }
- else if (strcmp(argv[1], "mem") == 0)
- {
- tprof[MEM][0] = __rdtsc();
- kstring_t pg = {0,0,0};
- extern char *bwa_pg;
-
- fprintf(stderr, "-----------------------------\n");
-#if __AVX512BW__
- fprintf(stderr, "Executing in AVX512 mode!!\n");
-#endif
-#if ((!__AVX512BW__) & (__AVX2__))
- fprintf(stderr, "Executing in AVX2 mode!!\n");
-#endif
-#if ((!__AVX512BW__) && (!__AVX2__) && (__SSE2__))
- fprintf(stderr, "Executing in SSE4.1 mode!!\n");
-#endif
-#if ((!__AVX512BW__) && (!__AVX2__) && (!__SSE2__))
- fprintf(stderr, "Executing in Scalar mode!!\n");
-#endif
- fprintf(stderr, "-----------------------------\n");
-
- ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
- for (int i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
- ksprintf(&pg, "\n");
- bwa_pg = pg.s;
- ret = main_mem(argc-1, argv+1);
- free(bwa_pg);
-
- /** Enable this return to avoid printing of the runtime profiling **/
- //return ret;
- }
- else if (strcmp(argv[1], "version") == 0)
- {
- puts(PACKAGE_VERSION);
- return 0;
- } else {
- fprintf(stderr, "ERROR: unknown command '%s'\n", argv[1]);
- return 1;
- }
-
- fprintf(stderr, "\nImportant parameter settings: \n");
- fprintf(stderr, "\tBATCH_SIZE: %d\n", BATCH_SIZE);
- fprintf(stderr, "\tMAX_SEQ_LEN_REF: %d\n", MAX_SEQ_LEN_REF);
- fprintf(stderr, "\tMAX_SEQ_LEN_QER: %d\n", MAX_SEQ_LEN_QER);
- fprintf(stderr, "\tMAX_SEQ_LEN8: %d\n", MAX_SEQ_LEN8);
- fprintf(stderr, "\tSEEDS_PER_READ: %d\n", SEEDS_PER_READ);
- fprintf(stderr, "\tSIMD_WIDTH8 X: %d\n", SIMD_WIDTH8);
- fprintf(stderr, "\tSIMD_WIDTH16 X: %d\n", SIMD_WIDTH16);
- fprintf(stderr, "\tAVG_SEEDS_PER_READ: %d\n", AVG_SEEDS_PER_READ);
-
- return 0;
-}
+/*************************************************************************************
+ The MIT License
+
+ BWA-MEM2 (Sequence alignment using Burrows-Wheeler Transform),
+ Copyright (C) 2019 Intel Corporation, Heng Li.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+Contacts: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at intel.com>;
+ Heng Li <hli at jimmy.harvard.edu>
+*****************************************************************************************/
+
+// ----------------------------------
+#include "main.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "2.2"
+#endif
+
+
+// ----------------------------------
+uint64_t proc_freq, tprof[LIM_R][LIM_C], prof[LIM_R];
+// ----------------------------------
+
+int usage()
+{
+ fprintf(stderr, "Usage: bwa-mem2 <command> <arguments>\n");
+ fprintf(stderr, "Commands:\n");
+ fprintf(stderr, " index create index\n");
+ fprintf(stderr, " mem alignment\n");
+ fprintf(stderr, " version print version number\n");
+ return 1;
+}
+
+int main(int argc, char* argv[])
+{
+
+ // ---------------------------------
+ uint64_t tim = __rdtsc();
+ sleep(1);
+ proc_freq = __rdtsc() - tim;
+
+ int ret = -1;
+ if (argc < 2) return usage();
+
+ if (strcmp(argv[1], "index") == 0)
+ {
+ uint64_t tim = __rdtsc();
+ ret = bwa_index(argc-1, argv+1);
+ fprintf(stderr, "Total time taken: %0.4lf\n", (__rdtsc() - tim)*1.0/proc_freq);
+ return ret;
+ }
+ else if (strcmp(argv[1], "mem") == 0)
+ {
+ tprof[MEM][0] = __rdtsc();
+ kstring_t pg = {0,0,0};
+ extern char *bwa_pg;
+
+ fprintf(stderr, "-----------------------------\n");
+#if __AVX512BW__
+ fprintf(stderr, "Executing in AVX512 mode!!\n");
+#elif __AVX2__
+ fprintf(stderr, "Executing in AVX2 mode!!\n");
+#elif __AVX__
+ fprintf(stderr, "Executing in AVX mode!!\n");
+#elif __SSE4_2__
+ fprintf(stderr, "Executing in SSE4.2 mode!!\n");
+#elif __SSE4_1__
+ fprintf(stderr, "Executing in SSE4.1 mode!!\n");
+#endif
+ fprintf(stderr, "-----------------------------\n");
+
+ #if SA_COMPRESSION
+ fprintf(stderr, "* SA compression enabled with xfactor: %d\n", 0x1 << SA_COMPX);
+ #endif
+
+ ksprintf(&pg, "@PG\tID:bwa-mem2\tPN:bwa-mem2\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
+
+ for (int i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
+ ksprintf(&pg, "\n");
+ bwa_pg = pg.s;
+ ret = main_mem(argc-1, argv+1);
+ free(bwa_pg);
+
+ /** Enable this return to avoid printing of the runtime profiling **/
+ //return ret;
+ }
+ else if (strcmp(argv[1], "version") == 0)
+ {
+ puts(PACKAGE_VERSION);
+ return 0;
+ } else {
+ fprintf(stderr, "ERROR: unknown command '%s'\n", argv[1]);
+ return 1;
+ }
+
+ fprintf(stderr, "\nImportant parameter settings: \n");
+ fprintf(stderr, "\tBATCH_SIZE: %d\n", BATCH_SIZE);
+ fprintf(stderr, "\tMAX_SEQ_LEN_REF: %d\n", MAX_SEQ_LEN_REF);
+ fprintf(stderr, "\tMAX_SEQ_LEN_QER: %d\n", MAX_SEQ_LEN_QER);
+ fprintf(stderr, "\tMAX_SEQ_LEN8: %d\n", MAX_SEQ_LEN8);
+ fprintf(stderr, "\tSEEDS_PER_READ: %d\n", SEEDS_PER_READ);
+ fprintf(stderr, "\tSIMD_WIDTH8 X: %d\n", SIMD_WIDTH8);
+ fprintf(stderr, "\tSIMD_WIDTH16 X: %d\n", SIMD_WIDTH16);
+ fprintf(stderr, "\tAVG_SEEDS_PER_READ: %d\n", AVG_SEEDS_PER_READ);
+
+ return 0;
+}
=====================================
src/memcpy_bwamem.cpp
=====================================
@@ -0,0 +1,49 @@
+/*************************************************************************************
+ The MIT License
+
+ BWA-MEM2 (Sequence alignment using Burrows-Wheeler Transform),
+ Copyright (C) 2019 Intel Corporation, Heng Li.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at intel.com>;
+*****************************************************************************************/
+
+#include "memcpy_bwamem.h"
+
+errno_t memcpy_bwamem(void *dest, rsize_t dmax, const void *src, rsize_t smax, char *file_name, int line_num)
+{
+ errno_t ret;
+ int64_t bytes_copied;
+ for(bytes_copied = 0; bytes_copied < smax; bytes_copied += RSIZE_MAX_MEM)
+ {
+ int64_t bytes_remaining = smax - bytes_copied;
+ int64_t bytes_to_copy = (bytes_remaining > RSIZE_MAX_MEM) ? RSIZE_MAX_MEM : bytes_remaining;
+ int64_t dest_bytes_remaining = dmax - bytes_copied;
+ int64_t dest_bytes = (dest_bytes_remaining < bytes_to_copy) ? dest_bytes_remaining : bytes_to_copy;
+ if((ret = memcpy_s((char *)dest + bytes_copied, dest_bytes, (const char *)src + bytes_copied, bytes_to_copy)) != 0)
+ {
+ fprintf(stderr, "[%s: %d] memcpy_s returned %d\n", file_name, line_num, ret);
+ exit(EXIT_FAILURE);
+ }
+ }
+ return 0;
+}
=====================================
src/memcpy_bwamem.h
=====================================
@@ -0,0 +1,45 @@
+/*************************************************************************************
+ The MIT License
+
+ BWA-MEM2 (Sequence alignment using Burrows-Wheeler Transform),
+ Copyright (C) 2019 Intel Corporation, Heng Li.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at intel.com>;
+*****************************************************************************************/
+
+
+#ifndef MEMCPY_BWA_H
+#define MEMCPY_BWA_H
+
+#include <stdlib.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "safe_mem_lib.h"
+#ifdef __cplusplus
+}
+#endif
+
+errno_t memcpy_bwamem(void *dest, rsize_t dmax, const void *src, rsize_t smax, char *file_name, int line_num);
+
+#endif
=====================================
src/profiling.cpp
=====================================
@@ -31,8 +31,7 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
#include "macro.h"
#include <stdint.h>
#include <assert.h>
-
-extern uint64_t proc_freq, tprof[LIM_R][LIM_C];
+#include "profiling.h"
int find_opt(uint64_t *a, int len, uint64_t *max, uint64_t *min, double *avg)
{
@@ -57,7 +56,7 @@ int display_stats(int nthreads)
uint64_t max, min;
double avg;
fprintf(stderr, "No. of OMP threads: %d\n", nthreads);
- fprintf(stderr, "Processor is runnig @%lf MHz\n", proc_freq*1.0/1e6);
+ fprintf(stderr, "Processor is running @%lf MHz\n", proc_freq*1.0/1e6);
fprintf(stderr, "Runtime profile:\n");
fprintf(stderr, "\n\tTime taken for main_mem function: %0.2lf sec\n\n",
@@ -140,17 +139,18 @@ int display_stats(int nthreads)
fprintf(stderr, "\t\tSAL compute avg: %0.2lf, (%0.2lf, %0.2lf)\n",
avg*1.0/proc_freq, max*1.0/proc_freq, min*1.0/proc_freq);
-#if HIDE
+ #if 1 //HIDE
find_opt(tprof[MEM_SA], nthreads, &max, &min, &avg);
fprintf(stderr, "\t\t\t\tMEM_SA avg: %0.2lf, (%0.2lf, %0.2lf)\n\n",
avg*1.0/proc_freq, max*1.0/proc_freq, min*1.0/proc_freq);
-#endif
+ #endif
// printf("\n\t BSW compute time (sec):\n");
find_opt(tprof[MEM_ALN2], nthreads, &max, &min, &avg);
fprintf(stderr, "\t\tBSW time, avg: %0.2lf, (%0.2lf, %0.2lf)\n",
avg*1.0/proc_freq, max*1.0/proc_freq, min*1.0/proc_freq);
+ #if HIDE
int agg1 = 0, agg2 = 0, agg3 = 0;
for (int i=0; i<nthreads; i++) {
agg1 += tprof[PE11][i];
@@ -160,10 +160,32 @@ int display_stats(int nthreads)
if (agg1 != agg3)
fprintf(stderr, "There is a discrepancy re-allocs, plz rectify!!\n");
- assert(agg2 != 0);
- fprintf(stderr, "\n\tTotal re-allocs: %d out of total requests: %d, Rate: %0.2f\n",
- agg1, agg2, agg1*1.0/agg2);
+ if(agg2 > 0)
+ {
+ fprintf(stderr, "\n\tTotal re-allocs: %d out of total requests: %d, Rate: %0.2f\n",
+ agg1, agg2, agg1*1.0/agg2);
+ }
+
+ double res, max_ = 0, min_=1e10;
+ for (int i=0; i<nthreads; i++) {
+ double val = (tprof[ALIGN1][i]*1.0) / tprof[MEM_CHAIN][i];
+ res += val;
+ if (max_ < val) max_ = val;
+ if (min_ > val) min_ = val;
+ }
+ fprintf(stderr, "\tAvg. FM-index traversal per get_sa_entry(): avg: %lf, max: %lf, min: %lf\n",
+ res/nthreads, max_, min_);
+ int64_t tot_inst1 = 0, tot_inst2 = 0;
+ for (int i=0; i<nthreads; i++) {
+ tot_inst1 += tprof[SAM1][i];
+ tot_inst2 += tprof[SAM2][i];
+ }
+
+ fprintf(stderr, "\ttot_inst1: %ld, tot_inst2: %ld, over %d threads\n",
+ tot_inst1, tot_inst2, nthreads);
+ #endif
+
#if HIDE
fprintf(stderr, "\n BSW Perf.:\n");
find_opt(tprof[MEM_ALN2_B], 1, &max, &min, &avg);
=====================================
src/profiling.h
=====================================
@@ -31,5 +31,5 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
#define _PROFILE_HPP
int display_stats(int );
-
+extern uint64_t proc_freq, tprof[LIM_R][LIM_C];
#endif
=====================================
src/runsimd.cpp
=====================================
@@ -152,11 +152,25 @@ static int exe_path(const char *exe, int max, char buf[], int *base_st)
static void test_and_launch(char *argv[], char *prefix, const char *simd) // we assume prefix is long enough
{
struct stat st;
+ int prefix_len = strlen(prefix);
strcat_s(prefix, PATH_MAX, simd);
- if (stat(prefix, &st) == 0 && (st.st_mode & S_IXUSR)) {
- //fprintf(stderr, "Launching executable \"%s\"\n", prefix);
- execv(prefix, argv);
- }
+ fprintf(stderr, "Looking to launch executable \"%s\", simd = %s\n", prefix, simd);
+ if(stat(prefix, &st) == 0)
+ {
+ if (st.st_mode & S_IXUSR) {
+ fprintf(stderr, "Launching executable \"%s\"\n", prefix);
+ execv(prefix, argv);
+ }
+ else
+ {
+ fprintf(stderr, "(st.st_mode & S_IXUSR) = %d, can not run executable: %s\n", st.st_mode & S_IXUSR, prefix);
+ }
+ }
+ else
+ {
+ fprintf(stderr, "stat(prefix, &st) = %d, can not run executable: %s\n", stat(prefix, &st), prefix);
+ }
+ prefix[prefix_len] = 0;
}
int main(int argc, char *argv[])
@@ -179,13 +193,10 @@ int main(int argc, char *argv[])
//prefix_len = strlen(prefix);
simd = x86_simd();
if (simd & SIMD_AVX512BW) test_and_launch(argv, prefix, ".avx512bw");
- if (simd & SIMD_AVX512F) test_and_launch(argv, prefix, ".avx512f");
if (simd & SIMD_AVX2) test_and_launch(argv, prefix, ".avx2");
if (simd & SIMD_AVX) test_and_launch(argv, prefix, ".avx");
if (simd & SIMD_SSE4_2) test_and_launch(argv, prefix, ".sse42");
if (simd & SIMD_SSE4_1) test_and_launch(argv, prefix, ".sse41");
- if (simd & SIMD_SSE2) test_and_launch(argv, prefix, ".sse2");
- if (simd & SIMD_SSE) test_and_launch(argv, prefix, ".sse");
free(prefix);
fprintf(stderr, "ERROR: fail to find the right executable\n");
return 2;
=====================================
test/fmi_test.cpp
=====================================
@@ -40,6 +40,7 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
#endif
#define QUERY_DB_SIZE 1280000000
+
int myrank, num_ranks;
int main(int argc, char **argv) {
@@ -88,8 +89,9 @@ int main(int argc, char **argv) {
assert(max_readlength > 0);
assert(max_readlength < 10000);
assert(numReads > 0);
- assert(numReads * max_readlength < QUERY_DB_SIZE);
printf("numReads = %d, max_readlength = %d, min_readlength = %d\n", numReads, max_readlength, min_readlength);
+ assert(numReads * max_readlength < QUERY_DB_SIZE);
+
uint8_t *enc_qdb=(uint8_t *)malloc(numReads * max_readlength * sizeof(uint8_t));
int64_t cind,st;
@@ -246,7 +248,7 @@ int main(int argc, char **argv) {
}
printf("totalSmems = %ld\n", totalSmem);
-#ifdef PRINT_OUTPUT
+ #ifdef PRINT_OUTPUT
int32_t prevRid = -1;
for(batch_id = 0; batch_id < num_batches; batch_id++)
{
@@ -271,7 +273,7 @@ int main(int argc, char **argv) {
u2 = smem.k + smem.s;
for(u3 = u1; u3 < u2; u3++)
{
- printf("%ld,", fmiSearch->get_sa_entry(u3));
+ //printf("%ld,", fmiSearch->get_sa_entry(u3));
}
printf("]");
#endif
View it on GitLab: https://salsa.debian.org/med-team/bwa-mem2/-/compare/7f44836367952bad93cebe969a6f27f9b706c426...a7f38675c6c0d9ec41cd660b2eb38165cc9af080
--
View it on GitLab: https://salsa.debian.org/med-team/bwa-mem2/-/compare/7f44836367952bad93cebe969a6f27f9b706c426...a7f38675c6c0d9ec41cd660b2eb38165cc9af080
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210308/e75cc62c/attachment-0001.htm>
More information about the debian-med-commit
mailing list