[med-svn] [Git][med-team/bwa-mem2][upstream] New upstream version 2.2

Mon Mar 8 15:25:15 GMT 2021


Michael R. Crusoe pushed to branch upstream at Debian Med / bwa-mem2


Commits:
944b4a3d by Michael R. Crusoe at 2021-03-08T15:51:37+01:00
New upstream version 2.2
- - - - -


20 changed files:

- Makefile
- README.md
- src/FMI_search.cpp
- src/FMI_search.h
- src/bandedSWA.cpp
- src/bwamem.cpp
- src/bwamem_pair.cpp
- src/kopen.cpp
- src/kseq.h
- src/kstring.h
- src/kswv.cpp
- src/kthread.cpp
- src/macro.h
- src/main.cpp
- + src/memcpy_bwamem.cpp
- + src/memcpy_bwamem.h
- src/profiling.cpp
- src/profiling.h
- src/runsimd.cpp
- test/fmi_test.cpp


Changes:

=====================================
Makefile
=====================================
@@ -28,6 +28,10 @@
 ##                                Heng Li <hli at jimmy.harvard.edu> 
 ##*****************************************************************************************/
 
+ifneq ($(portable),)
+	STATIC_GCC=-static-libgcc -static-libstdc++
+endif
+
 EXE=		bwa-mem2
 #CXX=		icpc
 ifeq ($(CXX), icpc)
@@ -35,24 +39,36 @@ ifeq ($(CXX), icpc)
 else ifeq ($(CXX), g++)
 	CC=gcc
 endif		
-ARCH_FLAGS=	-msse4.1
+ARCH_FLAGS=	-msse -msse2 -msse3 -mssse3 -msse4.1
 MEM_FLAGS=	-DSAIS=1
-CPPFLAGS=	-DENABLE_PREFETCH -DV17=1 $(MEM_FLAGS) 
+CPPFLAGS+=	-DENABLE_PREFETCH -DV17=1 -DMATE_SORT=0 $(MEM_FLAGS) 
 INCLUDES=   -Isrc -Iext/safestringlib/include
-LIBS=		-lpthread -lm -lz -L. -lbwa  -Lext/safestringlib -lsafestring
-OBJS=		src/fastmap.o src/bwtindex.o src/utils.o src/kthread.o \
+LIBS=		-lpthread -lm -lz -L. -lbwa -Lext/safestringlib -lsafestring $(STATIC_GCC)
+OBJS=		src/fastmap.o src/bwtindex.o src/utils.o src/memcpy_bwamem.o src/kthread.o \
 			src/kstring.o src/ksw.o src/bntseq.o src/bwamem.o src/profiling.o src/bandedSWA.o \
 			src/FMI_search.o src/read_index_ele.o src/bwamem_pair.o src/kswv.o src/bwa.o \
 			src/bwamem_extra.o src/kopen.o
 BWA_LIB=    libbwa.a
 SAFE_STR_LIB=    ext/safestringlib/libsafestring.a
 
-ifneq ($(portable),)
-	LIBS+=-static-libgcc -static-libstdc++
-endif
-
-ifeq ($(arch),sse)
+ifeq ($(arch),sse41)
+	ifeq ($(CXX), icpc)
 		ARCH_FLAGS=-msse4.1
+	else
+		ARCH_FLAGS=-msse -msse2 -msse3 -mssse3 -msse4.1
+	endif
+else ifeq ($(arch),sse42)
+	ifeq ($(CXX), icpc)	
+		ARCH_FLAGS=-msse4.2
+	else
+		ARCH_FLAGS=-msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2
+	endif
+else ifeq ($(arch),avx)
+	ifeq ($(CXX), icpc)
+		ARCH_FLAGS=-mavx ##-xAVX
+	else	
+		ARCH_FLAGS=-mavx
+	endif
 else ifeq ($(arch),avx2)
 	ifeq ($(CXX), icpc)
 		ARCH_FLAGS=-march=core-avx2 #-xCORE-AVX2
@@ -70,9 +86,11 @@ else ifeq ($(arch),native)
 else ifneq ($(arch),)
 # To provide a different architecture flag like -march=core-avx2.
 	ARCH_FLAGS=$(arch)
+else
+myall:multi
 endif
 
-CXXFLAGS=	-g -O3 -fpermissive $(ARCH_FLAGS) #-Wall ##-xSSE2
+CXXFLAGS+=	-g -O3 -fpermissive $(ARCH_FLAGS) #-Wall ##-xSSE2
 
 .PHONY:all clean depend multi
 .SUFFIXES:.cpp .o
@@ -84,24 +102,29 @@ all:$(EXE)
 
 multi:
 	rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
-	$(MAKE) arch=sse    EXE=bwa-mem2.sse41    CXX=$(CXX) all
+	$(MAKE) arch=sse41    EXE=bwa-mem2.sse41    CXX=$(CXX) all
+	rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
+	$(MAKE) arch=sse42    EXE=bwa-mem2.sse42    CXX=$(CXX) all
+	rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
+	$(MAKE) arch=avx    EXE=bwa-mem2.avx    CXX=$(CXX) all
 	rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
 	$(MAKE) arch=avx2   EXE=bwa-mem2.avx2     CXX=$(CXX) all
 	rm -f src/*.o $(BWA_LIB); cd ext/safestringlib/ && $(MAKE) clean;
 	$(MAKE) arch=avx512 EXE=bwa-mem2.avx512bw CXX=$(CXX) all
-	$(CXX) -Wall -O3 src/runsimd.cpp -Iext/safestringlib/include -Lext/safestringlib/ -lsafestring -o bwa-mem2
+	$(CXX) -Wall -O3 src/runsimd.cpp -Iext/safestringlib/include -Lext/safestringlib/ -lsafestring $(STATIC_GCC) -o bwa-mem2
+
 
 $(EXE):$(BWA_LIB) $(SAFE_STR_LIB) src/main.o
-	$(CXX) $(CXXFLAGS) src/main.o $(BWA_LIB) $(LIBS) -o $@
+	$(CXX) $(CXXFLAGS) $(LDFLAGS) src/main.o $(BWA_LIB) $(LIBS) -o $@
 
 $(BWA_LIB):$(OBJS)
 	ar rcs $(BWA_LIB) $(OBJS)
 
 $(SAFE_STR_LIB):
-	cd ext/safestringlib/ && make clean && make CC=$(CC) directories libsafestring.a
+	cd ext/safestringlib/ && $(MAKE) clean && $(MAKE) CC=$(CC) directories libsafestring.a
 
 clean:
-	rm -fr src/*.o $(BWA_LIB) $(EXE) bwa-mem2.sse41 bwa-mem2.avx2 bwa-mem2.avx512bw
+	rm -fr src/*.o $(BWA_LIB) $(EXE) bwa-mem2.sse41 bwa-mem2.sse42 bwa-mem2.avx bwa-mem2.avx2 bwa-mem2.avx512bw
 	cd ext/safestringlib/ && $(MAKE) clean
 
 depend:
@@ -147,3 +170,4 @@ src/profiling.o: src/macro.h
 src/read_index_ele.o: src/read_index_ele.h src/utils.h src/bntseq.h
 src/read_index_ele.o: src/macro.h
 src/utils.o: src/utils.h src/ksort.h src/kseq.h
+src/memcpy_bwamem.o: src/memcpy_bwamem.h


=====================================
README.md
=====================================
@@ -1,6 +1,11 @@
+[![GitHub Downloads](https://img.shields.io/github/downloads/bwa-mem2/bwa-mem2/total?label=GitHub%20Downloads)](https://github.com/bwa-mem2/bwa-mem2/releases)
+[![BioConda Install](https://img.shields.io/conda/dn/bioconda/bwa-mem2?label=BioConda%20Installs)](https://anaconda.org/bioconda/bwa-mem2)
+
 ## Important Information
 
-***Index structure has changed since commit 6743183. Rebuild the Index if you are using a later commit.***
+***We are happy to announce that the index size on disk is down by 8 times and in memory by 4 times due to moving to only one type of FM-index (2bit.64 instead of 2bit.64 and 8bit.32) and 8x compression of suffix array. For example, for human genome, index size on disk is down to ~10GB from ~80GB and memory footprint is down to ~10GB from ~40GB.***
+***There is a substantial reduction in index IO time due to the reduction and hardly any performance impact on read mapping.***
+***Due to this change in index structure (in commit #4b59796, 10th October 2020), you will need to rebuild the index.***
 
 ***Added MC flag in the output sam file in commit a591e22. Output should match original bwa-mem version 0.7.17.***
 
@@ -49,7 +54,7 @@ the most efficient implementation based on the SIMD instruction set available
 on the running machine. Precompiled binaries were generated on a CentOS6
 machine using the following command line:
 ```sh
-make CXX=icpc multi
+make CXX=icpc
 ```
 
 [bwa]: https://github.com/lh3/bwa
@@ -74,8 +79,8 @@ Where <prefix> is the prefix specified when creating the index or the path to th
 
 ## Performance
 
-Datasets:
-
+Datasets:  
+Reference Genome: human_g1k_v37.fasta
 
  Alias	    |  Dataset source				|  No. of reads	| Read length 
  --------- | --------- | --------- | --------- 
@@ -92,12 +97,47 @@ OS: CentOS Linux release 7.6.1810
 Memory: 100GB  
 
 
+We followed the steps below to collect the performance results:  
+A. Data download steps:
+1. Download SRA toolkit from https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software#header-global    
+2. tar xfzv sratoolkit.2.10.5-centos_linux64.tar.gz  
+3. Download D2: sratoolkit.2.10.5-centos_linux64/bin/fastq-dump --split-files SRR7733443   
+4. Download D3: sratoolkit.2.10.5-centos_linux64/bin/fastq-dump --split-files SRR9932168   
+5. Download D4: sratoolkit.2.10.5-centos_linux64/bin/fastq-dump --split-files SRX6999918   
+
+
+
+B. Alignment steps:   
+1. git clone https://github.com/bwa-mem2/bwa-mem2.git   
+2. cd bwa-mem2   
+3. ```make CXX=icpc``` (using intel C/C++ compiler)   
+or   ```make``` (using gcc compiler)   
+4. ./bwa-mem2 index <ref.fa>   
+5. ./bwa-mem2 mem [-t <#threads>] <ref.fa> <in_1.fastq> [<in_2.fastq>]  >  <output.sam>   
+
+For example,  in our double socket (56 threads each) and double numa compute node, we used the following command line to align D2 to human_g1k_v37.fasta reference genome.  
+```
+numactl -m 0 -C 0-27,56-83 ./bwa-mem2 index human_g1k_v37.fasta  
+numactl -m 0 -C 0-27,56-83 ./bwa-mem2 mem -t 56 human_g1k_v37.fasta SRR7733443_1.fastq SRR7733443_2.fastq > d3_align.sam
+```
+
 <p align="center">
 <img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-1.png" height="400"/a></br>
 <img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-2.png" height="400"/a></br>
 <img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-3.png" height="400"/a></br>
 <img src="https://github.com/bwa-mem2/bwa-mem2/blob/master/images/bwa-mem2-4.png" height="400"/a></br>
-</p>
+</p> 
+
+## bwa-mem2 seeding speedup with Enumerated Radix Trees (Code in ert branch)
+
+The ert branch of bwa-mem2 repository contains codebase of enuerated radix tree based acceleration of bwa-mem2. The ert code is built on the top of bwa-mem2 (thanks to the hard work by @arun-sub). 
+The following are the highlights of the ert based bwa-mem2 tool: 
+1. Exact same output as bwa-mem(2) 
+2. The tool has two additional flags to enable the use of ert solution (for index creation and mapping), else it runs in vanilla bwa-mem2 mode 
+3. It uses 1 additional flag to create ert index (different from bwa-mem2 index) and 1 additional flag for using that ert index (please see the readme of ert branch) 
+4. The ert solution is 10% - 30% faster (tested on above machine configuration) in comparison to vanilla bwa-mem2 -- users are adviced to use option `-K 1000000` to see the speedups 
+5. The memory foot print of the ert index is ~60GB 
+6. The code is present in ert branch: https://github.com/bwa-mem2/bwa-mem2/tree/ert
 
 
 ## Citation


=====================================
src/FMI_search.cpp
=====================================
@@ -30,10 +30,12 @@ Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at i
 #include <stdio.h>
 #include "sais.h"
 #include "FMI_search.h"
+#include "memcpy_bwamem.h"
+#include "profiling.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
-#include "safe_mem_lib.h"
 #include "safe_str_lib.h"
 #ifdef __cplusplus
 }
@@ -50,9 +52,7 @@ FMI_search::FMI_search(const char *fname)
     sa_ls_word = NULL;
     sa_ms_byte = NULL;
     cp_occ = NULL;
-#if ((__AVX2__))
-    c_bcast_array = NULL;
-#endif
+    one_hot_mask_array = NULL;
 }
 
 FMI_search::~FMI_search()
@@ -63,10 +63,8 @@ FMI_search::~FMI_search()
         _mm_free(sa_ls_word);
     if(cp_occ)
         _mm_free(cp_occ);
-#if ((__AVX2__))
-    if(c_bcast_array)
-        _mm_free(c_bcast_array);
-#endif
+    if(one_hot_mask_array)
+        _mm_free(one_hot_mask_array);
 }
 
 int64_t FMI_search::pac_seq_len(const char *fn_pac)
@@ -143,14 +141,15 @@ void FMI_search::pac2nt(const char *fn_pac, std::string &reference_seq)
 	free(buf2);
 }
 
-int FMI_search::build_fm_index_avx(const char *ref_file_name, char *binary_seq, int64_t ref_seq_len, int64_t *sa_bwt, int64_t *count) {
+int FMI_search::build_fm_index(const char *ref_file_name, char *binary_seq, int64_t ref_seq_len, int64_t *sa_bwt, int64_t *count) {
     printf("ref_seq_len = %ld\n", ref_seq_len);
     fflush(stdout);
 
     char outname[PATH_MAX];
+
     strcpy_s(outname, PATH_MAX, ref_file_name);
-    strcat_s(outname, PATH_MAX, CP_FILENAME_SUFFIX_AVX);
-    //sprintf(outname, "%s.bwt.8bit.%d", ref_file_name, CP_BLOCK_SIZE_AVX);
+    strcat_s(outname, PATH_MAX, CP_FILENAME_SUFFIX);
+    //sprintf(outname, "%s.bwt.2bit.%d", ref_file_name, CP_BLOCK_SIZE);
 
     std::fstream outstream (outname, std::ios::out | std::ios::binary);
     outstream.seekg(0);	
@@ -165,18 +164,19 @@ int FMI_search::build_fm_index_avx(const char *ref_file_name, char *binary_seq,
     outstream.write((char*)count, 5 * sizeof(int64_t));
 
     int64_t i;
-    int64_t ref_seq_len_aligned = ((ref_seq_len + CP_BLOCK_SIZE_AVX - 1) / CP_BLOCK_SIZE_AVX) * CP_BLOCK_SIZE_AVX;
+    int64_t ref_seq_len_aligned = ((ref_seq_len + CP_BLOCK_SIZE - 1) / CP_BLOCK_SIZE) * CP_BLOCK_SIZE;
     int64_t size = ref_seq_len_aligned * sizeof(uint8_t);
     bwt = (uint8_t *)_mm_malloc(size, 64);
-    index_alloc += size;
     assert_not_null(bwt, size, index_alloc);
 
+    int64_t sentinel_index = -1;
     for(i=0; i< ref_seq_len; i++)
     {
         if(sa_bwt[i] == 0)
         {
             bwt[i] = 4;
             printf("BWT[%ld] = 4\n", i);
+            sentinel_index = i;
         }
         else
         {
@@ -201,188 +201,82 @@ int FMI_search::build_fm_index_avx(const char *ref_file_name, char *binary_seq,
         bwt[i] = DUMMY_CHAR;
 
 
-    printf("CP_SHIFT = %d, CP_MASK = %d\n", CP_SHIFT_AVX, CP_MASK_AVX);
-    printf("sizeof CP_OCC = %ld\n", sizeof(CP_OCC_AVX));
+    printf("CP_SHIFT = %d, CP_MASK = %d\n", CP_SHIFT, CP_MASK);
+    printf("sizeof CP_OCC = %ld\n", sizeof(CP_OCC));
     fflush(stdout);
     // create checkpointed occ
-    int64_t cp_occ_size = (ref_seq_len >> CP_SHIFT_AVX) + 1;
-    CP_OCC_AVX *cp_occ = NULL;
+    int64_t cp_occ_size = (ref_seq_len >> CP_SHIFT) + 1;
+    CP_OCC *cp_occ = NULL;
 
-    size = cp_occ_size * sizeof(CP_OCC_AVX);
-    cp_occ = (CP_OCC_AVX *)_mm_malloc(size, 64);
-    index_alloc += size;
+    size = cp_occ_size * sizeof(CP_OCC);
+    cp_occ = (CP_OCC *)_mm_malloc(size, 64);
     assert_not_null(cp_occ, size, index_alloc);
-    memset(cp_occ, 0, cp_occ_size * sizeof(CP_OCC_AVX));
+    memset(cp_occ, 0, cp_occ_size * sizeof(CP_OCC));
     int64_t cp_count[16];
 
     memset(cp_count, 0, 16 * sizeof(int64_t));
     for(i = 0; i < ref_seq_len; i++)
     {
-        if((i & CP_MASK_AVX) == 0)
+        if((i & CP_MASK) == 0)
         {
-            CP_OCC_AVX cpo;
+            CP_OCC cpo;
             cpo.cp_count[0] = cp_count[0];
             cpo.cp_count[1] = cp_count[1];
             cpo.cp_count[2] = cp_count[2];
             cpo.cp_count[3] = cp_count[3];
-			memcpy_s(cpo.bwt_str, CP_BLOCK_SIZE_AVX * sizeof(uint8_t), bwt + i, CP_BLOCK_SIZE_AVX * sizeof(uint8_t));
 
-            cp_occ[i >> CP_SHIFT_AVX] = cpo;
+			int32_t j;
+            cpo.one_hot_bwt_str[0] = 0;
+            cpo.one_hot_bwt_str[1] = 0;
+            cpo.one_hot_bwt_str[2] = 0;
+            cpo.one_hot_bwt_str[3] = 0;
+
+			for(j = 0; j < CP_BLOCK_SIZE; j++)
+			{
+                cpo.one_hot_bwt_str[0] = cpo.one_hot_bwt_str[0] << 1;
+                cpo.one_hot_bwt_str[1] = cpo.one_hot_bwt_str[1] << 1;
+                cpo.one_hot_bwt_str[2] = cpo.one_hot_bwt_str[2] << 1;
+                cpo.one_hot_bwt_str[3] = cpo.one_hot_bwt_str[3] << 1;
+				uint8_t c = bwt[i + j];
+                //printf("c = %d\n", c);
+                if(c < 4)
+                {
+                    cpo.one_hot_bwt_str[c] += 1;
+                }
+			}
+
+            cp_occ[i >> CP_SHIFT] = cpo;
         }
         cp_count[bwt[i]]++;
     }
-    outstream.write((char*)cp_occ, cp_occ_size * sizeof(CP_OCC_AVX));
+    outstream.write((char*)cp_occ, cp_occ_size * sizeof(CP_OCC));
     _mm_free(cp_occ);
     _mm_free(bwt);
-    index_alloc -= (ref_seq_len_aligned * sizeof(uint8_t) + cp_occ_size * sizeof(CP_OCC_AVX));
 
-    size = ref_seq_len * sizeof(uint32_t);
+    #if SA_COMPRESSION  
+
+    size = ((ref_seq_len >> SA_COMPX)+ 1)  * sizeof(uint32_t);
     uint32_t *sa_ls_word = (uint32_t *)_mm_malloc(size, 64);
-    index_alloc += size;
     assert_not_null(sa_ls_word, size, index_alloc);
-    size = ref_seq_len * sizeof(int8_t);
+    size = ((ref_seq_len >> SA_COMPX) + 1) * sizeof(int8_t);
     int8_t *sa_ms_byte = (int8_t *)_mm_malloc(size, 64);
-    index_alloc += size;
     assert_not_null(sa_ms_byte, size, index_alloc);
+    int64_t pos = 0;
     for(i = 0; i < ref_seq_len; i++)
     {
-        sa_ls_word[i] = sa_bwt[i] & 0xffffffff;
-        sa_ms_byte[i] = (sa_bwt[i] >> 32) & 0xff;
-    }
-    outstream.write((char*)sa_ms_byte, ref_seq_len * sizeof(int8_t));
-    outstream.write((char*)sa_ls_word, ref_seq_len * sizeof(uint32_t));
-    outstream.close();
-    printf("max_occ_ind = %ld\n", i >> CP_SHIFT_AVX);    
-    fflush(stdout);
-
-    _mm_free(sa_ms_byte);
-    _mm_free(sa_ls_word);
-    return 0;
-}
-
-int FMI_search::build_fm_index_scalar(const char *ref_file_name, char *binary_seq, int64_t ref_seq_len, int64_t *sa_bwt, int64_t *count) {
-    printf("ref_seq_len = %ld\n", ref_seq_len);
-    fflush(stdout);
-
-    char outname[PATH_MAX];
-
-    strcpy_s(outname, PATH_MAX, ref_file_name);
-    strcat_s(outname, PATH_MAX, CP_FILENAME_SUFFIX_SCALAR);
-    //sprintf(outname, "%s.bwt.2bit.%d", ref_file_name, CP_BLOCK_SIZE_SCALAR);
-
-    std::fstream outstream (outname, std::ios::out | std::ios::binary);
-    outstream.seekg(0);	
-
-    printf("count = %ld, %ld, %ld, %ld, %ld\n", count[0], count[1], count[2], count[3], count[4]);
-    fflush(stdout);
-
-    uint8_t *bwt;
-
-    ref_seq_len++;
-    outstream.write((char *)(&ref_seq_len), 1 * sizeof(int64_t));
-    outstream.write((char*)count, 5 * sizeof(int64_t));
-
-    int64_t i;
-    int64_t ref_seq_len_aligned = ((ref_seq_len + CP_BLOCK_SIZE_SCALAR - 1) / CP_BLOCK_SIZE_SCALAR) * CP_BLOCK_SIZE_SCALAR;
-    int64_t size = ref_seq_len_aligned * sizeof(uint8_t);
-    bwt = (uint8_t *)_mm_malloc(size, 64);
-    assert_not_null(bwt, size, index_alloc);
-
-    for(i=0; i< ref_seq_len; i++)
-    {
-        if(sa_bwt[i] == 0)
-        {
-            bwt[i] = 4;
-            printf("BWT[%ld] = 4\n", i);
-        }
-        else
-        {
-            char c = binary_seq[sa_bwt[i]-1];
-            switch(c)
-            {
-                case 0: bwt[i] = 0;
-                          break;
-                case 1: bwt[i] = 1;
-                          break;
-                case 2: bwt[i] = 2;
-                          break;
-                case 3: bwt[i] = 3;
-                          break;
-                default:
-                        fprintf(stderr, "ERROR! i = %ld, c = %c\n", i, c);
-                        exit(EXIT_FAILURE);
-            }
-        }
-    }
-    for(i = ref_seq_len; i < ref_seq_len_aligned; i++)
-        bwt[i] = DUMMY_CHAR;
-
-
-    printf("CP_SHIFT = %d, CP_MASK = %d\n", CP_SHIFT_SCALAR, CP_MASK_SCALAR);
-    printf("sizeof CP_OCC = %ld\n", sizeof(CP_OCC_SCALAR));
-    fflush(stdout);
-    // create checkpointed occ
-    int64_t cp_occ_size = (ref_seq_len >> CP_SHIFT_SCALAR) + 1;
-    CP_OCC_SCALAR *cp_occ = NULL;
-
-    size = cp_occ_size * sizeof(CP_OCC_SCALAR);
-    cp_occ = (CP_OCC_SCALAR *)_mm_malloc(size, 64);
-    assert_not_null(cp_occ, size, index_alloc);
-    memset(cp_occ, 0, cp_occ_size * sizeof(CP_OCC_SCALAR));
-    int64_t cp_count[16];
-
-    memset(cp_count, 0, 16 * sizeof(int64_t));
-    for(i = 0; i < ref_seq_len; i++)
-    {
-        if((i & CP_MASK_SCALAR) == 0)
+        if ((i & SA_COMPX_MASK) == 0)
         {
-            CP_OCC_SCALAR cpo;
-            cpo.cp_count[0] = cp_count[0];
-            cpo.cp_count[1] = cp_count[1];
-            cpo.cp_count[2] = cp_count[2];
-            cpo.cp_count[3] = cp_count[3];
-
-			BIT_DATA_TYPE bwt_str_bit0 = 0;
-			BIT_DATA_TYPE bwt_str_bit1 = 0;
-			BIT_DATA_TYPE dollar_mask = 0;
-			int32_t j;
-			for(j = 0; j < CP_BLOCK_SIZE_SCALAR; j++)
-			{
-				uint8_t c = bwt[i + j];
-				if((c == 4) || (c == DUMMY_CHAR))
-				{
-					dollar_mask <<= 1;
-					dollar_mask += 1;
-					c = 0;
-				}
-				else if(c > 3)
-				{
-					fprintf(stderr, "ERROR! [%ld, %d] c = %u\n", (long)i, j, c);
-					exit(EXIT_FAILURE);
-				}
-				else
-				{
-					dollar_mask <<= 1;
-					dollar_mask += 0;
-				}
-				bwt_str_bit0 = bwt_str_bit0 << 1;
-				bwt_str_bit0 += (c & 1);
-				bwt_str_bit1 = bwt_str_bit1 << 1;
-				bwt_str_bit1 += ((c >> 1) & 1);
-			}
-			cpo.bwt_str_bit0 = bwt_str_bit0;
-			cpo.bwt_str_bit1 = bwt_str_bit1;
-			cpo.dollar_mask  = dollar_mask;
-
-            memset(cpo.pad, 0, PADDING_SCALAR);
-            cp_occ[i >> CP_SHIFT_SCALAR] = cpo;
+            sa_ls_word[pos] = sa_bwt[i] & 0xffffffff;
+            sa_ms_byte[pos] = (sa_bwt[i] >> 32) & 0xff;
+            pos++;
         }
-        cp_count[bwt[i]]++;
     }
-    outstream.write((char*)cp_occ, cp_occ_size * sizeof(CP_OCC_SCALAR));
-    _mm_free(cp_occ);
-    _mm_free(bwt);
-
+    fprintf(stderr, "pos: %d, ref_seq_len__: %ld\n", pos, ref_seq_len >> SA_COMPX);
+    outstream.write((char*)sa_ms_byte, ((ref_seq_len >> SA_COMPX) + 1) * sizeof(int8_t));
+    outstream.write((char*)sa_ls_word, ((ref_seq_len >> SA_COMPX) + 1) * sizeof(uint32_t));
+    
+    #else
+    
     size = ref_seq_len * sizeof(uint32_t);
     uint32_t *sa_ls_word = (uint32_t *)_mm_malloc(size, 64);
     assert_not_null(sa_ls_word, size, index_alloc);
@@ -396,8 +290,12 @@ int FMI_search::build_fm_index_scalar(const char *ref_file_name, char *binary_se
     }
     outstream.write((char*)sa_ms_byte, ref_seq_len * sizeof(int8_t));
     outstream.write((char*)sa_ls_word, ref_seq_len * sizeof(uint32_t));
+    
+    #endif
+
+    outstream.write((char *)(&sentinel_index), 1 * sizeof(int64_t));
     outstream.close();
-    printf("max_occ_ind = %ld\n", i >> CP_SHIFT_SCALAR);    
+    printf("max_occ_ind = %ld\n", i >> CP_SHIFT);    
     fflush(stdout);
 
     _mm_free(sa_ms_byte);
@@ -473,11 +371,11 @@ int FMI_search::build_index() {
 	//status = saisxx<const char *, int64_t *, int64_t>(reference_seq.c_str(), suffix_array + 1, pac_len, 4);
 	status = saisxx(reference_seq.c_str(), suffix_array + 1, pac_len);
 	suffix_array[0] = pac_len;
-    fprintf(stderr, "build index ticks = %llu\n", __rdtsc() - startTick);
+    fprintf(stderr, "build suffix-array ticks = %llu\n", __rdtsc() - startTick);
     startTick = __rdtsc();
 
-    build_fm_index_avx(prefix, binary_ref_seq, pac_len, suffix_array, count);
-	build_fm_index_scalar(prefix, binary_ref_seq, pac_len, suffix_array, count);
+	build_fm_index(prefix, binary_ref_seq, pac_len, suffix_array, count);
+    fprintf(stderr, "build fm-index ticks = %llu\n", __rdtsc() - startTick);
     _mm_free(binary_ref_seq);
     _mm_free(suffix_array);
     return 0;
@@ -485,17 +383,22 @@ int FMI_search::build_index() {
 
 void FMI_search::load_index()
 {
+    one_hot_mask_array = (uint64_t *)_mm_malloc(64 * sizeof(uint64_t), 64);
+    one_hot_mask_array[0] = 0;
+    uint64_t base = 0x8000000000000000L;
+    one_hot_mask_array[1] = base;
+    int64_t i = 0;
+    for(i = 2; i < 64; i++)
+    {
+        one_hot_mask_array[i] = (one_hot_mask_array[i - 1] >> 1) | base;
+    }
+
     char *ref_file_name = file_name;
     //beCalls = 0;
     char cp_file_name[PATH_MAX];
     strcpy_s(cp_file_name, PATH_MAX, ref_file_name);
     strcat_s(cp_file_name, PATH_MAX, CP_FILENAME_SUFFIX);
-//    assert(strnlen(ref_file_name, 1000) + 12 < 1000);
-//#if ((!__AVX2__))
-//    sprintf(cp_file_name, "%s.bwt.2bit.%d", ref_file_name, CP_BLOCK_SIZE_SCALAR);
-//#else
-//    sprintf(cp_file_name, "%s.bwt.8bit.%d", ref_file_name, CP_BLOCK_SIZE_AVX);
-//#endif
+
     // Read the BWT and FM index of the reference sequence
     FILE *cpstream = NULL;
     cpstream = fopen(cp_file_name,"rb");
@@ -531,19 +434,50 @@ void FMI_search::load_index()
     {
         count[ii] = count[ii] + 1;
     }
+
+    #if SA_COMPRESSION
+
+    int64_t reference_seq_len_ = (reference_seq_len >> SA_COMPX) + 1;
+    sa_ms_byte = (int8_t *)_mm_malloc(reference_seq_len_ * sizeof(int8_t), 64);
+    sa_ls_word = (uint32_t *)_mm_malloc(reference_seq_len_ * sizeof(uint32_t), 64);
+    err_fread_noeof(sa_ms_byte, sizeof(int8_t), reference_seq_len_, cpstream);
+    err_fread_noeof(sa_ls_word, sizeof(uint32_t), reference_seq_len_, cpstream);
+    
+    #else
+    
     sa_ms_byte = (int8_t *)_mm_malloc(reference_seq_len * sizeof(int8_t), 64);
     sa_ls_word = (uint32_t *)_mm_malloc(reference_seq_len * sizeof(uint32_t), 64);
     err_fread_noeof(sa_ms_byte, sizeof(int8_t), reference_seq_len, cpstream);
     err_fread_noeof(sa_ls_word, sizeof(uint32_t), reference_seq_len, cpstream);
-    fclose(cpstream);
+
+    #endif
 
     sentinel_index = -1;
+    #if SA_COMPRESSION
+    err_fread_noeof(&sentinel_index, sizeof(int64_t), 1, cpstream);
+    fprintf(stderr, "* sentinel-index: %ld\n", sentinel_index);
+    #endif
+    fclose(cpstream);
+
     int64_t x;
+    #if !SA_COMPRESSION
     for(x = 0; x < reference_seq_len; x++)
     {
-        if(get_sa_entry(x) == 0)
+        // fprintf(stderr, "x: %ld\n", x);
+        #if SA_COMPRESSION
+        if(get_sa_entry_compressed(x) == 0) {
             sentinel_index = x;
+            break;
+        }
+        #else
+        if(get_sa_entry(x) == 0) {
+            sentinel_index = x;
+            break;
+        }
+        #endif
     }
+    fprintf(stderr, "\nsentinel_index: %ld\n", x);    
+    #endif
 
     fprintf(stderr, "* Count:\n");
     for(x = 0; x < 5; x++)
@@ -556,26 +490,6 @@ void FMI_search::load_index()
             ref_file_name);
     bwa_idx_load_ele(ref_file_name, BWA_IDX_ALL);
 
-#if ((!__AVX2__))
-    base_mask[0][0] = 0;
-    base_mask[0][1] = 0;
-    base_mask[1][0] = 0xffffffffffffffffL;
-    base_mask[1][1] = 0;
-    base_mask[2][0] = 0;
-    base_mask[2][1] = 0xffffffffffffffffL;
-    base_mask[3][0] = 0xffffffffffffffffL;
-    base_mask[3][1] = 0xffffffffffffffffL;
-#else
-    c_bcast_array = (uint8_t *)_mm_malloc(256 * sizeof(uint8_t), 64);
-    for(ii = 0; ii < 4; ii++)
-    {
-        int32_t j;
-        for(j = 0; j < 64; j++)
-        {
-            c_bcast_array[ii * 64 + j] = ii;
-        }
-    }
-#endif
     fprintf(stderr, "* Done reading Index!!\n");
 }
 
@@ -1118,15 +1032,8 @@ SMEM FMI_search::backwardExt(SMEM smem, uint8_t a)
     {
         int64_t sp = (int64_t)(smem.k);
         int64_t ep = (int64_t)(smem.k) + (int64_t)(smem.s);
-#if ((!__AVX2__))
-        GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, bwt_str_bit0_sp, bwt_str_bit1_sp, bit0_cmp_sp, bit1_cmp_sp, mismatch_mask_sp);
-        GET_OCC(ep, b, occ_id_ep, y_ep, occ_ep, bwt_str_bit0_ep, bwt_str_bit1_ep, bit0_cmp_ep, bit1_cmp_ep, mismatch_mask_ep);
-#else
-        __m256i b256;
-        b256 = _mm256_load_si256((const __m256i *)(c_bcast_array + b * 64));
-        GET_OCC(sp, b, b256, occ_id_sp, y_sp, occ_sp, bwt_str_sp, bwt_sp_vec, mask_sp_vec, mask_sp);
-        GET_OCC(ep, b, b256, occ_id_ep, y_ep, occ_ep, bwt_str_ep, bwt_ep_vec, mask_ep_vec, mask_ep);
-#endif
+        GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, one_hot_bwt_str_c_sp, match_mask_sp);
+        GET_OCC(ep, b, occ_id_ep, y_ep, occ_ep, one_hot_bwt_str_c_ep, match_mask_ep);
         k[b] = count[b] + occ_sp;
         s[b] = occ_ep - occ_sp;
     }
@@ -1191,3 +1098,278 @@ void FMI_search::get_sa_entries(SMEM *smemArray, int64_t *coordArray, int32_t *c
         totalCoordCount += c;
     }
 }
+
+// sa_compression
+int64_t FMI_search::get_sa_entry_compressed(int64_t pos, int tid)
+{
+    if ((pos & SA_COMPX_MASK) == 0) {
+        
+        #if  SA_COMPRESSION
+        int64_t sa_entry = sa_ms_byte[pos >> SA_COMPX];
+        #else
+        int64_t sa_entry = sa_ms_byte[pos];     // simulation
+        #endif
+        
+        sa_entry = sa_entry << 32;
+        
+        #if  SA_COMPRESSION
+        sa_entry = sa_entry + sa_ls_word[pos >> SA_COMPX];
+        #else
+        sa_entry = sa_entry + sa_ls_word[pos];   // simulation
+        #endif
+        
+        return sa_entry;        
+    }
+    else {
+        // tprof[MEM_CHAIN][tid] ++;
+        int64_t offset = 0; 
+        int64_t sp = pos;
+        while(true)
+        {
+            int64_t occ_id_pp_ = sp >> CP_SHIFT;
+            int64_t y_pp_ = CP_BLOCK_SIZE - (sp & CP_MASK) - 1; 
+            uint64_t *one_hot_bwt_str = cp_occ[occ_id_pp_].one_hot_bwt_str;
+            uint8_t b;
+
+            if((one_hot_bwt_str[0] >> y_pp_) & 1)
+                b = 0;
+            else if((one_hot_bwt_str[1] >> y_pp_) & 1)
+                b = 1;
+            else if((one_hot_bwt_str[2] >> y_pp_) & 1)
+                b = 2;
+            else if((one_hot_bwt_str[3] >> y_pp_) & 1)
+                b = 3;
+            else
+                b = 4;
+
+            if (b == 4) {
+                return offset;
+            }
+
+            GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, one_hot_bwt_str_c_sp, match_mask_sp);
+
+            sp = count[b] + occ_sp;
+            
+            offset ++;
+            // tprof[ALIGN1][tid] ++;
+            if ((sp & SA_COMPX_MASK) == 0) break;
+        }
+        // assert((reference_seq_len >> SA_COMPX) - 1 >= (sp >> SA_COMPX));
+        #if  SA_COMPRESSION
+        int64_t sa_entry = sa_ms_byte[sp >> SA_COMPX];
+        #else
+        int64_t sa_entry = sa_ms_byte[sp];      // simultion
+        #endif
+        
+        sa_entry = sa_entry << 32;
+
+        #if  SA_COMPRESSION
+        sa_entry = sa_entry + sa_ls_word[sp >> SA_COMPX];
+        #else
+        sa_entry = sa_entry + sa_ls_word[sp];      // simulation
+        #endif
+        
+        sa_entry += offset;
+        return sa_entry;
+    }
+}
+
+void FMI_search::get_sa_entries(SMEM *smemArray, int64_t *coordArray, int32_t *coordCountArray, uint32_t count, int32_t max_occ, int tid)
+{
+    
+    uint32_t i;
+    int32_t totalCoordCount = 0;
+    for(i = 0; i < count; i++)
+    {
+        int32_t c = 0;
+        SMEM smem = smemArray[i];
+        int64_t hi = smem.k + smem.s;
+        int64_t step = (smem.s > max_occ) ? smem.s / max_occ : 1;
+        int64_t j;
+        for(j = smem.k; (j < hi) && (c < max_occ); j+=step, c++)
+        {
+            int64_t pos = j;
+            int64_t sa_entry = get_sa_entry_compressed(pos, tid);
+            coordArray[totalCoordCount + c] = sa_entry;
+        }
+        // coordCountArray[i] = c;
+        *coordCountArray += c;
+        totalCoordCount += c;
+    }
+}
+
+// SA_COPMRESSION w/ PREFETCH
+int64_t FMI_search::call_one_step(int64_t pos, int64_t &sa_entry, int64_t &offset)
+{
+    if ((pos & SA_COMPX_MASK) == 0) {        
+        sa_entry = sa_ms_byte[pos >> SA_COMPX];        
+        sa_entry = sa_entry << 32;        
+        sa_entry = sa_entry + sa_ls_word[pos >> SA_COMPX];        
+        // return sa_entry;
+        return 1;
+    }
+    else {
+        // int64_t offset = 0; 
+        int64_t sp = pos;
+
+        int64_t occ_id_pp_ = sp >> CP_SHIFT;
+        int64_t y_pp_ = CP_BLOCK_SIZE - (sp & CP_MASK) - 1; 
+        uint64_t *one_hot_bwt_str = cp_occ[occ_id_pp_].one_hot_bwt_str;
+        uint8_t b;
+
+        if((one_hot_bwt_str[0] >> y_pp_) & 1)
+            b = 0;
+        else if((one_hot_bwt_str[1] >> y_pp_) & 1)
+            b = 1;
+        else if((one_hot_bwt_str[2] >> y_pp_) & 1)
+            b = 2;
+        else if((one_hot_bwt_str[3] >> y_pp_) & 1)
+            b = 3;
+        else
+            b = 4;
+        if (b == 4) {
+            sa_entry = 0;
+            return 1;
+        }
+        
+        GET_OCC(sp, b, occ_id_sp, y_sp, occ_sp, one_hot_bwt_str_c_sp, match_mask_sp);
+        
+        sp = count[b] + occ_sp;
+        
+        offset ++;
+        if ((sp & SA_COMPX_MASK) == 0) {
+    
+            sa_entry = sa_ms_byte[sp >> SA_COMPX];        
+            sa_entry = sa_entry << 32;
+            sa_entry = sa_entry + sa_ls_word[sp >> SA_COMPX];
+            
+            sa_entry += offset;
+            // return sa_entry;
+            return 1;
+        }
+        else {
+            sa_entry = sp;
+            return 0;
+        }
+    } // else
+}
+
+void FMI_search::get_sa_entries_prefetch(SMEM *smemArray, int64_t *coordArray,
+                                         int64_t *coordCountArray, int64_t count,
+                                         const int32_t max_occ, int tid, int64_t &id_)
+{
+    
+    // uint32_t i;
+    int32_t totalCoordCount = 0;
+    int32_t mem_lim = 0, id = 0;
+    
+    for(int i = 0; i < count; i++)
+    {
+        int32_t c = 0;
+        SMEM smem = smemArray[i];
+        mem_lim += smem.s;
+    }
+
+    int64_t *pos_ar = (int64_t *) _mm_malloc( mem_lim * sizeof(int64_t), 64);
+    int64_t *map_ar = (int64_t *) _mm_malloc( mem_lim * sizeof(int64_t), 64);
+
+    for(int i = 0; i < count; i++)
+    {
+        int32_t c = 0;
+        SMEM smem = smemArray[i];
+        int64_t hi = smem.k + smem.s;
+        int64_t step = (smem.s > max_occ) ? smem.s / max_occ : 1;
+        int64_t j;
+        for(j = smem.k; (j < hi) && (c < max_occ); j+=step, c++)
+        {
+            int64_t pos = j;
+             pos_ar[id]  = pos;
+             map_ar[id++] = totalCoordCount + c;
+            // int64_t sa_entry = get_sa_entry_compressed(pos, tid);
+            // coordArray[totalCoordCount + c] = sa_entry;
+        }
+        //coordCountArray[i] = c;
+        *coordCountArray += c;
+        totalCoordCount += c;
+    }
+    
+    id_ += id;
+    
+    const int32_t sa_batch_size = 20;
+    int64_t working_set[sa_batch_size], map_pos[sa_batch_size];;
+    int64_t offset[sa_batch_size] = {-1};
+    
+    int i = 0, j = 0;    
+    while(i<id && j<sa_batch_size)
+    {
+        int64_t pos =  pos_ar[i];
+        working_set[j] = pos;
+        map_pos[j] = map_ar[i];
+        offset[j] = 0;
+        
+        if (pos & SA_COMPX_MASK == 0) {
+            _mm_prefetch(&sa_ms_byte[pos >> SA_COMPX], _MM_HINT_T0);
+            _mm_prefetch(&sa_ls_word[pos >> SA_COMPX], _MM_HINT_T0);
+        }
+        else {
+            int64_t occ_id_pp_ = pos >> CP_SHIFT;
+            _mm_prefetch(&cp_occ[occ_id_pp_], _MM_HINT_T0);
+        }
+        i++;
+        j++;
+    }
+        
+    int lim = j, all_quit = 0;
+    while (all_quit < id)
+    {
+        
+        for (int k=0; k<lim; k++)
+        {
+            int64_t sp = 0, pos = 0;
+            bool quit;
+            if (offset[k] >= 0) {
+                quit = call_one_step(working_set[k], sp, offset[k]);
+            }
+            else
+                continue;
+            
+            if (quit) {
+                coordArray[map_pos[k]] = sp;
+                all_quit ++;
+                
+                if (i < id)
+                {
+                    pos = pos_ar[i];
+                    working_set[k] = pos;
+                    map_pos[k] = map_ar[i++];
+                    offset[k] = 0;
+                    
+                    if (pos & SA_COMPX_MASK == 0) {
+                        _mm_prefetch(&sa_ms_byte[pos >> SA_COMPX], _MM_HINT_T0);
+                        _mm_prefetch(&sa_ls_word[pos >> SA_COMPX], _MM_HINT_T0);
+                    }
+                    else {
+                        int64_t occ_id_pp_ = pos >> CP_SHIFT;
+                        _mm_prefetch(&cp_occ[occ_id_pp_], _MM_HINT_T0);
+                    }
+                }
+                else
+                    offset[k] = -1;
+            }
+            else {
+                working_set[k] = sp;
+                if (sp & SA_COMPX_MASK == 0) {
+                    _mm_prefetch(&sa_ms_byte[sp >> SA_COMPX], _MM_HINT_T0);
+                    _mm_prefetch(&sa_ls_word[sp >> SA_COMPX], _MM_HINT_T0);
+                }
+                else {
+                    int64_t occ_id_pp_ = sp >> CP_SHIFT;
+                    _mm_prefetch(&cp_occ[occ_id_pp_], _MM_HINT_T0);
+                }                
+            }
+        }
+    }
+    
+    _mm_free(pos_ar);
+    _mm_free(map_ar);
+}


=====================================
src/FMI_search.h
=====================================
@@ -46,38 +46,16 @@ Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at i
 #define assert_not_null(x, size, cur_alloc) \
         if (x == NULL) { fprintf(stderr, "Allocation of %0.2lf GB for " #x " failed.\nCurrent Allocation = %0.2lf GB\n", size * 1.0 /(1024*1024*1024), cur_alloc * 1.0 /(1024*1024*1024)); exit(EXIT_FAILURE); }
 
-#define CP_BLOCK_SIZE_SCALAR 64
-#define CP_FILENAME_SUFFIX_SCALAR ".bwt.2bit.64"
-#define CP_MASK_SCALAR 63
-#define CP_SHIFT_SCALAR 6
-#define BIT_DATA_TYPE uint64_t
-#define PADDING_SCALAR 8
-
-#define CP_BLOCK_SIZE_AVX 32
-#define CP_FILENAME_SUFFIX_AVX ".bwt.8bit.32"
-#define CP_MASK_AVX 31
-#define CP_SHIFT_AVX 5
+#define CP_BLOCK_SIZE 64
+#define CP_FILENAME_SUFFIX ".bwt.2bit.64"
+#define CP_MASK 63
+#define CP_SHIFT 6
 
 typedef struct checkpoint_occ_scalar
 {
-    BIT_DATA_TYPE bwt_str_bit0;
-    BIT_DATA_TYPE bwt_str_bit1;
-    BIT_DATA_TYPE dollar_mask;
     int64_t cp_count[4];
-    uint8_t  pad[PADDING_SCALAR];
-}CP_OCC_SCALAR;
-
-typedef struct checkpoint_occ_avx
-{
-    uint8_t  bwt_str[CP_BLOCK_SIZE_AVX];
-    int64_t cp_count[4];
-}CP_OCC_AVX;
-
-#if ((!__AVX2__))
-
-typedef CP_OCC_SCALAR CP_OCC;
-#define CP_SHIFT CP_SHIFT_SCALAR
-#define CP_FILENAME_SUFFIX CP_FILENAME_SUFFIX_SCALAR
+    uint64_t one_hot_bwt_str[4];
+}CP_OCC;
 
 #if defined(__clang__) || defined(__GNUC__)
 static inline int _mm_countbits_64(unsigned long x) {
@@ -86,46 +64,13 @@ static inline int _mm_countbits_64(unsigned long x) {
 #endif
 
 #define \
-GET_OCC(pp, c, occ_id_pp, y_pp, occ_pp, bwt_str_bit0_pp, bwt_str_bit1_pp, bit0_cmp_pp, bit1_cmp_pp, mismatch_mask_pp) \
-                int64_t occ_id_pp = pp >> CP_SHIFT_SCALAR; \
-                int64_t y_pp = pp & CP_MASK_SCALAR; \
+GET_OCC(pp, c, occ_id_pp, y_pp, occ_pp, one_hot_bwt_str_c_pp, match_mask_pp) \
+                int64_t occ_id_pp = pp >> CP_SHIFT; \
+                int64_t y_pp = pp & CP_MASK; \
                 int64_t occ_pp = cp_occ[occ_id_pp].cp_count[c]; \
-                if(y_pp > 0) \
-                { \
-                BIT_DATA_TYPE bwt_str_bit0_pp = cp_occ[occ_id_pp].bwt_str_bit0; \
-                BIT_DATA_TYPE bwt_str_bit1_pp = cp_occ[occ_id_pp].bwt_str_bit1; \
-                BIT_DATA_TYPE bit0_cmp_pp = bwt_str_bit0_pp ^ base_mask[c][0]; \
-                BIT_DATA_TYPE bit1_cmp_pp = bwt_str_bit1_pp ^ base_mask[c][1]; \
-                uint64_t mismatch_mask_pp = bit0_cmp_pp | bit1_cmp_pp | cp_occ[occ_id_pp].dollar_mask; \
-                mismatch_mask_pp = mismatch_mask_pp >> (CP_BLOCK_SIZE_SCALAR - y_pp); \
-                occ_pp += y_pp - _mm_countbits_64(mismatch_mask_pp); \
-                }
-
-#else
-
-typedef CP_OCC_AVX CP_OCC;
-#define CP_SHIFT CP_SHIFT_AVX
-#define CP_FILENAME_SUFFIX CP_FILENAME_SUFFIX_AVX
-
-#if defined(__clang__) || defined(__GNUC__)
-static inline int _mm_countbits_32(unsigned x) {
-    return __builtin_popcount(x);
-}
-#endif
-
-#define \
-GET_OCC(pp, c, c256, occ_id_pp, y_pp, occ_pp, bwt_str_pp, bwt_pp_vec, mask_pp_vec, mask_pp) \
-                int64_t occ_id_pp = pp >> CP_SHIFT_AVX; \
-                int64_t y_pp = pp & CP_MASK_AVX; \
-                int64_t occ_pp = cp_occ[occ_id_pp].cp_count[c]; \
-                uint8_t *bwt_str_pp = cp_occ[occ_id_pp].bwt_str; \
-                __m256i bwt_pp_vec = _mm256_load_si256((const __m256i *)(bwt_str_pp)); \
-                __m256i mask_pp_vec = _mm256_cmpeq_epi8(bwt_pp_vec, c256); \
-                uint64_t mask_pp = _mm256_movemask_epi8(mask_pp_vec); \
-                mask_pp = mask_pp << (32 - y_pp); \
-                occ_pp += _mm_countbits_32(mask_pp);
-
-#endif
+                uint64_t one_hot_bwt_str_c_pp = cp_occ[occ_id_pp].one_hot_bwt_str[c]; \
+                uint64_t match_mask_pp = one_hot_bwt_str_c_pp & one_hot_mask_array[y_pp]; \
+                occ_pp += _mm_countbits_64(match_mask_pp);
 
 typedef struct smem_struct
 {
@@ -207,7 +152,18 @@ class FMI_search: public indexEle
                         int32_t *coordCountArray,
                         uint32_t count,
                         int32_t max_occ);
-
+    int64_t get_sa_entry_compressed(int64_t pos, int tid=0);
+    void get_sa_entries(SMEM *smemArray,
+                        int64_t *coordArray,
+                        int32_t *coordCountArray,
+                        uint32_t count,
+                        int32_t max_occ,
+                        int tid);
+    int64_t call_one_step(int64_t pos, int64_t &sa_entry, int64_t &offset);
+    void get_sa_entries_prefetch(SMEM *smemArray, int64_t *coordArray,
+                                 int64_t *coordCountArray, int64_t count,
+                                 const int32_t max_occ, int tid, int64_t &id_);
+    
     int64_t reference_seq_len;
     int64_t sentinel_index;
 private:
@@ -218,21 +174,12 @@ private:
         int8_t *sa_ms_byte;
         CP_OCC *cp_occ;
 
-#if ((!__AVX2__))
-        BIT_DATA_TYPE base_mask[4][2];
-#else
-        uint8_t *c_bcast_array;
-#endif
+        uint64_t *one_hot_mask_array;
 
         int64_t pac_seq_len(const char *fn_pac);
         void pac2nt(const char *fn_pac,
                     std::string &reference_seq);
-        int build_fm_index_avx(const char *ref_file_name,
-                               char *binary_seq,
-                               int64_t ref_seq_len,
-                               int64_t *sa_bwt,
-                               int64_t *count);
-        int build_fm_index_scalar(const char *ref_file_name,
+        int build_fm_index(const char *ref_file_name,
                                char *binary_seq,
                                int64_t ref_seq_len,
                                int64_t *sa_bwt,


=====================================
src/bandedSWA.cpp
=====================================
The diff for this file was not included because it is too large.

=====================================
src/bwamem.cpp
=====================================
@@ -30,13 +30,7 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
 
 #include "bwamem.h"
 #include "FMI_search.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "safe_mem_lib.h"
-#ifdef __cplusplus
-}
-#endif
+#include "memcpy_bwamem.h"
 
 //----------------
 extern uint64_t tprof[LIM_R][LIM_C];
@@ -164,6 +158,17 @@ KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
 #define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
 KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
 
+#if MATE_SORT
+void sort_alnreg_re(int n, mem_alnreg_t* a) {
+    ks_introsort(mem_ars2, n, a);
+}
+
+void sort_alnreg_score(int n, mem_alnreg_t* a) {
+    ks_introsort(mem_ars, n, a);
+}
+
+#endif
+
 #define PATCH_MAX_R_BW 0.05f
 #define PATCH_MIN_SC_RATIO 0.90f
 
@@ -230,6 +235,60 @@ int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
 #define MEM_SEEDSW_COEF 0.05f
 int stat;
 
+#if MATE_SORT
+int mem_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+                    const uint8_t *pac, uint8_t *query, int n,
+                    mem_alnreg_t *a)
+{
+    int m, i, j;
+    if (n <= 1) return n;
+    for (i = 0; i < n; ++i) a[i].n_comp = 1;
+
+    for (i = 1; i < n; ++i)
+    {
+        mem_alnreg_t *p = &a[i];
+        if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap)
+            continue; // then no need to go into the loop below
+
+        for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) {
+            mem_alnreg_t *q = &a[j];
+            int64_t or_, oq, mr, mq;
+            int score, w;
+            if (q->qe == q->qb) continue; // a[j] has been excluded
+            or_ = q->re - p->rb; // overlap length on the reference
+            oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
+            mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
+            mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
+            if (or_ > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
+                if (p->score < q->score)
+                {
+                    p->qe = p->qb;
+                    break;
+                }
+                else q->qe = q->qb;
+            }
+            else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p
+                p->n_comp += q->n_comp + 1;
+                p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov;
+                p->sub = p->sub > q->sub? p->sub : q->sub;
+                p->csub = p->csub > q->csub? p->csub : q->csub;
+                p->qb = q->qb, p->rb = q->rb;
+                p->truesc = p->score = score;
+                p->w = w;
+                q->qb = q->qe;
+            }
+        }
+    }
+    for (i = 0, m = 0; i < n; ++i) // exclude identical hits
+        if (a[i].qe > a[i].qb) {
+            if (m != i) a[m++] = a[i];
+            else ++m;
+        }
+    n = m;
+    return m;
+}
+#endif
+
 int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
                          const uint8_t *pac, uint8_t *query, int n,
                          mem_alnreg_t *a)
@@ -323,7 +382,7 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c,
             c->m <<= 1;
             if (pm == SEEDS_PER_CHAIN) {  // re-new memory
                 if ((auxSeedBuf = (mem_seed_t *) calloc(c->m, sizeof(mem_seed_t))) == NULL) { fprintf(stderr, "ERROR: out of memory auxSeedBuf\n"); exit(1); }
-                memcpy_s((char*) (auxSeedBuf), c->m * sizeof(mem_seed_t), c->seeds, c->n * sizeof(mem_seed_t));
+                memcpy_bwamem((char*) (auxSeedBuf), c->m * sizeof(mem_seed_t), c->seeds, c->n * sizeof(mem_seed_t), __FILE__, __LINE__);
                 c->seeds = auxSeedBuf;
                 tprof[PE13][tid]++;
             } else {  // new memory
@@ -688,7 +747,8 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt,
     
     int num[nseq];
     memset(num, 0, nseq*sizeof(int));
-    int64_t *sa_coord = (int64_t *) _mm_malloc(sizeof(int64_t) * opt->max_occ, 64);
+    int smem_buf_size = 6000;
+    int64_t *sa_coord = (int64_t *) _mm_malloc(sizeof(int64_t) * opt->max_occ * smem_buf_size, 64);
     int64_t seedBufCount = 0;
     
     for (int l=0; l<nseq; l++)
@@ -726,6 +786,24 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt,
         } while (pos < num_smem - 1 && matchArray[pos].rid == matchArray[pos + 1].rid);
         l_rep += e - b;
 
+        // bwt_sa
+        // assert(pos - smem_ptr + 1 < 6000);
+        if (pos - smem_ptr + 1 >= smem_buf_size)
+        {
+            int csize = smem_buf_size;
+            smem_buf_size *= 2;
+            sa_coord = (int64_t *) _mm_realloc(sa_coord, csize, opt->max_occ * smem_buf_size,
+                                               sizeof(int64_t));
+            assert(sa_coord != NULL);
+        }
+        int64_t id = 0, cnt_ = 0, mypos = 0;
+        #if SA_COMPRESSION
+        uint64_t tim = __rdtsc();
+        fmi->get_sa_entries_prefetch(&matchArray[smem_ptr], sa_coord, &cnt_,
+                                     pos - smem_ptr + 1, opt->max_occ, tid, id);  // sa compressed prefetch
+        tprof[MEM_SA][tid] += __rdtsc() - tim;
+        #endif
+        
         for (i = smem_ptr; i <= pos; i++)
         {
             SMEM *p = &matchArray[i];
@@ -734,19 +812,26 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt,
             int64_t k;
             step = p->s > opt->max_occ? p->s / opt->max_occ : 1;
 
-            // uint64_t tim = __rdtsc();
-            int cnt = 0;            
+            int cnt = 0;
+            #if !SA_COMPRESSION
+            uint64_t tim = __rdtsc();
             fmi->get_sa_entries(p, sa_coord, &cnt, 1, opt->max_occ);
-            cnt = 0;
-            // tprof[MEM_SA][tid] += __rdtsc() - tim;
+            tprof[MEM_SA][tid] += __rdtsc() - tim;
+            #endif
             
+            cnt = 0;            
             for (k = count = 0; k < p->s && count < opt->max_occ; k += step, ++count)
             {
                 mem_chain_t tmp, *lower, *upper;
                 mem_seed_t s;
                 int rid, to_add = 0;
-                                
+
+                #if SA_COMPRESSION
+                s.rbeg = tmp.pos = sa_coord[mypos++];
+                #else
                 s.rbeg = tmp.pos = sa_coord[cnt++];
+                #endif
+                
                 s.qbeg = p->m;
                 s.score= s.len = slen;
                 if (s.rbeg < 0 || s.len < 0) 
@@ -1202,7 +1287,7 @@ void mem_process_seqs(mem_opt_t *opt,
     // PAIRED_END
     if (opt->flag & MEM_F_PE) { // infer insert sizes if not provided
         if (pes0)
-            memcpy_s(pes, 4 * sizeof(mem_pestat_t), pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size
+            memcpy_bwamem(pes, 4 * sizeof(mem_pestat_t), pes0, 4 * sizeof(mem_pestat_t), __FILE__, __LINE__); // if pes0 != NULL, set the insert-size
                                                          // distribution as pes0
         else {
             fprintf(stderr, "[0000] Inferring insert size distribution of PE reads from data, "
@@ -1674,7 +1759,7 @@ void* _mm_realloc(void *ptr, int64_t csize, int64_t nsize, int16_t dsize) {
     }
     void *nptr = _mm_malloc(nsize * dsize, 64);
     assert(nptr != NULL);
-    memcpy_s(nptr, nsize * dsize, ptr, csize);
+    memcpy_bwamem(nptr, nsize * dsize, ptr, csize, __FILE__, __LINE__);
     _mm_free(ptr);
     
     return nptr;


=====================================
src/bwamem_pair.cpp
=====================================
@@ -154,7 +154,13 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns,
 {
     extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
                                     const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
-
+    #if MATE_SORT
+    extern int mem_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+                               const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+    extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+    extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+    #endif
+    
     //int tid = omp_get_thread_num();
     int64_t l_pac = bns->l_pac;
     int i, r, skip[4], n = 0, rid = -1;
@@ -220,17 +226,56 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns,
                 b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
 
                 kv_push(mem_alnreg_t, *ma, b); // make room for a new element
+                
+                #if !MATE_SORT
                 // move b s.t. ma is sorted
                 for (i = 0; i < ma->n - 1; ++i) // find the insertion point
                     if (ma->a[i].score < b.score) break;
                 tmp = i;
                 for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
                 ma->a[i] = b;
+                
+                #else
+                
+                int resort = 0;
+                for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+                    if (ma->a[i].re == b.re) {
+                        resort = 1;
+                        break;
+                    }
+                    if (ma->a[i].re > b.re) {
+                        break;
+                    }
+                }
+                if (resort) {
+                    // Don't know where to put this alignment. So let the scores decide
+                    sort_alnreg_score(ma->n - 1, ma->a);
+                    for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+                        if (ma->a[i].score < b.score) {
+                            break;
+                        }
+                    }
+                    tmp = i;
+                    for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+                    ma->a[i] = b;
+                    // Now we can sort based on end position
+                    sort_alnreg_re(ma->n, ma->a);
+                }
+                else {
+                    tmp = i;
+                    for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+                    ma->a[i] = b;
+                }
+                #endif
                 tprof[PE26][0] ++;
             }
             ++n;
         }
+        #if !MATE_SORT
         if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+        #else
+        if (n) ma->n = mem_dedup_patch(opt, 0, 0, 0, ma->n, ma->a); // sam_improvements
+        #endif
         if (rev) free(rev);
         free(ref);
     }
@@ -313,7 +358,14 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns,
     extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
     extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
     extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
-
+    
+    #if MATE_SORT
+    extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+    extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+    extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+                                    const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+    #endif
+    
     int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
     kstring_t str;
     mem_aln_t h[2], g[2], aa[2][2];
@@ -332,12 +384,32 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns,
                 if (a[i].a[j].score >= a[i].a[0].score  - opt->pen_unpaired)
                     kv_push(mem_alnreg_t, b[i], a[i].a[j]);
 
-        for (i = 0; i < 2; ++i)
+        #if MATE_SORT
+        for (i = 0; i < 2; ++i) {
+            sort_alnreg_re(a[!i].n, a[!i].a);
+            int val = 0, swcount = 0;
             for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
                 int val = mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
                 n += val;
+                swcount += val;
+            }
+            if (swcount > 0) {
+                mem_alnreg_v* ma = &a[!i];
+                ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+            }
+            else {
+                sort_alnreg_score(a[!i].n, a[!i].a);
             }
+        }
+
+        #else
         
+        for (i = 0; i < 2; ++i)
+            for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
+                int val = mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
+                n += val;
+            }
+        #endif
         free(b[0].a); free(b[1].a);     
     }
 
@@ -571,9 +643,14 @@ int mem_sam_pe_batch(const mem_opt_t *opt, mem_cache *mmc,
     kswv *pwsw = new kswv(opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, -1*opt->b, nthreads,
                           maxRefLen, maxQerLen);
 
+    // Shift 16-bit 
+    for (int i=0; i<pcnt-pcnt8; i++)
+        seqPairArray[pcnt + MAX_LINE_LEN - 1 - i] = seqPairArray[pcnt-i-1];
+    
 #if __AVX512BW__    
     pwsw->getScores8(seqPairArray, seqBufRef, seqBufQer, aln, pcnt8, nthreads, 0);
-    pwsw->getScores16(seqPairArray + pcnt8, seqBufRef, seqBufQer, aln, pcnt-pcnt8, nthreads, 0);
+    pwsw->getScores16(seqPairArray + pcnt8 + MAX_LINE_LEN, seqBufRef, seqBufQer,
+                      aln, pcnt-pcnt8, nthreads, 0);
 #else
     fprintf(stderr, "Error: This should not have happened!! \nPlease look in to AVX512 macros\n");
     exit(EXIT_FAILURE);
@@ -581,9 +658,11 @@ int mem_sam_pe_batch(const mem_opt_t *opt, mem_cache *mmc,
 
     // Post-processing
     int pos = 0, pos8 = 0, pos16 = 0;
-    for (int i=0; i<pcnt; i++) {
-        kswr_t r = aln[i];
+    for (int i=0; i<pcnt8; i++)
+    {
         SeqPair sp = seqPairArray[i];
+        int ind = sp.regid;
+        kswr_t r = aln[ind];
         int xtra = sp.h0;
         if ((xtra & KSW_XSTART) == 0 || ((xtra & KSW_XSUBO) && r.score < (xtra & 0xffff))) continue; 
         
@@ -592,19 +671,34 @@ int mem_sam_pe_batch(const mem_opt_t *opt, mem_cache *mmc,
         uint8_t *qs = seqBufQer + sp.idq;
         uint8_t *rs = seqBufRef + sp.idr;
         revseq(r.qe + 1, qs); revseq(r.te + 1, rs);
-        seqPairArray[pos++] = sp;
+        seqPairArray[pos++] = sp;        
+        pos8 ++;
+    }
+    
+    int id = pcnt8 + MAX_LINE_LEN;
+    for (int i=0; i<pcnt-pcnt8; i++)
+    {
+        SeqPair sp = seqPairArray[i + id];
+        int ind = sp.regid;
+        kswr_t r = aln[ind];
+        int xtra = sp.h0;
+        if ((xtra & KSW_XSTART) == 0 || ((xtra & KSW_XSUBO) && r.score < (xtra & 0xffff))) continue; 
         
-        if (i < pcnt8) pos8 ++;
-        else pos16 ++;
+        sp.h0 = KSW_XSTOP | r.score;
+        sp.len2 = r.qe + 1;
+        uint8_t *qs = seqBufQer + sp.idq;
+        uint8_t *rs = seqBufRef + sp.idr;
+        revseq(r.qe + 1, qs); revseq(r.te + 1, rs);
+        seqPairArray[pos++] = sp;        
+        pos16 ++;
     }
 
     int pcnt2 = pos;
     assert(pos8 + pos16 == pcnt2);
 
 #if __AVX512BW__
-    pwsw->getScores8(seqPairArray, seqBufRef, seqBufQer, aln, pos8, nthreads, 1);
     pwsw->getScores16(seqPairArray + pos8, seqBufRef, seqBufQer, aln, pos16, nthreads, 1);
-    // pwsw->kswvScalarWrapper(seqPairArray, seqBufRef, seqBufQer, aln, pos, nthreads, 0); //debug
+    pwsw->getScores8(seqPairArray, seqBufRef, seqBufQer, aln, pos8, nthreads, 1);
 #else
     fprintf(stderr, "Error: This should not have happened!! \nPlease look in to AVX512 macros\n");
     exit(EXIT_FAILURE);
@@ -628,7 +722,13 @@ int mem_sam_pe_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
                             bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
     extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
                               const mem_alnreg_v *a, int l_query, const char *query);
-
+    #if MATE_SORT
+    extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+    extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+    extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+                                    const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+    #endif
+    
     int32_t *gar = (int32_t*) mmc->seqPairArrayAux[tid];
     
     int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
@@ -653,7 +753,29 @@ int mem_sam_pe_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
             a[0].a[l].flg = 0;
         for (int l=0; l<a[1].n; l++)
             a[1].a[l].flg = 0;
-        
+
+        #if MATE_SORT
+        for (i = 0; i < 2; ++i) {
+            sort_alnreg_re(a[!i].n, a[!i].a);
+            int val = 0, swcount = 0;
+            for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
+                val = mem_matesw_batch_post(opt, bns, pac, pes, &b[i].a[j],
+                                                s[!i].l_seq, (uint8_t*)s[!i].seq,
+                                                &a[!i], myaln, gcnt, gar, mmc);
+                n += val;
+                swcount += val;
+                // ncnt++;
+                gcnt += 4;
+            }
+            if (swcount > 0) {
+                mem_alnreg_v* ma = &a[!i];
+                ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+            }
+            else {
+                sort_alnreg_score(a[!i].n, a[!i].a);
+            }
+        }
+        #else
         for (i = 0; i < 2; ++i) {
             for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) {
                 int val = mem_matesw_batch_post(opt, bns, pac, pes, &b[i].a[j],
@@ -664,7 +786,7 @@ int mem_sam_pe_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
                 gcnt += 4;
             }
         }
-
+        #endif
         free(b[0].a); free(b[1].a);
     }
 
@@ -978,9 +1100,15 @@ int mem_matesw_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
 {
     extern int mem_sort_dedup_patch_rev(const mem_opt_t *opt, const bntseq_t *bns,
                                         const uint8_t *pac, uint8_t *query, int n,
-                                        mem_alnreg_t *a);
+                                        mem_alnreg_t *a);    
     extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
                                     const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+    #if MATE_SORT    
+    extern int mem_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns,
+                               const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+    extern void sort_alnreg_re(int n, mem_alnreg_t* a);
+    extern void sort_alnreg_score(int n, mem_alnreg_t* a);
+    #endif
     
     int64_t l_pac = bns->l_pac;
     int i, r, skip[4], n = 0, rid = -1;
@@ -1062,6 +1190,8 @@ int mem_matesw_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
 
                 kv_push(mem_alnreg_t, *ma, b); // make room for a new element
 
+                #if !MATE_SORT
+
                 // move b s.t. ma is sorted
                 for (i = 0; i < ma->n - 1; ++i) // find the insertion point
                     if (ma->a[i].score < b.score) break;
@@ -1069,11 +1199,48 @@ int mem_matesw_batch_post(const mem_opt_t *opt, const bntseq_t *bns,
                 for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
                 ma->a[i] = b;
 
+                #else
+                int resort = 0;
+                // move b s.t. ma is sorted
+                for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+                    if (ma->a[i].re == b.re) {
+                        resort = 1;
+                        break;
+                    }
+                    if (ma->a[i].re > b.re) {
+                        break;
+                    }
+                }
+                if (resort) {
+                    // Don't know where to put this alignment. So let the scores decide
+                    sort_alnreg_score(ma->n - 1, ma->a);
+                    for (i = 0; i < ma->n - 1; ++i) { // find the insertion point
+                        if (ma->a[i].score < b.score) {
+                            break;
+                        }
+                    }
+                    tmp = i;
+                    for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+                    ma->a[i] = b;
+                    // Now we can sort based on end position
+                    sort_alnreg_re(ma->n, ma->a);
+                }
+                else {
+                    tmp = i;
+                    for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+                    ma->a[i] = b;
+                }
+                #endif
                 tprof[PE26][0] ++;
             }
             ++n;
         }
-        if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);        
+        #if !MATE_SORT
+        if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+        #else
+        if (n) ma->n = mem_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+        #endif
+        
         if (rev) free(rev);
         free(ref);
     }


=====================================
src/kopen.cpp
=====================================
@@ -48,10 +48,10 @@
 #  include "malloc_wrap.h"
 #endif
 
+#include "memcpy_bwamem.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
-#include "safe_mem_lib.h"
 #include "safe_str_lib.h"
 #ifdef __cplusplus
 }
@@ -254,7 +254,7 @@ static int ftp_open(const char *fn)
 	if (*p != '(') goto ftp_open_end;
 	++p;
 	sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
-	memcpy_s(pasv_ip, 4 * sizeof(int), v, 4 * sizeof(int));
+	memcpy_bwamem(pasv_ip, 4 * sizeof(int), v, 4 * sizeof(int), __FILE__, __LINE__);
 	pasv_port = (v[4]<<8&0xff00) + v[5];
 	kftp_send_cmd(&aux, retr, 0);
 	sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);


=====================================
src/kseq.h
=====================================
@@ -35,13 +35,7 @@
 #include <string.h>
 #include <stdlib.h>
 #include <assert.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "safe_mem_lib.h"
-#ifdef __cplusplus
-}
-#endif
+#include "memcpy_bwamem.h"
 
 #ifdef USE_MALLOC_WRAPPERS
 #  include "malloc_wrap.h"
@@ -138,7 +132,7 @@ typedef struct __kstring_t {
 				str->s = (char*)realloc(str->s, str->m);				\
 			}															\
 			gotany = 1;													\
-			memcpy_s(str->s + str->l, str->m - str->l, ks->buf + ks->begin, i - ks->begin); \
+			memcpy_bwamem(str->s + str->l, str->m - str->l, ks->buf + ks->begin, i - ks->begin, __FILE__, __LINE__); \
 			str->l = str->l + (i - ks->begin);							\
 			ks->begin = i + 1;											\
 			if (i < ks->end) {											\


=====================================
src/kstring.h
=====================================
@@ -32,13 +32,7 @@
 
 #include <stdlib.h>
 #include <string.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "safe_mem_lib.h"
-#ifdef __cplusplus
-}
-#endif
+#include "memcpy_bwamem.h"
 
 #ifdef USE_MALLOC_WRAPPERS
 #  include "malloc_wrap.h"
@@ -72,7 +66,7 @@ static inline int kputsn(const char *p, int l, kstring_t *s)
 		kroundup32(s->m);
 		s->s = (char*)realloc(s->s, s->m);
 	}
-	memcpy_s(s->s + s->l, s->m - s->l, p, l);
+	memcpy_bwamem(s->s + s->l, s->m - s->l, p, l, __FILE__, __LINE__);
 	s->l += l;
 	s->s[s->l] = 0;
 	return l;


=====================================
src/kswv.cpp
=====================================
@@ -565,7 +565,7 @@ int kswv::kswv512_u8(uint8_t seq1SoA[],
     _mm512_store_si512((__m512i *) qe, qe512);
 
     int live = 0;
-    for (int l=0; l<SIMD_WIDTH8; l++) {
+    for (int l=0; l<SIMD_WIDTH8 && (po_ind + l) < numPairs; l++) {
         int ind = po_ind + l;
         int16_t *te;
         if (i < SIMD_WIDTH16) te = te1;
@@ -679,16 +679,17 @@ int kswv::kswv512_u8(uint8_t seq1SoA[],
     int16_t temp4[SIMD_WIDTH8] __attribute((aligned(64)));
     _mm512_store_si512((__m512i *) temp, max512);
     _mm512_store_si512((__m512i *) temp4, te512);
-    _mm512_store_si512((__m512i *) (temp4 + SIMD_WIDTH16), te512_);     
-    for (int i=0; i<SIMD_WIDTH8; i++)
+    _mm512_store_si512((__m512i *) (temp4 + SIMD_WIDTH16), te512_);
+    
+    for (int i=0; i<SIMD_WIDTH8  && (po_ind + i) < numPairs; i++)
     {
         int ind = po_ind + i;
         int16_t *te2;
-        if (i < SIMD_WIDTH16) te2 = temp4;
-        else te2 = temp4;
+        // if (i < SIMD_WIDTH16) te2 = temp4;
+        // else te2 = temp4;
+        te2 = temp4;
 #if !MAINY
         ind = p[i].regid;    // index of corr. aln
-        assert(ind == po_ind + i);
 #endif
         if (qe[i]) {
             aln[ind].score2 = (temp[i] == 0? (int)-1: (uint8_t) temp[i]);
@@ -1122,11 +1123,10 @@ int kswv::kswv512_16(int16_t seq1SoA[],
     _mm512_store_si512((__m512i *) te, te512);
     _mm512_store_si512((__m512i *) qe, qe512);
 
-    for (int l=0; l<SIMD_WIDTH16; l++) {
+    for (int l=0; l<SIMD_WIDTH16 && (po_ind + l) < numPairs; l++) {
         int ind = po_ind + l;
 #if !MAINY
         ind = p[l].regid;    // index of corr. aln
-        // assert(ind == po_ind + l);
         if (phase) {
             if (aln[ind].score == score[l]) {
                 aln[ind].tb = aln[ind].te - te[l];
@@ -1199,12 +1199,11 @@ int kswv::kswv512_16(int16_t seq1SoA[],
     // _mm512_store_si512((__m512i *) temp, max512_);
     _mm512_store_si512((__m512i *) temp1, max512);
     _mm512_store_si512((__m512i *) temp2, te512);
-    
-    for (int i=0; i<SIMD_WIDTH16; i++) {
+
+    for (int i=0; i<SIMD_WIDTH16 && (po_ind + i) < numPairs; i++) {    
         int ind = po_ind + i;
 #if !MAINY
         ind = p[i].regid;    // index of corr. aln
-        assert(ind == po_ind + i);
 #endif      
         aln[ind].score2 = temp1[i];
         aln[ind].te2 = temp2[i];


=====================================
src/kthread.cpp
=====================================
@@ -65,13 +65,14 @@ static void *ktf_worker(void *data)
 		int st = i * BATCH_SIZE;
 		if (st >= w->t->n) break;
 		int ed = (i + 1) * BATCH_SIZE < w->t->n? (i + 1) * BATCH_SIZE : w->t->n;
-		w->t->func(w->t->data, st, ed-st, tid);
+		// w->t->func(w->t->data, st, ed-st, tid);
+        w->t->func(w->t->data, st, ed-st, w - w->t->w);
 	}
 
 	while ((i = steal_work(w->t)) >= 0) {
 		int st = i * BATCH_SIZE;
 		int ed = (i + 1) * BATCH_SIZE < w->t->n? (i + 1) * BATCH_SIZE : w->t->n;
-		w->t->func(w->t->data, st, ed-st, tid);
+		w->t->func(w->t->data, st, ed-st, w - w->t->w);
 	}
 	pthread_exit(0);
 }


=====================================
src/macro.h
=====================================
@@ -58,11 +58,12 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
 #define ALIGN_OFF 1
 
 #define MAX_THREADS 256
-
-
 #define LIM_R 128
 #define LIM_C 128
 
+#define SA_COMPRESSION 1
+#define SA_COMPX 03 // (= power of 2)
+#define SA_COMPX_MASK 0x7    // 0x7 or 0x3 or 0x1
 
 /*** Runtime profiling macros ***/
 #define INDEX 0


=====================================
src/main.cpp
=====================================
@@ -1,122 +1,126 @@
-/*************************************************************************************
-                           The MIT License
-
-   BWA-MEM2  (Sequence alignment using Burrows-Wheeler Transform),
-   Copyright (C) 2019 Intel Corporation, Heng Li.
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   "Software"), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be
-   included in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-
-Contacts: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at intel.com>;
-                                Heng Li <hli at jimmy.harvard.edu> 
-*****************************************************************************************/
-
-// ----------------------------------
-#include "main.h"
-
-#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "2.0pre2"
-#endif
-
-
-// ----------------------------------
-uint64_t proc_freq, tprof[LIM_R][LIM_C], prof[LIM_R];
-// ----------------------------------
-
-int usage()
-{
-    fprintf(stderr, "Usage: bwa-mem2 <command> <arguments>\n");
-    fprintf(stderr, "Commands:\n");
-    fprintf(stderr, "  index         create index\n");
-    fprintf(stderr, "  mem           alignment\n");
-    fprintf(stderr, "  version       print version number\n");
-    return 1;
-}
-
-int main(int argc, char* argv[])
-{
-        
-    // ---------------------------------    
-    uint64_t tim = __rdtsc();
-    sleep(1);
-    proc_freq = __rdtsc() - tim;
-
-    int ret = -1;
-    if (argc < 2) return usage();
-
-    if (strcmp(argv[1], "index") == 0)
-    {
-         uint64_t tim = __rdtsc();
-         ret = bwa_index(argc-1, argv+1);
-         fprintf(stderr, "Total time taken: %0.4lf\n", (__rdtsc() - tim)*1.0/proc_freq);
-         return ret;
-    }
-    else if (strcmp(argv[1], "mem") == 0)
-    {
-        tprof[MEM][0] = __rdtsc();
-        kstring_t pg = {0,0,0};
-        extern char *bwa_pg;
-
-        fprintf(stderr, "-----------------------------\n");
-#if __AVX512BW__
-        fprintf(stderr, "Executing in AVX512 mode!!\n");
-#endif
-#if ((!__AVX512BW__) & (__AVX2__))
-        fprintf(stderr, "Executing in AVX2 mode!!\n");
-#endif
-#if ((!__AVX512BW__) && (!__AVX2__) && (__SSE2__))
-        fprintf(stderr, "Executing in SSE4.1 mode!!\n");
-#endif
-#if ((!__AVX512BW__) && (!__AVX2__) && (!__SSE2__))
-        fprintf(stderr, "Executing in Scalar mode!!\n");
-#endif
-        fprintf(stderr, "-----------------------------\n");
-
-        ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
-        for (int i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
-        ksprintf(&pg, "\n");
-        bwa_pg = pg.s;
-        ret = main_mem(argc-1, argv+1);
-        free(bwa_pg);
-        
-        /** Enable this return to avoid printing of the runtime profiling **/
-        //return ret;
-    }
-    else if (strcmp(argv[1], "version") == 0)
-    {
-        puts(PACKAGE_VERSION);
-        return 0;
-    } else {
-        fprintf(stderr, "ERROR: unknown command '%s'\n", argv[1]);
-        return 1;
-    }
-        
-    fprintf(stderr, "\nImportant parameter settings: \n");
-    fprintf(stderr, "\tBATCH_SIZE: %d\n", BATCH_SIZE);
-    fprintf(stderr, "\tMAX_SEQ_LEN_REF: %d\n", MAX_SEQ_LEN_REF);
-    fprintf(stderr, "\tMAX_SEQ_LEN_QER: %d\n", MAX_SEQ_LEN_QER);
-    fprintf(stderr, "\tMAX_SEQ_LEN8: %d\n", MAX_SEQ_LEN8);
-    fprintf(stderr, "\tSEEDS_PER_READ: %d\n", SEEDS_PER_READ);
-    fprintf(stderr, "\tSIMD_WIDTH8 X: %d\n", SIMD_WIDTH8);
-    fprintf(stderr, "\tSIMD_WIDTH16 X: %d\n", SIMD_WIDTH16);
-    fprintf(stderr, "\tAVG_SEEDS_PER_READ: %d\n", AVG_SEEDS_PER_READ);
-    
-    return 0;
-}
+/*************************************************************************************
+                           The MIT License
+
+   BWA-MEM2  (Sequence alignment using Burrows-Wheeler Transform),
+   Copyright (C) 2019 Intel Corporation, Heng Li.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+Contacts: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at intel.com>;
+                                Heng Li <hli at jimmy.harvard.edu> 
+*****************************************************************************************/
+
+// ----------------------------------
+#include "main.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "2.2"
+#endif
+
+
+// ----------------------------------
+uint64_t proc_freq, tprof[LIM_R][LIM_C], prof[LIM_R];
+// ----------------------------------
+
+int usage()
+{
+    fprintf(stderr, "Usage: bwa-mem2 <command> <arguments>\n");
+    fprintf(stderr, "Commands:\n");
+    fprintf(stderr, "  index         create index\n");
+    fprintf(stderr, "  mem           alignment\n");
+    fprintf(stderr, "  version       print version number\n");
+    return 1;
+}
+
+int main(int argc, char* argv[])
+{
+        
+    // ---------------------------------    
+    uint64_t tim = __rdtsc();
+    sleep(1);
+    proc_freq = __rdtsc() - tim;
+
+    int ret = -1;
+    if (argc < 2) return usage();
+
+    if (strcmp(argv[1], "index") == 0)
+    {
+         uint64_t tim = __rdtsc();
+         ret = bwa_index(argc-1, argv+1);
+         fprintf(stderr, "Total time taken: %0.4lf\n", (__rdtsc() - tim)*1.0/proc_freq);
+         return ret;
+    }
+    else if (strcmp(argv[1], "mem") == 0)
+    {
+        tprof[MEM][0] = __rdtsc();
+        kstring_t pg = {0,0,0};
+        extern char *bwa_pg;
+
+        fprintf(stderr, "-----------------------------\n");
+#if __AVX512BW__
+        fprintf(stderr, "Executing in AVX512 mode!!\n");
+#elif __AVX2__
+        fprintf(stderr, "Executing in AVX2 mode!!\n");
+#elif __AVX__
+        fprintf(stderr, "Executing in AVX mode!!\n");        
+#elif __SSE4_2__
+        fprintf(stderr, "Executing in SSE4.2 mode!!\n");
+#elif __SSE4_1__
+        fprintf(stderr, "Executing in SSE4.1 mode!!\n");        
+#endif
+        fprintf(stderr, "-----------------------------\n");
+
+        #if SA_COMPRESSION
+        fprintf(stderr, "* SA compression enabled with xfactor: %d\n", 0x1 << SA_COMPX);
+        #endif
+        
+        ksprintf(&pg, "@PG\tID:bwa-mem2\tPN:bwa-mem2\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
+
+        for (int i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
+        ksprintf(&pg, "\n");
+        bwa_pg = pg.s;
+        ret = main_mem(argc-1, argv+1);
+        free(bwa_pg);
+        
+        /** Enable this return to avoid printing of the runtime profiling **/
+        //return ret;
+    }
+    else if (strcmp(argv[1], "version") == 0)
+    {
+        puts(PACKAGE_VERSION);
+        return 0;
+    } else {
+        fprintf(stderr, "ERROR: unknown command '%s'\n", argv[1]);
+        return 1;
+    }
+        
+    fprintf(stderr, "\nImportant parameter settings: \n");
+    fprintf(stderr, "\tBATCH_SIZE: %d\n", BATCH_SIZE);
+    fprintf(stderr, "\tMAX_SEQ_LEN_REF: %d\n", MAX_SEQ_LEN_REF);
+    fprintf(stderr, "\tMAX_SEQ_LEN_QER: %d\n", MAX_SEQ_LEN_QER);
+    fprintf(stderr, "\tMAX_SEQ_LEN8: %d\n", MAX_SEQ_LEN8);
+    fprintf(stderr, "\tSEEDS_PER_READ: %d\n", SEEDS_PER_READ);
+    fprintf(stderr, "\tSIMD_WIDTH8 X: %d\n", SIMD_WIDTH8);
+    fprintf(stderr, "\tSIMD_WIDTH16 X: %d\n", SIMD_WIDTH16);
+    fprintf(stderr, "\tAVG_SEEDS_PER_READ: %d\n", AVG_SEEDS_PER_READ);
+    
+    return 0;
+}


=====================================
src/memcpy_bwamem.cpp
=====================================
@@ -0,0 +1,49 @@
+/*************************************************************************************
+                           The MIT License
+
+   BWA-MEM2  (Sequence alignment using Burrows-Wheeler Transform),
+   Copyright (C) 2019  Intel Corporation, Heng Li.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at intel.com>;
+*****************************************************************************************/
+
+#include "memcpy_bwamem.h"
+
+errno_t memcpy_bwamem(void *dest, rsize_t dmax, const void *src, rsize_t smax, char *file_name, int line_num)
+{
+    errno_t ret;
+    int64_t bytes_copied;
+    for(bytes_copied = 0; bytes_copied < smax; bytes_copied += RSIZE_MAX_MEM)
+    {
+        int64_t bytes_remaining = smax - bytes_copied;
+        int64_t bytes_to_copy = (bytes_remaining > RSIZE_MAX_MEM) ? RSIZE_MAX_MEM : bytes_remaining;
+        int64_t dest_bytes_remaining = dmax - bytes_copied;
+        int64_t dest_bytes = (dest_bytes_remaining < bytes_to_copy) ? dest_bytes_remaining : bytes_to_copy;
+        if((ret = memcpy_s((char *)dest + bytes_copied, dest_bytes, (const char *)src + bytes_copied, bytes_to_copy)) != 0)
+        {
+            fprintf(stderr, "[%s: %d] memcpy_s returned %d\n", file_name, line_num, ret);
+            exit(EXIT_FAILURE);
+        }
+    }
+    return 0;
+}


=====================================
src/memcpy_bwamem.h
=====================================
@@ -0,0 +1,45 @@
+/*************************************************************************************
+                           The MIT License
+
+   BWA-MEM2  (Sequence alignment using Burrows-Wheeler Transform),
+   Copyright (C) 2019  Intel Corporation, Heng Li.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+
+Authors: Sanchit Misra <sanchit.misra at intel.com>; Vasimuddin Md <vasimuddin.md at intel.com>;
+*****************************************************************************************/
+
+
+#ifndef MEMCPY_BWA_H
+#define MEMCPY_BWA_H
+
+#include <stdlib.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "safe_mem_lib.h"
+#ifdef __cplusplus
+}
+#endif
+
+errno_t memcpy_bwamem(void *dest, rsize_t dmax, const void *src, rsize_t smax, char *file_name, int line_num);
+
+#endif


=====================================
src/profiling.cpp
=====================================
@@ -31,8 +31,7 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
 #include "macro.h"
 #include <stdint.h>
 #include <assert.h>
-
-extern uint64_t proc_freq, tprof[LIM_R][LIM_C];
+#include "profiling.h"
 
 int find_opt(uint64_t *a, int len, uint64_t *max, uint64_t *min, double *avg)
 {
@@ -57,7 +56,7 @@ int display_stats(int nthreads)
     uint64_t max, min;
     double avg;
     fprintf(stderr, "No. of OMP threads: %d\n", nthreads);
-    fprintf(stderr, "Processor is runnig @%lf MHz\n", proc_freq*1.0/1e6);
+    fprintf(stderr, "Processor is running @%lf MHz\n", proc_freq*1.0/1e6);
     fprintf(stderr, "Runtime profile:\n");
 
     fprintf(stderr, "\n\tTime taken for main_mem function: %0.2lf sec\n\n",
@@ -140,17 +139,18 @@ int display_stats(int nthreads)
     fprintf(stderr, "\t\tSAL compute avg: %0.2lf, (%0.2lf, %0.2lf)\n",
             avg*1.0/proc_freq, max*1.0/proc_freq, min*1.0/proc_freq);
     
-#if HIDE
+    #if 1 //HIDE
     find_opt(tprof[MEM_SA], nthreads, &max, &min, &avg);
     fprintf(stderr, "\t\t\t\tMEM_SA avg: %0.2lf, (%0.2lf, %0.2lf)\n\n",
             avg*1.0/proc_freq, max*1.0/proc_freq, min*1.0/proc_freq);
-#endif
+    #endif
     
     // printf("\n\t BSW compute time (sec):\n");
     find_opt(tprof[MEM_ALN2], nthreads, &max, &min, &avg);
     fprintf(stderr, "\t\tBSW time, avg: %0.2lf, (%0.2lf, %0.2lf)\n",
             avg*1.0/proc_freq, max*1.0/proc_freq, min*1.0/proc_freq);
 
+    #if HIDE
     int agg1 = 0, agg2 = 0, agg3 = 0;
     for (int i=0; i<nthreads; i++) {
         agg1 += tprof[PE11][i];
@@ -160,10 +160,32 @@ int display_stats(int nthreads)
     if (agg1 != agg3) 
         fprintf(stderr, "There is a discrepancy re-allocs, plz rectify!!\n");
 
-    assert(agg2 != 0);
-    fprintf(stderr, "\n\tTotal re-allocs: %d out of total requests: %d, Rate: %0.2f\n",
-            agg1, agg2, agg1*1.0/agg2);
+    if(agg2 > 0)
+    {
+        fprintf(stderr, "\n\tTotal re-allocs: %d out of total requests: %d, Rate: %0.2f\n",
+                agg1, agg2, agg1*1.0/agg2);
+    }
+
+    double res, max_ = 0, min_=1e10;
+    for (int i=0; i<nthreads; i++) {
+        double val = (tprof[ALIGN1][i]*1.0) / tprof[MEM_CHAIN][i];
+        res += val;
+        if (max_ < val) max_ = val;
+        if (min_ > val) min_ = val;
+    }
+    fprintf(stderr, "\tAvg. FM-index traversal per get_sa_entry(): avg: %lf, max: %lf, min: %lf\n",
+            res/nthreads, max_, min_);
 
+    int64_t tot_inst1 = 0, tot_inst2 = 0;
+    for (int i=0; i<nthreads; i++) {
+        tot_inst1 += tprof[SAM1][i];
+        tot_inst2 += tprof[SAM2][i];
+    }
+    
+    fprintf(stderr, "\ttot_inst1: %ld, tot_inst2: %ld, over %d threads\n",
+            tot_inst1, tot_inst2, nthreads);
+    #endif
+    
 #if HIDE
     fprintf(stderr, "\n BSW Perf.:\n");
     find_opt(tprof[MEM_ALN2_B], 1, &max, &min, &avg);


=====================================
src/profiling.h
=====================================
@@ -31,5 +31,5 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
 #define _PROFILE_HPP
 
 int display_stats(int );
-
+extern uint64_t proc_freq, tprof[LIM_R][LIM_C];
 #endif


=====================================
src/runsimd.cpp
=====================================
@@ -152,11 +152,25 @@ static int exe_path(const char *exe, int max, char buf[], int *base_st)
 static void test_and_launch(char *argv[], char *prefix, const char *simd) // we assume prefix is long enough
 {
 	struct stat st;
+    int prefix_len = strlen(prefix);
 	strcat_s(prefix, PATH_MAX, simd);
-	if (stat(prefix, &st) == 0 && (st.st_mode & S_IXUSR)) {
-		//fprintf(stderr, "Launching executable \"%s\"\n", prefix);
-		execv(prefix, argv);
-	}
+    fprintf(stderr, "Looking to launch executable \"%s\", simd = %s\n", prefix, simd);
+    if(stat(prefix, &st) == 0)
+    {
+        if (st.st_mode & S_IXUSR) {
+            fprintf(stderr, "Launching executable \"%s\"\n", prefix);
+            execv(prefix, argv);
+        }
+        else
+        {
+            fprintf(stderr, "(st.st_mode & S_IXUSR) = %d, can not run executable: %s\n", st.st_mode & S_IXUSR, prefix);
+        }
+    }
+    else
+    {
+        fprintf(stderr, "stat(prefix, &st) = %d, can not run executable: %s\n", stat(prefix, &st), prefix);
+    }
+    prefix[prefix_len] = 0;
 }
 
 int main(int argc, char *argv[])
@@ -179,13 +193,10 @@ int main(int argc, char *argv[])
 	//prefix_len = strlen(prefix);
 	simd = x86_simd();
 	if (simd & SIMD_AVX512BW) test_and_launch(argv, prefix, ".avx512bw");
-	if (simd & SIMD_AVX512F) test_and_launch(argv, prefix, ".avx512f");
 	if (simd & SIMD_AVX2) test_and_launch(argv, prefix, ".avx2");
 	if (simd & SIMD_AVX) test_and_launch(argv, prefix, ".avx");
 	if (simd & SIMD_SSE4_2) test_and_launch(argv, prefix, ".sse42");
 	if (simd & SIMD_SSE4_1) test_and_launch(argv, prefix, ".sse41");
-	if (simd & SIMD_SSE2) test_and_launch(argv, prefix, ".sse2");
-	if (simd & SIMD_SSE) test_and_launch(argv, prefix, ".sse");
 	free(prefix);
 	fprintf(stderr, "ERROR: fail to find the right executable\n");
 	return 2;


=====================================
test/fmi_test.cpp
=====================================
@@ -40,6 +40,7 @@ Authors: Vasimuddin Md <vasimuddin.md at intel.com>; Sanchit Misra <sanchit.misra at i
 #endif
 
 #define QUERY_DB_SIZE 1280000000
+
 int myrank, num_ranks;
 
 int main(int argc, char **argv) {
@@ -88,8 +89,9 @@ int main(int argc, char **argv) {
     assert(max_readlength > 0);
     assert(max_readlength < 10000);
     assert(numReads > 0);
-    assert(numReads * max_readlength < QUERY_DB_SIZE);
     printf("numReads = %d, max_readlength = %d, min_readlength = %d\n", numReads, max_readlength, min_readlength);
+    assert(numReads * max_readlength < QUERY_DB_SIZE);
+
     uint8_t *enc_qdb=(uint8_t *)malloc(numReads * max_readlength * sizeof(uint8_t));
 
     int64_t cind,st;
@@ -246,7 +248,7 @@ int main(int argc, char **argv) {
     }
     printf("totalSmems = %ld\n", totalSmem);
 
-#ifdef PRINT_OUTPUT
+    #ifdef PRINT_OUTPUT
     int32_t prevRid = -1;
     for(batch_id = 0; batch_id < num_batches; batch_id++)
     {
@@ -271,7 +273,7 @@ int main(int argc, char **argv) {
             u2 = smem.k + smem.s;
             for(u3 = u1; u3 < u2; u3++)
             {
-                printf("%ld,", fmiSearch->get_sa_entry(u3));
+                //printf("%ld,", fmiSearch->get_sa_entry(u3));
             }
             printf("]"); 
 #endif



View it on GitLab: https://salsa.debian.org/med-team/bwa-mem2/-/commit/944b4a3d2a651a95510c06c38086fb7386ff0913

-- 
View it on GitLab: https://salsa.debian.org/med-team/bwa-mem2/-/commit/944b4a3d2a651a95510c06c38086fb7386ff0913
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210308/5694ee9c/attachment-0001.htm>