[med-svn] [Git][med-team/unifrac-tools][master] 6 commits: Missed the planned update to new upstream version

Wed Nov 30 07:54:36 GMT 2022


Andreas Tille pushed to branch master at Debian Med / unifrac-tools


Commits:
a2be7a89 by Andreas Tille at 2022-11-30T08:45:31+01:00
Missed the planned update to new upstream version

- - - - -
2ad1d1b3 by Andreas Tille at 2022-11-30T08:46:02+01:00
routine-update: New upstream version

- - - - -
2eb2aa26 by Andreas Tille at 2022-11-30T08:46:03+01:00
New upstream version 1.1.3
- - - - -
2550cc5f by Andreas Tille at 2022-11-30T08:46:29+01:00
Update upstream source from tag 'upstream/1.1.3'

Update to upstream version '1.1.3'
with Debian dir df68d79f79d28cba84df6ee90e3f48087801825f
- - - - -
5c466d9c by Andreas Tille at 2022-11-30T08:49:03+01:00
Refresh patches

- - - - -
a5a4233e by Andreas Tille at 2022-11-30T08:50:39+01:00
Upload to unstable

- - - - -


16 changed files:

- .github/workflows/main.yml
- README.md
- debian/changelog
- debian/patches/baseline.patch
- debian/patches/remove_non-free_headers.patch
- src/Makefile
- src/api.cpp
- + src/testdata/compare_unifrac_matrix.py
- + src/testdata/compare_unifrac_pcoa.py
- + src/testdata/test500.biom
- + src/testdata/test500.tre
- + src/testdata/test500.unweighted_fp32.f.h5
- + src/testdata/test500.weighted_normalized_fp32.f.h5
- + src/testdata/test500.weighted_unnormalized_fp32.h5
- src/unifrac_task.cpp
- src/unifrac_task.hpp


Changes:

=====================================
.github/workflows/main.yml
=====================================
@@ -34,7 +34,7 @@ jobs:
 
         if [[ "$(uname -s)" == "Linux" ]];
         then
-          conda create --yes -n unifrac -c conda-forge -c bioconda gxx_linux-64 hdf5 mkl-include lz4 hdf5-static libcblas liblapacke make curl
+          conda create --yes -n unifrac -c conda-forge -c bioconda gxx_linux-64 hdf5 mkl-include lz4 zlib hdf5-static libcblas liblapacke make curl
         else
           conda create --yes -n unifrac -c conda-forge -c bioconda clangxx_osx-64 hdf5 mkl-include lz4 hdf5-static libcblas liblapacke make curl
         fi 
@@ -64,7 +64,11 @@ jobs:
         if [[ "$(uname -s)" == "Linux" ]]; 
         then
           rm -f ~/.R/Makevars
-          conda install -c conda-forge r-base
+          conda install --yes -c conda-forge r-base
+          unset CXXFLAGS
+          unset CFLAGS
+          unset DEBUG_CXXFLAGS
+          unset DEBUG_CFLAGS
           # the r-base package has a broken dependency
           ln -s $CONDA_PREFIX/lib/libreadline.so $CONDA_PREFIX/lib/libreadline.so.6
           R -e 'install.packages("Rcpp", repos="http://lib.stat.cmu.edu/R/CRAN/")'
@@ -85,7 +89,41 @@ jobs:
         pushd test
         ./capi_test 1
         popd
-        
+        pushd src/testdata
+        conda install --yes -c conda-forge h5py
+        # weighted_unnormalized
+        time ssu -m weighted_unnormalized_fp32 -i test500.biom  -t test500.tre --pcoa 4  -r hdf5_fp32 -o t1.h5
+        ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
+        ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
+        rm -f t1.h5
+        time ssu -f -m weighted_unnormalized_fp32 -i test500.biom  -t test500.tre --pcoa 4  -r hdf5_fp32 -o t1.h5
+        # matrrix will be different, but PCOA similar
+        ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
+        rm -f t1.h5
+        time ssu -m weighted_unnormalized -i test500.biom  -t test500.tre --pcoa 4  -r hdf5_fp64 -o t1.h5
+        # minimal precision loss between fp32 and fp64
+        ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
+        ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
+        rm -f t1.h5
+        # weighted_normalized
+        time ssu -f -m weighted_normalized_fp32 -i test500.biom  -t test500.tre --pcoa 4  -r hdf5_fp32 -o t1.h5
+        ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
+        ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
+        rm -f t1.h5
+        time ssu -f -m weighted_normalized -i test500.biom  -t test500.tre --pcoa 4  -r hdf5_fp64 -o t1.h5
+        ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
+        ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
+        rm -f t1.h5
+        # unweighted
+        time ssu -f -m unweighted_fp32 -i test500.biom  -t test500.tre --pcoa 4  -r hdf5_fp32 -o t1.h5
+        ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
+        ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
+        rm -f t1.h5
+        time ssu -f -m unweighted -i test500.biom  -t test500.tre --pcoa 4  -r hdf5_fp64 -o t1.h5
+        ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
+        ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
+        rm -f t1.h5
+        popd
     - name: Sanity checks
       shell: bash -l {0}
       run: |


=====================================
README.md
=====================================
@@ -12,6 +12,7 @@ A detailed description of the Strided State UniFrac algorithm can be found in [M
 
     ssu
     For UniFrac, please see:
+        Sfiligoi et al. mSystems 2022; DOI: 10.1128/msystems.00028-22
         McDonald et al. Nature Methods 2018; DOI: 10.1038/s41592-018-0187-8
         Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005
         Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06
@@ -107,6 +108,14 @@ If more than one GPU is present, one can select the one to use by setting:
 
 Note that there is no GPU support for MacOS.
 
+## Additional timing information
+
+When evaluating the performance of Unifrac it is sometimes necessary to distinguish
+the time spent interacting with the data from the Unifrac compute proper.
+Additional informational messages can be enabled by setting:
+
+    export UNIFRAC_TIMING_INFO=Y
+
 # Examples of use
 
 Below are a few light examples of different ways to use this library.


=====================================
debian/changelog
=====================================
@@ -1,8 +1,9 @@
-unifrac-tools (1.1.1-5) unstable; urgency=medium
+unifrac-tools (1.1.3-1) unstable; urgency=medium
 
   * Delete broken lintian-overrides
+  * New upstream version
 
- -- Andreas Tille <tille at debian.org>  Wed, 30 Nov 2022 08:40:57 +0100
+ -- Andreas Tille <tille at debian.org>  Wed, 30 Nov 2022 08:49:15 +0100
 
 unifrac-tools (1.1.1-4) unstable; urgency=medium
 


=====================================
debian/patches/baseline.patch
=====================================
@@ -5,14 +5,14 @@ Forwarded: not-needed
 Last-Update: 2022-10-02
 ---
 This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- unifrac-tools.orig/src/Makefile
-+++ unifrac-tools/src/Makefile
-@@ -73,8 +73,6 @@
+--- a/src/Makefile
++++ b/src/Makefile
+@@ -73,8 +73,6 @@ ifneq (,$(findstring pgi,$(COMPILER)))
  else
  	ifeq ($(PERFORMING_CONDA_BUILD),True)
- 		CPPFLAGS += -mtune=generic
+ 		CPPFLAGS += -mtune=generic -mavx
 -	else
--		CPPFLAGS += -mfma -march=native
+-		CPPFLAGS += -mavx -mfma -march=native
  	endif
  endif
  


=====================================
debian/patches/remove_non-free_headers.patch
=====================================
@@ -28,4 +28,4 @@ Description: mkl_cblas.h and mkl_lapacke.h are in libmkl-dev but this is non-fre
 +BLASLIB=-llapacke -lblas
  
  
- CPPFLAGS += -Wall  -std=c++11 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+ CPPFLAGS += -Wall  -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib


=====================================
src/Makefile
=====================================
@@ -68,13 +68,13 @@ endif
 
 ifneq (,$(findstring pgi,$(COMPILER)))
 	ifeq ($(PERFORMING_CONDA_BUILD),True)
-		CPPFLAGS += -tp=sandybridge
+		CPPFLAGS += -tp=sandybridge -Mvect=simd:256
 	endif
 else
 	ifeq ($(PERFORMING_CONDA_BUILD),True)
-		CPPFLAGS += -mtune=generic
+		CPPFLAGS += -mtune=generic -mavx
 	else
-		CPPFLAGS += -mfma -march=native
+		CPPFLAGS += -mavx -mfma -march=native
 	endif
 endif
 
@@ -86,7 +86,7 @@ endif
 BLASLIB=-llapacke -lcblas
 
 
-CPPFLAGS += -Wall  -std=c++11 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+CPPFLAGS += -Wall  -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
 
 all: api main install
 
@@ -148,6 +148,8 @@ rapi_test: main
 	echo CXX1X=h5c++ > ~/.R/Makevars
 	echo CXX=h5c++ >> ~/.R/Makevars 
 	echo CC=h5c++ >> ~/.R/Makevars
+	echo CFLAGS=  >> ~/.R/Makevars
+	echo CXXFLAGS=  >> ~/.R/Makevars
 	echo LDFLAGS= -llz4 $(BLASLIB) >> ~/.R/Makevars
 	rm -f *.o
 	Rscript R_interface/rapi_test.R


=====================================
src/api.cpp
=====================================
@@ -13,15 +13,28 @@
 #include <unistd.h>
 #include <sys/mman.h>
 #include <lz4.h>
+#include <time.h>
 
 #define MMAP_FD_MASK 0x0fff
 #define MMAP_FLAG    0x1000
 
-/* O_NOATIME is defined at fcntl.h when supported */
-#ifndef O_NOATIME
-#define O_NOATIME 0
-#endif
 
+#define SETUP_TDBG(method) const char *tdbg_method=method; \
+                          bool print_tdbg = false;\
+                          if (const char* env_p = std::getenv("UNIFRAC_TIMING_INFO")) { \
+                            print_tdbg = true; \
+                            std::string env_s(env_p); \
+                            if ((env_s=="NO") || (env_s=="N") || (env_s=="no") || (env_s=="n") || \
+                               (env_s=="NEVER") || (env_s=="never")) print_tdbg = false; \
+                          } \
+                          time_t tgdb_t0; time(&tgdb_t0); \
+                          if(print_tdbg) printf("INFO (unifrac): Starting %s\n",tdbg_method);
+
+#define TDBG_STEP(sname) if(print_tdbg) {\
+                           time_t tgdb_t1; time(&tgdb_t1); \
+                           printf("INFO (unifrac): dt %4i : Completed %s.%s\n",(int)(tgdb_t1-tgdb_t0),tdbg_method,sname); \
+                           tgdb_t0 = tgdb_t1; \
+                         }
 
 #define CHECK_FILE(filename, err) if(!is_file_exists(filename)) { \
                                       return err;                 \
@@ -198,8 +211,12 @@ void initialize_mat_full_no_biom_T(TMat* &result, const char* const * sample_ids
     } else {
       std::string mmap_template(mmap_dir);
       mmap_template+="/su_mmap_XXXXXX";
-      // note: mkostemp will update mmap_template in place
+      // note: mkstemp/mkostemp will update mmap_template in place
+#ifdef O_NOATIME
       int fd=mkostemp((char *) mmap_template.c_str(), O_NOATIME ); 
+#else
+      int fd=mkstemp((char *) mmap_template.c_str() );
+#endif
       if (fd<0) {
          result->matrix = NULL;
          // leave error handling to the caller
@@ -382,7 +399,9 @@ void set_tasks(std::vector<su::task_parameters> &tasks,
 compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree,
                              const char* unifrac_method, bool variance_adjust, double alpha,
                              bool bypass_tips, unsigned int nthreads, mat_t** result) {
+    SETUP_TDBG("one_off_inmem")
     SYNC_TREE_TABLE(tree, table)
+    TDBG_STEP("sync_tree_table")
     SET_METHOD(unifrac_method, unknown_method)
 
     const unsigned int stripe_stop = (table.n_samples + 1) / 2;
@@ -400,11 +419,13 @@ compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree,
     set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
     su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
 
+    TDBG_STEP("process_stripes")
     initialize_mat(*result, table, true);  // true -> is_upper_triangle
     for(unsigned int tid = 0; tid < threads.size(); tid++) {
         su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop);
     }
 
+    TDBG_STEP("stripes_to_condensed_form")
     destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0);
 
     return okay;
@@ -415,11 +436,13 @@ compute_status partial(const char* biom_filename, const char* tree_filename,
                        unsigned int nthreads, unsigned int stripe_start, unsigned int stripe_stop,
                        partial_mat_t** result) {
 
+    SETUP_TDBG("partial")
     CHECK_FILE(biom_filename, table_missing)
     CHECK_FILE(tree_filename, tree_missing)
     SET_METHOD(unifrac_method, unknown_method)
     PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
 
+    TDBG_STEP("load_files")
     // we resize to the largest number of possible stripes even if only computing
     // partial, however we do not allocate arrays for non-computed stripes so
     // there is a little memory waste here but should be on the order of
@@ -443,7 +466,9 @@ compute_status partial(const char* biom_filename, const char* tree_filename,
     set_tasks(tasks, alpha, table.n_samples, stripe_start, stripe_stop, bypass_tips, nthreads);
     su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
 
+    TDBG_STEP("process_stripes")
     initialize_partial_mat(*result, table, dm_stripes, stripe_start, stripe_stop, true);  // true -> is_upper_triangle
+    TDBG_STEP("partial_mat")
     destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, stripe_start, stripe_stop);
 
     return okay;
@@ -451,14 +476,17 @@ compute_status partial(const char* biom_filename, const char* tree_filename,
 
 compute_status faith_pd_one_off(const char* biom_filename, const char* tree_filename,
                                 r_vec** result){
+    SETUP_TDBG("faith_pd_one_off")
     CHECK_FILE(biom_filename, table_missing)
     CHECK_FILE(tree_filename, tree_missing)
     PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
 
+    TDBG_STEP("load_files")
     initialize_results_vec(*result, table);
 
     // compute faithpd
     su::faith_pd(table, tree_sheared, std::ref((*result)->values));
+    TDBG_STEP("faith_pd")
 
     return okay;
 }
@@ -466,10 +494,12 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file
 compute_status one_off(const char* biom_filename, const char* tree_filename,
                        const char* unifrac_method, bool variance_adjust, double alpha,
                        bool bypass_tips, unsigned int nthreads, mat_t** result) {
+    SETUP_TDBG("one_off")
     CHECK_FILE(biom_filename, table_missing)
     CHECK_FILE(tree_filename, tree_missing)
     PARSE_TREE_TABLE(tree_filename, table_filename)
 
+    TDBG_STEP("load_files")
     // condensed form
     return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
 }
@@ -481,6 +511,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
                                 bool bypass_tips, unsigned int nthreads,
                                 const char *mmap_dir,  
                                 TMat** result) {
+    SETUP_TDBG("one_off_matrix_inmem")
     if (mmap_dir!=NULL) {
      if (mmap_dir[0]==0) mmap_dir = NULL; // easier to have a simple test going on
     }
@@ -488,6 +519,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
     SET_METHOD(unifrac_method, unknown_method)
     SYNC_TREE_TABLE(tree, table)
 
+    TDBG_STEP("sync_tree_table")
     const unsigned int stripe_stop = (table.n_samples + 1) / 2;
     partial_mat_t *partial_mat = NULL;
 
@@ -501,6 +533,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
       set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
       su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
 
+      TDBG_STEP("process_stripes")
       initialize_partial_mat(partial_mat, table, dm_stripes, 0, stripe_stop, true);  // true -> is_upper_triangle
       if ((partial_mat==NULL) || (partial_mat->stripes==NULL) || (partial_mat->sample_ids==NULL) ) {
           fprintf(stderr, "Memory allocation error! (initialize_partial_mat)\n");
@@ -527,6 +560,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
                                   (4096/sizeof(TReal)); /* make it larger for mmap, as the limiting factor is swapping */
       su::stripes_to_matrix_T<TReal>(ps, partial_mat->n_samples, partial_mat->stripe_total, (*result)->matrix, tile_size);
     }
+    TDBG_STEP("stripes_to_matrix")
     destroy_partial_mat(&partial_mat);
 
     return okay;
@@ -538,9 +572,11 @@ compute_status one_off_matrix(const char* biom_filename, const char* tree_filena
                               bool bypass_tips, unsigned int nthreads,
                               const char *mmap_dir,
                               mat_full_fp64_t** result) {
+    SETUP_TDBG("one_off_matrix")
     CHECK_FILE(biom_filename, table_missing)
     CHECK_FILE(tree_filename, tree_missing)
     PARSE_TREE_TABLE(tree_filename, biom_filename)
+    TDBG_STEP("load_files")
     return one_off_matrix_T<double,mat_full_fp64_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
 }
 
@@ -549,9 +585,11 @@ compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_f
                                    bool bypass_tips, unsigned int nthreads,
                                    const char *mmap_dir,
                                    mat_full_fp32_t** result) {
+    SETUP_TDBG("one_off_matrix_fp32")
     CHECK_FILE(biom_filename, table_missing)
     CHECK_FILE(tree_filename, tree_missing)
     PARSE_TREE_TABLE(tree_filename, biom_filename)
+    TDBG_STEP("load_files")
     return one_off_matrix_T<float,mat_full_fp32_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
 }
 


=====================================
src/testdata/compare_unifrac_matrix.py
=====================================
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+# Requires h5py library
+# Usage:
+#  compare_unifrac_matrix.py fname1 fname2 precisio
+#
+# Example:
+#  compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 a.h5 1.e-5
+#
+import h5py
+import sys
+
+fname1=sys.argv[1]
+fname2=sys.argv[2]
+prec=float(sys.argv[3])
+
+d1= h5py.File(fname1)['matrix'][:,:]
+d2= h5py.File(fname2)['matrix'][:,:]
+
+l1=len(d1)
+l2=len(d2)
+if (l1!=l2):
+  print("The two files do not have the same number of rows")
+  sys.exit(1)
+
+for r in range(l1):
+  m=max(abs(d1[r,:]-d2[r,:]))
+  if (m>prec):
+    print("Diff too large at row %i: %e>%e"%(r,m,prec))
+    sys.exit(2)
+print("Files match within precision")


=====================================
src/testdata/compare_unifrac_pcoa.py
=====================================
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+# Requires h5py library
+# Usage:
+#  compare_unifrac_pcoa.py fname1 fname2 elements precision
+#
+# Example:
+#  compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 a.h5 3 0.1
+#
+import h5py
+import sys
+
+fname1=sys.argv[1]
+fname2=sys.argv[2]
+els=int(sys.argv[3])
+prec=float(sys.argv[4])
+
+d1= h5py.File(fname1)['pcoa_eigvals'][:els]
+d2= h5py.File(fname2)['pcoa_eigvals'][:els]
+
+l1=len(d1)
+l2=len(d2)
+if (l1!=l2):
+  print("The two files do not have the same number of elements")
+  sys.exit(1)
+
+m=max(abs((d1[:]-d2[:])/d1[:]))
+if (m>prec):
+    print("Diff too large: %e>%e "%(m,prec))
+    print("Raw data:",d1,d2, abs((d1[:]-d2[:])/d1[:]))
+    sys.exit(2)
+
+print("Files match within precision")


=====================================
src/testdata/test500.biom
=====================================
Binary files /dev/null and b/src/testdata/test500.biom differ


=====================================
src/testdata/test500.tre
=====================================
The diff for this file was not included because it is too large.

=====================================
src/testdata/test500.unweighted_fp32.f.h5
=====================================
Binary files /dev/null and b/src/testdata/test500.unweighted_fp32.f.h5 differ


=====================================
src/testdata/test500.weighted_normalized_fp32.f.h5
=====================================
Binary files /dev/null and b/src/testdata/test500.weighted_normalized_fp32.f.h5 differ


=====================================
src/testdata/test500.weighted_unnormalized_fp32.h5
=====================================
Binary files /dev/null and b/src/testdata/test500.weighted_unnormalized_fp32.h5 differ


=====================================
src/unifrac_task.cpp
=====================================
@@ -2,29 +2,35 @@
 #include "unifrac_task.hpp"
 #include <cstdlib>
 
+#ifndef _OPENACC
+// popcnt returns number of bits set to 1, if supported by fast HW compute
+// else return 0 when v is 0 and max-bits else
+// Note: The user of popcnt in this file must be able to use this modified semantics
+
+#if __AVX__
+#include <immintrin.h>
+static inline int32_t popcnt_u32(uint32_t v) {return  _mm_popcnt_u32(v);}
+static inline int64_t popcnt_u64(uint64_t v) {return  _mm_popcnt_u64(v);}
+#else
+static inline int32_t popcnt_u32(uint32_t v) {return (v==0) ? 0 : 32;}
+static inline int64_t popcnt_u64(uint64_t v) {return (v==0) ? 0 : 64;}
+// For future non-x86 ports, see https://barakmich.dev/posts/popcnt-arm64-go-asm/
+// and https://stackoverflow.com/questions/38113284/whats-the-difference-between-builtin-popcountll-and-mm-popcnt-u64
+#endif
+#endif
 
-
-
+// check for zero values and pre-compute single column sums
 template<class TFloat>
-void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
-    const uint64_t start_idx = this->task_p->start;
-    const uint64_t stop_idx = this->task_p->stop;
-    const uint64_t n_samples = this->task_p->n_samples;
-    const uint64_t n_samples_r = this->dm_stripes.n_samples_r;
-
-    // openacc only works well with local variables
-    const TFloat * const __restrict__ embedded_proportions = this->embedded_proportions;
-    TFloat * const __restrict__ dm_stripes_buf = this->dm_stripes.buf;
-
-    bool * const __restrict__ zcheck = this->zcheck;
-    TFloat * const __restrict__ sums = this->sums;
-
-    const uint64_t step_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::step_size;
-    const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
-
-    // check for zero values and pre-compute single column sums
+static inline void WeightedZerosAndSums(
+                      bool   * const __restrict__ zcheck,
+                      TFloat * const __restrict__ sums,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t n_samples,
+                      const uint64_t n_samples_r) {
 #ifdef _OPENACC
-#pragma acc parallel loop present(embedded_proportions,lengths,zcheck,sums)
+#pragma acc parallel loop gang vector present(embedded_proportions,lengths,zcheck,sums)
 #else
 #pragma omp parallel for default(shared)
 #endif
@@ -44,27 +50,172 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
             sums[k]     = my_sum;
             zcheck[k] = all_zeros;
     }
+}
 
+// Single step in computing Weighted part of Unifrac
+template<class TFloat>
+static inline TFloat WeightedVal1(
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t n_samples_r,
+                      const uint64_t k,
+                      const uint64_t l1) {
+            TFloat my_stripe = 0.0;
 
-    // now do the real compute
-#ifdef _OPENACC
-    const unsigned int acc_vector_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::acc_vector_size;
-#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,lengths,zcheck,sums) async
-#else
-    // use dynamic scheduling due to non-homogeneity in the loop
-#pragma omp parallel for default(shared) schedule(dynamic,1)
-#endif
-    for(uint64_t sk = 0; sk < sample_steps ; sk++) {
-     for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
-      for(uint64_t ik = 0; ik < step_size ; ik++) {
-       const uint64_t k = sk*step_size + ik;
+            for (uint64_t emb=0; emb<filled_embs; emb++) {
+                const uint64_t offset = n_samples_r * emb;
 
-       if (k>=n_samples) continue; // past the limit
+                TFloat u1 = embedded_proportions[offset + k];
+                TFloat v1 = embedded_proportions[offset + l1];
+                TFloat diff1 = u1 - v1;
+                TFloat length = lengths[emb];
+
+                my_stripe     += fabs(diff1) * length;
+            } // for emb
 
-       const bool zcheck_k = zcheck[sk]; // due to loop collapse in ACC, must load in here
+            return my_stripe;
+}
 
-       const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+#ifndef _OPENACC
+template<class TFloat>
+static inline void WeightedVal4(
+                      TFloat * const __restrict__ stripes,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t n_samples_r,
+                      const uint64_t ks,
+                      const uint64_t ls) {
+            TFloat my_stripe0 = 0.0;
+            TFloat my_stripe1 = 0.0;
+            TFloat my_stripe2 = 0.0;
+            TFloat my_stripe3 = 0.0;
 
+            for (uint64_t emb=0; emb<filled_embs; emb++) {
+                const uint64_t offset = n_samples_r * emb;
+
+                TFloat u0 = embedded_proportions[offset + ks];
+                TFloat u1 = embedded_proportions[offset + ks + 1];
+                TFloat u2 = embedded_proportions[offset + ks + 2];
+                TFloat u3 = embedded_proportions[offset + ks + 3];
+                TFloat v0 = embedded_proportions[offset + ls];
+                TFloat v1 = embedded_proportions[offset + ls + 1];
+                TFloat v2 = embedded_proportions[offset + ls + 2];
+                TFloat v3 = embedded_proportions[offset + ls + 3];
+                TFloat length = lengths[emb];
+
+                TFloat diff0 = u0 - v0;
+                TFloat diff1 = u1 - v1;
+                TFloat diff2 = u2 - v2;
+                TFloat diff3 = u3 - v3;
+                TFloat absdiff0 = fabs(diff0);
+                TFloat absdiff1 = fabs(diff1);
+                TFloat absdiff2 = fabs(diff2);
+                TFloat absdiff3 = fabs(diff3);
+
+                my_stripe0     += absdiff0 * length;
+                my_stripe1     += absdiff1 * length;
+                my_stripe2     += absdiff2 * length;
+                my_stripe3     += absdiff3 * length;
+            } // for emb
+
+            stripes[ks] += my_stripe0;
+            stripes[ks + 1] += my_stripe1;
+            stripes[ks + 2] += my_stripe2;
+            stripes[ks + 3] += my_stripe3;
+}
+
+template<class TFloat>
+static inline void WeightedVal8(
+                      TFloat * const __restrict__ stripes,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t n_samples_r,
+                      const uint64_t ks,
+                      const uint64_t ls) {
+            TFloat my_stripe0 = 0.0;
+            TFloat my_stripe1 = 0.0;
+            TFloat my_stripe2 = 0.0;
+            TFloat my_stripe3 = 0.0;
+            TFloat my_stripe4 = 0.0;
+            TFloat my_stripe5 = 0.0;
+            TFloat my_stripe6 = 0.0;
+            TFloat my_stripe7 = 0.0;
+
+            for (uint64_t emb=0; emb<filled_embs; emb++) {
+                const uint64_t offset = n_samples_r * emb;
+
+                TFloat u0 = embedded_proportions[offset + ks];
+                TFloat u1 = embedded_proportions[offset + ks + 1];
+                TFloat u2 = embedded_proportions[offset + ks + 2];
+                TFloat u3 = embedded_proportions[offset + ks + 3];
+                TFloat u4 = embedded_proportions[offset + ks + 4];
+                TFloat u5 = embedded_proportions[offset + ks + 5];
+                TFloat u6 = embedded_proportions[offset + ks + 6];
+                TFloat u7 = embedded_proportions[offset + ks + 7];
+                TFloat v0 = embedded_proportions[offset + ls];
+                TFloat v1 = embedded_proportions[offset + ls + 1];
+                TFloat v2 = embedded_proportions[offset + ls + 2];
+                TFloat v3 = embedded_proportions[offset + ls + 3];
+                TFloat v4 = embedded_proportions[offset + ls + 4];
+                TFloat v5 = embedded_proportions[offset + ls + 5];
+                TFloat v6 = embedded_proportions[offset + ls + 6];
+                TFloat v7 = embedded_proportions[offset + ls + 7];
+                TFloat length = lengths[emb];
+
+                TFloat diff0 = u0 - v0;
+                TFloat diff1 = u1 - v1;
+                TFloat diff2 = u2 - v2;
+                TFloat diff3 = u3 - v3;
+                TFloat diff4 = u4 - v4;
+                TFloat diff5 = u5 - v5;
+                TFloat diff6 = u6 - v6;
+                TFloat diff7 = u7 - v7;
+                TFloat absdiff0 = fabs(diff0);
+                TFloat absdiff1 = fabs(diff1);
+                TFloat absdiff2 = fabs(diff2);
+                TFloat absdiff3 = fabs(diff3);
+                TFloat absdiff4 = fabs(diff4);
+                TFloat absdiff5 = fabs(diff5);
+                TFloat absdiff6 = fabs(diff6);
+                TFloat absdiff7 = fabs(diff7);
+
+                my_stripe0     += absdiff0 * length;
+                my_stripe1     += absdiff1 * length;
+                my_stripe2     += absdiff2 * length;
+                my_stripe3     += absdiff3 * length;
+                my_stripe4     += absdiff4 * length;
+                my_stripe5     += absdiff5 * length;
+                my_stripe6     += absdiff6 * length;
+                my_stripe7     += absdiff7 * length;
+            } // for emb
+
+            stripes[ks] += my_stripe0;
+            stripes[ks + 1] += my_stripe1;
+            stripes[ks + 2] += my_stripe2;
+            stripes[ks + 3] += my_stripe3;
+            stripes[ks + 4] += my_stripe4;
+            stripes[ks + 5] += my_stripe5;
+            stripes[ks + 6] += my_stripe6;
+            stripes[ks + 7] += my_stripe7;
+}
+#endif
+
+// Single step in computing NormalizedWeighted Unifrac
+template<class TFloat>
+static inline void UnnormalizedWeighted1(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const __restrict__ sums,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t k,
+                      const uint64_t l1) {
        const bool allzero_k = zcheck[k];
        const bool allzero_l1 = zcheck[l1];
 
@@ -82,36 +233,282 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
 
             // keep reads in the same place to maximize GPU warp performance
             my_stripe = sums[ridx];
-
           } else {
             // both sides non zero, use the explicit but slow approach
-            my_stripe = 0.0;
-
-#pragma acc loop seq
-            for (uint64_t emb=0; emb<filled_embs; emb++) {
-                const uint64_t offset = n_samples_r * emb;
+            my_stripe = WeightedVal1(embedded_proportions, lengths,
+                                     filled_embs, n_samples_r,
+                                     k, l1);
+          }
 
-                TFloat u1 = embedded_proportions[offset + k];
-                TFloat v1 = embedded_proportions[offset + l1];
-                TFloat diff1 = u1 - v1;
-                TFloat length = lengths[emb];
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+          
+          // keep all writes in a single place, to maximize GPU warp performance
+          dm_stripe[k]       += my_stripe;
+       }
+}
 
-                my_stripe     += fabs(diff1) * length;
-            } // for emb
+#ifndef _OPENACC
+// Vectorized step in computing UnnormalizedWeighted Unifrac
+template<class TFloat>
+static inline void UnnormalizedWeighted4(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const __restrict__ sums,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t ks,
+                      const uint64_t ls) {
+       const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
+       const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
+       const bool allzero_k = z_k==0x01010101;
+       const bool allzero_l = z_l==0x01010101;
+
+       // popcnt is cheap but has large latency, so compute speculatively/early
+       const int32_t cnt_k = popcnt_u32(z_k);
+       const int32_t cnt_l = popcnt_u32(z_l);
+
+       if (allzero_k && allzero_l) {
+         // nothing to do, would have to add 0
+       } else if (allzero_k || allzero_l) {
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
 
+          if (allzero_k) {
+             // one side has all zeros, use distributed property
+             // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+             const TFloat sum_l0 = sums[ls];
+             const TFloat sum_l1 = sums[ls+1];
+             const TFloat sum_l2 = sums[ls+2];
+             const TFloat sum_l3 = sums[ls+3];
+             dm_stripe[ks]   += sum_l0;
+             dm_stripe[ks+1] += sum_l1;
+             dm_stripe[ks+2] += sum_l2;
+             dm_stripe[ks+3] += sum_l3;
+          } else {
+             // one side has all zeros, use distributed property
+             // if (nonzero_k)  ridx=k  // fabs(k-l1), with l1==0
+             const TFloat sum_k0 = sums[ks];
+             const TFloat sum_k1 = sums[ks+1];
+             const TFloat sum_k2 = sums[ks+2];
+             const TFloat sum_k3 = sums[ks+3];
+             dm_stripe[ks]   += sum_k0;
+             dm_stripe[ks+1] += sum_k1;
+             dm_stripe[ks+2] += sum_k2;
+             dm_stripe[ks+3] += sum_k3;
           }
+       } else if ((cnt_k<3) && (cnt_l<3)) {
+          // several of the elemens are nonzero, may as well use the vectorized version
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+
+          WeightedVal4(dm_stripe,
+                       embedded_proportions, lengths,
+                       filled_embs, n_samples_r,
+                       ks, ls);
+       } else {
+         // only a few have both sides partially non zero, use the fine grained compute
+         for (uint64_t i=0; i<4; i++) {
+            UnnormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks+i, ls+i);
+         }
+       } // (allzero_k && allzero_l)
+}
 
-          const uint64_t idx = (stripe-start_idx)*n_samples_r;
+template<class TFloat>
+static inline void UnnormalizedWeighted8(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const __restrict__ sums,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t ks,
+                      const uint64_t ls) {
+       const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
+       const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
+       const bool allzero_k = z_k==0x0101010101010101;
+       const bool allzero_l = z_l==0x0101010101010101;
+
+       // popcnt is cheap but has large latency, so compute speculatively/early
+       const int64_t cnt_k = popcnt_u64(z_k);
+       const int64_t cnt_l = popcnt_u64(z_l);
+
+       if (allzero_k && allzero_l) {
+         // nothing to do, would have to add 0
+       } else if (allzero_k || allzero_l) {
           TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
           //TFloat *dm_stripe = dm_stripes[stripe];
 
-          // keep all writes in a single place, to maximize GPU warp performance
-          dm_stripe[k] += my_stripe;
-       } 
+          if (allzero_k) {
+             // one side has all zeros, use distributed property
+             // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+             const TFloat sum_l0 = sums[ls];
+             const TFloat sum_l1 = sums[ls+1];
+             const TFloat sum_l2 = sums[ls+2];
+             const TFloat sum_l3 = sums[ls+3];
+             const TFloat sum_l4 = sums[ls+4];
+             const TFloat sum_l5 = sums[ls+5];
+             const TFloat sum_l6 = sums[ls+6];
+             const TFloat sum_l7 = sums[ls+7];
+             dm_stripe[ks]   += sum_l0;
+             dm_stripe[ks+1] += sum_l1;
+             dm_stripe[ks+2] += sum_l2;
+             dm_stripe[ks+3] += sum_l3;
+             dm_stripe[ks+4] += sum_l4;
+             dm_stripe[ks+5] += sum_l5;
+             dm_stripe[ks+6] += sum_l6;
+             dm_stripe[ks+7] += sum_l7;
+          } else {
+             // one side has all zeros, use distributed property
+             // if (nonzero_k)  ridx=k  // fabs(k-l1), with l1==0
+             const TFloat sum_k0 = sums[ks];
+             const TFloat sum_k1 = sums[ks+1];
+             const TFloat sum_k2 = sums[ks+2];
+             const TFloat sum_k3 = sums[ks+3];
+             const TFloat sum_k4 = sums[ks+4];
+             const TFloat sum_k5 = sums[ks+5];
+             const TFloat sum_k6 = sums[ks+6];
+             const TFloat sum_k7 = sums[ks+7];
+             dm_stripe[ks]   += sum_k0;
+             dm_stripe[ks+1] += sum_k1;
+             dm_stripe[ks+2] += sum_k2;
+             dm_stripe[ks+3] += sum_k3;
+             dm_stripe[ks+4] += sum_k4;
+             dm_stripe[ks+5] += sum_k5;
+             dm_stripe[ks+6] += sum_k6;
+             dm_stripe[ks+7] += sum_k7;
+          } 
+       } else if ((cnt_k<6) && (cnt_l<6)) {
+          // several of the elemens are nonzero, may as well use the vectorized version
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+
+          WeightedVal8(dm_stripe,
+                       embedded_proportions, lengths,
+                       filled_embs, n_samples_r,
+                       ks, ls);
+       } else {
+         // only a few have both sides partially non zero, use the fine grained compute
+         for (uint64_t i=0; i<8; i++) {
+            UnnormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks+i, ls+i);
+         }
+       } // (allzero_k && allzero_l)
+}
+#endif
+
+template<class TFloat>
+void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
+    const uint64_t start_idx = this->task_p->start;
+    const uint64_t stop_idx = this->task_p->stop;
+    const uint64_t n_samples = this->task_p->n_samples;
+    const uint64_t n_samples_r = this->dm_stripes.n_samples_r;
 
+    // openacc only works well with local variables
+    const TFloat * const __restrict__ embedded_proportions = this->embedded_proportions;
+    TFloat * const __restrict__ dm_stripes_buf = this->dm_stripes.buf;
+
+    bool * const __restrict__ zcheck = this->zcheck;
+    TFloat * const __restrict__ sums = this->sums;
+
+    const uint64_t step_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::step_size;
+    const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
+
+    // check for zero values and pre-compute single column sums
+    WeightedZerosAndSums(zcheck, sums,
+                         embedded_proportions, lengths,
+                         filled_embs, n_samples, n_samples_r);
+
+    // now do the real compute
+#ifdef _OPENACC
+    const unsigned int acc_vector_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::acc_vector_size;
+#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,lengths,zcheck,sums) async
+    for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+     for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
+      // SIMT-based GPU work great one at a time (HW will deal with parallelism)
+      for(uint64_t ik = 0; ik < step_size ; ik++) {
+       const uint64_t k = sk*step_size + ik;
+
+       if (k>=n_samples) continue; // past the limit
+
+       const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+       const uint64_t idx = (stripe-start_idx) * n_samples_r;
+
+       UnnormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   k, l1);
       } // for ik
      } // for stripe
     } // for sk
+#else
+    // tilling helps with better cache reuse without the need of multiple cores
+    const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+    // use dynamic scheduling due to non-homogeneity in the loop
+    // Use a moderate block to prevent trashing but still have some cache reuse
+#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+    for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
+     for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+      // tile to maximize cache reuse
+      for(uint64_t is = 0; is < step_size ; is++) {
+       const uint64_t stripe = start_idx+ss*step_size + is;
+       if (stripe<stop_idx) { // else past limit
+
+      // SIMD-based CPUs need help with vectorization
+      const uint64_t idx = (stripe-start_idx) * n_samples_r;
+      uint64_t ks = sk*step_size;
+      const uint64_t kmax = std::min(ks+step_size,n_samples);
+      uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+
+      while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
+       UnnormalizedWeighted8<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks, ls);
+       ks+=8;
+       ls = (ls + 8)%n_samples; // wraparound
+      } // for ks+=8
+
+      while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
+       UnnormalizedWeighted4<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks, ls);
+       ks+=4;
+       ls = (ls + 4)%n_samples; // wraparound
+      } // for ks+=4
+
+      // deal with any leftovers in serial way
+      for( ; ks < kmax; ks++ ) {
+       UnnormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks, ls);
+       ls = (ls + 1)%n_samples; // wraparound
+      } // for ks
+
+       } // if stripe
+      } // for is
+     } // for sk
+    } // for ss
+#endif
 
 #ifdef _OPENACC
    // next iteration will use the alternative space
@@ -187,6 +584,276 @@ void SUCMP_NM::UnifracVawUnnormalizedWeightedTask<TFloat>::_run(unsigned int fil
 #endif
 }
 
+// Single step in computing NormalizedWeighted Unifrac
+template<class TFloat>
+static inline void NormalizedWeighted1(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      TFloat * const __restrict__ dm_stripes_total_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const __restrict__ sums,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t k,
+                      const uint64_t l1) {
+       const bool allzero_k = zcheck[k];
+       const bool allzero_l1 = zcheck[l1];
+
+       if (allzero_k && allzero_l1) {
+         // nothing to do, would have to add 0
+       } else {
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+          //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+          const TFloat sum_k = sums[k];
+          const TFloat sum_l = sums[l1];
+
+          // the totals can always use the distributed property
+          dm_stripe_total[k] += sum_k + sum_l;
+   
+          TFloat my_stripe;
+
+          if (allzero_k || allzero_l1) {
+            // one side has all zeros
+            // we can use the distributed property, and use the pre-computed values
+
+            my_stripe = (allzero_k) ? sum_l : // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+                                      sum_k;   // if (nonzero_k)  ridx=k  // fabs(k-l1), with l1==0
+          } else {
+            // both sides non zero, use the explicit but slow approach
+            my_stripe = WeightedVal1(embedded_proportions, lengths,
+                                     filled_embs, n_samples_r,
+                                     k, l1);
+          }
+
+          // keep all writes in a single place, to maximize GPU warp performance
+          dm_stripe[k]       += my_stripe;
+       }
+}
+
+#ifndef _OPENACC
+// Vectorized step in computing NormalizedWeighted Unifrac
+template<class TFloat>
+static inline void NormalizedWeighted4(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      TFloat * const __restrict__ dm_stripes_total_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const __restrict__ sums,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t ks,
+                      const uint64_t ls) {
+       const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
+       const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
+       const bool allzero_k = z_k==0x01010101;
+       const bool allzero_l = z_l==0x01010101;
+
+       if (allzero_k && allzero_l) {
+         // nothing to do, would have to add 0
+       } else {
+          const TFloat sum_k0 = sums[ks];
+          const TFloat sum_k1 = sums[ks+1];
+          const TFloat sum_k2 = sums[ks+2];
+          const TFloat sum_k3 = sums[ks+3];
+          const TFloat sum_l0 = sums[ls];
+          const TFloat sum_l1 = sums[ls+1];
+          const TFloat sum_l2 = sums[ls+2];
+          const TFloat sum_l3 = sums[ls+3];
+
+          const TFloat sum_kl0 = sum_k0 + sum_l0;
+          const TFloat sum_kl1 = sum_k1 + sum_l1;
+          const TFloat sum_kl2 = sum_k2 + sum_l2;
+          const TFloat sum_kl3 = sum_k3 + sum_l3;
+
+          int32_t cnt_k = 0;
+          int32_t cnt_l = 0;
+
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+
+          if (allzero_k) {
+             // one side has all zeros, use distributed property
+             // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+             dm_stripe[ks]   += sum_l0;
+             dm_stripe[ks+1] += sum_l1;
+             dm_stripe[ks+2] += sum_l2;
+             dm_stripe[ks+3] += sum_l3;
+          } else if (allzero_l) {
+             // one side has all zeros, use distributed property
+             // if (nonzero_k)  ridx=k  // fabs(k-l1), with l1==0
+             dm_stripe[ks]   += sum_k0;
+             dm_stripe[ks+1] += sum_k1;
+             dm_stripe[ks+2] += sum_k2;
+             dm_stripe[ks+3] += sum_k3;
+          } else {
+             // popcnt is cheap but has large latency, so compute early
+             cnt_k = popcnt_u32(z_k);
+             cnt_l = popcnt_u32(z_l);
+          }
+
+          // the totals can always use the distributed property
+          {
+             TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+             //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+             dm_stripe_total[ks]   += sum_kl0;
+             dm_stripe_total[ks+1] += sum_kl1;
+             dm_stripe_total[ks+2] += sum_kl2;
+             dm_stripe_total[ks+3] += sum_kl3;
+          }
+
+          if (allzero_k||allzero_l) {
+             // already done
+          } else if ((cnt_k<3) && (cnt_l<3)) {
+             // several of the elemens are nonzero, may as well use the vectorized version
+             TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+             //TFloat *dm_stripe = dm_stripes[stripe];
+
+             WeightedVal4(dm_stripe,
+                       embedded_proportions, lengths,
+                       filled_embs, n_samples_r,
+                       ks, ls);
+          } else {
+             // only a few have both sides partially non zero, use the fine grained compute
+             // Use UnnormalizedWeighted since we already computed dm_stripe_total
+             for (uint64_t i=0; i<4; i++) {
+                UnnormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks+i, ls+i);
+             }
+          }
+       } // (allzero_k && allzero_l)
+}
+
+template<class TFloat>
+static inline void NormalizedWeighted8(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      TFloat * const __restrict__ dm_stripes_total_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const __restrict__ sums,
+                      const TFloat * const __restrict__ embedded_proportions,
+                      const TFloat * __restrict__ lengths,
+                      const unsigned int filled_embs,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t ks,
+                      const uint64_t ls) {
+       const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
+       const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
+       const bool allzero_k = z_k==0x0101010101010101;
+       const bool allzero_l = z_l==0x0101010101010101;
+
+       if (allzero_k && allzero_l) {
+         // nothing to do, would have to add 0
+       } else {
+          const TFloat sum_k0 = sums[ks];
+          const TFloat sum_k1 = sums[ks+1];
+          const TFloat sum_k2 = sums[ks+2];
+          const TFloat sum_k3 = sums[ks+3];
+          const TFloat sum_k4 = sums[ks+4];
+          const TFloat sum_k5 = sums[ks+5];
+          const TFloat sum_k6 = sums[ks+6];
+          const TFloat sum_k7 = sums[ks+7];
+          const TFloat sum_l0 = sums[ls];
+          const TFloat sum_l1 = sums[ls+1];
+          const TFloat sum_l2 = sums[ls+2];
+          const TFloat sum_l3 = sums[ls+3];
+          const TFloat sum_l4 = sums[ls+4];
+          const TFloat sum_l5 = sums[ls+5];
+          const TFloat sum_l6 = sums[ls+6];
+          const TFloat sum_l7 = sums[ls+7];
+
+          const TFloat sum_kl0 = sum_k0 + sum_l0;
+          const TFloat sum_kl1 = sum_k1 + sum_l1;
+          const TFloat sum_kl2 = sum_k2 + sum_l2;
+          const TFloat sum_kl3 = sum_k3 + sum_l3;
+          const TFloat sum_kl4 = sum_k4 + sum_l4;
+          const TFloat sum_kl5 = sum_k5 + sum_l5;
+          const TFloat sum_kl6 = sum_k6 + sum_l6;
+          const TFloat sum_kl7 = sum_k7 + sum_l7;
+
+          int32_t cnt_k = 0;
+          int32_t cnt_l = 0;
+
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+
+          if (allzero_k) {
+             // one side has all zeros, use distributed property
+             // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+             dm_stripe[ks]   += sum_l0;
+             dm_stripe[ks+1] += sum_l1;
+             dm_stripe[ks+2] += sum_l2;
+             dm_stripe[ks+3] += sum_l3;
+             dm_stripe[ks+4] += sum_l4;
+             dm_stripe[ks+5] += sum_l5;
+             dm_stripe[ks+6] += sum_l6;
+             dm_stripe[ks+7] += sum_l7;
+          } else if (allzero_l) {
+             // one side has all zeros, use distributed property
+             // if (nonzero_k)  ridx=k  // fabs(k-l1), with l1==0
+             dm_stripe[ks]   += sum_k0;
+             dm_stripe[ks+1] += sum_k1;
+             dm_stripe[ks+2] += sum_k2;
+             dm_stripe[ks+3] += sum_k3;
+             dm_stripe[ks+4] += sum_k4;
+             dm_stripe[ks+5] += sum_k5;
+             dm_stripe[ks+6] += sum_k6;
+             dm_stripe[ks+7] += sum_k7;
+          } else {
+             // popcnt is cheap but has large latency, so compute early
+             cnt_k = popcnt_u64(z_k);
+             cnt_l = popcnt_u64(z_l);
+          }
+
+          // the totals can always use the distributed property
+          {
+             TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+             //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+             dm_stripe_total[ks]   += sum_kl0;
+             dm_stripe_total[ks+1] += sum_kl1;
+             dm_stripe_total[ks+2] += sum_kl2;
+             dm_stripe_total[ks+3] += sum_kl3;
+             dm_stripe_total[ks+4] += sum_kl4;
+             dm_stripe_total[ks+5] += sum_kl5;
+             dm_stripe_total[ks+6] += sum_kl6;
+             dm_stripe_total[ks+7] += sum_kl7;
+          }
+
+          if (allzero_k||allzero_l) {
+             // already done
+          } else if ((cnt_k<6) && (cnt_l<6)) {
+             // several of the elemens are nonzero, may as well use the vectorized version
+             TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+             //TFloat *dm_stripe = dm_stripes[stripe];
+
+             WeightedVal8(dm_stripe,
+                       embedded_proportions, lengths,
+                       filled_embs, n_samples_r,
+                       ks, ls);
+          } else {
+             // only a few have both sides partially non zero, use the fine grained compute
+             // Use UnnormalizedWeighted since we already computed dm_stripe_total
+             for (uint64_t i=0; i<8; i++) {
+                UnnormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks+i, ls+i);
+             }
+          }
+       } // (allzero_k && allzero_l)
+}
+#endif
+
 template<class TFloat>
 void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
     const uint64_t start_idx = this->task_p->start;
@@ -206,100 +873,88 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
     const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
 
     // check for zero values and pre-compute single column sums
-#ifdef _OPENACC
-#pragma acc parallel loop present(embedded_proportions,lengths,zcheck,sums)
-#else
-#pragma omp parallel for default(shared)
-#endif
-    for(uint64_t k=0; k<n_samples; k++) {
-            bool all_zeros=true;
-            TFloat my_sum = 0.0;
-
-#pragma acc loop seq
-            for (uint64_t emb=0; emb<filled_embs; emb++) {
-                const uint64_t offset = n_samples_r * emb;
-
-                TFloat u1 = embedded_proportions[offset + k];
-                my_sum += u1*lengths[emb];
-                all_zeros = all_zeros && (u1==0.0);
-            }
-
-            sums[k]     = my_sum;
-            zcheck[k] = all_zeros;
-    }
+    WeightedZerosAndSums(zcheck, sums,
+                         embedded_proportions, lengths,
+                         filled_embs, n_samples, n_samples_r);
 
     // point of thread
 #ifdef _OPENACC
     const unsigned int acc_vector_size = SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::acc_vector_size;
-#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,lengths,zcheck,sums) async
-#else
-    // use dynamic scheduling due to non-homogeneity in the loop
-#pragma omp parallel for schedule(dynamic,1) default(shared)
-#endif
+#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,lengths,zcheck,sums) async
     for(uint64_t sk = 0; sk < sample_steps ; sk++) {
      for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
+      // SIMT-based GPU work great one at a time (HW will deal with parallelism)
       for(uint64_t ik = 0; ik < step_size ; ik++) {
        const uint64_t k = sk*step_size + ik;
 
        if (k>=n_samples) continue; // past the limit
 
-       const bool zcheck_k = zcheck[sk]; // due to loop collapse in ACC, must load in here
-
        const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+       const uint64_t idx = (stripe-start_idx) * n_samples_r;
 
-       const bool allzero_k = zcheck[k];
-       const bool allzero_l1 = zcheck[l1];
-
-       if (allzero_k && allzero_l1) {
-         // nothing to do, would have to add 0
-       } else {
-          const uint64_t idx = (stripe-start_idx) * n_samples_r;
-          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
-          TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-          //TFloat *dm_stripe = dm_stripes[stripe];
-          //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-
-          // the totals can always use the distributed property
-          dm_stripe_total[k] += sums[k] + sums[l1];
-   
-          TFloat my_stripe;
-
-          if (allzero_k || allzero_l1) {
-            // one side has all zeros
-            // we can use the distributed property, and use the pre-computed values
-
-            const uint64_t ridx = (allzero_k) ? l1 : // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
-                                                k;   // if (nonzero_k)  ridx=k  // fabs(k-l1), with l1==0
-
-            // keep reads in the same place to maximize GPU warp performance
-            my_stripe = sums[ridx];
-          
-          } else {
-            // both sides non zero, use the explicit but slow approach
-
-            my_stripe = 0.0;
-
-#pragma acc loop seq
-            for (uint64_t emb=0; emb<filled_embs; emb++) {
-                const uint64_t offset = n_samples_r * emb;
-
-                TFloat u1 = embedded_proportions[offset + k];
-                TFloat v1 = embedded_proportions[offset + l1];
-                TFloat diff1 = u1 - v1;
-                TFloat length = lengths[emb];
-
-                my_stripe     += fabs(diff1) * length;
-            }
-
-          }
-
-          // keep all writes in a single place, to maximize GPU warp performance
-          dm_stripe[k]       += my_stripe;
-       }
-
+       NormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,dm_stripes_total_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   k, l1);
       } // for ik
      } // for stripe
     } // for sk
+#else
+    // tilling helps with better cache reuse without the need of multiple cores
+    const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+    // use dynamic scheduling due to non-homogeneity in the loop
+    // Use a moderate block to prevent trashing but still have some cache reuse
+#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+    for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
+     for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+      // tile to maximize cache reuse
+      for(uint64_t is = 0; is < step_size ; is++) {
+       const uint64_t stripe = start_idx+ss*step_size + is;
+       if (stripe<stop_idx) { // else past limit
+
+      // SIMD-based CPUs need help with vectorization
+      const uint64_t idx = (stripe-start_idx) * n_samples_r;
+      uint64_t ks = sk*step_size;
+      const uint64_t kmax = std::min(ks+step_size,n_samples);
+      uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+
+      while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
+       NormalizedWeighted8<TFloat>(
+                                   dm_stripes_buf,dm_stripes_total_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks, ls);
+       ks+=8;
+       ls = (ls + 8)%n_samples; // wraparound
+      } // for ks+=8
+
+      while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
+       NormalizedWeighted4<TFloat>(
+                                   dm_stripes_buf,dm_stripes_total_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks, ls);
+       ks+=4;
+       ls = (ls + 4)%n_samples; // wraparound
+      } // for ks+=4
+
+      // deal with any leftovers in serial way
+      for( ; ks < kmax; ks++ ) {
+       NormalizedWeighted1<TFloat>(
+                                   dm_stripes_buf,dm_stripes_total_buf,
+                                   zcheck, sums, embedded_proportions, lengths,
+                                   filled_embs,idx, n_samples_r,
+                                   ks, ls);
+       ls = (ls + 1)%n_samples; // wraparound
+      } // for ks
+
+       } // if stripe
+      } // for is
+     } // for sk
+    } // for ss
+#endif
 
 #ifdef _OPENACC
    // next iteration will use the alternative space
@@ -535,6 +1190,320 @@ void SUCMP_NM::UnifracVawGeneralizedTask<TFloat>::_run(unsigned int filled_embs,
 #endif
 }
 
+// Single step in computing Unweighted Unifrac
+
+template<class TFloat>
+static inline void UnweightedOneSide(
+                      bool   * const __restrict__ zcheck,
+                      TFloat * const  __restrict__ stripe_sums,
+                      const TFloat * const   __restrict__ sums,
+                      const uint64_t * const __restrict__ embedded_proportions,
+                      const unsigned int filled_embs_els_round,
+                      const uint64_t n_samples_r,
+                      const uint64_t kl) {
+            bool all_zeros=true;
+            TFloat my_stripe = 0.0;
+
+            for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+                const uint64_t offset = n_samples_r * emb_el;
+                const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+                uint64_t o1 = embedded_proportions[offset + kl];
+
+                if (o1==0) {  // zeros are prevalent
+                    // nothing to do
+                } else {
+                 all_zeros = false;
+#ifndef _OPENACC
+                 // CPU/SIMD faster if we check for partial compute
+                 if (((uint32_t)o1)==0) {
+                    // only high part relevant
+                    //uint64_t x1 = u1 ^ v1;
+                    // With one of the two being 0, xor is just the non-negative one
+
+                    // Use the pre-computed sums
+                    // Each range of 8 lengths has already been pre-computed and stored in psum
+                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                    TFloat esum      = psum[0x400+((uint8_t)(o1 >> 32))] +
+                                       psum[0x500+((uint8_t)(o1 >> 40))] +
+                                       psum[0x600+((uint8_t)(o1 >> 48))] +
+                                       psum[0x700+((uint8_t)(o1 >> 56))];
+                    my_stripe       += esum;
+                 } else if ((o1>>32)==0) {
+                    // only low part relevant
+                    //uint64_t x1 = u1 ^ v1;
+                    // With one of the two being 0, xor is just the non-negative one
+
+                    // Use the pre-computed sums
+                    // Each range of 8 lengths has already been pre-computed and stored in psum
+                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                    TFloat esum      = psum[       (uint8_t)(o1)       ] + 
+                                       psum[0x100+((uint8_t)(o1 >>  8))] +
+                                       psum[0x200+((uint8_t)(o1 >> 16))] +
+                                       psum[0x300+((uint8_t)(o1 >> 24))];
+                    my_stripe       += esum;
+                 } else {
+#else
+                 // GPU/SIMT faster if we just go ahead will full at all times
+                 {
+#endif
+                    //uint64_t x1 = u1 ^ v1;
+                    // With one of the two being 0, xor is just the non-negative one
+
+                    // Use the pre-computed sums
+                    // Each range of 8 lengths has already been pre-computed and stored in psum
+                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                    TFloat esum      = psum[       (uint8_t)(o1)       ] + 
+                                       psum[0x100+((uint8_t)(o1 >>  8))] +
+                                       psum[0x200+((uint8_t)(o1 >> 16))] +
+                                       psum[0x300+((uint8_t)(o1 >> 24))] +
+                                       psum[0x400+((uint8_t)(o1 >> 32))] +
+                                       psum[0x500+((uint8_t)(o1 >> 40))] +
+                                       psum[0x600+((uint8_t)(o1 >> 48))] +
+                                       psum[0x700+((uint8_t)(o1 >> 56))];
+                    my_stripe       += esum;
+                 }
+                }
+            }
+
+            zcheck[kl] = all_zeros;
+            stripe_sums[kl] = my_stripe;
+
+}
+
+#ifdef _OPENACC
+
+template<class TFloat>
+static inline void Unweighted1(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      TFloat * const __restrict__ dm_stripes_total_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const   __restrict__ stripe_sums,
+                      const TFloat * const   __restrict__ sums,
+                      const uint64_t * const __restrict__ embedded_proportions,
+                      const unsigned int filled_embs_els_round,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t k,
+                      const uint64_t l1) {
+       const bool allzero_k = zcheck[k];
+       const bool allzero_l1 = zcheck[l1];
+
+       if (allzero_k && allzero_l1) {
+          // nothing to do, would have to add 0
+       } else {
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+          //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+          bool did_update = false;
+          TFloat my_stripe = 0.0;
+          TFloat my_stripe_total = 0.0;
+
+          if (allzero_k || allzero_l1) {
+            // with one side zero, | and ^ are no-ops
+            const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-sero onea
+            my_stripe = stripe_sums[kl];
+            did_update = (my_stripe!=0.0);
+            my_stripe_total = my_stripe;
+          } else {
+            // we need both sides
+            for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+                const uint64_t offset = n_samples_r * emb_el;
+                const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+                uint64_t u1 = embedded_proportions[offset + k];
+                uint64_t v1 = embedded_proportions[offset + l1];
+                uint64_t o1 = u1 | v1;
+
+                if (o1!=0) {  // zeros are prevalent
+                    did_update=true;
+                    uint64_t x1 = u1 ^ v1;
+
+                    // Use the pre-computed sums
+                    // Each range of 8 lengths has already been pre-computed and stored in psum
+                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                    my_stripe_total += psum[       (uint8_t)(o1)       ] + 
+                                       psum[0x100+((uint8_t)(o1 >>  8))] +
+                                       psum[0x200+((uint8_t)(o1 >> 16))] +
+                                       psum[0x300+((uint8_t)(o1 >> 24))] +
+                                       psum[0x400+((uint8_t)(o1 >> 32))] +
+                                       psum[0x500+((uint8_t)(o1 >> 40))] +
+                                       psum[0x600+((uint8_t)(o1 >> 48))] +
+                                       psum[0x700+((uint8_t)(o1 >> 56))];
+                    my_stripe       += psum[       (uint8_t)(x1)       ] + 
+                                       psum[0x100+((uint8_t)(x1 >>  8))] +
+                                       psum[0x200+((uint8_t)(x1 >> 16))] +
+                                       psum[0x300+((uint8_t)(x1 >> 24))] +
+                                       psum[0x400+((uint8_t)(x1 >> 32))] +
+                                       psum[0x500+((uint8_t)(x1 >> 40))] +
+                                       psum[0x600+((uint8_t)(x1 >> 48))] +
+                                       psum[0x700+((uint8_t)(x1 >> 56))];
+                }
+            }
+          }
+
+          if (did_update) {
+            dm_stripe[k]       += my_stripe;
+            dm_stripe_total[k] += my_stripe_total;
+          }
+       } // (allzero_k && allzero_l1)
+
+}
+#else
+
+template<class TFloat>
+static inline void Unweighted1(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      TFloat * const __restrict__ dm_stripes_total_buf,
+                      const bool   * const __restrict__ zcheck,
+                      const TFloat * const   __restrict__ stripe_sums,
+                      const TFloat * const   __restrict__ sums,
+                      const uint64_t * const __restrict__ embedded_proportions,
+                      const unsigned int filled_embs_els_round,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t k,
+                      const uint64_t l1) {
+       const bool allzero_k = zcheck[k];
+       const bool allzero_l1 = zcheck[l1];
+
+       if (allzero_k && allzero_l1) {
+          // nothing to do, would have to add 0
+       } else {
+          TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+          TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+          //TFloat *dm_stripe = dm_stripes[stripe];
+          //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+          bool did_update = false;
+          TFloat my_stripe = 0.0;
+          TFloat my_stripe_total = 0.0;
+
+          if (allzero_k || allzero_l1) {
+            // with one side zero, | and ^ are no-ops
+            const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-sero onea
+            my_stripe = stripe_sums[kl];
+            my_stripe_total = my_stripe;
+            did_update = (my_stripe!=0.0);
+          } else {
+            // we need both sides
+            for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+                const uint64_t offset = n_samples_r * emb_el;
+                const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+                uint64_t u1 = embedded_proportions[offset + k];
+                uint64_t v1 = embedded_proportions[offset + l1];
+                uint64_t o1 = u1 | v1;
+                uint64_t x1 = u1 ^ v1;
+
+                if (o1==0) {  // zeros are prevalent
+                    // nothing to do
+                } else if (((uint32_t)o1)==0) {
+                    // only high part relevant
+                    did_update=true;
+
+                    // Use the pre-computed sums
+                    // Each range of 8 lengths has already been pre-computed and stored in psum
+                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                    my_stripe_total += psum[0x400+((uint8_t)(o1 >> 32))] +
+                                       psum[0x500+((uint8_t)(o1 >> 40))] +
+                                       psum[0x600+((uint8_t)(o1 >> 48))] +
+                                       psum[0x700+((uint8_t)(o1 >> 56))];
+                    my_stripe       += psum[0x400+((uint8_t)(x1 >> 32))] +
+                                       psum[0x500+((uint8_t)(x1 >> 40))] +
+                                       psum[0x600+((uint8_t)(x1 >> 48))] +
+                                       psum[0x700+((uint8_t)(x1 >> 56))];
+                } else if ((o1>>32)==0) {
+                    // only low part relevant
+                    did_update=true;
+
+                    // Use the pre-computed sums
+                    // Each range of 8 lengths has already been pre-computed and stored in psum
+                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                    my_stripe_total += psum[       (uint8_t)(o1)       ] + 
+                                       psum[0x100+((uint8_t)(o1 >>  8))] +
+                                       psum[0x200+((uint8_t)(o1 >> 16))] +
+                                       psum[0x300+((uint8_t)(o1 >> 24))];
+                    my_stripe       += psum[       (uint8_t)(x1)       ] + 
+                                       psum[0x100+((uint8_t)(x1 >>  8))] +
+                                       psum[0x200+((uint8_t)(x1 >> 16))] +
+                                       psum[0x300+((uint8_t)(x1 >> 24))];
+
+                } else {
+                    did_update=true;
+
+                    // Use the pre-computed sums
+                    // Each range of 8 lengths has already been pre-computed and stored in psum
+                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                    my_stripe_total += psum[       (uint8_t)(o1)       ] + 
+                                       psum[0x100+((uint8_t)(o1 >>  8))] +
+                                       psum[0x200+((uint8_t)(o1 >> 16))] +
+                                       psum[0x300+((uint8_t)(o1 >> 24))] +
+                                       psum[0x400+((uint8_t)(o1 >> 32))] +
+                                       psum[0x500+((uint8_t)(o1 >> 40))] +
+                                       psum[0x600+((uint8_t)(o1 >> 48))] +
+                                       psum[0x700+((uint8_t)(o1 >> 56))];
+                    my_stripe       += psum[       (uint8_t)(x1)       ] + 
+                                       psum[0x100+((uint8_t)(x1 >>  8))] +
+                                       psum[0x200+((uint8_t)(x1 >> 16))] +
+                                       psum[0x300+((uint8_t)(x1 >> 24))] +
+                                       psum[0x400+((uint8_t)(x1 >> 32))] +
+                                       psum[0x500+((uint8_t)(x1 >> 40))] +
+                                       psum[0x600+((uint8_t)(x1 >> 48))] +
+                                       psum[0x700+((uint8_t)(x1 >> 56))];
+                }
+            }
+          } // (allzero_k || allzero_l1)
+
+          if (did_update) {
+              dm_stripe[k]       += my_stripe;
+              dm_stripe_total[k] += my_stripe_total;
+          }
+       } // (allzero_k && allzero_l1)
+
+}
+#endif
+
+// check for zero values
+template<class TFloat, class T>
+static inline void UnweightedZerosAndSums(
+                      bool   * const __restrict__ zcheck,
+                      TFloat * const  __restrict__ stripe_sums,
+                      const TFloat * const   __restrict__ el_sums,
+                      const T * const __restrict__ embedded_proportions,
+                      const unsigned int filled_embs_els_round,
+                      const uint64_t n_samples,
+                      const uint64_t n_samples_r) {
+#ifdef _OPENACC
+#pragma acc parallel loop gang vector present(embedded_proportions,zcheck,el_sums,stripe_sums)
+#else
+#pragma omp parallel for default(shared)
+#endif
+    for(uint64_t k=0; k<n_samples; k++) {
+            UnweightedOneSide(
+                      zcheck, stripe_sums,
+                      el_sums, embedded_proportions,
+                      filled_embs_els_round, n_samples_r,
+                      k);
+    }
+}
+
 template<class TFloat>
 void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
     const uint64_t start_idx = this->task_p->start;
@@ -548,6 +1517,8 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
     TFloat * const __restrict__ dm_stripes_total_buf = this->dm_stripes_total.buf;
 
     TFloat * const __restrict__ sums = this->sums;
+    bool   * const __restrict__ zcheck = this->zcheck;
+    TFloat * const __restrict__ stripe_sums = this->stripe_sums;
 
     const uint64_t step_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::step_size;
     const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
@@ -557,7 +1528,6 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
 
     const uint64_t filled_embs_els_round = (filled_embs+63)/64;
 
-
     // pre-compute sums of length elements, since they are likely to be accessed many times
     // We will use a 8-bit map, to keep it small enough to keep in L1 cache
 #ifdef _OPENACC
@@ -613,79 +1583,68 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
         }
     }
 
+#pragma acc wait
+    // check for zero values and compute stripe sums
+    UnweightedZerosAndSums(zcheck, stripe_sums,
+                           sums, embedded_proportions,
+                           filled_embs_els_round, n_samples, n_samples_r);
+
     // point of thread
 #ifdef _OPENACC
-#pragma acc wait
     const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
-#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
-#else
-    // use dynamic scheduling due to non-homogeneity in the loop
-#pragma omp parallel for schedule(dynamic,1) default(shared)
-#endif
+#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums,zcheck,stripe_sums) async
     for(uint64_t sk = 0; sk < sample_steps ; sk++) {
       for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
         for(uint64_t ik = 0; ik < step_size ; ik++) {
             const uint64_t k = sk*step_size + ik;
             const uint64_t idx = (stripe-start_idx) * n_samples_r;
-            TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
-            TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-            //TFloat *dm_stripe = dm_stripes[stripe];
-            //TFloat *dm_stripe_total = dm_stripes_total[stripe];
 
             if (k>=n_samples) continue; // past the limit
 
             const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
 
-            bool did_update = false;
-            TFloat my_stripe = 0.0;
-            TFloat my_stripe_total = 0.0;
-
-#pragma acc loop seq
-            for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
-                const uint64_t offset = n_samples_r * emb_el;
-                const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
-
-                uint64_t u1 = embedded_proportions[offset + k];
-                uint64_t v1 = embedded_proportions[offset + l1];
-                uint64_t o1 = u1 | v1;
-
-                if (o1!=0) {  // zeros are prevalent
-                    did_update=true;
-                    uint64_t x1 = u1 ^ v1;
-
-                    // Use the pre-computed sums
-                    // Each range of 8 lengths has already been pre-computed and stored in psum
-                    // Since embedded_proportions packed format is in 64-bit format for performance reasons
-                    //    we need to add the 8 sums using the four 8-bits for addressing inside psum
-
-                    my_stripe       += psum[              (x1 & 0xff)] + 
-                                   psum[0x100+((x1 >>  8) & 0xff)] +
-                                   psum[0x200+((x1 >> 16) & 0xff)] +
-                                   psum[0x300+((x1 >> 24) & 0xff)] +
-                                   psum[0x400+((x1 >> 32) & 0xff)] +
-                                   psum[0x500+((x1 >> 40) & 0xff)] +
-                                   psum[0x600+((x1 >> 48) & 0xff)] +
-                                   psum[0x700+((x1 >> 56)       )];
-                    my_stripe_total += psum[              (o1 & 0xff)] +
-                                   psum[0x100+((o1 >>  8) & 0xff)] +
-                                   psum[0x200+((o1 >> 16) & 0xff)] +
-                                   psum[0x300+((o1 >> 24) & 0xff)] +
-                                   psum[0x400+((o1 >> 32) & 0xff)] +
-                                   psum[0x500+((o1 >> 40) & 0xff)] +
-                                   psum[0x600+((o1 >> 48) & 0xff)] +
-                                   psum[0x700+((o1 >> 56)       )];
-                }
-            }
-
-            if (did_update) {
-              dm_stripe[k]       += my_stripe;
-              dm_stripe_total[k] += my_stripe_total;
-            }
-
+            Unweighted1<TFloat>(
+                                dm_stripes_buf,dm_stripes_total_buf,
+                                zcheck, stripe_sums,
+                                sums, embedded_proportions,
+                                filled_embs_els_round,idx, n_samples_r,
+                                k, l1);
         }
 
       }
     }
+#else
+    // tilling helps with better cache reuse without the need of multiple cores
+    const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+    // use dynamic scheduling due to non-homogeneity in the loop
+    // Use a moderate block to prevent trashing but still have some cache reuse
+#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+    for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
+     for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+       // tile to maximize cache reuse
+       for(uint64_t is = 0; is < step_size ; is++) {
+        const uint64_t stripe = start_idx+ss*step_size + is;
+        if (stripe<stop_idx) { // esle past limit}
+         for(uint64_t ik = 0; ik < step_size ; ik++) {
+           const uint64_t k = sk*step_size + ik;
+           if (k<n_samples) { // elsepast the limit
+            const uint64_t idx = (stripe-start_idx) * n_samples_r;
+            const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+
+            Unweighted1<TFloat>(
+                                dm_stripes_buf,dm_stripes_total_buf,
+                                zcheck, stripe_sums,
+                                sums, embedded_proportions,
+                                filled_embs_els_round,idx, n_samples_r,
+                                k, l1);
+           } // if k
+         } // for ik
+        } // if stripe
+       } // for is
+      } // for sk
+    } // for ss
+#endif
 
 #ifdef _OPENACC
    // next iteration will use the alternative space


=====================================
src/unifrac_task.hpp
=====================================
@@ -297,9 +297,8 @@ namespace SUCMP_NM {
     template<class TFloat, class TEmb>
     class UnifracTask : public UnifracTaskBase<TFloat,TEmb> {
       protected:
-        // Use one cache line on CPU
-        // On GPU, shaing a cache line is actually a good thing
-        static const unsigned int step_size = 16*4/sizeof(TFloat);
+        // Use a moderate sized step, a few cache lines
+        static const unsigned int step_size = 64*4/sizeof(TFloat);
 
 #ifdef _OPENACC
         // Use as big vector size as we can, to maximize cache line reuse
@@ -403,32 +402,45 @@ namespace SUCMP_NM {
     template<class TFloat>
     class UnifracUnweightedTask : public UnifracTask<TFloat,uint64_t> {
       public:
+        static const unsigned int step_size = 64*4/sizeof(TFloat);
+
         static const unsigned int RECOMMENDED_MAX_EMBS = UnifracTask<TFloat,uint64_t>::RECOMMENDED_MAX_EMBS_BOOL;
 
         // Note: _max_emb MUST be multiple of 64
         UnifracUnweightedTask(std::vector<double*> &_dm_stripes, std::vector<double*> &_dm_stripes_total, unsigned int _max_embs, const su::task_parameters* _task_p)
         : UnifracTask<TFloat, uint64_t>(_dm_stripes,_dm_stripes_total,_max_embs,_task_p) 
         {
+          const unsigned int n_samples = this->task_p->n_samples;
           const unsigned int bsize = _max_embs*(0x400/32);
+          zcheck = NULL;
+          stripe_sums = NULL;
           sums = NULL;
+          posix_memalign((void **)&zcheck, 4096, sizeof(bool) * n_samples);
+          posix_memalign((void **)&stripe_sums, 4096, sizeof(TFloat) *  n_samples);
           posix_memalign((void **)&sums, 4096, sizeof(TFloat) * bsize);
-#pragma acc enter data create(sums[:bsize])
+#pragma acc enter data create(zcheck[:n_samples],stripe_sums[:n_samples],sums[:bsize])
         }
 
         virtual ~UnifracUnweightedTask()
         {
 #ifdef _OPENACC
-           const unsigned int bsize = this->max_embs*(0x400/32);
-#pragma acc exit data delete(sums[:bsize])
+          const unsigned int n_samples = this->task_p->n_samples;
+          const unsigned int bsize = this->max_embs*(0x400/32);
+#pragma acc exit data delete(sums[:bsize],stripe_sums[:n_samples],zcheck[:n_samples])
 #endif
           free(sums);
+          free(stripe_sums);
+          free(zcheck);
         }
 
         virtual void run(unsigned int filled_embs, const TFloat * __restrict__ length) {_run(filled_embs, length);}
 
         void _run(unsigned int filled_embs, const TFloat * __restrict__ length);
       private:
-        TFloat *sums; // temp buffer
+        // temp buffers
+        TFloat *sums;
+        bool     *zcheck;
+        TFloat   *stripe_sums;
     };
     template<class TFloat>
     class UnifracGeneralizedTask : public UnifracTask<TFloat,TFloat> {



View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/a95340b75f55ef71eb2f6941fc521148d1b3299c...a5a4233e998a25396ccfa7e6bb7a6118bc01b877

-- 
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/a95340b75f55ef71eb2f6941fc521148d1b3299c...a5a4233e998a25396ccfa7e6bb7a6118bc01b877
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221130/be09996d/attachment-0001.htm>