[med-svn] [Git][med-team/unifrac-tools][master] 6 commits: Missed the planned update to new upstream version
Andreas Tille (@tille)
gitlab at salsa.debian.org
Wed Nov 30 07:54:36 GMT 2022
Andreas Tille pushed to branch master at Debian Med / unifrac-tools
Commits:
a2be7a89 by Andreas Tille at 2022-11-30T08:45:31+01:00
Missed the planned update to new upstream version
- - - - -
2ad1d1b3 by Andreas Tille at 2022-11-30T08:46:02+01:00
routine-update: New upstream version
- - - - -
2eb2aa26 by Andreas Tille at 2022-11-30T08:46:03+01:00
New upstream version 1.1.3
- - - - -
2550cc5f by Andreas Tille at 2022-11-30T08:46:29+01:00
Update upstream source from tag 'upstream/1.1.3'
Update to upstream version '1.1.3'
with Debian dir df68d79f79d28cba84df6ee90e3f48087801825f
- - - - -
5c466d9c by Andreas Tille at 2022-11-30T08:49:03+01:00
Refresh patches
- - - - -
a5a4233e by Andreas Tille at 2022-11-30T08:50:39+01:00
Upload to unstable
- - - - -
16 changed files:
- .github/workflows/main.yml
- README.md
- debian/changelog
- debian/patches/baseline.patch
- debian/patches/remove_non-free_headers.patch
- src/Makefile
- src/api.cpp
- + src/testdata/compare_unifrac_matrix.py
- + src/testdata/compare_unifrac_pcoa.py
- + src/testdata/test500.biom
- + src/testdata/test500.tre
- + src/testdata/test500.unweighted_fp32.f.h5
- + src/testdata/test500.weighted_normalized_fp32.f.h5
- + src/testdata/test500.weighted_unnormalized_fp32.h5
- src/unifrac_task.cpp
- src/unifrac_task.hpp
Changes:
=====================================
.github/workflows/main.yml
=====================================
@@ -34,7 +34,7 @@ jobs:
if [[ "$(uname -s)" == "Linux" ]];
then
- conda create --yes -n unifrac -c conda-forge -c bioconda gxx_linux-64 hdf5 mkl-include lz4 hdf5-static libcblas liblapacke make curl
+ conda create --yes -n unifrac -c conda-forge -c bioconda gxx_linux-64 hdf5 mkl-include lz4 zlib hdf5-static libcblas liblapacke make curl
else
conda create --yes -n unifrac -c conda-forge -c bioconda clangxx_osx-64 hdf5 mkl-include lz4 hdf5-static libcblas liblapacke make curl
fi
@@ -64,7 +64,11 @@ jobs:
if [[ "$(uname -s)" == "Linux" ]];
then
rm -f ~/.R/Makevars
- conda install -c conda-forge r-base
+ conda install --yes -c conda-forge r-base
+ unset CXXFLAGS
+ unset CFLAGS
+ unset DEBUG_CXXFLAGS
+ unset DEBUG_CFLAGS
# the r-base package has a broken dependency
ln -s $CONDA_PREFIX/lib/libreadline.so $CONDA_PREFIX/lib/libreadline.so.6
R -e 'install.packages("Rcpp", repos="http://lib.stat.cmu.edu/R/CRAN/")'
@@ -85,7 +89,41 @@ jobs:
pushd test
./capi_test 1
popd
-
+ pushd src/testdata
+ conda install --yes -c conda-forge h5py
+ # weighted_unnormalized
+ time ssu -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
+ ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
+ ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
+ rm -f t1.h5
+ time ssu -f -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
+ # matrrix will be different, but PCOA similar
+ ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
+ rm -f t1.h5
+ time ssu -m weighted_unnormalized -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
+ # minimal precision loss between fp32 and fp64
+ ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
+ ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
+ rm -f t1.h5
+ # weighted_normalized
+ time ssu -f -m weighted_normalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
+ ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
+ ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
+ rm -f t1.h5
+ time ssu -f -m weighted_normalized -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
+ ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
+ ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
+ rm -f t1.h5
+ # unweighted
+ time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
+ ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
+ ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
+ rm -f t1.h5
+ time ssu -f -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
+ ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
+ ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
+ rm -f t1.h5
+ popd
- name: Sanity checks
shell: bash -l {0}
run: |
=====================================
README.md
=====================================
@@ -12,6 +12,7 @@ A detailed description of the Strided State UniFrac algorithm can be found in [M
ssu
For UniFrac, please see:
+ Sfiligoi et al. mSystems 2022; DOI: 10.1128/msystems.00028-22
McDonald et al. Nature Methods 2018; DOI: 10.1038/s41592-018-0187-8
Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005
Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06
@@ -107,6 +108,14 @@ If more than one GPU is present, one can select the one to use by setting:
Note that there is no GPU support for MacOS.
+## Additional timing information
+
+When evaluating the performance of Unifrac it is sometimes necessary to distinguish
+the time spent interacting with the data from the Unifrac compute proper.
+Additional informational messages can be enabled by setting:
+
+ export UNIFRAC_TIMING_INFO=Y
+
# Examples of use
Below are a few light examples of different ways to use this library.
=====================================
debian/changelog
=====================================
@@ -1,8 +1,9 @@
-unifrac-tools (1.1.1-5) unstable; urgency=medium
+unifrac-tools (1.1.3-1) unstable; urgency=medium
* Delete broken lintian-overrides
+ * New upstream version
- -- Andreas Tille <tille at debian.org> Wed, 30 Nov 2022 08:40:57 +0100
+ -- Andreas Tille <tille at debian.org> Wed, 30 Nov 2022 08:49:15 +0100
unifrac-tools (1.1.1-4) unstable; urgency=medium
=====================================
debian/patches/baseline.patch
=====================================
@@ -5,14 +5,14 @@ Forwarded: not-needed
Last-Update: 2022-10-02
---
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- unifrac-tools.orig/src/Makefile
-+++ unifrac-tools/src/Makefile
-@@ -73,8 +73,6 @@
+--- a/src/Makefile
++++ b/src/Makefile
+@@ -73,8 +73,6 @@ ifneq (,$(findstring pgi,$(COMPILER)))
else
ifeq ($(PERFORMING_CONDA_BUILD),True)
- CPPFLAGS += -mtune=generic
+ CPPFLAGS += -mtune=generic -mavx
- else
-- CPPFLAGS += -mfma -march=native
+- CPPFLAGS += -mavx -mfma -march=native
endif
endif
=====================================
debian/patches/remove_non-free_headers.patch
=====================================
@@ -28,4 +28,4 @@ Description: mkl_cblas.h and mkl_lapacke.h are in libmkl-dev but this is non-fre
+BLASLIB=-llapacke -lblas
- CPPFLAGS += -Wall -std=c++11 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+ CPPFLAGS += -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
=====================================
src/Makefile
=====================================
@@ -68,13 +68,13 @@ endif
ifneq (,$(findstring pgi,$(COMPILER)))
ifeq ($(PERFORMING_CONDA_BUILD),True)
- CPPFLAGS += -tp=sandybridge
+ CPPFLAGS += -tp=sandybridge -Mvect=simd:256
endif
else
ifeq ($(PERFORMING_CONDA_BUILD),True)
- CPPFLAGS += -mtune=generic
+ CPPFLAGS += -mtune=generic -mavx
else
- CPPFLAGS += -mfma -march=native
+ CPPFLAGS += -mavx -mfma -march=native
endif
endif
@@ -86,7 +86,7 @@ endif
BLASLIB=-llapacke -lcblas
-CPPFLAGS += -Wall -std=c++11 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+CPPFLAGS += -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
all: api main install
@@ -148,6 +148,8 @@ rapi_test: main
echo CXX1X=h5c++ > ~/.R/Makevars
echo CXX=h5c++ >> ~/.R/Makevars
echo CC=h5c++ >> ~/.R/Makevars
+ echo CFLAGS= >> ~/.R/Makevars
+ echo CXXFLAGS= >> ~/.R/Makevars
echo LDFLAGS= -llz4 $(BLASLIB) >> ~/.R/Makevars
rm -f *.o
Rscript R_interface/rapi_test.R
=====================================
src/api.cpp
=====================================
@@ -13,15 +13,28 @@
#include <unistd.h>
#include <sys/mman.h>
#include <lz4.h>
+#include <time.h>
#define MMAP_FD_MASK 0x0fff
#define MMAP_FLAG 0x1000
-/* O_NOATIME is defined at fcntl.h when supported */
-#ifndef O_NOATIME
-#define O_NOATIME 0
-#endif
+#define SETUP_TDBG(method) const char *tdbg_method=method; \
+ bool print_tdbg = false;\
+ if (const char* env_p = std::getenv("UNIFRAC_TIMING_INFO")) { \
+ print_tdbg = true; \
+ std::string env_s(env_p); \
+ if ((env_s=="NO") || (env_s=="N") || (env_s=="no") || (env_s=="n") || \
+ (env_s=="NEVER") || (env_s=="never")) print_tdbg = false; \
+ } \
+ time_t tgdb_t0; time(&tgdb_t0); \
+ if(print_tdbg) printf("INFO (unifrac): Starting %s\n",tdbg_method);
+
+#define TDBG_STEP(sname) if(print_tdbg) {\
+ time_t tgdb_t1; time(&tgdb_t1); \
+ printf("INFO (unifrac): dt %4i : Completed %s.%s\n",(int)(tgdb_t1-tgdb_t0),tdbg_method,sname); \
+ tgdb_t0 = tgdb_t1; \
+ }
#define CHECK_FILE(filename, err) if(!is_file_exists(filename)) { \
return err; \
@@ -198,8 +211,12 @@ void initialize_mat_full_no_biom_T(TMat* &result, const char* const * sample_ids
} else {
std::string mmap_template(mmap_dir);
mmap_template+="/su_mmap_XXXXXX";
- // note: mkostemp will update mmap_template in place
+ // note: mkstemp/mkostemp will update mmap_template in place
+#ifdef O_NOATIME
int fd=mkostemp((char *) mmap_template.c_str(), O_NOATIME );
+#else
+ int fd=mkstemp((char *) mmap_template.c_str() );
+#endif
if (fd<0) {
result->matrix = NULL;
// leave error handling to the caller
@@ -382,7 +399,9 @@ void set_tasks(std::vector<su::task_parameters> &tasks,
compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int nthreads, mat_t** result) {
+ SETUP_TDBG("one_off_inmem")
SYNC_TREE_TABLE(tree, table)
+ TDBG_STEP("sync_tree_table")
SET_METHOD(unifrac_method, unknown_method)
const unsigned int stripe_stop = (table.n_samples + 1) / 2;
@@ -400,11 +419,13 @@ compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree,
set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
+ TDBG_STEP("process_stripes")
initialize_mat(*result, table, true); // true -> is_upper_triangle
for(unsigned int tid = 0; tid < threads.size(); tid++) {
su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop);
}
+ TDBG_STEP("stripes_to_condensed_form")
destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0);
return okay;
@@ -415,11 +436,13 @@ compute_status partial(const char* biom_filename, const char* tree_filename,
unsigned int nthreads, unsigned int stripe_start, unsigned int stripe_stop,
partial_mat_t** result) {
+ SETUP_TDBG("partial")
CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
SET_METHOD(unifrac_method, unknown_method)
PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
+ TDBG_STEP("load_files")
// we resize to the largest number of possible stripes even if only computing
// partial, however we do not allocate arrays for non-computed stripes so
// there is a little memory waste here but should be on the order of
@@ -443,7 +466,9 @@ compute_status partial(const char* biom_filename, const char* tree_filename,
set_tasks(tasks, alpha, table.n_samples, stripe_start, stripe_stop, bypass_tips, nthreads);
su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
+ TDBG_STEP("process_stripes")
initialize_partial_mat(*result, table, dm_stripes, stripe_start, stripe_stop, true); // true -> is_upper_triangle
+ TDBG_STEP("partial_mat")
destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, stripe_start, stripe_stop);
return okay;
@@ -451,14 +476,17 @@ compute_status partial(const char* biom_filename, const char* tree_filename,
compute_status faith_pd_one_off(const char* biom_filename, const char* tree_filename,
r_vec** result){
+ SETUP_TDBG("faith_pd_one_off")
CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
+ TDBG_STEP("load_files")
initialize_results_vec(*result, table);
// compute faithpd
su::faith_pd(table, tree_sheared, std::ref((*result)->values));
+ TDBG_STEP("faith_pd")
return okay;
}
@@ -466,10 +494,12 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file
compute_status one_off(const char* biom_filename, const char* tree_filename,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int nthreads, mat_t** result) {
+ SETUP_TDBG("one_off")
CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
PARSE_TREE_TABLE(tree_filename, table_filename)
+ TDBG_STEP("load_files")
// condensed form
return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
}
@@ -481,6 +511,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
bool bypass_tips, unsigned int nthreads,
const char *mmap_dir,
TMat** result) {
+ SETUP_TDBG("one_off_matrix_inmem")
if (mmap_dir!=NULL) {
if (mmap_dir[0]==0) mmap_dir = NULL; // easier to have a simple test going on
}
@@ -488,6 +519,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
SET_METHOD(unifrac_method, unknown_method)
SYNC_TREE_TABLE(tree, table)
+ TDBG_STEP("sync_tree_table")
const unsigned int stripe_stop = (table.n_samples + 1) / 2;
partial_mat_t *partial_mat = NULL;
@@ -501,6 +533,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
+ TDBG_STEP("process_stripes")
initialize_partial_mat(partial_mat, table, dm_stripes, 0, stripe_stop, true); // true -> is_upper_triangle
if ((partial_mat==NULL) || (partial_mat->stripes==NULL) || (partial_mat->sample_ids==NULL) ) {
fprintf(stderr, "Memory allocation error! (initialize_partial_mat)\n");
@@ -527,6 +560,7 @@ compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
(4096/sizeof(TReal)); /* make it larger for mmap, as the limiting factor is swapping */
su::stripes_to_matrix_T<TReal>(ps, partial_mat->n_samples, partial_mat->stripe_total, (*result)->matrix, tile_size);
}
+ TDBG_STEP("stripes_to_matrix")
destroy_partial_mat(&partial_mat);
return okay;
@@ -538,9 +572,11 @@ compute_status one_off_matrix(const char* biom_filename, const char* tree_filena
bool bypass_tips, unsigned int nthreads,
const char *mmap_dir,
mat_full_fp64_t** result) {
+ SETUP_TDBG("one_off_matrix")
CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
PARSE_TREE_TABLE(tree_filename, biom_filename)
+ TDBG_STEP("load_files")
return one_off_matrix_T<double,mat_full_fp64_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
}
@@ -549,9 +585,11 @@ compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_f
bool bypass_tips, unsigned int nthreads,
const char *mmap_dir,
mat_full_fp32_t** result) {
+ SETUP_TDBG("one_off_matrix_fp32")
CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
PARSE_TREE_TABLE(tree_filename, biom_filename)
+ TDBG_STEP("load_files")
return one_off_matrix_T<float,mat_full_fp32_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
}
=====================================
src/testdata/compare_unifrac_matrix.py
=====================================
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+# Requires h5py library
+# Usage:
+# compare_unifrac_matrix.py fname1 fname2 precisio
+#
+# Example:
+# compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 a.h5 1.e-5
+#
+import h5py
+import sys
+
+fname1=sys.argv[1]
+fname2=sys.argv[2]
+prec=float(sys.argv[3])
+
+d1= h5py.File(fname1)['matrix'][:,:]
+d2= h5py.File(fname2)['matrix'][:,:]
+
+l1=len(d1)
+l2=len(d2)
+if (l1!=l2):
+ print("The two files do not have the same number of rows")
+ sys.exit(1)
+
+for r in range(l1):
+ m=max(abs(d1[r,:]-d2[r,:]))
+ if (m>prec):
+ print("Diff too large at row %i: %e>%e"%(r,m,prec))
+ sys.exit(2)
+print("Files match within precision")
=====================================
src/testdata/compare_unifrac_pcoa.py
=====================================
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+# Requires h5py library
+# Usage:
+# compare_unifrac_pcoa.py fname1 fname2 elements precision
+#
+# Example:
+# compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 a.h5 3 0.1
+#
+import h5py
+import sys
+
+fname1=sys.argv[1]
+fname2=sys.argv[2]
+els=int(sys.argv[3])
+prec=float(sys.argv[4])
+
+d1= h5py.File(fname1)['pcoa_eigvals'][:els]
+d2= h5py.File(fname2)['pcoa_eigvals'][:els]
+
+l1=len(d1)
+l2=len(d2)
+if (l1!=l2):
+ print("The two files do not have the same number of elements")
+ sys.exit(1)
+
+m=max(abs((d1[:]-d2[:])/d1[:]))
+if (m>prec):
+ print("Diff too large: %e>%e "%(m,prec))
+ print("Raw data:",d1,d2, abs((d1[:]-d2[:])/d1[:]))
+ sys.exit(2)
+
+print("Files match within precision")
=====================================
src/testdata/test500.biom
=====================================
Binary files /dev/null and b/src/testdata/test500.biom differ
=====================================
src/testdata/test500.tre
=====================================
The diff for this file was not included because it is too large.
=====================================
src/testdata/test500.unweighted_fp32.f.h5
=====================================
Binary files /dev/null and b/src/testdata/test500.unweighted_fp32.f.h5 differ
=====================================
src/testdata/test500.weighted_normalized_fp32.f.h5
=====================================
Binary files /dev/null and b/src/testdata/test500.weighted_normalized_fp32.f.h5 differ
=====================================
src/testdata/test500.weighted_unnormalized_fp32.h5
=====================================
Binary files /dev/null and b/src/testdata/test500.weighted_unnormalized_fp32.h5 differ
=====================================
src/unifrac_task.cpp
=====================================
@@ -2,29 +2,35 @@
#include "unifrac_task.hpp"
#include <cstdlib>
+#ifndef _OPENACC
+// popcnt returns number of bits set to 1, if supported by fast HW compute
+// else return 0 when v is 0 and max-bits else
+// Note: The user of popcnt in this file must be able to use this modified semantics
+
+#if __AVX__
+#include <immintrin.h>
+static inline int32_t popcnt_u32(uint32_t v) {return _mm_popcnt_u32(v);}
+static inline int64_t popcnt_u64(uint64_t v) {return _mm_popcnt_u64(v);}
+#else
+static inline int32_t popcnt_u32(uint32_t v) {return (v==0) ? 0 : 32;}
+static inline int64_t popcnt_u64(uint64_t v) {return (v==0) ? 0 : 64;}
+// For future non-x86 ports, see https://barakmich.dev/posts/popcnt-arm64-go-asm/
+// and https://stackoverflow.com/questions/38113284/whats-the-difference-between-builtin-popcountll-and-mm-popcnt-u64
+#endif
+#endif
-
-
+// check for zero values and pre-compute single column sums
template<class TFloat>
-void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
- const uint64_t start_idx = this->task_p->start;
- const uint64_t stop_idx = this->task_p->stop;
- const uint64_t n_samples = this->task_p->n_samples;
- const uint64_t n_samples_r = this->dm_stripes.n_samples_r;
-
- // openacc only works well with local variables
- const TFloat * const __restrict__ embedded_proportions = this->embedded_proportions;
- TFloat * const __restrict__ dm_stripes_buf = this->dm_stripes.buf;
-
- bool * const __restrict__ zcheck = this->zcheck;
- TFloat * const __restrict__ sums = this->sums;
-
- const uint64_t step_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::step_size;
- const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
-
- // check for zero values and pre-compute single column sums
+static inline void WeightedZerosAndSums(
+ bool * const __restrict__ zcheck,
+ TFloat * const __restrict__ sums,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t n_samples,
+ const uint64_t n_samples_r) {
#ifdef _OPENACC
-#pragma acc parallel loop present(embedded_proportions,lengths,zcheck,sums)
+#pragma acc parallel loop gang vector present(embedded_proportions,lengths,zcheck,sums)
#else
#pragma omp parallel for default(shared)
#endif
@@ -44,27 +50,172 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
sums[k] = my_sum;
zcheck[k] = all_zeros;
}
+}
+// Single step in computing Weighted part of Unifrac
+template<class TFloat>
+static inline TFloat WeightedVal1(
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t n_samples_r,
+ const uint64_t k,
+ const uint64_t l1) {
+ TFloat my_stripe = 0.0;
- // now do the real compute
-#ifdef _OPENACC
- const unsigned int acc_vector_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::acc_vector_size;
-#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,lengths,zcheck,sums) async
-#else
- // use dynamic scheduling due to non-homogeneity in the loop
-#pragma omp parallel for default(shared) schedule(dynamic,1)
-#endif
- for(uint64_t sk = 0; sk < sample_steps ; sk++) {
- for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
- for(uint64_t ik = 0; ik < step_size ; ik++) {
- const uint64_t k = sk*step_size + ik;
+ for (uint64_t emb=0; emb<filled_embs; emb++) {
+ const uint64_t offset = n_samples_r * emb;
- if (k>=n_samples) continue; // past the limit
+ TFloat u1 = embedded_proportions[offset + k];
+ TFloat v1 = embedded_proportions[offset + l1];
+ TFloat diff1 = u1 - v1;
+ TFloat length = lengths[emb];
+
+ my_stripe += fabs(diff1) * length;
+ } // for emb
- const bool zcheck_k = zcheck[sk]; // due to loop collapse in ACC, must load in here
+ return my_stripe;
+}
- const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+#ifndef _OPENACC
+template<class TFloat>
+static inline void WeightedVal4(
+ TFloat * const __restrict__ stripes,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t n_samples_r,
+ const uint64_t ks,
+ const uint64_t ls) {
+ TFloat my_stripe0 = 0.0;
+ TFloat my_stripe1 = 0.0;
+ TFloat my_stripe2 = 0.0;
+ TFloat my_stripe3 = 0.0;
+ for (uint64_t emb=0; emb<filled_embs; emb++) {
+ const uint64_t offset = n_samples_r * emb;
+
+ TFloat u0 = embedded_proportions[offset + ks];
+ TFloat u1 = embedded_proportions[offset + ks + 1];
+ TFloat u2 = embedded_proportions[offset + ks + 2];
+ TFloat u3 = embedded_proportions[offset + ks + 3];
+ TFloat v0 = embedded_proportions[offset + ls];
+ TFloat v1 = embedded_proportions[offset + ls + 1];
+ TFloat v2 = embedded_proportions[offset + ls + 2];
+ TFloat v3 = embedded_proportions[offset + ls + 3];
+ TFloat length = lengths[emb];
+
+ TFloat diff0 = u0 - v0;
+ TFloat diff1 = u1 - v1;
+ TFloat diff2 = u2 - v2;
+ TFloat diff3 = u3 - v3;
+ TFloat absdiff0 = fabs(diff0);
+ TFloat absdiff1 = fabs(diff1);
+ TFloat absdiff2 = fabs(diff2);
+ TFloat absdiff3 = fabs(diff3);
+
+ my_stripe0 += absdiff0 * length;
+ my_stripe1 += absdiff1 * length;
+ my_stripe2 += absdiff2 * length;
+ my_stripe3 += absdiff3 * length;
+ } // for emb
+
+ stripes[ks] += my_stripe0;
+ stripes[ks + 1] += my_stripe1;
+ stripes[ks + 2] += my_stripe2;
+ stripes[ks + 3] += my_stripe3;
+}
+
+template<class TFloat>
+static inline void WeightedVal8(
+ TFloat * const __restrict__ stripes,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t n_samples_r,
+ const uint64_t ks,
+ const uint64_t ls) {
+ TFloat my_stripe0 = 0.0;
+ TFloat my_stripe1 = 0.0;
+ TFloat my_stripe2 = 0.0;
+ TFloat my_stripe3 = 0.0;
+ TFloat my_stripe4 = 0.0;
+ TFloat my_stripe5 = 0.0;
+ TFloat my_stripe6 = 0.0;
+ TFloat my_stripe7 = 0.0;
+
+ for (uint64_t emb=0; emb<filled_embs; emb++) {
+ const uint64_t offset = n_samples_r * emb;
+
+ TFloat u0 = embedded_proportions[offset + ks];
+ TFloat u1 = embedded_proportions[offset + ks + 1];
+ TFloat u2 = embedded_proportions[offset + ks + 2];
+ TFloat u3 = embedded_proportions[offset + ks + 3];
+ TFloat u4 = embedded_proportions[offset + ks + 4];
+ TFloat u5 = embedded_proportions[offset + ks + 5];
+ TFloat u6 = embedded_proportions[offset + ks + 6];
+ TFloat u7 = embedded_proportions[offset + ks + 7];
+ TFloat v0 = embedded_proportions[offset + ls];
+ TFloat v1 = embedded_proportions[offset + ls + 1];
+ TFloat v2 = embedded_proportions[offset + ls + 2];
+ TFloat v3 = embedded_proportions[offset + ls + 3];
+ TFloat v4 = embedded_proportions[offset + ls + 4];
+ TFloat v5 = embedded_proportions[offset + ls + 5];
+ TFloat v6 = embedded_proportions[offset + ls + 6];
+ TFloat v7 = embedded_proportions[offset + ls + 7];
+ TFloat length = lengths[emb];
+
+ TFloat diff0 = u0 - v0;
+ TFloat diff1 = u1 - v1;
+ TFloat diff2 = u2 - v2;
+ TFloat diff3 = u3 - v3;
+ TFloat diff4 = u4 - v4;
+ TFloat diff5 = u5 - v5;
+ TFloat diff6 = u6 - v6;
+ TFloat diff7 = u7 - v7;
+ TFloat absdiff0 = fabs(diff0);
+ TFloat absdiff1 = fabs(diff1);
+ TFloat absdiff2 = fabs(diff2);
+ TFloat absdiff3 = fabs(diff3);
+ TFloat absdiff4 = fabs(diff4);
+ TFloat absdiff5 = fabs(diff5);
+ TFloat absdiff6 = fabs(diff6);
+ TFloat absdiff7 = fabs(diff7);
+
+ my_stripe0 += absdiff0 * length;
+ my_stripe1 += absdiff1 * length;
+ my_stripe2 += absdiff2 * length;
+ my_stripe3 += absdiff3 * length;
+ my_stripe4 += absdiff4 * length;
+ my_stripe5 += absdiff5 * length;
+ my_stripe6 += absdiff6 * length;
+ my_stripe7 += absdiff7 * length;
+ } // for emb
+
+ stripes[ks] += my_stripe0;
+ stripes[ks + 1] += my_stripe1;
+ stripes[ks + 2] += my_stripe2;
+ stripes[ks + 3] += my_stripe3;
+ stripes[ks + 4] += my_stripe4;
+ stripes[ks + 5] += my_stripe5;
+ stripes[ks + 6] += my_stripe6;
+ stripes[ks + 7] += my_stripe7;
+}
+#endif
+
+// Single step in computing NormalizedWeighted Unifrac
+template<class TFloat>
+static inline void UnnormalizedWeighted1(
+ TFloat * const __restrict__ dm_stripes_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ sums,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t k,
+ const uint64_t l1) {
const bool allzero_k = zcheck[k];
const bool allzero_l1 = zcheck[l1];
@@ -82,36 +233,282 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
// keep reads in the same place to maximize GPU warp performance
my_stripe = sums[ridx];
-
} else {
// both sides non zero, use the explicit but slow approach
- my_stripe = 0.0;
-
-#pragma acc loop seq
- for (uint64_t emb=0; emb<filled_embs; emb++) {
- const uint64_t offset = n_samples_r * emb;
+ my_stripe = WeightedVal1(embedded_proportions, lengths,
+ filled_embs, n_samples_r,
+ k, l1);
+ }
- TFloat u1 = embedded_proportions[offset + k];
- TFloat v1 = embedded_proportions[offset + l1];
- TFloat diff1 = u1 - v1;
- TFloat length = lengths[emb];
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+
+ // keep all writes in a single place, to maximize GPU warp performance
+ dm_stripe[k] += my_stripe;
+ }
+}
- my_stripe += fabs(diff1) * length;
- } // for emb
+#ifndef _OPENACC
+// Vectorized step in computing UnnormalizedWeighted Unifrac
+template<class TFloat>
+static inline void UnnormalizedWeighted4(
+ TFloat * const __restrict__ dm_stripes_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ sums,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t ks,
+ const uint64_t ls) {
+ const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
+ const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
+ const bool allzero_k = z_k==0x01010101;
+ const bool allzero_l = z_l==0x01010101;
+
+ // popcnt is cheap but has large latency, so compute speculatively/early
+ const int32_t cnt_k = popcnt_u32(z_k);
+ const int32_t cnt_l = popcnt_u32(z_l);
+
+ if (allzero_k && allzero_l) {
+ // nothing to do, would have to add 0
+ } else if (allzero_k || allzero_l) {
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+ if (allzero_k) {
+ // one side has all zeros, use distributed property
+ // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+ const TFloat sum_l0 = sums[ls];
+ const TFloat sum_l1 = sums[ls+1];
+ const TFloat sum_l2 = sums[ls+2];
+ const TFloat sum_l3 = sums[ls+3];
+ dm_stripe[ks] += sum_l0;
+ dm_stripe[ks+1] += sum_l1;
+ dm_stripe[ks+2] += sum_l2;
+ dm_stripe[ks+3] += sum_l3;
+ } else {
+ // one side has all zeros, use distributed property
+ // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
+ const TFloat sum_k0 = sums[ks];
+ const TFloat sum_k1 = sums[ks+1];
+ const TFloat sum_k2 = sums[ks+2];
+ const TFloat sum_k3 = sums[ks+3];
+ dm_stripe[ks] += sum_k0;
+ dm_stripe[ks+1] += sum_k1;
+ dm_stripe[ks+2] += sum_k2;
+ dm_stripe[ks+3] += sum_k3;
}
+ } else if ((cnt_k<3) && (cnt_l<3)) {
+ // several of the elemens are nonzero, may as well use the vectorized version
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+
+ WeightedVal4(dm_stripe,
+ embedded_proportions, lengths,
+ filled_embs, n_samples_r,
+ ks, ls);
+ } else {
+ // only a few have both sides partially non zero, use the fine grained compute
+ for (uint64_t i=0; i<4; i++) {
+ UnnormalizedWeighted1<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks+i, ls+i);
+ }
+ } // (allzero_k && allzero_l)
+}
- const uint64_t idx = (stripe-start_idx)*n_samples_r;
+template<class TFloat>
+static inline void UnnormalizedWeighted8(
+ TFloat * const __restrict__ dm_stripes_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ sums,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t ks,
+ const uint64_t ls) {
+ const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
+ const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
+ const bool allzero_k = z_k==0x0101010101010101;
+ const bool allzero_l = z_l==0x0101010101010101;
+
+ // popcnt is cheap but has large latency, so compute speculatively/early
+ const int64_t cnt_k = popcnt_u64(z_k);
+ const int64_t cnt_l = popcnt_u64(z_l);
+
+ if (allzero_k && allzero_l) {
+ // nothing to do, would have to add 0
+ } else if (allzero_k || allzero_l) {
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
- // keep all writes in a single place, to maximize GPU warp performance
- dm_stripe[k] += my_stripe;
- }
+ if (allzero_k) {
+ // one side has all zeros, use distributed property
+ // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+ const TFloat sum_l0 = sums[ls];
+ const TFloat sum_l1 = sums[ls+1];
+ const TFloat sum_l2 = sums[ls+2];
+ const TFloat sum_l3 = sums[ls+3];
+ const TFloat sum_l4 = sums[ls+4];
+ const TFloat sum_l5 = sums[ls+5];
+ const TFloat sum_l6 = sums[ls+6];
+ const TFloat sum_l7 = sums[ls+7];
+ dm_stripe[ks] += sum_l0;
+ dm_stripe[ks+1] += sum_l1;
+ dm_stripe[ks+2] += sum_l2;
+ dm_stripe[ks+3] += sum_l3;
+ dm_stripe[ks+4] += sum_l4;
+ dm_stripe[ks+5] += sum_l5;
+ dm_stripe[ks+6] += sum_l6;
+ dm_stripe[ks+7] += sum_l7;
+ } else {
+ // one side has all zeros, use distributed property
+ // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
+ const TFloat sum_k0 = sums[ks];
+ const TFloat sum_k1 = sums[ks+1];
+ const TFloat sum_k2 = sums[ks+2];
+ const TFloat sum_k3 = sums[ks+3];
+ const TFloat sum_k4 = sums[ks+4];
+ const TFloat sum_k5 = sums[ks+5];
+ const TFloat sum_k6 = sums[ks+6];
+ const TFloat sum_k7 = sums[ks+7];
+ dm_stripe[ks] += sum_k0;
+ dm_stripe[ks+1] += sum_k1;
+ dm_stripe[ks+2] += sum_k2;
+ dm_stripe[ks+3] += sum_k3;
+ dm_stripe[ks+4] += sum_k4;
+ dm_stripe[ks+5] += sum_k5;
+ dm_stripe[ks+6] += sum_k6;
+ dm_stripe[ks+7] += sum_k7;
+ }
+ } else if ((cnt_k<6) && (cnt_l<6)) {
+ // several of the elemens are nonzero, may as well use the vectorized version
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+
+ WeightedVal8(dm_stripe,
+ embedded_proportions, lengths,
+ filled_embs, n_samples_r,
+ ks, ls);
+ } else {
+ // only a few have both sides partially non zero, use the fine grained compute
+ for (uint64_t i=0; i<8; i++) {
+ UnnormalizedWeighted1<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks+i, ls+i);
+ }
+ } // (allzero_k && allzero_l)
+}
+#endif
+
+template<class TFloat>
+void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
+ const uint64_t start_idx = this->task_p->start;
+ const uint64_t stop_idx = this->task_p->stop;
+ const uint64_t n_samples = this->task_p->n_samples;
+ const uint64_t n_samples_r = this->dm_stripes.n_samples_r;
+ // openacc only works well with local variables
+ const TFloat * const __restrict__ embedded_proportions = this->embedded_proportions;
+ TFloat * const __restrict__ dm_stripes_buf = this->dm_stripes.buf;
+
+ bool * const __restrict__ zcheck = this->zcheck;
+ TFloat * const __restrict__ sums = this->sums;
+
+ const uint64_t step_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::step_size;
+ const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
+
+ // check for zero values and pre-compute single column sums
+ WeightedZerosAndSums(zcheck, sums,
+ embedded_proportions, lengths,
+ filled_embs, n_samples, n_samples_r);
+
+ // now do the real compute
+#ifdef _OPENACC
+ const unsigned int acc_vector_size = SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::acc_vector_size;
+#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,lengths,zcheck,sums) async
+ for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+ for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
+ // SIMT-based GPU work great one at a time (HW will deal with parallelism)
+ for(uint64_t ik = 0; ik < step_size ; ik++) {
+ const uint64_t k = sk*step_size + ik;
+
+ if (k>=n_samples) continue; // past the limit
+
+ const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+ const uint64_t idx = (stripe-start_idx) * n_samples_r;
+
+ UnnormalizedWeighted1<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ k, l1);
} // for ik
} // for stripe
} // for sk
+#else
+ // tilling helps with better cache reuse without the need of multiple cores
+ const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+ // use dynamic scheduling due to non-homogeneity in the loop
+ // Use a moderate block to prevent trashing but still have some cache reuse
+#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+ for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
+ for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+ // tile to maximize cache reuse
+ for(uint64_t is = 0; is < step_size ; is++) {
+ const uint64_t stripe = start_idx+ss*step_size + is;
+ if (stripe<stop_idx) { // else past limit
+
+ // SIMD-based CPUs need help with vectorization
+ const uint64_t idx = (stripe-start_idx) * n_samples_r;
+ uint64_t ks = sk*step_size;
+ const uint64_t kmax = std::min(ks+step_size,n_samples);
+ uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+
+ while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
+ UnnormalizedWeighted8<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks, ls);
+ ks+=8;
+ ls = (ls + 8)%n_samples; // wraparound
+ } // for ks+=8
+
+ while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
+ UnnormalizedWeighted4<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks, ls);
+ ks+=4;
+ ls = (ls + 4)%n_samples; // wraparound
+ } // for ks+=4
+
+ // deal with any leftovers in serial way
+ for( ; ks < kmax; ks++ ) {
+ UnnormalizedWeighted1<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks, ls);
+ ls = (ls + 1)%n_samples; // wraparound
+ } // for ks
+
+ } // if stripe
+ } // for is
+ } // for sk
+ } // for ss
+#endif
#ifdef _OPENACC
// next iteration will use the alternative space
@@ -187,6 +584,276 @@ void SUCMP_NM::UnifracVawUnnormalizedWeightedTask<TFloat>::_run(unsigned int fil
#endif
}
+// Single step in computing NormalizedWeighted Unifrac
+template<class TFloat>
+static inline void NormalizedWeighted1(
+ TFloat * const __restrict__ dm_stripes_buf,
+ TFloat * const __restrict__ dm_stripes_total_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ sums,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t k,
+ const uint64_t l1) {
+ const bool allzero_k = zcheck[k];
+ const bool allzero_l1 = zcheck[l1];
+
+ if (allzero_k && allzero_l1) {
+ // nothing to do, would have to add 0
+ } else {
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+ //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+ const TFloat sum_k = sums[k];
+ const TFloat sum_l = sums[l1];
+
+ // the totals can always use the distributed property
+ dm_stripe_total[k] += sum_k + sum_l;
+
+ TFloat my_stripe;
+
+ if (allzero_k || allzero_l1) {
+ // one side has all zeros
+ // we can use the distributed property, and use the pre-computed values
+
+ my_stripe = (allzero_k) ? sum_l : // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+ sum_k; // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
+ } else {
+ // both sides non zero, use the explicit but slow approach
+ my_stripe = WeightedVal1(embedded_proportions, lengths,
+ filled_embs, n_samples_r,
+ k, l1);
+ }
+
+ // keep all writes in a single place, to maximize GPU warp performance
+ dm_stripe[k] += my_stripe;
+ }
+}
+
+#ifndef _OPENACC
+// Vectorized step in computing NormalizedWeighted Unifrac
+template<class TFloat>
+static inline void NormalizedWeighted4(
+ TFloat * const __restrict__ dm_stripes_buf,
+ TFloat * const __restrict__ dm_stripes_total_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ sums,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t ks,
+ const uint64_t ls) {
+ const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
+ const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
+ const bool allzero_k = z_k==0x01010101;
+ const bool allzero_l = z_l==0x01010101;
+
+ if (allzero_k && allzero_l) {
+ // nothing to do, would have to add 0
+ } else {
+ const TFloat sum_k0 = sums[ks];
+ const TFloat sum_k1 = sums[ks+1];
+ const TFloat sum_k2 = sums[ks+2];
+ const TFloat sum_k3 = sums[ks+3];
+ const TFloat sum_l0 = sums[ls];
+ const TFloat sum_l1 = sums[ls+1];
+ const TFloat sum_l2 = sums[ls+2];
+ const TFloat sum_l3 = sums[ls+3];
+
+ const TFloat sum_kl0 = sum_k0 + sum_l0;
+ const TFloat sum_kl1 = sum_k1 + sum_l1;
+ const TFloat sum_kl2 = sum_k2 + sum_l2;
+ const TFloat sum_kl3 = sum_k3 + sum_l3;
+
+ int32_t cnt_k = 0;
+ int32_t cnt_l = 0;
+
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+
+ if (allzero_k) {
+ // one side has all zeros, use distributed property
+ // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+ dm_stripe[ks] += sum_l0;
+ dm_stripe[ks+1] += sum_l1;
+ dm_stripe[ks+2] += sum_l2;
+ dm_stripe[ks+3] += sum_l3;
+ } else if (allzero_l) {
+ // one side has all zeros, use distributed property
+ // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
+ dm_stripe[ks] += sum_k0;
+ dm_stripe[ks+1] += sum_k1;
+ dm_stripe[ks+2] += sum_k2;
+ dm_stripe[ks+3] += sum_k3;
+ } else {
+ // popcnt is cheap but has large latency, so compute early
+ cnt_k = popcnt_u32(z_k);
+ cnt_l = popcnt_u32(z_l);
+ }
+
+ // the totals can always use the distributed property
+ {
+ TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+ //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+ dm_stripe_total[ks] += sum_kl0;
+ dm_stripe_total[ks+1] += sum_kl1;
+ dm_stripe_total[ks+2] += sum_kl2;
+ dm_stripe_total[ks+3] += sum_kl3;
+ }
+
+ if (allzero_k||allzero_l) {
+ // already done
+ } else if ((cnt_k<3) && (cnt_l<3)) {
+ // several of the elemens are nonzero, may as well use the vectorized version
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+
+ WeightedVal4(dm_stripe,
+ embedded_proportions, lengths,
+ filled_embs, n_samples_r,
+ ks, ls);
+ } else {
+ // only a few have both sides partially non zero, use the fine grained compute
+ // Use UnnormalizedWeighted since we already computed dm_stripe_total
+ for (uint64_t i=0; i<4; i++) {
+ UnnormalizedWeighted1<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks+i, ls+i);
+ }
+ }
+ } // (allzero_k && allzero_l)
+}
+
+template<class TFloat>
+static inline void NormalizedWeighted8(
+ TFloat * const __restrict__ dm_stripes_buf,
+ TFloat * const __restrict__ dm_stripes_total_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ sums,
+ const TFloat * const __restrict__ embedded_proportions,
+ const TFloat * __restrict__ lengths,
+ const unsigned int filled_embs,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t ks,
+ const uint64_t ls) {
+ const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
+ const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
+ const bool allzero_k = z_k==0x0101010101010101;
+ const bool allzero_l = z_l==0x0101010101010101;
+
+ if (allzero_k && allzero_l) {
+ // nothing to do, would have to add 0
+ } else {
+ const TFloat sum_k0 = sums[ks];
+ const TFloat sum_k1 = sums[ks+1];
+ const TFloat sum_k2 = sums[ks+2];
+ const TFloat sum_k3 = sums[ks+3];
+ const TFloat sum_k4 = sums[ks+4];
+ const TFloat sum_k5 = sums[ks+5];
+ const TFloat sum_k6 = sums[ks+6];
+ const TFloat sum_k7 = sums[ks+7];
+ const TFloat sum_l0 = sums[ls];
+ const TFloat sum_l1 = sums[ls+1];
+ const TFloat sum_l2 = sums[ls+2];
+ const TFloat sum_l3 = sums[ls+3];
+ const TFloat sum_l4 = sums[ls+4];
+ const TFloat sum_l5 = sums[ls+5];
+ const TFloat sum_l6 = sums[ls+6];
+ const TFloat sum_l7 = sums[ls+7];
+
+ const TFloat sum_kl0 = sum_k0 + sum_l0;
+ const TFloat sum_kl1 = sum_k1 + sum_l1;
+ const TFloat sum_kl2 = sum_k2 + sum_l2;
+ const TFloat sum_kl3 = sum_k3 + sum_l3;
+ const TFloat sum_kl4 = sum_k4 + sum_l4;
+ const TFloat sum_kl5 = sum_k5 + sum_l5;
+ const TFloat sum_kl6 = sum_k6 + sum_l6;
+ const TFloat sum_kl7 = sum_k7 + sum_l7;
+
+ int32_t cnt_k = 0;
+ int32_t cnt_l = 0;
+
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+
+ if (allzero_k) {
+ // one side has all zeros, use distributed property
+ // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
+ dm_stripe[ks] += sum_l0;
+ dm_stripe[ks+1] += sum_l1;
+ dm_stripe[ks+2] += sum_l2;
+ dm_stripe[ks+3] += sum_l3;
+ dm_stripe[ks+4] += sum_l4;
+ dm_stripe[ks+5] += sum_l5;
+ dm_stripe[ks+6] += sum_l6;
+ dm_stripe[ks+7] += sum_l7;
+ } else if (allzero_l) {
+ // one side has all zeros, use distributed property
+ // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
+ dm_stripe[ks] += sum_k0;
+ dm_stripe[ks+1] += sum_k1;
+ dm_stripe[ks+2] += sum_k2;
+ dm_stripe[ks+3] += sum_k3;
+ dm_stripe[ks+4] += sum_k4;
+ dm_stripe[ks+5] += sum_k5;
+ dm_stripe[ks+6] += sum_k6;
+ dm_stripe[ks+7] += sum_k7;
+ } else {
+ // popcnt is cheap but has large latency, so compute early
+ cnt_k = popcnt_u64(z_k);
+ cnt_l = popcnt_u64(z_l);
+ }
+
+ // the totals can always use the distributed property
+ {
+ TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+ //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+ dm_stripe_total[ks] += sum_kl0;
+ dm_stripe_total[ks+1] += sum_kl1;
+ dm_stripe_total[ks+2] += sum_kl2;
+ dm_stripe_total[ks+3] += sum_kl3;
+ dm_stripe_total[ks+4] += sum_kl4;
+ dm_stripe_total[ks+5] += sum_kl5;
+ dm_stripe_total[ks+6] += sum_kl6;
+ dm_stripe_total[ks+7] += sum_kl7;
+ }
+
+ if (allzero_k||allzero_l) {
+ // already done
+ } else if ((cnt_k<6) && (cnt_l<6)) {
+ // several of the elemens are nonzero, may as well use the vectorized version
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+
+ WeightedVal8(dm_stripe,
+ embedded_proportions, lengths,
+ filled_embs, n_samples_r,
+ ks, ls);
+ } else {
+ // only a few have both sides partially non zero, use the fine grained compute
+ // Use UnnormalizedWeighted since we already computed dm_stripe_total
+ for (uint64_t i=0; i<8; i++) {
+ UnnormalizedWeighted1<TFloat>(
+ dm_stripes_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks+i, ls+i);
+ }
+ }
+ } // (allzero_k && allzero_l)
+}
+#endif
+
template<class TFloat>
void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
const uint64_t start_idx = this->task_p->start;
@@ -206,100 +873,88 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
// check for zero values and pre-compute single column sums
-#ifdef _OPENACC
-#pragma acc parallel loop present(embedded_proportions,lengths,zcheck,sums)
-#else
-#pragma omp parallel for default(shared)
-#endif
- for(uint64_t k=0; k<n_samples; k++) {
- bool all_zeros=true;
- TFloat my_sum = 0.0;
-
-#pragma acc loop seq
- for (uint64_t emb=0; emb<filled_embs; emb++) {
- const uint64_t offset = n_samples_r * emb;
-
- TFloat u1 = embedded_proportions[offset + k];
- my_sum += u1*lengths[emb];
- all_zeros = all_zeros && (u1==0.0);
- }
-
- sums[k] = my_sum;
- zcheck[k] = all_zeros;
- }
+ WeightedZerosAndSums(zcheck, sums,
+ embedded_proportions, lengths,
+ filled_embs, n_samples, n_samples_r);
// point of thread
#ifdef _OPENACC
const unsigned int acc_vector_size = SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::acc_vector_size;
-#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,lengths,zcheck,sums) async
-#else
- // use dynamic scheduling due to non-homogeneity in the loop
-#pragma omp parallel for schedule(dynamic,1) default(shared)
-#endif
+#pragma acc parallel loop gang vector collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,lengths,zcheck,sums) async
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
+ // SIMT-based GPU work great one at a time (HW will deal with parallelism)
for(uint64_t ik = 0; ik < step_size ; ik++) {
const uint64_t k = sk*step_size + ik;
if (k>=n_samples) continue; // past the limit
- const bool zcheck_k = zcheck[sk]; // due to loop collapse in ACC, must load in here
-
const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+ const uint64_t idx = (stripe-start_idx) * n_samples_r;
- const bool allzero_k = zcheck[k];
- const bool allzero_l1 = zcheck[l1];
-
- if (allzero_k && allzero_l1) {
- // nothing to do, would have to add 0
- } else {
- const uint64_t idx = (stripe-start_idx) * n_samples_r;
- TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
- TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
- //TFloat *dm_stripe = dm_stripes[stripe];
- //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-
- // the totals can always use the distributed property
- dm_stripe_total[k] += sums[k] + sums[l1];
-
- TFloat my_stripe;
-
- if (allzero_k || allzero_l1) {
- // one side has all zeros
- // we can use the distributed property, and use the pre-computed values
-
- const uint64_t ridx = (allzero_k) ? l1 : // if (nonzero_l1) ridx=l1 // fabs(k-l1), with k==0
- k; // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
-
- // keep reads in the same place to maximize GPU warp performance
- my_stripe = sums[ridx];
-
- } else {
- // both sides non zero, use the explicit but slow approach
-
- my_stripe = 0.0;
-
-#pragma acc loop seq
- for (uint64_t emb=0; emb<filled_embs; emb++) {
- const uint64_t offset = n_samples_r * emb;
-
- TFloat u1 = embedded_proportions[offset + k];
- TFloat v1 = embedded_proportions[offset + l1];
- TFloat diff1 = u1 - v1;
- TFloat length = lengths[emb];
-
- my_stripe += fabs(diff1) * length;
- }
-
- }
-
- // keep all writes in a single place, to maximize GPU warp performance
- dm_stripe[k] += my_stripe;
- }
-
+ NormalizedWeighted1<TFloat>(
+ dm_stripes_buf,dm_stripes_total_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ k, l1);
} // for ik
} // for stripe
} // for sk
+#else
+ // tilling helps with better cache reuse without the need of multiple cores
+ const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+ // use dynamic scheduling due to non-homogeneity in the loop
+ // Use a moderate block to prevent trashing but still have some cache reuse
+#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+ for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
+ for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+ // tile to maximize cache reuse
+ for(uint64_t is = 0; is < step_size ; is++) {
+ const uint64_t stripe = start_idx+ss*step_size + is;
+ if (stripe<stop_idx) { // else past limit
+
+ // SIMD-based CPUs need help with vectorization
+ const uint64_t idx = (stripe-start_idx) * n_samples_r;
+ uint64_t ks = sk*step_size;
+ const uint64_t kmax = std::min(ks+step_size,n_samples);
+ uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+
+ while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
+ NormalizedWeighted8<TFloat>(
+ dm_stripes_buf,dm_stripes_total_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks, ls);
+ ks+=8;
+ ls = (ls + 8)%n_samples; // wraparound
+ } // for ks+=8
+
+ while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
+ NormalizedWeighted4<TFloat>(
+ dm_stripes_buf,dm_stripes_total_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks, ls);
+ ks+=4;
+ ls = (ls + 4)%n_samples; // wraparound
+ } // for ks+=4
+
+ // deal with any leftovers in serial way
+ for( ; ks < kmax; ks++ ) {
+ NormalizedWeighted1<TFloat>(
+ dm_stripes_buf,dm_stripes_total_buf,
+ zcheck, sums, embedded_proportions, lengths,
+ filled_embs,idx, n_samples_r,
+ ks, ls);
+ ls = (ls + 1)%n_samples; // wraparound
+ } // for ks
+
+ } // if stripe
+ } // for is
+ } // for sk
+ } // for ss
+#endif
#ifdef _OPENACC
// next iteration will use the alternative space
@@ -535,6 +1190,320 @@ void SUCMP_NM::UnifracVawGeneralizedTask<TFloat>::_run(unsigned int filled_embs,
#endif
}
+// Single step in computing Unweighted Unifrac
+
+template<class TFloat>
+static inline void UnweightedOneSide(
+ bool * const __restrict__ zcheck,
+ TFloat * const __restrict__ stripe_sums,
+ const TFloat * const __restrict__ sums,
+ const uint64_t * const __restrict__ embedded_proportions,
+ const unsigned int filled_embs_els_round,
+ const uint64_t n_samples_r,
+ const uint64_t kl) {
+ bool all_zeros=true;
+ TFloat my_stripe = 0.0;
+
+ for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+ const uint64_t offset = n_samples_r * emb_el;
+ const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+ uint64_t o1 = embedded_proportions[offset + kl];
+
+ if (o1==0) { // zeros are prevalent
+ // nothing to do
+ } else {
+ all_zeros = false;
+#ifndef _OPENACC
+ // CPU/SIMD faster if we check for partial compute
+ if (((uint32_t)o1)==0) {
+ // only high part relevant
+ //uint64_t x1 = u1 ^ v1;
+ // With one of the two being 0, xor is just the non-negative one
+
+ // Use the pre-computed sums
+ // Each range of 8 lengths has already been pre-computed and stored in psum
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+ TFloat esum = psum[0x400+((uint8_t)(o1 >> 32))] +
+ psum[0x500+((uint8_t)(o1 >> 40))] +
+ psum[0x600+((uint8_t)(o1 >> 48))] +
+ psum[0x700+((uint8_t)(o1 >> 56))];
+ my_stripe += esum;
+ } else if ((o1>>32)==0) {
+ // only low part relevant
+ //uint64_t x1 = u1 ^ v1;
+ // With one of the two being 0, xor is just the non-negative one
+
+ // Use the pre-computed sums
+ // Each range of 8 lengths has already been pre-computed and stored in psum
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+ TFloat esum = psum[ (uint8_t)(o1) ] +
+ psum[0x100+((uint8_t)(o1 >> 8))] +
+ psum[0x200+((uint8_t)(o1 >> 16))] +
+ psum[0x300+((uint8_t)(o1 >> 24))];
+ my_stripe += esum;
+ } else {
+#else
+ // GPU/SIMT faster if we just go ahead will full at all times
+ {
+#endif
+ //uint64_t x1 = u1 ^ v1;
+ // With one of the two being 0, xor is just the non-negative one
+
+ // Use the pre-computed sums
+ // Each range of 8 lengths has already been pre-computed and stored in psum
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+ TFloat esum = psum[ (uint8_t)(o1) ] +
+ psum[0x100+((uint8_t)(o1 >> 8))] +
+ psum[0x200+((uint8_t)(o1 >> 16))] +
+ psum[0x300+((uint8_t)(o1 >> 24))] +
+ psum[0x400+((uint8_t)(o1 >> 32))] +
+ psum[0x500+((uint8_t)(o1 >> 40))] +
+ psum[0x600+((uint8_t)(o1 >> 48))] +
+ psum[0x700+((uint8_t)(o1 >> 56))];
+ my_stripe += esum;
+ }
+ }
+ }
+
+ zcheck[kl] = all_zeros;
+ stripe_sums[kl] = my_stripe;
+
+}
+
+#ifdef _OPENACC
+
+template<class TFloat>
+static inline void Unweighted1(
+ TFloat * const __restrict__ dm_stripes_buf,
+ TFloat * const __restrict__ dm_stripes_total_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ stripe_sums,
+ const TFloat * const __restrict__ sums,
+ const uint64_t * const __restrict__ embedded_proportions,
+ const unsigned int filled_embs_els_round,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t k,
+ const uint64_t l1) {
+ const bool allzero_k = zcheck[k];
+ const bool allzero_l1 = zcheck[l1];
+
+ if (allzero_k && allzero_l1) {
+ // nothing to do, would have to add 0
+ } else {
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+ //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+ bool did_update = false;
+ TFloat my_stripe = 0.0;
+ TFloat my_stripe_total = 0.0;
+
+ if (allzero_k || allzero_l1) {
+ // with one side zero, | and ^ are no-ops
+ const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-sero onea
+ my_stripe = stripe_sums[kl];
+ did_update = (my_stripe!=0.0);
+ my_stripe_total = my_stripe;
+ } else {
+ // we need both sides
+ for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+ const uint64_t offset = n_samples_r * emb_el;
+ const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+ uint64_t u1 = embedded_proportions[offset + k];
+ uint64_t v1 = embedded_proportions[offset + l1];
+ uint64_t o1 = u1 | v1;
+
+ if (o1!=0) { // zeros are prevalent
+ did_update=true;
+ uint64_t x1 = u1 ^ v1;
+
+ // Use the pre-computed sums
+ // Each range of 8 lengths has already been pre-computed and stored in psum
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+ my_stripe_total += psum[ (uint8_t)(o1) ] +
+ psum[0x100+((uint8_t)(o1 >> 8))] +
+ psum[0x200+((uint8_t)(o1 >> 16))] +
+ psum[0x300+((uint8_t)(o1 >> 24))] +
+ psum[0x400+((uint8_t)(o1 >> 32))] +
+ psum[0x500+((uint8_t)(o1 >> 40))] +
+ psum[0x600+((uint8_t)(o1 >> 48))] +
+ psum[0x700+((uint8_t)(o1 >> 56))];
+ my_stripe += psum[ (uint8_t)(x1) ] +
+ psum[0x100+((uint8_t)(x1 >> 8))] +
+ psum[0x200+((uint8_t)(x1 >> 16))] +
+ psum[0x300+((uint8_t)(x1 >> 24))] +
+ psum[0x400+((uint8_t)(x1 >> 32))] +
+ psum[0x500+((uint8_t)(x1 >> 40))] +
+ psum[0x600+((uint8_t)(x1 >> 48))] +
+ psum[0x700+((uint8_t)(x1 >> 56))];
+ }
+ }
+ }
+
+ if (did_update) {
+ dm_stripe[k] += my_stripe;
+ dm_stripe_total[k] += my_stripe_total;
+ }
+ } // (allzero_k && allzero_l1)
+
+}
+#else
+
+template<class TFloat>
+static inline void Unweighted1(
+ TFloat * const __restrict__ dm_stripes_buf,
+ TFloat * const __restrict__ dm_stripes_total_buf,
+ const bool * const __restrict__ zcheck,
+ const TFloat * const __restrict__ stripe_sums,
+ const TFloat * const __restrict__ sums,
+ const uint64_t * const __restrict__ embedded_proportions,
+ const unsigned int filled_embs_els_round,
+ const uint64_t idx,
+ const uint64_t n_samples_r,
+ const uint64_t k,
+ const uint64_t l1) {
+ const bool allzero_k = zcheck[k];
+ const bool allzero_l1 = zcheck[l1];
+
+ if (allzero_k && allzero_l1) {
+ // nothing to do, would have to add 0
+ } else {
+ TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+ TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+ //TFloat *dm_stripe = dm_stripes[stripe];
+ //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+ bool did_update = false;
+ TFloat my_stripe = 0.0;
+ TFloat my_stripe_total = 0.0;
+
+ if (allzero_k || allzero_l1) {
+ // with one side zero, | and ^ are no-ops
+ const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-sero onea
+ my_stripe = stripe_sums[kl];
+ my_stripe_total = my_stripe;
+ did_update = (my_stripe!=0.0);
+ } else {
+ // we need both sides
+ for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+ const uint64_t offset = n_samples_r * emb_el;
+ const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+ uint64_t u1 = embedded_proportions[offset + k];
+ uint64_t v1 = embedded_proportions[offset + l1];
+ uint64_t o1 = u1 | v1;
+ uint64_t x1 = u1 ^ v1;
+
+ if (o1==0) { // zeros are prevalent
+ // nothing to do
+ } else if (((uint32_t)o1)==0) {
+ // only high part relevant
+ did_update=true;
+
+ // Use the pre-computed sums
+ // Each range of 8 lengths has already been pre-computed and stored in psum
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+ my_stripe_total += psum[0x400+((uint8_t)(o1 >> 32))] +
+ psum[0x500+((uint8_t)(o1 >> 40))] +
+ psum[0x600+((uint8_t)(o1 >> 48))] +
+ psum[0x700+((uint8_t)(o1 >> 56))];
+ my_stripe += psum[0x400+((uint8_t)(x1 >> 32))] +
+ psum[0x500+((uint8_t)(x1 >> 40))] +
+ psum[0x600+((uint8_t)(x1 >> 48))] +
+ psum[0x700+((uint8_t)(x1 >> 56))];
+ } else if ((o1>>32)==0) {
+ // only low part relevant
+ did_update=true;
+
+ // Use the pre-computed sums
+ // Each range of 8 lengths has already been pre-computed and stored in psum
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+ my_stripe_total += psum[ (uint8_t)(o1) ] +
+ psum[0x100+((uint8_t)(o1 >> 8))] +
+ psum[0x200+((uint8_t)(o1 >> 16))] +
+ psum[0x300+((uint8_t)(o1 >> 24))];
+ my_stripe += psum[ (uint8_t)(x1) ] +
+ psum[0x100+((uint8_t)(x1 >> 8))] +
+ psum[0x200+((uint8_t)(x1 >> 16))] +
+ psum[0x300+((uint8_t)(x1 >> 24))];
+
+ } else {
+ did_update=true;
+
+ // Use the pre-computed sums
+ // Each range of 8 lengths has already been pre-computed and stored in psum
+ // Since embedded_proportions packed format is in 64-bit format for performance reasons
+ // we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+ my_stripe_total += psum[ (uint8_t)(o1) ] +
+ psum[0x100+((uint8_t)(o1 >> 8))] +
+ psum[0x200+((uint8_t)(o1 >> 16))] +
+ psum[0x300+((uint8_t)(o1 >> 24))] +
+ psum[0x400+((uint8_t)(o1 >> 32))] +
+ psum[0x500+((uint8_t)(o1 >> 40))] +
+ psum[0x600+((uint8_t)(o1 >> 48))] +
+ psum[0x700+((uint8_t)(o1 >> 56))];
+ my_stripe += psum[ (uint8_t)(x1) ] +
+ psum[0x100+((uint8_t)(x1 >> 8))] +
+ psum[0x200+((uint8_t)(x1 >> 16))] +
+ psum[0x300+((uint8_t)(x1 >> 24))] +
+ psum[0x400+((uint8_t)(x1 >> 32))] +
+ psum[0x500+((uint8_t)(x1 >> 40))] +
+ psum[0x600+((uint8_t)(x1 >> 48))] +
+ psum[0x700+((uint8_t)(x1 >> 56))];
+ }
+ }
+ } // (allzero_k || allzero_l1)
+
+ if (did_update) {
+ dm_stripe[k] += my_stripe;
+ dm_stripe_total[k] += my_stripe_total;
+ }
+ } // (allzero_k && allzero_l1)
+
+}
+#endif
+
+// check for zero values
+template<class TFloat, class T>
+static inline void UnweightedZerosAndSums(
+ bool * const __restrict__ zcheck,
+ TFloat * const __restrict__ stripe_sums,
+ const TFloat * const __restrict__ el_sums,
+ const T * const __restrict__ embedded_proportions,
+ const unsigned int filled_embs_els_round,
+ const uint64_t n_samples,
+ const uint64_t n_samples_r) {
+#ifdef _OPENACC
+#pragma acc parallel loop gang vector present(embedded_proportions,zcheck,el_sums,stripe_sums)
+#else
+#pragma omp parallel for default(shared)
+#endif
+ for(uint64_t k=0; k<n_samples; k++) {
+ UnweightedOneSide(
+ zcheck, stripe_sums,
+ el_sums, embedded_proportions,
+ filled_embs_els_round, n_samples_r,
+ k);
+ }
+}
+
template<class TFloat>
void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
const uint64_t start_idx = this->task_p->start;
@@ -548,6 +1517,8 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
TFloat * const __restrict__ dm_stripes_total_buf = this->dm_stripes_total.buf;
TFloat * const __restrict__ sums = this->sums;
+ bool * const __restrict__ zcheck = this->zcheck;
+ TFloat * const __restrict__ stripe_sums = this->stripe_sums;
const uint64_t step_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::step_size;
const uint64_t sample_steps = (n_samples+(step_size-1))/step_size; // round up
@@ -557,7 +1528,6 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
const uint64_t filled_embs_els_round = (filled_embs+63)/64;
-
// pre-compute sums of length elements, since they are likely to be accessed many times
// We will use a 8-bit map, to keep it small enough to keep in L1 cache
#ifdef _OPENACC
@@ -613,79 +1583,68 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
}
}
+#pragma acc wait
+ // check for zero values and compute stripe sums
+ UnweightedZerosAndSums(zcheck, stripe_sums,
+ sums, embedded_proportions,
+ filled_embs_els_round, n_samples, n_samples_r);
+
// point of thread
#ifdef _OPENACC
-#pragma acc wait
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
-#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
-#else
- // use dynamic scheduling due to non-homogeneity in the loop
-#pragma omp parallel for schedule(dynamic,1) default(shared)
-#endif
+#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums,zcheck,stripe_sums) async
for(uint64_t sk = 0; sk < sample_steps ; sk++) {
for(uint64_t stripe = start_idx; stripe < stop_idx; stripe++) {
for(uint64_t ik = 0; ik < step_size ; ik++) {
const uint64_t k = sk*step_size + ik;
const uint64_t idx = (stripe-start_idx) * n_samples_r;
- TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
- TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
- //TFloat *dm_stripe = dm_stripes[stripe];
- //TFloat *dm_stripe_total = dm_stripes_total[stripe];
if (k>=n_samples) continue; // past the limit
const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
- bool did_update = false;
- TFloat my_stripe = 0.0;
- TFloat my_stripe_total = 0.0;
-
-#pragma acc loop seq
- for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
- const uint64_t offset = n_samples_r * emb_el;
- const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
-
- uint64_t u1 = embedded_proportions[offset + k];
- uint64_t v1 = embedded_proportions[offset + l1];
- uint64_t o1 = u1 | v1;
-
- if (o1!=0) { // zeros are prevalent
- did_update=true;
- uint64_t x1 = u1 ^ v1;
-
- // Use the pre-computed sums
- // Each range of 8 lengths has already been pre-computed and stored in psum
- // Since embedded_proportions packed format is in 64-bit format for performance reasons
- // we need to add the 8 sums using the four 8-bits for addressing inside psum
-
- my_stripe += psum[ (x1 & 0xff)] +
- psum[0x100+((x1 >> 8) & 0xff)] +
- psum[0x200+((x1 >> 16) & 0xff)] +
- psum[0x300+((x1 >> 24) & 0xff)] +
- psum[0x400+((x1 >> 32) & 0xff)] +
- psum[0x500+((x1 >> 40) & 0xff)] +
- psum[0x600+((x1 >> 48) & 0xff)] +
- psum[0x700+((x1 >> 56) )];
- my_stripe_total += psum[ (o1 & 0xff)] +
- psum[0x100+((o1 >> 8) & 0xff)] +
- psum[0x200+((o1 >> 16) & 0xff)] +
- psum[0x300+((o1 >> 24) & 0xff)] +
- psum[0x400+((o1 >> 32) & 0xff)] +
- psum[0x500+((o1 >> 40) & 0xff)] +
- psum[0x600+((o1 >> 48) & 0xff)] +
- psum[0x700+((o1 >> 56) )];
- }
- }
-
- if (did_update) {
- dm_stripe[k] += my_stripe;
- dm_stripe_total[k] += my_stripe_total;
- }
-
+ Unweighted1<TFloat>(
+ dm_stripes_buf,dm_stripes_total_buf,
+ zcheck, stripe_sums,
+ sums, embedded_proportions,
+ filled_embs_els_round,idx, n_samples_r,
+ k, l1);
}
}
}
+#else
+ // tilling helps with better cache reuse without the need of multiple cores
+ const uint64_t stripe_steps = ((stop_idx-start_idx)+(step_size-1))/step_size; // round up
+
+ // use dynamic scheduling due to non-homogeneity in the loop
+ // Use a moderate block to prevent trashing but still have some cache reuse
+#pragma omp parallel for collapse(2) schedule(dynamic,step_size) default(shared)
+ for(uint64_t ss = 0; ss < stripe_steps ; ss++) {
+ for(uint64_t sk = 0; sk < sample_steps ; sk++) {
+ // tile to maximize cache reuse
+ for(uint64_t is = 0; is < step_size ; is++) {
+ const uint64_t stripe = start_idx+ss*step_size + is;
+ if (stripe<stop_idx) { // esle past limit}
+ for(uint64_t ik = 0; ik < step_size ; ik++) {
+ const uint64_t k = sk*step_size + ik;
+ if (k<n_samples) { // elsepast the limit
+ const uint64_t idx = (stripe-start_idx) * n_samples_r;
+ const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+
+ Unweighted1<TFloat>(
+ dm_stripes_buf,dm_stripes_total_buf,
+ zcheck, stripe_sums,
+ sums, embedded_proportions,
+ filled_embs_els_round,idx, n_samples_r,
+ k, l1);
+ } // if k
+ } // for ik
+ } // if stripe
+ } // for is
+ } // for sk
+ } // for ss
+#endif
#ifdef _OPENACC
// next iteration will use the alternative space
=====================================
src/unifrac_task.hpp
=====================================
@@ -297,9 +297,8 @@ namespace SUCMP_NM {
template<class TFloat, class TEmb>
class UnifracTask : public UnifracTaskBase<TFloat,TEmb> {
protected:
- // Use one cache line on CPU
- // On GPU, shaing a cache line is actually a good thing
- static const unsigned int step_size = 16*4/sizeof(TFloat);
+ // Use a moderate sized step, a few cache lines
+ static const unsigned int step_size = 64*4/sizeof(TFloat);
#ifdef _OPENACC
// Use as big vector size as we can, to maximize cache line reuse
@@ -403,32 +402,45 @@ namespace SUCMP_NM {
template<class TFloat>
class UnifracUnweightedTask : public UnifracTask<TFloat,uint64_t> {
public:
+ static const unsigned int step_size = 64*4/sizeof(TFloat);
+
static const unsigned int RECOMMENDED_MAX_EMBS = UnifracTask<TFloat,uint64_t>::RECOMMENDED_MAX_EMBS_BOOL;
// Note: _max_emb MUST be multiple of 64
UnifracUnweightedTask(std::vector<double*> &_dm_stripes, std::vector<double*> &_dm_stripes_total, unsigned int _max_embs, const su::task_parameters* _task_p)
: UnifracTask<TFloat, uint64_t>(_dm_stripes,_dm_stripes_total,_max_embs,_task_p)
{
+ const unsigned int n_samples = this->task_p->n_samples;
const unsigned int bsize = _max_embs*(0x400/32);
+ zcheck = NULL;
+ stripe_sums = NULL;
sums = NULL;
+ posix_memalign((void **)&zcheck, 4096, sizeof(bool) * n_samples);
+ posix_memalign((void **)&stripe_sums, 4096, sizeof(TFloat) * n_samples);
posix_memalign((void **)&sums, 4096, sizeof(TFloat) * bsize);
-#pragma acc enter data create(sums[:bsize])
+#pragma acc enter data create(zcheck[:n_samples],stripe_sums[:n_samples],sums[:bsize])
}
virtual ~UnifracUnweightedTask()
{
#ifdef _OPENACC
- const unsigned int bsize = this->max_embs*(0x400/32);
-#pragma acc exit data delete(sums[:bsize])
+ const unsigned int n_samples = this->task_p->n_samples;
+ const unsigned int bsize = this->max_embs*(0x400/32);
+#pragma acc exit data delete(sums[:bsize],stripe_sums[:n_samples],zcheck[:n_samples])
#endif
free(sums);
+ free(stripe_sums);
+ free(zcheck);
}
virtual void run(unsigned int filled_embs, const TFloat * __restrict__ length) {_run(filled_embs, length);}
void _run(unsigned int filled_embs, const TFloat * __restrict__ length);
private:
- TFloat *sums; // temp buffer
+ // temp buffers
+ TFloat *sums;
+ bool *zcheck;
+ TFloat *stripe_sums;
};
template<class TFloat>
class UnifracGeneralizedTask : public UnifracTask<TFloat,TFloat> {
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/a95340b75f55ef71eb2f6941fc521148d1b3299c...a5a4233e998a25396ccfa7e6bb7a6118bc01b877
--
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/a95340b75f55ef71eb2f6941fc521148d1b3299c...a5a4233e998a25396ccfa7e6bb7a6118bc01b877
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221130/be09996d/attachment-0001.htm>
More information about the debian-med-commit
mailing list