[med-svn] [Git][med-team/unifrac-tools][master] 6 commits: New upstream version 1.1.1
Andreas Tille (@tille)
gitlab at salsa.debian.org
Tue Jul 26 15:50:34 BST 2022
Andreas Tille pushed to branch master at Debian Med / unifrac-tools
Commits:
a6b633a7 by Andreas Tille at 2022-07-26T16:46:09+02:00
New upstream version 1.1.1
- - - - -
9abf679f by Andreas Tille at 2022-07-26T16:46:09+02:00
routine-update: New upstream version
- - - - -
ee3220a0 by Andreas Tille at 2022-07-26T16:46:09+02:00
Update upstream source from tag 'upstream/1.1.1'
Update to upstream version '1.1.1'
with Debian dir 10b8c2e2375c44b25c05b8f862772475b3dd66e5
- - - - -
8b830afc by Andreas Tille at 2022-07-26T16:46:09+02:00
routine-update: Standards-Version: 4.6.1
- - - - -
cde920ff by Andreas Tille at 2022-07-26T16:46:12+02:00
Trim trailing whitespace.
Changes-By: lintian-brush
Fixes: lintian: trailing-whitespace
See-also: https://lintian.debian.org/tags/trailing-whitespace.html
- - - - -
538b778a by Andreas Tille at 2022-07-26T16:49:53+02:00
Cleanup changelog
- - - - -
13 changed files:
- .github/workflows/main.yml
- README.md
- debian/changelog
- debian/control
- src/Makefile
- src/api.cpp
- src/api.hpp
- src/biom.cpp
- src/biom.hpp
- + src/status_enum.hpp
- src/test_su.cpp
- src/tree.cpp
- src/tree.hpp
Changes:
=====================================
.github/workflows/main.yml
=====================================
@@ -55,6 +55,7 @@ jobs:
./scripts/install_hpc_sdk.sh
source setup_nv_h5.sh
fi
+ export PERFORMING_CONDA_BUILD=True
make api && \
make main && \
make install && \
=====================================
README.md
=====================================
@@ -36,6 +36,10 @@ Compilation has been performed on both LLVM 10.0.0 (OS X >= 10.12) or GCC 9 (Cen
Installation time should be a few minutes at most.
+## Install (example)
+
+An example of installing UniFrac, and using it with CPUs as well as GPUs, can be be found on [Google Colabs](https://colab.research.google.com/drive/1yL0MdF1zNAkPg1_yESI1iABUH4ZHNGwj?usp=sharing).
+
## Install (bioconda)
This binaries can be installed via a combination of `conda-forge` and `bioconda`:
@@ -75,6 +79,34 @@ scripts/install_hpc_sdk.sh
source setup_nv_h5.sh
```
+# Environment considerations
+
+## Multi-core support
+
+Unifrac uses OpenMP to make use of multiple CPU cores.
+By default, Unifrac will use all the cores that are available on the system.
+To restrict the number of cores used, set:
+
+ export OMP_NUM_THREADS=nthreads
+
+## GPU support
+
+On Linux platforms, Unifrac will run on a GPU, if one is found.
+To disable GPU offload, and thus force CPU-only execution, one can set:
+
+ export UNIFRAC_USE_GPU=N
+
+To check which code path is used (Unifrac will print it to standard output at runtime), set:
+
+ export UNIFRAC_GPU_INFO=Y
+
+Finally, Unifrac will only use one GPU at a time.
+If more than one GPU is present, one can select the one to use by setting:
+
+ export ACC_DEVICE_NUM=gpunum
+
+Note that there is no GPU support for MacOS.
+
# Examples of use
Below are a few light examples of different ways to use this library.
=====================================
debian/changelog
=====================================
@@ -1,5 +1,5 @@
-unifrac-tools (1.0.0-1) UNRELEASED; urgency=medium
+unifrac-tools (1.1.1-1) UNRELEASED; urgency=medium
* Initial release (Closes: #???)
- -- Andreas Tille <tille at debian.org> Fri, 18 Feb 2022 12:25:53 +0100
+ -- Andreas Tille <tille at debian.org> Tue, 26 Jul 2022 16:46:09 +0200
=====================================
debian/control
=====================================
@@ -12,7 +12,7 @@ Build-Depends: debhelper-compat (= 13),
libblas-dev,
liblapacke-dev,
chrpath
-Standards-Version: 4.6.0
+Standards-Version: 4.6.1
Vcs-Browser: https://salsa.debian.org/med-team/unifrac-tools
Vcs-Git: https://salsa.debian.org/med-team/unifrac-tools.git
Homepage: https://github.com/biocore/unifrac-tools
@@ -73,4 +73,3 @@ Description: high-performance phylogenetic diversity calculations (dev)
reference implementation.
.
This package contains the static library and header files.
-
=====================================
src/Makefile
=====================================
@@ -68,7 +68,7 @@ endif
ifneq (,$(findstring pgi,$(COMPILER)))
ifeq ($(PERFORMING_CONDA_BUILD),True)
- CPPFLAGS += -tp=px
+ CPPFLAGS += -tp=sandybridge
endif
else
ifeq ($(PERFORMING_CONDA_BUILD),True)
@@ -135,6 +135,7 @@ install: libssu.so ssu faithpd
mkdir -p ${PREFIX}/include/unifrac
cp task_parameters.hpp ${PREFIX}/include/unifrac/
cp api.hpp ${PREFIX}/include/unifrac/
+ cp status_enum.hpp ${PREFIX}/include/unifrac/
rapi_test: main
mkdir -p ~/.R
=====================================
src/api.cpp
=====================================
@@ -48,22 +48,25 @@
return err; \
}
-#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \
- std::string content = std::string(std::istreambuf_iterator<char>(ifs), \
- std::istreambuf_iterator<char>()); \
- su::BPTree tree = su::BPTree(content); \
- su::biom table = su::biom(biom_filename); \
- if(table.n_samples <= 0 | table.n_obs <= 0) { \
- return table_empty; \
- } \
- std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \
- if(bad_id != "") { \
- return table_and_tree_do_not_overlap; \
- } \
- std::unordered_set<std::string> to_keep(table.obs_ids.begin(), \
- table.obs_ids.end()); \
- su::BPTree tree_sheared = tree.shear(to_keep).collapse();
-
+#define SYNC_TREE_TABLE(tree, table) std::unordered_set<std::string> to_keep(table.obs_ids.begin(), \
+ table.obs_ids.end()); \
+ su::BPTree tree_sheared = tree.shear(to_keep).collapse();
+
+#define PARSE_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \
+ std::string content = std::string(std::istreambuf_iterator<char>(ifs), \
+ std::istreambuf_iterator<char>()); \
+ su::BPTree tree = su::BPTree(content); \
+ su::biom table = su::biom(biom_filename); \
+ if(table.n_samples <= 0 | table.n_obs <= 0) { \
+ return table_empty; \
+ } \
+ std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \
+ if(bad_id != "") { \
+ return table_and_tree_do_not_overlap; \
+ }
+
+#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) PARSE_TREE_TABLE(tree_filename, table_filename) \
+ SYNC_TREE_TABLE(tree, table)
using namespace su;
using namespace std;
@@ -115,7 +118,7 @@ void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) {
}
}
-void initialize_results_vec(r_vec* &result, biom& table){
+void initialize_results_vec(r_vec* &result, biom &table){
// Stores results for Faith PD
result = (r_vec*)malloc(sizeof(results_vec));
result->n_samples = table.n_samples;
@@ -146,6 +149,34 @@ void initialize_mat_no_biom(mat_t* &result, char** sample_ids, unsigned int n_sa
}
}
+inline compute_status is_fp64_method(const std::string &method_string, bool &fp64) {
+ if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) {
+ fp64 = false;
+ } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) {
+ fp64 = true;
+ } else {
+ return unknown_method;
+ }
+
+ return okay;
+}
+
+
+inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) {
+ if (format_string == "hdf5_fp32") {
+ fp64 = false;
+ } else if (format_string == "hdf5_fp64") {
+ fp64 = true;
+ } else if (format_string == "hdf5") {
+ return is_fp64_method(method_string, fp64);
+ } else {
+ return unknown_method;
+ }
+
+ return okay;
+}
+
+
template<class TReal, class TMat>
void initialize_mat_full_no_biom_T(TMat* &result, const char* const * sample_ids, unsigned int n_samples,
const char *mmap_dir /* if NULL or "", use malloc */) {
@@ -348,6 +379,37 @@ void set_tasks(std::vector<su::task_parameters> &tasks,
}
}
+compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree,
+ const char* unifrac_method, bool variance_adjust, double alpha,
+ bool bypass_tips, unsigned int nthreads, mat_t** result) {
+ SYNC_TREE_TABLE(tree, table)
+ SET_METHOD(unifrac_method, unknown_method)
+
+ const unsigned int stripe_stop = (table.n_samples + 1) / 2;
+ std::vector<double*> dm_stripes(stripe_stop);
+ std::vector<double*> dm_stripes_total(stripe_stop);
+
+ if(nthreads > dm_stripes.size()) {
+ fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size());
+ nthreads = dm_stripes.size();
+ }
+
+ std::vector<su::task_parameters> tasks(nthreads);
+ std::vector<std::thread> threads(nthreads);
+
+ set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
+ su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
+
+ initialize_mat(*result, table, true); // true -> is_upper_triangle
+ for(unsigned int tid = 0; tid < threads.size(); tid++) {
+ su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop);
+ }
+
+ destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0);
+
+ return okay;
+}
+
compute_status partial(const char* biom_filename, const char* tree_filename,
const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips,
unsigned int nthreads, unsigned int stripe_start, unsigned int stripe_stop,
@@ -404,40 +466,17 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file
compute_status one_off(const char* biom_filename, const char* tree_filename,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int nthreads, mat_t** result) {
-
CHECK_FILE(biom_filename, table_missing)
CHECK_FILE(tree_filename, tree_missing)
- SET_METHOD(unifrac_method, unknown_method)
- PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
+ PARSE_TREE_TABLE(tree_filename, table_filename)
- const unsigned int stripe_stop = (table.n_samples + 1) / 2;
- std::vector<double*> dm_stripes(stripe_stop);
- std::vector<double*> dm_stripes_total(stripe_stop);
-
- if(nthreads > dm_stripes.size()) {
- fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size());
- nthreads = dm_stripes.size();
- }
-
- std::vector<su::task_parameters> tasks(nthreads);
- std::vector<std::thread> threads(nthreads);
-
- set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
- su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
-
- initialize_mat(*result, table, true); // true -> is_upper_triangle
- for(unsigned int tid = 0; tid < threads.size(); tid++) {
- su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop);
- }
-
- destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0);
-
- return okay;
+ // condensed form
+ return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
}
// TMat mat_full_fp32_t
template<class TReal, class TMat>
-compute_status one_off_matrix_T(const char* biom_filename, const char* tree_filename,
+compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int nthreads,
const char *mmap_dir,
@@ -446,10 +485,8 @@ compute_status one_off_matrix_T(const char* biom_filename, const char* tree_file
if (mmap_dir[0]==0) mmap_dir = NULL; // easier to have a simple test going on
}
- CHECK_FILE(biom_filename, table_missing)
- CHECK_FILE(tree_filename, tree_missing)
SET_METHOD(unifrac_method, unknown_method)
- PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
+ SYNC_TREE_TABLE(tree, table)
const unsigned int stripe_stop = (table.n_samples + 1) / 2;
partial_mat_t *partial_mat = NULL;
@@ -472,7 +509,10 @@ compute_status one_off_matrix_T(const char* biom_filename, const char* tree_file
destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, stripe_stop);
}
- initialize_mat_full_no_biom_T<TReal,TMat>(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir);
+ // allow the caller to allocate the memory
+ if((*result) == NULL) {
+ initialize_mat_full_no_biom_T<TReal,TMat>(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir);
+ }
if (((*result)==NULL) || ((*result)->matrix==NULL) || ((*result)->sample_ids==NULL) ) {
fprintf(stderr, "Memory allocation error! (initialize_mat)\n");
@@ -498,7 +538,10 @@ compute_status one_off_matrix(const char* biom_filename, const char* tree_filena
bool bypass_tips, unsigned int nthreads,
const char *mmap_dir,
mat_full_fp64_t** result) {
- return one_off_matrix_T<double,mat_full_fp64_t>(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
+ CHECK_FILE(biom_filename, table_missing)
+ CHECK_FILE(tree_filename, tree_missing)
+ PARSE_TREE_TABLE(tree_filename, biom_filename)
+ return one_off_matrix_T<double,mat_full_fp64_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
}
compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_filename,
@@ -506,29 +549,89 @@ compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_f
bool bypass_tips, unsigned int nthreads,
const char *mmap_dir,
mat_full_fp32_t** result) {
- return one_off_matrix_T<float,mat_full_fp32_t>(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
+ CHECK_FILE(biom_filename, table_missing)
+ CHECK_FILE(tree_filename, tree_missing)
+ PARSE_TREE_TABLE(tree_filename, biom_filename)
+ return one_off_matrix_T<float,mat_full_fp32_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
}
-inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) {
- if (format_string == "hdf5_fp32") {
- fp64 = false;
- } else if (format_string == "hdf5_fp64") {
- fp64 = true;
- } else if (format_string == "hdf5") {
- if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) {
- fp64 = false;
- } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) {
- fp64 = true;
+compute_status one_off_inmem_matrix(su::biom &table, su::BPTree &tree,
+ const char* unifrac_method, bool variance_adjust, double alpha,
+ bool bypass_tips, unsigned int nthreads,
+ mat_full_fp64_t** result) {
+ // NULL -> mmap_dir, we're doing full in memory here
+ return one_off_matrix_T<double,mat_full_fp64_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result);
+}
+
+compute_status one_off_inmem_matrix_fp32(su::biom &table, su::BPTree &tree,
+ const char* unifrac_method, bool variance_adjust, double alpha,
+ bool bypass_tips, unsigned int nthreads,
+ mat_full_fp32_t** result) {
+ // NULL -> mmap_dir, we're doing full in memory here
+ return one_off_matrix_T<float,mat_full_fp32_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result);
+}
+
+compute_status one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data,
+ const char* unifrac_method, bool variance_adjust, double alpha,
+ bool bypass_tips, unsigned int nthreads, mat_full_fp64_t** result) {
+ bool fp64;
+ compute_status rc = is_fp64_method(unifrac_method, fp64);
+
+ if (rc == okay) {
+ if (!fp64) {
+ return invalid_method;
+ }
} else {
- return unknown_method;
+ return rc;
}
- } else {
- return unknown_method;
- }
- return okay;
+ su::biom table(table_data->obs_ids,
+ table_data->sample_ids,
+ table_data->indices,
+ table_data->indptr,
+ table_data->data,
+ table_data->n_obs,
+ table_data->n_samples,
+ table_data->nnz);
+
+ su::BPTree tree(tree_data->structure,
+ tree_data->lengths,
+ tree_data->names,
+ tree_data->n_parens);
+
+ return one_off_inmem_matrix(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
}
+compute_status one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data,
+ const char* unifrac_method, bool variance_adjust, double alpha,
+ bool bypass_tips, unsigned int nthreads, mat_full_fp32_t** result) {
+ bool fp64;
+ compute_status rc = is_fp64_method(unifrac_method, fp64);
+
+ if (rc == okay) {
+ if (fp64) {
+ return invalid_method;
+ }
+ } else {
+ return rc;
+ }
+
+ su::biom table(table_data->obs_ids,
+ table_data->sample_ids,
+ table_data->indices,
+ table_data->indptr,
+ table_data->data,
+ table_data->n_obs,
+ table_data->n_samples,
+ table_data->nnz);
+
+ su::BPTree tree(tree_data->structure,
+ tree_data->lengths,
+ tree_data->names,
+ tree_data->n_parens);
+
+ return one_off_inmem_matrix_fp32(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
+}
compute_status unifrac_to_file(const char* biom_filename, const char* tree_filename, const char* out_filename,
const char* unifrac_method, bool variance_adjust, double alpha,
@@ -540,7 +643,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen
if (rc==okay) {
if (fp64) {
- mat_full_fp64_t* result;
+ mat_full_fp64_t* result = NULL;
rc = one_off_matrix(biom_filename, tree_filename,
unifrac_method, variance_adjust, alpha,
bypass_tips, threads, mmap_dir,
@@ -554,7 +657,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen
if (iostatus!=write_okay) rc=output_error;
}
} else {
- mat_full_fp32_t* result;
+ mat_full_fp32_t* result = NULL;
rc = one_off_matrix_fp32(biom_filename, tree_filename,
unifrac_method, variance_adjust, alpha,
bypass_tips, threads, mmap_dir,
=====================================
src/api.hpp
=====================================
@@ -1,4 +1,5 @@
#include "task_parameters.hpp"
+#include "status_enum.hpp"
#ifdef __cplusplus
#include <vector>
@@ -14,10 +15,6 @@
#define PARTIAL_MAGIC_V2 0x088ABA02
-typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus;
-typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus;
-typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus;
-
/* a result matrix
*
* n_samples <uint> the number of samples.
@@ -122,14 +119,49 @@ typedef struct partial_dyn_mat {
char* filename;
} partial_dyn_mat_t;
+/* support structure to carry in biom table information
+ *
+ * obs_ids <char**> the observation IDs
+ * sample_ids <char**> the sample IDs
+ * indices <int32_t*> the indices of the data values
+ * indptr <int32_t*> the row offset of the data values
+ * data <double*> the actual matrix values
+ * n_obs <int> the number of observations, corresponding to length of obs_ids
+ * n_samples <int> the number of samples, corresponding to the length of sample_ids
+ * nnz <int> the number of nonzero values, corresponding to the length of data and indices
+ */
+typedef struct support_biom {
+ char** obs_ids;
+ char** sample_ids;
+ uint32_t* indices;
+ uint32_t* indptr;
+ double* data;
+ int n_obs;
+ int n_samples;
+ int nnz;
+} support_biom_t;
+
+/* support structure to carry in bptree information
+ *
+ * structure <bool*> the topology of the tree
+ * lengths <double*> the branch lengths of the tree
+ * names <char**> the names of the tips and internal nodes of hte tree
+ * n_parens <int> the length of the structure array
+ */
+typedef struct support_bptree {
+ bool* structure;
+ double* lengths;
+ char** names;
+ int n_parens;
+} support_bptree_t;
-void destroy_mat(mat_t** result);
-void destroy_mat_full_fp64(mat_full_fp64_t** result);
-void destroy_mat_full_fp32(mat_full_fp32_t** result);
-void destroy_partial_mat(partial_mat_t** result);
-void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
-void destroy_results_vec(r_vec** result);
+EXTERN void destroy_mat(mat_t** result);
+EXTERN void destroy_mat_full_fp64(mat_full_fp64_t** result);
+EXTERN void destroy_mat_full_fp32(mat_full_fp32_t** result);
+EXTERN void destroy_partial_mat(partial_mat_t** result);
+EXTERN void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
+EXTERN void destroy_results_vec(r_vec** result);
/* Compute UniFrac - condensed form
*
@@ -154,6 +186,49 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam
const char* unifrac_method, bool variance_adjust, double alpha,
bool bypass_tips, unsigned int threads, mat_t** result);
+
+/* Compute UniFrac - against in-memory objects returning full form matrix
+ *
+ * table <biom> a constructed BIOM object
+ * tree <BPTree> a constructed BPTree object
+ * unifrac_method <const char*> the requested unifrac method.
+ * variance_adjust <bool> whether to apply variance adjustment.
+ * alpha <double> GUniFrac alpha, only relevant if method == generalized.
+ * bypass_tips <bool> disregard tips, reduces compute by about 50%
+ * threads <uint> the number of threads to use.
+ * result <mat_full_fp64_t**> the resulting distance matrix in full form, this is initialized within the method so using **
+ *
+ * one_off_inmem returns the following error codes:
+ *
+ * okay : no problems encountered
+ * unknown_method : the requested method is unknown.
+ * table_empty : the table does not have any entries
+ */
+EXTERN ComputeStatus one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data,
+ const char* unifrac_method, bool variance_adjust, double alpha,
+ bool bypass_tips, unsigned int threads, mat_full_fp64_t** result);
+
+/* Compute UniFrac - against in-memory objects returning full form matrix, fp32
+ *
+ * table <biom> a constructed BIOM object
+ * tree <BPTree> a constructed BPTree object
+ * unifrac_method <const char*> the requested unifrac method.
+ * variance_adjust <bool> whether to apply variance adjustment.
+ * alpha <double> GUniFrac alpha, only relevant if method == generalized.
+ * bypass_tips <bool> disregard tips, reduces compute by about 50%
+ * threads <uint> the number of threads to use.
+ * result <mat_full_fp32_t**> the resulting distance matrix in full form, this is initialized within the method so using **
+ *
+ * one_off_inmem returns the following error codes:
+ *
+ * okay : no problems encountered
+ * unknown_method : the requested method is unknown.
+ * table_empty : the table does not have any entries
+ */
+EXTERN ComputeStatus one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data,
+ const char* unifrac_method, bool variance_adjust, double alpha,
+ bool bypass_tips, unsigned int threads, mat_full_fp32_t** result);
+
/* Compute UniFrac - matrix form
*
* biom_filename <const char*> the filename to the biom table.
=====================================
src/biom.cpp
=====================================
@@ -26,7 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices");
const std::string SAMPLE_DATA = std::string("/sample/matrix/data");
const std::string SAMPLE_IDS = std::string("/sample/ids");
-biom::biom(std::string filename) {
+biom::biom(std::string filename) : has_hdf5_backing(true) {
file = H5File(filename.c_str(), H5F_ACC_RDONLY);
/* establish the datasets */
@@ -55,9 +55,51 @@ biom::biom(std::string filename) {
obs_id_index = std::unordered_map<std::string, uint32_t>();
sample_id_index = std::unordered_map<std::string, uint32_t>();
- create_id_index(obs_ids, obs_id_index);
- create_id_index(sample_ids, sample_id_index);
+ #pragma omp parallel for schedule(static)
+ for(int i = 0; i < 3; i++) {
+ if(i == 0)
+ create_id_index(obs_ids, obs_id_index);
+ else if(i == 1)
+ create_id_index(sample_ids, sample_id_index);
+ else if(i == 2)
+ malloc_resident(n_obs);
+ }
+
+ uint32_t *current_indices = NULL;
+ double *current_data = NULL;
+ for(unsigned int i = 0; i < obs_ids.size(); i++) {
+ std::string id_ = obs_ids[i];
+ unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
+ obs_counts_resident[i] = n;
+ obs_indices_resident[i] = current_indices;
+ obs_data_resident[i] = current_data;
+ }
+ sample_counts = get_sample_counts();
+}
+
+biom::~biom() {
+ if(has_hdf5_backing) {
+ if(obs_indices_resident != NULL && obs_data_resident != NULL) {
+ for(unsigned int i = 0; i < n_obs; i++) {
+ if(obs_indices_resident[i] != NULL)
+ free(obs_indices_resident[i]);
+ if(obs_data_resident[i] != NULL)
+ free(obs_data_resident[i]);
+ }
+ }
+
+ if(obs_indices_resident != NULL)
+ free(obs_indices_resident);
+ if(obs_data_resident != NULL)
+ free(obs_data_resident);
+ if(obs_counts_resident != NULL)
+ free(obs_counts_resident);
+ }
+ // else, it is the responsibility of the entity constructing this object
+ // to clean itself up
+}
+void biom::malloc_resident(uint32_t n_obs) {
/* load obs sparse data */
obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs);
if(obs_indices_resident == NULL) {
@@ -77,30 +119,82 @@ biom::biom(std::string filename) {
sizeof(unsigned int) * n_obs, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
+}
- uint32_t *current_indices = NULL;
- double *current_data = NULL;
- for(unsigned int i = 0; i < obs_ids.size(); i++) {
- std::string id_ = obs_ids[i];
- unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
- obs_counts_resident[i] = n;
- obs_indices_resident[i] = current_indices;
- obs_data_resident[i] = current_data;
- }
- sample_counts = get_sample_counts();
+biom::biom() : has_hdf5_backing(false) {
+ n_obs = 0;
+ malloc_resident(0);
}
-biom::~biom() {
- for(unsigned int i = 0; i < n_obs; i++) {
- free(obs_indices_resident[i]);
- free(obs_data_resident[i]);
+// not using const on indices/indptr/data as the pointers are being borrowed
+biom::biom(char** obs_ids_in,
+ char** samp_ids_in,
+ uint32_t* indices,
+ uint32_t* indptr,
+ double* data,
+ const int n_obs,
+ const int n_samples,
+ const int nnz) : has_hdf5_backing(false) {
+
+ this->nnz = nnz;
+ this->n_samples = n_samples;
+ this->n_obs = n_obs;
+
+ sample_ids = std::vector<std::string>();
+ sample_ids.resize(n_samples);
+ obs_ids = std::vector<std::string>();
+ obs_ids.resize(n_obs);
+
+ #pragma omp parallel for schedule(static)
+ for(int x = 0; x < 2; x++) {
+ if(x == 0) {
+ for(int i = 0; i < n_obs; i++) {
+ obs_ids[i] = std::string(obs_ids_in[i]);
+ }
+ } else {
+ for(int i = 0; i < n_samples; i++) {
+ sample_ids[i] = std::string(samp_ids_in[i]);
+ }
+ }
}
- free(obs_indices_resident);
- free(obs_data_resident);
- free(obs_counts_resident);
+
+ /* define a mapping between an ID and its corresponding offset */
+ obs_id_index = std::unordered_map<std::string, uint32_t>();
+ sample_id_index = std::unordered_map<std::string, uint32_t>();
+
+ #pragma omp parallel for schedule(static)
+ for(int i = 0; i < 3; i++) {
+ if(i == 0)
+ create_id_index(obs_ids, obs_id_index);
+ else if(i == 1)
+ create_id_index(sample_ids, sample_id_index);
+ else if(i == 2)
+ malloc_resident(n_obs);
+ }
+
+ #pragma omp parallel for schedule(static)
+ for(unsigned int i = 0; i < n_obs; i++) {
+ int32_t start = indptr[i];
+ int32_t end = indptr[i + 1];
+ unsigned int count = end - start;
+
+ uint32_t* index_ptr = (indices + start);
+ double* data_ptr = (data + start);
+
+ obs_indices_resident[i] = index_ptr;
+ obs_data_resident[i] = data_ptr;
+ obs_counts_resident[i] = count;
+ }
+ sample_counts = get_sample_counts();
}
void biom::set_nnz() {
+ if(!has_hdf5_backing) {
+ fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
+ __FILE__, __LINE__);
+ exit(EXIT_FAILURE);
+ }
+
// should these be cached?
DataType dtype = obs_data.getDataType();
DataSpace dataspace = obs_data.getSpace();
@@ -111,6 +205,12 @@ void biom::set_nnz() {
}
void biom::load_ids(const char *path, std::vector<std::string> &ids) {
+ if(!has_hdf5_backing) {
+ fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
+ __FILE__, __LINE__);
+ exit(EXIT_FAILURE);
+ }
+
DataSet ds_ids = file.openDataSet(path);
DataType dtype = ds_ids.getDataType();
DataSpace dataspace = ds_ids.getSpace();
@@ -138,6 +238,12 @@ void biom::load_ids(const char *path, std::vector<std::string> &ids) {
}
void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
+ if(!has_hdf5_backing) {
+ fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
+ __FILE__, __LINE__);
+ exit(EXIT_FAILURE);
+ }
+
DataSet ds = file.openDataSet(path);
DataType dtype = ds.getDataType();
DataSpace dataspace = ds.getSpace();
@@ -159,7 +265,7 @@ void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
free(dataout);
}
-void biom::create_id_index(std::vector<std::string> &ids,
+void biom::create_id_index(const std::vector<std::string> &ids,
std::unordered_map<std::string, uint32_t> &map) {
uint32_t count = 0;
map.reserve(ids.size());
@@ -169,6 +275,12 @@ void biom::create_id_index(std::vector<std::string> &ids,
}
unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
+ if(!has_hdf5_backing) {
+ fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
+ __FILE__, __LINE__);
+ exit(EXIT_FAILURE);
+ }
+
uint32_t idx = obs_id_index.at(id);
uint32_t start = obs_indptr[idx];
uint32_t end = obs_indptr[idx + 1];
@@ -270,6 +382,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne
}
unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
+ if(!has_hdf5_backing) {
+ fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
+ __FILE__, __LINE__);
+ exit(EXIT_FAILURE);
+ }
+
uint32_t idx = sample_id_index.at(id);
uint32_t start = sample_indptr[idx];
uint32_t end = sample_indptr[idx + 1];
@@ -310,6 +428,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur
double* biom::get_sample_counts() {
double *sample_counts = (double*)calloc(sizeof(double), n_samples);
+
for(unsigned int i = 0; i < n_obs; i++) {
unsigned int count = obs_counts_resident[i];
uint32_t *indices = obs_indices_resident[i];
=====================================
src/biom.hpp
=====================================
@@ -21,12 +21,35 @@
namespace su {
class biom : public biom_interface {
public:
+ /* nullary constructor */
+ biom();
+
/* default constructor
*
* @param filename The path to the BIOM table to read
*/
biom(std::string filename);
+ /* constructor from compress sparse data
+ *
+ * @param obs_ids vector of observation identifiers
+ * @param samp_ids vector of sample identifiers
+ * @param index vector of index positions
+ * @param indptr vector of indptr positions
+ * @param data vector of observation counts
+ * @param n_obs number of observations
+ * @param n_samples number of samples
+ * @param nnz number of data points
+ */
+ biom(char** obs_ids,
+ char** samp_ids,
+ uint32_t* index,
+ uint32_t* indptr,
+ double* data,
+ const int n_obs,
+ const int n_samples,
+ const int nnz);
+
/* default destructor
*
* Temporary arrays are freed
@@ -55,8 +78,9 @@ namespace su {
*/
void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, double* out) const;
void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, float* out) const;
-
private:
+ bool has_hdf5_backing = false;
+
/* retain DataSet handles within the HDF5 file */
H5::DataSet obs_indices;
H5::DataSet sample_indices;
@@ -66,11 +90,14 @@ namespace su {
uint32_t **obs_indices_resident;
double **obs_data_resident;
unsigned int *obs_counts_resident;
+
+ void malloc_resident(uint32_t n_obs);
unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out);
unsigned int get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out);
double* get_sample_counts();
+
/* At construction, lookups mapping IDs -> index position within an
* axis are defined
*/
@@ -100,7 +127,7 @@ namespace su {
* @param ids A vector of IDs to index
* @param map A hash table to populate
*/
- void create_id_index(std::vector<std::string> &ids,
+ void create_id_index(const std::vector<std::string> &ids,
std::unordered_map<std::string, uint32_t> &map);
=====================================
src/status_enum.hpp
=====================================
@@ -0,0 +1,8 @@
+#ifndef _UNIFRAC_STATUS_H
+#define _UNIFRAC_STATUS_H
+
+typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error, invalid_method} ComputeStatus;
+typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus;
+typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus;
+
+#endif
=====================================
src/test_su.cpp
=====================================
@@ -153,9 +153,8 @@ void test_bptree_constructor_from_existing() {
//11101000
su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);");
su::BPTree tree = su::BPTree(existing.get_structure(), existing.lengths, existing.names);
-
+
unsigned int exp_nparens = 8;
-
std::vector<bool> exp_structure;
exp_structure.push_back(true);
exp_structure.push_back(true);
@@ -467,10 +466,7 @@ void test_biom_constructor() {
SUITE_END();
}
-void test_biom_get_obs_data() {
- SUITE_START("biom get obs data");
-
- su::biom table = su::biom("test.biom");
+void _exercise_get_obs_data(su::biom &table) {
double exp0[] = {0.0, 0.0, 1.0, 0.0, 0.0, 0.0};
std::vector<double> exp0_vec = _double_array_to_vector(exp0, 6);
double exp1[] = {5.0, 1.0, 0.0, 2.0, 3.0, 1.0};
@@ -506,9 +502,43 @@ void test_biom_get_obs_data() {
ASSERT(vec_almost_equal(obs_vec, exp4_vec));
free(out);
+}
+
+void test_biom_constructor_from_sparse() {
+ SUITE_START("biom from sparse constructor");
+ uint32_t index[] = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2};
+ uint32_t indptr[] = {0, 1, 6, 9, 13, 15};
+ double data[] = {1., 5., 1., 2., 3., 1., 1., 4., 2., 2., 1., 1., 1., 1., 1.};
+ char* obs_ids[] = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"};
+ char* samp_ids[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"};
+
+ su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data, 5, 6, 15);
+ _exercise_get_obs_data(table);
+
+ SUITE_END();
+}
+
+void test_biom_nullary() {
+ SUITE_START("biom nullary");
+ su::biom table = su::biom();
+ SUITE_END();
+}
+
+void test_bptree_nullary() {
+ SUITE_START("bptree nullary");
+ su::BPTree tree = su::BPTree();
+ SUITE_END();
+}
+
+void test_biom_get_obs_data() {
+ SUITE_START("biom get obs data");
+
+ su::biom table = su::biom("test.biom");
+ _exercise_get_obs_data(table);
SUITE_END();
}
+
void test_bptree_leftchild() {
SUITE_START("test bptree left child");
su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;");
@@ -1802,6 +1832,66 @@ void test_set_tasks() {
SUITE_END();
}
+void test_bptree_cstyle_constructor() {
+ SUITE_START("test bptree constructor from c-style data");
+ //01234567
+ //11101000
+ bool structure[] = {true, true, true, false, true, false, false, false};
+ double lengths[] = {0, 0, 1, 0, 2, 0, 0, 0};
+ char* names[] = {"", "c", "123:foo; bar", "", "b", "", "", ""};
+ su::BPTree tree = su::BPTree(structure, lengths, names, 8);
+
+ unsigned int exp_nparens = 8;
+
+ std::vector<bool> exp_structure;
+ exp_structure.push_back(true);
+ exp_structure.push_back(true);
+ exp_structure.push_back(true);
+ exp_structure.push_back(false);
+ exp_structure.push_back(true);
+ exp_structure.push_back(false);
+ exp_structure.push_back(false);
+ exp_structure.push_back(false);
+
+ std::vector<uint32_t> exp_openclose;
+ exp_openclose.push_back(7);
+ exp_openclose.push_back(6);
+ exp_openclose.push_back(3);
+ exp_openclose.push_back(2);
+ exp_openclose.push_back(5);
+ exp_openclose.push_back(4);
+ exp_openclose.push_back(1);
+ exp_openclose.push_back(0);
+
+ std::vector<std::string> exp_names;
+ exp_names.push_back(std::string());
+ exp_names.push_back(std::string("c"));
+ exp_names.push_back(std::string("123:foo; bar"));
+ exp_names.push_back(std::string());
+ exp_names.push_back(std::string("b"));
+ exp_names.push_back(std::string());
+ exp_names.push_back(std::string());
+ exp_names.push_back(std::string());
+
+ std::vector<double> exp_lengths;
+ exp_lengths.push_back(0.0);
+ exp_lengths.push_back(0.0);
+ exp_lengths.push_back(1.0);
+ exp_lengths.push_back(0.0);
+ exp_lengths.push_back(2.0);
+ exp_lengths.push_back(0.0);
+ exp_lengths.push_back(0.0);
+ exp_lengths.push_back(0.0);
+
+ ASSERT(tree.nparens == exp_nparens);
+ ASSERT(tree.get_structure() == exp_structure);
+ ASSERT(tree.get_openclose() == exp_openclose);
+ ASSERT(tree.lengths == exp_lengths);
+ ASSERT(tree.names == exp_names);
+
+ SUITE_END();
+}
+
void test_bptree_constructor_newline_bug() {
SUITE_START("test bptree constructor newline bug");
su::BPTree tree = su::BPTree("((362be41f31fd26be95ae43a8769b91c0:0.116350803,(a16679d5a10caa9753f171977552d920:0.105836235,((a7acc2abb505c3ee177a12e514d3b994:0.008268754,(4e22aa3508b98813f52e1a12ffdb74ad:0.03144211,8139c4ac825dae48454fb4800fb87896:0.043622957)0.923:0.046588301)0.997:0.120902074,((2d3df7387323e2edcbbfcb6e56a02710:0.031543994,3f6752aabcc291b67a063fb6492fd107:0.091571442)0.759:0.016335166,((d599ebe277afb0dfd4ad3c2176afc50e:5e-09,84d0affc7243c7d6261f3a7d680b873f:0.010245188)0.883:0.048993011,51121722488d0c3da1388d1b117cd239:0.119447926)0.763:0.035660204)0.921:0.058191474)0.776:0.02854575)0.657:0.052060833)0.658:0.032547569,(99647b51f775c8ddde8ed36a7d60dbcd:0.173334268,(f18a9c8112372e2916a66a9778f3741b:0.194813398,(5833416522de0cca717a1abf720079ac:5e-09,(2bf1067d2cd4f09671e3ebe5500205ca:0.031692682,(b32621bcd86cb99e846d8f6fee7c9ab8:0.031330707,1016319c25196d73bdb3096d86a9df2f:5e-09)0.058:0.01028612)0.849:0.010284866)0.791:0.041353384)0.922:0.109470534):0.022169824000000005)root;\n\n");
@@ -1818,6 +1908,8 @@ int main(int argc, char** argv) {
test_bptree_constructor_edgecases();
test_bptree_constructor_quoted_comma();
test_bptree_constructor_quoted_parens();
+ test_bptree_cstyle_constructor();
+ test_bptree_nullary();
test_bptree_postorder();
test_bptree_preorder();
test_bptree_parent();
@@ -1832,6 +1924,8 @@ int main(int argc, char** argv) {
test_bptree_collapse_edge();
test_biom_constructor();
+ test_biom_constructor_from_sparse();
+ test_biom_nullary();
test_biom_get_obs_data();
test_propstack_constructor();
=====================================
src/tree.cpp
=====================================
@@ -4,6 +4,8 @@
using namespace su;
+BPTree::BPTree() { }
+
BPTree::BPTree(std::string newick) {
openclose = std::vector<uint32_t>();
lengths = std::vector<double>();
@@ -50,6 +52,36 @@ BPTree::BPTree(std::vector<bool> input_structure, std::vector<double> input_leng
index_and_cache();
}
+BPTree::BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens) {
+ structure = std::vector<bool>();
+ lengths = std::vector<double>();
+ names = std::vector<std::string>();
+
+ structure.resize(n_parens);
+ lengths.resize(n_parens);
+ names.resize(n_parens);
+
+ nparens = n_parens;
+
+ //#pragma omp parallel for schedule(static)
+ for(int i = 0; i < n_parens; i++) {
+ structure[i] = input_structure[i];
+ lengths[i] = input_lengths[i];
+ names[i] = std::string(input_names[i]);
+ }
+
+ openclose = std::vector<uint32_t>();
+ select_0_index = std::vector<uint32_t>();
+ select_1_index = std::vector<uint32_t>();
+ openclose.resize(nparens);
+ select_0_index.resize(nparens / 2);
+ select_1_index.resize(nparens / 2);
+ excess.resize(nparens);
+
+ structure_to_openclose();
+ index_and_cache();
+}
+
BPTree BPTree::mask(std::vector<bool> topology_mask, std::vector<double> in_lengths) {
std::vector<bool> new_structure = std::vector<bool>();
@@ -197,7 +229,7 @@ void BPTree::index_and_cache() {
}
}
-uint32_t BPTree::postorderselect(uint32_t k) const {
+uint32_t BPTree::postorderselect(uint32_t k) const {
return open(select_0_index[k]);
}
=====================================
src/tree.hpp
=====================================
@@ -19,6 +19,9 @@
namespace su {
class BPTree {
public:
+ /* nullary constructor */
+ BPTree();
+
/* tracked attributes */
std::vector<double> lengths;
std::vector<std::string> names;
@@ -41,6 +44,15 @@ namespace su {
BPTree(std::vector<bool> input_structure, std::vector<double> input_lengths, std::vector<std::string> input_names);
~BPTree();
+ /* constructor from a defined topology using c-types
+ *
+ * @param input_structure A boolean array defining the topology
+ * @param input_lengths A double array of the branch lengths
+ * @param input_names A char* array of the names
+ * @param n_parens The length of the topology
+ */
+ BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens);
+
/* postorder tree traversal
*
* Get the index position of the ith node in a postorder tree
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/cdd1e39be4b42ff6c57e5c3a01509ad48837c198...538b778ae146ea76a3e6f38306dad3ef61164125
--
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/cdd1e39be4b42ff6c57e5c3a01509ad48837c198...538b778ae146ea76a3e6f38306dad3ef61164125
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220726/1ecb8562/attachment-0001.htm>
More information about the debian-med-commit
mailing list