[med-svn] [Git][med-team/unifrac-tools][upstream] New upstream version 1.1.1

Andreas Tille (@tille) gitlab at salsa.debian.org
Tue Jul 26 15:50:47 BST 2022



Andreas Tille pushed to branch upstream at Debian Med / unifrac-tools


Commits:
a6b633a7 by Andreas Tille at 2022-07-26T16:46:09+02:00
New upstream version 1.1.1
- - - - -


11 changed files:

- .github/workflows/main.yml
- README.md
- src/Makefile
- src/api.cpp
- src/api.hpp
- src/biom.cpp
- src/biom.hpp
- + src/status_enum.hpp
- src/test_su.cpp
- src/tree.cpp
- src/tree.hpp


Changes:

=====================================
.github/workflows/main.yml
=====================================
@@ -55,6 +55,7 @@ jobs:
           ./scripts/install_hpc_sdk.sh
           source setup_nv_h5.sh
         fi
+        export PERFORMING_CONDA_BUILD=True
         make api && \
         make main && \
         make install && \


=====================================
README.md
=====================================
@@ -36,6 +36,10 @@ Compilation has been performed on both LLVM 10.0.0 (OS X >= 10.12) or GCC 9 (Cen
 
 Installation time should be a few minutes at most.
 
+## Install (example)
+
+An example of installing UniFrac, and using it with CPUs as well as GPUs, can be be found on [Google Colabs](https://colab.research.google.com/drive/1yL0MdF1zNAkPg1_yESI1iABUH4ZHNGwj?usp=sharing).
+
 ## Install (bioconda)
 
 This binaries can be installed via a combination of `conda-forge` and `bioconda`:
@@ -75,6 +79,34 @@ scripts/install_hpc_sdk.sh
 source setup_nv_h5.sh
 ```
 
+# Environment considerations
+
+## Multi-core support
+
+Unifrac uses OpenMP to make use of multiple CPU cores.
+By default, Unifrac will use all the cores that are available on the system.
+To restrict the number of cores used, set:
+
+    export OMP_NUM_THREADS=nthreads
+
+## GPU support
+
+On Linux platforms, Unifrac will run on a GPU, if one is found. 
+To disable GPU offload, and thus force CPU-only execution, one can set:
+
+    export UNIFRAC_USE_GPU=N
+
+To check which code path is used (Unifrac will print it to standard output at runtime), set:
+
+    export UNIFRAC_GPU_INFO=Y
+
+Finally, Unifrac will only use one GPU at a time. 
+If more than one GPU is present, one can select the one to use by setting:
+
+    export ACC_DEVICE_NUM=gpunum
+
+Note that there is no GPU support for MacOS.
+
 # Examples of use
 
 Below are a few light examples of different ways to use this library.


=====================================
src/Makefile
=====================================
@@ -68,7 +68,7 @@ endif
 
 ifneq (,$(findstring pgi,$(COMPILER)))
 	ifeq ($(PERFORMING_CONDA_BUILD),True)
-		CPPFLAGS += -tp=px
+		CPPFLAGS += -tp=sandybridge
 	endif
 else
 	ifeq ($(PERFORMING_CONDA_BUILD),True)
@@ -135,6 +135,7 @@ install: libssu.so ssu faithpd
 	mkdir -p ${PREFIX}/include/unifrac
 	cp task_parameters.hpp ${PREFIX}/include/unifrac/
 	cp api.hpp ${PREFIX}/include/unifrac/
+	cp status_enum.hpp ${PREFIX}/include/unifrac/
 
 rapi_test: main
 	mkdir -p ~/.R


=====================================
src/api.cpp
=====================================
@@ -48,22 +48,25 @@
                                               return err;                                                      \
                                           }
 
-#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename);                                        \
-                                                             std::string content = std::string(std::istreambuf_iterator<char>(ifs),   \
-                                                                                               std::istreambuf_iterator<char>());     \
-                                                             su::BPTree tree = su::BPTree(content);                                   \
-                                                             su::biom table = su::biom(biom_filename);                                \
-                                                             if(table.n_samples <= 0 | table.n_obs <= 0) {                            \
-                                                                 return table_empty;                                                  \
-                                                             }                                                                        \
-                                                             std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \
-                                                             if(bad_id != "") {                                                       \
-                                                                 return table_and_tree_do_not_overlap;                                \
-                                                             }                                                                        \
-                                                             std::unordered_set<std::string> to_keep(table.obs_ids.begin(),           \
-                                                                                                     table.obs_ids.end());            \
-                                                             su::BPTree tree_sheared = tree.shear(to_keep).collapse();
-
+#define SYNC_TREE_TABLE(tree, table) std::unordered_set<std::string> to_keep(table.obs_ids.begin(),           \
+                                                                             table.obs_ids.end());            \
+                                     su::BPTree tree_sheared = tree.shear(to_keep).collapse();
+
+#define PARSE_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename);                                        \
+                                                        std::string content = std::string(std::istreambuf_iterator<char>(ifs),   \
+                                                                                          std::istreambuf_iterator<char>());     \
+                                                        su::BPTree tree = su::BPTree(content);                                   \
+                                                        su::biom table = su::biom(biom_filename);                                \
+                                                        if(table.n_samples <= 0 | table.n_obs <= 0) {                            \
+                                                            return table_empty;                                                  \
+                                                        }                                                                        \
+                                                        std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \
+                                                        if(bad_id != "") {                                                       \
+                                                            return table_and_tree_do_not_overlap;                                \
+                                                        }   
+
+#define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) PARSE_TREE_TABLE(tree_filename, table_filename) \
+                                                             SYNC_TREE_TABLE(tree, table)
 
 using namespace su;
 using namespace std;
@@ -115,7 +118,7 @@ void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) {
     }
 }
 
-void initialize_results_vec(r_vec* &result, biom& table){
+void initialize_results_vec(r_vec* &result, biom &table){
     // Stores results for Faith PD
     result = (r_vec*)malloc(sizeof(results_vec));
     result->n_samples = table.n_samples;
@@ -146,6 +149,34 @@ void initialize_mat_no_biom(mat_t* &result, char** sample_ids, unsigned int n_sa
     }
 }
 
+inline compute_status is_fp64_method(const std::string &method_string, bool &fp64) {
+    if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) {
+        fp64 = false;
+    } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) {
+       fp64 = true;
+    } else {
+        return unknown_method;
+    }
+
+    return okay;
+}
+
+
+inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) {
+  if (format_string == "hdf5_fp32") {
+    fp64 = false;
+  } else if (format_string == "hdf5_fp64") {
+    fp64 = true;
+  } else if (format_string == "hdf5") {
+    return is_fp64_method(method_string, fp64);
+  } else {
+    return unknown_method; 
+  }
+
+  return okay;
+}
+
+
 template<class TReal, class TMat>
 void initialize_mat_full_no_biom_T(TMat* &result, const char* const * sample_ids, unsigned int n_samples, 
                                    const char *mmap_dir /* if NULL or "", use malloc */) {
@@ -348,6 +379,37 @@ void set_tasks(std::vector<su::task_parameters> &tasks,
     }
 }
 
+compute_status one_off_inmem_cpp(su::biom &table, su::BPTree &tree,
+                             const char* unifrac_method, bool variance_adjust, double alpha,
+                             bool bypass_tips, unsigned int nthreads, mat_t** result) {
+    SYNC_TREE_TABLE(tree, table)
+    SET_METHOD(unifrac_method, unknown_method)
+
+    const unsigned int stripe_stop = (table.n_samples + 1) / 2;
+    std::vector<double*> dm_stripes(stripe_stop);
+    std::vector<double*> dm_stripes_total(stripe_stop);
+
+    if(nthreads > dm_stripes.size()) {
+        fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size());
+        nthreads = dm_stripes.size();
+    }
+
+    std::vector<su::task_parameters> tasks(nthreads);
+    std::vector<std::thread> threads(nthreads);
+
+    set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
+    su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
+
+    initialize_mat(*result, table, true);  // true -> is_upper_triangle
+    for(unsigned int tid = 0; tid < threads.size(); tid++) {
+        su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop);
+    }
+
+    destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0);
+
+    return okay;
+}
+
 compute_status partial(const char* biom_filename, const char* tree_filename,
                        const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips,
                        unsigned int nthreads, unsigned int stripe_start, unsigned int stripe_stop,
@@ -404,40 +466,17 @@ compute_status faith_pd_one_off(const char* biom_filename, const char* tree_file
 compute_status one_off(const char* biom_filename, const char* tree_filename,
                        const char* unifrac_method, bool variance_adjust, double alpha,
                        bool bypass_tips, unsigned int nthreads, mat_t** result) {
-
     CHECK_FILE(biom_filename, table_missing)
     CHECK_FILE(tree_filename, tree_missing)
-    SET_METHOD(unifrac_method, unknown_method)
-    PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
+    PARSE_TREE_TABLE(tree_filename, table_filename)
 
-    const unsigned int stripe_stop = (table.n_samples + 1) / 2;
-    std::vector<double*> dm_stripes(stripe_stop);
-    std::vector<double*> dm_stripes_total(stripe_stop);
-
-    if(nthreads > dm_stripes.size()) {
-        fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n", dm_stripes.size());
-        nthreads = dm_stripes.size();
-    }
-
-    std::vector<su::task_parameters> tasks(nthreads);
-    std::vector<std::thread> threads(nthreads);
-
-    set_tasks(tasks, alpha, table.n_samples, 0, stripe_stop, bypass_tips, nthreads);
-    su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks);
-
-    initialize_mat(*result, table, true);  // true -> is_upper_triangle
-    for(unsigned int tid = 0; tid < threads.size(); tid++) {
-        su::stripes_to_condensed_form(dm_stripes,table.n_samples,(*result)->condensed_form,tasks[tid].start,tasks[tid].stop);
-    }
-
-    destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0);
-
-    return okay;
+    // condensed form
+    return one_off_inmem_cpp(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
 }
 
 // TMat mat_full_fp32_t
 template<class TReal, class TMat>
-compute_status one_off_matrix_T(const char* biom_filename, const char* tree_filename,
+compute_status one_off_matrix_T(su::biom &table, su::BPTree &tree,
                                 const char* unifrac_method, bool variance_adjust, double alpha,
                                 bool bypass_tips, unsigned int nthreads,
                                 const char *mmap_dir,  
@@ -446,10 +485,8 @@ compute_status one_off_matrix_T(const char* biom_filename, const char* tree_file
      if (mmap_dir[0]==0) mmap_dir = NULL; // easier to have a simple test going on
     }
 
-    CHECK_FILE(biom_filename, table_missing)
-    CHECK_FILE(tree_filename, tree_missing)
     SET_METHOD(unifrac_method, unknown_method)
-    PARSE_SYNC_TREE_TABLE(tree_filename, table_filename)
+    SYNC_TREE_TABLE(tree, table)
 
     const unsigned int stripe_stop = (table.n_samples + 1) / 2;
     partial_mat_t *partial_mat = NULL;
@@ -472,7 +509,10 @@ compute_status one_off_matrix_T(const char* biom_filename, const char* tree_file
       destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, stripe_stop);
     }
 
-    initialize_mat_full_no_biom_T<TReal,TMat>(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir);
+    // allow the caller to allocate the memory
+    if((*result) == NULL) {
+        initialize_mat_full_no_biom_T<TReal,TMat>(*result, partial_mat->sample_ids, partial_mat->n_samples,mmap_dir);
+    }
 
     if (((*result)==NULL) || ((*result)->matrix==NULL) || ((*result)->sample_ids==NULL) ) {
         fprintf(stderr, "Memory allocation error! (initialize_mat)\n");
@@ -498,7 +538,10 @@ compute_status one_off_matrix(const char* biom_filename, const char* tree_filena
                               bool bypass_tips, unsigned int nthreads,
                               const char *mmap_dir,
                               mat_full_fp64_t** result) {
- return one_off_matrix_T<double,mat_full_fp64_t>(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
+    CHECK_FILE(biom_filename, table_missing)
+    CHECK_FILE(tree_filename, tree_missing)
+    PARSE_TREE_TABLE(tree_filename, biom_filename)
+    return one_off_matrix_T<double,mat_full_fp64_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
 }
 
 compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_filename,
@@ -506,29 +549,89 @@ compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_f
                                    bool bypass_tips, unsigned int nthreads,
                                    const char *mmap_dir,
                                    mat_full_fp32_t** result) {
- return one_off_matrix_T<float,mat_full_fp32_t>(biom_filename,tree_filename,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
+    CHECK_FILE(biom_filename, table_missing)
+    CHECK_FILE(tree_filename, tree_missing)
+    PARSE_TREE_TABLE(tree_filename, biom_filename)
+    return one_off_matrix_T<float,mat_full_fp32_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
 }
 
-inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) {
-  if (format_string == "hdf5_fp32") {
-    fp64 = false;
-  } else if (format_string == "hdf5_fp64") {
-    fp64 = true;
-  } else if (format_string == "hdf5") {
-    if ((method_string=="unweighted_fp32") || (method_string=="weighted_normalized_fp32") || (method_string=="weighted_unnormalized_fp32") || (method_string=="generalized_fp32")) {
-       fp64 = false;
-    } else if ((method_string=="unweighted") || (method_string=="weighted_normalized") || (method_string=="weighted_unnormalized") || (method_string=="generalized")) {
-       fp64 = true;
+compute_status one_off_inmem_matrix(su::biom &table, su::BPTree &tree,
+                                    const char* unifrac_method, bool variance_adjust, double alpha,
+                                    bool bypass_tips, unsigned int nthreads,
+                                    mat_full_fp64_t** result) {
+    // NULL -> mmap_dir, we're doing full in memory here
+    return one_off_matrix_T<double,mat_full_fp64_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result);
+}
+
+compute_status one_off_inmem_matrix_fp32(su::biom &table, su::BPTree &tree,
+                                         const char* unifrac_method, bool variance_adjust, double alpha,
+                                         bool bypass_tips, unsigned int nthreads,
+                                         mat_full_fp32_t** result) {
+    // NULL -> mmap_dir, we're doing full in memory here
+    return one_off_matrix_T<float,mat_full_fp32_t>(table,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,NULL,result);
+}
+
+compute_status one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data,
+                             const char* unifrac_method, bool variance_adjust, double alpha,
+                             bool bypass_tips, unsigned int nthreads, mat_full_fp64_t** result) {
+    bool fp64;
+    compute_status rc = is_fp64_method(unifrac_method, fp64);
+
+    if (rc == okay) {
+        if (!fp64) {
+            return invalid_method;
+        }
     } else {
-      return unknown_method;
+        return rc;
     }
-  } else {
-    return unknown_method; 
-  }
 
-  return okay;
+    su::biom table(table_data->obs_ids, 
+                   table_data->sample_ids, 
+                   table_data->indices,
+                   table_data->indptr,
+                   table_data->data,
+                   table_data->n_obs,
+                   table_data->n_samples,
+                   table_data->nnz);
+
+    su::BPTree tree(tree_data->structure,
+                    tree_data->lengths,
+                    tree_data->names,
+                    tree_data->n_parens);
+
+    return one_off_inmem_matrix(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
 }
 
+compute_status one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data,
+                                  const char* unifrac_method, bool variance_adjust, double alpha,
+                                  bool bypass_tips, unsigned int nthreads, mat_full_fp32_t** result) {
+    bool fp64;
+    compute_status rc = is_fp64_method(unifrac_method, fp64);
+
+    if (rc == okay) {
+        if (fp64) {
+            return invalid_method;
+        }
+    } else {
+        return rc;
+    }
+
+    su::biom table(table_data->obs_ids, 
+                   table_data->sample_ids, 
+                   table_data->indices,
+                   table_data->indptr,
+                   table_data->data,
+                   table_data->n_obs,
+                   table_data->n_samples,
+                   table_data->nnz);
+
+    su::BPTree tree(tree_data->structure,
+                    tree_data->lengths,
+                    tree_data->names,
+                    tree_data->n_parens);
+
+    return one_off_inmem_matrix_fp32(table, tree, unifrac_method, variance_adjust, alpha, bypass_tips, nthreads, result);
+}
 
 compute_status unifrac_to_file(const char* biom_filename, const char* tree_filename, const char* out_filename,
                                const char* unifrac_method, bool variance_adjust, double alpha,
@@ -540,7 +643,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen
 
     if (rc==okay) {
       if (fp64) {
-        mat_full_fp64_t* result;
+        mat_full_fp64_t* result = NULL;
         rc = one_off_matrix(biom_filename, tree_filename,
                             unifrac_method, variance_adjust, alpha,
                             bypass_tips, threads, mmap_dir,
@@ -554,7 +657,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen
           if (iostatus!=write_okay) rc=output_error;
         }
       } else {
-        mat_full_fp32_t* result;
+        mat_full_fp32_t* result = NULL;
         rc = one_off_matrix_fp32(biom_filename, tree_filename,
                                  unifrac_method, variance_adjust, alpha,
                                  bypass_tips, threads, mmap_dir,


=====================================
src/api.hpp
=====================================
@@ -1,4 +1,5 @@
 #include "task_parameters.hpp"
+#include "status_enum.hpp"
 
 #ifdef __cplusplus
 #include <vector>
@@ -14,10 +15,6 @@
 #define PARTIAL_MAGIC_V2 0x088ABA02
 
 
-typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus;
-typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus;
-typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus;
-
 /* a result matrix
  *
  * n_samples <uint> the number of samples.
@@ -122,14 +119,49 @@ typedef struct partial_dyn_mat {
     char* filename;
 } partial_dyn_mat_t;
 
+/* support structure to carry in biom table information 
+ *
+ * obs_ids <char**> the observation IDs
+ * sample_ids <char**> the sample IDs
+ * indices <int32_t*> the indices of the data values
+ * indptr <int32_t*> the row offset of the data values
+ * data <double*> the actual matrix values
+ * n_obs <int> the number of observations, corresponding to length of obs_ids
+ * n_samples <int> the number of samples, corresponding to the length of sample_ids
+ * nnz <int> the number of nonzero values, corresponding to the length of data and indices
+ */
+typedef struct support_biom {
+    char** obs_ids;
+    char** sample_ids;
+    uint32_t* indices;
+    uint32_t* indptr;
+    double* data;
+    int n_obs;
+    int n_samples;
+    int nnz;
+} support_biom_t;
+
+/* support structure to carry in bptree information
+ *
+ * structure <bool*> the topology of the tree
+ * lengths <double*> the branch lengths of the tree
+ * names <char**> the names of the tips and internal nodes of hte tree
+ * n_parens <int> the length of the structure array
+ */
+typedef struct support_bptree {
+    bool* structure;
+    double* lengths;
+    char** names;
+    int n_parens;
+} support_bptree_t;
 
 
-void destroy_mat(mat_t** result);
-void destroy_mat_full_fp64(mat_full_fp64_t** result);
-void destroy_mat_full_fp32(mat_full_fp32_t** result);
-void destroy_partial_mat(partial_mat_t** result);
-void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
-void destroy_results_vec(r_vec** result);
+EXTERN void destroy_mat(mat_t** result);
+EXTERN void destroy_mat_full_fp64(mat_full_fp64_t** result);
+EXTERN void destroy_mat_full_fp32(mat_full_fp32_t** result);
+EXTERN void destroy_partial_mat(partial_mat_t** result);
+EXTERN void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
+EXTERN void destroy_results_vec(r_vec** result);
 
 /* Compute UniFrac - condensed form
  *
@@ -154,6 +186,49 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam
                              const char* unifrac_method, bool variance_adjust, double alpha,
                              bool bypass_tips, unsigned int threads, mat_t** result);
 
+
+/* Compute UniFrac - against in-memory objects returning full form matrix
+ *
+ * table <biom> a constructed BIOM object
+ * tree <BPTree> a constructed BPTree object
+ * unifrac_method <const char*> the requested unifrac method.
+ * variance_adjust <bool> whether to apply variance adjustment.
+ * alpha <double> GUniFrac alpha, only relevant if method == generalized.
+ * bypass_tips <bool> disregard tips, reduces compute by about 50%
+ * threads <uint> the number of threads to use.
+ * result <mat_full_fp64_t**> the resulting distance matrix in full form, this is initialized within the method so using **
+ *
+ * one_off_inmem returns the following error codes:
+ *
+ * okay           : no problems encountered
+ * unknown_method : the requested method is unknown.
+ * table_empty    : the table does not have any entries
+ */
+EXTERN ComputeStatus one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data,
+                                   const char* unifrac_method, bool variance_adjust, double alpha,
+                                   bool bypass_tips, unsigned int threads, mat_full_fp64_t** result);
+
+/* Compute UniFrac - against in-memory objects returning full form matrix, fp32
+ *
+ * table <biom> a constructed BIOM object
+ * tree <BPTree> a constructed BPTree object
+ * unifrac_method <const char*> the requested unifrac method.
+ * variance_adjust <bool> whether to apply variance adjustment.
+ * alpha <double> GUniFrac alpha, only relevant if method == generalized.
+ * bypass_tips <bool> disregard tips, reduces compute by about 50%
+ * threads <uint> the number of threads to use.
+ * result <mat_full_fp32_t**> the resulting distance matrix in full form, this is initialized within the method so using **
+ *
+ * one_off_inmem returns the following error codes:
+ *
+ * okay           : no problems encountered
+ * unknown_method : the requested method is unknown.
+ * table_empty    : the table does not have any entries
+ */
+EXTERN ComputeStatus one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data,
+                                        const char* unifrac_method, bool variance_adjust, double alpha,
+                                        bool bypass_tips, unsigned int threads, mat_full_fp32_t** result);
+
 /* Compute UniFrac - matrix form
  *
  * biom_filename <const char*> the filename to the biom table.


=====================================
src/biom.cpp
=====================================
@@ -26,7 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices");
 const std::string SAMPLE_DATA = std::string("/sample/matrix/data");
 const std::string SAMPLE_IDS = std::string("/sample/ids");
 
-biom::biom(std::string filename) {
+biom::biom(std::string filename) : has_hdf5_backing(true) {
     file = H5File(filename.c_str(), H5F_ACC_RDONLY);
 
     /* establish the datasets */
@@ -55,9 +55,51 @@ biom::biom(std::string filename) {
     obs_id_index = std::unordered_map<std::string, uint32_t>();
     sample_id_index = std::unordered_map<std::string, uint32_t>();
 
-    create_id_index(obs_ids, obs_id_index);
-    create_id_index(sample_ids, sample_id_index);
+    #pragma omp parallel for schedule(static)
+    for(int i = 0; i < 3; i++) {
+        if(i == 0)
+            create_id_index(obs_ids, obs_id_index);
+        else if(i == 1)
+            create_id_index(sample_ids, sample_id_index);
+        else if(i == 2)
+            malloc_resident(n_obs);
+    }
+
+    uint32_t *current_indices = NULL;
+    double *current_data = NULL;
+    for(unsigned int i = 0; i < obs_ids.size(); i++)  {
+        std::string id_ = obs_ids[i];
+        unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
+        obs_counts_resident[i] = n;
+        obs_indices_resident[i] = current_indices;
+        obs_data_resident[i] = current_data;
+    }
+    sample_counts = get_sample_counts();
+}
+
+biom::~biom() {
+    if(has_hdf5_backing) {
+        if(obs_indices_resident != NULL && obs_data_resident != NULL) {
+            for(unsigned int i = 0; i < n_obs; i++) {
+                if(obs_indices_resident[i] != NULL)
+                    free(obs_indices_resident[i]);
+                if(obs_data_resident[i] != NULL)
+                    free(obs_data_resident[i]);
+            }
+        }
+        
+        if(obs_indices_resident != NULL)
+            free(obs_indices_resident);
+        if(obs_data_resident != NULL)
+            free(obs_data_resident);
+        if(obs_counts_resident != NULL)
+            free(obs_counts_resident);
+    } 
+    // else, it is the responsibility of the entity constructing this object
+    // to clean itself up
+}
 
+void biom::malloc_resident(uint32_t n_obs) { 
     /* load obs sparse data */
     obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs);
     if(obs_indices_resident == NULL) {
@@ -77,30 +119,82 @@ biom::biom(std::string filename) {
                 sizeof(unsigned int) * n_obs, __FILE__, __LINE__);
         exit(EXIT_FAILURE);
     }
+}
 
-    uint32_t *current_indices = NULL;
-    double *current_data = NULL;
-    for(unsigned int i = 0; i < obs_ids.size(); i++)  {
-        std::string id_ = obs_ids[i];
-        unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
-        obs_counts_resident[i] = n;
-        obs_indices_resident[i] = current_indices;
-        obs_data_resident[i] = current_data;
-    }
-    sample_counts = get_sample_counts();
+biom::biom() : has_hdf5_backing(false) { 
+    n_obs = 0;
+    malloc_resident(0);
 }
 
-biom::~biom() {
-    for(unsigned int i = 0; i < n_obs; i++) {
-        free(obs_indices_resident[i]);
-        free(obs_data_resident[i]);
+// not using const on indices/indptr/data as the pointers are being borrowed
+biom::biom(char** obs_ids_in,
+           char** samp_ids_in,
+           uint32_t* indices,
+           uint32_t* indptr,
+           double* data,
+           const int n_obs,
+           const int n_samples,
+           const int nnz) : has_hdf5_backing(false) {
+
+    this->nnz = nnz;
+    this->n_samples = n_samples;
+    this->n_obs = n_obs;
+
+    sample_ids = std::vector<std::string>();
+    sample_ids.resize(n_samples);
+    obs_ids = std::vector<std::string>();
+    obs_ids.resize(n_obs);
+   
+    #pragma omp parallel for schedule(static)
+    for(int x = 0; x < 2; x++) {
+        if(x == 0) {
+            for(int i = 0; i < n_obs; i++) {
+                obs_ids[i] = std::string(obs_ids_in[i]);
+            }
+        } else {
+            for(int i = 0; i < n_samples; i++) {
+                sample_ids[i] = std::string(samp_ids_in[i]);
+            }
+        }
     }
-    free(obs_indices_resident);
-    free(obs_data_resident);
-    free(obs_counts_resident);
+
+    /* define a mapping between an ID and its corresponding offset */
+    obs_id_index = std::unordered_map<std::string, uint32_t>();
+    sample_id_index = std::unordered_map<std::string, uint32_t>();
+
+    #pragma omp parallel for schedule(static)
+    for(int i = 0; i < 3; i++) {
+        if(i == 0)
+            create_id_index(obs_ids, obs_id_index);
+        else if(i == 1)
+            create_id_index(sample_ids, sample_id_index);
+        else if(i == 2)
+            malloc_resident(n_obs);
+    }
+
+    #pragma omp parallel for schedule(static)
+    for(unsigned int i = 0; i < n_obs; i++)  {
+        int32_t start = indptr[i];
+        int32_t end = indptr[i + 1];
+        unsigned int count = end - start;
+
+        uint32_t* index_ptr = (indices + start);
+        double* data_ptr = (data + start);
+        
+        obs_indices_resident[i] = index_ptr;
+        obs_data_resident[i] = data_ptr;
+        obs_counts_resident[i] = count;
+    }
+    sample_counts = get_sample_counts();
 }
 
 void biom::set_nnz() {
+    if(!has_hdf5_backing) {
+        fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", 
+                __FILE__, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+
     // should these be cached?
     DataType dtype = obs_data.getDataType();
     DataSpace dataspace = obs_data.getSpace();
@@ -111,6 +205,12 @@ void biom::set_nnz() {
 }
 
 void biom::load_ids(const char *path, std::vector<std::string> &ids) {
+    if(!has_hdf5_backing) {
+        fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", 
+                __FILE__, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+
     DataSet ds_ids = file.openDataSet(path);
     DataType dtype = ds_ids.getDataType();
     DataSpace dataspace = ds_ids.getSpace();
@@ -138,6 +238,12 @@ void biom::load_ids(const char *path, std::vector<std::string> &ids) {
 }
 
 void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
+    if(!has_hdf5_backing) {
+        fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", 
+                __FILE__, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+
     DataSet ds = file.openDataSet(path);
     DataType dtype = ds.getDataType();
     DataSpace dataspace = ds.getSpace();
@@ -159,7 +265,7 @@ void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
     free(dataout);
 }
 
-void biom::create_id_index(std::vector<std::string> &ids, 
+void biom::create_id_index(const std::vector<std::string> &ids, 
                            std::unordered_map<std::string, uint32_t> &map) {
     uint32_t count = 0;
     map.reserve(ids.size());
@@ -169,6 +275,12 @@ void biom::create_id_index(std::vector<std::string> &ids,
 }
 
 unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
+    if(!has_hdf5_backing) {
+        fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", 
+                __FILE__, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+
     uint32_t idx = obs_id_index.at(id);
     uint32_t start = obs_indptr[idx];
     uint32_t end = obs_indptr[idx + 1];
@@ -270,6 +382,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne
 }
 
 unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
+    if(!has_hdf5_backing) {
+        fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n", 
+                __FILE__, __LINE__);
+        exit(EXIT_FAILURE);
+    }
+
     uint32_t idx = sample_id_index.at(id);
     uint32_t start = sample_indptr[idx];
     uint32_t end = sample_indptr[idx + 1];
@@ -310,6 +428,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur
 
 double* biom::get_sample_counts() {
     double *sample_counts = (double*)calloc(sizeof(double), n_samples);
+
     for(unsigned int i = 0; i < n_obs; i++) {
         unsigned int count = obs_counts_resident[i];
         uint32_t *indices = obs_indices_resident[i];


=====================================
src/biom.hpp
=====================================
@@ -21,12 +21,35 @@
 namespace su {
     class biom : public biom_interface {
         public:
+            /* nullary constructor */
+            biom();
+
             /* default constructor
              *
              * @param filename The path to the BIOM table to read
              */
             biom(std::string filename);
 
+            /* constructor from compress sparse data
+             *
+             * @param obs_ids vector of observation identifiers
+             * @param samp_ids vector of sample identifiers
+             * @param index vector of index positions
+             * @param indptr vector of indptr positions
+             * @param data vector of observation counts
+             * @param n_obs number of observations
+             * @param n_samples number of samples
+             * @param nnz number of data points
+             */
+            biom(char** obs_ids,
+                 char** samp_ids,
+                 uint32_t* index,
+                 uint32_t* indptr,
+                 double* data,
+                 const int n_obs,
+                 const int n_samples,
+                 const int nnz);
+
             /* default destructor
              *
              * Temporary arrays are freed
@@ -55,8 +78,9 @@ namespace su {
              */
             void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, double* out) const;
             void get_obs_data_range(const std::string &id, unsigned int start, unsigned int end, bool normalize, float* out) const;
-
         private:
+            bool has_hdf5_backing = false;
+            
             /* retain DataSet handles within the HDF5 file */
             H5::DataSet obs_indices;
             H5::DataSet sample_indices;
@@ -66,11 +90,14 @@ namespace su {
             uint32_t **obs_indices_resident;
             double **obs_data_resident;
             unsigned int *obs_counts_resident;
+            
+            void malloc_resident(uint32_t n_obs);
 
             unsigned int get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out);
             unsigned int get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out);
             double* get_sample_counts();
 
+
             /* At construction, lookups mapping IDs -> index position within an
              * axis are defined
              */
@@ -100,7 +127,7 @@ namespace su {
              * @param ids A vector of IDs to index
              * @param map A hash table to populate
              */
-            void create_id_index(std::vector<std::string> &ids, 
+            void create_id_index(const std::vector<std::string> &ids, 
                                  std::unordered_map<std::string, uint32_t> &map);
 
 


=====================================
src/status_enum.hpp
=====================================
@@ -0,0 +1,8 @@
+#ifndef _UNIFRAC_STATUS_H
+#define _UNIFRAC_STATUS_H
+
+typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error, invalid_method} ComputeStatus;
+typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus;
+typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus;
+
+#endif


=====================================
src/test_su.cpp
=====================================
@@ -153,9 +153,8 @@ void test_bptree_constructor_from_existing() {
                                 //11101000
     su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);");
     su::BPTree tree = su::BPTree(existing.get_structure(), existing.lengths, existing.names);
-
+ 
     unsigned int exp_nparens = 8;
-
     std::vector<bool> exp_structure;
     exp_structure.push_back(true);
     exp_structure.push_back(true);
@@ -467,10 +466,7 @@ void test_biom_constructor() {
     SUITE_END();
 }
 
-void test_biom_get_obs_data() {
-    SUITE_START("biom get obs data");
-
-    su::biom table = su::biom("test.biom");
+void _exercise_get_obs_data(su::biom &table) {
     double exp0[] = {0.0, 0.0, 1.0, 0.0, 0.0, 0.0};
     std::vector<double> exp0_vec = _double_array_to_vector(exp0, 6);
     double exp1[] = {5.0, 1.0, 0.0, 2.0, 3.0, 1.0};
@@ -506,9 +502,43 @@ void test_biom_get_obs_data() {
     ASSERT(vec_almost_equal(obs_vec, exp4_vec));
 
     free(out);
+}
+
+void test_biom_constructor_from_sparse() {
+    SUITE_START("biom from sparse constructor");
+    uint32_t index[] = {2, 0, 1, 3, 4, 5, 2, 3, 5, 0, 1, 2, 5, 1, 2};
+    uint32_t indptr[] = {0,  1,  6,  9, 13, 15};
+    double data[] = {1., 5., 1., 2., 3., 1., 1., 4., 2., 2., 1., 1., 1., 1., 1.};
+    char* obs_ids[] = {"GG_OTU_1", "GG_OTU_2", "GG_OTU_3", "GG_OTU_4", "GG_OTU_5"};
+    char* samp_ids[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"};
+
+    su::biom table = su::biom(obs_ids, samp_ids, index, indptr, data, 5, 6, 15);
+    _exercise_get_obs_data(table);
+   
+    SUITE_END();
+}
+
+void test_biom_nullary() {
+    SUITE_START("biom nullary");
+    su::biom table = su::biom();
+    SUITE_END();
+}
+
+void test_bptree_nullary() {
+    SUITE_START("bptree nullary");
+    su::BPTree tree = su::BPTree();
+    SUITE_END();
+}
+
+void test_biom_get_obs_data() {
+    SUITE_START("biom get obs data");
+
+    su::biom table = su::biom("test.biom");
+    _exercise_get_obs_data(table);
     SUITE_END();
 }
 
+
 void test_bptree_leftchild() {
     SUITE_START("test bptree left child");
     su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;");
@@ -1802,6 +1832,66 @@ void test_set_tasks() {
     SUITE_END();
 }
 
+void test_bptree_cstyle_constructor() {
+    SUITE_START("test bptree constructor from c-style data");
+                                //01234567
+                                //11101000
+    bool structure[] = {true, true, true, false, true, false, false, false};
+    double lengths[] = {0, 0, 1, 0, 2, 0, 0, 0};
+    char* names[] = {"", "c", "123:foo; bar", "", "b", "", "", ""};
+    su::BPTree tree = su::BPTree(structure, lengths, names, 8);
+
+    unsigned int exp_nparens = 8;
+
+    std::vector<bool> exp_structure;
+    exp_structure.push_back(true);
+    exp_structure.push_back(true);
+    exp_structure.push_back(true);
+    exp_structure.push_back(false);
+    exp_structure.push_back(true);
+    exp_structure.push_back(false);
+    exp_structure.push_back(false);
+    exp_structure.push_back(false);
+
+    std::vector<uint32_t> exp_openclose;
+    exp_openclose.push_back(7);
+    exp_openclose.push_back(6);
+    exp_openclose.push_back(3);
+    exp_openclose.push_back(2);
+    exp_openclose.push_back(5);
+    exp_openclose.push_back(4);
+    exp_openclose.push_back(1);
+    exp_openclose.push_back(0);
+
+    std::vector<std::string> exp_names;
+    exp_names.push_back(std::string());
+    exp_names.push_back(std::string("c"));
+    exp_names.push_back(std::string("123:foo; bar"));
+    exp_names.push_back(std::string());
+    exp_names.push_back(std::string("b"));
+    exp_names.push_back(std::string());
+    exp_names.push_back(std::string());
+    exp_names.push_back(std::string());
+
+    std::vector<double> exp_lengths;
+    exp_lengths.push_back(0.0);
+    exp_lengths.push_back(0.0);
+    exp_lengths.push_back(1.0);
+    exp_lengths.push_back(0.0);
+    exp_lengths.push_back(2.0);
+    exp_lengths.push_back(0.0);
+    exp_lengths.push_back(0.0);
+    exp_lengths.push_back(0.0);
+
+    ASSERT(tree.nparens == exp_nparens);
+    ASSERT(tree.get_structure() == exp_structure);
+    ASSERT(tree.get_openclose() == exp_openclose);
+    ASSERT(tree.lengths == exp_lengths);
+    ASSERT(tree.names == exp_names);
+
+    SUITE_END();
+}
+
 void test_bptree_constructor_newline_bug() {
     SUITE_START("test bptree constructor newline bug");
     su::BPTree tree = su::BPTree("((362be41f31fd26be95ae43a8769b91c0:0.116350803,(a16679d5a10caa9753f171977552d920:0.105836235,((a7acc2abb505c3ee177a12e514d3b994:0.008268754,(4e22aa3508b98813f52e1a12ffdb74ad:0.03144211,8139c4ac825dae48454fb4800fb87896:0.043622957)0.923:0.046588301)0.997:0.120902074,((2d3df7387323e2edcbbfcb6e56a02710:0.031543994,3f6752aabcc291b67a063fb6492fd107:0.091571442)0.759:0.016335166,((d599ebe277afb0dfd4ad3c2176afc50e:5e-09,84d0affc7243c7d6261f3a7d680b873f:0.010245188)0.883:0.048993011,51121722488d0c3da1388d1b117cd239:0.119447926)0.763:0.035660204)0.921:0.058191474)0.776:0.02854575)0.657:0.052060833)0.658:0.032547569,(99647b51f775c8ddde8ed36a7d60dbcd:0.173334268,(f18a9c8112372e2916a66a9778f3741b:0.194813398,(5833416522de0cca717a1abf720079ac:5e-09,(2bf1067d2cd4f09671e3ebe5500205ca:0.031692682,(b32621bcd86cb99e846d8f6fee7c9ab8:0.031330707,1016319c25196d73bdb3096d86a9df2f:5e-09)0.058:0.01028612)0.849:0.010284866)0.791:0.041353384)0.922:0.109470534):0.022169824000000005)root;\n\n");
@@ -1818,6 +1908,8 @@ int main(int argc, char** argv) {
     test_bptree_constructor_edgecases();
     test_bptree_constructor_quoted_comma();
     test_bptree_constructor_quoted_parens();
+    test_bptree_cstyle_constructor();
+    test_bptree_nullary();
     test_bptree_postorder();
     test_bptree_preorder();
     test_bptree_parent();
@@ -1832,6 +1924,8 @@ int main(int argc, char** argv) {
     test_bptree_collapse_edge();
 
     test_biom_constructor();
+    test_biom_constructor_from_sparse();
+    test_biom_nullary();
     test_biom_get_obs_data();
 
     test_propstack_constructor();


=====================================
src/tree.cpp
=====================================
@@ -4,6 +4,8 @@
 
 using namespace su;
 
+BPTree::BPTree() { }
+
 BPTree::BPTree(std::string newick) {
     openclose = std::vector<uint32_t>();
     lengths = std::vector<double>();
@@ -50,6 +52,36 @@ BPTree::BPTree(std::vector<bool> input_structure, std::vector<double> input_leng
     index_and_cache();
 }
 
+BPTree::BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens) {
+    structure = std::vector<bool>();
+    lengths = std::vector<double>();
+    names = std::vector<std::string>();
+
+    structure.resize(n_parens);
+    lengths.resize(n_parens);
+    names.resize(n_parens);
+
+    nparens = n_parens;
+
+    //#pragma omp parallel for schedule(static)
+    for(int i = 0; i < n_parens; i++) {
+        structure[i] = input_structure[i];
+        lengths[i] = input_lengths[i];
+        names[i] = std::string(input_names[i]);
+    }
+
+    openclose = std::vector<uint32_t>();
+    select_0_index = std::vector<uint32_t>();
+    select_1_index = std::vector<uint32_t>();
+    openclose.resize(nparens);
+    select_0_index.resize(nparens / 2);
+    select_1_index.resize(nparens / 2);
+    excess.resize(nparens);
+
+    structure_to_openclose();
+    index_and_cache();
+}
+
 BPTree BPTree::mask(std::vector<bool> topology_mask, std::vector<double> in_lengths) {
     
     std::vector<bool> new_structure = std::vector<bool>();
@@ -197,7 +229,7 @@ void BPTree::index_and_cache() {
     }
 }
 
-uint32_t BPTree::postorderselect(uint32_t k) const { 
+uint32_t BPTree::postorderselect(uint32_t k) const {
     return open(select_0_index[k]);
 }
 


=====================================
src/tree.hpp
=====================================
@@ -19,6 +19,9 @@
 namespace su {
     class BPTree {
         public:
+            /* nullary constructor */
+            BPTree();
+
             /* tracked attributes */
             std::vector<double> lengths;
             std::vector<std::string> names;
@@ -41,6 +44,15 @@ namespace su {
             BPTree(std::vector<bool> input_structure, std::vector<double> input_lengths, std::vector<std::string> input_names);
             ~BPTree();
 
+            /* constructor from a defined topology using c-types
+             *
+             * @param input_structure A boolean array defining the topology
+             * @param input_lengths A double array of the branch lengths
+             * @param input_names A char* array of the names
+             * @param n_parens The length of the topology
+             */
+            BPTree(const bool* input_structure, const double* input_lengths, char** input_names, const int n_parens);
+
             /* postorder tree traversal
              *
              * Get the index position of the ith node in a postorder tree



View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/commit/a6b633a7a719dbcff127a62272a82236881e5405

-- 
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/commit/a6b633a7a719dbcff127a62272a82236881e5405
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220726/8379652b/attachment-0001.htm>


More information about the debian-med-commit mailing list