[med-svn] [Git][med-team/unifrac-tools][master] 5 commits: New upstream version 1.4
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Mon Oct 23 20:51:51 BST 2023
Nilesh Patra pushed to branch master at Debian Med / unifrac-tools
Commits:
f6ed45fb by Nilesh Patra at 2023-10-24T00:43:20+05:30
New upstream version 1.4
- - - - -
a4ec6691 by Nilesh Patra at 2023-10-24T00:43:47+05:30
Update upstream source from tag 'upstream/1.4'
Update to upstream version '1.4'
with Debian dir 9b3bafd741356fc5bea59cd6f3fad291836fdaa6
- - - - -
55faead1 by Nilesh Patra at 2023-10-24T00:51:28+05:30
Drop gcc-13.patch
- - - - -
f8581864 by Nilesh Patra at 2023-10-24T00:51:43+05:30
Refresh/re-diff patches with new version
- - - - -
30371a96 by Nilesh Patra at 2023-10-24T00:52:04+05:30
Upload to unstable
- - - - -
22 changed files:
- − .github/workflows/blank.yml
- − .github/workflows/main.yml
- README.md
- combined/Makefile
- debian/changelog
- debian/patches/enable-debug-flag.patch
- − debian/patches/gcc-13.patch
- debian/patches/hardening
- debian/patches/series
- src/Makefile
- src/api.cpp
- src/biom_inmem.hpp
- src/biom_subsampled.cpp
- src/biom_subsampled.hpp
- src/skbio_alt.cpp
- src/skbio_alt.hpp
- src/su.cpp
- src/test_ska.cpp
- src/tree.cpp
- src/tree.hpp
- src/tsv.hpp
- test/Makefile
Changes:
=====================================
.github/workflows/blank.yml deleted
=====================================
@@ -1,36 +0,0 @@
-# This is a basic workflow to help you get started with Actions
-
-name: CI
-
-# Controls when the workflow will run
-on:
- # Triggers the workflow on push or pull request events but only for the main branch
- push:
- branches: [ main ]
- pull_request:
- branches: [ main ]
-
- # Allows you to run this workflow manually from the Actions tab
- workflow_dispatch:
-
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
- # This workflow contains a single job called "build"
- build:
- # The type of runner that the job will run on
- runs-on: ubuntu-latest
-
- # Steps represent a sequence of tasks that will be executed as part of the job
- steps:
- # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v2
-
- # Runs a single command using the runners shell
- - name: Run a one-line script
- run: echo Hello, world!
-
- # Runs a set of commands using the runners shell
- - name: Run a multi-line script
- run: |
- echo Add other actions to build,
- echo test, and deploy your project.
=====================================
.github/workflows/main.yml deleted
=====================================
@@ -1,216 +0,0 @@
-name: unifrac-binaries CI
-
-on:
- push:
- branches: [ main ]
- pull_request:
- branches: [ main ]
-
-
-
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
- build-and-test:
- strategy:
- matrix:
- os: [ubuntu-latest, macos-latest, linux-gpu-cuda]
- runs-on: ${{ matrix.os }}
- steps:
-      - uses: actions/checkout@v3
-      - uses: conda-incubator/setup-miniconda@v2
- with:
- miniconda-version: "latest"
- auto-update-conda: true
- - name: Install
- shell: bash -l {0}
- run: |
- df -h .
- if [[ "$(uname -s)" == "Linux" ]];
- then
- conda create -q --yes --strict-channel-priority -n unifrac -c conda-forge -c bioconda gxx_linux-64 gfortran_linux-64 hdf5 mkl-include lz4 zlib hdf5-static libcblas liblapacke make curl
- else
- conda create -q --yes --strict-channel-priority -n unifrac -c conda-forge -c bioconda clangxx_osx-64 hdf5 mkl-include lz4 hdf5-static libcblas liblapacke make curl
- fi
- conda clean --yes -t
- df -h .
- conda activate unifrac
- echo "$(uname -s)"
- if [[ "$(uname -s)" == "Linux" ]];
- then
- which x86_64-conda-linux-gnu-gcc
- x86_64-conda-linux-gnu-gcc -v
- x86_64-conda-linux-gnu-g++ -v
- else
- which clang
- clang -v
- fi
- which h5c++
- if [[ "$(uname -s)" == "Linux" ]];
- then
- # install PGI but do not source it
- # the makefile will do it automatically
- ./scripts/install_hpc_sdk.sh </dev/null
- fi
- df -h .
- export PERFORMING_CONDA_BUILD=True
- echo "======= begin env ====="
- env
- echo "======= end env ====="
- # all == build (shlib,bins,tests) and install
- make all
- df -h .
- pushd src
- if [[ "$(uname -s)" == "Linux" ]];
- then
- rm -f ~/.R/Makevars
- conda install -q --yes --strict-channel-priority -c conda-forge r-base
- unset CXXFLAGS
- unset CFLAGS
- unset DEBUG_CXXFLAGS
- unset DEBUG_CFLAGS
- # the r-base package has a broken dependency
- ln -s $CONDA_PREFIX/lib/libreadline.so $CONDA_PREFIX/lib/libreadline.so.6
- R -e 'install.packages("Rcpp", repos="http://lib.stat.cmu.edu/R/CRAN/")'
- make rapi_test
- fi
- popd
-
- - name: Tests
- shell: bash -l {0}
- run: |
- conda activate unifrac
- # keep it low for runs in containers
- # and a weird number to potentially catch potential bugs
- export OMP_NUM_THREADS=3
- # diagnostic messages for debugging, if needed
- export UNIFRAC_GPU_INFO=Y
- pushd src
- ./test_su
- ./test_api
- ./test_ska
- popd
- pushd test
- ./capi_test 1
- # explicitly check that we do not have hdf5 dependencies
- if [[ "$(uname -s)" == "Linux" ]];
- then
- ldd ./capi_test |awk 'BEGIN{a=0}/hdf/{a=a+1;print $0}END{if (a==0) {print "No dynamic hdf5 found"} else {exit 2}}'
- else
- otool -L ./capi_test|awk 'BEGIN{a=0}/hdf/{a=a+1;print $0}END{if (a==0) {print "No dynamic hdf5 found"} else {exit 2}}'
- fi
- popd
- pushd src/testdata
- conda install --yes -c conda-forge h5py
- # weighted_unnormalized
- time ssu -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- # retry with default precision handling
- time ssu -m weighted_unnormalized -i test500.biom -t test500.tre --pcoa 4 -r hdf5 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -f -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- # matrrix will be different, but PCOA similar
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -m weighted_unnormalized_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
- # minimal precision loss between fp32 and fp64
- ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- # weighted_normalized
- time ssu -f -m weighted_normalized -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -f -m weighted_normalized_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
- rm -f t1.h5
- # unweighted
- time ssu -f -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- # hdf5 without distance matrix, just PCoA
- echo "hdf5_nodist"
- time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- # permanova
- echo "permanova"
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5 --permanova 999 -g test500.tsv -c empo_2 -o t1.h5
- # compare to values given by skbio.stats.distance.permanova
- python compare_unifrac_stats.py t1.h5 5 999 1.001112 0.456 0.001 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5 --permanova 99 -g test500.tsv -c empo_2 -o t1.h5
- python compare_unifrac_stats.py t1.h5 5 99 1.001112 0.456 0.001 0.2
- ls -l t1.h5
- rm -f t1.h5
- time ssu -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -g test500.tsv -c empo_3 -o t1.h5
- # compare to values given by skbio.stats.distance.permanova
- python compare_unifrac_stats.py t1.h5 17 999 0.890697 0.865 0.001 0.1
- ls -l t1.h5
- rm -f t1.h5
- # partials
- echo "partials"
- ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial-report --n-partials 2
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial --start 0 --stop 125 -o t1.partial.1
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial --start 125 --stop 250 -o t1.partial.2
- ls -l t1.partial*
- ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode check-partial --partial-pattern 't1.partial.*'
- time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_fp64 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_nodist -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- rm -f t1.partial.*
- # subsample
- echo "subsample"
- time ssu -f -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 --subsample-depth 100 -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.3
- rm -f t1.h5
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist --permanova 99 -g test500.tsv -c empo_2 --subsample-depth 100 -o t1.h5
- python compare_unifrac_stats.py t1.h5 5 99 1.001112 0.456 0.05 0.6
- ls -l t1.h5
- rm -f t1.h5
- # multi
- echo "multi"
- time ssu -f -m unweighted -i test500.biom -t test500.tre --pcoa 4 --mode multi --subsample-depth 100 --n-subsamples 10 -o t1.h5
- ./compare_unifrac_pcoa_multi.py test500.unweighted_fp32.f.h5 t1.h5 10 3 0.3
- ls -l t1.h5
- rm -f t1.h5
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 --mode multi --n-subsamples 10 --permanova 99 -g test500.tsv -c empo_2 --subsample-depth 100 -o t1.h5
- python compare_unifrac_stats_multi.py t1.h5 10 5 99 1.001112 0.456 0.08 0.6
- ls -l t1.h5
- rm -f t1.h5
- popd
- - name: Sanity checks
- shell: bash -l {0}
- run: |
- conda activate unifrac
- pushd ci
- ./crawford_test.sh
- popd
=====================================
README.md
=====================================
@@ -4,7 +4,7 @@
UniFrac is the *de facto* repository for high-performance phylogenetic diversity calculations. The methods in this repository are based on an implementation of the [Strided State UniFrac](https://www.nature.com/articles/s41592-018-0187-8) algorithm which is faster, and uses less memory than [Fast UniFrac](http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html). Strided State UniFrac supports [Unweighted UniFrac](http://aem.asm.org/content/71/12/8228.abstract), [Weighted UniFrac](http://aem.asm.org/content/73/5/1576), [Generalized UniFrac](https://academic.oup.com/bioinformatics/article/28/16/2106/324465/Associating-microbiome-composition-with), [Variance Adjusted UniFrac](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-118) and [meta UniFrac](http://www.pnas.org/content/105/39/15076.short), in both double and single precision (fp32).
This repository also includes Stacked Faith (manuscript in preparation), a method for calculating Faith's PD that is faster and uses less memory than the Fast UniFrac-based [reference implementation](http://scikit-bio.org/).
-This repository produces standalone exutables and a C API exposed via a shared library which can be linked against by any programming language.
+This repository produces standalone executables and a C API exposed via a shared library which can be linked against by any programming language.
# Citation
@@ -43,7 +43,7 @@ An example of installing UniFrac, and using it with CPUs as well as GPUs, can be
## Install (bioconda)
-This binaries can be installed via a combination of `conda-forge` and `bioconda`:
+The binaries can be installed via a combination of `conda-forge` and `bioconda`:
```
conda create --name unifrac -c conda-forge -c bioconda unifrac-binaries
@@ -65,7 +65,7 @@ make install
```
-**Note**: if you are using `conda` we recommend installing HDF5 and relate compiler using the
+**Note**: if you are using `conda` we recommend installing HDF5 and related compiler using the
`conda-forge` channel, for example:
```
@@ -74,7 +74,7 @@ conda activate unifrac
```
For GPU-enabled code, you will need the [NVIDIA HPC SDK](https://developer.nvidia.com/hpc-sdk) compiler.
-A helper script will download it, install it and setup the necessary environment:
+This helper script will download it, install it and setup the necessary environment:
```
scripts/install_hpc_sdk.sh
source setup_nv_h5.sh
@@ -173,6 +173,7 @@ The methods can be used directly through the command line after install:
hdf5_fp32 : HFD5 format, using fp32 precision.
hdf5_fp64 : HFD5 format, using fp64 precision.
--subsample-depth Depth of subsampling of the input BIOM before computing unifrac (required for mode==multi, optional for one-off)
+ --subsample-replacement [OPTIONAL] Subsample with or without replacement (default is with)
--n-subsamples [OPTIONAL] if mode==multi, number of subsampled UniFracs to compute (default: 100)
--permanova [OPTIONAL] Number of PERMANOVA permutations to compute (default: 999 with -g, do not compute if 0)
--pcoa [OPTIONAL] Number of PCoA dimensions to compute (default: 10, do not compute if 0)
=====================================
combined/Makefile
=====================================
@@ -13,10 +13,10 @@ ifeq ($(PREFIX),)
endif
libssu.o: libssu.c
- $(CC) -c libssu.c -fPIC
+ $(CC) $(CPPFLAGS) $(CFLAGS) -c libssu.c -fPIC
libssu.so: libssu.o
- $(CC) -shared -o libssu.so libssu.o -fPIC -ldl
+ $(CC) -shared -o libssu.so libssu.o -fPIC -ldl $(LDFLAGS)
install: libssu.so
rm -f ${PREFIX}/lib//libssu.so; cp libssu.so ${PREFIX}/lib/
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+unifrac-tools (1.4-1) unstable; urgency=medium
+
+ * Team Upload.
+ * New upstream version 1.4
+ * Drop gcc-13.patch
+ * Refresh/re-diff patches with new version
+
+ -- Nilesh Patra <nilesh@debian.org>  Tue, 24 Oct 2023 00:51:49 +0530
+
unifrac-tools (1.3.2-5) unstable; urgency=medium
* Team upload.
=====================================
debian/patches/enable-debug-flag.patch
=====================================
@@ -2,14 +2,14 @@ Description: Add debug flag to cppflags
Author: Nilesh Patra <nilesh@debian.org>
Last-Update: 2023-01-05
Forwarded: not-needed
---- unifrac-tools.orig/src/Makefile
-+++ unifrac-tools/src/Makefile
+--- a/src/Makefile
++++ b/src/Makefile
@@ -85,7 +85,7 @@
endif
--CPPFLAGS += -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
-+CPPFLAGS += -g -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+-CPPFLAGS += -Wall -std=c++17 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
++CPPFLAGS += -g -Wall -std=c++17 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
all: api main install
=====================================
debian/patches/gcc-13.patch deleted
=====================================
@@ -1,37 +0,0 @@
-Description: fix build failure with gcc 13
-Author: Étienne Mollier <emollier@debian.org>
-Bug-Debian: https://bugs.debian.org/1037879
-Forwarded: https://github.com/biocore/unifrac-binaries/pull/45
-Last-Update: 2023-07-23
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- unifrac-tools.orig/src/tree.hpp
-+++ unifrac-tools/src/tree.hpp
-@@ -10,6 +10,7 @@
- #ifndef __UNIFRAC_TREE_H
- #define __UNIFRAC_TREE_H 1
-
-+#include <cstdint>
- #include <string>
- #include <sstream>
- #include <iostream>
---- unifrac-tools.orig/src/biom_inmem.hpp
-+++ unifrac-tools/src/biom_inmem.hpp
-@@ -11,6 +11,7 @@
- #ifndef _UNIFRAC_BIOM_INMEM_H
- #define _UNIFRAC_BIOM_INMEM_H
-
-+#include <cstdint>
- #include <vector>
- #include <unordered_map>
-
---- unifrac-tools.orig/src/tsv.hpp
-+++ unifrac-tools/src/tsv.hpp
-@@ -11,6 +11,7 @@
- #ifndef _UNIFRAC_TSV_H
- #define _UNIFRAC_TSV_H
-
-+#include <cstdint>
- #include <vector>
- #include <map>
- #include <string>
=====================================
debian/patches/hardening
=====================================
@@ -2,23 +2,8 @@ From: Michael R. Crusoe <crusoe@debian.org>
Subject: Use CPPFLAGS, CFLAGS, and LDFLAGS
Forwarded: https://github.com/biocore/unifrac-binaries/pull/46
---- unifrac-tools.orig/combined/Makefile
-+++ unifrac-tools/combined/Makefile
-@@ -13,10 +13,10 @@
- endif
-
- libssu.o: libssu.c
-- $(CC) -c libssu.c -fPIC
-+ $(CC) $(CPPFLAGS) $(CFLAGS) -c libssu.c -fPIC
-
- libssu.so: libssu.o
-- $(CC) -shared -o libssu.so libssu.o -fPIC -ldl
-+ $(CC) -shared -o libssu.so libssu.o -fPIC -ldl $(LDFLAGS)
-
- install: libssu.so
- rm -f ${PREFIX}/lib//libssu.so; cp libssu.so ${PREFIX}/lib/
---- unifrac-tools.orig/test/Makefile
-+++ unifrac-tools/test/Makefile
+--- a/test/Makefile
++++ b/test/Makefile
@@ -1,8 +1,5 @@
.PHONY: test_binaries test
@@ -28,12 +13,3 @@ Forwarded: https://github.com/biocore/unifrac-binaries/pull/46
ifeq ($(PREFIX),)
PREFIX := $(CONDA_PREFIX)
endif
-@@ -13,7 +10,7 @@
- test_binaries: capi_test
-
- capi_test: capi_test.c
-- $(CC) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib -o capi_test
-+ $(CC) $(CFLAGS) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib $(LDFLAGS) -o capi_test
-
- clean:
- -rm -f *.o capi_test
=====================================
debian/patches/series
=====================================
@@ -4,5 +4,4 @@ soname.patch
baseline.patch
enable_linking_to_shared_hdf5_lib.patch
enable-debug-flag.patch
-gcc-13.patch
hardening
=====================================
src/Makefile
=====================================
@@ -118,7 +118,7 @@ ifeq (,$(findstring pgi,$(COMPILER)))
endif
-CPPFLAGS += -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+CPPFLAGS += -Wall -std=c++17 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
all: api main install
=====================================
src/api.cpp
=====================================
@@ -609,12 +609,7 @@ compute_status one_off_matrix_v2(const char* biom_filename, const char* tree_fil
PARSE_TREE_TABLE(tree_filename, biom_filename)
TDBG_STEP("load_files")
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
if ((table_subsampled.n_samples==0) || (table_subsampled.n_obs==0)) {
return table_empty;
}
@@ -636,12 +631,7 @@ compute_status one_off_matrix_fp32_v2(const char* biom_filename, const char* tre
PARSE_TREE_TABLE(tree_filename, biom_filename)
TDBG_STEP("load_files")
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
TDBG_STEP("subsample")
return one_off_matrix_T<float,mat_full_fp32_t>(table_subsampled,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
} else {
@@ -698,12 +688,7 @@ compute_status one_off_matrix_inmem_v2(const support_biom_t *table_data, const s
tree_data->n_parens);
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
TDBG_STEP("subsample")
return one_off_matrix_T<double,mat_full_fp64_t>(table_subsampled,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
} else {
@@ -751,12 +736,7 @@ compute_status one_off_matrix_inmem_fp32_v2(const support_biom_t *table_data, co
tree_data->n_parens);
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
TDBG_STEP("subsample")
return one_off_matrix_T<float,mat_full_fp32_t>(table_subsampled,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
} else {
@@ -1272,11 +1252,6 @@ compute_status unifrac_multi_to_file_T(hid_t real_id, const bool save_dist,
compute_status rc = okay;
SETUP_TDBG("unifrac_multi_to_file")
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
-
if (subsample_depth<1) {
fprintf(stderr, "ERROR: subsampling depth cannot be 0.\n");
return table_empty;
@@ -1317,7 +1292,7 @@ compute_status unifrac_multi_to_file_T(hid_t real_id, const bool save_dist,
su::WriteHDF5Multi<TReal,TMat> h5obj(real_id, out_filename,pcoa_dims);
for (unsigned int i=0; i<n_subsamples; i++) {
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
if ((table_subsampled.n_samples==0) || (table_subsampled.n_obs==0)) {
rc = table_empty;
break;
=====================================
src/biom_inmem.hpp
=====================================
@@ -11,6 +11,7 @@
#ifndef _UNIFRAC_BIOM_INMEM_H
#define _UNIFRAC_BIOM_INMEM_H
+#include <cstdint>
#include <vector>
#include <unordered_map>
=====================================
src/biom_subsampled.cpp
=====================================
@@ -11,6 +11,9 @@
#include <iostream>
#include <stdio.h>
#include <random>
+#include <vector>
+#include <algorithm>
+#include <omp.h>
#include "biom_subsampled.hpp"
@@ -83,29 +86,190 @@ linked_sparse_transposed::~linked_sparse_transposed() {
}
}
-void linked_sparse_transposed::transposed_subsample_with_replacement(const uint32_t n, const uint32_t random_seed) {
- std::mt19937 generator(random_seed);
+namespace su {
+ // Equivalent to iterator over np.repeat
+ // https://github.com/biocore/biom-format/blob/b0e71a00ecb349a6f5f1ca64a23d71f380ddc19c/biom/_subsample.pyx#LL64C24-L64C55
+ class WeightedSampleIterator
+ {
+ public:
+ // While we do not implememnt the whole random_access_iterator interface
+ // we want the implementations to use operator- and that requires random
+ using iterator_category = std::random_access_iterator_tag;
+ using difference_type = int64_t;
+ using value_type = uint32_t;
+ using pointer = const uint32_t*;
+ using reference = const uint32_t&;
- // use common buffer to minimize allocation costs
- double *data_in = new double[max_count]; // original values
- uint32_t *data_out = new uint32_t[max_count]; // computed values
+ WeightedSampleIterator(uint64_t *_data_in, uint32_t _idx, uint64_t _cnt)
+ : data_in(_data_in)
+ , idx(_idx)
+ , cnt(_cnt)
+ {}
- for (uint32_t i=0; i<n_obs; i++) {
- unsigned int length = obs_counts_resident[i];
- double* *data_arr = obs_data_resident[i];
+ reference operator*() const { return idx; }
+ pointer operator->() const { return &idx; }
+
+ WeightedSampleIterator& operator++()
+ {
+ cnt++;
+ if (cnt>=data_in[idx]) {
+ cnt = 0;
+ idx++;
+ }
+ return *this;
+ }
+
+ WeightedSampleIterator operator++(int) { WeightedSampleIterator tmp = *this; ++(*this); return tmp; }
+
+ friend bool operator== (const WeightedSampleIterator& a, const WeightedSampleIterator& b)
+ {
+ return (a.data_in == b.data_in) && (a.idx == b.idx) && (a.cnt==b.cnt);
+ };
+
+ friend bool operator!= (const WeightedSampleIterator& a, const WeightedSampleIterator& b)
+ {
+ return !((a.data_in == b.data_in) && (a.idx == b.idx) && (a.cnt==b.cnt));
+ };
+
+ friend int64_t operator-(const WeightedSampleIterator& b, const WeightedSampleIterator& a)
+ {
+ int64_t diff = 0;
+ //assert(a.data_in == b.data_in);
+ //assert(a.idx <= b.idx);
+ //assert((a.idx > b.idx) || (a.cnt<=b.cnt));
+
+ for (uint32_t i = a.idx; i<b.idx; i++) {
+ diff += a.data_in[i];
+ }
+
+ return diff + b.cnt - a.cnt;
+ };
+
+ private:
+
+ uint64_t *data_in;
+ uint32_t idx; // index of data_in
+ uint64_t cnt; // how deep in data_in[idx] are we (must be < data_in[idx])
+ };
+
+ class WeightedSample
+ {
+ public:
+ WeightedSample(uint32_t _max_count, uint32_t _n, uint32_t random_seed)
+ : max_count(_max_count)
+ , n(_n)
+ , generator(random_seed)
+ , current_count(0)
+ , data(max_count)
+ , sample_out(n)
+ , data_out(max_count)
+ {}
+
+ void do_sample(unsigned int length, double* *data_arr) {
+ for (unsigned int j=0; j<length; j++) data_out[j] = 0;
+
+ // note: We are assuming length>=n
+ // Enforced by the caller (via filtering)
+ assign(length,data_arr);
+ std::sample(begin(), end(),
+ sample_out.begin(), n,
+ generator);
+
+ for (uint32_t j=0; j<n; j++) data_out[sample_out[j]]++;
+
+ for (unsigned int j=0; j<length; j++) *(data_arr[j]) = data_out[j];
+ }
+
+private:
+ void assign(uint32_t length, double * *data_arr) {
+ current_count = length;
+ for (uint32_t j=0; j<length; j++) data[j] = *(data_arr[j]);
+ }
+
+ WeightedSampleIterator begin() { return WeightedSampleIterator(data.data(),0,0); }
+ WeightedSampleIterator end() { return WeightedSampleIterator(data.data(),current_count,0); } // current_count is out of bounds
+
+ uint32_t max_count;
+ uint32_t n;
+ std::mt19937 generator;
- for (unsigned int j=0; j<length; j++) data_in[j] = *(data_arr[j]);
-
+ uint32_t current_count;
+
+ // use persistent buffer to minimize allocation costs
+ std::vector<uint64_t> data; // original values
+ std::vector<uint32_t> sample_out; // random output buffer
+ std::vector<uint32_t> data_out; // computed values
+ };
+
+ class WeightedSampleWithReplacement
+ {
+ public:
+ WeightedSampleWithReplacement(uint32_t _max_count, uint32_t _n, uint32_t random_seed)
+ : max_count(_max_count)
+ , n(_n)
+ , generator(random_seed)
+ , current_count(0)
+ , data(max_count)
+ , data_out(max_count)
+ {}
+
+ void do_sample(unsigned int length, double* *data_arr) {
+ assign(length,data_arr);
+
+ // note: We are assuming length>=n
+ // Enforced by the caller (via filtering)
+ double *data_in = data.data();
std::discrete_distribution<uint32_t> multinomial(data_in, data_in+length);
for (unsigned int j=0; j<length; j++) data_out[j] = 0;
for (uint32_t j=0; j<n; j++) data_out[multinomial(generator)]++;
for (unsigned int j=0; j<length; j++) *(data_arr[j]) = data_out[j];
}
- delete[] data_out;
- delete[] data_in;
+
+private:
+ void assign(uint32_t length, double * *data_arr) {
+ current_count = length;
+ for (uint32_t j=0; j<length; j++) data[j] = *(data_arr[j]);
+ }
+
+ uint32_t max_count;
+ uint32_t n;
+ std::mt19937 generator;
+
+ uint32_t current_count;
+
+ // use persistent buffer to minimize allocation costs
+ std::vector<double> data; // original values
+ std::vector<uint32_t> data_out; // computed values
+ };
+} // end namespace su
+
+
+template<class TWork>
+inline void linked_sparse_transposed::transposed_subsample(const uint32_t n, const uint32_t random_seed) {
+ const uint32_t max_threads = omp_get_max_threads();
+ std::mt19937 master_generator(random_seed);
+
+ // use common buffer to minimize allocation cost, but need one per thread
+ std::vector<TWork> sample_data_arr;
+ for (uint32_t i=0; i<max_threads; i++) sample_data_arr.emplace(sample_data_arr.end(), max_count, n, master_generator());
+
+ #pragma omp parallel for
+ for (uint32_t i=0; i<n_obs; i++) {
+ int my_thread_num = omp_get_thread_num();
+ sample_data_arr[my_thread_num].do_sample(obs_counts_resident[i], obs_data_resident[i]);
+ }
+}
+
+void linked_sparse_transposed::transposed_subsample_without_replacement(const uint32_t n, const uint32_t random_seed) {
+ transposed_subsample<WeightedSample>(n, random_seed);
}
+void linked_sparse_transposed::transposed_subsample_with_replacement(const uint32_t n, const uint32_t random_seed) {
+ transposed_subsample<WeightedSampleWithReplacement>(n, random_seed);
+}
+
+
// ===================== sparse_data_subsampled ==========================
void sparse_data_subsampled::subsample_with_replacement(const uint32_t n, const uint32_t random_seed) {
@@ -113,15 +277,24 @@ void sparse_data_subsampled::subsample_with_replacement(const uint32_t n, const
transposed.transposed_subsample_with_replacement(n,random_seed);
}
+void sparse_data_subsampled::subsample_without_replacement(const uint32_t n, const uint32_t random_seed) {
+ linked_sparse_transposed transposed(*this);
+ transposed.transposed_subsample_without_replacement(n,random_seed);
+}
+
// ===================== biom_subsampled ==========================
-biom_subsampled::biom_subsampled(const biom_inmem &parent, const uint32_t n, const uint32_t random_seed)
+biom_subsampled::biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n, const uint32_t random_seed)
: biom_inmem(true)
{
sparse_data_subsampled tmp_obj(parent.get_resident_obj(), parent.get_sample_counts(), n);
if ((tmp_obj.n_obs==0) || (tmp_obj.n_samples==0)) return; //already everything filtered out
- tmp_obj.subsample_with_replacement(n,random_seed);
+ if (w_replacement) {
+ tmp_obj.subsample_with_replacement(n,random_seed);
+ } else {
+ tmp_obj.subsample_without_replacement(n,random_seed);
+ }
// Note: We could filter out the zero rows
// But that's just an optimization and will not be worth it most of the time
steal_nonzero(parent,tmp_obj);
=====================================
src/biom_subsampled.hpp
=====================================
@@ -29,6 +29,12 @@ namespace su {
/* perform subsampling with replacement, no filtering */
void transposed_subsample_with_replacement(const uint32_t n, const uint32_t random_seed);
+
+ /* perform subsampling without replacement, no filtering */
+ void transposed_subsample_without_replacement(const uint32_t n, const uint32_t random_seed);
+ private:
+ // actual implementaiton, templated
+ template<class TWork> void transposed_subsample(const uint32_t n, const uint32_t random_seed);
public: // keep it open for ease of access
uint32_t n_obs; // row dimension
uint32_t n_samples; // column dimension
@@ -64,6 +70,9 @@ namespace su {
/* perform subsampling with replacement, no filtering */
void subsample_with_replacement(const uint32_t n, const uint32_t random_seed);
+
+ /* perform subsampling without replacement, no filtering */
+ void subsample_without_replacement(const uint32_t n, const uint32_t random_seed);
};
class biom_subsampled : public biom_inmem {
@@ -71,9 +80,10 @@ namespace su {
/* default constructor
*
* @param parent biom object to subsample
+ * @param w_replacement Whether to permute or use multinomial sampling
* @param n Number of items to subsample
*/
- biom_subsampled(const biom_inmem &parent, const uint32_t n, const uint32_t random_seed);
+ biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n, const uint32_t random_seed);
protected:
void steal_nonzero(const biom_inmem &parent, sparse_data& subsampled_obj);
=====================================
src/skbio_alt.cpp
=====================================
@@ -895,7 +895,7 @@ void su::permanova(const float * mat, unsigned int n_dims,
// ======================= skbio_biom_subsampled ================================
-su::skbio_biom_subsampled::skbio_biom_subsampled(const biom_inmem &parent, const uint32_t n)
- : su::biom_subsampled(parent, n, uint32_t(myRandomGenerator()))
+su::skbio_biom_subsampled::skbio_biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n)
+ : su::biom_subsampled(parent, w_replacement, n, uint32_t(myRandomGenerator()))
{}
=====================================
src/skbio_alt.hpp
=====================================
@@ -55,9 +55,10 @@ public:
/* default constructor
*
* @param parent biom object to subsample
+ * @param w_replacement Whether to permute or use multinomial sampling
* @param n Number of items to subsample
*/
- skbio_biom_subsampled(const biom_inmem &parent, const uint32_t n);
+ skbio_biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n);
};
=====================================
src/su.cpp
=====================================
@@ -49,6 +49,7 @@ void usage() {
std::cout << " \t\t hdf5_fp64 : HFD5 format, using fp64 precision." << std::endl;
std::cout << " \t\t hdf5_nodist : HFD5 format, no distance matrix. (default if mode==multi)" << std::endl;
std::cout << " --subsample-depth\tDepth of subsampling of the input BIOM before computing unifrac (required for mode==multi, optional for one-off)" << std::endl;
+ std::cout << " --subsample-replacement\t[OPTIONAL] Subsample with or without replacement (default is with)" << std::endl;
std::cout << " --n-subsamples\t[OPTIONAL] if mode==multi, number of subsampled UniFracs to compute (default: 100)" << std::endl;
std::cout << " --permanova\t[OPTIONAL] Number of PERMANOVA permutations to compute (default: 999 with -g, do not compute if 0)" << std::endl;
std::cout << " --pcoa\t[OPTIONAL] Number of PCoA dimensions to compute (default: 10, do not compute if 0)" << std::endl;
@@ -393,7 +394,7 @@ int mode_partial(std::string table_filename, std::string tree_filename,
int mode_one_off(const std::string &table_filename, const std::string &tree_filename,
const std::string &output_filename, const std::string &format_str, Format format_val,
- const std::string &method_string, unsigned int subsample_depth, unsigned int pcoa_dims,
+ const std::string &method_string, unsigned int subsample_depth, bool subsample_with_replacement, unsigned int pcoa_dims,
unsigned int permanova_perms, const std::string &grouping_filename, const std::string &grouping_columns,
bool vaw, double g_unifrac_alpha, bool bypass_tips,
unsigned int nsubsteps, const std::string &mmap_dir) {
@@ -457,7 +458,7 @@ int mode_one_off(const std::string &table_filename, const std::string &tree_file
status = unifrac_to_file_v2(table_filename.c_str(), tree_filename.c_str(), output_filename.c_str(),
method_string.c_str(), vaw, g_unifrac_alpha, bypass_tips, nsubsteps, format_str.c_str(),
- subsample_depth, true,
+ subsample_depth, subsample_with_replacement,
pcoa_dims, permanova_perms, grouping_c, columns_c, mmap_dir_c);
if (status != okay) {
@@ -471,7 +472,7 @@ int mode_one_off(const std::string &table_filename, const std::string &tree_file
int mode_multi(const std::string &table_filename, const std::string &tree_filename,
const std::string &output_filename, const std::string &format_str, Format format_val,
const std::string &method_string,
- unsigned int n_subsamples, unsigned int subsample_depth,
+ unsigned int n_subsamples, unsigned int subsample_depth, bool subsample_with_replacement,
unsigned int pcoa_dims,
unsigned int permanova_perms, const std::string &grouping_filename, const std::string &grouping_columns,
bool vaw, double g_unifrac_alpha, bool bypass_tips,
@@ -529,7 +530,7 @@ int mode_multi(const std::string &table_filename, const std::string &tree_filena
status = unifrac_multi_to_file_v2(table_filename.c_str(), tree_filename.c_str(), output_filename.c_str(),
method_string.c_str(), vaw, g_unifrac_alpha, bypass_tips, nsubsteps, format_str.c_str(),
- n_subsamples, subsample_depth, true,
+ n_subsamples, subsample_depth, subsample_with_replacement,
pcoa_dims, permanova_perms, grouping_c, columns_c, mmap_dir_c);
if (status != okay) {
@@ -616,6 +617,7 @@ int main(int argc, char **argv){
std::string permanova_arg = input.getCmdOption("--permanova");
std::string seed_arg = input.getCmdOption("--seed");
std::string subsample_depth_arg = input.getCmdOption("--subsample-depth");
+ std::string subsample_replacement_arg = input.getCmdOption("--subsample-replacement");
std::string n_subsamples_arg = input.getCmdOption("--n-subsamples");
std::string diskbuf_arg = input.getCmdOption("--diskbuf");
@@ -702,13 +704,23 @@ int main(int argc, char **argv){
if(mode_arg == "multi" || mode_arg == "multiple") {
err("--subsample-depth required in multi mode.");
return EXIT_FAILURE;
- } else {
- subsample_depth = 0;
}
} else {
subsample_depth = atoi(subsample_depth_arg.c_str());
}
+ bool subsample_without_replacement = false;
+ if(!subsample_replacement_arg.empty()) {
+ if (subsample_replacement_arg == "with") {
+ subsample_without_replacement = false;
+ } else if (subsample_replacement_arg == "without") {
+ subsample_without_replacement = true;
+ } else {
+ err("Invalid --subsample-replacement argument, must be 'with' or 'without'.");
+ return EXIT_FAILURE;
+ }
+ }
+
unsigned int n_subsamples = 0;
if(n_subsamples_arg.empty()) {
n_subsamples = 100;
@@ -723,7 +735,7 @@ int main(int argc, char **argv){
if(mode_arg.empty() || mode_arg == "one-off")
return mode_one_off(table_filename, tree_filename, output_filename, format2str(format_val), format_val, method_string,
- subsample_depth,
+ subsample_depth, !subsample_without_replacement,
pcoa_dims, permanova_perms, grouping_filename, grouping_columns,
vaw, g_unifrac_alpha, bypass_tips, nsubsteps, diskbuf_arg);
else if(mode_arg == "partial") {
@@ -742,7 +754,7 @@ int main(int argc, char **argv){
return mode_partial_report(table_filename, uint32_t(n_partials), bare);
else if(mode_arg == "multi" || mode_arg == "multiple")
return mode_multi(table_filename, tree_filename, output_filename, format2str(format_val), format_val, method_string,
- n_subsamples,subsample_depth,
+ n_subsamples,subsample_depth, !subsample_without_replacement,
pcoa_dims, permanova_perms, grouping_filename, grouping_columns,
vaw, g_unifrac_alpha, bypass_tips, nsubsteps, diskbuf_arg);
else
=====================================
src/test_ska.cpp
=====================================
@@ -649,7 +649,7 @@ void test_subsample_replacement() {
std::string oids_org[] = {"GG_OTU_1", "GG_OTU_2","GG_OTU_3", "GG_OTU_4", "GG_OTU_5"};
std::vector<std::string> exp_org_oids = _string_array_to_vector(oids_org, exp_org_n_obs);
- su::skbio_biom_subsampled table(org_table,5);
+ su::skbio_biom_subsampled table(org_table,true,5);
uint32_t exp_n_samples = 2;
uint32_t exp_n_obs = 3;
@@ -669,7 +669,7 @@ void test_subsample_replacement() {
ASSERTINTEQ(table.n_samples , exp_n_samples);
ASSERTINTLE(table.n_obs , exp_n_obs);
ASSERT(table.get_sample_ids() == exp_sids);
- // when with replacement
+
// all columns should add to n==5
{
double exp_data[2] = {5.0, 5.0};
@@ -685,26 +685,147 @@ void test_subsample_replacement() {
ASSERT(sum_vec == exp_vec);
}
+ su::skbio_biom_subsampled table_empty(org_table,true,8);
+ uint32_t exp_empty_n_samples = 0;
+ uint32_t exp_empty_n_obs = 0;
+
+ ASSERT(table_empty.n_samples == exp_empty_n_samples);
+ ASSERT(table_empty.n_obs == exp_empty_n_obs);
+
+ SUITE_END();
+}
+
+void test_subsample_replacement_limit() {
+ SUITE_START("test subsample with replacement limit");
+
+ // test case when n == sum(vector) - 1
+
+ const char *t_obs_ids[] = {"OTU0","OTU1","OTU2","OTU3","OTU4",
+ "OTU5","OTU6","OTU7","OTU8","OTU9"};
+ const char *t_samp_ids[] = {"S1"};
+ uint32_t t_index[] = {0,0,0,0,0,0,0,0,0,0};
+ uint32_t t_idxptr[] = {0,1,2,3,4,5,6,7,8,9,10};
+ double t_data[] = {2., 1., 2., 1., 7., 6., 3., 3., 5., 5.};
+ su::biom_inmem org_table(t_obs_ids,t_samp_ids,t_index,t_idxptr,t_data,10,1);
+
+ const uint32_t exp_org_n_samples = 1;
+ const uint32_t exp_org_n_obs = 10;
+
+ // verify we did the right thing
+ ASSERTINTEQ(org_table.n_samples , exp_org_n_samples);
+ ASSERTINTEQ(org_table.n_obs , exp_org_n_obs);
+
+ // column should add to n==35
+ {
+ double exp_data[1] = {35.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : org_table.get_obs_ids()) {
+ double line[1];
+ org_table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ su::skbio_biom_subsampled table(org_table,true,34);
+ uint32_t exp_n_samples = 1;
+ uint32_t exp_n_obs = 10;
+
+ // now check basic rarefaction properties
+ ASSERTINTEQ(table.n_samples , exp_n_samples);
+ ASSERTINTLE(table.n_obs , exp_n_obs);
+
+ // column should add to n==34
+ {
+ double exp_data[1] = {34.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : table.get_obs_ids()) {
+ double line[1];
+ table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ // As in scikit-bio and biom-format
// when with replacement
- // given any column, the number of permutations should be more than n
+ // the number of permutations should be more than n
{
std::unordered_set<uint64_t> perm_set;
// will pick column 1
for (int i=0; i<1000; i++) {
- su::skbio_biom_subsampled table2(org_table,5);
+ su::skbio_biom_subsampled table2(org_table,true,34);
uint64_t val = 0;
for (auto obs_id : table2.get_obs_ids()) {
- double line[2];
+ double line[1];
table2.get_obs_data(obs_id, line);
- val = val*10 + uint64_t(line[1]);
+ val = val*8 + uint64_t(line[0]);
}
perm_set.insert(val);
}
- ASSERT(uint32_t(perm_set.size()) > uint32_t(5));
+ ASSERT(uint32_t(perm_set.size()) > uint32_t(10));
+ }
+
+ SUITE_END();
+}
+
+void test_subsample_woreplacement() {
+ SUITE_START("test subsample without replacement");
+
+ su::biom org_table("test.biom");
+ uint32_t exp_org_n_samples = 6;
+ uint32_t exp_org_n_obs = 5;
+
+ std::string sids_org[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"};
+ std::vector<std::string> exp_org_sids = _string_array_to_vector(sids_org, exp_org_n_samples);
+
+ std::string oids_org[] = {"GG_OTU_1", "GG_OTU_2","GG_OTU_3", "GG_OTU_4", "GG_OTU_5"};
+ std::vector<std::string> exp_org_oids = _string_array_to_vector(oids_org, exp_org_n_obs);
+
+ su::skbio_biom_subsampled table(org_table,false,5);
+ uint32_t exp_n_samples = 2;
+ uint32_t exp_n_obs = 3;
+
+ std::string sids[] = {"Sample1", "Sample4"};
+ std::vector<std::string> exp_sids = _string_array_to_vector(sids, exp_n_samples);
+
+ std::string oids[] = {"GG_OTU_2","GG_OTU_3", "GG_OTU_4"};
+ std::vector<std::string> exp_oids = _string_array_to_vector(oids, exp_n_obs);
+
+ // verify it did not destroy the original table
+ ASSERTINTEQ(org_table.n_samples , exp_org_n_samples);
+ ASSERTINTEQ(org_table.n_obs , exp_org_n_obs);
+ ASSERT(org_table.get_sample_ids() == exp_org_sids);
+ ASSERT(org_table.get_obs_ids() == exp_org_oids);
+
+ // now check basic rarefaction properties
+ ASSERTINTEQ(table.n_samples , exp_n_samples);
+ ASSERTINTLE(table.n_obs , exp_n_obs);
+ ASSERT(table.get_sample_ids() == exp_sids);
+
+ // all columns should add to n==5
+ {
+ double exp_data[2] = {5.0, 5.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 2);
+
+ double data_sum[2] = {0.0, 0.0};
+ for (auto obs_id : table.get_obs_ids()) {
+ double line[2];
+ table.get_obs_data(obs_id, line);
+ for (int j=0; j<2; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 2);
+ ASSERT(sum_vec == exp_vec);
}
- su::skbio_biom_subsampled table_empty(org_table,8);
+ su::skbio_biom_subsampled table_empty(org_table,false,8);
uint32_t exp_empty_n_samples = 0;
uint32_t exp_empty_n_obs = 0;
@@ -714,6 +835,87 @@ void test_subsample_replacement() {
SUITE_END();
}
+void test_subsample_woreplacement_limit() {
+ SUITE_START("test subsample without replacement limit");
+
+ // test case when n == sum(vector) - 1
+
+ const char *t_obs_ids[] = {"OTU0","OTU1","OTU2","OTU3","OTU4",
+ "OTU5","OTU6","OTU7","OTU8","OTU9"};
+ const char *t_samp_ids[] = {"S1"};
+ uint32_t t_index[] = {0,0,0,0,0,0,0,0,0,0};
+ uint32_t t_idxptr[] = {0,1,2,3,4,5,6,7,8,9,10};
+ double t_data[] = {2., 1., 2., 1., 7., 6., 3., 3., 5., 5.};
+ su::biom_inmem org_table(t_obs_ids,t_samp_ids,t_index,t_idxptr,t_data,10,1);
+
+ const uint32_t exp_org_n_samples = 1;
+ const uint32_t exp_org_n_obs = 10;
+
+ // verify we did the right thing
+ ASSERTINTEQ(org_table.n_samples , exp_org_n_samples);
+ ASSERTINTEQ(org_table.n_obs , exp_org_n_obs);
+
+ // column should add to n==35
+ {
+ double exp_data[1] = {35.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : org_table.get_obs_ids()) {
+ double line[1];
+ org_table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ su::skbio_biom_subsampled table(org_table,false,34);
+ uint32_t exp_n_samples = 1;
+ uint32_t exp_n_obs = 10;
+
+ // now check basic rarefaction properties
+ ASSERTINTEQ(table.n_samples , exp_n_samples);
+ ASSERTINTLE(table.n_obs , exp_n_obs);
+
+ // column should add to n==34
+ {
+ double exp_data[1] = {34.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : table.get_obs_ids()) {
+ double line[1];
+ table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ // As in scikit-bio and biom-format
+ // when without replacement
+ // the number of permutations should be exactly n
+ {
+ std::unordered_set<uint64_t> perm_set;
+ // will pick column 1
+ for (int i=0; i<1000; i++) {
+ su::skbio_biom_subsampled table2(org_table,false,34);
+
+ uint64_t val = 0;
+ for (auto obs_id : table2.get_obs_ids()) {
+ double line[1];
+ table2.get_obs_data(obs_id, line);
+ val = val*8 + uint64_t(line[0]);
+ }
+ perm_set.insert(val);
+ }
+ ASSERT(uint32_t(perm_set.size()) == uint32_t(10));
+ }
+
+ SUITE_END();
+}
+
int main(int argc, char** argv) {
test_center_mat();
test_pcoa();
@@ -723,6 +925,9 @@ int main(int argc, char** argv) {
test_permanova_noties();
test_permanova_unequal();
test_subsample_replacement();
+ test_subsample_replacement_limit();
+ test_subsample_woreplacement();
+ test_subsample_woreplacement_limit();
printf("\n");
printf(" %i / %i suites failed\n", suites_failed, suites_run);
=====================================
src/tree.cpp
=====================================
@@ -369,16 +369,15 @@ void BPTree::structure_to_openclose() {
}
// trim from end
// from http://stackoverflow.com/a/217605
-static inline std::string &rtrim(std::string &s) {
- s.erase(std::find_if(s.rbegin(), s.rend(),
- std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end());
- return s;
+static inline void rtrim(std::string &s) {
+ s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+ return !std::isspace(ch);
+ }).base(), s.end());
}
-
//// WEIRDNESS. THIS SOLVES IT WITH THE RTRIM. ISOLATE, MOVE TO CONSTRUCTOR.
void BPTree::newick_to_metadata(std::string newick) {
- newick = rtrim(newick);
+ rtrim(newick);
std::string::iterator start = newick.begin();
std::string::iterator end = newick.end();
=====================================
src/tree.hpp
=====================================
@@ -10,6 +10,7 @@
#ifndef __UNIFRAC_TREE_H
#define __UNIFRAC_TREE_H 1
+#include <cstdint>
#include <string>
#include <sstream>
#include <iostream>
=====================================
src/tsv.hpp
=====================================
@@ -11,6 +11,7 @@
#ifndef _UNIFRAC_TSV_H
#define _UNIFRAC_TSV_H
+#include <cstdint>
#include <vector>
#include <map>
#include <string>
=====================================
test/Makefile
=====================================
@@ -13,7 +13,7 @@ test: capi_test
test_binaries: capi_test
capi_test: capi_test.c
- $(CC) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib -o capi_test
+ $(CC) $(CFLAGS) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib $(LDFLAGS) -o capi_test
clean:
-rm -f *.o capi_test
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/6c8284acc22812b9d886b88681eb3e8901713ff9...30371a96e254628e402e15a84c246fa44ae0187e
--
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/6c8284acc22812b9d886b88681eb3e8901713ff9...30371a96e254628e402e15a84c246fa44ae0187e
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231023/92a32277/attachment-0001.htm>
More information about the debian-med-commit
mailing list