[med-svn] [Git][med-team/unifrac-tools][master] 5 commits: New upstream version 1.4
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Mon Oct 23 20:51:51 BST 2023
Nilesh Patra pushed to branch master at Debian Med / unifrac-tools
Commits:
f6ed45fb by Nilesh Patra at 2023-10-24T00:43:20+05:30
New upstream version 1.4
- - - - -
a4ec6691 by Nilesh Patra at 2023-10-24T00:43:47+05:30
Update upstream source from tag 'upstream/1.4'
Update to upstream version '1.4'
with Debian dir 9b3bafd741356fc5bea59cd6f3fad291836fdaa6
- - - - -
55faead1 by Nilesh Patra at 2023-10-24T00:51:28+05:30
Drop gcc-13.patch
- - - - -
f8581864 by Nilesh Patra at 2023-10-24T00:51:43+05:30
Refresh/re-diff patches with new version
- - - - -
30371a96 by Nilesh Patra at 2023-10-24T00:52:04+05:30
Upload to unstable
- - - - -
22 changed files:
- − .github/workflows/blank.yml
- − .github/workflows/main.yml
- README.md
- combined/Makefile
- debian/changelog
- debian/patches/enable-debug-flag.patch
- − debian/patches/gcc-13.patch
- debian/patches/hardening
- debian/patches/series
- src/Makefile
- src/api.cpp
- src/biom_inmem.hpp
- src/biom_subsampled.cpp
- src/biom_subsampled.hpp
- src/skbio_alt.cpp
- src/skbio_alt.hpp
- src/su.cpp
- src/test_ska.cpp
- src/tree.cpp
- src/tree.hpp
- src/tsv.hpp
- test/Makefile
Changes:
=====================================
.github/workflows/blank.yml deleted
=====================================
@@ -1,36 +0,0 @@
-# This is a basic workflow to help you get started with Actions
-
-name: CI
-
-# Controls when the workflow will run
-on:
- # Triggers the workflow on push or pull request events but only for the main branch
- push:
- branches: [ main ]
- pull_request:
- branches: [ main ]
-
- # Allows you to run this workflow manually from the Actions tab
- workflow_dispatch:
-
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
- # This workflow contains a single job called "build"
- build:
- # The type of runner that the job will run on
- runs-on: ubuntu-latest
-
- # Steps represent a sequence of tasks that will be executed as part of the job
- steps:
- # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v2
-
- # Runs a single command using the runners shell
- - name: Run a one-line script
- run: echo Hello, world!
-
- # Runs a set of commands using the runners shell
- - name: Run a multi-line script
- run: |
- echo Add other actions to build,
- echo test, and deploy your project.
=====================================
.github/workflows/main.yml deleted
=====================================
@@ -1,216 +0,0 @@
-name: unifrac-binaries CI
-
-on:
- push:
- branches: [ main ]
- pull_request:
- branches: [ main ]
-
-
-
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
- build-and-test:
- strategy:
- matrix:
- os: [ubuntu-latest, macos-latest, linux-gpu-cuda]
- runs-on: ${{ matrix.os }}
- steps:
-      - uses: actions/checkout@v3
-      - uses: conda-incubator/setup-miniconda@v2
- with:
- miniconda-version: "latest"
- auto-update-conda: true
- - name: Install
- shell: bash -l {0}
- run: |
- df -h .
- if [[ "$(uname -s)" == "Linux" ]];
- then
- conda create -q --yes --strict-channel-priority -n unifrac -c conda-forge -c bioconda gxx_linux-64 gfortran_linux-64 hdf5 mkl-include lz4 zlib hdf5-static libcblas liblapacke make curl
- else
- conda create -q --yes --strict-channel-priority -n unifrac -c conda-forge -c bioconda clangxx_osx-64 hdf5 mkl-include lz4 hdf5-static libcblas liblapacke make curl
- fi
- conda clean --yes -t
- df -h .
- conda activate unifrac
- echo "$(uname -s)"
- if [[ "$(uname -s)" == "Linux" ]];
- then
- which x86_64-conda-linux-gnu-gcc
- x86_64-conda-linux-gnu-gcc -v
- x86_64-conda-linux-gnu-g++ -v
- else
- which clang
- clang -v
- fi
- which h5c++
- if [[ "$(uname -s)" == "Linux" ]];
- then
- # install PGI but do not source it
- # the makefile will do it automatically
- ./scripts/install_hpc_sdk.sh </dev/null
- fi
- df -h .
- export PERFORMING_CONDA_BUILD=True
- echo "======= begin env ====="
- env
- echo "======= end env ====="
- # all == build (shlib,bins,tests) and install
- make all
- df -h .
- pushd src
- if [[ "$(uname -s)" == "Linux" ]];
- then
- rm -f ~/.R/Makevars
- conda install -q --yes --strict-channel-priority -c conda-forge r-base
- unset CXXFLAGS
- unset CFLAGS
- unset DEBUG_CXXFLAGS
- unset DEBUG_CFLAGS
- # the r-base package has a broken dependency
- ln -s $CONDA_PREFIX/lib/libreadline.so $CONDA_PREFIX/lib/libreadline.so.6
- R -e 'install.packages("Rcpp", repos="http://lib.stat.cmu.edu/R/CRAN/")'
- make rapi_test
- fi
- popd
-
- - name: Tests
- shell: bash -l {0}
- run: |
- conda activate unifrac
- # keep it low for runs in containers
- # and a weird number to potentially catch potential bugs
- export OMP_NUM_THREADS=3
- # diagnostic messages for debugging, if needed
- export UNIFRAC_GPU_INFO=Y
- pushd src
- ./test_su
- ./test_api
- ./test_ska
- popd
- pushd test
- ./capi_test 1
- # explicitly check that we do not have hdf5 dependencies
- if [[ "$(uname -s)" == "Linux" ]];
- then
- ldd ./capi_test |awk 'BEGIN{a=0}/hdf/{a=a+1;print $0}END{if (a==0) {print "No dynamic hdf5 found"} else {exit 2}}'
- else
- otool -L ./capi_test|awk 'BEGIN{a=0}/hdf/{a=a+1;print $0}END{if (a==0) {print "No dynamic hdf5 found"} else {exit 2}}'
- fi
- popd
- pushd src/testdata
- conda install --yes -c conda-forge h5py
- # weighted_unnormalized
- time ssu -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- # retry with default precision handling
- time ssu -m weighted_unnormalized -i test500.biom -t test500.tre --pcoa 4 -r hdf5 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -f -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- # matrrix will be different, but PCOA similar
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -m weighted_unnormalized_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
- # minimal precision loss between fp32 and fp64
- ./compare_unifrac_matrix.py test500.weighted_unnormalized_fp32.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_unnormalized_fp32.h5 t1.h5 3 0.1
- rm -f t1.h5
- # weighted_normalized
- time ssu -f -m weighted_normalized -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -f -m weighted_normalized_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
- ./compare_unifrac_matrix.py test500.weighted_normalized_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.weighted_normalized_fp32.f.h5 t1.h5 3 0.1
- rm -f t1.h5
- # unweighted
- time ssu -f -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- rm -f t1.h5
- time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- # hdf5 without distance matrix, just PCoA
- echo "hdf5_nodist"
- time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- # permanova
- echo "permanova"
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5 --permanova 999 -g test500.tsv -c empo_2 -o t1.h5
- # compare to values given by skbio.stats.distance.permanova
- python compare_unifrac_stats.py t1.h5 5 999 1.001112 0.456 0.001 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5 --permanova 99 -g test500.tsv -c empo_2 -o t1.h5
- python compare_unifrac_stats.py t1.h5 5 99 1.001112 0.456 0.001 0.2
- ls -l t1.h5
- rm -f t1.h5
- time ssu -m weighted_unnormalized_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -g test500.tsv -c empo_3 -o t1.h5
- # compare to values given by skbio.stats.distance.permanova
- python compare_unifrac_stats.py t1.h5 17 999 0.890697 0.865 0.001 0.1
- ls -l t1.h5
- rm -f t1.h5
- # partials
- echo "partials"
- ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial-report --n-partials 2
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial --start 0 --stop 125 -o t1.partial.1
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial --start 125 --stop 250 -o t1.partial.2
- ls -l t1.partial*
- ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode check-partial --partial-pattern 't1.partial.*'
- time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_fp64 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_fp32 -o t1.h5
- ./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_nodist -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
- ls -l t1.h5
- rm -f t1.h5
- rm -f t1.partial.*
- # subsample
- echo "subsample"
- time ssu -f -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp32 --subsample-depth 100 -o t1.h5
- ./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.3
- rm -f t1.h5
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist --permanova 99 -g test500.tsv -c empo_2 --subsample-depth 100 -o t1.h5
- python compare_unifrac_stats.py t1.h5 5 99 1.001112 0.456 0.05 0.6
- ls -l t1.h5
- rm -f t1.h5
- # multi
- echo "multi"
- time ssu -f -m unweighted -i test500.biom -t test500.tre --pcoa 4 --mode multi --subsample-depth 100 --n-subsamples 10 -o t1.h5
- ./compare_unifrac_pcoa_multi.py test500.unweighted_fp32.f.h5 t1.h5 10 3 0.3
- ls -l t1.h5
- rm -f t1.h5
- time ssu -m unweighted -i test500.biom -t test500.tre --pcoa 4 --mode multi --n-subsamples 10 --permanova 99 -g test500.tsv -c empo_2 --subsample-depth 100 -o t1.h5
- python compare_unifrac_stats_multi.py t1.h5 10 5 99 1.001112 0.456 0.08 0.6
- ls -l t1.h5
- rm -f t1.h5
- popd
- - name: Sanity checks
- shell: bash -l {0}
- run: |
- conda activate unifrac
- pushd ci
- ./crawford_test.sh
- popd
=====================================
README.md
=====================================
@@ -4,7 +4,7 @@
UniFrac is the *de facto* repository for high-performance phylogenetic diversity calculations. The methods in this repository are based on an implementation of the [Strided State UniFrac](https://www.nature.com/articles/s41592-018-0187-8) algorithm which is faster, and uses less memory than [Fast UniFrac](http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html). Strided State UniFrac supports [Unweighted UniFrac](http://aem.asm.org/content/71/12/8228.abstract), [Weighted UniFrac](http://aem.asm.org/content/73/5/1576), [Generalized UniFrac](https://academic.oup.com/bioinformatics/article/28/16/2106/324465/Associating-microbiome-composition-with), [Variance Adjusted UniFrac](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-118) and [meta UniFrac](http://www.pnas.org/content/105/39/15076.short), in both double and single precision (fp32).
This repository also includes Stacked Faith (manuscript in preparation), a method for calculating Faith's PD that is faster and uses less memory than the Fast UniFrac-based [reference implementation](http://scikit-bio.org/).
-This repository produces standalone exutables and a C API exposed via a shared library which can be linked against by any programming language.
+This repository produces standalone executables and a C API exposed via a shared library which can be linked against by any programming language.
# Citation
@@ -43,7 +43,7 @@ An example of installing UniFrac, and using it with CPUs as well as GPUs, can be
## Install (bioconda)
-This binaries can be installed via a combination of `conda-forge` and `bioconda`:
+The binaries can be installed via a combination of `conda-forge` and `bioconda`:
```
conda create --name unifrac -c conda-forge -c bioconda unifrac-binaries
@@ -65,7 +65,7 @@ make install
```
-**Note**: if you are using `conda` we recommend installing HDF5 and relate compiler using the
+**Note**: if you are using `conda` we recommend installing HDF5 and related compiler using the
`conda-forge` channel, for example:
```
@@ -74,7 +74,7 @@ conda activate unifrac
```
For GPU-enabled code, you will need the [NVIDIA HPC SDK](https://developer.nvidia.com/hpc-sdk) compiler.
-A helper script will download it, install it and setup the necessary environment:
+This helper script will download it, install it and setup the necessary environment:
```
scripts/install_hpc_sdk.sh
source setup_nv_h5.sh
@@ -173,6 +173,7 @@ The methods can be used directly through the command line after install:
hdf5_fp32 : HFD5 format, using fp32 precision.
hdf5_fp64 : HFD5 format, using fp64 precision.
--subsample-depth Depth of subsampling of the input BIOM before computing unifrac (required for mode==multi, optional for one-off)
+ --subsample-replacement [OPTIONAL] Subsample with or without replacement (default is with)
--n-subsamples [OPTIONAL] if mode==multi, number of subsampled UniFracs to compute (default: 100)
--permanova [OPTIONAL] Number of PERMANOVA permutations to compute (default: 999 with -g, do not compute if 0)
--pcoa [OPTIONAL] Number of PCoA dimensions to compute (default: 10, do not compute if 0)
=====================================
combined/Makefile
=====================================
@@ -13,10 +13,10 @@ ifeq ($(PREFIX),)
endif
libssu.o: libssu.c
- $(CC) -c libssu.c -fPIC
+ $(CC) $(CPPFLAGS) $(CFLAGS) -c libssu.c -fPIC
libssu.so: libssu.o
- $(CC) -shared -o libssu.so libssu.o -fPIC -ldl
+ $(CC) -shared -o libssu.so libssu.o -fPIC -ldl $(LDFLAGS)
install: libssu.so
rm -f ${PREFIX}/lib//libssu.so; cp libssu.so ${PREFIX}/lib/
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+unifrac-tools (1.4-1) unstable; urgency=medium
+
+ * Team Upload.
+ * New upstream version 1.4
+ * Drop gcc-13.patch
+ * Refresh/re-diff patches with new version
+
+ -- Nilesh Patra <nilesh@debian.org>  Tue, 24 Oct 2023 00:51:49 +0530
+
unifrac-tools (1.3.2-5) unstable; urgency=medium
* Team upload.
=====================================
debian/patches/enable-debug-flag.patch
=====================================
@@ -2,14 +2,14 @@ Description: Add debug flag to cppflags
Author: Nilesh Patra <nilesh@debian.org>
Last-Update: 2023-01-05
Forwarded: not-needed
---- unifrac-tools.orig/src/Makefile
-+++ unifrac-tools/src/Makefile
+--- a/src/Makefile
++++ b/src/Makefile
@@ -85,7 +85,7 @@
endif
--CPPFLAGS += -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
-+CPPFLAGS += -g -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+-CPPFLAGS += -Wall -std=c++17 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
++CPPFLAGS += -g -Wall -std=c++17 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
all: api main install
=====================================
debian/patches/gcc-13.patch deleted
=====================================
@@ -1,37 +0,0 @@
-Description: fix build failure with gcc 13
-Author: Étienne Mollier <emollier@debian.org>
-Bug-Debian: https://bugs.debian.org/1037879
-Forwarded: https://github.com/biocore/unifrac-binaries/pull/45
-Last-Update: 2023-07-23
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- unifrac-tools.orig/src/tree.hpp
-+++ unifrac-tools/src/tree.hpp
-@@ -10,6 +10,7 @@
- #ifndef __UNIFRAC_TREE_H
- #define __UNIFRAC_TREE_H 1
-
-+#include <cstdint>
- #include <string>
- #include <sstream>
- #include <iostream>
---- unifrac-tools.orig/src/biom_inmem.hpp
-+++ unifrac-tools/src/biom_inmem.hpp
-@@ -11,6 +11,7 @@
- #ifndef _UNIFRAC_BIOM_INMEM_H
- #define _UNIFRAC_BIOM_INMEM_H
-
-+#include <cstdint>
- #include <vector>
- #include <unordered_map>
-
---- unifrac-tools.orig/src/tsv.hpp
-+++ unifrac-tools/src/tsv.hpp
-@@ -11,6 +11,7 @@
- #ifndef _UNIFRAC_TSV_H
- #define _UNIFRAC_TSV_H
-
-+#include <cstdint>
- #include <vector>
- #include <map>
- #include <string>
=====================================
debian/patches/hardening
=====================================
@@ -2,23 +2,8 @@ From: Michael R. Crusoe <crusoe@debian.org>
Subject: Use CPPFLAGS, CFLAGS, and LDFLAGS
Forwarded: https://github.com/biocore/unifrac-binaries/pull/46
---- unifrac-tools.orig/combined/Makefile
-+++ unifrac-tools/combined/Makefile
-@@ -13,10 +13,10 @@
- endif
-
- libssu.o: libssu.c
-- $(CC) -c libssu.c -fPIC
-+ $(CC) $(CPPFLAGS) $(CFLAGS) -c libssu.c -fPIC
-
- libssu.so: libssu.o
-- $(CC) -shared -o libssu.so libssu.o -fPIC -ldl
-+ $(CC) -shared -o libssu.so libssu.o -fPIC -ldl $(LDFLAGS)
-
- install: libssu.so
- rm -f ${PREFIX}/lib//libssu.so; cp libssu.so ${PREFIX}/lib/
---- unifrac-tools.orig/test/Makefile
-+++ unifrac-tools/test/Makefile
+--- a/test/Makefile
++++ b/test/Makefile
@@ -1,8 +1,5 @@
.PHONY: test_binaries test
@@ -28,12 +13,3 @@ Forwarded: https://github.com/biocore/unifrac-binaries/pull/46
ifeq ($(PREFIX),)
PREFIX := $(CONDA_PREFIX)
endif
-@@ -13,7 +10,7 @@
- test_binaries: capi_test
-
- capi_test: capi_test.c
-- $(CC) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib -o capi_test
-+ $(CC) $(CFLAGS) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib $(LDFLAGS) -o capi_test
-
- clean:
- -rm -f *.o capi_test
=====================================
debian/patches/series
=====================================
@@ -4,5 +4,4 @@ soname.patch
baseline.patch
enable_linking_to_shared_hdf5_lib.patch
enable-debug-flag.patch
-gcc-13.patch
hardening
=====================================
src/Makefile
=====================================
@@ -118,7 +118,7 @@ ifeq (,$(findstring pgi,$(COMPILER)))
endif
-CPPFLAGS += -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+CPPFLAGS += -Wall -std=c++17 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
all: api main install
=====================================
src/api.cpp
=====================================
@@ -609,12 +609,7 @@ compute_status one_off_matrix_v2(const char* biom_filename, const char* tree_fil
PARSE_TREE_TABLE(tree_filename, biom_filename)
TDBG_STEP("load_files")
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
if ((table_subsampled.n_samples==0) || (table_subsampled.n_obs==0)) {
return table_empty;
}
@@ -636,12 +631,7 @@ compute_status one_off_matrix_fp32_v2(const char* biom_filename, const char* tre
PARSE_TREE_TABLE(tree_filename, biom_filename)
TDBG_STEP("load_files")
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
TDBG_STEP("subsample")
return one_off_matrix_T<float,mat_full_fp32_t>(table_subsampled,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
} else {
@@ -698,12 +688,7 @@ compute_status one_off_matrix_inmem_v2(const support_biom_t *table_data, const s
tree_data->n_parens);
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
TDBG_STEP("subsample")
return one_off_matrix_T<double,mat_full_fp64_t>(table_subsampled,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
} else {
@@ -751,12 +736,7 @@ compute_status one_off_matrix_inmem_fp32_v2(const support_biom_t *table_data, co
tree_data->n_parens);
if (subsample_depth>0) {
- // We do not implement subsampling without replacement yet
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
TDBG_STEP("subsample")
return one_off_matrix_T<float,mat_full_fp32_t>(table_subsampled,tree,unifrac_method,variance_adjust,alpha,bypass_tips,nthreads,mmap_dir,result);
} else {
@@ -1272,11 +1252,6 @@ compute_status unifrac_multi_to_file_T(hid_t real_id, const bool save_dist,
compute_status rc = okay;
SETUP_TDBG("unifrac_multi_to_file")
- if (!subsample_with_replacement) {
- fprintf(stderr, "ERROR: subsampling without replacement not implemented yet.\n");
- return table_empty;
- }
-
if (subsample_depth<1) {
fprintf(stderr, "ERROR: subsampling depth cannot be 0.\n");
return table_empty;
@@ -1317,7 +1292,7 @@ compute_status unifrac_multi_to_file_T(hid_t real_id, const bool save_dist,
su::WriteHDF5Multi<TReal,TMat> h5obj(real_id, out_filename,pcoa_dims);
for (unsigned int i=0; i<n_subsamples; i++) {
- su::skbio_biom_subsampled table_subsampled(table, subsample_depth);
+ su::skbio_biom_subsampled table_subsampled(table, subsample_with_replacement, subsample_depth);
if ((table_subsampled.n_samples==0) || (table_subsampled.n_obs==0)) {
rc = table_empty;
break;
=====================================
src/biom_inmem.hpp
=====================================
@@ -11,6 +11,7 @@
#ifndef _UNIFRAC_BIOM_INMEM_H
#define _UNIFRAC_BIOM_INMEM_H
+#include <cstdint>
#include <vector>
#include <unordered_map>
=====================================
src/biom_subsampled.cpp
=====================================
@@ -11,6 +11,9 @@
#include <iostream>
#include <stdio.h>
#include <random>
+#include <vector>
+#include <algorithm>
+#include <omp.h>
#include "biom_subsampled.hpp"
@@ -83,29 +86,190 @@ linked_sparse_transposed::~linked_sparse_transposed() {
}
}
-void linked_sparse_transposed::transposed_subsample_with_replacement(const uint32_t n, const uint32_t random_seed) {
- std::mt19937 generator(random_seed);
+namespace su {
+ // Equivalent to iterator over np.repeat
+ // https://github.com/biocore/biom-format/blob/b0e71a00ecb349a6f5f1ca64a23d71f380ddc19c/biom/_subsample.pyx#LL64C24-L64C55
+ class WeightedSampleIterator
+ {
+ public:
+ // While we do not implememnt the whole random_access_iterator interface
+ // we want the implementations to use operator- and that requires random
+ using iterator_category = std::random_access_iterator_tag;
+ using difference_type = int64_t;
+ using value_type = uint32_t;
+ using pointer = const uint32_t*;
+ using reference = const uint32_t&;
- // use common buffer to minimize allocation costs
- double *data_in = new double[max_count]; // original values
- uint32_t *data_out = new uint32_t[max_count]; // computed values
+ WeightedSampleIterator(uint64_t *_data_in, uint32_t _idx, uint64_t _cnt)
+ : data_in(_data_in)
+ , idx(_idx)
+ , cnt(_cnt)
+ {}
- for (uint32_t i=0; i<n_obs; i++) {
- unsigned int length = obs_counts_resident[i];
- double* *data_arr = obs_data_resident[i];
+ reference operator*() const { return idx; }
+ pointer operator->() const { return &idx; }
+
+ WeightedSampleIterator& operator++()
+ {
+ cnt++;
+ if (cnt>=data_in[idx]) {
+ cnt = 0;
+ idx++;
+ }
+ return *this;
+ }
+
+ WeightedSampleIterator operator++(int) { WeightedSampleIterator tmp = *this; ++(*this); return tmp; }
+
+ friend bool operator== (const WeightedSampleIterator& a, const WeightedSampleIterator& b)
+ {
+ return (a.data_in == b.data_in) && (a.idx == b.idx) && (a.cnt==b.cnt);
+ };
+
+ friend bool operator!= (const WeightedSampleIterator& a, const WeightedSampleIterator& b)
+ {
+ return !((a.data_in == b.data_in) && (a.idx == b.idx) && (a.cnt==b.cnt));
+ };
+
+ friend int64_t operator-(const WeightedSampleIterator& b, const WeightedSampleIterator& a)
+ {
+ int64_t diff = 0;
+ //assert(a.data_in == b.data_in);
+ //assert(a.idx <= b.idx);
+ //assert((a.idx > b.idx) || (a.cnt<=b.cnt));
+
+ for (uint32_t i = a.idx; i<b.idx; i++) {
+ diff += a.data_in[i];
+ }
+
+ return diff + b.cnt - a.cnt;
+ };
+
+ private:
+
+ uint64_t *data_in;
+ uint32_t idx; // index of data_in
+ uint64_t cnt; // how deep in data_in[idx] are we (must be < data_in[idx])
+ };
+
+ class WeightedSample
+ {
+ public:
+ WeightedSample(uint32_t _max_count, uint32_t _n, uint32_t random_seed)
+ : max_count(_max_count)
+ , n(_n)
+ , generator(random_seed)
+ , current_count(0)
+ , data(max_count)
+ , sample_out(n)
+ , data_out(max_count)
+ {}
+
+ void do_sample(unsigned int length, double* *data_arr) {
+ for (unsigned int j=0; j<length; j++) data_out[j] = 0;
+
+ // note: We are assuming length>=n
+ // Enforced by the caller (via filtering)
+ assign(length,data_arr);
+ std::sample(begin(), end(),
+ sample_out.begin(), n,
+ generator);
+
+ for (uint32_t j=0; j<n; j++) data_out[sample_out[j]]++;
+
+ for (unsigned int j=0; j<length; j++) *(data_arr[j]) = data_out[j];
+ }
+
+private:
+ void assign(uint32_t length, double * *data_arr) {
+ current_count = length;
+ for (uint32_t j=0; j<length; j++) data[j] = *(data_arr[j]);
+ }
+
+ WeightedSampleIterator begin() { return WeightedSampleIterator(data.data(),0,0); }
+ WeightedSampleIterator end() { return WeightedSampleIterator(data.data(),current_count,0); } // current_count is out of bounds
+
+ uint32_t max_count;
+ uint32_t n;
+ std::mt19937 generator;
- for (unsigned int j=0; j<length; j++) data_in[j] = *(data_arr[j]);
-
+ uint32_t current_count;
+
+ // use persistent buffer to minimize allocation costs
+ std::vector<uint64_t> data; // original values
+ std::vector<uint32_t> sample_out; // random output buffer
+ std::vector<uint32_t> data_out; // computed values
+ };
+
+ class WeightedSampleWithReplacement
+ {
+ public:
+ WeightedSampleWithReplacement(uint32_t _max_count, uint32_t _n, uint32_t random_seed)
+ : max_count(_max_count)
+ , n(_n)
+ , generator(random_seed)
+ , current_count(0)
+ , data(max_count)
+ , data_out(max_count)
+ {}
+
+ void do_sample(unsigned int length, double* *data_arr) {
+ assign(length,data_arr);
+
+ // note: We are assuming length>=n
+ // Enforced by the caller (via filtering)
+ double *data_in = data.data();
std::discrete_distribution<uint32_t> multinomial(data_in, data_in+length);
for (unsigned int j=0; j<length; j++) data_out[j] = 0;
for (uint32_t j=0; j<n; j++) data_out[multinomial(generator)]++;
for (unsigned int j=0; j<length; j++) *(data_arr[j]) = data_out[j];
}
- delete[] data_out;
- delete[] data_in;
+
+private:
+ void assign(uint32_t length, double * *data_arr) {
+ current_count = length;
+ for (uint32_t j=0; j<length; j++) data[j] = *(data_arr[j]);
+ }
+
+ uint32_t max_count;
+ uint32_t n;
+ std::mt19937 generator;
+
+ uint32_t current_count;
+
+ // use persistent buffer to minimize allocation costs
+ std::vector<double> data; // original values
+ std::vector<uint32_t> data_out; // computed values
+ };
+} // end namespace su
+
+
+template<class TWork>
+inline void linked_sparse_transposed::transposed_subsample(const uint32_t n, const uint32_t random_seed) {
+ const uint32_t max_threads = omp_get_max_threads();
+ std::mt19937 master_generator(random_seed);
+
+ // use common buffer to minimize allocation cost, but need one per thread
+ std::vector<TWork> sample_data_arr;
+ for (uint32_t i=0; i<max_threads; i++) sample_data_arr.emplace(sample_data_arr.end(), max_count, n, master_generator());
+
+ #pragma omp parallel for
+ for (uint32_t i=0; i<n_obs; i++) {
+ int my_thread_num = omp_get_thread_num();
+ sample_data_arr[my_thread_num].do_sample(obs_counts_resident[i], obs_data_resident[i]);
+ }
+}
+
+void linked_sparse_transposed::transposed_subsample_without_replacement(const uint32_t n, const uint32_t random_seed) {
+ transposed_subsample<WeightedSample>(n, random_seed);
}
+void linked_sparse_transposed::transposed_subsample_with_replacement(const uint32_t n, const uint32_t random_seed) {
+ transposed_subsample<WeightedSampleWithReplacement>(n, random_seed);
+}
+
+
// ===================== sparse_data_subsampled ==========================
void sparse_data_subsampled::subsample_with_replacement(const uint32_t n, const uint32_t random_seed) {
@@ -113,15 +277,24 @@ void sparse_data_subsampled::subsample_with_replacement(const uint32_t n, const
transposed.transposed_subsample_with_replacement(n,random_seed);
}
+void sparse_data_subsampled::subsample_without_replacement(const uint32_t n, const uint32_t random_seed) {
+ linked_sparse_transposed transposed(*this);
+ transposed.transposed_subsample_without_replacement(n,random_seed);
+}
+
// ===================== biom_subsampled ==========================
-biom_subsampled::biom_subsampled(const biom_inmem &parent, const uint32_t n, const uint32_t random_seed)
+biom_subsampled::biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n, const uint32_t random_seed)
: biom_inmem(true)
{
sparse_data_subsampled tmp_obj(parent.get_resident_obj(), parent.get_sample_counts(), n);
if ((tmp_obj.n_obs==0) || (tmp_obj.n_samples==0)) return; //already everything filtered out
- tmp_obj.subsample_with_replacement(n,random_seed);
+ if (w_replacement) {
+ tmp_obj.subsample_with_replacement(n,random_seed);
+ } else {
+ tmp_obj.subsample_without_replacement(n,random_seed);
+ }
// Note: We could filter out the zero rows
// But that's just an optimization and will not be worth it most of the time
steal_nonzero(parent,tmp_obj);
=====================================
src/biom_subsampled.hpp
=====================================
@@ -29,6 +29,12 @@ namespace su {
/* perform subsampling with replacement, no filtering */
void transposed_subsample_with_replacement(const uint32_t n, const uint32_t random_seed);
+
+ /* perform subsampling without replacement, no filtering */
+ void transposed_subsample_without_replacement(const uint32_t n, const uint32_t random_seed);
+ private:
+ // actual implementaiton, templated
+ template<class TWork> void transposed_subsample(const uint32_t n, const uint32_t random_seed);
public: // keep it open for ease of access
uint32_t n_obs; // row dimension
uint32_t n_samples; // column dimension
@@ -64,6 +70,9 @@ namespace su {
/* perform subsampling with replacement, no filtering */
void subsample_with_replacement(const uint32_t n, const uint32_t random_seed);
+
+ /* perform subsampling without replacement, no filtering */
+ void subsample_without_replacement(const uint32_t n, const uint32_t random_seed);
};
class biom_subsampled : public biom_inmem {
@@ -71,9 +80,10 @@ namespace su {
/* default constructor
*
* @param parent biom object to subsample
+ * @param w_replacement Whether to permute or use multinomial sampling
* @param n Number of items to subsample
*/
- biom_subsampled(const biom_inmem &parent, const uint32_t n, const uint32_t random_seed);
+ biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n, const uint32_t random_seed);
protected:
void steal_nonzero(const biom_inmem &parent, sparse_data& subsampled_obj);
=====================================
src/skbio_alt.cpp
=====================================
@@ -895,7 +895,7 @@ void su::permanova(const float * mat, unsigned int n_dims,
// ======================= skbio_biom_subsampled ================================
-su::skbio_biom_subsampled::skbio_biom_subsampled(const biom_inmem &parent, const uint32_t n)
- : su::biom_subsampled(parent, n, uint32_t(myRandomGenerator()))
+su::skbio_biom_subsampled::skbio_biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n)
+ : su::biom_subsampled(parent, w_replacement, n, uint32_t(myRandomGenerator()))
{}
=====================================
src/skbio_alt.hpp
=====================================
@@ -55,9 +55,10 @@ public:
/* default constructor
*
* @param parent biom object to subsample
+ * @param w_replacement Whether to permute or use multinomial sampling
* @param n Number of items to subsample
*/
- skbio_biom_subsampled(const biom_inmem &parent, const uint32_t n);
+ skbio_biom_subsampled(const biom_inmem &parent, const bool w_replacement, const uint32_t n);
};
=====================================
src/su.cpp
=====================================
@@ -49,6 +49,7 @@ void usage() {
std::cout << " \t\t hdf5_fp64 : HFD5 format, using fp64 precision." << std::endl;
std::cout << " \t\t hdf5_nodist : HFD5 format, no distance matrix. (default if mode==multi)" << std::endl;
std::cout << " --subsample-depth\tDepth of subsampling of the input BIOM before computing unifrac (required for mode==multi, optional for one-off)" << std::endl;
+ std::cout << " --subsample-replacement\t[OPTIONAL] Subsample with or without replacement (default is with)" << std::endl;
std::cout << " --n-subsamples\t[OPTIONAL] if mode==multi, number of subsampled UniFracs to compute (default: 100)" << std::endl;
std::cout << " --permanova\t[OPTIONAL] Number of PERMANOVA permutations to compute (default: 999 with -g, do not compute if 0)" << std::endl;
std::cout << " --pcoa\t[OPTIONAL] Number of PCoA dimensions to compute (default: 10, do not compute if 0)" << std::endl;
@@ -393,7 +394,7 @@ int mode_partial(std::string table_filename, std::string tree_filename,
int mode_one_off(const std::string &table_filename, const std::string &tree_filename,
const std::string &output_filename, const std::string &format_str, Format format_val,
- const std::string &method_string, unsigned int subsample_depth, unsigned int pcoa_dims,
+ const std::string &method_string, unsigned int subsample_depth, bool subsample_with_replacement, unsigned int pcoa_dims,
unsigned int permanova_perms, const std::string &grouping_filename, const std::string &grouping_columns,
bool vaw, double g_unifrac_alpha, bool bypass_tips,
unsigned int nsubsteps, const std::string &mmap_dir) {
@@ -457,7 +458,7 @@ int mode_one_off(const std::string &table_filename, const std::string &tree_file
status = unifrac_to_file_v2(table_filename.c_str(), tree_filename.c_str(), output_filename.c_str(),
method_string.c_str(), vaw, g_unifrac_alpha, bypass_tips, nsubsteps, format_str.c_str(),
- subsample_depth, true,
+ subsample_depth, subsample_with_replacement,
pcoa_dims, permanova_perms, grouping_c, columns_c, mmap_dir_c);
if (status != okay) {
@@ -471,7 +472,7 @@ int mode_one_off(const std::string &table_filename, const std::string &tree_file
int mode_multi(const std::string &table_filename, const std::string &tree_filename,
const std::string &output_filename, const std::string &format_str, Format format_val,
const std::string &method_string,
- unsigned int n_subsamples, unsigned int subsample_depth,
+ unsigned int n_subsamples, unsigned int subsample_depth, bool subsample_with_replacement,
unsigned int pcoa_dims,
unsigned int permanova_perms, const std::string &grouping_filename, const std::string &grouping_columns,
bool vaw, double g_unifrac_alpha, bool bypass_tips,
@@ -529,7 +530,7 @@ int mode_multi(const std::string &table_filename, const std::string &tree_filena
status = unifrac_multi_to_file_v2(table_filename.c_str(), tree_filename.c_str(), output_filename.c_str(),
method_string.c_str(), vaw, g_unifrac_alpha, bypass_tips, nsubsteps, format_str.c_str(),
- n_subsamples, subsample_depth, true,
+ n_subsamples, subsample_depth, subsample_with_replacement,
pcoa_dims, permanova_perms, grouping_c, columns_c, mmap_dir_c);
if (status != okay) {
@@ -616,6 +617,7 @@ int main(int argc, char **argv){
std::string permanova_arg = input.getCmdOption("--permanova");
std::string seed_arg = input.getCmdOption("--seed");
std::string subsample_depth_arg = input.getCmdOption("--subsample-depth");
+ std::string subsample_replacement_arg = input.getCmdOption("--subsample-replacement");
std::string n_subsamples_arg = input.getCmdOption("--n-subsamples");
std::string diskbuf_arg = input.getCmdOption("--diskbuf");
@@ -702,13 +704,23 @@ int main(int argc, char **argv){
if(mode_arg == "multi" || mode_arg == "multiple") {
err("--subsample-depth required in multi mode.");
return EXIT_FAILURE;
- } else {
- subsample_depth = 0;
}
} else {
subsample_depth = atoi(subsample_depth_arg.c_str());
}
+ bool subsample_without_replacement = false;
+ if(!subsample_replacement_arg.empty()) {
+ if (subsample_replacement_arg == "with") {
+ subsample_without_replacement = false;
+ } else if (subsample_replacement_arg == "without") {
+ subsample_without_replacement = true;
+ } else {
+ err("Invalid --subsample-replacement argument, must be 'with' or 'without'.");
+ return EXIT_FAILURE;
+ }
+ }
+
unsigned int n_subsamples = 0;
if(n_subsamples_arg.empty()) {
n_subsamples = 100;
@@ -723,7 +735,7 @@ int main(int argc, char **argv){
if(mode_arg.empty() || mode_arg == "one-off")
return mode_one_off(table_filename, tree_filename, output_filename, format2str(format_val), format_val, method_string,
- subsample_depth,
+ subsample_depth, !subsample_without_replacement,
pcoa_dims, permanova_perms, grouping_filename, grouping_columns,
vaw, g_unifrac_alpha, bypass_tips, nsubsteps, diskbuf_arg);
else if(mode_arg == "partial") {
@@ -742,7 +754,7 @@ int main(int argc, char **argv){
return mode_partial_report(table_filename, uint32_t(n_partials), bare);
else if(mode_arg == "multi" || mode_arg == "multiple")
return mode_multi(table_filename, tree_filename, output_filename, format2str(format_val), format_val, method_string,
- n_subsamples,subsample_depth,
+ n_subsamples,subsample_depth, !subsample_without_replacement,
pcoa_dims, permanova_perms, grouping_filename, grouping_columns,
vaw, g_unifrac_alpha, bypass_tips, nsubsteps, diskbuf_arg);
else
=====================================
src/test_ska.cpp
=====================================
@@ -649,7 +649,7 @@ void test_subsample_replacement() {
std::string oids_org[] = {"GG_OTU_1", "GG_OTU_2","GG_OTU_3", "GG_OTU_4", "GG_OTU_5"};
std::vector<std::string> exp_org_oids = _string_array_to_vector(oids_org, exp_org_n_obs);
- su::skbio_biom_subsampled table(org_table,5);
+ su::skbio_biom_subsampled table(org_table,true,5);
uint32_t exp_n_samples = 2;
uint32_t exp_n_obs = 3;
@@ -669,7 +669,7 @@ void test_subsample_replacement() {
ASSERTINTEQ(table.n_samples , exp_n_samples);
ASSERTINTLE(table.n_obs , exp_n_obs);
ASSERT(table.get_sample_ids() == exp_sids);
- // when with replacement
+
// all columns should add to n==5
{
double exp_data[2] = {5.0, 5.0};
@@ -685,26 +685,147 @@ void test_subsample_replacement() {
ASSERT(sum_vec == exp_vec);
}
+ su::skbio_biom_subsampled table_empty(org_table,true,8);
+ uint32_t exp_empty_n_samples = 0;
+ uint32_t exp_empty_n_obs = 0;
+
+ ASSERT(table_empty.n_samples == exp_empty_n_samples);
+ ASSERT(table_empty.n_obs == exp_empty_n_obs);
+
+ SUITE_END();
+}
+
+void test_subsample_replacement_limit() {
+ SUITE_START("test subsample with replacement limit");
+
+ // test case when n == sum(vector) - 1
+
+ const char *t_obs_ids[] = {"OTU0","OTU1","OTU2","OTU3","OTU4",
+ "OTU5","OTU6","OTU7","OTU8","OTU9"};
+ const char *t_samp_ids[] = {"S1"};
+ uint32_t t_index[] = {0,0,0,0,0,0,0,0,0,0};
+ uint32_t t_idxptr[] = {0,1,2,3,4,5,6,7,8,9,10};
+ double t_data[] = {2., 1., 2., 1., 7., 6., 3., 3., 5., 5.};
+ su::biom_inmem org_table(t_obs_ids,t_samp_ids,t_index,t_idxptr,t_data,10,1);
+
+ const uint32_t exp_org_n_samples = 1;
+ const uint32_t exp_org_n_obs = 10;
+
+ // verify we did the right thing
+ ASSERTINTEQ(org_table.n_samples , exp_org_n_samples);
+ ASSERTINTEQ(org_table.n_obs , exp_org_n_obs);
+
+ // column should add to n==35
+ {
+ double exp_data[1] = {35.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : org_table.get_obs_ids()) {
+ double line[1];
+ org_table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ su::skbio_biom_subsampled table(org_table,true,34);
+ uint32_t exp_n_samples = 1;
+ uint32_t exp_n_obs = 10;
+
+ // now check basic rarefaction properties
+ ASSERTINTEQ(table.n_samples , exp_n_samples);
+ ASSERTINTLE(table.n_obs , exp_n_obs);
+
+ // column should add to n==34
+ {
+ double exp_data[1] = {34.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : table.get_obs_ids()) {
+ double line[1];
+ table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ // As in scikit-bio and biom-format
// when with replacement
- // given any column, the number of permutations should be more than n
+ // the number of permutations should be more than n
{
std::unordered_set<uint64_t> perm_set;
// will pick column 1
for (int i=0; i<1000; i++) {
- su::skbio_biom_subsampled table2(org_table,5);
+ su::skbio_biom_subsampled table2(org_table,true,34);
uint64_t val = 0;
for (auto obs_id : table2.get_obs_ids()) {
- double line[2];
+ double line[1];
table2.get_obs_data(obs_id, line);
- val = val*10 + uint64_t(line[1]);
+ val = val*8 + uint64_t(line[0]);
}
perm_set.insert(val);
}
- ASSERT(uint32_t(perm_set.size()) > uint32_t(5));
+ ASSERT(uint32_t(perm_set.size()) > uint32_t(10));
+ }
+
+ SUITE_END();
+}
+
+void test_subsample_woreplacement() {
+ SUITE_START("test subsample without replacement");
+
+ su::biom org_table("test.biom");
+ uint32_t exp_org_n_samples = 6;
+ uint32_t exp_org_n_obs = 5;
+
+ std::string sids_org[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"};
+ std::vector<std::string> exp_org_sids = _string_array_to_vector(sids_org, exp_org_n_samples);
+
+ std::string oids_org[] = {"GG_OTU_1", "GG_OTU_2","GG_OTU_3", "GG_OTU_4", "GG_OTU_5"};
+ std::vector<std::string> exp_org_oids = _string_array_to_vector(oids_org, exp_org_n_obs);
+
+ su::skbio_biom_subsampled table(org_table,false,5);
+ uint32_t exp_n_samples = 2;
+ uint32_t exp_n_obs = 3;
+
+ std::string sids[] = {"Sample1", "Sample4"};
+ std::vector<std::string> exp_sids = _string_array_to_vector(sids, exp_n_samples);
+
+ std::string oids[] = {"GG_OTU_2","GG_OTU_3", "GG_OTU_4"};
+ std::vector<std::string> exp_oids = _string_array_to_vector(oids, exp_n_obs);
+
+ // verify it did not destroy the original table
+ ASSERTINTEQ(org_table.n_samples , exp_org_n_samples);
+ ASSERTINTEQ(org_table.n_obs , exp_org_n_obs);
+ ASSERT(org_table.get_sample_ids() == exp_org_sids);
+ ASSERT(org_table.get_obs_ids() == exp_org_oids);
+
+ // now check basic rarefaction properties
+ ASSERTINTEQ(table.n_samples , exp_n_samples);
+ ASSERTINTLE(table.n_obs , exp_n_obs);
+ ASSERT(table.get_sample_ids() == exp_sids);
+
+ // all columns should add to n==5
+ {
+ double exp_data[2] = {5.0, 5.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 2);
+
+ double data_sum[2] = {0.0, 0.0};
+ for (auto obs_id : table.get_obs_ids()) {
+ double line[2];
+ table.get_obs_data(obs_id, line);
+ for (int j=0; j<2; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 2);
+ ASSERT(sum_vec == exp_vec);
}
- su::skbio_biom_subsampled table_empty(org_table,8);
+ su::skbio_biom_subsampled table_empty(org_table,false,8);
uint32_t exp_empty_n_samples = 0;
uint32_t exp_empty_n_obs = 0;
@@ -714,6 +835,87 @@ void test_subsample_replacement() {
SUITE_END();
}
+void test_subsample_woreplacement_limit() {
+ SUITE_START("test subsample without replacement limit");
+
+ // test case when n == sum(vector) - 1
+
+ const char *t_obs_ids[] = {"OTU0","OTU1","OTU2","OTU3","OTU4",
+ "OTU5","OTU6","OTU7","OTU8","OTU9"};
+ const char *t_samp_ids[] = {"S1"};
+ uint32_t t_index[] = {0,0,0,0,0,0,0,0,0,0};
+ uint32_t t_idxptr[] = {0,1,2,3,4,5,6,7,8,9,10};
+ double t_data[] = {2., 1., 2., 1., 7., 6., 3., 3., 5., 5.};
+ su::biom_inmem org_table(t_obs_ids,t_samp_ids,t_index,t_idxptr,t_data,10,1);
+
+ const uint32_t exp_org_n_samples = 1;
+ const uint32_t exp_org_n_obs = 10;
+
+ // verify we did the right thing
+ ASSERTINTEQ(org_table.n_samples , exp_org_n_samples);
+ ASSERTINTEQ(org_table.n_obs , exp_org_n_obs);
+
+ // column should add to n==35
+ {
+ double exp_data[1] = {35.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : org_table.get_obs_ids()) {
+ double line[1];
+ org_table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ su::skbio_biom_subsampled table(org_table,false,34);
+ uint32_t exp_n_samples = 1;
+ uint32_t exp_n_obs = 10;
+
+ // now check basic rarefaction properties
+ ASSERTINTEQ(table.n_samples , exp_n_samples);
+ ASSERTINTLE(table.n_obs , exp_n_obs);
+
+ // column should add to n==34
+ {
+ double exp_data[1] = {34.0};
+ std::vector<double> exp_vec = _double_array_to_vector(exp_data, 1);
+
+ double data_sum[1] = {0.0};
+ for (auto obs_id : table.get_obs_ids()) {
+ double line[1];
+ table.get_obs_data(obs_id, line);
+ for (int j=0; j<1; j++) data_sum[j] += line[j];
+ }
+ std::vector<double> sum_vec = _double_array_to_vector(data_sum, 1);
+ ASSERT(sum_vec == exp_vec);
+ }
+
+ // As in scikit-bio and biom-format
+ // when without replacement
+ // the number of permutations should be exactly n
+ {
+ std::unordered_set<uint64_t> perm_set;
+ // will pick column 1
+ for (int i=0; i<1000; i++) {
+ su::skbio_biom_subsampled table2(org_table,false,34);
+
+ uint64_t val = 0;
+ for (auto obs_id : table2.get_obs_ids()) {
+ double line[1];
+ table2.get_obs_data(obs_id, line);
+ val = val*8 + uint64_t(line[0]);
+ }
+ perm_set.insert(val);
+ }
+ ASSERT(uint32_t(perm_set.size()) == uint32_t(10));
+ }
+
+ SUITE_END();
+}
+
int main(int argc, char** argv) {
test_center_mat();
test_pcoa();
@@ -723,6 +925,9 @@ int main(int argc, char** argv) {
test_permanova_noties();
test_permanova_unequal();
test_subsample_replacement();
+ test_subsample_replacement_limit();
+ test_subsample_woreplacement();
+ test_subsample_woreplacement_limit();
printf("\n");
printf(" %i / %i suites failed\n", suites_failed, suites_run);
=====================================
src/tree.cpp
=====================================
@@ -369,16 +369,15 @@ void BPTree::structure_to_openclose() {
}
// trim from end
// from http://stackoverflow.com/a/217605
-static inline std::string &rtrim(std::string &s) {
- s.erase(std::find_if(s.rbegin(), s.rend(),
- std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end());
- return s;
+static inline void rtrim(std::string &s) {
+ s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+ return !std::isspace(ch);
+ }).base(), s.end());
}
-
//// WEIRDNESS. THIS SOLVES IT WITH THE RTRIM. ISOLATE, MOVE TO CONSTRUCTOR.
void BPTree::newick_to_metadata(std::string newick) {
- newick = rtrim(newick);
+ rtrim(newick);
std::string::iterator start = newick.begin();
std::string::iterator end = newick.end();
=====================================
src/tree.hpp
=====================================
@@ -10,6 +10,7 @@
#ifndef __UNIFRAC_TREE_H
#define __UNIFRAC_TREE_H 1
+#include <cstdint>
#include <string>
#include <sstream>
#include <iostream>
=====================================
src/tsv.hpp
=====================================
@@ -11,6 +11,7 @@
#ifndef _UNIFRAC_TSV_H
#define _UNIFRAC_TSV_H
+#include <cstdint>
#include <vector>
#include <map>
#include <string>
=====================================
test/Makefile
=====================================
@@ -13,7 +13,7 @@ test: capi_test
test_binaries: capi_test
capi_test: capi_test.c
- $(CC) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib -o capi_test
+ $(CC) $(CFLAGS) -std=c99 -O0 -g capi_test.c -I../src -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib $(LDFLAGS) -o capi_test
clean:
-rm -f *.o capi_test
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/6c8284acc22812b9d886b88681eb3e8901713ff9...30371a96e254628e402e15a84c246fa44ae0187e
--
View it on GitLab: https://salsa.debian.org/med-team/unifrac-tools/-/compare/6c8284acc22812b9d886b88681eb3e8901713ff9...30371a96e254628e402e15a84c246fa44ae0187e
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231023/92a32277/attachment-0001.htm>
More information about the debian-med-commit
mailing list