[med-svn] [Git][med-team/python-biom-format][upstream] New upstream version 2.1.16

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Mon May 27 21:40:47 BST 2024



Étienne Mollier pushed to branch upstream at Debian Med / python-biom-format


Commits:
842fbdbf by Étienne Mollier at 2024-05-27T20:07:00+02:00
New upstream version 2.1.16
- - - - -


24 changed files:

- .github/workflows/python-package-conda.yml
- .github/workflows/release.yml
- ChangeLog.md
- biom/__init__.py
- biom/_filter.pyx
- biom/_subsample.pyx
- biom/_transform.pyx
- biom/table.py
- biom/tests/test_cli/test_add_metadata.py
- biom/tests/test_cli/test_subset_table.py
- biom/tests/test_cli/test_summarize_table.py
- biom/tests/test_cli/test_table_converter.py
- biom/tests/test_cli/test_table_normalizer.py
- biom/tests/test_cli/test_validate_table.py
- + biom/tests/test_data/edgecase_issue_952.biom
- biom/tests/test_parse.py
- biom/tests/test_table.py
- biom/tests/test_util.py
- biom/util.py
- ci/aarch64.conda_requirements.txt
- ci/conda_requirements.txt
- doc/conf.py
- doc/index.rst
- setup.py


Changes:

=====================================
.github/workflows/python-package-conda.yml
=====================================
@@ -9,8 +9,8 @@ on:
     branches: [ master ]
 
 env:
-  latest_python: "3.11"
-  supported_pythons: '["3.7", "3.8", "3.9", "3.10", "3.11"]'
+  latest_python: "3.12"
+  supported_pythons: '["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]'
   miniforge_version: "22.9.0-2"
   miniforge_variant: "Mambaforge"
 
@@ -34,7 +34,7 @@ jobs:
     needs: conf
     runs-on: "ubuntu-latest"
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-update-conda: true
@@ -56,7 +56,7 @@ jobs:
     needs: ["conf", "lint"]
     runs-on: "ubuntu-latest"
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-update-conda: true
@@ -81,15 +81,15 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: ["ubuntu-latest", "macos-latest"]
+        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
         python_version: ${{ fromJSON(needs.conf.outputs.supported_pythons) }}
         use_conda: [true, false]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-update-conda: true
-          python-version: ${{ env.latest_python }}
+          python-version: ${{ matrix.python_version }}
           miniforge-version: ${{ env.miniforge_version }}
           miniforge-variant: ${{ env.miniforge_variant }}
           environment-file: ci/conda_host_env.yml
@@ -115,7 +115,7 @@ jobs:
     needs: ["conf", "lint", "doc", "test-all"]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         # setup-buildx-action uses the git context directly
         # but checklist wants the .git directory
       - name: Set up QEMU


=====================================
.github/workflows/release.yml
=====================================
@@ -1,92 +1,128 @@
 name: Release
 
-on: [push, pull_request]
+on:
+  push:
+    tags:
+      - '*'
+
+env:
+  earliest_python: "3.8"
+  latest_python: "3.12"
+  miniforge_version: "23.11.0-0"
+  miniforge_variant: "Mambaforge"
 
 jobs:
-  build_sdist:
-    name: Build sdist
+  release:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-
-      - name: Build distribution
-        run: |
-          export RELEASE_VERSION=${{ github.ref_name }}
-          pip install numpy cython
-          pipx run build --sdist
-
-      - uses: actions/upload-artifact@v3
-        with:
-          name: dist-artifacts
-          path: dist/*.tar.gz
-
-  # adapted from 
-  # https://github.com/biopython/biopython/blob/master/.github/workflows/ci.yml
-  build_wheels:
-    name: Build wheels (py ${{ matrix.pyver }}) ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, macos-latest]
-        pyver: ["37", "38", "39", "310", "311"]
-
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.8
         uses: actions/setup-python@v2
         with:
-          python-version: 3.9
-
-      - name: Install Python packaging tools
+          python-version: 3.8
+      - name: Build distribution
         run: |
+          # set version from '${{ github.ref_name }}'
           export RELEASE_VERSION=${{ github.ref_name }}
           pip install numpy cython
-          python -m pip install --upgrade pip setuptools wheel
-
-      # https://github.com/pypa/cibuildwheel/blob/main/examples/github-deploy.yml
-      - name: Build wheels (py ${{ matrix.pyver }}) Linux
-        if: matrix.os == 'ubuntu-latest' 
-        env:
-          CIBW_ARCHS_LINUX: x86_64
-          CIBW_SKIP: "*-musllinux*"
-          CIBW_BUILD: "cp${{ matrix.pyver }}-*"
-
-        uses: pypa/cibuildwheel@v2.12.3
-      
-      - name: Build wheels (py ${{ matrix.pyver }}) MacOS
-        if: matrix.os == 'macos-latest'
-        env:
-          CIBW_ARCHS_MACOS: "x86_64 arm64 universal2"
-          CIBW_BUILD: "cp${{ matrix.pyver }}-*"
-
-        uses: pypa/cibuildwheel@v2.12.3
-      
-      
-      - name: Upload wheels
-        uses: actions/upload-artifact@v3
-        with:
-          name: dist-artifacts
-          path: ./wheelhouse/*.whl
-
-  release:
-    needs: [build_wheels, build_sdist]
-    runs-on: ubuntu-latest
-    # this is not ideal as it doesn't limit to what type of tag
-    # but it at least seems to work
-    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
-
-    steps:
-      - name: Stage artifacts
-        uses: actions/download-artifact@v3
-        with:
-          name: dist-artifacts
-          path: dist/
-      
-      - name: Check artifacts
-        run: ls -lrt dist/
+          python setup.py sdist
 
       - name: Publish a Python distribution to PyPI
-        uses: pypa/gh-action-pypi-publish@v1.5.0
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+        uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__
           password: ${{ secrets.PYPI_API_TOKEN }}
+
+# wheels are not working
+# cutting them out did not "just" work
+# getting a release out right now is more important than
+# precompiled builds.
+  #jobs:
+  #  build_sdist:
+  #    name: Build sdist
+  #    runs-on: ubuntu-latest
+  #    steps:
+  #      - uses: actions/checkout@v4
+  #
+  #      - name: Build distribution
+  #        run: |
+  #          pip install numpy cython
+  #          pipx run build --sdist
+  #
+  #      - uses: actions/upload-artifact@v4
+  #        with:
+  #          name: cibw-sdist
+  #          path: dist/*.tar.gz
+  #
+  #  # adapted from 
+  #  # https://github.com/pypa/cibuildwheel/blob/main/examples/github-deploy.yml
+  #  build_wheels:
+  #    name: Build wheels (py ${{ matrix.pyver }}) ${{ matrix.os }}
+  #    runs-on: ${{ matrix.os }}
+  #    strategy:
+  #      matrix:
+  #        os: [ubuntu-latest, macos-13, macos-14]
+  #        pyver: ["37", "38", "39", "310", "311", "312"]
+  #
+  #    steps:
+  #      - uses: actions/checkout@v4
+  #
+  #      - name: Install Python packaging tools
+  #        run: |
+  #          pip install numpy cython
+  #          python -m pip install --upgrade pip setuptools wheel
+  #
+  #      - name: Build wheels (py ${{ matrix.pyver }}) Linux
+  #        if: matrix.os == 'ubuntu-latest' 
+  #        env:
+  #          CIBW_ARCHS_LINUX: "x86_64 aarch64"
+  #          CIBW_SKIP: "*-musllinux*"
+  #          CIBW_BUILD: "cp${{ matrix.pyver }}-*"
+  #
+  #        uses: pypa/cibuildwheel@v2.17.0
+  #      
+  #      - name: Build wheels (py ${{ matrix.pyver }}) MacOS
+  #        if: matrix.os == 'macos-latest'
+  #        env:
+  #          CIBW_ARCHS_MACOS: "x86_64 arm64 universal2"
+  #          CIBW_BUILD: "cp${{ matrix.pyver }}-*"
+  #
+  #        uses: pypa/cibuildwheel@v2.17.0
+  #
+  #      - name: Build wheels (py ${{ matrix.pyver }}) Windows
+  #        if: matrix.os == 'windows-latest'
+  #        env:
+  #          CIBW_ARCHS_WINDOWS: "amd64 win32"
+  #          CIBW_BUILD: "cp${{ matrix.pyver }}-*"
+  #
+  #        uses: pypa/cibuildwheel@v2.17.0
+  #
+  #      - uses: actions/upload-artifact@v4
+  #        with:
+  #          name: cibw-wheels-${{ matrix.os }}-${{ matrix.pyver }}-${{ strategy.job-index }}
+  #          path: ./wheelhouse/*.whl
+  #
+  #  release:
+  #    needs: [build_wheels, build_sdist]
+  #    runs-on: ubuntu-latest
+  #    environment: pypi
+  #    permissions:
+  #      id-token: write
+  #
+  #    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+  #    steps:
+  #      - uses: actions/download-artifact@v4
+  #        with:
+  #          name: cibw-*
+  #          path: dist
+  #          merge-multiple: true
+  #      
+  #      - name: Check artifacts
+  #        run: ls -lrt dist/
+  #
+  #      - name: Publish Distribution
+  #        uses: pypa/gh-action-pypi-publish@v1.5.0
+  #        with:
+  #          user: __token__
+  #          password: ${{ secrets.PYPI_API_TOKEN }}


=====================================
ChangeLog.md
=====================================
@@ -1,6 +1,31 @@
 BIOM-Format ChangeLog
 =====================
 
+biom 2.1.16
+-----------
+
+Maintenance, bug fix, performance and feature release, May 9th 2024.
+
+New features:
+
+* Add Windows support. PR [#951](https://github.com/biocore/biom-format/pull/951) revises the codebase to be Windows compatible and adds this support to the CI testing matrix.
+* Add NumPy 2.0 support. PR [#950](https://github.com/biocore/biom-format/pull/950) ensures code compatibility with NumPy 2.0. This support is yet to be added to the CI testing matrix.
+* Expand API for `Table.partition` to allow for passing `dict` mappings from ids to groups and vice versa, removal of empty vectors, and ignoring `None` partitions. See issue [#937](https://github.com/biocore/biom-format/issues/937)
+* NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956)
+* The optimized subsample without replacement method is now exposed as `biom.subsample`. Note that this method operates in place on SciPy `csr_matrix` and `csc_matrix` objects. See issue [#958](https://github.com/biocore/biom-format/issues/958)
+
+Bug Fixes:
+
+* Fixed an edge case in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)
+* In `subsample(..., with_replacement=True)`, it was possible to trigger a numerical stability issue on sum, see issue [#952](https://github.com/biocore/biom-format/issues/952)
+* `update_ids(..., strict=False)` could yield truncated IDs, see issue [#957](https://github.com/biocore/biom-format/issues/957)
+
+Performance improvements:
+
+* Revise `Table._fast_merge` to use COO directly. For very large tables, this reduces runtime by ~50x and memory by ~5x. See PR [#933](https://github.com/biocore/biom-format/pull/933).
+* Drastically reduce the memory needs of subsampling when sums are large. Also adds 64-bit support. See PR [#935](https://github.com/biocore/biom-format/pull/935).
+* Improve handling of not-perfectly-integer inputs. See PR [#938](https://github.com/biocore/biom-format/pull/938).
+
 biom 2.1.15
 -----------
 
@@ -11,7 +36,7 @@ Bug fixes:
 * Allow `Table.to_json` to properly handle numpy types in metadata, see issue [#886](https://github.com/biocore/biom-format/issues/886)
 * Do not modify IDs in place in the presence of duplicate relabels, see issue [#892](https://github.com/biocore/biom-format/issues/892)
 * Catch an edge case where a failed ID update in place would actually change IDs, see issue [#892](https://github.com/biocore/biom-format/issues/892)
-
+ 
 New features:
 
 * `biom.parse.save_table` makes saving less tedious, see issue [#897](https://github.com/biocore/biom-format/issues/897)
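
As a quick illustration of the expanded `Table.partition` API described in the 2.1.16 notes above, a minimal sketch (the table contents and group names here are invented for illustration) could look like:

    import numpy as np
    from biom import Table

    # a small 3 observation x 3 sample table
    table = Table(np.array([[0, 1, 2],
                            [3, 0, 0],
                            [4, 0, 0]]),
                  ['O1', 'O2', 'O3'],
                  ['S1', 'S2', 'S3'])

    # a dict mapping group -> list of sample ids; a dict of
    # sample id -> group is accepted as well
    groups = {'foo': ['S1', 'S3'], 'bar': ['S2']}

    for name, subtable in table.partition(groups, remove_empty=True):
        print(name, subtable.shape)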


=====================================
biom/__init__.py
=====================================
@@ -51,6 +51,7 @@ either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse accordingly:
 from .table import Table
 from .parse import parse_biom_table as parse_table, load_table, save_table
 from .util import __format_version__, __version__
+from ._subsample import subsample
 
 __author__ = "Daniel McDonald"
 __copyright__ = "Copyright 2011-2020, The BIOM Format Development Team"
@@ -58,7 +59,7 @@ __credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso",
                "Jose Clemente", "Justin Kuczynski", "Antonio Gonzalez",
                "Yoshiki Vazquez Baeza", "Jose Navas", "Adam Robbins-Pianka",
                "Rob Knight", "Joshua Shorenstein", "Emily TerAvest",
-               "Michael Shaffer"]
+               "Michael Shaffer", "Qiyun Zhu", "Matt Aton"]
 __license__ = "BSD"
 __url__ = "http://biom-format.org"
 __maintainer__ = "Daniel McDonald"
@@ -95,4 +96,5 @@ def concat(tables, *args, **kwargs):
 
 
 __all__ = ['Table', 'example_table', 'parse_table', 'load_table',
-           '__format_version__', '__version__', 'save_table']
+           '__format_version__', '__version__', 'save_table',
+           'subsample']
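
Since `subsample` is now part of the public `biom` namespace, a rough usage sketch (values chosen arbitrarily; note the in-place semantics described in the changelog) might be:

    import numpy as np
    from scipy.sparse import csr_matrix
    import biom

    rng = np.random.default_rng(42)
    vec = csr_matrix(np.array([[10., 0., 5.]]))  # a 1xM sparse vector

    # rarefy to a depth of 5 without replacement; vec.data is
    # modified in place
    biom.subsample(vec, 5, False, rng)
    print(vec.toarray())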


=====================================
biom/_filter.pyx
=====================================
@@ -13,6 +13,7 @@ from types import FunctionType
 
 import numpy as np
 cimport numpy as cnp
+cnp.import_array()
 
 
 cdef cnp.ndarray[cnp.uint8_t, ndim=1] \


=====================================
biom/_subsample.pyx
=====================================
@@ -6,66 +6,169 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # -----------------------------------------------------------------------------
 
-
 import numpy as np
 cimport numpy as cnp
+cnp.import_array()
 
 
-def _subsample(arr, n, with_replacement, rng):
-    """Subsample non-zero values of a sparse array
+cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
+                                 cnp.ndarray[cnp.int32_t, ndim=1] indptr,
+                                 cnp.int64_t n,
+                                 object rng):
+    """Subsample non-zero values of a sparse array with replacement
+
+    Note: this method operates in place
 
     Parameters
     ----------
-    arr : {csr_matrix, csc_matrix}
-        A 1xM sparse vector
+    data : {csr_matrix, csc_matrix}.data
+        A 1xM sparse vector data
+    indptr : {csr_matrix, csc_matrix}.indptr
+        A 1xM sparse vector indptr
     n : int
         Number of items to subsample from `arr`
-    with_replacement : bool
-        Whether to permute or use multinomial sampling
     rng : Generator instance
         A random generator. This will likely be an instance returned 
         by np.random.default_rng
 
-    Returns
-    -------
-    ndarray
-        Subsampled data
-
     Notes
     -----
     This code was adapted from scikit-bio (`skbio.math._subsample`)
 
     """
     cdef:
-        cnp.int64_t counts_sum
-        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
-        cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
-        cnp.ndarray[cnp.float64_t, ndim=1] result
-        cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
-        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
-        cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
-        cnp.float64_t cnt
-        Py_ssize_t i, j, length
+        cnp.float64_t counts_sum
+        cnp.int32_t start,end,length
+        Py_ssize_t i
+        cnp.ndarray[cnp.float64_t, ndim=1] pvals
+        cnp.ndarray[cnp.float64_t, ndim=1] data_ceil 
+        
+    data_ceil = np.ceil(data)
+    for i in range(indptr.shape[0] - 1):
+        start, end = indptr[i], indptr[i+1]
+        length = end - start
+
+        # base p-values on integer data to avoid small numerical issues with 
+        # float on sum
+        counts_sum = data_ceil[start:end].sum()
+        pvals = data_ceil[start:end] / counts_sum
+
+        data[start:end] = rng.multinomial(n, pvals)
+
+
+cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
+                                    cnp.ndarray[cnp.int32_t, ndim=1] indptr,
+                                    cnp.int64_t n,
+                                    object rng):
+    """Subsample non-zero values of a sparse array w/out replacement
+
+    Note: this method operates in place
+
+    Parameters
+    ----------
+    data : {csr_matrix, csc_matrix}.data
+        A 1xM sparse vector data
+    indptr : {csr_matrix, csc_matrix}.indptr
+        A 1xM sparse vector indptr
+    n : int
+        Number of items to subsample from `arr`
+    rng : Generator instance
+        A random generator. This will likely be an instance returned 
+        by np.random.default_rng
+    """
+    cdef:
+        cnp.int64_t counts_sum, count_el, perm_count_el
+        cnp.int64_t count_rem
+        cnp.ndarray[cnp.int64_t, ndim=1] permuted, intdata
+        Py_ssize_t i, idx
+        cnp.int32_t length,el,start,end
+        cnp.int64_t el_cnt
 
     for i in range(indptr.shape[0] - 1):
         start, end = indptr[i], indptr[i+1]
         length = end - start
-        counts_sum = data[start:end].sum()
+        # We are relying on data being integers
+        # If there are rounding errors, fp64 sums can lead to
+        # big errors in sum, so convert to int64 first
+        intdata = data[start:end].astype(np.int64)
+        counts_sum = intdata.sum()
         
-        if with_replacement:
-            pvals = data[start:end] / counts_sum
-            data[start:end] = rng.multinomial(n, pvals)
-        else:
-            if counts_sum < n:
-                data[start:end] = 0
-                continue
-
-            r = np.arange(length, dtype=np.int32)
-            unpacked = np.repeat(r, data_i[start:end])
-            permuted = rng.permutation(unpacked)[:n]
-
-            result = np.zeros(length, dtype=np.float64)
-            for idx in range(permuted.shape[0]):
-                result[permuted[idx]] += 1
-
-            data[start:end] = result
+        if counts_sum < n:
+            data[start:end] = 0
+            continue
+
+        permuted = rng.choice(counts_sum, n, replace=False, shuffle=False)
+        permuted.sort()
+
+        # now need to do reverse mapping
+        # since I am not using np.repeat anymore
+        # reminder, old logic was
+        #   r = np.arange(length)
+        #   unpacked = np.repeat(r, data_i[start:end])
+        #   permuted_unpacked = rng.choice(unpacked, n, replace=False, shuffle=False)
+        # 
+        # specifically, what we're going to do here is randomly pick which elements within
+        # each sample to keep. this is analogous to issuing the prior np.repeat call, and obtaining
+        # a random set of index positions for that resulting array. however, we do not need to 
+        # perform the np.repeat call as we know the length of that resulting vector already,
+        # and additionally, we can compute the sample associated with an index in that array
+        # without constructing it.
+
+        el = 0         # index in result/data
+        count_el = 0  # index in permuted
+        count_rem = intdata[0]  # since each data has multiple els, keep track of how many are left
+        el_cnt = 0
+        for idx in range(n):
+            perm_count_el = permuted[idx]
+            # The array is sorted, so just jump ahead if needed
+            # Move until we get within the elements range
+            while (perm_count_el - count_el) >= count_rem:
+               #save the computed value
+               data[start+el] = el_cnt
+               # move to next element
+               el += 1
+               # move to the beginning of next element
+               count_el += count_rem
+               # Load how much we have available
+               count_rem = intdata[el]
+               #re-start the el counter
+               el_cnt = 0
+            # increment the el counter
+            el_cnt += 1
+            # update the counters
+            # reduce what is left
+            count_rem -= (perm_count_el - count_el)
+            #move the pointer to where we stopped
+            count_el = perm_count_el
+        # save the last value
+        data[start+el] = el_cnt
+        # clean up tail elements
+        data[start+el+1:end] = 0
+
+
+def subsample(arr, n, with_replacement, rng):
+    """Subsample non-zero values of a sparse array
+
+    Note: this method operates in place
+
+    Parameters
+    ----------
+    arr : {csr_matrix, csc_matrix}
+        A 1xM sparse vector
+    n : int
+        Number of items to subsample from `arr`
+    with_replacement : bool
+        Whether to permute or use multinomial sampling
+    rng : Generator instance
+        A random generator. This will likely be an instance returned 
+        by np.random.default_rng
+
+    Notes
+    -----
+    This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+    """
+    if (with_replacement):
+       _subsample_with_replacement(arr.data, arr.indptr, n, rng)
+    else:
+       _subsample_without_replacement(arr.data, arr.indptr, n, rng)
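
The comment block in `_subsample_without_replacement` above describes choosing index positions directly rather than materializing the old `np.repeat` expansion. A rough NumPy rendering of the same idea (the Cython loop walks the sorted draws with a running remainder instead of calling `searchsorted`, but the mapping is equivalent):

    import numpy as np

    rng = np.random.default_rng(0)
    counts = np.array([3, 1, 2], dtype=np.int64)  # one vector of the table
    n = 4                                         # subsampling depth

    total = counts.sum()
    # draw n distinct "individuals" out of the total count
    positions = rng.choice(total, n, replace=False)
    # map each drawn position back to the element it falls in
    edges = np.cumsum(counts)
    elements = np.searchsorted(edges, positions, side='right')
    # tally the draws per element; sums to n, elementwise <= counts
    result = np.bincount(elements, minlength=counts.size)

For the with-replacement path, the `np.ceil` step above exists so that fractional, low-abundance entries still receive a non-zero probability. A toy version of that p-value computation:

    data = np.array([0.4, 2.0, 7.6])
    pvals = np.ceil(data) / np.ceil(data).sum()
    draws = rng.multinomial(4, pvals)  # resample to depth 4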


=====================================
biom/_transform.pyx
=====================================
@@ -9,6 +9,7 @@
 
 import numpy as np
 cimport numpy as cnp
+cnp.import_array()
 
 
 def _transform(arr, ids, metadata, function, axis):


=====================================
biom/table.py
=====================================
@@ -178,7 +178,7 @@ from copy import deepcopy
 from datetime import datetime
 from json import dumps as _json_dumps, JSONEncoder
 from functools import reduce, partial
-from operator import itemgetter, or_
+from operator import itemgetter
 from collections import defaultdict
 from collections.abc import Hashable, Iterable
 from numpy import ndarray, asarray, zeros, newaxis
@@ -195,7 +195,7 @@ from biom.util import (get_biom_format_version_string,
 from biom.err import errcheck
 from ._filter import _filter
 from ._transform import _transform
-from ._subsample import _subsample
+from ._subsample import subsample
 
 
 __author__ = "Daniel McDonald"
@@ -1073,7 +1073,6 @@ class Table:
             raise TableException("No common ids between table and tree.")
         _tree = tree.shear(names=common_tips)
         _table = self.filter(common_tips, axis=axis, inplace=False)
-        _table.remove_empty()
         _tree.prune()
         order = [n.name for n in _tree.tips()]
         _table = _table.sort_order(order, axis=axis)
@@ -1423,7 +1422,12 @@ class Table:
         >>> print(updated_table.ids(axis='sample'))
         ['s1.1' 's2.2' 's3.3']
         """
-        str_dtype = 'U%d' % max([len(v) for v in id_map.values()])
+        max_str_len = max([len(v) for v in id_map.values()])
+        if not strict:
+            ids = self.ids(axis=axis)
+            max_str_len = max(max_str_len, max([len(i) for i in ids]))
+
+        str_dtype = 'U%d' % max_str_len
         updated_ids = zeros(self.ids(axis=axis).size, dtype=str_dtype)
         for idx, old_id in enumerate(self.ids(axis=axis)):
             if strict and old_id not in id_map:
@@ -2395,16 +2399,24 @@ class Table:
 
         return table
 
-    def partition(self, f, axis='sample'):
+    def partition(self, f, axis='sample', remove_empty=False,
+                  ignore_none=False):
         """Yields partitions
 
         Parameters
         ----------
-        f : function
+        f : function, dict
             `f` is given the ID and metadata of the vector and must return
-            what partition the vector is part of.
+            what partition the vector is part of. If `dict`, a mapping of
+            either ID -> group, or group -> [list, of, ID] must be provided.
         axis : {'sample', 'observation'}, optional
             The axis to iterate over
+        remove_empty : bool, optional
+            If `True`, remove empty vectors from a partition. Default is
+            `False`.
+        ignore_none : bool, optional
+            If `True`, ignore partitions with the label `None`. Default is
+            `False`.
 
         Returns
         -------
@@ -2445,11 +2457,39 @@ class Table:
         O1  1.0
         O2  42.0
         """
+        # we are not checking for whether the IDs are or are not present as
+        # that introduces complexity of `strict`. Deferring that for now.
+        if isinstance(f, dict):
+            test = list(f.values())[0]
+
+            if isinstance(test, (list, tuple)):
+                # group -> [list, of, ids]
+                mapping = {}
+                for grp, ids in f.items():
+                    for id_ in ids:
+                        mapping[id_] = grp
+
+            elif isinstance(test, str):
+                # id_ -> grp
+                mapping = f
+
+            else:
+                raise ValueError(f"Unable to handle a type of `{type(test)}` "
+                                 "with mapping")
+
+            def part_f(i, m):
+                return mapping.get(i)
+        else:
+            part_f = f
+
         partitions = {}
         # conversion of vector types is not necessary, vectors are not
         # being passed to an arbitrary function
         for vals, id_, md in self.iter(dense=False, axis=axis):
-            part = f(id_, md)
+            part = part_f(id_, md)
+
+            if ignore_none and part is None:
+                continue
 
             # try to make it hashable...
             if not isinstance(part, Hashable):
@@ -2481,9 +2521,14 @@ class Table:
                 samp_md = md[:] if md is not None else None
                 indices = {'sample_index': self._sample_index.copy()}
 
-            yield part, Table(data, obs_ids, samp_ids, obs_md, samp_md,
-                              self.table_id, type=self.type, validate=False,
-                              **indices)
+            tab = Table(data, obs_ids, samp_ids, obs_md, samp_md,
+                        self.table_id, type=self.type, validate=False,
+                        **indices)
+
+            if remove_empty:
+                tab.remove_empty(inplace=True)
+
+            yield part, tab
 
     def collapse(self, f, collapse_f=None, norm=True, min_group_size=1,
                  include_collapsed_metadata=True, one_to_many=False,
@@ -2915,7 +2960,8 @@ class Table:
         with_replacement : boolean, optional
             If `False` (default), subsample without replacement. If `True`,
             resample with replacement via the multinomial distribution.
-            Should not be `True` if `by_id` is `True`.
+            Should not be `True` if `by_id` is `True`. Important: If `True`,
+            samples with a sum below `n` are retained.
         seed : int, optional
             If provided, set the numpy random seed with this value
 
@@ -2932,14 +2978,16 @@ class Table:
 
         Notes
         -----
-        Subsampling is performed without replacement. If `n` is greater than
-        the sum of a given vector, that vector is omitted from the result.
-
-        Adapted from `skbio.math.subsample`, see biom-format/licenses for more
-        information about scikit-bio.
+        If subsampling is performed without replacement, vectors with a sum
+        less than `n` are omitted from the result. This condition does not
+        hold when operating with replacement.
 
         This code assumes absolute abundance if `by_id` is False.
 
+        If subsampling with replacement, `np.ceil` is applied prior to
+        calculating p-values to ensure that low-abundance features have a
+        chance to be sampled.
+
         Examples
         --------
         >>> import numpy as np
@@ -2987,7 +3035,7 @@ class Table:
             table.filter(lambda v, i, md: i in subset, axis=axis)
         else:
             data = table._get_sparse_data()
-            _subsample(data, n, with_replacement, rng)
+            subsample(data, n, with_replacement, rng)
             table._data = data
 
             table.filter(lambda v, i, md: v.sum() > 0, axis=axis)
@@ -3638,54 +3686,57 @@ class Table:
         tables = [self] + others
 
         # gather all identifiers across tables
-        all_features = reduce(or_, [set(t.ids(axis='observation'))
-                                    for t in tables])
-        all_samples = reduce(or_, [set(t.ids()) for t in tables])
+        all_features = set(np.hstack([t.ids(axis='observation')
+                                      for t in tables]))
+        all_samples = set(np.hstack([t.ids() for t in tables]))
+
+        # produce a new stable order
+        feature_order = sorted(all_features)
+        sample_order = sorted(all_samples)
 
         # generate unique integer ids for the identifiers, and let's order
         # it to be polite
-        feature_map = {i: idx for idx, i in enumerate(sorted(all_features))}
-        sample_map = {i: idx for idx, i in enumerate(sorted(all_samples))}
+        feature_map = {i: idx for idx, i in enumerate(feature_order)}
+        sample_map = {i: idx for idx, i in enumerate(sample_order)}
 
-        # produce a new stable order
-        get1 = lambda x: x[1]  # noqa
-        feature_order = [k for k, v in sorted(feature_map.items(), key=get1)]
-        sample_order = [k for k, v in sorted(sample_map.items(), key=get1)]
+        ntuples = sum([t.nnz for t in tables])
 
-        mi = []
-        values = []
+        # we're going to aggregate in COO. per scipy, it is efficient for
+        # construction of large matrices. importantly, it allows for
+        # duplicates which in this case correspond to multiple values for
+        # the same sample/feature across tables. the duplicates are summed
+        # implicitly on conversion to csr/csc.
+        rows = np.empty(ntuples, dtype=np.int32)
+        cols = np.empty(ntuples, dtype=np.int32)
+        data = np.empty(ntuples, dtype=self.matrix_data.dtype)
+
+        offset = 0
         for table in tables:
-            # these data are effectively [((row_index, col_index), value), ]
-            data_as_dok = table.matrix_data.todok()
-
-            # construct a map of the feature integer index to what it is in
-            # the full table
-            feat_ids = table.ids(axis='observation')
-            samp_ids = table.ids()
-            table_features = {idx: feature_map[i]
-                              for idx, i in enumerate(feat_ids)}
-            table_samples = {idx: sample_map[i]
-                             for idx, i in enumerate(samp_ids)}
-
-            for (f, s), v in data_as_dok.items():
-                # collect the indices and values, adjusting the indices as we
-                # go
-                mi.append((table_features[f], table_samples[s]))
-                values.append(v)
-
-        # construct a multiindex of the indices where the outer index is the
-        # feature and the inner index is the sample
-        mi = pd.MultiIndex.from_tuples(mi)
-        grouped = pd.Series(values, index=mi)
-
-        # aggregate the values where the outer and inner values in the
-        # multiindex are the same
-        collapsed_rcv = grouped.groupby(level=[0, 1]).sum()
-
-        # convert into a representation understood by the Table constructor
-        list_list = [[r, c, v] for (r, c), v in collapsed_rcv.items()]
-
-        return self.__class__(list_list, feature_order, sample_order)
+            t_nnz = table.nnz
+
+            coo = table.matrix_data.tocoo()
+
+            # we need to map the index positions in the current table to the
+            # index positions in the full matrix
+            row_map = np.array([feature_map[i]
+                                for i in table.ids(axis='observation')],
+                               dtype=np.int32)
+            col_map = np.array([sample_map[i]
+                                for i in table.ids()],
+                               dtype=np.int32)
+            coo.row = row_map[coo.row]
+            coo.col = col_map[coo.col]
+
+            # store our coo data
+            rows[offset:offset + t_nnz] = coo.row
+            cols[offset:offset + t_nnz] = coo.col
+            data[offset:offset + t_nnz] = coo.data
+            offset += t_nnz
+
+        coo = coo_matrix((data, (rows, cols)),
+                         shape=(len(feature_order), len(sample_order)))
+
+        return self.__class__(coo.tocsr(), feature_order, sample_order)
 
     def merge(self, other, sample='union', observation='union',
               sample_metadata_f=prefer_self,
@@ -4139,6 +4190,7 @@ html
 
             parser = defaultdict(lambda: general_parser)
             parser['taxonomy'] = vlen_list_of_str_parser
+            parser['Taxonomy'] = vlen_list_of_str_parser
             parser['KEGG_Pathways'] = vlen_list_of_str_parser
             parser['collapsed_ids'] = vlen_list_of_str_parser
             parser.update(parse_fs)
@@ -4573,6 +4625,7 @@ html
 
         formatter = defaultdict(lambda: general_formatter)
         formatter['taxonomy'] = vlen_list_of_str_formatter
+        formatter['Taxonomy'] = vlen_list_of_str_formatter
         formatter['KEGG_Pathways'] = vlen_list_of_str_formatter
         formatter['collapsed_ids'] = vlen_list_of_str_formatter
         formatter.update(format_fs)
@@ -4861,7 +4914,7 @@ html
             for col_index, val in enumerate(obs[0]):
                 if float(val) != 0.0:
                     built_row.append(
-                        "[%d,%d,%r]" % (obs_index, col_index, val)
+                        "[%d,%d,%f]" % (obs_index, col_index, val)
                     )
             if built_row:
                 # if we have written a row already, its safe to add a comma
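
The rewritten `Table._fast_merge` above leans on a property of SciPy's COO format: duplicate (row, column) entries are summed implicitly when the matrix is converted to CSR/CSC. A small stand-alone illustration with toy values:

    import numpy as np
    from scipy.sparse import coo_matrix

    # the (0, 0) cell appears twice, e.g. once per merged table
    rows = np.array([0, 0, 1], dtype=np.int32)
    cols = np.array([0, 0, 2], dtype=np.int32)
    data = np.array([1.0, 2.0, 5.0])

    merged = coo_matrix((data, (rows, cols)), shape=(2, 3)).tocsr()
    print(merged[0, 0])  # 3.0 -- the duplicates were summed on conversion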


=====================================
biom/tests/test_cli/test_add_metadata.py
=====================================
@@ -9,6 +9,7 @@
 # -----------------------------------------------------------------------------
 
 import tempfile
+import os
 from unittest import TestCase, main
 
 import biom
@@ -20,13 +21,17 @@ class TestAddMetadata(TestCase):
     def setUp(self):
         """Set up data for use in unit tests."""
         self.cmd = _add_metadata
-        with tempfile.NamedTemporaryFile('w') as fh:
+        with tempfile.NamedTemporaryFile('w', delete=False) as fh:
             fh.write(biom1)
             fh.flush()
             self.biom_table1 = biom.load_table(fh.name)
+            self.temporary_fh_name = fh.name
         self.sample_md_lines1 = sample_md1.split('\n')
         self.obs_md_lines1 = obs_md1.split('\n')
 
+    def tearDown(self):
+        os.unlink(self.temporary_fh_name)
+
     def test_add_sample_metadata_no_casting(self):
         """Correctly adds sample metadata without casting it."""
         # Add a subset of sample metadata to a table that doesn't have any
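
The `delete=False` plus explicit `os.unlink` pattern introduced across these tests is what makes the temporary files usable on Windows, where a still-open `NamedTemporaryFile` generally cannot be reopened by name. A minimal sketch of the pattern:

    import os
    import tempfile

    # write, close, then reopen by name; portable across POSIX and Windows
    with tempfile.NamedTemporaryFile('w', delete=False) as fh:
        fh.write("some table data")
        path = fh.name

    with open(path) as reopened:
        print(reopened.read())

    os.unlink(path)  # explicit cleanup, e.g. in tearDown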


=====================================
biom/tests/test_cli/test_subset_table.py
=====================================
@@ -55,9 +55,10 @@ class TestSubsetTable(unittest.TestCase):
     def test_subset_samples_hdf5(self):
         """Correctly subsets samples in a hdf5 table"""
         cwd = os.getcwd()
-        if '/' in __file__:
-            os.chdir(__file__.rsplit('/', 1)[0])
-        obs = _subset_table(hdf5_biom='test_data/test.biom', axis='sample',
+        if os.path.sep in __file__:
+            os.chdir(os.path.dirname(__file__))
+        obs = _subset_table(hdf5_biom=os.path.join('test_data', 'test.biom'),
+                            axis='sample',
                             ids=['Sample1', 'Sample2', 'Sample3'],
                             json_table_str=None)
         os.chdir(cwd)
@@ -71,9 +72,9 @@ class TestSubsetTable(unittest.TestCase):
     def test_subset_observations_hdf5(self):
         """Correctly subsets samples in a hdf5 table"""
         cwd = os.getcwd()
-        if '/' in __file__:
-            os.chdir(__file__.rsplit('/', 1)[0])
-        obs = _subset_table(hdf5_biom='test_data/test.biom',
+        if os.path.sep in __file__:
+            os.chdir(os.path.dirname(__file__))
+        obs = _subset_table(hdf5_biom=os.path.join('test_data', 'test.biom'),
                             axis='observation',
                             ids=['GG_OTU_1', 'GG_OTU_3', 'GG_OTU_5'],
                             json_table_str=None)
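
The path handling changes here follow the same Windows-compatibility theme: build paths with `os.path` helpers instead of hard-coded '/' separators. Roughly:

    import os

    # locate a data file relative to the test module, portably
    here = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(here, 'test_data', 'test.biom')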


=====================================
biom/tests/test_cli/test_summarize_table.py
=====================================
@@ -12,16 +12,21 @@ from biom.cli.table_summarizer import _summarize_table
 from biom.parse import load_table
 
 import tempfile
+import os
 from unittest import TestCase, main
 
 
 class TestSummarizeTable(TestCase):
 
     def setUp(self):
-        with tempfile.NamedTemporaryFile(mode='w') as fh:
+        with tempfile.NamedTemporaryFile(mode='w', delete=False) as fh:
             fh.write(biom1)
             fh.flush()
             self.biom1 = load_table(fh.name)
+            self.temporary_fh_name = fh.name
+
+    def tearDown(self):
+        os.unlink(self.temporary_fh_name)
 
     def test_default(self):
         """ TableSummarizer functions as expected


=====================================
biom/tests/test_cli/test_table_converter.py
=====================================
@@ -8,6 +8,7 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # -----------------------------------------------------------------------------
 
+import os
 from os.path import abspath, dirname, join
 import tempfile
 
@@ -28,16 +29,18 @@ class TableConverterTests(TestCase):
         self.cmd = _convert
         self.output_filepath = tempfile.NamedTemporaryFile().name
 
-        with tempfile.NamedTemporaryFile('w') as fh:
+        with tempfile.NamedTemporaryFile('w', delete=False) as fh:
             fh.write(biom1)
             fh.flush()
             self.biom_table1 = load_table(fh.name)
+            self.temporary_fh_table_name = fh.name
 
         self.biom_lines1 = biom1.split('\n')
-        with tempfile.NamedTemporaryFile('w') as fh:
+        with tempfile.NamedTemporaryFile('w', delete=False) as fh:
             fh.write(classic1)
             fh.flush()
             self.classic_biom1 = load_table(fh.name)
+            self.temporary_fh_classic_name = fh.name
 
         self.sample_md1 = MetadataMap.from_file(sample_md1.split('\n'))
 
@@ -47,6 +50,10 @@ class TableConverterTests(TestCase):
         self.json_collapsed_samples = join(test_data_dir,
                                            'json_sample_collapsed.biom')
 
+    def tearDown(self):
+        os.unlink(self.temporary_fh_classic_name)
+        os.unlink(self.temporary_fh_table_name)
+
     def test_classic_to_biom(self):
         """Correctly converts classic to biom."""
         self.cmd(table=self.classic_biom1,


=====================================
biom/tests/test_cli/test_table_normalizer.py
=====================================
@@ -24,9 +24,9 @@ class TableNormalizerTests(TestCase):
         self.cmd = _normalize_table
 
         cwd = os.getcwd()
-        if '/' in __file__:
-            os.chdir(__file__.rsplit('/', 1)[0])
-        self.table = biom.load_table('test_data/test.json')
+        if os.path.sep in __file__:
+            os.chdir(os.path.dirname(__file__))
+        self.table = biom.load_table(os.path.join('test_data', 'test.json'))
         os.chdir(cwd)
 
     def test_bad_inputs(self):


=====================================
biom/tests/test_cli/test_validate_table.py
=====================================
@@ -39,7 +39,8 @@ class TableValidatorTests(TestCase):
         self.to_remove = []
 
         cur_path = os.path.split(os.path.abspath(__file__))[0]
-        examples_path = os.path.join(cur_path.rsplit('/', 3)[0], 'examples')
+        examples_path = os.path.join(cur_path.rsplit(os.path.sep, 3)[0],
+                                     'examples')
         self.hdf5_file_valid = os.path.join(examples_path,
                                             'min_sparse_otu_table_hdf5.biom')
         self.hdf5_file_valid_md = os.path.join(examples_path,


=====================================
biom/tests/test_data/edgecase_issue_952.biom
=====================================
Binary files /dev/null and b/biom/tests/test_data/edgecase_issue_952.biom differ


=====================================
biom/tests/test_parse.py
=====================================
@@ -46,7 +46,7 @@ class ParseTests(TestCase):
         self.legacy_otu_table1 = legacy_otu_table1
         self.otu_table1 = otu_table1
         self.otu_table1_floats = otu_table1_floats
-        self.files_to_remove = []
+        self.to_remove = []
         self.biom_minimal_sparse = biom_minimal_sparse
 
         self.classic_otu_table1_w_tax = classic_otu_table1_w_tax.split('\n')
@@ -54,6 +54,11 @@ class ParseTests(TestCase):
         self.classic_table_with_complex_metadata = \
             classic_table_with_complex_metadata.split('\n')
 
+    def tearDown(self):
+        if self.to_remove:
+            for f in self.to_remove:
+                os.remove(f)
+
     def test_from_tsv_bug_854(self):
         data = StringIO('#FeatureID\tSample1')
         exp = Table([], [], ['Sample1'])
@@ -281,38 +286,40 @@ class ParseTests(TestCase):
     def test_parse_biom_table_hdf5(self):
         """Make sure we can parse a HDF5 table through the same loader"""
         cwd = os.getcwd()
-        if '/' in __file__[1:]:
-            os.chdir(__file__.rsplit('/', 1)[0])
-        Table.from_hdf5(h5py.File('test_data/test.biom', 'r'))
+        if os.path.sep in __file__[1:]:
+            os.chdir(os.path.dirname(__file__))
+        Table.from_hdf5(h5py.File(os.path.join('test_data', 'test.biom'),
+                                  'r'))
         os.chdir(cwd)
 
     def test_save_table_filepath(self):
         t = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['a', 'b'],
                   ['c', 'd', 'e'])
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             save_table(t, tmpfile.name)
             obs = load_table(tmpfile.name)
             self.assertEqual(obs, t)
+        self.to_remove.append(tmpfile.name)
 
     def test_load_table_filepath(self):
         cwd = os.getcwd()
-        if '/' in __file__[1:]:
-            os.chdir(__file__.rsplit('/', 1)[0])
-        load_table('test_data/test.biom')
+        if os.path.sep in __file__[1:]:
+            os.chdir(os.path.dirname(__file__))
+        load_table(os.path.join('test_data', 'test.biom'))
         os.chdir(cwd)
 
     def test_load_table_inmemory(self):
         cwd = os.getcwd()
-        if '/' in __file__[1:]:
-            os.chdir(__file__.rsplit('/', 1)[0])
-        load_table(h5py.File('test_data/test.biom', 'r'))
+        if os.path.sep in __file__[1:]:
+            os.chdir(os.path.dirname(__file__))
+        load_table(h5py.File(os.path.join('test_data', 'test.biom'), 'r'))
         os.chdir(cwd)
 
     def test_load_table_inmemory_json(self):
         cwd = os.getcwd()
-        if '/' in __file__[1:]:
-            os.chdir(__file__.rsplit('/', 1)[0])
-        load_table(open('test_data/test.json'))
+        if os.path.sep in __file__[1:]:
+            os.chdir(os.path.dirname(__file__))
+        load_table(open(os.path.join('test_data', 'test.json')))
         os.chdir(cwd)
 
     def test_load_table_inmemory_stringio(self):
@@ -350,10 +357,11 @@ class ParseTests(TestCase):
         """tests for parse_biom_table when we have h5py"""
         # We will round-trip the HDF5 file to several different formats, and
         # make sure we can recover the same table using parse_biom_table
-        if '/' in __file__[1:]:
-            os.chdir(__file__.rsplit('/', 1)[0])
+        if os.path.sep in __file__[1:]:
+            os.chdir(os.path.dirname(__file__))
 
-        t = parse_biom_table(h5py.File('test_data/test.biom', 'r'))
+        t = parse_biom_table(h5py.File(os.path.join('test_data', 'test.biom'),
+                                       'r'))
 
         # These things are not round-trippable using the general-purpose
         # parse_biom_table function


=====================================
biom/tests/test_table.py
=====================================
@@ -1016,13 +1016,15 @@ class TableTests(TestCase):
                   ['c', 'd', 'e'])
         t.add_metadata({'a': {'a / problem': 10}, 'b': {'a / problem': 20}},
                        axis='observation')
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             t.to_hdf5(h5, 'tests')
             h5.close()
 
             h5 = h5py.File(tmpfile.name, 'r')
             obs = Table.from_hdf5(h5)
+            h5.close()
+        self.to_remove.append(tmpfile.name)
 
         self.assertEqual(obs, t)
 
@@ -1030,7 +1032,7 @@ class TableTests(TestCase):
         t = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['a', 'b'],
                   ['c', 'd', 'e'])
         current = datetime.now()
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             t.to_hdf5(h5, 'tests', creation_date=current)
             h5.close()
@@ -1038,6 +1040,8 @@ class TableTests(TestCase):
             h5 = h5py.File(tmpfile.name, 'r')
             obs = Table.from_hdf5(h5)
             self.assertEqual(obs.create_date, current)
+            h5.close()
+        self.to_remove.append(tmpfile.name)
 
         self.assertEqual(obs, t)
 
@@ -1045,24 +1049,27 @@ class TableTests(TestCase):
         """Successfully writes an empty OTU table in HDF5 format"""
         # Create an empty OTU table
         t = Table([], [], [])
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             t.to_hdf5(h5, 'tests')
             h5.close()
+        self.to_remove.append(tmpfile.name)
 
     def test_to_hdf5_empty_table_bug_619(self):
         """Successfully writes an empty OTU table in HDF5 format"""
         t = example_table.filter({}, axis='observation', inplace=False)
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             t.to_hdf5(h5, 'tests')
             h5.close()
+        self.to_remove.append(tmpfile.name)
 
         t = example_table.filter({}, inplace=False)
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             t.to_hdf5(h5, 'tests')
             h5.close()
+        self.to_remove.append(tmpfile.name)
 
     def test_to_hdf5_missing_metadata_observation(self):
         # exercises a vlen_list
@@ -1070,10 +1077,11 @@ class TableTests(TestCase):
                   [{'taxonomy': None},
                    {'taxonomy': ['foo', 'baz']}])
 
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             with h5py.File(tmpfile.name, 'w') as h5:
                 t.to_hdf5(h5, 'tests')
             obs = load_table(tmpfile.name)
+        self.to_remove.append(tmpfile.name)
         self.assertEqual(obs.metadata(axis='observation'),
                          ({'taxonomy': None},
                           {'taxonomy': ['foo', 'baz']}))
@@ -1084,10 +1092,11 @@ class TableTests(TestCase):
                   [{'dat': None},
                    {'dat': 'foo'}])
 
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             with h5py.File(tmpfile.name, 'w') as h5:
                 t.to_hdf5(h5, 'tests')
             obs = load_table(tmpfile.name)
+        self.to_remove.append(tmpfile.name)
         self.assertEqual(obs.metadata(axis='sample'),
                          ({'dat': ''},
                           {'dat': 'foo'}))
@@ -1097,11 +1106,12 @@ class TableTests(TestCase):
                   [{'taxonomy_A': 'foo; bar'},
                    {'taxonomy_B': 'foo; baz'}])
 
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             with h5py.File(tmpfile.name, 'w') as h5:
                 with self.assertRaisesRegex(ValueError,
                                             'inconsistent metadata'):
                     t.to_hdf5(h5, 'tests')
+        self.to_remove.append(tmpfile.name)
 
     def test_to_hdf5_inconsistent_metadata_categories_sample(self):
         t = Table(np.array([[0, 1], [2, 3]]), ['a', 'b'], ['c', 'd'],
@@ -1109,21 +1119,23 @@ class TableTests(TestCase):
                   [{'dat_A': 'foo; bar'},
                    {'dat_B': 'foo; baz'}])
 
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             with h5py.File(tmpfile.name, 'w') as h5:
                 with self.assertRaisesRegex(ValueError,
                                             'inconsistent metadata'):
                     t.to_hdf5(h5, 'tests')
+        self.to_remove.append(tmpfile.name)
 
     def test_to_hdf5_malformed_taxonomy(self):
         t = Table(np.array([[0, 1], [2, 3]]), ['a', 'b'], ['c', 'd'],
                   [{'taxonomy': 'foo; bar'},
                    {'taxonomy': 'foo; baz'}])
 
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             with h5py.File(tmpfile.name, 'w') as h5:
                 t.to_hdf5(h5, 'tests')
             obs = load_table(tmpfile.name)
+        self.to_remove.append(tmpfile.name)
         self.assertEqual(obs.metadata(axis='observation'),
                          ({'taxonomy': ['foo', 'bar']},
                           {'taxonomy': ['foo', 'baz']}))
@@ -1134,9 +1146,11 @@ class TableTests(TestCase):
                         [{'foo': ['k__a', 'p__b']},
                          {'foo': ['k__a', 'p__c']}],
                         [{'barcode': 'aatt'}, {'barcode': 'ttgg'}])
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             st_rich.to_hdf5(h5, 'tests')
+            h5.close()
+        self.to_remove.append(tmpfile.name)
 
     def test_to_hdf5_custom_formatters(self):
         self.st_rich = Table(self.vals,
@@ -1151,7 +1165,7 @@ class TableTests(TestCase):
             grp.create_dataset(name, shape=data.shape, dtype=H5PY_VLEN_STR,
                                data=data, compression=compression)
 
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             self.st_rich.to_hdf5(h5, 'tests',
                                  format_fs={'barcode': bc_formatter})
@@ -1172,10 +1186,11 @@ class TableTests(TestCase):
                 self.assertNotEqual(m1['barcode'], m2['barcode'])
                 self.assertEqual(m1['barcode'].lower(), m2['barcode'])
             h5.close()
+        self.to_remove.append(tmpfile.name)
 
     def test_to_hdf5(self):
         """Write a file"""
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             self.st_rich.to_hdf5(h5, 'tests')
             h5.close()
@@ -1193,9 +1208,10 @@ class TableTests(TestCase):
             obs = Table.from_hdf5(h5)
             self.assertEqual(obs, self.st_rich)
             h5.close()
+        self.to_remove.append(tmpfile.name)
 
         # Test with a collapsed table
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             dt_rich = Table(
                 np.array([[5, 6, 7], [8, 9, 10], [11, 12, 13]]),
@@ -1238,9 +1254,10 @@ class TableTests(TestCase):
                 [{'collapsed_ids': ['a', 'c']},
                  {'collapsed_ids': ['b']}])
             self.assertEqual(obs, exp)
+        self.to_remove.append(tmpfile.name)
 
         # Test with table having a None on taxonomy
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             h5 = h5py.File(tmpfile.name, 'w')
             t = Table(self.vals, ['1', '2'], ['a', 'b'],
                       [{'taxonomy': ['k__a', 'p__b']},
@@ -1262,6 +1279,7 @@ class TableTests(TestCase):
             obs = Table.from_hdf5(h5)
             h5.close()
             self.assertEqual(obs, t)
+        self.to_remove.append(tmpfile.name)
 
     def test_from_tsv(self):
         tab1_fh = StringIO(otu_table1)
@@ -1593,7 +1611,7 @@ class TableTests(TestCase):
         df = example_table.to_dataframe()
         density = (float(example_table.matrix_data.getnnz()) /
                    np.prod(example_table.shape))
-        df_density = (df > 0).sum().sum() / np.prod(df.shape)
+        df_density = (df.values > 0).sum().sum() / np.prod(df.shape)
         assert np.allclose(df_density, density)
 
     def test_to_dataframe_dense(self):
@@ -1995,6 +2013,21 @@ class TableTests(TestCase):
         with self.assertRaises(TableException):
             table.align_to_dataframe(metadata)
 
+    @pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
+    def test_align_tree_issue_948(self):
+        table = Table(np.array([[0, 0, 0, 0],
+                                [2, 3, 4, 4],
+                                [5, 5, 3, 3],
+                                [0, 0, 0, 1]]).T,
+                      ['a', 'b', 'c', 'd'],
+                      ['s1', 's2', 's3', 's4'])
+        tree = skbio.TreeNode.read(["(a,b,c,d)r;"])
+        exp_tree = tree
+        exp_table = table.copy()
+        res_table, res_tree = table.align_tree(tree)
+        self.assertEqual(res_table, exp_table)
+        self.assertEqual(str(exp_tree), str(res_tree))
+
     @pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
     def test_align_tree_intersect_tips(self):
         # there are less tree tips than observations
@@ -2379,6 +2412,16 @@ class SparseTableTests(TestCase):
                          self.st_rich.data('2', 'observation'))
         self.assertEqual(obs.transpose(), self.st_rich)
 
+    def test_update_ids_strict_dtype_bug_issue_957(self):
+        t = Table(np.arange(6).reshape(2, 3),
+                  ['O1', 'O2'],
+                  ['ab', 'cdef', 'ghijkl'])
+        exp = Table(np.arange(6).reshape(2, 3),
+                    ['O1', 'O2'],
+                    ['AB', 'cdef', 'ghijkl'])
+        obs = t.update_ids({'ab': 'AB'}, strict=False, inplace=False)
+        self.assertEqual(obs, exp)
+
     def test_update_ids_inplace_bug_892(self):
         t = example_table.copy()
         exp = t.ids().copy()
@@ -3170,6 +3213,23 @@ class SparseTableTests(TestCase):
         with errstate(empty='raise'), self.assertRaises(TableException):
             self.st_rich.filter(f, 'observation')
 
+    def test_subsample_edgecase_issue_952(self):
+        # this file triggers an exception on Linux on subsample
+        # with replacement where the pvals computed sum to > 1. It is a
+        # subset of the data reported in issue 952, specifically constrained
+        # to the first 10 features with any empty samples removed.
+        path = 'test_data/edgecase_issue_952.biom'
+
+        # ...existing logic for test_data, not ideal, but consistent
+        cwd = os.getcwd()
+        if '/' in __file__:
+            os.chdir(__file__.rsplit('/', 1)[0])
+        table = Table.from_hdf5(h5py.File(path, 'r'))
+        os.chdir(cwd)
+
+        obs = table.subsample(10, with_replacement=True)
+        self.assertEqual(set(obs.sum('sample')), {10.0, })
+
     def test_subsample_same_seed_without_replacement(self):
         table = Table(np.array([[3, 1, 2], [0, 3, 4]]), ['O1', 'O2'],
                       ['S1', 'S2', 'S3'])
@@ -4237,6 +4297,79 @@ class SparseTableTests(TestCase):
         with self.assertRaisesRegex(TypeError, msg):
             Table._extract_data_from_tsv(tsv, dtype=int)
 
+    def test_partition_remove_empty(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        part_f = lambda i, m: i == 'S1'  # noqa
+        obs = dict(t.partition(part_f, remove_empty=True))
+        exp = {True: Table(np.array([[3, ], [4, ]]), ['O2', 'O3'], ['S1', ]),
+               False: Table(np.array([[1, 2]]), ['O1', ], ['S2', 'S3'])}
+        self.assertEqual(obs, exp)
+
+    def test_partition_ignore_none_true(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        part_f = lambda i, m: True if i == 'S1' else None  # noqa
+        obs = dict(t.partition(part_f, ignore_none=True))
+        exp = {True: Table(np.array([[0, ], [3, ], [4, ]]),
+                           ['O1', 'O2', 'O3'], ['S1', ])}
+        self.assertEqual(obs, exp)
+
+    def test_partition_ignore_none_false(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        part_f = lambda i, m: True if i == 'S1' else None  # noqa
+        obs = dict(t.partition(part_f, ignore_none=False))
+        exp = {True: Table(np.array([[0, ], [3, ], [4, ]]),
+                           ['O1', 'O2', 'O3'], ['S1', ]),
+               None: Table(np.array([[1, 2], [0, 0], [0, 0]]),
+                           ['O1', 'O2', 'O3'], ['S2', 'S3'])}
+        self.assertEqual(obs, exp)
+
+    def test_partition_dict_ids_to_groups(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        by_dict = {'S1': 'foo',
+                   'S2': 'bar',
+                   'S3': 'foo'}
+        exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S1', 'S3']),
+               'bar': Table(np.array([[1, ], [0, ], [0, ]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S2', ])}
+        obs = dict(t.partition(by_dict))
+        self.assertEqual(obs, exp)
+
+    def test_partition_dict_groups_to_ids(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        by_dict_group = {'foo': ['S1', 'S3'],
+                         'bar': ['S2', ]}
+        exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S1', 'S3']),
+               'bar': Table(np.array([[1, ], [0, ], [0, ]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S2', ])}
+        obs = dict(t.partition(by_dict_group))
+        self.assertEqual(obs, exp)
+
     def test_bin_samples_by_metadata(self):
         """Yield tables binned by sample metadata"""
         def f(id_, md):
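
For context, the new partition tests above exercise dict-based partitioning: partition() accepts either a mapping of ids to group labels or of group labels to lists of ids, in addition to a callable, plus the remove_empty and ignore_none keywords. A minimal usage sketch based on those tests (assuming biom 2.1.16 and numpy are installed):

    import numpy as np
    from biom import Table

    t = Table(np.array([[0, 1, 2],
                        [3, 0, 0],
                        [4, 0, 0]]),
              ['O1', 'O2', 'O3'],
              ['S1', 'S2', 'S3'])

    # samples S1 and S3 go to 'foo', S2 goes to 'bar'
    for group, sub in t.partition({'S1': 'foo', 'S2': 'bar', 'S3': 'foo'}):
        print(group, list(sub.ids()))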


=====================================
biom/tests/test_util.py
=====================================
@@ -42,6 +42,12 @@ class UtilTests(TestCase):
 
     def setUp(self):
         self.biom_otu_table1_w_tax = parse_biom_table(biom_otu_table1_w_tax)
+        self.to_remove = []
+
+    def tearDown(self):
+        if self.to_remove:
+            for f in self.to_remove:
+                os.remove(f)
 
     def test_generate_subsamples(self):
         table = Table(np.array([[3, 1, 1], [0, 3, 3]]), ['O1', 'O2'],
@@ -246,11 +252,14 @@ class UtilTests(TestCase):
         tmp_f = NamedTemporaryFile(
             mode='w',
             prefix='test_safe_md5',
-            suffix='txt')
+            suffix='txt',
+            delete=False)
         tmp_f.write('foo\n')
         tmp_f.flush()
 
         obs = safe_md5(open(tmp_f.name))
+        tmp_f.close()
+        self.to_remove.append(tmp_f.name)
         self.assertEqual(obs, exp)
 
         obs = safe_md5(['foo\n'])
@@ -262,9 +271,10 @@ class UtilTests(TestCase):
     def test_biom_open_hdf5_pathlib_write(self):
         t = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['a', 'b'],
                   ['c', 'd', 'e'])
-        with NamedTemporaryFile() as tmpfile:
+        with NamedTemporaryFile(delete=False) as tmpfile:
             with biom_open(pathlib.Path(tmpfile.name), 'w') as fp:
                 t.to_hdf5(fp, 'tests')
+        self.to_remove.append(tmpfile.name)
 
     def test_biom_open_hdf5_pathlib_read(self):
         cwd = os.getcwd()
@@ -309,11 +319,12 @@ class UtilTests(TestCase):
 
     def test_load_classic(self):
         tab = load_table(get_data_path('test.json'))
-        with NamedTemporaryFile(mode='w') as fp:
+        with NamedTemporaryFile(mode='w', delete=False) as fp:
             fp.write(str(tab))
             fp.flush()
 
             obs = load_table(fp.name)
+        self.to_remove.append(fp.name)
 
         npt.assert_equal(obs.ids(), tab.ids())
         npt.assert_equal(obs.ids(axis='observation'),
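
For context, the temporary-file changes above switch to NamedTemporaryFile(delete=False) with explicit cleanup in tearDown. A common motivation for this pattern (not stated in the commit itself) is that on Windows a named temporary file cannot be reopened by name while it is still open. A small standalone sketch of the pattern:

    import os
    from tempfile import NamedTemporaryFile

    tmp = NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
    try:
        tmp.write('foo\n')
        tmp.flush()
        tmp.close()                  # close before reopening by name
        with open(tmp.name) as fh:   # safe to reopen on all platforms now
            print(fh.read())
    finally:
        os.remove(tmp.name)          # explicit cleanup, mirroring tearDown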


=====================================
biom/util.py
=====================================
@@ -41,7 +41,7 @@ __url__ = "http://biom-format.org"
 __maintainer__ = "Daniel McDonald"
 __email__ = "daniel.mcdonald at colorado.edu"
 __format_version__ = (2, 1)
-__version__ = "2.1.15"
+__version__ = "2.1.16"
 
 
 def generate_subsamples(table, n, axis='sample', by_id=False):
@@ -425,7 +425,6 @@ def biom_open(fp, permission='r'):
     if permission not in ['r', 'w', 'U', 'rb', 'wb']:
         raise OSError("Unknown mode: %s" % permission)
 
-    opener = functools.partial(io.open, encoding='utf-8')
     mode = permission
 
     # don't try to open an HDF5 file if H5PY is not installed, this can only
@@ -434,19 +433,20 @@ def biom_open(fp, permission='r'):
         if os.path.getsize(fp) == 0:
             raise ValueError("The file '%s' is empty and can't be parsed" % fp)
 
-    if mode in ['U', 'r', 'rb'] and h5py.is_hdf5(fp):
-        opener = h5py.File
-        mode = 'r' if permission == 'U' else permission
-    elif mode == 'w':
-        opener = h5py.File
-
     if mode in ['U', 'r', 'rb'] and is_gzip(fp):
-        def opener(fp, mode):
+        def opener(fp, mode):  # noqa
             return codecs.getreader('utf-8')(gzip_open(fp, mode))
         mode = 'rb' if permission in ['U', 'r'] else permission
     elif mode in ['w', 'wb'] and str(fp).endswith('.gz'):
-        def opener(fp, mode):
+        def opener(fp, mode):  # noqa
             codecs.getwriter('utf-8')(gzip_open(fp, mode))
+    elif mode in ['U', 'r', 'rb'] and h5py.is_hdf5(fp):
+        opener = h5py.File
+        mode = 'r' if permission == 'U' else permission
+    elif mode == 'w':
+        opener = h5py.File
+    else:
+        opener = functools.partial(io.open, encoding='utf-8')
 
     f = opener(fp, mode)
     try:
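
For context, the biom_open() change above reorders how the opener is chosen: gzip is tested before HDF5 on reads, and plain-text io.open becomes the explicit fallback. A condensed, illustrative sketch of that dispatch order (not the library code verbatim; the write-side handling of '.gz' paths is omitted for brevity):

    import codecs
    import functools
    import gzip
    import io

    import h5py  # optional dependency in biom; assumed importable here

    def is_gzip(fp):
        # gzip files start with the magic bytes 0x1f 0x8b
        with open(fp, 'rb') as fh:
            return fh.read(2) == b'\x1f\x8b'

    def pick_opener(fp, mode):
        if mode in ('U', 'r', 'rb') and is_gzip(fp):
            return lambda f, m: codecs.getreader('utf-8')(gzip.open(f, 'rb'))
        elif mode in ('U', 'r', 'rb') and h5py.is_hdf5(fp):
            return h5py.File
        elif mode == 'w':
            return h5py.File
        else:
            return functools.partial(io.open, encoding='utf-8')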


=====================================
ci/aarch64.conda_requirements.txt
=====================================
@@ -1,4 +1,3 @@
-natsort >= 4.0.3
 numpy >= 1.9.2
 pandas >= 0.20.0
 scipy >= 1.3.1


=====================================
ci/conda_requirements.txt
=====================================
@@ -1,4 +1,3 @@
-natsort >= 4.0.3
 numpy >= 1.9.2
 pandas >= 0.20.0
 scipy >= 1.3.1


=====================================
doc/conf.py
=====================================
@@ -57,15 +57,15 @@ master_doc = 'index'
 
 # General information about the project.
 project = 'biom-format'
-copyright = '2011-2022 The BIOM Format Development Team'
+copyright = '2011-2024 The BIOM Format Development Team'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The full version, including alpha/beta/rc tags.
-version = "2.1.15"
-release = "2.1.15"
+version = "2.1.16"
+release = "2.1.16"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.


=====================================
doc/index.rst
=====================================
@@ -35,6 +35,8 @@ Projects using the BIOM format
 * `EBI Metagenomics <https://www.ebi.ac.uk/metagenomics>`_
 * `GCModeller <http://gcmodeller.org>`_
 * `MetaPhlAn 2 <http://segatalab.cibio.unitn.it/tools/metaphlan2/>`__
+* `mia (TreeSummarizedExperiment; R/Bioconductor) <http://microbiome.github.io/>`__
+
 
 If you are using BIOM in your project, and would like your project to be listed, please submit a `pull request <https://github.com/biocore/biom-format/pulls>`_ to the BIOM project. More information on `submitting pull requests can be found here <https://help.github.com/articles/using-pull-requests>`_.
 


=====================================
setup.py
=====================================
@@ -32,7 +32,7 @@ __copyright__ = "Copyright 2011-2020, The BIOM Format Development Team"
 __credits__ = ["Greg Caporaso", "Daniel McDonald", "Jose Clemente",
                "Jai Ram Rideout", "Jorge Cañardo Alastuey", "Michael Hall"]
 __license__ = "BSD"
-__version__ = "2.1.15"
+__version__ = "2.1.16"
 __maintainer__ = "Daniel McDonald"
 __email__ = "mcdonadt at colorado.edu"
 
@@ -86,15 +86,20 @@ classes = """
     Topic :: Software Development :: Libraries :: Application Frameworks
     Topic :: Software Development :: Libraries :: Python Modules
     Programming Language :: Python
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3 :: Only
     Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
     Programming Language :: Python :: Implementation :: CPython
     Operating System :: OS Independent
     Operating System :: POSIX :: Linux
     Operating System :: MacOS :: MacOS X
+    Operating System :: Microsoft :: Windows
 """
 classifiers = [s.strip() for s in classes.split('\n') if s]
 



View it on GitLab: https://salsa.debian.org/med-team/python-biom-format/-/commit/842fbdbfebc37de25b5599f49b46ec865c078844
