[med-svn] [Git][med-team/python-biom-format][upstream] New upstream version 2.1.16
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Mon May 27 21:40:47 BST 2024
Étienne Mollier pushed to branch upstream at Debian Med / python-biom-format
Commits:
842fbdbf by Étienne Mollier at 2024-05-27T20:07:00+02:00
New upstream version 2.1.16
- - - - -
24 changed files:
- .github/workflows/python-package-conda.yml
- .github/workflows/release.yml
- ChangeLog.md
- biom/__init__.py
- biom/_filter.pyx
- biom/_subsample.pyx
- biom/_transform.pyx
- biom/table.py
- biom/tests/test_cli/test_add_metadata.py
- biom/tests/test_cli/test_subset_table.py
- biom/tests/test_cli/test_summarize_table.py
- biom/tests/test_cli/test_table_converter.py
- biom/tests/test_cli/test_table_normalizer.py
- biom/tests/test_cli/test_validate_table.py
- + biom/tests/test_data/edgecase_issue_952.biom
- biom/tests/test_parse.py
- biom/tests/test_table.py
- biom/tests/test_util.py
- biom/util.py
- ci/aarch64.conda_requirements.txt
- ci/conda_requirements.txt
- doc/conf.py
- doc/index.rst
- setup.py
Changes:
=====================================
.github/workflows/python-package-conda.yml
=====================================
@@ -9,8 +9,8 @@ on:
branches: [ master ]
env:
- latest_python: "3.11"
- supported_pythons: '["3.7", "3.8", "3.9", "3.10", "3.11"]'
+ latest_python: "3.12"
+ supported_pythons: '["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]'
miniforge_version: "22.9.0-2"
miniforge_variant: "Mambaforge"
@@ -34,7 +34,7 @@ jobs:
needs: conf
runs-on: "ubuntu-latest"
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
@@ -56,7 +56,7 @@ jobs:
needs: ["conf", "lint"]
runs-on: "ubuntu-latest"
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
@@ -81,15 +81,15 @@ jobs:
strategy:
fail-fast: true
matrix:
- os: ["ubuntu-latest", "macos-latest"]
+ os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python_version: ${{ fromJSON(needs.conf.outputs.supported_pythons) }}
use_conda: [true, false]
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
- python-version: ${{ env.latest_python }}
+ python-version: ${{ matrix.python_version }}
miniforge-version: ${{ env.miniforge_version }}
miniforge-variant: ${{ env.miniforge_variant }}
environment-file: ci/conda_host_env.yml
@@ -115,7 +115,7 @@ jobs:
needs: ["conf", "lint", "doc", "test-all"]
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
# setup-buildx-action uses the git context directly
# but checklist wants the .git directory
- name: Set up QEMU
=====================================
.github/workflows/release.yml
=====================================
@@ -1,92 +1,128 @@
name: Release
-on: [push, pull_request]
+on:
+ push:
+ tags:
+ - '*'
+
+env:
+ earliest_python: "3.8"
+ latest_python: "3.12"
+ miniforge_version: "23.11.0-0"
+ miniforge_variant: "Mambaforge"
jobs:
- build_sdist:
- name: Build sdist
+ release:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
-
- - name: Build distribution
- run: |
- export RELEASE_VERSION=${{ github.ref_name }}
- pip install numpy cython
- pipx run build --sdist
-
- - uses: actions/upload-artifact@v3
- with:
- name: dist-artifacts
- path: dist/*.tar.gz
-
- # adapted from
- # https://github.com/biopython/biopython/blob/master/.github/workflows/ci.yml
- build_wheels:
- name: Build wheels (py ${{ matrix.pyver }}) ${{ matrix.os }}
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- os: [ubuntu-latest, macos-latest]
- pyver: ["37", "38", "39", "310", "311"]
-
- steps:
- - uses: actions/checkout at v3
- - name: Set up Python
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
- python-version: 3.9
-
- - name: Install Python packaging tools
+ python-version: 3.8
+ - name: Build distribution
run: |
+ # set version from '${{ github.ref_name }}'
export RELEASE_VERSION=${{ github.ref_name }}
pip install numpy cython
- python -m pip install --upgrade pip setuptools wheel
-
- # https://github.com/pypa/cibuildwheel/blob/main/examples/github-deploy.yml
- - name: Build wheels (py ${{ matrix.pyver }}) Linux
- if: matrix.os == 'ubuntu-latest'
- env:
- CIBW_ARCHS_LINUX: x86_64
- CIBW_SKIP: "*-musllinux*"
- CIBW_BUILD: "cp${{ matrix.pyver }}-*"
-
- uses: pypa/cibuildwheel@v2.12.3
-
- - name: Build wheels (py ${{ matrix.pyver }}) MacOS
- if: matrix.os == 'macos-latest'
- env:
- CIBW_ARCHS_MACOS: "x86_64 arm64 universal2"
- CIBW_BUILD: "cp${{ matrix.pyver }}-*"
-
- uses: pypa/cibuildwheel@v2.12.3
-
-
- - name: Upload wheels
- uses: actions/upload-artifact@v3
- with:
- name: dist-artifacts
- path: ./wheelhouse/*.whl
-
- release:
- needs: [build_wheels, build_sdist]
- runs-on: ubuntu-latest
- # this is not ideal as it doesn't limit to what type of tag
- # but it at least seems to work
- if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
-
- steps:
- - name: Stage artifacts
- uses: actions/download-artifact@v3
- with:
- name: dist-artifacts
- path: dist/
-
- - name: Check artifacts
- run: ls -lrt dist/
+ python setup.py sdist
- name: Publish a Python distribution to PyPI
- uses: pypa/gh-action-pypi-publish@v1.5.0
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+ uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
+
+# wheels are not working
+# cutting them out did not "just" work
+# getting a release out right now is more important than
+# precompiled builds.
+ #jobs:
+ # build_sdist:
+ # name: Build sdist
+ # runs-on: ubuntu-latest
+ # steps:
+ # - uses: actions/checkout@v4
+ #
+ # - name: Build distribution
+ # run: |
+ # pip install numpy cython
+ # pipx run build --sdist
+ #
+ # - uses: actions/upload-artifact@v4
+ # with:
+ # name: cibw-sdist
+ # path: dist/*.tar.gz
+ #
+ # # adapted from
+ # # https://github.com/pypa/cibuildwheel/blob/main/examples/github-deploy.yml
+ # build_wheels:
+ # name: Build wheels (py ${{ matrix.pyver }}) ${{ matrix.os }}
+ # runs-on: ${{ matrix.os }}
+ # strategy:
+ # matrix:
+ # os: [ubuntu-latest, macos-13, macos-14]
+ # pyver: ["37", "38", "39", "310", "311", "312"]
+ #
+ # steps:
+ # - uses: actions/checkout@v4
+ #
+ # - name: Install Python packaging tools
+ # run: |
+ # pip install numpy cython
+ # python -m pip install --upgrade pip setuptools wheel
+ #
+ # - name: Build wheels (py ${{ matrix.pyver }}) Linux
+ # if: matrix.os == 'ubuntu-latest'
+ # env:
+ # CIBW_ARCHS_LINUX: "x86_64 aarch64"
+ # CIBW_SKIP: "*-musllinux*"
+ # CIBW_BUILD: "cp${{ matrix.pyver }}-*"
+ #
+ # uses: pypa/cibuildwheel@v2.17.0
+ #
+ # - name: Build wheels (py ${{ matrix.pyver }}) MacOS
+ # if: matrix.os == 'macos-latest'
+ # env:
+ # CIBW_ARCHS_MACOS: "x86_64 arm64 universal2"
+ # CIBW_BUILD: "cp${{ matrix.pyver }}-*"
+ #
+ # uses: pypa/cibuildwheel@v2.17.0
+ #
+ # - name: Build wheels (py ${{ matrix.pyver }}) Windows
+ # if: matrix.os == 'windows-latest'
+ # env:
+ # CIBW_ARCHS_WINDOWS: "amd64 win32"
+ # CIBW_BUILD: "cp${{ matrix.pyver }}-*"
+ #
+ # uses: pypa/cibuildwheel@v2.17.0
+ #
+ # - uses: actions/upload-artifact@v4
+ # with:
+ # name: cibw-wheels-${{ matrix.os }}-${{ matrix.pyver }}-${{ strategy.job-index }}
+ # path: ./wheelhouse/*.whl
+ #
+ # release:
+ # needs: [build_wheels, build_sdist]
+ # runs-on: ubuntu-latest
+ # environment: pypi
+ # permissions:
+ # id-token: write
+ #
+ # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+ # steps:
+ # - uses: actions/download-artifact@v4
+ # with:
+ # name: cibw-*
+ # path: dist
+ # merge-multiple: true
+ #
+ # - name: Check artifacts
+ # run: ls -lrt dist/
+ #
+ # - name: Publish Distribution
+ # uses: pypa/gh-action-pypi-publish@v1.5.0
+ # with:
+ # user: __token__
+ # password: ${{ secrets.PYPI_API_TOKEN }}
=====================================
ChangeLog.md
=====================================
@@ -1,6 +1,31 @@
BIOM-Format ChangeLog
=====================
+biom 2.1.16
+-----------
+
+Maintenance, bug fix, performance and feature release, May 9th 2024.
+
+New features:
+
* Add Windows support. PR [#951](https://github.com/biocore/biom-format/pull/951) revises the codebase to be Windows compatible and adds this support to the CI testing matrix.
+* Add NumPy 2.0 support. PR [#950](https://github.com/biocore/biom-format/pull/950) ensures code compatibility with NumPy 2.0. This support is yet to be added to the CI testing matrix.
* Expand API for `Table.partition` to allow for passing `dict` mappings from ids to groups and vice versa, removal of empty vectors, and ignoring `None` partitions. See issue [#937](https://github.com/biocore/biom-format/issues/937)
+* NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956)
+* The optimized subsample without replacement method is now exposed as `biom.subsample`. Note that this method operates inplace on SciPy `csr_matrix` and `csc_matrix` objects. See issue [#958](https://github.com/biocore/biom-format/issues/958)
+
+Bug Fixes:
+
* Fixed an edge case in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)
* In `subsample(..., with_replacement=True)`, it was possible to trigger a numerical stability issue on sum, see issue [#952](https://github.com/biocore/biom-format/issues/952)
+* `update_ids(..., strict=False)` could yield truncated IDs, see issue [#957](https://github.com/biocore/biom-format/issues/957)
+
+Performance improvements:
+
* Revise `Table._fast_merge` to use COO directly. For very large tables, this reduces runtime by ~50x and memory by ~5x. See PR [#933](https://github.com/biocore/biom-format/pull/933).
+* Drastically reduce the memory needs of subsampling when sums are large. Also adds 64-bit support. See PR [#935](https://github.com/biocore/biom-format/pull/935).
+* Improve handling of not-perfectly-integer inputs. See PR [#938](https://github.com/biocore/biom-format/pull/938).
+
biom 2.1.15
-----------
@@ -11,7 +36,7 @@ Bug fixes:
* Allow `Table.to_json` to properly handle numpy types in metadata, see issue [#886](https://github.com/biocore/biom-format/issues/886)
* Do not modify IDs in place in the presence of duplicate relabels, see issue [#892](https://github.com/biocore/biom-format/issues/892)
* Catch an edge case where a failured ID update in place would actually change IDs, see issue [#892](https://github.com/biocore/biom-format/issues/892)
-
+
New features:
* `biom.parse.save_table` makes saving less tedious, see issue [#897](https://github.com/biocore/biom-format/issues/897)
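As a quick illustration of the expanded `Table.partition` mapping support listed in the 2.1.16 notes above, a minimal sketch in Python; the table, IDs and counts are invented for illustration only:

import numpy as np
from biom import Table

table = Table(np.array([[0, 1, 2],
                        [3, 0, 0]]), ['O1', 'O2'], ['S1', 'S2', 'S3'])

# group -> [list, of, ids]; an id -> group mapping works as well
groups = {'foo': ['S1', 'S3'], 'bar': ['S2']}
for name, sub in table.partition(groups, remove_empty=True):
    print(name, sub.ids())

Both mapping directions resolve to the same per-sample grouping; `remove_empty=True` drops vectors that become empty within a partition.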
=====================================
biom/__init__.py
=====================================
@@ -51,6 +51,7 @@ either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse accordingly:
from .table import Table
from .parse import parse_biom_table as parse_table, load_table, save_table
from .util import __format_version__, __version__
+from ._subsample import subsample
__author__ = "Daniel McDonald"
__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team"
@@ -58,7 +59,7 @@ __credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso",
"Jose Clemente", "Justin Kuczynski", "Antonio Gonzalez",
"Yoshiki Vazquez Baeza", "Jose Navas", "Adam Robbins-Pianka",
"Rob Knight", "Joshua Shorenstein", "Emily TerAvest",
- "Michael Shaffer"]
+ "Michael Shaffer", "Qiyun Zhu", "Matt Aton"]
__license__ = "BSD"
__url__ = "http://biom-format.org"
__maintainer__ = "Daniel McDonald"
@@ -95,4 +96,5 @@ def concat(tables, *args, **kwargs):
__all__ = ['Table', 'example_table', 'parse_table', 'load_table',
- '__format_version__', '__version__', 'save_table']
+ '__format_version__', '__version__', 'save_table',
+ 'subsample']
=====================================
biom/_filter.pyx
=====================================
@@ -13,6 +13,7 @@ from types import FunctionType
import numpy as np
cimport numpy as cnp
+cnp.import_array()
cdef cnp.ndarray[cnp.uint8_t, ndim=1] \
=====================================
biom/_subsample.pyx
=====================================
@@ -6,66 +6,169 @@
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------
-
import numpy as np
cimport numpy as cnp
+cnp.import_array()
-def _subsample(arr, n, with_replacement, rng):
- """Subsample non-zero values of a sparse array
+cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
+ cnp.ndarray[cnp.int32_t, ndim=1] indptr,
+ cnp.int64_t n,
+ object rng):
+ """Subsample non-zero values of a sparse array with replacement
+
+ Note: this method operates in place
Parameters
----------
- arr : {csr_matrix, csc_matrix}
- A 1xM sparse vector
+ data : {csr_matrix, csc_matrix}.data
+ A 1xM sparse vector data
+ indptr : {csr_matrix, csc_matrix}.indptr
+ A 1xM sparse vector indptr
n : int
Number of items to subsample from `arr`
- with_replacement : bool
- Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
- Returns
- -------
- ndarray
- Subsampled data
-
Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)
"""
cdef:
- cnp.int64_t counts_sum
- cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
- cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
- cnp.ndarray[cnp.float64_t, ndim=1] result
- cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
- cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
- cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
- cnp.float64_t cnt
- Py_ssize_t i, j, length
+ cnp.float64_t counts_sum
+ cnp.int32_t start,end,length
+ Py_ssize_t i
+ cnp.ndarray[cnp.float64_t, ndim=1] pvals
+ cnp.ndarray[cnp.float64_t, ndim=1] data_ceil
+
+ data_ceil = np.ceil(data)
+ for i in range(indptr.shape[0] - 1):
+ start, end = indptr[i], indptr[i+1]
+ length = end - start
+
+ # base p-values on integer data to avoid small numerical issues with
+ # float on sum
+ counts_sum = data_ceil[start:end].sum()
+ pvals = data_ceil[start:end] / counts_sum
+
+ data[start:end] = rng.multinomial(n, pvals)
+
+
+cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
+ cnp.ndarray[cnp.int32_t, ndim=1] indptr,
+ cnp.int64_t n,
+ object rng):
+ """Subsample non-zero values of a sparse array w/out replacement
+
+ Note: this method operates in place
+
+ Parameters
+ ----------
+ data : {csr_matrix, csc_matrix}.data
+ A 1xM sparse vector data
+ indptr : {csr_matrix, csc_matrix}.indptr
+ A 1xM sparse vector indptr
+ n : int
+ Number of items to subsample from `arr`
+ rng : Generator instance
+ A random generator. This will likely be an instance returned
+ by np.random.default_rng
+ """
+ cdef:
+ cnp.int64_t counts_sum, count_el, perm_count_el
+ cnp.int64_t count_rem
+ cnp.ndarray[cnp.int64_t, ndim=1] permuted, intdata
+ Py_ssize_t i, idx
+ cnp.int32_t length,el,start,end
+ cnp.int64_t el_cnt
for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
- counts_sum = data[start:end].sum()
+ # We are relying on data being integers
+ # If there are rounding errors, fp64 sums can lead to
+ # big errors in sum, so convert to int64 first
+ intdata = data[start:end].astype(np.int64)
+ counts_sum = intdata.sum()
- if with_replacement:
- pvals = data[start:end] / counts_sum
- data[start:end] = rng.multinomial(n, pvals)
- else:
- if counts_sum < n:
- data[start:end] = 0
- continue
-
- r = np.arange(length, dtype=np.int32)
- unpacked = np.repeat(r, data_i[start:end])
- permuted = rng.permutation(unpacked)[:n]
-
- result = np.zeros(length, dtype=np.float64)
- for idx in range(permuted.shape[0]):
- result[permuted[idx]] += 1
-
- data[start:end] = result
+ if counts_sum < n:
+ data[start:end] = 0
+ continue
+
+ permuted = rng.choice(counts_sum, n, replace=False, shuffle=False)
+ permuted.sort()
+
+ # now need to do reverse mapping
+ # since I am not using np.repeat anymore
+ # reminder, old logic was
+ # r = np.arange(length)
+ # unpacked = np.repeat(r, data_i[start:end])
+ # permuted_unpacked = rng.choice(unpacked, n, replace=False, shuffle=False)
+ #
+ # specifically, what we're going to do here is randomly pick what elements within
+ # each sample to keep. this is analogous to issuing the prior np.repeat call, and obtaining
+ # a random set of index positions for that resulting array. however, we do not need to
+ # perform the np.repeat call as we know the length of that resulting vector already,
+ # and additionally, we can compute the sample associated with an index in that array
+ # without constructing it.
+
+ el = 0 # index in result/data
+ count_el = 0 # index in permuted
+ count_rem = intdata[0] # since each data has multiple els, keep track of how many are left
+ el_cnt = 0
+ for idx in range(n):
+ perm_count_el = permuted[idx]
+ # The array is sorted, so just jump ahead if needed
+ # Move until we get withing the elements range
+ while (perm_count_el - count_el) >= count_rem:
+ #save the computed value
+ data[start+el] = el_cnt
+ # move to next element
+ el += 1
+ # move to the beginning of next element
+ count_el += count_rem
+ # Load how much we have available
+ count_rem = intdata[el]
+ #re-start the el counter
+ el_cnt = 0
+ # increment the el counter
+ el_cnt += 1
+ # update the counters
+ # reduce what is left
+ count_rem -= (perm_count_el - count_el)
+ #move the pointer to where we stopped
+ count_el = perm_count_el
+ # save the last value
+ data[start+el] = el_cnt
+ # clean up tail elements
+ data[start+el+1:end] = 0
+
+
+def subsample(arr, n, with_replacement, rng):
+ """Subsample non-zero values of a sparse array
+
+ Note: this method operates in place
+
+ Parameters
+ ----------
+ arr : {csr_matrix, csc_matrix}
+ A 1xM sparse vector
+ n : int
+ Number of items to subsample from `arr`
+ with_replacement : bool
+ Whether to permute or use multinomial sampling
+ rng : Generator instance
+ A random generator. This will likely be an instance returned
+ by np.random.default_rng
+
+ Notes
+ -----
+ This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+ """
+ if (with_replacement):
+ _subsample_with_replacement(arr.data, arr.indptr, n, rng)
+ else:
+ _subsample_without_replacement(arr.data, arr.indptr, n, rng)
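Since the docstrings above stress that the newly exposed `biom.subsample` operates in place, a small sketch of calling it directly; the counts are invented and the data are assumed to be integer-valued counts stored in a SciPy sparse matrix, as the implementation expects:

import numpy as np
from scipy.sparse import csc_matrix
from biom import subsample

rng = np.random.default_rng(42)
# for a csc_matrix the indptr walks columns, so each column (sample) is
# rarefied independently
m = csc_matrix(np.array([[4., 0., 1.],
                         [6., 2., 1.]]))
subsample(m, 5, False, rng)   # depth 5, without replacement, modifies m in place
m.eliminate_zeros()
print(m.toarray())            # column sums are now 5, or 0 where the total was < 5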
=====================================
biom/_transform.pyx
=====================================
@@ -9,6 +9,7 @@
import numpy as np
cimport numpy as cnp
+cnp.import_array()
def _transform(arr, ids, metadata, function, axis):
=====================================
biom/table.py
=====================================
@@ -178,7 +178,7 @@ from copy import deepcopy
from datetime import datetime
from json import dumps as _json_dumps, JSONEncoder
from functools import reduce, partial
-from operator import itemgetter, or_
+from operator import itemgetter
from collections import defaultdict
from collections.abc import Hashable, Iterable
from numpy import ndarray, asarray, zeros, newaxis
@@ -195,7 +195,7 @@ from biom.util import (get_biom_format_version_string,
from biom.err import errcheck
from ._filter import _filter
from ._transform import _transform
-from ._subsample import _subsample
+from ._subsample import subsample
__author__ = "Daniel McDonald"
@@ -1073,7 +1073,6 @@ class Table:
raise TableException("No common ids between table and tree.")
_tree = tree.shear(names=common_tips)
_table = self.filter(common_tips, axis=axis, inplace=False)
- _table.remove_empty()
_tree.prune()
order = [n.name for n in _tree.tips()]
_table = _table.sort_order(order, axis=axis)
@@ -1423,7 +1422,12 @@ class Table:
>>> print(updated_table.ids(axis='sample'))
['s1.1' 's2.2' 's3.3']
"""
- str_dtype = 'U%d' % max([len(v) for v in id_map.values()])
+ max_str_len = max([len(v) for v in id_map.values()])
+ if not strict:
+ ids = self.ids(axis=axis)
+ max_str_len = max(max_str_len, max([len(i) for i in ids]))
+
+ str_dtype = 'U%d' % max_str_len
updated_ids = zeros(self.ids(axis=axis).size, dtype=str_dtype)
for idx, old_id in enumerate(self.ids(axis=axis)):
if strict and old_id not in id_map:
@@ -2395,16 +2399,24 @@ class Table:
return table
- def partition(self, f, axis='sample'):
+ def partition(self, f, axis='sample', remove_empty=False,
+ ignore_none=False):
"""Yields partitions
Parameters
----------
- f : function
+ f : function, dict
`f` is given the ID and metadata of the vector and must return
- what partition the vector is part of.
+ what partition the vector is part of. If `dict`, a mapping of
+ either ID -> group, or group -> [list, of, ID] must be provided.
axis : {'sample', 'observation'}, optional
The axis to iterate over
+ remove_empty : bool, optional
+ If `True`, remove empty vectors from a partition. Default is
+ `False`.
+ ignore_none : bool, optional
+ If `True`, ignore partitions with the label `None`. Default is
+ `False`.
Returns
-------
@@ -2445,11 +2457,39 @@ class Table:
O1 1.0
O2 42.0
"""
+ # we are not checking for whether the IDs are or are not present as
+ # that introduces complexity of `strict`. Deferring that for now.
+ if isinstance(f, dict):
+ test = list(f.values())[0]
+
+ if isinstance(test, (list, tuple)):
+ # group -> [list, of, ids]
+ mapping = {}
+ for grp, ids in f.items():
+ for id_ in ids:
+ mapping[id_] = grp
+
+ elif isinstance(test, str):
+ # id_ -> grp
+ mapping = f
+
+ else:
+ raise ValueError(f"Unable to handle a type of `{type(test)}` "
+ "with mapping")
+
+ def part_f(i, m):
+ return mapping.get(i)
+ else:
+ part_f = f
+
partitions = {}
# conversion of vector types is not necessary, vectors are not
# being passed to an arbitrary function
for vals, id_, md in self.iter(dense=False, axis=axis):
- part = f(id_, md)
+ part = part_f(id_, md)
+
+ if ignore_none and part is None:
+ continue
# try to make it hashable...
if not isinstance(part, Hashable):
@@ -2481,9 +2521,14 @@ class Table:
samp_md = md[:] if md is not None else None
indices = {'sample_index': self._sample_index.copy()}
- yield part, Table(data, obs_ids, samp_ids, obs_md, samp_md,
- self.table_id, type=self.type, validate=False,
- **indices)
+ tab = Table(data, obs_ids, samp_ids, obs_md, samp_md,
+ self.table_id, type=self.type, validate=False,
+ **indices)
+
+ if remove_empty:
+ tab.remove_empty(inplace=True)
+
+ yield part, tab
def collapse(self, f, collapse_f=None, norm=True, min_group_size=1,
include_collapsed_metadata=True, one_to_many=False,
@@ -2915,7 +2960,8 @@ class Table:
with_replacement : boolean, optional
If `False` (default), subsample without replacement. If `True`,
resample with replacement via the multinomial distribution.
- Should not be `True` if `by_id` is `True`.
+ Should not be `True` if `by_id` is `True`. Important: If `True`,
+ samples with a sum below `n` are retained.
seed : int, optional
If provided, set the numpy random seed with this value
@@ -2932,14 +2978,16 @@ class Table:
Notes
-----
- Subsampling is performed without replacement. If `n` is greater than
- the sum of a given vector, that vector is omitted from the result.
-
- Adapted from `skbio.math.subsample`, see biom-format/licenses for more
- information about scikit-bio.
+ If subsampling is performed without replacement, vectors with a sum
+ less than `n` are omitted from the result. This condition does not hold
+ when operating with replacement.
This code assumes absolute abundance if `by_id` is False.
+ If subsampling with replacement, `np.ceil` is applied prior to
+ calculating p-values to ensure that low-abundance features have a
+ chance to be sampled.
+
Examples
--------
>>> import numpy as np
@@ -2987,7 +3035,7 @@ class Table:
table.filter(lambda v, i, md: i in subset, axis=axis)
else:
data = table._get_sparse_data()
- _subsample(data, n, with_replacement, rng)
+ subsample(data, n, with_replacement, rng)
table._data = data
table.filter(lambda v, i, md: v.sum() > 0, axis=axis)
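A short sketch of the behaviour documented in the hunks above, using an invented table: without replacement, samples whose sum is below the requested depth are dropped from the result, while with replacement they are retained and resampled via the multinomial.

import numpy as np
from biom import Table

table = Table(np.array([[3, 1, 2],
                        [0, 3, 4]]), ['O1', 'O2'], ['S1', 'S2', 'S3'])
# sample sums: S1=3, S2=4, S3=6

print(table.subsample(4).ids())                          # S1 is dropped
print(table.subsample(4, with_replacement=True).ids())   # all samples retained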
@@ -3638,54 +3686,57 @@ class Table:
tables = [self] + others
# gather all identifiers across tables
- all_features = reduce(or_, [set(t.ids(axis='observation'))
- for t in tables])
- all_samples = reduce(or_, [set(t.ids()) for t in tables])
+ all_features = set(np.hstack([t.ids(axis='observation')
+ for t in tables]))
+ all_samples = set(np.hstack([t.ids() for t in tables]))
+
+ # produce a new stable order
+ feature_order = sorted(all_features)
+ sample_order = sorted(all_samples)
# generate unique integer ids for the identifiers, and let's order
# it to be polite
- feature_map = {i: idx for idx, i in enumerate(sorted(all_features))}
- sample_map = {i: idx for idx, i in enumerate(sorted(all_samples))}
+ feature_map = {i: idx for idx, i in enumerate(feature_order)}
+ sample_map = {i: idx for idx, i in enumerate(sample_order)}
- # produce a new stable order
- get1 = lambda x: x[1] # noqa
- feature_order = [k for k, v in sorted(feature_map.items(), key=get1)]
- sample_order = [k for k, v in sorted(sample_map.items(), key=get1)]
+ ntuples = sum([t.nnz for t in tables])
- mi = []
- values = []
+ # we're going to aggregate in COO. per scipy, it is efficient for
+ # construction of large matrices. importantly, it allows for
+ # duplicates which in this case correspond to multiple values for
+ # the same sample/feature across tables. the duplicates are summed
+ # implicitly on conversion to csr/csc.
+ rows = np.empty(ntuples, dtype=np.int32)
+ cols = np.empty(ntuples, dtype=np.int32)
+ data = np.empty(ntuples, dtype=self.matrix_data.dtype)
+
+ offset = 0
for table in tables:
- # these data are effectively [((row_index, col_index), value), ]
- data_as_dok = table.matrix_data.todok()
-
- # construct a map of the feature integer index to what it is in
- # the full table
- feat_ids = table.ids(axis='observation')
- samp_ids = table.ids()
- table_features = {idx: feature_map[i]
- for idx, i in enumerate(feat_ids)}
- table_samples = {idx: sample_map[i]
- for idx, i in enumerate(samp_ids)}
-
- for (f, s), v in data_as_dok.items():
- # collect the indices and values, adjusting the indices as we
- # go
- mi.append((table_features[f], table_samples[s]))
- values.append(v)
-
- # construct a multiindex of the indices where the outer index is the
- # feature and the inner index is the sample
- mi = pd.MultiIndex.from_tuples(mi)
- grouped = pd.Series(values, index=mi)
-
- # aggregate the values where the outer and inner values in the
- # multiindex are the same
- collapsed_rcv = grouped.groupby(level=[0, 1]).sum()
-
- # convert into a representation understood by the Table constructor
- list_list = [[r, c, v] for (r, c), v in collapsed_rcv.items()]
-
- return self.__class__(list_list, feature_order, sample_order)
+ t_nnz = table.nnz
+
+ coo = table.matrix_data.tocoo()
+
+ # we need to map the index positions in the current table to the
+ # index positions in the full matrix
+ row_map = np.array([feature_map[i]
+ for i in table.ids(axis='observation')],
+ dtype=np.int32)
+ col_map = np.array([sample_map[i]
+ for i in table.ids()],
+ dtype=np.int32)
+ coo.row = row_map[coo.row]
+ coo.col = col_map[coo.col]
+
+ # store our coo data
+ rows[offset:offset + t_nnz] = coo.row
+ cols[offset:offset + t_nnz] = coo.col
+ data[offset:offset + t_nnz] = coo.data
+ offset += t_nnz
+
+ coo = coo_matrix((data, (rows, cols)),
+ shape=(len(feature_order), len(sample_order)))
+
+ return self.__class__(coo.tocsr(), feature_order, sample_order)
def merge(self, other, sample='union', observation='union',
sample_metadata_f=prefer_self,
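The COO aggregation in the `_fast_merge` hunk above leans on a SciPy property worth spelling out: duplicate (row, col) coordinates in a COO matrix are summed when it is converted to CSR/CSC, which is what combines overlapping entries from different tables. A tiny standalone illustration, with made-up indices and values:

import numpy as np
from scipy.sparse import coo_matrix

rows = np.array([0, 0, 1], dtype=np.int32)
cols = np.array([0, 0, 2], dtype=np.int32)
vals = np.array([1.0, 2.0, 5.0])

m = coo_matrix((vals, (rows, cols)), shape=(2, 3))
print(m.tocsr().toarray())
# [[3. 0. 0.]
#  [0. 0. 5.]]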
@@ -4139,6 +4190,7 @@ html
parser = defaultdict(lambda: general_parser)
parser['taxonomy'] = vlen_list_of_str_parser
+ parser['Taxonomy'] = vlen_list_of_str_parser
parser['KEGG_Pathways'] = vlen_list_of_str_parser
parser['collapsed_ids'] = vlen_list_of_str_parser
parser.update(parse_fs)
@@ -4573,6 +4625,7 @@ html
formatter = defaultdict(lambda: general_formatter)
formatter['taxonomy'] = vlen_list_of_str_formatter
+ formatter['Taxonomy'] = vlen_list_of_str_formatter
formatter['KEGG_Pathways'] = vlen_list_of_str_formatter
formatter['collapsed_ids'] = vlen_list_of_str_formatter
formatter.update(format_fs)
@@ -4861,7 +4914,7 @@ html
for col_index, val in enumerate(obs[0]):
if float(val) != 0.0:
built_row.append(
- "[%d,%d,%r]" % (obs_index, col_index, val)
+ "[%d,%d,%f]" % (obs_index, col_index, val)
)
if built_row:
# if we have written a row already, its safe to add a comma
=====================================
biom/tests/test_cli/test_add_metadata.py
=====================================
@@ -9,6 +9,7 @@
# -----------------------------------------------------------------------------
import tempfile
+import os
from unittest import TestCase, main
import biom
@@ -20,13 +21,17 @@ class TestAddMetadata(TestCase):
def setUp(self):
"""Set up data for use in unit tests."""
self.cmd = _add_metadata
- with tempfile.NamedTemporaryFile('w') as fh:
+ with tempfile.NamedTemporaryFile('w', delete=False) as fh:
fh.write(biom1)
fh.flush()
self.biom_table1 = biom.load_table(fh.name)
+ self.temporary_fh_name = fh.name
self.sample_md_lines1 = sample_md1.split('\n')
self.obs_md_lines1 = obs_md1.split('\n')
+ def tearDown(self):
+ os.unlink(self.temporary_fh_name)
+
def test_add_sample_metadata_no_casting(self):
"""Correctly adds sample metadata without casting it."""
# Add a subset of sample metadata to a table that doesn't have any
=====================================
biom/tests/test_cli/test_subset_table.py
=====================================
@@ -55,9 +55,10 @@ class TestSubsetTable(unittest.TestCase):
def test_subset_samples_hdf5(self):
"""Correctly subsets samples in a hdf5 table"""
cwd = os.getcwd()
- if '/' in __file__:
- os.chdir(__file__.rsplit('/', 1)[0])
- obs = _subset_table(hdf5_biom='test_data/test.biom', axis='sample',
+ if os.path.sep in __file__:
+ os.chdir(os.path.dirname(__file__))
+ obs = _subset_table(hdf5_biom=os.path.join('test_data', 'test.biom'),
+ axis='sample',
ids=['Sample1', 'Sample2', 'Sample3'],
json_table_str=None)
os.chdir(cwd)
@@ -71,9 +72,9 @@ class TestSubsetTable(unittest.TestCase):
def test_subset_observations_hdf5(self):
"""Correctly subsets samples in a hdf5 table"""
cwd = os.getcwd()
- if '/' in __file__:
- os.chdir(__file__.rsplit('/', 1)[0])
- obs = _subset_table(hdf5_biom='test_data/test.biom',
+ if os.path.sep in __file__:
+ os.chdir(os.path.dirname(__file__))
+ obs = _subset_table(hdf5_biom=os.path.join('test_data', 'test.biom'),
axis='observation',
ids=['GG_OTU_1', 'GG_OTU_3', 'GG_OTU_5'],
json_table_str=None)
=====================================
biom/tests/test_cli/test_summarize_table.py
=====================================
@@ -12,16 +12,21 @@ from biom.cli.table_summarizer import _summarize_table
from biom.parse import load_table
import tempfile
+import os
from unittest import TestCase, main
class TestSummarizeTable(TestCase):
def setUp(self):
- with tempfile.NamedTemporaryFile(mode='w') as fh:
+ with tempfile.NamedTemporaryFile(mode='w', delete=False) as fh:
fh.write(biom1)
fh.flush()
self.biom1 = load_table(fh.name)
+ self.temporary_fh_name = fh.name
+
+ def tearDown(self):
+ os.unlink(self.temporary_fh_name)
def test_default(self):
""" TableSummarizer functions as expected
=====================================
biom/tests/test_cli/test_table_converter.py
=====================================
@@ -8,6 +8,7 @@
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------
+import os
from os.path import abspath, dirname, join
import tempfile
@@ -28,16 +29,18 @@ class TableConverterTests(TestCase):
self.cmd = _convert
self.output_filepath = tempfile.NamedTemporaryFile().name
- with tempfile.NamedTemporaryFile('w') as fh:
+ with tempfile.NamedTemporaryFile('w', delete=False) as fh:
fh.write(biom1)
fh.flush()
self.biom_table1 = load_table(fh.name)
+ self.temporary_fh_table_name = fh.name
self.biom_lines1 = biom1.split('\n')
- with tempfile.NamedTemporaryFile('w') as fh:
+ with tempfile.NamedTemporaryFile('w', delete=False) as fh:
fh.write(classic1)
fh.flush()
self.classic_biom1 = load_table(fh.name)
+ self.temporary_fh_classic_name = fh.name
self.sample_md1 = MetadataMap.from_file(sample_md1.split('\n'))
@@ -47,6 +50,10 @@ class TableConverterTests(TestCase):
self.json_collapsed_samples = join(test_data_dir,
'json_sample_collapsed.biom')
+ def tearDown(self):
+ os.unlink(self.temporary_fh_classic_name)
+ os.unlink(self.temporary_fh_table_name)
+
def test_classic_to_biom(self):
"""Correctly converts classic to biom."""
self.cmd(table=self.classic_biom1,
=====================================
biom/tests/test_cli/test_table_normalizer.py
=====================================
@@ -24,9 +24,9 @@ class TableNormalizerTests(TestCase):
self.cmd = _normalize_table
cwd = os.getcwd()
- if '/' in __file__:
- os.chdir(__file__.rsplit('/', 1)[0])
- self.table = biom.load_table('test_data/test.json')
+ if os.path.sep in __file__:
+ os.chdir(os.path.dirname(__file__))
+ self.table = biom.load_table(os.path.join('test_data', 'test.json'))
os.chdir(cwd)
def test_bad_inputs(self):
=====================================
biom/tests/test_cli/test_validate_table.py
=====================================
@@ -39,7 +39,8 @@ class TableValidatorTests(TestCase):
self.to_remove = []
cur_path = os.path.split(os.path.abspath(__file__))[0]
- examples_path = os.path.join(cur_path.rsplit('/', 3)[0], 'examples')
+ examples_path = os.path.join(cur_path.rsplit(os.path.sep, 3)[0],
+ 'examples')
self.hdf5_file_valid = os.path.join(examples_path,
'min_sparse_otu_table_hdf5.biom')
self.hdf5_file_valid_md = os.path.join(examples_path,
=====================================
biom/tests/test_data/edgecase_issue_952.biom
=====================================
Binary files /dev/null and b/biom/tests/test_data/edgecase_issue_952.biom differ
=====================================
biom/tests/test_parse.py
=====================================
@@ -46,7 +46,7 @@ class ParseTests(TestCase):
self.legacy_otu_table1 = legacy_otu_table1
self.otu_table1 = otu_table1
self.otu_table1_floats = otu_table1_floats
- self.files_to_remove = []
+ self.to_remove = []
self.biom_minimal_sparse = biom_minimal_sparse
self.classic_otu_table1_w_tax = classic_otu_table1_w_tax.split('\n')
@@ -54,6 +54,11 @@ class ParseTests(TestCase):
self.classic_table_with_complex_metadata = \
classic_table_with_complex_metadata.split('\n')
+ def tearDown(self):
+ if self.to_remove:
+ for f in self.to_remove:
+ os.remove(f)
+
def test_from_tsv_bug_854(self):
data = StringIO('#FeatureID\tSample1')
exp = Table([], [], ['Sample1'])
@@ -281,38 +286,40 @@ class ParseTests(TestCase):
def test_parse_biom_table_hdf5(self):
"""Make sure we can parse a HDF5 table through the same loader"""
cwd = os.getcwd()
- if '/' in __file__[1:]:
- os.chdir(__file__.rsplit('/', 1)[0])
- Table.from_hdf5(h5py.File('test_data/test.biom', 'r'))
+ if os.path.sep in __file__[1:]:
+ os.chdir(os.path.dirname(__file__))
+ Table.from_hdf5(h5py.File(os.path.join('test_data', 'test.biom'),
+ 'r'))
os.chdir(cwd)
def test_save_table_filepath(self):
t = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['a', 'b'],
['c', 'd', 'e'])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
save_table(t, tmpfile.name)
obs = load_table(tmpfile.name)
self.assertEqual(obs, t)
+ self.to_remove.append(tmpfile.name)
def test_load_table_filepath(self):
cwd = os.getcwd()
- if '/' in __file__[1:]:
- os.chdir(__file__.rsplit('/', 1)[0])
- load_table('test_data/test.biom')
+ if os.path.sep in __file__[1:]:
+ os.chdir(os.path.dirname(__file__))
+ load_table(os.path.join('test_data', 'test.biom'))
os.chdir(cwd)
def test_load_table_inmemory(self):
cwd = os.getcwd()
- if '/' in __file__[1:]:
- os.chdir(__file__.rsplit('/', 1)[0])
- load_table(h5py.File('test_data/test.biom', 'r'))
+ if os.path.sep in __file__[1:]:
+ os.chdir(os.path.dirname(__file__))
+ load_table(h5py.File(os.path.join('test_data', 'test.biom'), 'r'))
os.chdir(cwd)
def test_load_table_inmemory_json(self):
cwd = os.getcwd()
- if '/' in __file__[1:]:
- os.chdir(__file__.rsplit('/', 1)[0])
- load_table(open('test_data/test.json'))
+ if os.path.sep in __file__[1:]:
+ os.chdir(os.path.dirname(__file__))
+ load_table(open(os.path.join('test_data', 'test.json')))
os.chdir(cwd)
def test_load_table_inmemory_stringio(self):
@@ -350,10 +357,11 @@ class ParseTests(TestCase):
"""tests for parse_biom_table when we have h5py"""
# We will round-trip the HDF5 file to several different formats, and
# make sure we can recover the same table using parse_biom_table
- if '/' in __file__[1:]:
- os.chdir(__file__.rsplit('/', 1)[0])
+ if os.path.sep in __file__[1:]:
+ os.chdir(os.path.dirname(__file__))
- t = parse_biom_table(h5py.File('test_data/test.biom', 'r'))
+ t = parse_biom_table(h5py.File(os.path.join('test_data', 'test.biom'),
+ 'r'))
# These things are not round-trippable using the general-purpose
# parse_biom_table function
=====================================
biom/tests/test_table.py
=====================================
@@ -1016,13 +1016,15 @@ class TableTests(TestCase):
['c', 'd', 'e'])
t.add_metadata({'a': {'a / problem': 10}, 'b': {'a / problem': 20}},
axis='observation')
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
t.to_hdf5(h5, 'tests')
h5.close()
h5 = h5py.File(tmpfile.name, 'r')
obs = Table.from_hdf5(h5)
+ h5.close()
+ self.to_remove.append(tmpfile.name)
self.assertEqual(obs, t)
@@ -1030,7 +1032,7 @@ class TableTests(TestCase):
t = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['a', 'b'],
['c', 'd', 'e'])
current = datetime.now()
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
t.to_hdf5(h5, 'tests', creation_date=current)
h5.close()
@@ -1038,6 +1040,8 @@ class TableTests(TestCase):
h5 = h5py.File(tmpfile.name, 'r')
obs = Table.from_hdf5(h5)
self.assertEqual(obs.create_date, current)
+ h5.close()
+ self.to_remove.append(tmpfile.name)
self.assertEqual(obs, t)
@@ -1045,24 +1049,27 @@ class TableTests(TestCase):
"""Successfully writes an empty OTU table in HDF5 format"""
# Create an empty OTU table
t = Table([], [], [])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
t.to_hdf5(h5, 'tests')
h5.close()
+ self.to_remove.append(tmpfile.name)
def test_to_hdf5_empty_table_bug_619(self):
"""Successfully writes an empty OTU table in HDF5 format"""
t = example_table.filter({}, axis='observation', inplace=False)
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
t.to_hdf5(h5, 'tests')
h5.close()
+ self.to_remove.append(tmpfile.name)
t = example_table.filter({}, inplace=False)
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
t.to_hdf5(h5, 'tests')
h5.close()
+ self.to_remove.append(tmpfile.name)
def test_to_hdf5_missing_metadata_observation(self):
# exercises a vlen_list
@@ -1070,10 +1077,11 @@ class TableTests(TestCase):
[{'taxonomy': None},
{'taxonomy': ['foo', 'baz']}])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
with h5py.File(tmpfile.name, 'w') as h5:
t.to_hdf5(h5, 'tests')
obs = load_table(tmpfile.name)
+ self.to_remove.append(tmpfile.name)
self.assertEqual(obs.metadata(axis='observation'),
({'taxonomy': None},
{'taxonomy': ['foo', 'baz']}))
@@ -1084,10 +1092,11 @@ class TableTests(TestCase):
[{'dat': None},
{'dat': 'foo'}])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
with h5py.File(tmpfile.name, 'w') as h5:
t.to_hdf5(h5, 'tests')
obs = load_table(tmpfile.name)
+ self.to_remove.append(tmpfile.name)
self.assertEqual(obs.metadata(axis='sample'),
({'dat': ''},
{'dat': 'foo'}))
@@ -1097,11 +1106,12 @@ class TableTests(TestCase):
[{'taxonomy_A': 'foo; bar'},
{'taxonomy_B': 'foo; baz'}])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
with h5py.File(tmpfile.name, 'w') as h5:
with self.assertRaisesRegex(ValueError,
'inconsistent metadata'):
t.to_hdf5(h5, 'tests')
+ self.to_remove.append(tmpfile.name)
def test_to_hdf5_inconsistent_metadata_categories_sample(self):
t = Table(np.array([[0, 1], [2, 3]]), ['a', 'b'], ['c', 'd'],
@@ -1109,21 +1119,23 @@ class TableTests(TestCase):
[{'dat_A': 'foo; bar'},
{'dat_B': 'foo; baz'}])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
with h5py.File(tmpfile.name, 'w') as h5:
with self.assertRaisesRegex(ValueError,
'inconsistent metadata'):
t.to_hdf5(h5, 'tests')
+ self.to_remove.append(tmpfile.name)
def test_to_hdf5_malformed_taxonomy(self):
t = Table(np.array([[0, 1], [2, 3]]), ['a', 'b'], ['c', 'd'],
[{'taxonomy': 'foo; bar'},
{'taxonomy': 'foo; baz'}])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
with h5py.File(tmpfile.name, 'w') as h5:
t.to_hdf5(h5, 'tests')
obs = load_table(tmpfile.name)
+ self.to_remove.append(tmpfile.name)
self.assertEqual(obs.metadata(axis='observation'),
({'taxonomy': ['foo', 'bar']},
{'taxonomy': ['foo', 'baz']}))
@@ -1134,9 +1146,11 @@ class TableTests(TestCase):
[{'foo': ['k__a', 'p__b']},
{'foo': ['k__a', 'p__c']}],
[{'barcode': 'aatt'}, {'barcode': 'ttgg'}])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
st_rich.to_hdf5(h5, 'tests')
+ h5.close()
+ self.to_remove.append(tmpfile.name)
def test_to_hdf5_custom_formatters(self):
self.st_rich = Table(self.vals,
@@ -1151,7 +1165,7 @@ class TableTests(TestCase):
grp.create_dataset(name, shape=data.shape, dtype=H5PY_VLEN_STR,
data=data, compression=compression)
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
self.st_rich.to_hdf5(h5, 'tests',
format_fs={'barcode': bc_formatter})
@@ -1172,10 +1186,11 @@ class TableTests(TestCase):
self.assertNotEqual(m1['barcode'], m2['barcode'])
self.assertEqual(m1['barcode'].lower(), m2['barcode'])
h5.close()
+ self.to_remove.append(tmpfile.name)
def test_to_hdf5(self):
"""Write a file"""
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
self.st_rich.to_hdf5(h5, 'tests')
h5.close()
@@ -1193,9 +1208,10 @@ class TableTests(TestCase):
obs = Table.from_hdf5(h5)
self.assertEqual(obs, self.st_rich)
h5.close()
+ self.to_remove.append(tmpfile.name)
# Test with a collapsed table
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
dt_rich = Table(
np.array([[5, 6, 7], [8, 9, 10], [11, 12, 13]]),
@@ -1238,9 +1254,10 @@ class TableTests(TestCase):
[{'collapsed_ids': ['a', 'c']},
{'collapsed_ids': ['b']}])
self.assertEqual(obs, exp)
+ self.to_remove.append(tmpfile.name)
# Test with table having a None on taxonomy
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
h5 = h5py.File(tmpfile.name, 'w')
t = Table(self.vals, ['1', '2'], ['a', 'b'],
[{'taxonomy': ['k__a', 'p__b']},
@@ -1262,6 +1279,7 @@ class TableTests(TestCase):
obs = Table.from_hdf5(h5)
h5.close()
self.assertEqual(obs, t)
+ self.to_remove.append(tmpfile.name)
def test_from_tsv(self):
tab1_fh = StringIO(otu_table1)
@@ -1593,7 +1611,7 @@ class TableTests(TestCase):
df = example_table.to_dataframe()
density = (float(example_table.matrix_data.getnnz()) /
np.prod(example_table.shape))
- df_density = (df > 0).sum().sum() / np.prod(df.shape)
+ df_density = (df.values > 0).sum().sum() / np.prod(df.shape)
assert np.allclose(df_density, density)
def test_to_dataframe_dense(self):
@@ -1995,6 +2013,21 @@ class TableTests(TestCase):
with self.assertRaises(TableException):
table.align_to_dataframe(metadata)
+ @pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
+ def test_align_tree_issue_948(self):
+ table = Table(np.array([[0, 0, 0, 0],
+ [2, 3, 4, 4],
+ [5, 5, 3, 3],
+ [0, 0, 0, 1]]).T,
+ ['a', 'b', 'c', 'd'],
+ ['s1', 's2', 's3', 's4'])
+ tree = skbio.TreeNode.read(["(a,b,c,d)r;"])
+ exp_tree = tree
+ exp_table = table.copy()
+ res_table, res_tree = table.align_tree(tree)
+ self.assertEqual(res_table, exp_table)
+ self.assertEqual(str(exp_tree), str(res_tree))
+
@pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
def test_align_tree_intersect_tips(self):
# there are less tree tips than observations
@@ -2379,6 +2412,16 @@ class SparseTableTests(TestCase):
self.st_rich.data('2', 'observation'))
self.assertEqual(obs.transpose(), self.st_rich)
+ def test_update_ids_strict_dtype_bug_issue_957(self):
+ t = Table(np.arange(6).reshape(2, 3),
+ ['O1', 'O2'],
+ ['ab', 'cdef', 'ghijkl'])
+ exp = Table(np.arange(6).reshape(2, 3),
+ ['O1', 'O2'],
+ ['AB', 'cdef', 'ghijkl'])
+ obs = t.update_ids({'ab': 'AB'}, strict=False, inplace=False)
+ self.assertEqual(obs, exp)
+
def test_update_ids_inplace_bug_892(self):
t = example_table.copy()
exp = t.ids().copy()
@@ -3170,6 +3213,23 @@ class SparseTableTests(TestCase):
with errstate(empty='raise'), self.assertRaises(TableException):
self.st_rich.filter(f, 'observation')
+ def test_subsample_edgecase_issue_952(self):
+ # this file triggers an exception on Linux on subsample
+ # with replacement where the pvals computed sum to > 1. It is a
+ # subset of the data reported in issue 952, specifically constrained
+ # to the first 10 features with any empty samples removed.
+ path = 'test_data/edgecase_issue_952.biom'
+
+ # ...existing logic for test_data, not ideal, but consistent
+ cwd = os.getcwd()
+ if '/' in __file__:
+ os.chdir(__file__.rsplit('/', 1)[0])
+ table = Table.from_hdf5(h5py.File(path, 'r'))
+ os.chdir(cwd)
+
+ obs = table.subsample(10, with_replacement=True)
+ self.assertEqual(set(obs.sum('sample')), {10.0, })
+
def test_subsample_same_seed_without_replacement(self):
table = Table(np.array([[3, 1, 2], [0, 3, 4]]), ['O1', 'O2'],
['S1', 'S2', 'S3'])
@@ -4237,6 +4297,79 @@ class SparseTableTests(TestCase):
with self.assertRaisesRegex(TypeError, msg):
Table._extract_data_from_tsv(tsv, dtype=int)
+ def test_partition_remove_empty(self):
+ t = Table(np.array([[0, 1, 2],
+ [3, 0, 0],
+ [4, 0, 0]]),
+ ['O1', 'O2', 'O3'],
+ ['S1', 'S2', 'S3'])
+ part_f = lambda i, m: i == 'S1' # noqa
+ obs = dict(t.partition(part_f, remove_empty=True))
+ exp = {True: Table(np.array([[3, ], [4, ]]), ['O2', 'O3'], ['S1', ]),
+ False: Table(np.array([[1, 2]]), ['O1', ], ['S2', 'S3'])}
+ self.assertEqual(obs, exp)
+
+ def test_partition_ignore_none_true(self):
+ t = Table(np.array([[0, 1, 2],
+ [3, 0, 0],
+ [4, 0, 0]]),
+ ['O1', 'O2', 'O3'],
+ ['S1', 'S2', 'S3'])
+ part_f = lambda i, m: True if i == 'S1' else None # noqa
+ obs = dict(t.partition(part_f, ignore_none=True))
+ exp = {True: Table(np.array([[0, ], [3, ], [4, ]]),
+ ['O1', 'O2', 'O3'], ['S1', ])}
+ self.assertEqual(obs, exp)
+
+ def test_partition_ignore_none_false(self):
+ t = Table(np.array([[0, 1, 2],
+ [3, 0, 0],
+ [4, 0, 0]]),
+ ['O1', 'O2', 'O3'],
+ ['S1', 'S2', 'S3'])
+ part_f = lambda i, m: True if i == 'S1' else None # noqa
+ obs = dict(t.partition(part_f, ignore_none=False))
+ exp = {True: Table(np.array([[0, ], [3, ], [4, ]]),
+ ['O1', 'O2', 'O3'], ['S1', ]),
+ None: Table(np.array([[1, 2], [0, 0], [0, 0]]),
+ ['O1', 'O2', 'O3'], ['S2', 'S3'])}
+ self.assertEqual(obs, exp)
+
+ def test_partition_dict_ids_to_groups(self):
+ t = Table(np.array([[0, 1, 2],
+ [3, 0, 0],
+ [4, 0, 0]]),
+ ['O1', 'O2', 'O3'],
+ ['S1', 'S2', 'S3'])
+ by_dict = {'S1': 'foo',
+ 'S2': 'bar',
+ 'S3': 'foo'}
+ exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]),
+ ['O1', 'O2', 'O3'],
+ ['S1', 'S3']),
+ 'bar': Table(np.array([[1, ], [0, ], [0, ]]),
+ ['O1', 'O2', 'O3'],
+ ['S2', ])}
+ obs = dict(t.partition(by_dict))
+ self.assertEqual(obs, exp)
+
+ def test_partition_dict_groups_to_ids(self):
+ t = Table(np.array([[0, 1, 2],
+ [3, 0, 0],
+ [4, 0, 0]]),
+ ['O1', 'O2', 'O3'],
+ ['S1', 'S2', 'S3'])
+ by_dict_group = {'foo': ['S1', 'S3'],
+ 'bar': ['S2', ]}
+ exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]),
+ ['O1', 'O2', 'O3'],
+ ['S1', 'S3']),
+ 'bar': Table(np.array([[1, ], [0, ], [0, ]]),
+ ['O1', 'O2', 'O3'],
+ ['S2', ])}
+ obs = dict(t.partition(by_dict_group))
+ self.assertEqual(obs, exp)
+
def test_bin_samples_by_metadata(self):
"""Yield tables binned by sample metadata"""
def f(id_, md):
=====================================
biom/tests/test_util.py
=====================================
@@ -42,6 +42,12 @@ class UtilTests(TestCase):
def setUp(self):
self.biom_otu_table1_w_tax = parse_biom_table(biom_otu_table1_w_tax)
+ self.to_remove = []
+
+ def tearDown(self):
+ if self.to_remove:
+ for f in self.to_remove:
+ os.remove(f)
def test_generate_subsamples(self):
table = Table(np.array([[3, 1, 1], [0, 3, 3]]), ['O1', 'O2'],
@@ -246,11 +252,14 @@ class UtilTests(TestCase):
tmp_f = NamedTemporaryFile(
mode='w',
prefix='test_safe_md5',
- suffix='txt')
+ suffix='txt',
+ delete=False)
tmp_f.write('foo\n')
tmp_f.flush()
obs = safe_md5(open(tmp_f.name))
+ tmp_f.close()
+ self.to_remove.append(tmp_f.name)
self.assertEqual(obs, exp)
obs = safe_md5(['foo\n'])
@@ -262,9 +271,10 @@ class UtilTests(TestCase):
def test_biom_open_hdf5_pathlib_write(self):
t = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['a', 'b'],
['c', 'd', 'e'])
- with NamedTemporaryFile() as tmpfile:
+ with NamedTemporaryFile(delete=False) as tmpfile:
with biom_open(pathlib.Path(tmpfile.name), 'w') as fp:
t.to_hdf5(fp, 'tests')
+ self.to_remove.append(tmpfile.name)
def test_biom_open_hdf5_pathlib_read(self):
cwd = os.getcwd()
@@ -309,11 +319,12 @@ class UtilTests(TestCase):
def test_load_classic(self):
tab = load_table(get_data_path('test.json'))
- with NamedTemporaryFile(mode='w') as fp:
+ with NamedTemporaryFile(mode='w', delete=False) as fp:
fp.write(str(tab))
fp.flush()
obs = load_table(fp.name)
+ self.to_remove.append(fp.name)
npt.assert_equal(obs.ids(), tab.ids())
npt.assert_equal(obs.ids(axis='observation'),
=====================================
biom/util.py
=====================================
@@ -41,7 +41,7 @@ __url__ = "http://biom-format.org"
__maintainer__ = "Daniel McDonald"
__email__ = "daniel.mcdonald at colorado.edu"
__format_version__ = (2, 1)
-__version__ = "2.1.15"
+__version__ = "2.1.16"
def generate_subsamples(table, n, axis='sample', by_id=False):
@@ -425,7 +425,6 @@ def biom_open(fp, permission='r'):
if permission not in ['r', 'w', 'U', 'rb', 'wb']:
raise OSError("Unknown mode: %s" % permission)
- opener = functools.partial(io.open, encoding='utf-8')
mode = permission
# don't try to open an HDF5 file if H5PY is not installed, this can only
@@ -434,19 +433,20 @@ def biom_open(fp, permission='r'):
if os.path.getsize(fp) == 0:
raise ValueError("The file '%s' is empty and can't be parsed" % fp)
- if mode in ['U', 'r', 'rb'] and h5py.is_hdf5(fp):
- opener = h5py.File
- mode = 'r' if permission == 'U' else permission
- elif mode == 'w':
- opener = h5py.File
-
if mode in ['U', 'r', 'rb'] and is_gzip(fp):
- def opener(fp, mode):
+ def opener(fp, mode): # noqa
return codecs.getreader('utf-8')(gzip_open(fp, mode))
mode = 'rb' if permission in ['U', 'r'] else permission
elif mode in ['w', 'wb'] and str(fp).endswith('.gz'):
- def opener(fp, mode):
+ def opener(fp, mode): # noqa
codecs.getwriter('utf-8')(gzip_open(fp, mode))
+ elif mode in ['U', 'r', 'rb'] and h5py.is_hdf5(fp):
+ opener = h5py.File
+ mode = 'r' if permission == 'U' else permission
+ elif mode == 'w':
+ opener = h5py.File
+ else:
+ opener = functools.partial(io.open, encoding='utf-8')
f = opener(fp, mode)
try:
=====================================
ci/aarch64.conda_requirements.txt
=====================================
@@ -1,4 +1,3 @@
-natsort >= 4.0.3
numpy >= 1.9.2
pandas >= 0.20.0
scipy >= 1.3.1
=====================================
ci/conda_requirements.txt
=====================================
@@ -1,4 +1,3 @@
-natsort >= 4.0.3
numpy >= 1.9.2
pandas >= 0.20.0
scipy >= 1.3.1
=====================================
doc/conf.py
=====================================
@@ -57,15 +57,15 @@ master_doc = 'index'
# General information about the project.
project = 'biom-format'
-copyright = '2011-2022 The BIOM Format Development Team'
+copyright = '2011-2024 The BIOM Format Development Team'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The full version, including alpha/beta/rc tags.
-version = "2.1.15"
-release = "2.1.15"
+version = "2.1.16"
+release = "2.1.16"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
=====================================
doc/index.rst
=====================================
@@ -35,6 +35,8 @@ Projects using the BIOM format
* `EBI Metagenomics <https://www.ebi.ac.uk/metagenomics>`_
* `GCModeller <http://gcmodeller.org>`_
* `MetaPhlAn 2 <http://segatalab.cibio.unitn.it/tools/metaphlan2/>`__
+* `mia (TreeSummarizedExperiment; R/Bioconductor) <http://microbiome.github.io/>`__
+
If you are using BIOM in your project, and would like your project to be listed, please submit a `pull request <https://github.com/biocore/biom-format/pulls>`_ to the BIOM project. More information on `submitting pull requests can be found here <https://help.github.com/articles/using-pull-requests>`_.
=====================================
setup.py
=====================================
@@ -32,7 +32,7 @@ __copyright__ = "Copyright 2011-2020, The BIOM Format Development Team"
__credits__ = ["Greg Caporaso", "Daniel McDonald", "Jose Clemente",
"Jai Ram Rideout", "Jorge Cañardo Alastuey", "Michael Hall"]
__license__ = "BSD"
-__version__ = "2.1.15"
+__version__ = "2.1.16"
__maintainer__ = "Daniel McDonald"
__email__ = "mcdonadt at colorado.edu"
@@ -86,15 +86,20 @@ classes = """
Topic :: Software Development :: Libraries :: Application Frameworks
Topic :: Software Development :: Libraries :: Python Modules
Programming Language :: Python
+ Programming Language :: Python :: 3
+ Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
+ Programming Language :: Python :: 3.11
+ Programming Language :: Python :: 3.12
Programming Language :: Python :: Implementation :: CPython
Operating System :: OS Independent
Operating System :: POSIX :: Linux
Operating System :: MacOS :: MacOS X
+ Operating System :: Microsoft :: Windows
"""
classifiers = [s.strip() for s in classes.split('\n') if s]
View it on GitLab: https://salsa.debian.org/med-team/python-biom-format/-/commit/842fbdbfebc37de25b5599f49b46ec865c078844
--
This project does not include diff previews in email notifications.