[med-svn] [Git][med-team/python-skbio][master] 3 commits: New upstream version 0.5.4
Andreas Tille
gitlab at salsa.debian.org
Tue Aug 28 18:48:27 BST 2018
Andreas Tille pushed to branch master at Debian Med / python-skbio
Commits:
ee541afd by Andreas Tille at 2018-08-28T16:13:43Z
New upstream version 0.5.4
- - - - -
d9b8192a by Andreas Tille at 2018-08-28T16:14:04Z
Update upstream source from tag 'upstream/0.5.4'
Update to upstream version '0.5.4'
with Debian dir 9bec36a983712c49f50e01e1b526064b2342432c
- - - - -
59737a89 by Andreas Tille at 2018-08-28T17:47:56Z
New upstream version
- - - - -
12 changed files:
- CHANGELOG.md
- debian/changelog
- skbio/__init__.py
- skbio/stats/ordination/__init__.py
- skbio/stats/ordination/_principal_coordinate_analysis.py
- skbio/stats/ordination/_utils.py
- + skbio/stats/ordination/tests/data/PCoA_sample_data_12dim
- + skbio/stats/ordination/tests/data/PCoA_sample_data_3_eigh_ref_3dim
- + skbio/stats/ordination/tests/data/PCoA_sample_data_3_fsvd_ref_3dim
- skbio/stats/ordination/tests/test_principal_coordinate_analysis.py
- skbio/stats/ordination/tests/test_util.py
- skbio/util/_testing.py
Changes:
=====================================
CHANGELOG.md
=====================================
@@ -1,5 +1,27 @@
# scikit-bio changelog
+## Version 0.5.4 (2018-08-23)
+
+### Features
+
+* Added `FSVD`, an alternative fast heuristic method to perform Principal Coordinates Analysis, to `skbio.stats.ordination.pcoa`.
+
+### Backward-incompatible changes [stable]
+
+### Backward-incompatible changes [experimental]
+
+### Performance enhancements
+
+* Added optimized utility methods `f_matrix_inplace` and `e_matrix_inplace` which perform `f_matrix` and `e_matrix` computations in-place and are used by the new `center_distance_matrix` method in `skbio.stats.ordination`.
+
+### Bug fixes
+
+### Deprecated functionality [stable]
+
+### Deprecated functionality [experimental]
+
+### Miscellaneous
+
## Version 0.5.3 (2018-08-07)
### Features
=====================================
debian/changelog
=====================================
@@ -1,10 +1,10 @@
-python-skbio (0.5.3-1) UNRELEASED; urgency=medium
+python-skbio (0.5.4-1) UNRELEASED; urgency=medium
* New upstream version
* Standards-Version: 4.2.0
* Build-Depends: python3-hdmedians
- -- Andreas Tille <tille at debian.org> Wed, 15 Aug 2018 15:21:02 +0200
+ -- Andreas Tille <tille at debian.org> Tue, 28 Aug 2018 18:14:05 +0200
python-skbio (0.5.2-1) unstable; urgency=medium
=====================================
skbio/__init__.py
=====================================
@@ -26,7 +26,7 @@ __all__ = ['Sequence', 'DNA', 'RNA', 'Protein', 'GeneticCode',
'TreeNode', 'nj', 'read', 'write', 'OrdinationResults']
__credits__ = "https://github.com/biocore/scikit-bio/graphs/contributors"
-__version__ = "0.5.3"
+__version__ = "0.5.4"
mottos = [
# 03/15/2014
=====================================
skbio/stats/ordination/__init__.py
=====================================
@@ -129,10 +129,11 @@ from ._correspondence_analysis import ca
from ._canonical_correspondence_analysis import cca
from ._principal_coordinate_analysis import pcoa, pcoa_biplot
from ._ordination_results import OrdinationResults
-from ._utils import (mean_and_std, scale, svd_rank, corr, e_matrix, f_matrix)
+from ._utils import (mean_and_std, scale, svd_rank, corr, e_matrix, f_matrix,
+ center_distance_matrix)
__all__ = ['ca', 'rda', 'cca', 'pcoa', 'pcoa_biplot', 'OrdinationResults',
'mean_and_std', 'scale', 'svd_rank', 'corr',
- 'e_matrix', 'f_matrix']
+ 'e_matrix', 'f_matrix', 'center_distance_matrix']
test = TestRunner(__file__).test
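
With center_distance_matrix now part of the public skbio.stats.ordination namespace (per the __all__ update above), downstream code can import and call it directly. A minimal sketch, using a made-up 3x3 distance matrix:

import numpy as np
from skbio.stats.ordination import center_distance_matrix

d = np.array([[0.0, 1.0, 2.0],
              [1.0, 0.0, 3.0],
              [2.0, 3.0, 0.0]])

print(center_distance_matrix(d))                 # regular path: E matrix, then F matrix
print(center_distance_matrix(d, inplace=True))   # row-wise variant, lower peak memory

Both calls return the same centered matrix; only the memory behaviour differs.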
=====================================
skbio/stats/ordination/_principal_coordinate_analysis.py
=====================================
@@ -6,33 +6,29 @@
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
-from warnings import warn
-
-import pandas as pd
import numpy as np
+import pandas as pd
+from numpy import dot, hstack
+from numpy.linalg import qr, svd
+from numpy.random import standard_normal
from scipy.linalg import eigh
+from warnings import warn
from skbio.stats.distance import DistanceMatrix
from skbio.util._decorator import experimental
from ._ordination_results import OrdinationResults
-from ._utils import e_matrix, f_matrix, scale
-
-# - In cogent, after computing eigenvalues/vectors, the imaginary part
-# is dropped, if any. We know for a fact that the eigenvalues are
-# real, so that's not necessary, but eigenvectors can in principle
-# be complex (see for example
-# http://math.stackexchange.com/a/47807/109129 for details) and in
-# that case dropping the imaginary part means they'd no longer be
-# so, so I'm not doing that.
+from ._utils import center_distance_matrix, scale
@experimental(as_of="0.4.0")
-def pcoa(distance_matrix):
+def pcoa(distance_matrix, method="eigh", number_of_dimensions=0,
+ inplace=False):
r"""Perform Principal Coordinate Analysis.
- Principal Coordinate Analysis (PCoA) is a method similar to PCA
- that works from distance matrices, and so it can be used with
- ecologically meaningful distances like UniFrac for bacteria.
+ Principal Coordinate Analysis (PCoA) is a method similar
+ to Principal Components Analysis (PCA) with the difference that PCoA
+ operates on distance matrices, typically with non-euclidean and thus
+ ecologically meaningful distances like UniFrac in microbiome research.
In ecology, the euclidean distance preserved by Principal
Component Analysis (PCA) is often not a good choice because it
@@ -44,10 +40,36 @@ def pcoa(distance_matrix):
species is present in two sites, that means that the sites are
similar.).
+ Note that the returned eigenvectors are not normalized to unit length.
+
Parameters
----------
distance_matrix : DistanceMatrix
A distance matrix.
+ method : str, optional
+ Eigendecomposition method to use in performing PCoA.
+ By default, uses SciPy's `eigh`, which computes exact
+ eigenvectors and eigenvalues for all dimensions. The alternate
+ method, `fsvd`, uses a faster heuristic eigendecomposition but loses
+ accuracy. How much accuracy is lost depends on the dataset.
+ number_of_dimensions : int, optional
+ Dimensions to reduce the distance matrix to. This number determines
+ how many eigenvectors and eigenvalues will be returned.
+ By default, equal to the number of dimensions of the distance matrix,
+ as default eigendecomposition using SciPy's `eigh` method computes
+ all eigenvectors and eigenvalues. If using fast heuristic
+ eigendecomposition through `fsvd`, a desired number of dimensions
+ should be specified. Note that the default eigendecomposition
+ method `eigh` does not natively support specifying the number of
+ dimensions to reduce a matrix to, so if this parameter is specified,
+ all eigenvectors and eigenvalues will simply be computed with no
+ speed gain, and only the number specified by `number_of_dimensions`
+ will be returned. Specifying a value of `0`, the default, will
+ set `number_of_dimensions` equal to the number of dimensions of the
+ specified `distance_matrix`.
+ inplace : bool, optional
+ If true, centers a distance matrix in-place in a manner that reduces
+ memory consumption.
Returns
-------
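
A minimal sketch of how these new parameters combine in practice, using a made-up 8x8 distance matrix (illustration only, not part of the commit):

import numpy as np
from skbio.stats.distance import DistanceMatrix
from skbio.stats.ordination import pcoa

# build a small random, symmetric, hollow matrix to serve as throwaway input
rng = np.random.RandomState(0)
a = rng.rand(8, 8)
a = (a + a.T) / 2
np.fill_diagonal(a, 0.0)
dm = DistanceMatrix(a, ids=['S%d' % i for i in range(8)])

full = pcoa(dm)                          # default: method="eigh", all 8 axes returned
top3 = pcoa(dm, number_of_dimensions=3)  # eigh still computes everything,
                                         # the result is merely truncated to 3 axes
top3_fast = pcoa(dm, method="fsvd",      # fsvd approximates only the top 3,
                 number_of_dimensions=3) # trading some accuracy for speed

print(full.samples.shape, top3.samples.shape)   # (8, 8) (8, 3)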
@@ -62,37 +84,56 @@ def pcoa(distance_matrix):
Notes
-----
- It is sometimes known as metric multidimensional scaling or
- classical scaling.
-
- .. note::
-
- If the distance is not euclidean (for example if it is a
- semimetric and the triangle inequality doesn't hold),
- negative eigenvalues can appear. There are different ways
- to deal with that problem (see Legendre & Legendre 1998, \S
- 9.2.3), but none are currently implemented here.
-
- However, a warning is raised whenever negative eigenvalues
- appear, allowing the user to decide if they can be safely
- ignored.
+ .. note:: If the distance is not euclidean (for example if it is a
+ semimetric and the triangle inequality doesn't hold),
+ negative eigenvalues can appear. There are different ways
+ to deal with that problem (see Legendre & Legendre 1998, \S
+ 9.2.3), but none are currently implemented here.
+ However, a warning is raised whenever negative eigenvalues
+ appear, allowing the user to decide if they can be safely
+ ignored.
"""
distance_matrix = DistanceMatrix(distance_matrix)
- E_matrix = e_matrix(distance_matrix.data)
-
- # If the used distance was euclidean, pairwise distances
- # needn't be computed from the data table Y because F_matrix =
- # Y.dot(Y.T) (if Y has been centred).
- F_matrix = f_matrix(E_matrix)
-
- # If the eigendecomposition ever became a bottleneck, it could
- # be replaced with an iterative version that computes the
- # largest k eigenvectors.
- eigvals, eigvecs = eigh(F_matrix)
-
- # eigvals might not be ordered, so we order them (at least one
- # is zero). cogent makes eigenvalues positive by taking the
+ # Center distance matrix, a requirement for PCoA here
+ matrix_data = center_distance_matrix(distance_matrix.data, inplace=inplace)
+
+ # If no dimension specified, by default will compute all eigenvectors
+ # and eigenvalues
+ if number_of_dimensions == 0:
+ if method == "fsvd" and matrix_data.shape[0] > 10:
+ warn("FSVD: since no value for number_of_dimensions is specified, "
+ "PCoA for all dimensions will be computed, which may "
+ "result in long computation time if the original "
+ "distance matrix is large.", RuntimeWarning)
+
+ # distance_matrix is guaranteed to be square
+ number_of_dimensions = matrix_data.shape[0]
+ elif number_of_dimensions < 0:
+ raise ValueError('Invalid operation: cannot reduce distance matrix '
+ 'to negative dimensions using PCoA. Did you intend '
+ 'to specify the default value "0", which sets '
+ 'the number_of_dimensions equal to the '
+ 'dimensionality of the given distance matrix?')
+
+ # Perform eigendecomposition
+ if method == "eigh":
+ # eigh does not natively support specifying number_of_dimensions, i.e.
+ # there are no speed gains unlike in FSVD. Later, we slice off unwanted
+ # dimensions to conform the result of eigh to the specified
+ # number_of_dimensions.
+
+ eigvals, eigvecs = eigh(matrix_data)
+ long_method_name = "Principal Coordinate Analysis"
+ elif method == "fsvd":
+ eigvals, eigvecs = _fsvd(matrix_data, number_of_dimensions)
+ long_method_name = "Approximate Principal Coordinate Analysis " \
+ "using FSVD"
+ else:
+ raise ValueError(
+ "PCoA eigendecomposition method {} not supported.".format(method))
+
+ # cogent makes eigenvalues positive by taking the
# abs value, but that doesn't seem to be an approach accepted
# by L&L to deal with negative eigenvalues. We raise a warning
# in that case. First, we make values close to 0 equal to 0.
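
As the note above describes, non-euclidean input leads to negative eigenvalues, which pcoa reports with a RuntimeWarning rather than an error. A small sketch of inspecting that warning, with made-up random input (random symmetric matrices are typically non-euclidean):

import warnings

import numpy as np
from skbio.stats.distance import DistanceMatrix
from skbio.stats.ordination import pcoa

rng = np.random.RandomState(1)
a = rng.rand(10, 10)
a = (a + a.T) / 2
np.fill_diagonal(a, 0.0)
dm = DistanceMatrix(a)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    res = pcoa(dm)

print([str(w.message) for w in caught if issubclass(w.category, RuntimeWarning)])
print(res.eigvals.tail())   # negative eigenvalues have been zeroed out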
@@ -109,18 +150,14 @@ def pcoa(distance_matrix):
" {0} and the largest is {1}.".format(eigvals.min(),
eigvals.max()),
RuntimeWarning
- )
+ )
+
+ # eigvals might not be ordered, so we first sort them, then analogously
+ # sort the eigenvectors by the ordering of the eigenvalues too
idxs_descending = eigvals.argsort()[::-1]
eigvals = eigvals[idxs_descending]
eigvecs = eigvecs[:, idxs_descending]
- # Scale eigenvalues to have lenght = sqrt(eigenvalue). This
- # works because np.linalg.eigh returns normalized
- # eigenvectors. Each row contains the coordinates of the
- # objects in the space of principal coordinates. Note that at
- # least one eigenvalue is zero because only n-1 axes are
- # needed to represent n points in an euclidean space.
-
# If we return only the coordinates that make sense (i.e., that have a
# corresponding positive eigenvalue), then Jackknifed Beta Diversity
# won't work as it expects all the OrdinationResults to have the same
@@ -130,13 +167,46 @@ def pcoa(distance_matrix):
eigvecs[:, num_positive:] = np.zeros(eigvecs[:, num_positive:].shape)
eigvals[num_positive:] = np.zeros(eigvals[num_positive:].shape)
+ if method == "fsvd":
+ # Since the dimension parameter, hereafter referred to as 'd',
+ # restricts the number of eigenvalues and eigenvectors that FSVD
+ # computes, we need to use an alternative method to compute the sum
+ # of all eigenvalues, used to compute the array of proportions
+ # explained. Otherwise, the proportions calculated will only be
+ # relative to d number of dimensions computed; whereas we want
+ # it to be relative to the entire dimensionality of the
+ # centered distance matrix.
+
+ # An alternative method of calculating the sum of eigenvalues is by
+ # computing the trace of the centered distance matrix.
+ # See proof outlined here: https://goo.gl/VAYiXx
+ sum_eigenvalues = np.trace(matrix_data)
+ else:
+ # Calculate proportions the usual way
+ sum_eigenvalues = np.sum(eigvals)
+
+ proportion_explained = eigvals / sum_eigenvalues
+
+ # When eigh is used, all eigenvectors and eigenvalues are computed.
+ # So if number_of_dimensions was specified, we manually need to ensure
+ # that only the requested number of dimensions
+ # (number of eigenvectors and eigenvalues, respectively) is returned.
+ eigvecs = eigvecs[:, :number_of_dimensions]
+ eigvals = eigvals[:number_of_dimensions]
+ proportion_explained = proportion_explained[:number_of_dimensions]
+
+ # Scale eigenvalues to have length = sqrt(eigenvalue). This
+ # works because np.linalg.eigh returns normalized
+ # eigenvectors. Each row contains the coordinates of the
+ # objects in the space of principal coordinates. Note that at
+ # least one eigenvalue is zero because only n-1 axes are
+ # needed to represent n points in a euclidean space.
coordinates = eigvecs * np.sqrt(eigvals)
- proportion_explained = eigvals / eigvals.sum()
- axis_labels = ['PC%d' % i for i in range(1, eigvals.size + 1)]
+ axis_labels = ["PC%d" % i for i in range(1, number_of_dimensions + 1)]
return OrdinationResults(
- short_method_name='PCoA',
- long_method_name='Principal Coordinate Analysis',
+ short_method_name="PCoA",
+ long_method_name=long_method_name,
eigvals=pd.Series(eigvals, index=axis_labels),
samples=pd.DataFrame(coordinates, index=distance_matrix.ids,
columns=axis_labels),
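
The trace shortcut used in the fsvd branch above can be checked numerically. A small sketch with a made-up matrix:

import numpy as np
from skbio.stats.ordination import center_distance_matrix

# for any square matrix, the trace equals the sum of its eigenvalues, so the
# fsvd branch can report proportions without computing every eigenvalue
rng = np.random.RandomState(7)
a = rng.rand(6, 6)
a = (a + a.T) / 2
np.fill_diagonal(a, 0.0)

centered = center_distance_matrix(a)
print(np.trace(centered))
print(np.linalg.eigvalsh(centered).sum())   # agrees up to floating-point error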
@@ -144,6 +214,130 @@ def pcoa(distance_matrix):
index=axis_labels))
+def _fsvd(centered_distance_matrix, number_of_dimensions=10):
+ """
+ Performs singular value decomposition, or more specifically in
+ this case eigendecomposition, using the fast heuristic algorithm
+ nicknamed "FSVD" (FastSVD), adapted and optimized from the algorithm
+ described by Halko et al. (2011).
+
+ Parameters
+ ----------
+ centered_distance_matrix : np.array
+ Numpy matrix representing the distance matrix for which the
+ eigenvectors and eigenvalues shall be computed
+ number_of_dimensions : int
+ Number of dimensions to keep. Must be lower than or equal to the
+ rank of the given distance_matrix.
+
+ Returns
+ -------
+ np.array
+ Array of eigenvalues, number_of_dimensions in total.
+ np.array
+ Array of eigenvectors, one column per retained dimension
+ (number_of_dimensions columns in total).
+
+ Notes
+ -----
+ The algorithm is based on 'An Algorithm for the Principal
+ Component analysis of Large Data Sets'
+ by N. Halko, P.G. Martinsson, Y. Shkolnisky, and M. Tygert.
+ Original Paper: https://arxiv.org/abs/1007.5510
+
+ Ported from MATLAB implementation described here:
+ https://stats.stackexchange.com/a/11934/211065
+ """
+
+ m, n = centered_distance_matrix.shape
+
+ # Number of levels of the Krylov method to use.
+ # For most applications, num_levels=1 or num_levels=2 is sufficient.
+ num_levels = 1
+
+ # Changes the power of the spectral norm, thus minimizing the error.
+ use_power_method = False
+
+ # Note: a (conjugate) transpose is removed for performance, since we
+ # only expect square matrices.
+ if m != n:
+ raise ValueError('FSVD expects square distance matrix')
+
+ if number_of_dimensions > m or number_of_dimensions > n:
+ raise ValueError('FSVD: number_of_dimensions cannot be larger than'
+ ' the dimensionality of the given distance matrix.')
+
+ if number_of_dimensions < 0:
+ raise ValueError('Invalid operation: cannot reduce distance matrix '
+ 'to negative dimensions using PCoA. Did you intend '
+ 'to specify the default value "0", which sets '
+ 'the number_of_dimensions equal to the '
+ 'dimensionality of the given distance matrix?')
+
+ k = number_of_dimensions + 2
+
+ # Form a real nxl matrix G whose entries are independent, identically
+ # distributed Gaussian random variables of zero mean and unit variance
+ G = standard_normal(size=(n, k))
+
+ if use_power_method:
+ # use only the given exponent
+ H = dot(centered_distance_matrix, G)
+
+ for x in range(2, num_levels + 2):
+ # enhance decay of singular values
+ # note: distance_matrix is no longer transposed, saves work
+ # since we're expecting symmetric, square matrices anyway
+ # (Daniel McDonald's changes)
+ H = dot(centered_distance_matrix, dot(centered_distance_matrix, H))
+
+ else:
+ # compute the m x l matrices H^{(0)}, ..., H^{(i)}
+ # Note that this is done implicitly in each iteration below.
+ H = dot(centered_distance_matrix, G)
+ # to enhance performance
+ H = hstack(
+ (H,
+ dot(centered_distance_matrix, dot(centered_distance_matrix, H))))
+ for x in range(3, num_levels + 2):
+ tmp = dot(centered_distance_matrix,
+ dot(centered_distance_matrix, H))
+
+ H = hstack(
+ (H, dot(centered_distance_matrix,
+ dot(centered_distance_matrix, tmp))))
+
+ # Using the QR-decomposition, form a real m * ((i+1)l) matrix Q
+ # whose columns are orthonormal, s.t. there exists a real
+ # ((i+1)l) * ((i+1)l) matrix R for which H = QR
+ Q, R = qr(H)
+
+ # Compute the n * ((i+1)l) product matrix T = A^T Q
+ T = dot(centered_distance_matrix, Q) # step 3
+
+ # Form an SVD of T
+ Vt, St, W = svd(T, full_matrices=False)
+ W = W.transpose()
+
+ # Compute the m * ((i+1)l) product matrix
+ Ut = dot(Q, W)
+
+ U_fsvd = Ut[:, :number_of_dimensions]
+
+ S = St[:number_of_dimensions]
+
+ # drop imaginary component, if we got one
+ # Note:
+ # In cogent, after computing eigenvalues/vectors, the imaginary part
+ # is dropped, if any. We know for a fact that the eigenvalues are
+ # real, so that's not necessary, but eigenvectors can in principle
+ # be complex (see for example
+ # http://math.stackexchange.com/a/47807/109129 for details)
+ eigenvalues = S.real
+ eigenvectors = U_fsvd.real
+
+ return eigenvalues, eigenvectors
+
+
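
The randomized projection at the heart of _fsvd can be illustrated standalone. The following simplified sketch (single power level, no block Krylov accumulation, made-up spectrum) compares the approximation against an exact eigendecomposition:

import numpy as np

rng = np.random.RandomState(0)
n, d, oversample = 60, 3, 2

# symmetric test matrix with a geometrically decaying spectrum; in pcoa this
# role is played by the centered distance matrix
q0, _ = np.linalg.qr(rng.standard_normal((n, n)))
true_vals = 10.0 * 0.7 ** np.arange(n)
a = (q0 * true_vals).dot(q0.T)

# randomized range finder: push a few random vectors through powers of the
# matrix, then orthonormalize to get a basis for the dominant subspace
g = rng.standard_normal((n, d + oversample))
h = a.dot(a.dot(a.dot(g)))
q, _ = np.linalg.qr(h)

# project the matrix onto that basis and take an SVD of the small factor
_, s, _ = np.linalg.svd(a.dot(q), full_matrices=False)

print(s[:d])          # approximate top eigenvalues
print(true_vals[:d])  # exact ones; the two should be close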
@experimental(as_of="0.5.3")
def pcoa_biplot(ordination, y):
"""Compute the projection of descriptors into a PCoA matrix
@@ -171,7 +365,7 @@ def pcoa_biplot(ordination, y):
# acknowledge that most saved ordinations lack a name, however if they have
# a name, it should be PCoA
if (ordination.short_method_name != '' and
- ordination.short_method_name != 'PCoA'):
+ ordination.short_method_name != 'PCoA'):
raise ValueError('This biplot computation can only be performed in a '
'PCoA matrix.')
=====================================
skbio/stats/ordination/_utils.py
=====================================
@@ -196,3 +196,86 @@ def f_matrix(E_matrix):
col_means = E_matrix.mean(axis=0, keepdims=True)
matrix_mean = E_matrix.mean()
return E_matrix - row_means - col_means + matrix_mean
+
+
+def center_distance_matrix(distance_matrix, inplace=False):
+ """
+ Centers a distance matrix.
+
+ Note: If the distance used was euclidean, pairwise distances
+ needn't be computed from the data table Y because F_matrix =
+ Y.dot(Y.T) (if Y has been centered).
+ But since we're expecting distance_matrix to be non-euclidean,
+ we do the following computation as per
+ Numerical Ecology (Legendre & Legendre 1998).
+
+ Parameters
+ ----------
+ distance_matrix : 2D array_like
+ Distance matrix.
+ inplace : bool, optional
+ Whether or not to center the given distance matrix in-place, which
+ is more efficient in terms of memory and computation.
+ """
+ if inplace:
+ return _f_matrix_inplace(_e_matrix_inplace(distance_matrix))
+ else:
+ return f_matrix(e_matrix(distance_matrix))
+
+
+def _e_matrix_inplace(distance_matrix):
+ """
+ Compute E matrix from a distance matrix inplace.
+ Squares the input element-wise and divides it by -2. Eq. 9.20 in
+ Legendre & Legendre 1998.
+
+ Modified from :func:`skbio.stats.ordination.e_matrix` function,
+ performing row-wise operations to avoid excessive memory allocations.
+
+ Parameters
+ ----------
+ distance_matrix : 2D array_like
+ Distance matrix.
+ """
+ distance_matrix = distance_matrix.astype(np.float)
+
+ for i in np.arange(len(distance_matrix)):
+ distance_matrix[i] = (distance_matrix[i] * distance_matrix[i]) / -2
+ return distance_matrix
+
+
+def _f_matrix_inplace(e_matrix):
+ """
+ Compute F matrix from E matrix inplace.
+ Centering step: for each element, the means of the corresponding
+ row and column are subtracted, and the mean of the whole
+ matrix is added. Eq. 9.21 in Legendre & Legendre 1998.
+
+ Modified from :func:`skbio.stats.ordination.f_matrix` function,
+ performing row-wise operations to avoid excessive memory allocations.
+
+ Parameters
+ ----------
+ e_matrix : 2D array_like
+ A matrix representing the "E matrix" as described above.
+ """
+ e_matrix = e_matrix.astype(np.float)
+
+ row_means = np.zeros(len(e_matrix), dtype=float)
+ col_means = np.zeros(len(e_matrix), dtype=float)
+ matrix_mean = 0.0
+
+ for i in np.arange(len(e_matrix)):
+ row_means[i] = e_matrix[i].mean()
+ matrix_mean += e_matrix[i].sum()
+ col_means += e_matrix[i]
+ matrix_mean /= len(e_matrix) ** 2
+ col_means /= len(e_matrix)
+
+ for i in np.arange(len(e_matrix)):
+ v = e_matrix[i]
+ v -= row_means[i]
+ v -= col_means
+ v += matrix_mean
+ e_matrix[i] = v
+ return e_matrix
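
The in-place helpers above can be cross-checked against the textbook double-centering formula F = -1/2 * J (D o D) J with J = I - (1/n) 1 1^T (Legendre & Legendre 1998). A small sketch with a made-up matrix:

import numpy as np
from skbio.stats.ordination import center_distance_matrix

rng = np.random.RandomState(42)
d = rng.rand(5, 5)
d = (d + d.T) / 2
np.fill_diagonal(d, 0.0)

n = d.shape[0]
j = np.eye(n) - np.ones((n, n)) / n
expected = -0.5 * j.dot(d * d).dot(j)

# both code paths should reproduce the closed-form double-centering
np.testing.assert_allclose(center_distance_matrix(d), expected)
np.testing.assert_allclose(center_distance_matrix(d.copy(), inplace=True),
                           expected)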
=====================================
skbio/stats/ordination/tests/data/PCoA_sample_data_12dim
=====================================
@@ -0,0 +1,13 @@
+ 1 2 3 4 5 6 7 8 9 10 11 12
+1 0.0 0.909636950834479 0.9869428157422727 0.7190893636533138 0.6960053958431593 0.6477238596907942 0.30557161358653495 0.8946966124829346 0.12780699944110363 0.39915339691165386 0.7641239434153432 0.6248070796484706
+2 0.909636950834479 0.0 0.3871354292850083 0.6881160960373599 0.5550593584027527 0.5855786600007656 0.4843215561734061 0.20448304199758327 0.4067028703340123 0.2754701840044086 0.6269219445617967 0.05629366991581264
+3 0.9869428157422727 0.3871354292850083 0.0 0.6088130886750466 0.8611896463567201 0.2815827949525225 0.6500535832888426 0.8196046614443331 0.356410088497226 0.05164123821334954 0.7110953188954077 0.32855281988632934
+4 0.7190893636533138 0.6881160960373599 0.6088130886750466 0.0 0.7453215102240474 0.9916540031629704 0.14394284428694282 0.8388378539413649 0.15115603934799038 0.13871462268568635 0.1934605692727246 0.9804118301943398
+5 0.6960053958431593 0.5550593584027527 0.8611896463567201 0.7453215102240474 0.0 0.7996611932937304 0.30579824243478326 0.5227960305398314 0.8564730629853469 0.7786384040043949 0.06843106040159719 0.7715973816221341
+6 0.6477238596907942 0.5855786600007656 0.2815827949525225 0.9916540031629704 0.7996611932937304 0.0 0.8869204659721949 0.1619378942802252 0.10200764546980268 0.17805335055828198 0.8796559972720953 0.20933243431218862
+7 0.30557161358653495 0.4843215561734061 0.6500535832888426 0.14394284428694282 0.30579824243478326 0.8869204659721949 0.0 0.9489770186788868 0.9531210051205121 0.5834385348164726 0.31984891724102216 0.5852822100268925
+8 0.8946966124829346 0.20448304199758327 0.8196046614443331 0.8388378539413649 0.5227960305398314 0.1619378942802252 0.9489770186788868 0.0 0.16681381452925792 0.9281238242741929 0.604480007052297 0.43978806925866687
+9 0.12780699944110363 0.4067028703340123 0.356410088497226 0.15115603934799038 0.8564730629853469 0.10200764546980268 0.9531210051205121 0.16681381452925792 0.0 0.10766387594368387 0.9552101788877516 0.6135541732435132
+10 0.39915339691165386 0.2754701840044086 0.05164123821334954 0.13871462268568635 0.7786384040043949 0.17805335055828198 0.5834385348164726 0.9281238242741929 0.10766387594368387 0.0 0.4157921207508062 0.31997143485314194
+11 0.7641239434153432 0.6269219445617967 0.7110953188954077 0.1934605692727246 0.06843106040159719 0.8796559972720953 0.31984891724102216 0.604480007052297 0.9552101788877516 0.4157921207508062 0.0 0.11189976154475101
+12 0.6248070796484706 0.05629366991581264 0.32855281988632934 0.9804118301943398 0.7715973816221341 0.20933243431218862 0.5852822100268925 0.43978806925866687 0.6135541732435132 0.31997143485314194 0.11189976154475101 0.0
=====================================
skbio/stats/ordination/tests/data/PCoA_sample_data_3_eigh_ref_3dim
=====================================
@@ -0,0 +1,22 @@
+Eigvals 3
+0.5123672604605051 0.30071909442702155 0.26791206600414047
+
+Proportion explained 3
+0.26757383277657976 0.15704469604990076 0.13991186377402365
+
+Species 0 0
+
+Site 9 3
+PC.636 -0.2584654611828421 0.17399954688273872 -0.03828757925519412
+PC.635 -0.27100113539100934 -0.018595131906339258 0.08648419263485663
+PC.356 0.23507789817473093 0.09625192544887005 0.34579272671386985
+PC.481 0.026140766432533755 -0.011145967653319655 -0.14766060301460787
+PC.354 0.2850075522831216 -0.019254988848331687 -0.062326337538532166
+PC.593 0.20463632624145514 -0.13936115093164073 -0.29151381962286704
+PC.355 0.23348240321199026 0.22525797406849954 0.018862309626814944
+PC.607 -0.09496319113225934 -0.4209748024953033 0.15486945486941445
+PC.634 -0.35991515863772167 0.11382259543482587 -0.06622034441375392
+
+Biplot 0 0
+
+Site constraints 0 0
=====================================
skbio/stats/ordination/tests/data/PCoA_sample_data_3_fsvd_ref_3dim
=====================================
@@ -0,0 +1,22 @@
+Eigvals 3
+0.5123672604605054 0.3007190944270222 0.2679120660041405
+
+Proportion explained 3
+0.2675738327765797 0.15704469604990098 0.13991186377402356
+
+Species 0 0
+
+Site 9 3
+PC.636 -0.2584654611828421 0.17399954688273822 -0.03828757925519378
+PC.635 -0.2710011353910087 -0.018595131906339074 0.08648419263485721
+PC.356 0.23507789817473107 0.0962519254488692 0.3457927267138707
+PC.481 0.026140766432533644 -0.011145967653318808 -0.14766060301460796
+PC.354 0.28500755228312136 -0.019254988848331694 -0.06232633753853236
+PC.593 0.20463632624145525 -0.13936115093164014 -0.2915138196228672
+PC.355 0.23348240321199093 0.22525797406849954 0.018862309626815132
+PC.607 -0.09496319113225955 -0.4209748024953044 0.1548694548694131
+PC.634 -0.35991515863772144 0.1138225954348267 -0.06622034441375402
+
+Biplot 0 0
+
+Site constraints 0 0
=====================================
skbio/stats/ordination/tests/test_principal_coordinate_analysis.py
=====================================
@@ -6,9 +6,9 @@
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
-import pandas as pd
import numpy as np
import numpy.testing as npt
+import pandas as pd
from copy import deepcopy
from unittest import TestCase, main
@@ -27,7 +27,7 @@ class TestPCoA(TestCase):
def test_simple(self):
eigvals = [0.51236726, 0.30071909, 0.26791207, 0.20898868,
- 0.19169895, 0.16054235, 0.15017696, 0.12245775,
+ 0.19169895, 0.16054235, 0.15017696, 0.12245775,
0.0]
proportion_explained = [0.2675738328, 0.157044696, 0.1399118638,
0.1091402725, 0.1001110485,
@@ -53,6 +53,73 @@ class TestPCoA(TestCase):
assert_ordination_results_equal(results, expected_results,
ignore_directionality=True)
+ def test_fsvd_inplace(self):
+ dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+ dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+
+ expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
+ inplace=True)
+
+ results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
+ inplace=True)
+
+ assert_ordination_results_equal(results, expected_results,
+ ignore_directionality=True,
+ ignore_method_names=True)
+
+ def test_fsvd(self):
+ dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+ dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+ dm3 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+
+ # Test eigh vs. fsvd pcoa and inplace parameter
+ expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
+ inplace=False)
+
+ results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
+ inplace=False)
+
+ results_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
+ inplace=True)
+
+ assert_ordination_results_equal(results, expected_results,
+ ignore_directionality=True,
+ ignore_method_names=True)
+
+ assert_ordination_results_equal(results, results_inplace,
+ ignore_directionality=True,
+ ignore_method_names=True)
+
+ # Test number_of_dimensions edge cases
+ results2 = pcoa(dm3, method="fsvd", number_of_dimensions=0,
+ inplace=False)
+ expected_results2 = pcoa(dm3, method="fsvd",
+ number_of_dimensions=dm3.data.shape[0],
+ inplace=False)
+
+ assert_ordination_results_equal(results2, expected_results2,
+ ignore_directionality=True,
+ ignore_method_names=True)
+
+ with self.assertRaises(ValueError):
+ dim_too_large = dm1.data.shape[0] + 10
+ pcoa(dm2, method="fsvd", number_of_dimensions=dim_too_large)
+
+ with self.assertRaises(ValueError):
+ pcoa(dm2, method="fsvd", number_of_dimensions=-1)
+
+ with self.assertRaises(ValueError):
+ dim_too_large = dm1.data.shape[0] + 10
+ pcoa(dm2, method="eigh", number_of_dimensions=dim_too_large)
+
+ with self.assertRaises(ValueError):
+ pcoa(dm2, method="eigh", number_of_dimensions=-1)
+
+ dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
+ with self.assertWarnsRegex(RuntimeWarning,
+ "no value for number_of_dimensions"):
+ pcoa(dm_big, method="fsvd", number_of_dimensions=0)
+
def test_extensive(self):
eigvals = [0.3984635, 0.36405689, 0.28804535, 0.27479983,
0.19165361, 0.0]
@@ -203,15 +270,17 @@ class TestPCoABiplot(TestCase):
self.descriptors.index = pd.Index(new_index)
with self.assertRaisesRegex(ValueError, 'The eigenvectors and the '
- 'descriptors must describe the same '
- 'samples.'):
+ 'descriptors must describe '
+ 'the same '
+ 'samples.'):
pcoa_biplot(self.ordination, self.descriptors)
def test_not_a_pcoa(self):
self.ordination.short_method_name = 'RDA'
self.ordination.long_method_name = 'Redundancy Analysis'
with self.assertRaisesRegex(ValueError, 'This biplot computation can'
- ' only be performed in a PCoA matrix.'):
+ ' only be performed in a '
+ 'PCoA matrix.'):
pcoa_biplot(self.ordination, self.descriptors)
def test_from_seralized_results(self):
=====================================
skbio/stats/ordination/tests/test_util.py
=====================================
@@ -6,12 +6,16 @@
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
+from unittest import TestCase, main
+import copy
+
import numpy as np
import numpy.testing as npt
-from unittest import TestCase, main
+from skbio.stats.ordination import corr, mean_and_std, e_matrix, f_matrix, \
+ center_distance_matrix
-from skbio.stats.ordination import corr, mean_and_std, e_matrix, f_matrix
+from skbio.stats.ordination._utils import _e_matrix_inplace, _f_matrix_inplace
class TestUtils(TestCase):
@@ -51,7 +55,7 @@ class TestUtils(TestCase):
def test_e_matrix(self):
E = e_matrix(self.matrix)
- expected_E = np.array([[-0.5, -2., -4.5],
+ expected_E = np.array([[-0.5, -2., -4.5],
[-8., -12.5, -18.]])
npt.assert_almost_equal(E, expected_E)
@@ -61,6 +65,37 @@ class TestUtils(TestCase):
# Note that `test_make_F_matrix` in cogent is wrong
npt.assert_almost_equal(F, expected_F)
+ def test_e_matrix_inplace(self):
+ E = _e_matrix_inplace(self.matrix)
+ expected_E = np.array([[-0.5, -2., -4.5],
+ [-8., -12.5, -18.]])
+ npt.assert_almost_equal(E, expected_E)
+
+ def test_f_matrix_inplace(self):
+ F = _f_matrix_inplace(self.matrix2)
+ expected_F = np.zeros((3, 3))
+ npt.assert_almost_equal(F, expected_F)
+
+ def test_center_distance_matrix_inplace(self):
+ dm_expected = f_matrix(e_matrix(self.small_mat))
+
+ # make a copy of the matrix to test non-inplace centering
+ matrix_copy = copy.deepcopy(self.small_mat)
+ dm_centered = center_distance_matrix(matrix_copy, inplace=False)
+
+ # ensure that matrix_copy was NOT modified inplace
+ self.assertTrue(np.array_equal(matrix_copy, self.small_mat))
+
+ # and ensure that the result of centering was correct
+ npt.assert_almost_equal(dm_expected, dm_centered)
+
+ # next, center the same matrix inplace
+ matrix_copy2 = copy.deepcopy(self.small_mat)
+ dm_centered_inp = center_distance_matrix(matrix_copy2, inplace=True)
+
+ # and ensure that the result of inplace centering was correct
+ npt.assert_almost_equal(dm_expected, dm_centered_inp)
+
if __name__ == '__main__':
main()
=====================================
skbio/util/_testing.py
=====================================
@@ -6,9 +6,9 @@
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
-import os
-import inspect
import warnings
+import inspect
+import os
import nose
import numpy as np
@@ -80,6 +80,7 @@ class TestRunner:
and ugly. This class invokes nose with the required options.
"""
+
@experimental(as_of="0.4.0")
def __init__(self, filename):
self._filename = filename
@@ -279,7 +280,7 @@ def _normalize_signs(arr1, arr2):
raise ValueError(
"Arrays must have the same shape ({0} vs {1}).".format(arr1.shape,
arr2.shape)
- )
+ )
# To avoid issues around zero, we'll compare signs of the values
# with highest absolute value
View it on GitLab: https://salsa.debian.org/med-team/python-skbio/compare/3d3c45f3254533f7a7055537dba85e12078e2138...59737a89a9cfea9e36f5487d037b6dc5e014ee74
You're receiving this email because of your account on salsa.debian.org.