[med-svn] [Git][med-team/python-skbio][upstream] New upstream version 0.5.4

Andreas Tille gitlab at salsa.debian.org
Tue Aug 28 18:48:37 BST 2018


Andreas Tille pushed to branch upstream at Debian Med / python-skbio


Commits:
ee541afd by Andreas Tille at 2018-08-28T16:13:43Z
New upstream version 0.5.4
- - - - -


11 changed files:

- CHANGELOG.md
- skbio/__init__.py
- skbio/stats/ordination/__init__.py
- skbio/stats/ordination/_principal_coordinate_analysis.py
- skbio/stats/ordination/_utils.py
- + skbio/stats/ordination/tests/data/PCoA_sample_data_12dim
- + skbio/stats/ordination/tests/data/PCoA_sample_data_3_eigh_ref_3dim
- + skbio/stats/ordination/tests/data/PCoA_sample_data_3_fsvd_ref_3dim
- skbio/stats/ordination/tests/test_principal_coordinate_analysis.py
- skbio/stats/ordination/tests/test_util.py
- skbio/util/_testing.py


Changes:

=====================================
CHANGELOG.md
=====================================
@@ -1,5 +1,27 @@
 # scikit-bio changelog
 
+## Version 0.5.4 (2018-08-23)
+
+### Features
+
+* Added `FSVD`, an alternative fast heuristic method to perform Principal Coordinates Analysis, to `skbio.stats.ordination.pcoa`.
+
+### Backward-incompatible changes [stable]
+
+### Backward-incompatible changes [experimental]
+
+### Performance enhancements
+
+* Added optimized utility methods `f_matrix_inplace` and `e_matrix_inplace` which perform `f_matrix` and `e_matrix` computations in-place and are used by the new `center_distance_matrix` method in `skbio.stats.ordination`.
+
+### Bug fixes
+
+### Deprecated functionality [stable]
+
+### Deprecated functionality [experimental]
+
+### Miscellaneous
+
 ## Version 0.5.3 (2018-08-07)
 
 ### Features


=====================================
skbio/__init__.py
=====================================
@@ -26,7 +26,7 @@ __all__ = ['Sequence', 'DNA', 'RNA', 'Protein', 'GeneticCode',
            'TreeNode', 'nj', 'read', 'write', 'OrdinationResults']
 
 __credits__ = "https://github.com/biocore/scikit-bio/graphs/contributors"
-__version__ = "0.5.3"
+__version__ = "0.5.4"
 
 mottos = [
     # 03/15/2014


=====================================
skbio/stats/ordination/__init__.py
=====================================
@@ -129,10 +129,11 @@ from ._correspondence_analysis import ca
 from ._canonical_correspondence_analysis import cca
 from ._principal_coordinate_analysis import pcoa, pcoa_biplot
 from ._ordination_results import OrdinationResults
-from ._utils import (mean_and_std, scale, svd_rank, corr, e_matrix, f_matrix)
+from ._utils import (mean_and_std, scale, svd_rank, corr, e_matrix, f_matrix,
+                     center_distance_matrix)
 
 __all__ = ['ca', 'rda', 'cca', 'pcoa', 'pcoa_biplot', 'OrdinationResults',
            'mean_and_std', 'scale', 'svd_rank', 'corr',
-           'e_matrix', 'f_matrix']
+           'e_matrix', 'f_matrix', 'center_distance_matrix']
 
 test = TestRunner(__file__).test


=====================================
skbio/stats/ordination/_principal_coordinate_analysis.py
=====================================
@@ -6,33 +6,29 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from warnings import warn
-
-import pandas as pd
 import numpy as np
+import pandas as pd
+from numpy import dot, hstack
+from numpy.linalg import qr, svd
+from numpy.random import standard_normal
 from scipy.linalg import eigh
+from warnings import warn
 
 from skbio.stats.distance import DistanceMatrix
 from skbio.util._decorator import experimental
 from ._ordination_results import OrdinationResults
-from ._utils import e_matrix, f_matrix, scale
-
-# - In cogent, after computing eigenvalues/vectors, the imaginary part
-#   is dropped, if any. We know for a fact that the eigenvalues are
-#   real, so that's not necessary, but eigenvectors can in principle
-#   be complex (see for example
-#   http://math.stackexchange.com/a/47807/109129 for details) and in
-#   that case dropping the imaginary part means they'd no longer be
-#   so, so I'm not doing that.
+from ._utils import center_distance_matrix, scale
 
 
 @experimental(as_of="0.4.0")
-def pcoa(distance_matrix):
+def pcoa(distance_matrix, method="eigh", number_of_dimensions=0,
+         inplace=False):
     r"""Perform Principal Coordinate Analysis.
 
-    Principal Coordinate Analysis (PCoA) is a method similar to PCA
-    that works from distance matrices, and so it can be used with
-    ecologically meaningful distances like UniFrac for bacteria.
+    Principal Coordinate Analysis (PCoA) is a method similar
+    to Principal Components Analysis (PCA) with the difference that PCoA
+    operates on distance matrices, typically with non-euclidian and thus
+    ecologically meaningful distances like UniFrac in microbiome research.
 
     In ecology, the euclidean distance preserved by Principal
     Component Analysis (PCA) is often not a good choice because it
@@ -44,10 +40,36 @@ def pcoa(distance_matrix):
     species is present in two sites, that means that the sites are
     similar.).
 
+    Note that the returned eigenvectors are not normalized to unit length.
+
     Parameters
     ----------
     distance_matrix : DistanceMatrix
         A distance matrix.
+    method : str, optional
+        Eigendecomposition method to use in performing PCoA.
+        By default, uses SciPy's `eigh`, which computes exact
+        eigenvectors and eigenvalues for all dimensions. The alternate
+        method, `fsvd`, uses faster heuristic eigendecomposition but loses
+        accuracy. The magnitude of accuracy lost is dependent on dataset.
+    number_of_dimensions : int, optional
+        Dimensions to reduce the distance matrix to. This number determines
+        how many eigenvectors and eigenvalues will be returned.
+        By default, equal to the number of dimensions of the distance matrix,
+        as default eigendecomposition using SciPy's `eigh` method computes
+        all eigenvectors and eigenvalues. If using fast heuristic
+        eigendecomposition through `fsvd`, a desired number of dimensions
+        should be specified. Note that the default eigendecomposition
+        method `eigh` does not natively support a specifying number of
+        dimensions to reduce a matrix to, so if this parameter is specified,
+        all eigenvectors and eigenvalues will be simply be computed with no
+        speed gain, and only the number specified by `number_of_dimensions`
+        will be returned. Specifying a value of `0`, the default, will
+        set `number_of_dimensions` equal to the number of dimensions of the
+        specified `distance_matrix`.
+    inplace : bool, optional
+        If true, centers a distance matrix in-place in a manner that reduces
+        memory consumption.
 
     Returns
     -------
@@ -62,37 +84,56 @@ def pcoa(distance_matrix):
 
     Notes
     -----
-    It is sometimes known as metric multidimensional scaling or
-    classical scaling.
-
-    .. note::
-
-       If the distance is not euclidean (for example if it is a
-       semimetric and the triangle inequality doesn't hold),
-       negative eigenvalues can appear. There are different ways
-       to deal with that problem (see Legendre & Legendre 1998, \S
-       9.2.3), but none are currently implemented here.
-
-       However, a warning is raised whenever negative eigenvalues
-       appear, allowing the user to decide if they can be safely
-       ignored.
+    .. note:: If the distance is not euclidean (for example if it is a
+        semimetric and the triangle inequality doesn't hold),
+        negative eigenvalues can appear. There are different ways
+        to deal with that problem (see Legendre & Legendre 1998, \S
+        9.2.3), but none are currently implemented here.
+        However, a warning is raised whenever negative eigenvalues
+        appear, allowing the user to decide if they can be safely
+        ignored.
     """
     distance_matrix = DistanceMatrix(distance_matrix)
 
-    E_matrix = e_matrix(distance_matrix.data)
-
-    # If the used distance was euclidean, pairwise distances
-    # needn't be computed from the data table Y because F_matrix =
-    # Y.dot(Y.T) (if Y has been centred).
-    F_matrix = f_matrix(E_matrix)
-
-    # If the eigendecomposition ever became a bottleneck, it could
-    # be replaced with an iterative version that computes the
-    # largest k eigenvectors.
-    eigvals, eigvecs = eigh(F_matrix)
-
-    # eigvals might not be ordered, so we order them (at least one
-    # is zero). cogent makes eigenvalues positive by taking the
+    # Center distance matrix, a requirement for PCoA here
+    matrix_data = center_distance_matrix(distance_matrix.data, inplace=inplace)
+
+    # If no dimension specified, by default will compute all eigenvectors
+    # and eigenvalues
+    if number_of_dimensions == 0:
+        if method == "fsvd" and matrix_data.shape[0] > 10:
+            warn("FSVD: since no value for number_of_dimensions is specified, "
+                 "PCoA for all dimensions will be computed, which may "
+                 "result in long computation time if the original "
+                 "distance matrix is large.", RuntimeWarning)
+
+        # distance_matrix is guaranteed to be square
+        number_of_dimensions = matrix_data.shape[0]
+    elif number_of_dimensions < 0:
+        raise ValueError('Invalid operation: cannot reduce distance matrix '
+                         'to negative dimensions using PCoA. Did you intend '
+                         'to specify the default value "0", which sets '
+                         'the number_of_dimensions equal to the '
+                         'dimensionality of the given distance matrix?')
+
+    # Perform eigendecomposition
+    if method == "eigh":
+        # eigh does not natively support specifying number_of_dimensions, i.e.
+        # there are no speed gains unlike in FSVD. Later, we slice off unwanted
+        # dimensions to conform the result of eigh to the specified
+        # number_of_dimensions.
+
+        eigvals, eigvecs = eigh(matrix_data)
+        long_method_name = "Principal Coordinate Analysis"
+    elif method == "fsvd":
+        eigvals, eigvecs = _fsvd(matrix_data, number_of_dimensions)
+        long_method_name = "Approximate Principal Coordinate Analysis " \
+                           "using FSVD"
+    else:
+        raise ValueError(
+            "PCoA eigendecomposition method {} not supported.".format(method))
+
+    # cogent makes eigenvalues positive by taking the
     # abs value, but that doesn't seem to be an approach accepted
     # by L&L to deal with negative eigenvalues. We raise a warning
     # in that case. First, we make values close to 0 equal to 0.
@@ -109,18 +150,14 @@ def pcoa(distance_matrix):
             " {0} and the largest is {1}.".format(eigvals.min(),
                                                   eigvals.max()),
             RuntimeWarning
-            )
+        )
+
+    # eigvals might not be ordered, so we first sort them, then analogously
+    # sort the eigenvectors by the ordering of the eigenvalues too
     idxs_descending = eigvals.argsort()[::-1]
     eigvals = eigvals[idxs_descending]
     eigvecs = eigvecs[:, idxs_descending]
 
-    # Scale eigenvalues to have lenght = sqrt(eigenvalue). This
-    # works because np.linalg.eigh returns normalized
-    # eigenvectors. Each row contains the coordinates of the
-    # objects in the space of principal coordinates. Note that at
-    # least one eigenvalue is zero because only n-1 axes are
-    # needed to represent n points in an euclidean space.
-
     # If we return only the coordinates that make sense (i.e., that have a
     # corresponding positive eigenvalue), then Jackknifed Beta Diversity
     # won't work as it expects all the OrdinationResults to have the same
@@ -130,13 +167,46 @@ def pcoa(distance_matrix):
     eigvecs[:, num_positive:] = np.zeros(eigvecs[:, num_positive:].shape)
     eigvals[num_positive:] = np.zeros(eigvals[num_positive:].shape)
 
+    if method == "fsvd":
+        # Since the dimension parameter, hereafter referred to as 'd',
+        # restricts the number of eigenvalues and eigenvectors that FSVD
+        # computes, we need to use an alternative method to compute the sum
+        # of all eigenvalues, used to compute the array of proportions
+        # explained. Otherwise, the proportions calculated will only be
+        # relative to d number of dimensions computed; whereas we want
+        # it to be relative to the entire dimensionality of the
+        # centered distance matrix.
+
+        # An alternative method of calculating th sum of eigenvalues is by
+        # computing the trace of the centered distance matrix.
+        # See proof outlined here: https://goo.gl/VAYiXx
+        sum_eigenvalues = np.trace(matrix_data)
+    else:
+        # Calculate proportions the usual way
+        sum_eigenvalues = np.sum(eigvals)
+
+    proportion_explained = eigvals / sum_eigenvalues
+
+    # In case eigh is used, eigh computes all eigenvectors and -values.
+    # So if number_of_dimensions was specified, we manually need to ensure
+    # only the requested number of dimensions
+    # (number of eigenvectors and eigenvalues, respectively) are returned.
+    eigvecs = eigvecs[:, :number_of_dimensions]
+    eigvals = eigvals[:number_of_dimensions]
+    proportion_explained = proportion_explained[:number_of_dimensions]
+
+    # Scale eigenvalues to have length = sqrt(eigenvalue). This
+    # works because np.linalg.eigh returns normalized
+    # eigenvectors. Each row contains the coordinates of the
+    # objects in the space of principal coordinates. Note that at
+    # least one eigenvalue is zero because only n-1 axes are
+    # needed to represent n points in a euclidean space.
     coordinates = eigvecs * np.sqrt(eigvals)
-    proportion_explained = eigvals / eigvals.sum()
 
-    axis_labels = ['PC%d' % i for i in range(1, eigvals.size + 1)]
+    axis_labels = ["PC%d" % i for i in range(1, number_of_dimensions + 1)]
     return OrdinationResults(
-        short_method_name='PCoA',
-        long_method_name='Principal Coordinate Analysis',
+        short_method_name="PCoA",
+        long_method_name=long_method_name,
         eigvals=pd.Series(eigvals, index=axis_labels),
         samples=pd.DataFrame(coordinates, index=distance_matrix.ids,
                              columns=axis_labels),
@@ -144,6 +214,130 @@ def pcoa(distance_matrix):
                                        index=axis_labels))
 
 
+def _fsvd(centered_distance_matrix, number_of_dimensions=10):
+    """
+    Performs singular value decomposition, or more specifically in
+    this case eigendecomposition, using fast heuristic algorithm
+    nicknamed "FSVD" (FastSVD), adapted and optimized from the algorithm
+    described by Halko et al (2011).
+
+    Parameters
+    ----------
+    centered_distance_matrix : np.array
+       Numpy matrix representing the distance matrix for which the
+       eigenvectors and eigenvalues shall be computed
+    number_of_dimensions : int
+       Number of dimensions to keep. Must be lower than or equal to the
+       rank of the given distance_matrix.
+
+    Returns
+    -------
+    np.array
+       Array of eigenvectors, each with number_of_dimensions length.
+    np.array
+       Array of eigenvalues, a total number of number_of_dimensions.
+
+    Notes
+    -----
+    The algorithm is based on 'An Algorithm for the Principal
+    Component analysis of Large Data Sets'
+    by N. Halko, P.G. Martinsson, Y. Shkolnisky, and M. Tygert.
+    Original Paper: https://arxiv.org/abs/1007.5510
+
+    Ported from MATLAB implementation described here:
+    https://stats.stackexchange.com/a/11934/211065
+    """
+
+    m, n = centered_distance_matrix.shape
+
+    # Number of levels of the Krylov method to use.
+    # For most applications, num_levels=1 or num_levels=2 is sufficient.
+    num_levels = 1
+
+    # Changes the power of the spectral norm, thus minimizing the error).
+    use_power_method = False
+
+    # Note: a (conjugate) transpose is removed for performance, since we
+    # only expect square matrices.
+    if m != n:
+        raise ValueError('FSVD expects square distance matrix')
+
+    if number_of_dimensions > m or number_of_dimensions > n:
+        raise ValueError('FSVD: number_of_dimensions cannot be larger than'
+                         ' the dimensionality of the given distance matrix.')
+
+    if number_of_dimensions < 0:
+        raise ValueError('Invalid operation: cannot reduce distance matrix '
+                         'to negative dimensions using PCoA. Did you intend '
+                         'to specify the default value "0", which sets '
+                         'the number_of_dimensions equal to the '
+                         'dimensionality of the given distance matrix?')
+
+    k = number_of_dimensions + 2
+
+    # Form a real nxl matrix G whose entries are independent, identically
+    # distributed Gaussian random variables of zero mean and unit variance
+    G = standard_normal(size=(n, k))
+
+    if use_power_method:
+        # use only the given exponent
+        H = dot(centered_distance_matrix, G)
+
+        for x in range(2, num_levels + 2):
+            # enhance decay of singular values
+            # note: distance_matrix is no longer transposed, saves work
+            # since we're expecting symmetric, square matrices anyway
+            # (Daniel McDonald's changes)
+            H = dot(centered_distance_matrix, dot(centered_distance_matrix, H))
+
+    else:
+        # compute the m x l matrices H^{(0)}, ..., H^{(i)}
+        # Note that this is done implicitly in each iteration below.
+        H = dot(centered_distance_matrix, G)
+        # to enhance performance
+        H = hstack(
+            (H,
+             dot(centered_distance_matrix, dot(centered_distance_matrix, H))))
+        for x in range(3, num_levels + 2):
+            tmp = dot(centered_distance_matrix,
+                      dot(centered_distance_matrix, H))
+
+            H = hstack(
+                (H, dot(centered_distance_matrix,
+                        dot(centered_distance_matrix, tmp))))
+
+    # Using the pivoted QR-decomposition, form a real m * ((i+1)l) matrix Q
+    # whose columns are orthonormal, s.t. there exists a real
+    # ((i+1)l) * ((i+1)l) matrix R for which H = QR
+    Q, R = qr(H)
+
+    # Compute the n * ((i+1)l) product matrix T = A^T Q
+    T = dot(centered_distance_matrix, Q)  # step 3
+
+    # Form an SVD of T
+    Vt, St, W = svd(T, full_matrices=False)
+    W = W.transpose()
+
+    # Compute the m * ((i+1)l) product matrix
+    Ut = dot(Q, W)
+
+    U_fsvd = Ut[:, :number_of_dimensions]
+
+    S = St[:number_of_dimensions]
+
+    # drop imaginary component, if we got one
+    # Note:
+    #   In cogent, after computing eigenvalues/vectors, the imaginary part
+    #   is dropped, if any. We know for a fact that the eigenvalues are
+    #   real, so that's not necessary, but eigenvectors can in principle
+    #   be complex (see for example
+    #   http://math.stackexchange.com/a/47807/109129 for details)
+    eigenvalues = S.real
+    eigenvectors = U_fsvd.real
+
+    return eigenvalues, eigenvectors
+
+
 @experimental(as_of="0.5.3")
 def pcoa_biplot(ordination, y):
     """Compute the projection of descriptors into a PCoA matrix
@@ -171,7 +365,7 @@ def pcoa_biplot(ordination, y):
     # acknowledge that most saved ordinations lack a name, however if they have
     # a name, it should be PCoA
     if (ordination.short_method_name != '' and
-       ordination.short_method_name != 'PCoA'):
+            ordination.short_method_name != 'PCoA'):
         raise ValueError('This biplot computation can only be performed in a '
                          'PCoA matrix.')
 


=====================================
skbio/stats/ordination/_utils.py
=====================================
@@ -196,3 +196,86 @@ def f_matrix(E_matrix):
     col_means = E_matrix.mean(axis=0, keepdims=True)
     matrix_mean = E_matrix.mean()
     return E_matrix - row_means - col_means + matrix_mean
+
+
+def center_distance_matrix(distance_matrix, inplace=False):
+    """
+    Centers a distance matrix.
+
+    Note: If the used distance was euclidean, pairwise distances
+    needn't be computed from the data table Y because F_matrix =
+    Y.dot(Y.T) (if Y has been centered).
+    But since we're expecting distance_matrix to be non-euclidian,
+    we do the following computation as per
+    Numerical Ecology (Legendre & Legendre 1998).
+
+    Parameters
+    ----------
+    distance_matrix : 2D array_like
+        Distance matrix.
+    inplace : bool, optional
+        Whether or not to center the given distance matrix in-place, which
+        is more efficient in terms of memory and computation.
+    """
+    if inplace:
+        return _f_matrix_inplace(_e_matrix_inplace(distance_matrix))
+    else:
+        return f_matrix(e_matrix(distance_matrix))
+
+
+def _e_matrix_inplace(distance_matrix):
+    """
+    Compute E matrix from a distance matrix inplace.
+    Squares and divides by -2 the input element-wise. Eq. 9.20 in
+    Legendre & Legendre 1998.
+
+    Modified from :func:`skbio.stats.ordination.e_matrix` function,
+    performing row-wise operations to avoid excessive memory allocations.
+
+    Parameters
+    ----------
+    distance_matrix : 2D array_like
+        Distance matrix.
+    """
+    distance_matrix = distance_matrix.astype(np.float)
+
+    for i in np.arange(len(distance_matrix)):
+        distance_matrix[i] = (distance_matrix[i] * distance_matrix[i]) / -2
+    return distance_matrix
+
+
+def _f_matrix_inplace(e_matrix):
+    """
+    Compute F matrix from E matrix inplace.
+    Centering step: for each element, the mean of the corresponding
+    row and column are subtracted, and the mean of the whole
+    matrix is added. Eq. 9.21 in Legendre & Legendre 1998.
+
+    Modified from :func:`skbio.stats.ordination.f_matrix` function,
+    performing row-wise operations to avoid excessive memory allocations.
+
+    Parameters
+    ----------
+    e_matrix : 2D array_like
+        A matrix representing the "E matrix" as described above.
+    """
+    e_matrix = e_matrix.astype(np.float)
+
+    row_means = np.zeros(len(e_matrix), dtype=float)
+    col_means = np.zeros(len(e_matrix), dtype=float)
+    matrix_mean = 0.0
+
+    for i in np.arange(len(e_matrix)):
+        row_means[i] = e_matrix[i].mean()
+        matrix_mean += e_matrix[i].sum()
+        col_means += e_matrix[i]
+    matrix_mean /= len(e_matrix) ** 2
+    col_means /= len(e_matrix)
+
+    for i in np.arange(len(e_matrix)):
+        v = e_matrix[i]
+        v -= row_means[i]
+        v -= col_means
+        v += matrix_mean
+        e_matrix[i] = v
+    return e_matrix


=====================================
skbio/stats/ordination/tests/data/PCoA_sample_data_12dim
=====================================
@@ -0,0 +1,13 @@
+	1	2	3	4	5	6	7	8	9	10	11	12
+1	0.0	0.909636950834479	0.9869428157422727	0.7190893636533138	0.6960053958431593	0.6477238596907942	0.30557161358653495	0.8946966124829346	0.12780699944110363	0.39915339691165386	0.7641239434153432	0.6248070796484706
+2	0.909636950834479	0.0	0.3871354292850083	0.6881160960373599	0.5550593584027527	0.5855786600007656	0.4843215561734061	0.20448304199758327	0.4067028703340123	0.2754701840044086	0.6269219445617967	0.05629366991581264
+3	0.9869428157422727	0.3871354292850083	0.0	0.6088130886750466	0.8611896463567201	0.2815827949525225	0.6500535832888426	0.8196046614443331	0.356410088497226	0.05164123821334954	0.7110953188954077	0.32855281988632934
+4	0.7190893636533138	0.6881160960373599	0.6088130886750466	0.0	0.7453215102240474	0.9916540031629704	0.14394284428694282	0.8388378539413649	0.15115603934799038	0.13871462268568635	0.1934605692727246	0.9804118301943398
+5	0.6960053958431593	0.5550593584027527	0.8611896463567201	0.7453215102240474	0.0	0.7996611932937304	0.30579824243478326	0.5227960305398314	0.8564730629853469	0.7786384040043949	0.06843106040159719	0.7715973816221341
+6	0.6477238596907942	0.5855786600007656	0.2815827949525225	0.9916540031629704	0.7996611932937304	0.0	0.8869204659721949	0.1619378942802252	0.10200764546980268	0.17805335055828198	0.8796559972720953	0.20933243431218862
+7	0.30557161358653495	0.4843215561734061	0.6500535832888426	0.14394284428694282	0.30579824243478326	0.8869204659721949	0.0	0.9489770186788868	0.9531210051205121	0.5834385348164726	0.31984891724102216	0.5852822100268925
+8	0.8946966124829346	0.20448304199758327	0.8196046614443331	0.8388378539413649	0.5227960305398314	0.1619378942802252	0.9489770186788868	0.0	0.16681381452925792	0.9281238242741929	0.604480007052297	0.43978806925866687
+9	0.12780699944110363	0.4067028703340123	0.356410088497226	0.15115603934799038	0.8564730629853469	0.10200764546980268	0.9531210051205121	0.16681381452925792	0.0	0.10766387594368387	0.9552101788877516	0.6135541732435132
+10	0.39915339691165386	0.2754701840044086	0.05164123821334954	0.13871462268568635	0.7786384040043949	0.17805335055828198	0.5834385348164726	0.9281238242741929	0.10766387594368387	0.0	0.4157921207508062	0.31997143485314194
+11	0.7641239434153432	0.6269219445617967	0.7110953188954077	0.1934605692727246	0.06843106040159719	0.8796559972720953	0.31984891724102216	0.604480007052297	0.9552101788877516	0.4157921207508062	0.0	0.11189976154475101
+12	0.6248070796484706	0.05629366991581264	0.32855281988632934	0.9804118301943398	0.7715973816221341	0.20933243431218862	0.5852822100268925	0.43978806925866687	0.6135541732435132	0.31997143485314194	0.11189976154475101	0.0


=====================================
skbio/stats/ordination/tests/data/PCoA_sample_data_3_eigh_ref_3dim
=====================================
@@ -0,0 +1,22 @@
+Eigvals	3
+0.5123672604605051	0.30071909442702155	0.26791206600414047
+
+Proportion explained	3
+0.26757383277657976	0.15704469604990076	0.13991186377402365
+
+Species	0	0
+
+Site	9	3
+PC.636	-0.2584654611828421	0.17399954688273872	-0.03828757925519412
+PC.635	-0.27100113539100934	-0.018595131906339258	0.08648419263485663
+PC.356	0.23507789817473093	0.09625192544887005	0.34579272671386985
+PC.481	0.026140766432533755	-0.011145967653319655	-0.14766060301460787
+PC.354	0.2850075522831216	-0.019254988848331687	-0.062326337538532166
+PC.593	0.20463632624145514	-0.13936115093164073	-0.29151381962286704
+PC.355	0.23348240321199026	0.22525797406849954	0.018862309626814944
+PC.607	-0.09496319113225934	-0.4209748024953033	0.15486945486941445
+PC.634	-0.35991515863772167	0.11382259543482587	-0.06622034441375392
+
+Biplot	0	0
+
+Site constraints	0	0


=====================================
skbio/stats/ordination/tests/data/PCoA_sample_data_3_fsvd_ref_3dim
=====================================
@@ -0,0 +1,22 @@
+Eigvals	3
+0.5123672604605054	0.3007190944270222	0.2679120660041405
+
+Proportion explained	3
+0.2675738327765797	0.15704469604990098	0.13991186377402356
+
+Species	0	0
+
+Site	9	3
+PC.636	-0.2584654611828421	0.17399954688273822	-0.03828757925519378
+PC.635	-0.2710011353910087	-0.018595131906339074	0.08648419263485721
+PC.356	0.23507789817473107	0.0962519254488692	0.3457927267138707
+PC.481	0.026140766432533644	-0.011145967653318808	-0.14766060301460796
+PC.354	0.28500755228312136	-0.019254988848331694	-0.06232633753853236
+PC.593	0.20463632624145525	-0.13936115093164014	-0.2915138196228672
+PC.355	0.23348240321199093	0.22525797406849954	0.018862309626815132
+PC.607	-0.09496319113225955	-0.4209748024953044	0.1548694548694131
+PC.634	-0.35991515863772144	0.1138225954348267	-0.06622034441375402
+
+Biplot	0	0
+
+Site constraints	0	0


=====================================
skbio/stats/ordination/tests/test_principal_coordinate_analysis.py
=====================================
@@ -6,9 +6,9 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import pandas as pd
 import numpy as np
 import numpy.testing as npt
+import pandas as pd
 from copy import deepcopy
 from unittest import TestCase, main
 
@@ -27,7 +27,7 @@ class TestPCoA(TestCase):
 
     def test_simple(self):
         eigvals = [0.51236726, 0.30071909, 0.26791207, 0.20898868,
-                   0.19169895, 0.16054235,  0.15017696,  0.12245775,
+                   0.19169895, 0.16054235, 0.15017696, 0.12245775,
                    0.0]
         proportion_explained = [0.2675738328, 0.157044696, 0.1399118638,
                                 0.1091402725, 0.1001110485,
@@ -53,6 +53,73 @@ class TestPCoA(TestCase):
         assert_ordination_results_equal(results, expected_results,
                                         ignore_directionality=True)
 
+    def test_fsvd_inplace(self):
+        dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+        dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+
+        expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
+                                inplace=True)
+
+        results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
+                       inplace=True)
+
+        assert_ordination_results_equal(results, expected_results,
+                                        ignore_directionality=True,
+                                        ignore_method_names=True)
+
+    def test_fsvd(self):
+        dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+        dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+        dm3 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
+
+        # Test eigh vs. fsvd pcoa and inplace parameter
+        expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
+                                inplace=False)
+
+        results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
+                       inplace=False)
+
+        results_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
+                               inplace=True)
+
+        assert_ordination_results_equal(results, expected_results,
+                                        ignore_directionality=True,
+                                        ignore_method_names=True)
+
+        assert_ordination_results_equal(results, results_inplace,
+                                        ignore_directionality=True,
+                                        ignore_method_names=True)
+
+        # Test number_of_dimensions edge cases
+        results2 = pcoa(dm3, method="fsvd", number_of_dimensions=0,
+                        inplace=False)
+        expected_results2 = pcoa(dm3, method="fsvd",
+                                 number_of_dimensions=dm3.data.shape[0],
+                                 inplace=False)
+
+        assert_ordination_results_equal(results2, expected_results2,
+                                        ignore_directionality=True,
+                                        ignore_method_names=True)
+
+        with self.assertRaises(ValueError):
+            dim_too_large = dm1.data.shape[0] + 10
+            pcoa(dm2, method="fsvd", number_of_dimensions=dim_too_large)
+
+        with self.assertRaises(ValueError):
+            pcoa(dm2, method="fsvd", number_of_dimensions=-1)
+
+        with self.assertRaises(ValueError):
+            dim_too_large = dm1.data.shape[0] + 10
+            pcoa(dm2, method="eigh", number_of_dimensions=dim_too_large)
+
+        with self.assertRaises(ValueError):
+            pcoa(dm2, method="eigh", number_of_dimensions=-1)
+
+        dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
+        with self.assertWarnsRegex(RuntimeWarning,
+                                   "no value for number_of_dimensions"):
+            pcoa(dm_big, method="fsvd", number_of_dimensions=0)
+
     def test_extensive(self):
         eigvals = [0.3984635, 0.36405689, 0.28804535, 0.27479983,
                    0.19165361, 0.0]
@@ -203,15 +270,17 @@ class TestPCoABiplot(TestCase):
         self.descriptors.index = pd.Index(new_index)
 
         with self.assertRaisesRegex(ValueError, 'The eigenvectors and the '
-                                    'descriptors must describe the same '
-                                    'samples.'):
+                                                'descriptors must describe '
+                                                'the same '
+                                                'samples.'):
             pcoa_biplot(self.ordination, self.descriptors)
 
     def test_not_a_pcoa(self):
         self.ordination.short_method_name = 'RDA'
         self.ordination.long_method_name = 'Redundancy Analysis'
         with self.assertRaisesRegex(ValueError, 'This biplot computation can'
-                                    ' only be performed in a PCoA matrix.'):
+                                                ' only be performed in a '
+                                                'PCoA matrix.'):
             pcoa_biplot(self.ordination, self.descriptors)
 
     def test_from_seralized_results(self):


=====================================
skbio/stats/ordination/tests/test_util.py
=====================================
@@ -6,12 +6,16 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
+from unittest import TestCase, main
+import copy
+
 import numpy as np
 import numpy.testing as npt
 
-from unittest import TestCase, main
+from skbio.stats.ordination import corr, mean_and_std, e_matrix, f_matrix, \
+    center_distance_matrix
 
-from skbio.stats.ordination import corr, mean_and_std, e_matrix, f_matrix
+from skbio.stats.ordination._utils import _e_matrix_inplace, _f_matrix_inplace
 
 
 class TestUtils(TestCase):
@@ -51,7 +55,7 @@ class TestUtils(TestCase):
 
     def test_e_matrix(self):
         E = e_matrix(self.matrix)
-        expected_E = np.array([[-0.5,  -2.,  -4.5],
+        expected_E = np.array([[-0.5, -2., -4.5],
                                [-8., -12.5, -18.]])
         npt.assert_almost_equal(E, expected_E)
 
@@ -61,6 +65,37 @@ class TestUtils(TestCase):
         # Note that `test_make_F_matrix` in cogent is wrong
         npt.assert_almost_equal(F, expected_F)
 
+    def test_e_matrix_inplace(self):
+        E = _e_matrix_inplace(self.matrix)
+        expected_E = np.array([[-0.5, -2., -4.5],
+                               [-8., -12.5, -18.]])
+        npt.assert_almost_equal(E, expected_E)
+
+    def test_f_matrix_inplace(self):
+        F = _f_matrix_inplace(self.matrix2)
+        expected_F = np.zeros((3, 3))
+        npt.assert_almost_equal(F, expected_F)
+
+    def test_center_distance_matrix_inplace(self):
+        dm_expected = f_matrix(e_matrix(self.small_mat))
+
+        # make copy of matrix to test inplace centering
+        matrix_copy = copy.deepcopy(self.small_mat)
+        dm_centered = center_distance_matrix(matrix_copy, inplace=False)
+
+        # ensure that matrix_copy was NOT modified inplace
+        self.assertTrue(np.array_equal(matrix_copy, self.small_mat))
+
+        # and ensure that the result of centering was correct
+        npt.assert_almost_equal(dm_expected, dm_centered)
+
+        # next, sort same matrix inplace
+        matrix_copy2 = copy.deepcopy(self.small_mat)
+        dm_centered_inp = center_distance_matrix(matrix_copy2, inplace=True)
+
+        # and ensure that the result of inplace centering was correct
+        npt.assert_almost_equal(dm_expected, dm_centered_inp)
+
 
 if __name__ == '__main__':
     main()


=====================================
skbio/util/_testing.py
=====================================
@@ -6,9 +6,9 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import os
-import inspect
 import warnings
+import inspect
+import os
 
 import nose
 import numpy as np
@@ -80,6 +80,7 @@ class TestRunner:
     and ugly. This class invokes nose with the required options.
 
     """
+
     @experimental(as_of="0.4.0")
     def __init__(self, filename):
         self._filename = filename
@@ -279,7 +280,7 @@ def _normalize_signs(arr1, arr2):
         raise ValueError(
             "Arrays must have the same shape ({0} vs {1}).".format(arr1.shape,
                                                                    arr2.shape)
-            )
+        )
 
     # To avoid issues around zero, we'll compare signs of the values
     # with highest absolute value



View it on GitLab: https://salsa.debian.org/med-team/python-skbio/commit/ee541afd060a47c69a82a7edb6b3009d43663cc6

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-skbio/commit/ee541afd060a47c69a82a7edb6b3009d43663cc6
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180828/e866608a/attachment-0001.html>


More information about the debian-med-commit mailing list