Bug#1082291: scikit-learn: tests fail with scipy 1.14
Drew Parsons
dparsons at debian.org
Thu Sep 19 18:10:54 BST 2024
Source: scikit-learn
Version: 1.4.2+dfsg-6
Severity: normal
scikit-learn tests fail with scipy 1.14 from experimental.
Perhaps this has already been fixed in the latest upstream release; I don't know.
https://ci.debian.net/packages/s/scikit-learn/unstable/amd64/51759247/
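The root cause appears to be scipy 1.14's iteration semantics for the new
sparse arrays: as the traceback below shows, iterating over a 2-D
csr_array/csc_array yields 1-D rows (shape (3,)), so partial_fit falls over
on X.shape[1]. A minimal sketch of the behaviour, independent of
scikit-learn (my own reproducer, assuming scipy 1.14 is installed; the
legacy csr_matrix comparison is included for contrast):

import numpy as np
from scipy.sparse import csr_array, csr_matrix

X = np.eye(3)

# legacy sparse matrix: rows iterate as 2-D, shape (1, 3)
for row in csr_matrix(X):
    print(row.shape)   # (1, 3), so row.shape[1] works

# scipy 1.14 sparse array: rows iterate as 1-D, shape (3,)
for row in csr_array(X):
    print(row.shape)   # (3,), so row.shape[1] would raise IndexError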
648s _______ test_standard_scaler_partial_fit_numerical_stability[csc_array] ________
648s
648s sparse_container = <class 'scipy.sparse._csc.csc_array'>
648s
648s @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
648s def test_standard_scaler_partial_fit_numerical_stability(sparse_container):
648s # Test if the incremental computation introduces significative errors
648s # for large datasets with values of large magniture
648s rng = np.random.RandomState(0)
648s n_features = 2
648s n_samples = 100
648s offsets = rng.uniform(-1e15, 1e15, size=n_features)
648s scales = rng.uniform(1e3, 1e6, size=n_features)
648s X = rng.randn(n_samples, n_features) * scales + offsets
648s
648s scaler_batch = StandardScaler().fit(X)
648s scaler_incr = StandardScaler()
648s for chunk in X:
648s scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features))
648s
648s # Regardless of abs values, they must not be more diff 6 significant digits
648s tol = 10 ** (-6)
648s assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol)
648s assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol)
648s assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol)
648s # NOTE Be aware that for much larger offsets std is very unstable (last
648s # assert) while mean is OK.
648s
648s # Sparse input
648s size = (100, 3)
648s scale = 1e20
648s X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * scale)
648s
648s # with_mean=False is required with sparse input
648s scaler = StandardScaler(with_mean=False).fit(X)
648s scaler_incr = StandardScaler(with_mean=False)
648s
648s for chunk in X:
648s > scaler_incr = scaler_incr.partial_fit(chunk)
648s
648s /usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_data.py:598:
648s _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
648s /usr/lib/python3/dist-packages/sklearn/base.py:1474: in wrapper
648s return fit_method(estimator, *args, **kwargs)
648s _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
648s
648s self = StandardScaler(with_mean=False)
648s X = <Compressed Sparse Row sparse array of dtype 'float64'
648s with 0 stored elements and shape (3,)>
648s y = None, sample_weight = None
648s
648s @_fit_context(prefer_skip_nested_validation=True)
648s def partial_fit(self, X, y=None, sample_weight=None):
648s """Online computation of mean and std on X for later scaling.
648s
648s All of X is processed as a single batch. This is intended for cases
648s when :meth:`fit` is not feasible due to very large number of
648s `n_samples` or because X is read from a continuous stream.
648s
648s The algorithm for incremental mean and std is given in Equation 1.5a,b
648s in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
648s for computing the sample variance: Analysis and recommendations."
648s The American Statistician 37.3 (1983): 242-247:
648s
648s Parameters
648s ----------
648s X : {array-like, sparse matrix} of shape (n_samples, n_features)
648s The data used to compute the mean and standard deviation
648s used for later scaling along the features axis.
648s
648s y : None
648s Ignored.
648s
648s sample_weight : array-like of shape (n_samples,), default=None
648s Individual weights for each sample.
648s
648s .. versionadded:: 0.24
648s parameter *sample_weight* support to StandardScaler.
648s
648s Returns
648s -------
648s self : object
648s Fitted scaler.
648s """
648s first_call = not hasattr(self, "n_samples_seen_")
648s X = self._validate_data(
648s X,
648s accept_sparse=("csr", "csc"),
648s dtype=FLOAT_DTYPES,
648s force_all_finite="allow-nan",
648s reset=first_call,
648s )
648s > n_features = X.shape[1]
648s E IndexError: tuple index out of range
648s
648s /usr/lib/python3/dist-packages/sklearn/preprocessing/_data.py:919: IndexError
648s _______ test_standard_scaler_partial_fit_numerical_stability[csr_array] ________
648s
648s sparse_container = <class 'scipy.sparse._csr.csr_array'>
648s
648s [traceback identical to the csc_array failure above: partial_fit on a
648s 1-D sparse row raises IndexError at sklearn/preprocessing/_data.py:919]
...
650s FAILED ../../../../usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_data.py::test_standard_scaler_partial_fit_numerical_stability[csc_array]
650s FAILED ../../../../usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_data.py::test_standard_scaler_partial_fit_numerical_stability[csr_array]
650s = 2 failed, 29267 passed, 3384 skipped, 2 deselected, 88 xfailed, 45 xpassed, 3276 warnings in 600.51s (0:10:00) =
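If it helps with triage, a possible test-side workaround (my own sketch,
not the upstream fix) is to keep each chunk 2-D by slicing rows instead of
iterating over the sparse array:

import numpy as np
from scipy.sparse import csr_array
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = csr_array(rng.randint(0, 2, (100, 3)).astype(np.float64) * 1e20)

scaler_incr = StandardScaler(with_mean=False)
for i in range(X.shape[0]):
    chunk = X[i : i + 1]   # a slice keeps the row 2-D, shape (1, 3)
    scaler_incr = scaler_incr.partial_fit(chunk)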
We've only recently upgraded to scipy 1.13, but we'll want to upgrade
further to scipy 1.14 before too long.