Bug#1096252: scikit-learn: fails tests with scipy 1.15
Drew Parsons
dparsons at debian.org
Mon Feb 17 16:44:25 GMT 2025
Source: scikit-learn
Version: 1.4.2+dfsg-7
Severity: normal
scikit-learn is failing two tests with scipy 1.15 from experimental; see the autopkgtest log:
https://ci.debian.net/data/autopkgtest/unstable/amd64/s/scikit-learn/57873442/log.gz
256s FAILED ../../../../usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py::test_csr_polynomial_expansion_index_overflow[csr_array-False-True-2-65535]
256s FAILED ../../../../usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py::test_csr_polynomial_expansion_index_overflow[csr_array-False-True-3-2344]
256s = 2 failed, 29260 passed, 3388 skipped, 88 xfailed, 45 xpassed, 8173 warnings in 129.25s (0:02:09) =
255s __ test_csr_polynomial_expansion_index_overflow[csr_array-False-True-2-65535] __
255s [gw51] linux -- Python 3.12.9 /usr/bin/python3.12
255s
255s degree = 2, n_features = 65535, interaction_only = True, include_bias = False
255s csr_container = <class 'scipy.sparse._csr.csr_array'>
255s
255s @pytest.mark.parametrize(
255s "degree, n_features",
255s [
255s # Needs promotion to int64 when interaction_only=False
255s (2, 65535),
255s (3, 2344),
255s # This guarantees that the intermediate operation when calculating
255s # output columns would overflow a C-long, hence checks that python-
255s # longs are being used.
255s (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)),
255s (3, 65535),
255s # This case tests the second clause of the overflow check which
255s # takes into account the value of `n_features` itself.
255s (2, int(np.sqrt(np.iinfo(np.int64).max))),
255s ],
255s )
255s @pytest.mark.parametrize("interaction_only", [True, False])
255s @pytest.mark.parametrize("include_bias", [True, False])
255s @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
255s def test_csr_polynomial_expansion_index_overflow(
255s degree, n_features, interaction_only, include_bias, csr_container
255s ):
255s """Tests known edge-cases to the dtype promotion strategy and custom
255s Cython code, including a current bug in the upstream
255s `scipy.sparse.hstack`.
255s """
255s data = [1.0]
255s row = [0]
255s col = [n_features - 1]
255s
255s # First degree index
255s expected_indices = [
255s n_features - 1 + int(include_bias),
255s ]
255s # Second degree index
255s expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0])
255s # Third degree index
255s expected_indices.append(
255s n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]
255s )
255s
255s X = csr_container((data, (row, col)))
255s pf = PolynomialFeatures(
255s interaction_only=interaction_only, include_bias=include_bias, degree=degree
255s )
255s
255s # Calculate the number of combinations a-priori, and if needed check for
255s # the correct ValueError and terminate the test early.
255s num_combinations = pf._num_combinations(
255s n_features=n_features,
255s min_degree=0,
255s max_degree=degree,
255s interaction_only=pf.interaction_only,
255s include_bias=pf.include_bias,
255s )
255s if num_combinations > np.iinfo(np.intp).max:
255s msg = (
255s r"The output that would result from the current configuration would have"
255s r" \d* features which is too large to be indexed"
255s )
255s with pytest.raises(ValueError, match=msg):
255s pf.fit(X)
255s return
255s
255s # In SciPy < 1.8, a bug occurs when an intermediate matrix in
255s # `to_stack` in `hstack` fits within int32 however would require int64 when
255s # combined with all previous matrices in `to_stack`.
255s if sp_version < parse_version("1.8.0"):
255s has_bug = False
255s max_int32 = np.iinfo(np.int32).max
255s cumulative_size = n_features + include_bias
255s for deg in range(2, degree + 1):
255s max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
255s max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
255s cumulative_size += max_indices + 1
255s needs_int64 = max(max_indices, max_indptr) > max_int32
255s has_bug |= not needs_int64 and cumulative_size > max_int32
255s if has_bug:
255s msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
255s with pytest.raises(ValueError, match=msg):
255s X_trans = pf.fit_transform(X)
255s return
255s
255s # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
255s # dtype for representing indices and indptr if `n_features` is still
255s # small enough so that each block matrix's indices and indptr arrays
255s # can be represented with `np.int32`. We test `n_features==65535`
255s # since it is guaranteed to run into this bug.
255s if (
255s sp_version < parse_version("1.9.2")
255s and n_features == 65535
255s and degree == 2
255s and not interaction_only
255s ): # pragma: no cover
255s msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
255s with pytest.raises(ValueError, match=msg):
255s X_trans = pf.fit_transform(X)
255s return
255s X_trans = pf.fit_transform(X)
255s
255s expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32
255s # Terms higher than first degree
255s non_bias_terms = 1 + (degree - 1) * int(not interaction_only)
255s expected_nnz = int(include_bias) + non_bias_terms
255s assert X_trans.dtype == X.dtype
255s assert X_trans.shape == (1, pf.n_output_features_)
255s > assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype
255s E AssertionError: assert dtype('int64') == <class 'numpy.int32'>
255s E + where dtype('int64') = array([65534]).dtype
255s E + where array([65534]) = <Compressed Sparse Row sparse array of dtype 'float64'\n with 1 stored elements and shape (1, 2147450880)>.indices
255s
255s /usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py:1132: AssertionError
255s __ test_csr_polynomial_expansion_index_overflow[csr_array-False-True-3-2344] ___
255s [gw51] linux -- Python 3.12.9 /usr/bin/python3.12
255s
255s degree = 3, n_features = 2344, interaction_only = True, include_bias = False
255s csr_container = <class 'scipy.sparse._csr.csr_array'>
255s
255s @pytest.mark.parametrize(
255s "degree, n_features",
255s [
255s # Needs promotion to int64 when interaction_only=False
255s (2, 65535),
255s (3, 2344),
255s # This guarantees that the intermediate operation when calculating
255s # output columns would overflow a C-long, hence checks that python-
255s # longs are being used.
255s (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)),
255s (3, 65535),
255s # This case tests the second clause of the overflow check which
255s # takes into account the value of `n_features` itself.
255s (2, int(np.sqrt(np.iinfo(np.int64).max))),
255s ],
255s )
255s @pytest.mark.parametrize("interaction_only", [True, False])
255s @pytest.mark.parametrize("include_bias", [True, False])
255s @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
255s def test_csr_polynomial_expansion_index_overflow(
255s degree, n_features, interaction_only, include_bias, csr_container
255s ):
255s """Tests known edge-cases to the dtype promotion strategy and custom
255s Cython code, including a current bug in the upstream
255s `scipy.sparse.hstack`.
255s """
255s data = [1.0]
255s row = [0]
255s col = [n_features - 1]
255s
255s # First degree index
255s expected_indices = [
255s n_features - 1 + int(include_bias),
255s ]
255s # Second degree index
255s expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0])
255s # Third degree index
255s expected_indices.append(
255s n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]
255s )
255s
255s X = csr_container((data, (row, col)))
255s pf = PolynomialFeatures(
255s interaction_only=interaction_only, include_bias=include_bias, degree=degree
255s )
255s
255s # Calculate the number of combinations a-priori, and if needed check for
255s # the correct ValueError and terminate the test early.
255s num_combinations = pf._num_combinations(
255s n_features=n_features,
255s min_degree=0,
255s max_degree=degree,
255s interaction_only=pf.interaction_only,
255s include_bias=pf.include_bias,
255s )
255s if num_combinations > np.iinfo(np.intp).max:
255s msg = (
256s r"The output that would result from the current configuration would have"
256s r" \d* features which is too large to be indexed"
256s )
256s with pytest.raises(ValueError, match=msg):
256s pf.fit(X)
256s return
256s
256s # In SciPy < 1.8, a bug occurs when an intermediate matrix in
256s # `to_stack` in `hstack` fits within int32 however would require int64 when
256s # combined with all previous matrices in `to_stack`.
256s if sp_version < parse_version("1.8.0"):
256s has_bug = False
256s max_int32 = np.iinfo(np.int32).max
256s cumulative_size = n_features + include_bias
256s for deg in range(2, degree + 1):
256s max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
256s max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
256s cumulative_size += max_indices + 1
256s needs_int64 = max(max_indices, max_indptr) > max_int32
256s has_bug |= not needs_int64 and cumulative_size > max_int32
256s if has_bug:
256s msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
256s with pytest.raises(ValueError, match=msg):
256s X_trans = pf.fit_transform(X)
256s return
256s
256s # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
256s # dtype for representing indices and indptr if `n_features` is still
256s # small enough so that each block matrix's indices and indptr arrays
256s # can be represented with `np.int32`. We test `n_features==65535`
256s # since it is guaranteed to run into this bug.
256s if (
256s sp_version < parse_version("1.9.2")
256s and n_features == 65535
256s and degree == 2
256s and not interaction_only
256s ): # pragma: no cover
256s msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
256s with pytest.raises(ValueError, match=msg):
256s X_trans = pf.fit_transform(X)
256s return
256s X_trans = pf.fit_transform(X)
256s
256s expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32
256s # Terms higher than first degree
256s non_bias_terms = 1 + (degree - 1) * int(not interaction_only)
256s expected_nnz = int(include_bias) + non_bias_terms
256s assert X_trans.dtype == X.dtype
256s assert X_trans.shape == (1, pf.n_output_features_)
256s > assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype
256s E AssertionError: assert dtype('int64') == <class 'numpy.int32'>
256s E + where dtype('int64') = array([2343]).dtype
256s E + where array([2343]) = <Compressed Sparse Row sparse array of dtype 'float64'\n with 1 stored elements and shape (1, 2146455884)>.indices
256s
256s /usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py:1132: AssertionError
More information about the debian-science-maintainers mailing list