Bug#1096252: scikit-learn: fails tests with scipy 1.15
Drew Parsons
dparsons at debian.org
Mon Feb 17 16:44:25 GMT 2025
Source: scikit-learn
Version: 1.4.2+dfsg-7
Severity: normal
scikit-learn is failing two tests with scipy 1.15 from experimental; see the autopkgtest log:
https://ci.debian.net/data/autopkgtest/unstable/amd64/s/scikit-learn/57873442/log.gz
256s FAILED ../../../../usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py::test_csr_polynomial_expansion_index_overflow[csr_array-False-True-2-65535]
256s FAILED ../../../../usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py::test_csr_polynomial_expansion_index_overflow[csr_array-False-True-3-2344]
256s = 2 failed, 29260 passed, 3388 skipped, 88 xfailed, 45 xpassed, 8173 warnings in 129.25s (0:02:09) =
255s __ test_csr_polynomial_expansion_index_overflow[csr_array-False-True-2-65535] __
255s [gw51] linux -- Python 3.12.9 /usr/bin/python3.12
255s
255s degree = 2, n_features = 65535, interaction_only = True, include_bias = False
255s csr_container = <class 'scipy.sparse._csr.csr_array'>
255s
255s @pytest.mark.parametrize(
255s "degree, n_features",
255s [
255s # Needs promotion to int64 when interaction_only=False
255s (2, 65535),
255s (3, 2344),
255s # This guarantees that the intermediate operation when calculating
255s # output columns would overflow a C-long, hence checks that python-
255s # longs are being used.
255s (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)),
255s (3, 65535),
255s # This case tests the second clause of the overflow check which
255s # takes into account the value of `n_features` itself.
255s (2, int(np.sqrt(np.iinfo(np.int64).max))),
255s ],
255s )
255s @pytest.mark.parametrize("interaction_only", [True, False])
255s @pytest.mark.parametrize("include_bias", [True, False])
255s @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
255s def test_csr_polynomial_expansion_index_overflow(
255s degree, n_features, interaction_only, include_bias, csr_container
255s ):
255s """Tests known edge-cases to the dtype promotion strategy and custom
255s Cython code, including a current bug in the upstream
255s `scipy.sparse.hstack`.
255s """
255s data = [1.0]
255s row = [0]
255s col = [n_features - 1]
255s
255s # First degree index
255s expected_indices = [
255s n_features - 1 + int(include_bias),
255s ]
255s # Second degree index
255s expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0])
255s # Third degree index
255s expected_indices.append(
255s n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]
255s )
255s
255s X = csr_container((data, (row, col)))
255s pf = PolynomialFeatures(
255s interaction_only=interaction_only, include_bias=include_bias, degree=degree
255s )
255s
255s # Calculate the number of combinations a-priori, and if needed check for
255s # the correct ValueError and terminate the test early.
255s num_combinations = pf._num_combinations(
255s n_features=n_features,
255s min_degree=0,
255s max_degree=degree,
255s interaction_only=pf.interaction_only,
255s include_bias=pf.include_bias,
255s )
255s if num_combinations > np.iinfo(np.intp).max:
255s msg = (
255s r"The output that would result from the current configuration would have"
255s r" \d* features which is too large to be indexed"
255s )
255s with pytest.raises(ValueError, match=msg):
255s pf.fit(X)
255s return
255s
255s # In SciPy < 1.8, a bug occurs when an intermediate matrix in
255s # `to_stack` in `hstack` fits within int32 however would require int64 when
255s # combined with all previous matrices in `to_stack`.
255s if sp_version < parse_version("1.8.0"):
255s has_bug = False
255s max_int32 = np.iinfo(np.int32).max
255s cumulative_size = n_features + include_bias
255s for deg in range(2, degree + 1):
255s max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
255s max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
255s cumulative_size += max_indices + 1
255s needs_int64 = max(max_indices, max_indptr) > max_int32
255s has_bug |= not needs_int64 and cumulative_size > max_int32
255s if has_bug:
255s msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
255s with pytest.raises(ValueError, match=msg):
255s X_trans = pf.fit_transform(X)
255s return
255s
255s # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
255s # dtype for representing indices and indptr if `n_features` is still
255s # small enough so that each block matrix's indices and indptr arrays
255s # can be represented with `np.int32`. We test `n_features==65535`
255s # since it is guaranteed to run into this bug.
255s if (
255s sp_version < parse_version("1.9.2")
255s and n_features == 65535
255s and degree == 2
255s and not interaction_only
255s ): # pragma: no cover
255s msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
255s with pytest.raises(ValueError, match=msg):
255s X_trans = pf.fit_transform(X)
255s return
255s X_trans = pf.fit_transform(X)
255s
255s expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32
255s # Terms higher than first degree
255s non_bias_terms = 1 + (degree - 1) * int(not interaction_only)
255s expected_nnz = int(include_bias) + non_bias_terms
255s assert X_trans.dtype == X.dtype
255s assert X_trans.shape == (1, pf.n_output_features_)
255s > assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype
255s E AssertionError: assert dtype('int64') == <class 'numpy.int32'>
255s E + where dtype('int64') = array([65534]).dtype
255s E + where array([65534]) = <Compressed Sparse Row sparse array of dtype 'float64'\n with 1 stored elements and shape (1, 2147450880)>.indices
255s
255s /usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py:1132: AssertionError
255s __ test_csr_polynomial_expansion_index_overflow[csr_array-False-True-3-2344] ___
255s [gw51] linux -- Python 3.12.9 /usr/bin/python3.12
255s
255s degree = 3, n_features = 2344, interaction_only = True, include_bias = False
255s csr_container = <class 'scipy.sparse._csr.csr_array'>
255s
255s @pytest.mark.parametrize(
255s "degree, n_features",
255s [
255s # Needs promotion to int64 when interaction_only=False
255s (2, 65535),
255s (3, 2344),
255s # This guarantees that the intermediate operation when calculating
255s # output columns would overflow a C-long, hence checks that python-
255s # longs are being used.
255s (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)),
255s (3, 65535),
255s # This case tests the second clause of the overflow check which
255s # takes into account the value of `n_features` itself.
255s (2, int(np.sqrt(np.iinfo(np.int64).max))),
255s ],
255s )
255s @pytest.mark.parametrize("interaction_only", [True, False])
255s @pytest.mark.parametrize("include_bias", [True, False])
255s @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
255s def test_csr_polynomial_expansion_index_overflow(
255s degree, n_features, interaction_only, include_bias, csr_container
255s ):
255s """Tests known edge-cases to the dtype promotion strategy and custom
255s Cython code, including a current bug in the upstream
255s `scipy.sparse.hstack`.
255s """
255s data = [1.0]
255s row = [0]
255s col = [n_features - 1]
255s
255s # First degree index
255s expected_indices = [
255s n_features - 1 + int(include_bias),
255s ]
255s # Second degree index
255s expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0])
255s # Third degree index
255s expected_indices.append(
255s n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]
255s )
255s
255s X = csr_container((data, (row, col)))
255s pf = PolynomialFeatures(
255s interaction_only=interaction_only, include_bias=include_bias, degree=degree
255s )
255s
255s # Calculate the number of combinations a-priori, and if needed check for
255s # the correct ValueError and terminate the test early.
255s num_combinations = pf._num_combinations(
255s n_features=n_features,
255s min_degree=0,
255s max_degree=degree,
255s interaction_only=pf.interaction_only,
255s include_bias=pf.include_bias,
255s )
255s if num_combinations > np.iinfo(np.intp).max:
255s msg = (
256s r"The output that would result from the current configuration would have"
256s r" \d* features which is too large to be indexed"
256s )
256s with pytest.raises(ValueError, match=msg):
256s pf.fit(X)
256s return
256s
256s # In SciPy < 1.8, a bug occurs when an intermediate matrix in
256s # `to_stack` in `hstack` fits within int32 however would require int64 when
256s # combined with all previous matrices in `to_stack`.
256s if sp_version < parse_version("1.8.0"):
256s has_bug = False
256s max_int32 = np.iinfo(np.int32).max
256s cumulative_size = n_features + include_bias
256s for deg in range(2, degree + 1):
256s max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
256s max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
256s cumulative_size += max_indices + 1
256s needs_int64 = max(max_indices, max_indptr) > max_int32
256s has_bug |= not needs_int64 and cumulative_size > max_int32
256s if has_bug:
256s msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
256s with pytest.raises(ValueError, match=msg):
256s X_trans = pf.fit_transform(X)
256s return
256s
256s # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
256s # dtype for representing indices and indptr if `n_features` is still
256s # small enough so that each block matrix's indices and indptr arrays
256s # can be represented with `np.int32`. We test `n_features==65535`
256s # since it is guaranteed to run into this bug.
256s if (
256s sp_version < parse_version("1.9.2")
256s and n_features == 65535
256s and degree == 2
256s and not interaction_only
256s ): # pragma: no cover
256s msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
256s with pytest.raises(ValueError, match=msg):
256s X_trans = pf.fit_transform(X)
256s return
256s X_trans = pf.fit_transform(X)
256s
256s expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32
256s # Terms higher than first degree
256s non_bias_terms = 1 + (degree - 1) * int(not interaction_only)
256s expected_nnz = int(include_bias) + non_bias_terms
256s assert X_trans.dtype == X.dtype
256s assert X_trans.shape == (1, pf.n_output_features_)
256s > assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype
256s E AssertionError: assert dtype('int64') == <class 'numpy.int32'>
256s E + where dtype('int64') = array([2343]).dtype
256s E + where array([2343]) = <Compressed Sparse Row sparse array of dtype 'float64'\n with 1 stored elements and shape (1, 2146455884)>.indices
256s
256s /usr/lib/python3/dist-packages/sklearn/preprocessing/tests/test_polynomial.py:1132: AssertionError
More information about the debian-science-maintainers mailing list