Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

[WIP] FIX index overflow error in sparse matrix polynomial expansion (bis) #19676

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
Closed
19 changes: 8 additions & 11 deletions 19 sklearn/preprocessing/_csr_polynomial_expansion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@

from scipy.sparse import csr_matrix
from numpy cimport ndarray
import numpy as np
cimport numpy as np

np.import_array()
ctypedef np.int32_t INDEX_T
ctypedef fused INDEX_T:
np.int32_t
np.int64_t

ctypedef fused DATA_T:
np.float32_t
Expand Down Expand Up @@ -51,7 +54,9 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
ndarray[INDEX_T, ndim=1] indices,
ndarray[INDEX_T, ndim=1] indptr,
INDEX_T d, INDEX_T interaction_only,
INDEX_T d,
INDEX_T expanded_dimensionality,
INDEX_T interaction_only,
INDEX_T degree):
"""
Perform a second-degree polynomial or interaction expansion on a scipy
Expand Down Expand Up @@ -86,13 +91,6 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
"""

assert degree in (2, 3)

if degree == 2:
expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
else:
expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
- interaction_only*d**2)
if expanded_dimensionality == 0:
return None
assert expanded_dimensionality > 0
Expand All @@ -119,8 +117,7 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
shape=num_rows + 1, dtype=indptr.dtype)

cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
i_ptr, j_ptr, k_ptr, num_cols_in_row, \
expanded_column
i_ptr, j_ptr, k_ptr, num_cols_in_row, col

with nogil:
expanded_indptr[0] = indptr[0]
Expand Down
32 changes: 31 additions & 1 deletion 32 sklearn/preprocessing/_polynomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,31 @@
]


def _cast_to_int64_if_needed(X, degree, interaction_only):
"""Computes the expanded dimensionality and casts X to int64 if the
expanded dim is too big"""
d = int(X.shape[1])
assert degree in (2, 3)
if degree == 2:
expanded_dimensionality = (d ** 2 + d) / 2 - interaction_only * d
else:
expanded_dimensionality = ((d ** 3 + 3 * d ** 2 + 2 * d) / 6 -
interaction_only * d ** 2)
if expanded_dimensionality > np.iinfo(np.int64).max:
raise ValueError("The expanded dimensionality will be too big to "
"be contained in an int64.")
if expanded_dimensionality > np.iinfo(np.int32).max:
# if the expansion needs int64 ints, we cast every index value in X
# to int64
X = X.copy()
X.indices = X.indices.astype(np.int64)
X.indptr = X.indptr.astype(np.int64)
Comment on lines +43 to +44
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two copies being created here, one from X.copy and here when changing the dtype of the indices. I do not see a good alternative. I would expect the following to work:

        X = csr_matrix(
            (X.data, X.indices.astype(np.int64), X.indptr.astype(np.int64)),
            shape=X.shape, copy=True,
        )

but internally csr_matrix checks the indices and will cast back down to np.int32.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's right, your solution sounds good
I'm not completely sure to understand the problem with it: is it because when creating a csr_matrix with indices that could hold in int32, scipy automatically choses int32 indices ?

In this case maybe I could avoid copying X, but deal with every part of it (data, indices, inptr) separately, like:

  • remove the X = X.copy() line
  • do X_indices = X.indices.astype(np.int64, copy=False) (to do a copy only if the type is changed)
  • similarly X_indptr =X_indptr.astype(np.int64, copy=False)
    And then return X.data, and X_indices and X_indptr, which I would use later on in _csr_polynomial_expansion
    What do you think ?

return X, np.int64(d), np.int64(expanded_dimensionality)
else:
# otherwise we keep X as is
return X, np.int32(d), np.int32(expanded_dimensionality)


class PolynomialFeatures(TransformerMixin, BaseEstimator):
"""Generate polynomial and interaction features.

Expand Down Expand Up @@ -249,8 +274,13 @@ def transform(self, X):
to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype))
to_stack.append(X)
for deg in range(2, self.degree+1):
(X, d, expanded_dim
) = _cast_to_int64_if_needed(X, deg,
self.interaction_only)
Xp_next = _csr_polynomial_expansion(X.data, X.indices,
X.indptr, X.shape[1],
X.indptr,
d,
expanded_dim,
self.interaction_only,
deg)
if Xp_next is None:
Expand Down
12 changes: 12 additions & 0 deletions 12 sklearn/preprocessing/tests/test_polynomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,18 @@ def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
assert_array_almost_equal(Xt_csr.A, Xt_dense)


def test_polynomial_features_csr_int64():
"""Tests that if the number of features fits in an int32 BUT the
expanded number of features is too big for int32 (so needs int64),
there is no overflow in the result."""
rng = np.random.RandomState(42)
x = sparse.rand(3, 70000, density=0.0001, format='csr',
random_state=rng)
pf = PolynomialFeatures(interaction_only=True, include_bias=False,
degree=2)
pf.fit_transform(x)


@pytest.mark.parametrize("n_features", [1, 4, 5])
@pytest.mark.parametrize("degree", range(1, 5))
@pytest.mark.parametrize("interaction_only", [True, False])
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.