diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx
index 84fef3f042dc7..0702b61b233dd 100644
--- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx
+++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx
@@ -6,10 +6,13 @@
 from scipy.sparse import csr_matrix
 from numpy cimport ndarray
+import numpy as np
 cimport numpy as np
 
 np.import_array()
 
-ctypedef np.int32_t INDEX_T
+ctypedef fused INDEX_T:
+    np.int32_t
+    np.int64_t
 
 ctypedef fused DATA_T:
     np.float32_t
@@ -51,7 +54,9 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
 def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
                               ndarray[INDEX_T, ndim=1] indices,
                               ndarray[INDEX_T, ndim=1] indptr,
-                              INDEX_T d, INDEX_T interaction_only,
+                              INDEX_T d,
+                              INDEX_T expanded_dimensionality,
+                              INDEX_T interaction_only,
                               INDEX_T degree):
     """
     Perform a second-degree polynomial or interaction expansion on a scipy
@@ -86,13 +91,6 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
     Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
     """
 
-    assert degree in (2, 3)
-
-    if degree == 2:
-        expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
-    else:
-        expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6 -
-                                      interaction_only*d**2)
     if expanded_dimensionality == 0:
         return None
     assert expanded_dimensionality > 0
@@ -119,8 +117,7 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
                                                  shape=num_rows + 1,
                                                  dtype=indptr.dtype)
     cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
-                 i_ptr, j_ptr, k_ptr, num_cols_in_row, \
-                 expanded_column
+                 i_ptr, j_ptr, k_ptr, num_cols_in_row, col
 
     with nogil:
         expanded_indptr[0] = indptr[0]
diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py
index d1ec49d7539bf..5c1a2a1b662d3 100644
--- a/sklearn/preprocessing/_polynomial.py
+++ b/sklearn/preprocessing/_polynomial.py
@@ -23,6 +23,31 @@
 ]
 
 
+def _cast_to_int64_if_needed(X, degree, interaction_only):
+    """Computes the expanded dimensionality and casts X to int64 if the
+    expanded dim is too big"""
+    d = int(X.shape[1])
+    assert degree in (2, 3)
+    if degree == 2:
+        expanded_dimensionality = (d ** 2 + d) / 2 - interaction_only * d
+    else:
+        expanded_dimensionality = ((d ** 3 + 3 * d ** 2 + 2 * d) / 6 -
+                                   interaction_only * d ** 2)
+    if expanded_dimensionality > np.iinfo(np.int64).max:
+        raise ValueError("The expanded dimensionality will be too big to "
+                         "be contained in an int64.")
+    if expanded_dimensionality > np.iinfo(np.int32).max:
+        # if the expansion needs int64 ints, we cast every index value in X
+        # to int64
+        X = X.copy()
+        X.indices = X.indices.astype(np.int64)
+        X.indptr = X.indptr.astype(np.int64)
+        return X, np.int64(d), np.int64(expanded_dimensionality)
+    else:
+        # otherwise we keep X as is
+        return X, np.int32(d), np.int32(expanded_dimensionality)
+
+
 class PolynomialFeatures(TransformerMixin, BaseEstimator):
     """Generate polynomial and interaction features.
 
@@ -249,8 +274,13 @@ def transform(self, X):
                 to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype))
             to_stack.append(X)
             for deg in range(2, self.degree+1):
+                (X, d, expanded_dim
+                 ) = _cast_to_int64_if_needed(X, deg,
+                                              self.interaction_only)
                 Xp_next = _csr_polynomial_expansion(X.data, X.indices,
-                                                    X.indptr, X.shape[1],
+                                                    X.indptr,
+                                                    d,
+                                                    expanded_dim,
                                                     self.interaction_only,
                                                     deg)
                 if Xp_next is None:
diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py
index 59c3a59df8873..dd1fd54c5c71d 100644
--- a/sklearn/preprocessing/tests/test_polynomial.py
+++ b/sklearn/preprocessing/tests/test_polynomial.py
@@ -552,6 +552,18 @@ def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
     assert_array_almost_equal(Xt_csr.A, Xt_dense)
 
 
+def test_polynomial_features_csr_int64():
+    """Tests that if the number of features fits in an int32 BUT the
+    expanded number of features is too big for int32 (so needs int64),
+    there is no overflow in the result."""
+    rng = np.random.RandomState(42)
+    x = sparse.rand(3, 70000, density=0.0001, format='csr',
+                    random_state=rng)
+    pf = PolynomialFeatures(interaction_only=True, include_bias=False,
+                            degree=2)
+    pf.fit_transform(x)
+
+
 @pytest.mark.parametrize("n_features", [1, 4, 5])
 @pytest.mark.parametrize("degree", range(1, 5))
 @pytest.mark.parametrize("interaction_only", [True, False])
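
Note (not part of the patch): a minimal sketch of the scenario the new test exercises, assuming this patch is applied. With about 70 000 input features the degree-2 interaction expansion alone already has more columns than an int32 can index, which is why `_cast_to_int64_if_needed` switches the CSR index arrays to int64. The constants below are illustrative, not taken from the patch.

```python
import numpy as np
from scipy import sparse
from sklearn.preprocessing import PolynomialFeatures

d = 70000                 # number of input features, fits easily in int32
interaction_only = True

# Same degree-2 formula as _cast_to_int64_if_needed:
# (d**2 + d) / 2 - interaction_only * d
expanded_dim = (d ** 2 + d) // 2 - int(interaction_only) * d
print(expanded_dim)                            # 2449965000
print(expanded_dim > np.iinfo(np.int32).max)   # True -> int64 indices needed

# Mirrors test_polynomial_features_csr_int64: a very sparse CSR input whose
# expansion previously overflowed the int32 index arrays.
X = sparse.rand(3, d, density=0.0001, format='csr', random_state=42)
Xt = PolynomialFeatures(degree=2, interaction_only=True,
                        include_bias=False).fit_transform(X)
print(Xt.shape[1] > np.iinfo(np.int32).max)    # True, indices stay valid
```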