Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 90e12b8

Browse filesBrowse files
committed
refactor code and factors out the int64 casting
1 parent fe39a7b commit 90e12b8
Copy full SHA for 90e12b8

File tree

2 files changed

+35
-30
lines changed
Filter options

2 files changed

+35
-30
lines changed

‎sklearn/preprocessing/_csr_polynomial_expansion.pyx

Copy file name to clipboardExpand all lines: sklearn/preprocessing/_csr_polynomial_expansion.pyx
+4-26Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ from scipy.sparse import csr_matrix
88
from numpy cimport ndarray
99
import numpy as np
1010
cimport numpy as np
11-
cdef int MAX_INT32 = 2147483647
12-
# TODO: use finfo instead
1311

1412
np.import_array()
1513
ctypedef fused INDEX_T:
@@ -56,7 +54,9 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
5654
def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
5755
ndarray[INDEX_T, ndim=1] indices,
5856
ndarray[INDEX_T, ndim=1] indptr,
59-
INDEX_T d, INDEX_T interaction_only,
57+
INDEX_T d,
58+
INDEX_T expanded_dimensionality,
59+
INDEX_T interaction_only,
6060
INDEX_T degree):
6161
"""
6262
Perform a second-degree polynomial or interaction expansion on a scipy
@@ -91,13 +91,6 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
9191
Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
9292
"""
9393

94-
assert degree in (2, 3)
95-
96-
if degree == 2:
97-
expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
98-
else:
99-
expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
100-
- interaction_only*d**2)
10194
if expanded_dimensionality == 0:
10295
return None
10396
assert expanded_dimensionality > 0
@@ -106,12 +99,7 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
10699

107100
# Count how many nonzero elements the expanded matrix will contain.
108101
for row_i in range(indptr.shape[0]-1):
109-
# nnz is the number of nonzero elements in this row.
110-
# The number of nonzero elements can explode
111-
# in the expanded space, so we cast to int64
112-
# before the expansion computation to
113-
# avoid overflow:
114-
nnz = np.int64(indptr[row_i + 1] - indptr[row_i])
102+
nnz = indptr[row_i + 1] - indptr[row_i]
115103
# TODO: check that the casting is indeed done
116104
if degree == 2:
117105
total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
@@ -128,16 +116,6 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
128116
cdef ndarray[INDEX_T, ndim=1] expanded_indptr = ndarray(
129117
shape=num_rows + 1, dtype=indptr.dtype)
130118

131-
# if the expanded_dimensionality is too big, we need to cast
132-
# indices to int64. If total_nnz is too big, we need to cast
133-
# expanded_indptr to int64 too.
134-
# TODO: check that the casting is indeed done
135-
if expanded_dimensionality > MAX_INT32:
136-
indices = np.array(indices, dtype=np.int64)
137-
expanded_indices = np.array(expanded_indices, dtype=np.int64)
138-
if total_nnz > MAX_INT32:
139-
expanded_indptr = np.array(expanded_indptr, dtype=np.int64)
140-
141119
cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
142120
i_ptr, j_ptr, k_ptr, num_cols_in_row, col
143121

‎sklearn/preprocessing/_polynomial.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/_polynomial.py
+31-4Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,32 @@
2323
]
2424

2525

26+
def _cast_to_int64_if_needed(X, degree, interaction_only):
27+
"""Computes the expanded dimensionality and casts X to int64 if the
28+
expanded dim is too big"""
29+
d = int(X.shape[1])
30+
assert degree in (2, 3)
31+
if degree == 2:
32+
expanded_dimensionality = (d ** 2 + d) / 2 - interaction_only * d
33+
else:
34+
expanded_dimensionality = ((d ** 3 + 3 * d ** 2 + 2 * d) / 6 -
35+
interaction_only * d ** 2)
36+
if expanded_dimensionality > np.iinfo(np.int64).max:
37+
raise ValueError("The expanded dimensionality will be too big to "
38+
"be contained in an int64.")
39+
if expanded_dimensionality > np.iinfo(np.int32).max:
40+
# if the expansion needs int64 ints, we cast every index value in X
41+
# to int64
42+
X = X.copy()
43+
X.data = X.data.astype(np.int64)
44+
X.indices = X.indices.astype(np.int64)
45+
X.indptr = X.indptr.astype(np.int64)
46+
return X, np.int64(d), np.int64(expanded_dimensionality)
47+
else:
48+
# otherwise we keep X as is
49+
return X, np.int32(d), np.int32(expanded_dimensionality)
50+
51+
2652
class PolynomialFeatures(TransformerMixin, BaseEstimator):
2753
"""Generate polynomial and interaction features.
2854
@@ -249,12 +275,13 @@ def transform(self, X):
249275
to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype))
250276
to_stack.append(X)
251277
for deg in range(2, self.degree+1):
252-
# Dimensionality of the expanded space can become very
253-
# large so we cast X.shape[1] to int64 to avoid overflow
254-
# when computing expanded_dimensionality
278+
(X, d, expanded_dim
279+
) = _cast_to_int64_if_needed(X, deg,
280+
self.interaction_only)
255281
Xp_next = _csr_polynomial_expansion(X.data, X.indices,
256282
X.indptr,
257-
np.int64(X.shape[1]),
283+
d,
284+
expanded_dim,
258285
self.interaction_only,
259286
deg)
260287
if Xp_next is None:

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.