Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 9474386

Browse files
schuderer authored and jnothman committed
FIX ColumnTransformer: raise error on reordered columns with remainder (#14237)
* FIX Raise error on reordered columns in ColumnTransformer with remainder
* FIX Check for different length of X.columns to avoid exception
* FIX linter, line too long
* FIX import _check_key_type from its new location utils
* ENH Adjust doc, allow added columns
* Fix comment typo as suggested, remove non-essential exposition in doc
* Add PR 14237 to what's new
* Avoid AttributeError in favor of ValueError "column names only for DF"
* ENH Add check for n_features_ for array-likes and DataFrames
* Rename self.n_features to self._n_features
* Replaced backslash line continuation with parenthesis
* Style changes
1 parent e423647 commit 9474386
Copy full SHA for 9474386

File tree

Expand file treeCollapse file tree

3 files changed

+88
-4
lines changed
Filter options
Expand file treeCollapse file tree

3 files changed

+88
-4
lines changed

‎doc/whats_new/v0.20.rst

Copy file name to clipboardExpand all lines: doc/whats_new/v0.20.rst
+9Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@ Changelog
2020
- |Fix| Fixed a bug in :class:`cluster.KMeans` where KMeans++ initialisation
2121
could rarely result in an IndexError. :issue:`11756` by `Joel Nothman`_.
2222

23+
:mod:`sklearn.compose`
24+
.....................
25+
26+
- |Fix| Fixed an issue in :class:`compose.ColumnTransformer` where using
27+
DataFrames whose column order differs between :term:`fit` and
28+
:term:`transform` could lead to silently passing incorrect columns to the
29+
``remainder`` transformer.
30+
:pr:`14237` by :user:`Andreas Schuderer <schuderer>`.
31+
2332
:mod:`sklearn.model_selection`
2433
..............................
2534

‎sklearn/compose/_column_transformer.py

Copy file name to clipboardExpand all lines: sklearn/compose/_column_transformer.py
+31-4Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,9 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
8383
the transformers.
8484
By setting ``remainder`` to be an estimator, the remaining
8585
non-specified columns will use the ``remainder`` estimator. The
86-
estimator must support `fit` and `transform`.
86+
estimator must support :term:`fit` and :term:`transform`.
87+
Note that using this feature requires that the DataFrame columns
88+
input at :term:`fit` and :term:`transform` have identical order.
8789
8890
sparse_threshold : float, default = 0.3
8991
If the output of the different transformers contains sparse matrices,
@@ -295,11 +297,17 @@ def _validate_remainder(self, X):
295297
"'passthrough', or estimator. '%s' was passed instead" %
296298
self.remainder)
297299

298-
n_columns = X.shape[1]
300+
# Make it possible to check for reordered named columns on transform
301+
if (hasattr(X, 'columns') and
302+
any(_check_key_type(cols, str) for cols in self._columns)):
303+
self._df_columns = X.columns
304+
305+
self._n_features = X.shape[1]
299306
cols = []
300307
for columns in self._columns:
301308
cols.extend(_get_column_indices(X, columns))
302-
remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None
309+
remaining_idx = list(set(range(self._n_features)) - set(cols))
310+
remaining_idx = sorted(remaining_idx) or None
303311

304312
self._remainder = ('remainder', self.remainder, remaining_idx)
305313

@@ -488,8 +496,27 @@ def transform(self, X):
488496
489497
"""
490498
check_is_fitted(self, 'transformers_')
491-
492499
X = _check_X(X)
500+
501+
if self._n_features > X.shape[1]:
502+
raise ValueError('Number of features of the input must be equal '
503+
'to or greater than that of the fitted '
504+
'transformer. Transformer n_features is {0} '
505+
'and input n_features is {1}.'
506+
.format(self._n_features, X.shape[1]))
507+
508+
# No column reordering allowed for named cols combined with remainder
509+
if (self._remainder[2] is not None and
510+
hasattr(self, '_df_columns') and
511+
hasattr(X, 'columns')):
512+
n_cols_fit = len(self._df_columns)
513+
n_cols_transform = len(X.columns)
514+
if (n_cols_transform >= n_cols_fit and
515+
any(X.columns[:n_cols_fit] != self._df_columns)):
516+
raise ValueError('Column ordering must be equal for fit '
517+
'and for transform when using the '
518+
'remainder keyword')
519+
493520
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
494521
self._validate_output(Xs)
495522

‎sklearn/compose/tests/test_column_transformer.py

Copy file name to clipboardExpand all lines: sklearn/compose/tests/test_column_transformer.py
+48Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,17 @@ def test_column_transformer_invalid_columns(remainder):
498498
assert_raise_message(ValueError, "Specifying the columns",
499499
ct.fit, X_array)
500500

501+
# transformed n_features does not match fitted n_features
502+
col = [0, 1]
503+
ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
504+
ct.fit(X_array)
505+
X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
506+
ct.transform(X_array_more) # Should accept added columns
507+
X_array_fewer = np.array([[0, 1, 2], ]).T
508+
err_msg = 'Number of features'
509+
with pytest.raises(ValueError, match=err_msg):
510+
ct.transform(X_array_fewer)
511+
501512

502513
def test_column_transformer_invalid_transformer():
503514

@@ -1033,3 +1044,40 @@ def test_column_transformer_negative_column_indexes():
10331044
tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
10341045
tf_2 = ColumnTransformer([('ohe', ohe, [2])], remainder='passthrough')
10351046
assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
1047+
1048+
1049+
@pytest.mark.parametrize("explicit_colname", ['first', 'second'])
1050+
def test_column_transformer_reordered_column_names_remainder(explicit_colname):
1051+
"""Regression test for issue #14223: 'Named col indexing fails with
1052+
ColumnTransformer remainder on changing DataFrame column ordering'
1053+
1054+
Should raise error on changed order combined with remainder.
1055+
Should allow for added columns in `transform` input DataFrame
1056+
as long as all preceding columns match.
1057+
"""
1058+
pd = pytest.importorskip('pandas')
1059+
1060+
X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
1061+
X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
1062+
1063+
X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
1064+
X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
1065+
1066+
tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
1067+
remainder=Trans())
1068+
1069+
tf.fit(X_fit_df)
1070+
err_msg = 'Column ordering must be equal'
1071+
with pytest.raises(ValueError, match=err_msg):
1072+
tf.transform(X_trans_df)
1073+
1074+
# No error for added columns if ordering is identical
1075+
X_extended_df = X_fit_df.copy()
1076+
X_extended_df['third'] = [3, 6, 9]
1077+
tf.transform(X_extended_df) # No error should be raised
1078+
1079+
# No 'columns' AttributeError when transform input is a numpy array
1080+
X_array = X_fit_array.copy()
1081+
err_msg = 'Specifying the columns'
1082+
with pytest.raises(ValueError, match=err_msg):
1083+
tf.transform(X_array)

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.