scikit-learn
diff --git a/‎doc/whats_new/v0.20.rst
Copy file name to clipboardExpand all lines: doc/whats_new/v0.20.rst
+9Lines changed: 9 additions & 0 deletions b/‎doc/whats_new/v0.20.rst
Copy file name to clipboardExpand all lines: doc/whats_new/v0.20.rst
+9Lines changed: 9 additions & 0 deletions
diff --git a/‎sklearn/compose/_column_transformer.py
Copy file name to clipboardExpand all lines: sklearn/compose/_column_transformer.py
+31-4Lines changed: 31 additions & 4 deletions b/‎sklearn/compose/_column_transformer.py
Copy file name to clipboardExpand all lines: sklearn/compose/_column_transformer.py
+31-4Lines changed: 31 additions & 4 deletions
diff --git a/‎sklearn/compose/tests/test_column_transformer.py
Copy file name to clipboardExpand all lines: sklearn/compose/tests/test_column_transformer.py
+48Lines changed: 48 additions & 0 deletions b/‎sklearn/compose/tests/test_column_transformer.py
Copy file name to clipboardExpand all lines: sklearn/compose/tests/test_column_transformer.py
+48Lines changed: 48 additions & 0 deletions
@@ -20,6 +20,15 @@ Changelog
 - |Fix| Fixed a bug in :class:`cluster.KMeans` where KMeans++ initialisation
   could rarely result in an IndexError. :issue:`11756` by `Joel Nothman`_.
 
+:mod:`sklearn.compose`
+......................
+
+- |Fix| Fixed an issue in :class:`compose.ColumnTransformer` where using
+  DataFrames whose column order differs between :term:`fit` and
+  :term:`transform` could lead to silently passing incorrect columns to the
+  ``remainder`` transformer.
+  :pr:`14237` by :user:`Andreas Schuderer <schuderer>`.
+
 :mod:`sklearn.model_selection`
 ..............................
 
 
@@ -83,7 +83,9 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
         the transformers.
         By setting ``remainder`` to be an estimator, the remaining
         non-specified columns will use the ``remainder`` estimator. The
-        estimator must support `fit` and `transform`.
+        estimator must support :term:`fit` and :term:`transform`.
+        Note that using this feature requires that the DataFrame columns
+        input at :term:`fit` and :term:`transform` have identical order.
 
     sparse_threshold : float, default = 0.3
         If the output of the different transformers contains sparse matrices,
@@ -295,11 +297,17 @@ def _validate_remainder(self, X):
                 "'passthrough', or estimator. '%s' was passed instead" %
                 self.remainder)
 
-        n_columns = X.shape[1]
+        # Make it possible to check for reordered named columns on transform
+        if (hasattr(X, 'columns') and
+                any(_check_key_type(cols, str) for cols in self._columns)):
+            self._df_columns = X.columns
+
+        self._n_features = X.shape[1]
         cols = []
         for columns in self._columns:
             cols.extend(_get_column_indices(X, columns))
-        remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None
+        remaining_idx = list(set(range(self._n_features)) - set(cols))
+        remaining_idx = sorted(remaining_idx) or None
 
         self._remainder = ('remainder', self.remainder, remaining_idx)
 
@@ -488,8 +496,27 @@ def transform(self, X):
 
         """
         check_is_fitted(self, 'transformers_')
-
         X = _check_X(X)
+
+        if self._n_features > X.shape[1]:
+            raise ValueError('Number of features of the input must be equal '
+                             'to or greater than that of the fitted '
+                             'transformer. Transformer n_features is {0} '
+                             'and input n_features is {1}.'
+                             .format(self._n_features, X.shape[1]))
+
+        # No column reordering allowed for named cols combined with remainder
+        if (self._remainder[2] is not None and
+                hasattr(self, '_df_columns') and
+                hasattr(X, 'columns')):
+            n_cols_fit = len(self._df_columns)
+            n_cols_transform = len(X.columns)
+            if (n_cols_transform >= n_cols_fit and
+                    any(X.columns[:n_cols_fit] != self._df_columns)):
+                raise ValueError('Column ordering must be equal for fit '
+                                 'and for transform when using the '
+                                 'remainder keyword')
+
         Xs = self._fit_transform(X, None, _transform_one, fitted=True)
         self._validate_output(Xs)
 
 
@@ -498,6 +498,17 @@ def test_column_transformer_invalid_columns(remainder):
         assert_raise_message(ValueError, "Specifying the columns",
                              ct.fit, X_array)
 
+    # transformed n_features does not match fitted n_features
+    col = [0, 1]
+    ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
+    ct.fit(X_array)
+    X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
+    ct.transform(X_array_more)  # Should accept added columns
+    X_array_fewer = np.array([[0, 1, 2], ]).T
+    err_msg = 'Number of features'
+    with pytest.raises(ValueError, match=err_msg):
+        ct.transform(X_array_fewer)
+
 
 def test_column_transformer_invalid_transformer():
 
@@ -1033,3 +1044,40 @@ def test_column_transformer_negative_column_indexes():
     tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
     tf_2 = ColumnTransformer([('ohe', ohe,  [2])], remainder='passthrough')
     assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
+
+
+@pytest.mark.parametrize("explicit_colname", ['first', 'second'])
+def test_column_transformer_reordered_column_names_remainder(explicit_colname):
+    """Regression test for issue #14223: 'Named col indexing fails with
+       ColumnTransformer remainder on changing DataFrame column ordering'
+
+       Should raise error on changed order combined with remainder.
+       Should allow for added columns in `transform` input DataFrame
+       as long as all preceding columns match.
+    """
+    pd = pytest.importorskip('pandas')
+
+    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
+    X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
+
+    X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
+    X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
+
+    tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
+                           remainder=Trans())
+
+    tf.fit(X_fit_df)
+    err_msg = 'Column ordering must be equal'
+    with pytest.raises(ValueError, match=err_msg):
+        tf.transform(X_trans_df)
+
+    # No error for added columns if ordering is identical
+    X_extended_df = X_fit_df.copy()
+    X_extended_df['third'] = [3, 6, 9]
+    tf.transform(X_extended_df)  # No error should be raised
+
+    # No 'columns' AttributeError when transform input is a numpy array
+    X_array = X_fit_array.copy()
+    err_msg = 'Specifying the columns'
+    with pytest.raises(ValueError, match=err_msg):
+        tf.transform(X_array)