FIX Feature Selectors fail to route metadata when inside a Pipeline #30529


Open · wants to merge 31 commits into main
Commits (31)
2d93398
fixes metadata routing for feature selector fit_transform
kschluns Dec 22, 2024
4a2e0f5
added non-regression tests for feature_selector fix
kschluns Jan 1, 2025
015acc9
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 1, 2025
9da6d43
linter fix
kschluns Jan 1, 2025
eef1ca2
simplified feature_selector metadata routing test
kschluns Jan 1, 2025
fc070f0
Add change log for PR 30529
kschluns Jan 1, 2025
4e9650a
fix test coverage issue
kschluns Jan 1, 2025
76d017e
flake8 fix
kschluns Jan 1, 2025
bf56193
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 2, 2025
0460e8d
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 3, 2025
bd96b65
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 3, 2025
55fadc4
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 3, 2025
00d505d
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 6, 2025
db637a3
Update doc/whats_new/upcoming_changes/sklearn.feature_selection/30529…
kschluns Jan 7, 2025
2e89ed4
Added metadata routing via fit_transform for IterativeImputer
kschluns Jan 14, 2025
eb6edf3
Improved test_metaestimators_in_pipeline
kschluns Jan 14, 2025
a67fb95
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 14, 2025
bd16800
Update change log 30529.fix.rst
kschluns Jan 14, 2025
3cbf0c4
removing unused variables
kschluns Jan 14, 2025
e2d8b3f
ruff linter fix
kschluns Jan 14, 2025
23a66c7
fixed test coverage issue
kschluns Jan 14, 2025
e2538a9
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 15, 2025
f64995a
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 16, 2025
64b4050
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 17, 2025
0ac3aa0
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 20, 2025
6a6be39
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 27, 2025
de084e5
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Jan 31, 2025
b109e13
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Feb 3, 2025
abaf65e
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Feb 3, 2025
b53f05c
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Feb 4, 2025
16612e6
Merge branch 'main' into fix_feature_selector_metadata_routing
kschluns Mar 27, 2025
@@ -0,0 +1,7 @@
- :class:`impute.IterativeImputer`, :class:`feature_selection.RFE`,
  :class:`feature_selection.RFECV`, :class:`feature_selection.SelectFromModel`, and
  :class:`feature_selection.SequentialFeatureSelector` now properly route metadata when
  used inside a Pipeline object. Prior to this fix, when `sample_weight` was provided in
  the Pipeline's `**fit_params`, the `sample_weight` was not being routed to the feature
  selector's estimator. In certain cases, this would result in incorrect feature
  selection. By :user:`Kyle Schluns <kschluns>`
sklearn/feature_selection/_from_model.py (1 addition, 0 deletions)
@@ -494,6 +494,7 @@ def get_metadata_routing(self):
router = MetadataRouter(owner=self.__class__.__name__).add(
estimator=self.estimator,
method_mapping=MethodMapping()
.add(caller="fit_transform", callee="fit_transform")
Member:

Suggested change:
-    .add(caller="fit_transform", callee="fit_transform")
+    .add(caller="fit_transform", callee="fit")

Off the top of my head, I would expect callee to be "fit" because this is the step that requires metadata for this specific selector.

However, I'm always confused by the routing/caller/callee :).

@StefanieSenger @adrinjalali would you mind having a look?

Member:

We shouldn't have to add these composite methods here; they should be auto-generated from the simple methods. This might be an issue in `_metadata_requests.py` that I need to debug to see where the problem is.

Author (@kschluns), Dec 23, 2024:

I was also curious about @glemaitre's question. It actually doesn't matter whether you use callee="fit_transform" or callee="fit" for the fix; both values result in successful metadata routing.

I think this is because the feature selector's fit_transform method passes the **fit_params to self.fit() without explicitly using the metadata routing functionality (because technically it doesn't have to). The code snippet below (source) shows that it only uses metadata routing for validation, which has no impact on the **fit_params.

    def fit_transform(self, X, y=None, **fit_params):
        if _routing_enabled():
            transform_params = self.get_metadata_routing().consumes(
                method="transform", params=fit_params.keys()
            )
            if transform_params:
                warnings.warn(
                    (
                        f"This object ({self.__class__.__name__}) has a `transform`"
                        " method which consumes metadata, but `fit_transform` does not"
                        " forward metadata to `transform`. Please implement a custom"
                        " `fit_transform` method to forward metadata to `transform` as"
                        " well. Alternatively, you can explicitly do"
                        " `set_transform_request`and set all values to `False` to"
                        " disable metadata routed to `transform`, if that's an option."
                    ),
                    UserWarning,
                )
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)

To illustrate the function calls, in order:

  1. pipeline.fit():
    • First calls process_routing(_method='fit') to request the metadata to pass forward
    • Then calls feature_selector.fit_transform() and includes the routed metadata
  2. feature_selector.fit_transform():
    • does NOT call process_routing()
    • calls self.fit(X, y, **fit_params)
  3. feature_selector.fit():
    • First calls process_routing(_method='fit') to request the metadata to pass forward
    • Then calls estimator.fit() and includes the routed metadata

Because step 2 doesn't participate in the metadata routing, it doesn't matter what the callee value is for the feature_selector. It just has to be one of the generically valid values defined by the class (e.g., fit, transform, fit_transform, etc.).

.add(caller="partial_fit", callee="partial_fit")
.add(caller="fit", callee="fit"),
)
sklearn/feature_selection/_rfe.py (4 additions, 1 deletion)
@@ -542,6 +542,7 @@ def get_metadata_routing(self):
router = MetadataRouter(owner=self.__class__.__name__).add(
estimator=self.estimator,
method_mapping=MethodMapping()
.add(caller="fit_transform", callee="fit_transform")
.add(caller="fit", callee="fit")
.add(caller="predict", callee="predict")
.add(caller="score", callee="score"),
@@ -968,7 +969,9 @@ def get_metadata_routing(self):
router = MetadataRouter(owner=self.__class__.__name__)
router.add(
estimator=self.estimator,
method_mapping=MethodMapping().add(caller="fit", callee="fit"),
method_mapping=MethodMapping()
.add(caller="fit", callee="fit")
.add(caller="fit_transform", callee="fit_transform"),
)
router.add(
splitter=check_cv(self.cv),
sklearn/feature_selection/_sequential.py (3 additions, 1 deletion)
@@ -350,7 +350,9 @@ def get_metadata_routing(self):
router = MetadataRouter(owner=self.__class__.__name__)
router.add(
estimator=self.estimator,
method_mapping=MethodMapping().add(caller="fit", callee="fit"),
method_mapping=MethodMapping()
.add(caller="fit", callee="fit")
.add(caller="fit_transform", callee="fit_transform"),
)
router.add(
splitter=check_cv(self.cv, classifier=is_classifier(self.estimator)),
sklearn/impute/_iterative.py (3 additions, 1 deletion)
@@ -1025,6 +1025,8 @@ def get_metadata_routing(self):
"""
router = MetadataRouter(owner=self.__class__.__name__).add(
estimator=self.estimator,
method_mapping=MethodMapping().add(callee="fit", caller="fit"),
method_mapping=MethodMapping()
.add(callee="fit", caller="fit")
.add(callee="fit_transform", caller="fit_transform"),
)
return router
sklearn/tests/metadata_routing_common.py (29 additions, 8 deletions)
@@ -57,20 +57,41 @@ def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs)
obj : estimator object
sub-estimator to check routed params for
method : str
sub-estimator's method where metadata is routed to, or otherwise in
the context of metadata routing referred to as 'callee'
parent : str
the parent method which should have called `method`, or otherwise in
the context of metadata routing referred to as 'caller'
The target sub-estimator method for which to check recorded metadata,
otherwise in the context of metadata routing referred to as 'callee'.
parent : str or None
The parent method which should have called `method`, or otherwise in
the context of metadata routing referred to as 'caller'.

Sub-estimator metadata is only checked if the `caller` method matches
the value defined by `parent`. If `parent` is None, the target
sub-estimator metadata is checked regardless of the `caller` method.

NOTE: many metaestimators call the subestimator in roundabout ways
and this makes it very difficult to know what method name to use for
`parent`. If misspecified, it results in tests passing trivially. For
example, when fitting the RFE metaestimator, RFE.fit() calls RFE._fit(),
which then calls subestimator.fit(). In this case, the user
configuring the test should set method="fit" and parent="_fit",
otherwise the test will pass trivially.
Comment on lines +66 to +76
Member:

I'm sure you've done a great job going through this bit of code figuring it out, but from the change in the docstring, it's not clear to me exactly what the issue is. Could you please add a test specifically for the change here, to make it clear for future developers?

Author:

Should I actually add a test in the sklearn/tests/test_metaestimators_metadata_routing.py, mark it with @pytest.mark.skip since it is expected to fail, then just add a note in the docstring referencing the test?

Member:

Since I don't really understand this comment, seeing that test helps.

Author:

My question was more around the logistics of adding a test that I know is going to fail, without it messing up any CI pipelines. For example, at work we enforce 100% of our tests to pass in order for PRs to get merged, so I just haven't encountered this situation before. Wasn't sure if that would be a problem here and if it is, is @pytest.mark.skip the right way to handle it?

Member:

We won't be merging w/o CI being green. Once you write a test, I'd know what needs to be done to fix it. Here I just don't understand what the comment is saying.

Author:

got it, will write up a test shortly

split_params : tuple, default=empty
specifies any parameters which are to be checked as being a subset
of the original values
**kwargs : dict
passed metadata
"""
all_records = (
getattr(obj, "_records", dict()).get(method, dict()).get(parent, list())
)

if parent:
all_records = (
getattr(obj, "_records", dict()).get(method, dict()).get(parent, list())
)
else:
all_records = [
r
for record in getattr(obj, "_records", dict()).get(method, dict()).values()
for r in record
]

for record in all_records:
# first check that the names of the metadata passed are the same as
# expected. The names are stored as keys in `record`.
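The record lookup above can be sketched in isolation: `_records` maps a callee method to the caller methods that invoked it, each holding a list of recorded kwargs, and `parent=None` flattens across all callers. The data below is illustrative, not taken from the test suite:

```python
# Illustrative stand-in for the `_records` structure used by
# check_recorded_metadata: callee method -> caller method -> records.
records = {
    "fit": {
        "fit": [{"sample_weight": [1.0, 2.0]}],
        "_fit": [{"sample_weight": [3.0, 4.0]}],
    }
}

def lookup(obj_records, method, parent):
    # Mirrors the branch in the diff: exact caller match when `parent`
    # is given, otherwise flatten the records of every caller.
    if parent:
        return obj_records.get(method, {}).get(parent, [])
    return [r for recs in obj_records.get(method, {}).values() for r in recs]

print(len(lookup(records, "fit", "_fit")))  # records from the "_fit" caller only
print(len(lookup(records, "fit", None)))    # records from every caller
```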
sklearn/tests/test_metaestimators_metadata_routing.py (89 additions, 0 deletions)
@@ -63,12 +63,14 @@
MultiOutputRegressor,
RegressorChain,
)
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.tests.metadata_routing_common import (
ConsumingClassifier,
ConsumingRegressor,
ConsumingScorer,
ConsumingSplitter,
ConsumingTransformer,
NonConsumingClassifier,
NonConsumingRegressor,
_Registry,
@@ -421,6 +423,7 @@
"estimator": "classifier",
"X": X,
"y": y,
"preserves_metadata": "subset",
"estimator_routing_methods": ["fit"],
"scorer_name": "scoring",
"scorer_routing_methods": ["fit"],
@@ -439,6 +442,7 @@
"metaestimator": RFECV,
"estimator": "classifier",
"estimator_name": "estimator",
"preserves_metadata": "subset",
"estimator_routing_methods": ["fit"],
"cv_name": "cv",
"cv_routing_methods": ["fit"],
@@ -708,6 +712,91 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator):
method(X, **method_kwargs)


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_metaestimators_in_pipeline(metaestimator):
# Check that metadata is routed correctly to the sub-estimator when the
# metaestimator is an intermediate step within a Pipeline.
if "estimator" not in metaestimator:
# This test only makes sense for metaestimators which have a
# sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
return

metaestimator_class = metaestimator["metaestimator"]
X = metaestimator["X"]
y = metaestimator["y"]
method_name = "fit"
method_mapping = metaestimator.get("method_mapping", {})
preserves_metadata = metaestimator.get("preserves_metadata", True)

for key in ["sample_weight", "metadata"]:
val = {"sample_weight": sample_weight, "metadata": metadata}[key]
method_kwargs = {key: val}

kwargs, (estimator, registry), (scorer, _), (cv, _) = get_init_args(
metaestimator, sub_estimator_consumes=True
)
if scorer:
set_requests(
scorer, method_mapping={}, methods=["score"], metadata_name=key
)
if cv:
cv.set_split_request(groups=True, metadata=True)

final_estimator = ConsumingTransformer()

# `set_{method}_request({metadata}==True)` on the underlying estimator
set_requests(
estimator,
method_mapping=method_mapping,
methods=[method_name],
metadata_name=key,
)

# `set_{method}_request({metadata}==True)` on the final estimator
# (required to avoid UnsetMetadataPassedError on final estimator)
set_requests(
final_estimator,
method_mapping={},
methods=[method_name],
metadata_name=key,
)

metaestimator_instance = metaestimator_class(**kwargs)
extra_method_args = metaestimator.get("method_args", {}).get(method_name, {})

if not hasattr(metaestimator_instance, "transform"):
# This test only makes sense for metaestimators that can be
# intermediate steps in a Pipeline (e.g. transform method exists)
return

pipe = Pipeline(
[
("feature_selector", metaestimator_instance),
("final_estimator", final_estimator),
]
)

pipe_method = getattr(pipe, method_name)
pipe_method(X, y, **method_kwargs, **extra_method_args)

# sanity check that registry is not empty, or else the test passes
# trivially
for registry_ in [registry]:
assert registry_
split_params = (
method_kwargs.keys() if preserves_metadata == "subset" else ()
)
for estimator in registry_:
check_recorded_metadata(
estimator,
method=method_name,
parent=None,
split_params=split_params,
**method_kwargs,
)


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_setting_request_on_sub_estimator_removes_error(metaestimator):