FIX Draw indices using sample_weight in Bagging #31414


Status: Open. Wants to merge 6 commits into base branch main.
Changes from all commits
doc/whats_new/upcoming_changes/sklearn.ensemble/31414.fix.rst (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`
and :class:`ensemble.IsolationForest` now use `sample_weight` to draw
the samples instead of forwarding them multiplied with a uniformly sampling
@ogrisel (Member) commented on May 27, 2025:

Sorry, I made a typo in my past suggestion:

Suggested change:
- the samples instead of forwarding them multiplied with a uniformly sampling
+ the samples instead of forwarding them multiplied by a uniformly sampled

mask to the underlying estimators.
By :user:`Antoine Baker <antoinebaker>`.
sklearn/ensemble/_bagging.py (60 additions, 60 deletions)
@@ -72,6 +72,7 @@ def _generate_bagging_indices(
n_samples,
max_features,
max_samples,
sample_weight,
):
"""Randomly draw feature and sample indices."""
# Get valid random state
@@ -81,18 +82,37 @@ def _generate_bagging_indices(
feature_indices = _generate_indices(
random_state, bootstrap_features, n_features, max_features
)
sample_indices = _generate_indices(
random_state, bootstrap_samples, n_samples, max_samples
)
if sample_weight is None:
sample_indices = _generate_indices(
random_state, bootstrap_samples, n_samples, max_samples
)
else:
normalized_sample_weight = sample_weight / np.sum(sample_weight)
sample_indices = random_state.choice(
n_samples,
max_samples,
replace=bootstrap_samples,
p=normalized_sample_weight,
)

return feature_indices, sample_indices

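For intuition, here is a minimal standalone sketch of the weighted draw introduced above, written against NumPy directly; the function name and defaults are illustrative, not scikit-learn internals:

import numpy as np

def draw_bagging_indices(n_samples, max_samples, sample_weight=None,
                         bootstrap=True, seed=0):
    # Mirror the patched branch: normalize the weights to probabilities
    # and let the RNG draw indices with those probabilities.
    rng = np.random.RandomState(seed)
    if sample_weight is None:
        return rng.randint(0, n_samples, max_samples)
    p = np.asarray(sample_weight, dtype=float)
    p /= p.sum()
    return rng.choice(n_samples, max_samples, replace=bootstrap, p=p)

# A row with weight 3 is drawn roughly three times as often as a weight-1 row.
idx = draw_bagging_indices(4, 10_000, sample_weight=[1, 3, 1, 1])
print(np.bincount(idx, minlength=4))  # approximately [2000, 6000, 2000, 2000]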

def _consumes_sample_weight(estimator):
if _routing_enabled():
request_or_router = get_routing_for_object(estimator)
consumes_sample_weight = request_or_router.consumes("fit", ("sample_weight",))
else:
consumes_sample_weight = has_fit_parameter(estimator, "sample_weight")
return consumes_sample_weight

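The helper above decides which of the two row-sampling strategies below applies. Outside of metadata routing, `has_fit_parameter` simply inspects the signature of the estimator's `fit` method; a quick check with two stock estimators (assuming current public scikit-learn utilities):

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import has_fit_parameter

# LinearRegression.fit accepts sample_weight, KNeighborsRegressor.fit does
# not, so bagging would weight rows for the former and index rows for the
# latter.
print(has_fit_parameter(LinearRegression(), "sample_weight"))     # True
print(has_fit_parameter(KNeighborsRegressor(), "sample_weight"))  # False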

def _parallel_build_estimators(
n_estimators,
ensemble,
X,
y,
sample_weight,
seeds,
total_n_estimators,
verbose,
@@ -108,22 +128,12 @@ def _parallel_build_estimators(
bootstrap_features = ensemble.bootstrap_features
has_check_input = has_fit_parameter(ensemble.estimator_, "check_input")
requires_feature_indexing = bootstrap_features or max_features != n_features
consumes_sample_weight = _consumes_sample_weight(ensemble.estimator_)

# Build estimators
estimators = []
estimators_features = []

# TODO: (slep6) remove if condition for unrouted sample_weight when metadata
# routing can't be disabled.
support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight")
if not _routing_enabled() and (
not support_sample_weight and fit_params.get("sample_weight") is not None
):
raise ValueError(
"The base estimator doesn't support sample weight, but sample_weight is "
"passed to the fit method."
)

for i in range(n_estimators):
if verbose > 1:
print(
@@ -139,7 +149,8 @@
else:
estimator_fit = estimator.fit

# Draw random feature, sample indices
# Draw random feature, sample indices (using normalized sample_weight
# as probabilities if provided)
features, indices = _generate_bagging_indices(
random_state,
bootstrap_features,
@@ -148,45 +159,22 @@
n_samples,
max_features,
max_samples,
sample_weight,
)

fit_params_ = fit_params.copy()

# TODO(SLEP6): remove if condition for unrouted sample_weight when metadata
# routing can't be disabled.
# 1. If routing is enabled, we will check if the routing supports sample
# weight and use it if it does.
# 2. If routing is not enabled, we will check if the base
# estimator supports sample_weight and use it if it does.

# Note: Row sampling can be achieved either through setting sample_weight or
# by indexing. The former is more efficient. Therefore, use this method
# by indexing. The former is more memory efficient. Therefore, use this method
# if possible, otherwise use indexing.
if _routing_enabled():
request_or_router = get_routing_for_object(ensemble.estimator_)
consumes_sample_weight = request_or_router.consumes(
"fit", ("sample_weight",)
)
else:
consumes_sample_weight = support_sample_weight
if consumes_sample_weight:
# Draw sub samples, using sample weights, and then fit
curr_sample_weight = _check_sample_weight(
fit_params_.pop("sample_weight", None), X
).copy()

if bootstrap:
sample_counts = np.bincount(indices, minlength=n_samples)
curr_sample_weight *= sample_counts
else:
not_indices_mask = ~indices_to_mask(indices, n_samples)
curr_sample_weight[not_indices_mask] = 0

fit_params_["sample_weight"] = curr_sample_weight
# Row sampling by setting sample_weight
indices_as_sample_weight = np.bincount(indices, minlength=n_samples)
fit_params_["sample_weight"] = indices_as_sample_weight
X_ = X[:, features] if requires_feature_indexing else X
estimator_fit(X_, y, **fit_params_)
else:
# cannot use sample_weight, so use indexing
# Row sampling by indexing
y_ = _safe_indexing(y, indices)
X_ = _safe_indexing(X, indices)
fit_params_ = _check_method_params(X, params=fit_params_, indices=indices)
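
The `sample_weight` branch above relies on the equivalence between fitting on repeated rows and fitting with integer weights; a small self-contained sketch of the idea:

import numpy as np

n_samples = 6
indices = np.array([0, 0, 2, 5, 5, 5])  # one bootstrap draw

# counts[i] == number of times row i was drawn into this bag.
counts = np.bincount(indices, minlength=n_samples)
print(counts)  # [2 0 1 0 0 3]

# For an estimator that honors sample weights,
#     estimator.fit(X, y, sample_weight=counts)
# matches estimator.fit(X[indices], y[indices]) without materializing
# the repeated rows, which is why the weighting path is preferred.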
@@ -354,9 +342,11 @@ def fit(self, X, y, sample_weight=None, **fit_params):
regression).

sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if the base estimator supports
sample weighting.
Sample weights. If None, then samples are equally weighted. Used as
probabilities to sample the training set. Note that the expected
frequency semantics for the `sample_weight` parameter are only
fulfilled when sampling with replacement (`bootstrap=True`).

**fit_params : dict
Parameters to pass to the underlying estimators.
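
As a sanity check of the frequency semantics described above, an integer-weighted fit should behave like a fit on a dataset with each row repeated according to its weight. A rough sketch, assuming this PR's behavior (statistical similarity, not bitwise equality):

import numpy as np
from sklearn.ensemble import BaggingRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = X @ np.array([1.0, 2.0, 0.5]) + 0.1 * rng.randn(200)
w = rng.randint(1, 4, size=200)

# Weighted fit: bootstrap indices are drawn with probability proportional to w.
weighted = BaggingRegressor(n_estimators=100, random_state=0)
weighted.fit(X, y, sample_weight=w)

# Repeated fit: row i duplicated w[i] times, uniform bootstrap of equal size,
# so each original row again has per-draw probability w[i] / sum(w).
X_rep, y_rep = np.repeat(X, w, axis=0), np.repeat(y, w)
repeated = BaggingRegressor(n_estimators=100, max_samples=200, random_state=1)
repeated.fit(X_rep, y_rep)

# The two ensembles should agree closely on average.
print(np.corrcoef(weighted.predict(X), repeated.predict(X))[0, 1])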

@@ -386,6 +376,14 @@ def fit(self, X, y, sample_weight=None, **fit_params):
multi_output=True,
)

if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
if sample_weight is not None and not self.bootstrap:
warn(
f"When fitting {self.__class__.__name__} with sample_weight "
f"it is recommended to use bootstrap=True, got {self.bootstrap}."
)

return self._fit(
X,
y,
@@ -435,8 +433,6 @@ def _fit(

sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if the base estimator supports
sample weighting.

**fit_params : dict, default=None
Parameters to pass to the :term:`fit` method of the underlying
@@ -457,18 +453,11 @@
# Check parameters
self._validate_estimator(self._get_estimator())

if sample_weight is not None:
fit_params["sample_weight"] = sample_weight

if _routing_enabled():
routed_params = process_routing(self, "fit", **fit_params)
else:
routed_params = Bunch()
routed_params.estimator = Bunch(fit=fit_params)
if "sample_weight" in fit_params:
routed_params.estimator.fit["sample_weight"] = fit_params[
"sample_weight"
]

if max_depth is not None:
self.estimator_.max_depth = max_depth
@@ -499,6 +488,11 @@ def _fit(
# Store validated integer feature sampling value
self._max_features = max_features

# Store sample_weight (needed in _get_estimators_indices). Note that
# we intentionally do not materialize `sample_weight=None` as an array
# of ones to avoid unnecessarily cluttering trained estimator pickles.
self._sample_weight = sample_weight

# Other checks
if not self.bootstrap and self.oob_score:
raise ValueError("Out of bag estimation only available if bootstrap=True")
@@ -552,6 +546,7 @@ def _fit(
self,
X,
y,
sample_weight,
seeds[starts[i] : starts[i + 1]],
total_n_estimators,
verbose=self.verbose,
@@ -596,6 +591,7 @@ def _get_estimators_indices(self):
self._n_samples,
self._max_features,
self._max_samples,
self._sample_weight,
)

yield feature_indices, sample_indices
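
`_get_estimators_indices` re-draws each estimator's indices from the stored per-estimator seeds (for example to compute out-of-bag estimates), which is why `_sample_weight` has to be kept on the fitted ensemble: the re-draw only reproduces the fit-time draw if it uses the same probabilities. A minimal illustration of that determinism:

import numpy as np

seed = 42
p = np.array([0.1, 0.7, 0.1, 0.1])

# Same seed and same probabilities give back the exact same indices.
a = np.random.RandomState(seed).choice(4, size=10, replace=True, p=p)
b = np.random.RandomState(seed).choice(4, size=10, replace=True, p=p)
assert np.array_equal(a, b)

# Dropping the weights would silently change the re-drawn indices.
c = np.random.RandomState(seed).choice(4, size=10, replace=True)
print(np.array_equal(a, c))  # almost surely False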
@@ -737,8 +733,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
- If float, then draw `max(1, int(max_features * n_features_in_))` features.

bootstrap : bool, default=True
Whether samples are drawn with replacement. If False, sampling
without replacement is performed.
Whether samples are drawn with replacement. If False, sampling without
replacement is performed. If fitting with `sample_weight`, it is
strongly recommended to choose True, as only drawing with replacement
will ensure the expected frequency semantics of `sample_weight`.

bootstrap_features : bool, default=False
Whether features are drawn with replacement.
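
To see why the `bootstrap` recommendation above matters: with replacement, a heavily weighted row can be drawn many times, matching the "weight k behaves like k copies" reading of `sample_weight`; without replacement each row appears at most once, so large weights saturate. A quick numerical illustration:

import numpy as np

rng = np.random.RandomState(0)
p = np.array([0.9, 0.05, 0.05])

# With replacement, row 0 is drawn about 2.7 times out of 3 draws on average.
draws = rng.choice(3, size=(10_000, 3), replace=True, p=p)
print((draws == 0).sum(axis=1).mean())  # ~2.7

# Without replacement, any draw of 3 distinct rows out of 3 contains row 0
# exactly once, so a dominant weight can never yield "about 2.7 copies".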
@@ -1245,8 +1243,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging):
- If float, then draw `max(1, int(max_features * n_features_in_))` features.

bootstrap : bool, default=True
Whether samples are drawn with replacement. If False, sampling
without replacement is performed.
Whether samples are drawn with replacement. If False, sampling without
replacement is performed. If fitting with `sample_weight`, it is
strongly recommended to choose True, as only drawing with replacement
will ensure the expected frequency semantics of `sample_weight`.

bootstrap_features : bool, default=False
Whether features are drawn with replacement.
sklearn/ensemble/_iforest.py (11 additions, 2 deletions)
@@ -20,7 +20,12 @@
from ..utils._chunking import get_chunk_n_rows
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils.parallel import Parallel, delayed
from ..utils.validation import _num_samples, check_is_fitted, validate_data
from ..utils.validation import (
_check_sample_weight,
_num_samples,
check_is_fitted,
validate_data,
)
from ._bagging import BaseBagging

__all__ = ["IsolationForest"]
@@ -317,6 +322,10 @@ def fit(self, X, y=None, sample_weight=None):
X = validate_data(
self, X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False
)

if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=None)

if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
# ensemble sorts the indices.
@@ -350,7 +359,7 @@ def fit(self, X, y=None, sample_weight=None):
super()._fit(
X,
y,
max_samples,
max_samples=max_samples,
max_depth=max_depth,
sample_weight=sample_weight,
check_input=False,
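End to end, the IsolationForest change means `sample_weight` now biases which rows enter each tree's subsample instead of being forwarded to the trees. A usage sketch, assuming this PR is applied:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = rng.randn(500, 2)
w = np.ones(500)
w[:25] = 20.0  # strongly upweight the first 25 rows

# Per-tree subsamples are drawn with probability proportional to w.
iso = IsolationForest(random_state=0).fit(X, sample_weight=w)
print(iso.score_samples(X)[:5])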
sklearn/ensemble/tests/test_bagging.py (22 additions, 22 deletions)
@@ -589,28 +589,6 @@ def test_bagging_with_pipeline():
assert isinstance(estimator[0].steps[-1][1].random_state, int)


class DummyZeroEstimator(BaseEstimator):
def fit(self, X, y):
self.classes_ = np.unique(y)
return self

def predict(self, X):
return self.classes_[np.zeros(X.shape[0], dtype=int)]


def test_bagging_sample_weight_unsupported_but_passed():
estimator = BaggingClassifier(DummyZeroEstimator())
rng = check_random_state(0)

estimator.fit(iris.data, iris.target).predict(iris.data)
with pytest.raises(ValueError):
estimator.fit(
iris.data,
iris.target,
sample_weight=rng.randint(10, size=(iris.data.shape[0])),
)


def test_warm_start(random_state=42):
# Test if fitting incrementally with warm start gives a forest of the
# right size and the same results as a normal fit.
@@ -692,6 +670,28 @@ def test_warm_start_with_oob_score_fails():
clf.fit(X, y)


def test_warning_bootstrap_sample_weight():
X, y = iris.data, iris.target
sample_weight = np.ones_like(y)
clf = BaggingClassifier(bootstrap=False)
warn_msg = (
"When fitting BaggingClassifier with sample_weight "
"it is recommended to use bootstrap=True"
)
with pytest.warns(UserWarning, match=warn_msg):
clf.fit(X, y, sample_weight=sample_weight)

X, y = diabetes.data, diabetes.target
sample_weight = np.ones_like(y)
reg = BaggingRegressor(bootstrap=False)
warn_msg = (
"When fitting BaggingRegressor with sample_weight "
"it is recommended to use bootstrap=True"
)
with pytest.warns(UserWarning, match=warn_msg):
reg.fit(X, y, sample_weight=sample_weight)

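The two nearly identical blocks in the new test could also be collapsed with `pytest.mark.parametrize`; a sketch under the same module-level fixtures (`iris`, `np`, and `pytest` are assumed imported, and the classification data is acceptable for the regressor here):

@pytest.mark.parametrize("Estimator", [BaggingClassifier, BaggingRegressor])
def test_warning_bootstrap_sample_weight_parametrized(Estimator):
    X, y = iris.data, iris.target
    sample_weight = np.ones_like(y, dtype=float)
    warn_msg = (
        f"When fitting {Estimator.__name__} with sample_weight "
        "it is recommended to use bootstrap=True"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        Estimator(bootstrap=False).fit(X, y, sample_weight=sample_weight)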

def test_oob_score_removed_on_warm_start():
X, y = make_hastie_10_2(n_samples=100, random_state=1)
