FIX Draw indices using sample_weight in Bagging #31414


Status: Open. Wants to merge 6 commits into base branch main.
Changes from all commits
doc/whats_new/upcoming_changes/sklearn.ensemble/31414.fix.rst (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`
and :class:`ensemble.IsolationForest` now use `sample_weight` to draw
the samples instead of forwarding them multiplied with a uniformly sampling
@ogrisel (Member) commented on May 27, 2025:

Sorry, I made a typo in my past suggestion:

Suggested change:
- the samples instead of forwarding them multiplied with a uniformly sampling
+ the samples instead of forwarding them multiplied by a uniformly sampled

mask to the underlying estimators.
By :user:`Antoine Baker <antoinebaker>`.
sklearn/ensemble/_bagging.py (60 additions, 60 deletions)
@@ -72,6 +72,7 @@ def _generate_bagging_indices(
n_samples,
max_features,
max_samples,
sample_weight,
):
"""Randomly draw feature and sample indices."""
# Get valid random state
@@ -81,18 +82,37 @@ def _generate_bagging_indices(
feature_indices = _generate_indices(
random_state, bootstrap_features, n_features, max_features
)
sample_indices = _generate_indices(
random_state, bootstrap_samples, n_samples, max_samples
)
if sample_weight is None:
sample_indices = _generate_indices(
random_state, bootstrap_samples, n_samples, max_samples
)
else:
normalized_sample_weight = sample_weight / np.sum(sample_weight)
sample_indices = random_state.choice(
n_samples,
max_samples,
replace=bootstrap_samples,
p=normalized_sample_weight,
)

return feature_indices, sample_indices

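For intuition, here is a minimal standalone sketch of the weighted draw introduced above, written against NumPy directly; the function name and defaults are illustrative, not scikit-learn internals:

import numpy as np

def draw_bagging_indices(n_samples, max_samples, sample_weight=None,
                         bootstrap=True, seed=0):
    # Mirror the patched branch: normalize the weights to probabilities
    # and let the RNG draw indices with those probabilities.
    rng = np.random.RandomState(seed)
    if sample_weight is None:
        return rng.randint(0, n_samples, max_samples)
    p = np.asarray(sample_weight, dtype=float)
    p /= p.sum()
    return rng.choice(n_samples, max_samples, replace=bootstrap, p=p)

# A row with weight 3 is drawn roughly three times as often as a weight-1 row.
idx = draw_bagging_indices(4, 10_000, sample_weight=[1, 3, 1, 1])
print(np.bincount(idx, minlength=4))  # approximately [2000, 6000, 2000, 2000]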

def _consumes_sample_weight(estimator):
if _routing_enabled():
request_or_router = get_routing_for_object(estimator)
consumes_sample_weight = request_or_router.consumes("fit", ("sample_weight",))
else:
consumes_sample_weight = has_fit_parameter(estimator, "sample_weight")
return consumes_sample_weight

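The helper above decides which of the two row-sampling strategies below applies. Outside of metadata routing, `has_fit_parameter` simply inspects the signature of the estimator's `fit` method; a quick check with two stock estimators (assuming current public scikit-learn utilities):

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import has_fit_parameter

# LinearRegression.fit accepts sample_weight, KNeighborsRegressor.fit does
# not, so bagging would weight rows for the former and index rows for the
# latter.
print(has_fit_parameter(LinearRegression(), "sample_weight"))     # True
print(has_fit_parameter(KNeighborsRegressor(), "sample_weight"))  # False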

def _parallel_build_estimators(
n_estimators,
ensemble,
X,
y,
sample_weight,
seeds,
total_n_estimators,
verbose,
@@ -108,22 +128,12 @@ def _parallel_build_estimators(
bootstrap_features = ensemble.bootstrap_features
has_check_input = has_fit_parameter(ensemble.estimator_, "check_input")
requires_feature_indexing = bootstrap_features or max_features != n_features
consumes_sample_weight = _consumes_sample_weight(ensemble.estimator_)

# Build estimators
estimators = []
estimators_features = []

# TODO: (slep6) remove if condition for unrouted sample_weight when metadata
# routing can't be disabled.
support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight")
if not _routing_enabled() and (
not support_sample_weight and fit_params.get("sample_weight") is not None
):
raise ValueError(
"The base estimator doesn't support sample weight, but sample_weight is "
"passed to the fit method."
)

for i in range(n_estimators):
if verbose > 1:
print(
@@ -139,7 +149,8 @@
else:
estimator_fit = estimator.fit

# Draw random feature, sample indices
# Draw random feature, sample indices (using normalized sample_weight
# as probabilities if provided)
features, indices = _generate_bagging_indices(
random_state,
bootstrap_features,
@@ -148,45 +159,22 @@
n_samples,
max_features,
max_samples,
sample_weight,
)

fit_params_ = fit_params.copy()

# TODO(SLEP6): remove if condition for unrouted sample_weight when metadata
# routing can't be disabled.
# 1. If routing is enabled, we will check if the routing supports sample
# weight and use it if it does.
# 2. If routing is not enabled, we will check if the base
# estimator supports sample_weight and use it if it does.

# Note: Row sampling can be achieved either through setting sample_weight or
# by indexing. The former is more efficient. Therefore, use this method
# by indexing. The former is more memory efficient. Therefore, use this method
# if possible, otherwise use indexing.
if _routing_enabled():
request_or_router = get_routing_for_object(ensemble.estimator_)
consumes_sample_weight = request_or_router.consumes(
"fit", ("sample_weight",)
)
else:
consumes_sample_weight = support_sample_weight
if consumes_sample_weight:
# Draw sub samples, using sample weights, and then fit
curr_sample_weight = _check_sample_weight(
fit_params_.pop("sample_weight", None), X
).copy()

if bootstrap:
sample_counts = np.bincount(indices, minlength=n_samples)
curr_sample_weight *= sample_counts
else:
not_indices_mask = ~indices_to_mask(indices, n_samples)
curr_sample_weight[not_indices_mask] = 0

fit_params_["sample_weight"] = curr_sample_weight
# Row sampling by setting sample_weight
indices_as_sample_weight = np.bincount(indices, minlength=n_samples)
fit_params_["sample_weight"] = indices_as_sample_weight
X_ = X[:, features] if requires_feature_indexing else X
estimator_fit(X_, y, **fit_params_)
else:
# cannot use sample_weight, so use indexing
# Row sampling by indexing
y_ = _safe_indexing(y, indices)
X_ = _safe_indexing(X, indices)
fit_params_ = _check_method_params(X, params=fit_params_, indices=indices)
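
The `sample_weight` branch above relies on the equivalence between fitting on repeated rows and fitting with integer weights; a small self-contained sketch of the idea:

import numpy as np

n_samples = 6
indices = np.array([0, 0, 2, 5, 5, 5])  # one bootstrap draw

# counts[i] == number of times row i was drawn into this bag.
counts = np.bincount(indices, minlength=n_samples)
print(counts)  # [2 0 1 0 0 3]

# For an estimator that honors sample weights,
#     estimator.fit(X, y, sample_weight=counts)
# matches estimator.fit(X[indices], y[indices]) without materializing
# the repeated rows, which is why the weighting path is preferred.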
@@ -354,9 +342,11 @@ def fit(self, X, y, sample_weight=None, **fit_params):
regression).

sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if the base estimator supports
sample weighting.
Sample weights. If None, then samples are equally weighted. Used as
probabilities to sample the training set. Note that the expected
frequency semantics for the `sample_weight` parameter are only
fulfilled when sampling with replacement (`bootstrap=True`).

**fit_params : dict
Parameters to pass to the underlying estimators.
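
As a sanity check of the frequency semantics described above, an integer-weighted fit should behave like a fit on a dataset with each row repeated according to its weight. A rough sketch, assuming this PR's behavior (statistical similarity, not bitwise equality):

import numpy as np
from sklearn.ensemble import BaggingRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = X @ np.array([1.0, 2.0, 0.5]) + 0.1 * rng.randn(200)
w = rng.randint(1, 4, size=200)

# Weighted fit: bootstrap indices are drawn with probability proportional to w.
weighted = BaggingRegressor(n_estimators=100, random_state=0)
weighted.fit(X, y, sample_weight=w)

# Repeated fit: row i duplicated w[i] times, uniform bootstrap of equal size,
# so each original row again has per-draw probability w[i] / sum(w).
X_rep, y_rep = np.repeat(X, w, axis=0), np.repeat(y, w)
repeated = BaggingRegressor(n_estimators=100, max_samples=200, random_state=1)
repeated.fit(X_rep, y_rep)

# The two ensembles should agree closely on average.
print(np.corrcoef(weighted.predict(X), repeated.predict(X))[0, 1])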

@@ -386,6 +376,14 @@ def fit(self, X, y, sample_weight=None, **fit_params):
multi_output=True,
)

if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
if sample_weight is not None and not self.bootstrap:
warn(
f"When fitting {self.__class__.__name__} with sample_weight "
f"it is recommended to use bootstrap=True, got {self.bootstrap}."
)

return self._fit(
X,
y,
@@ -435,8 +433,6 @@ def _fit(

sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if the base estimator supports
sample weighting.

**fit_params : dict, default=None
Parameters to pass to the :term:`fit` method of the underlying
@@ -457,18 +453,11 @@
# Check parameters
self._validate_estimator(self._get_estimator())

if sample_weight is not None:
fit_params["sample_weight"] = sample_weight

if _routing_enabled():
routed_params = process_routing(self, "fit", **fit_params)
else:
routed_params = Bunch()
routed_params.estimator = Bunch(fit=fit_params)
if "sample_weight" in fit_params:
routed_params.estimator.fit["sample_weight"] = fit_params[
"sample_weight"
]

if max_depth is not None:
self.estimator_.max_depth = max_depth
@@ -499,6 +488,11 @@ def _fit(
# Store validated integer feature sampling value
self._max_features = max_features

# Store sample_weight (needed in _get_estimators_indices). Note that
# we intentionally do not materialize `sample_weight=None` as an array
# of ones to avoid unnecessarily cluttering trained estimator pickles.
self._sample_weight = sample_weight

# Other checks
if not self.bootstrap and self.oob_score:
raise ValueError("Out of bag estimation only available if bootstrap=True")
@@ -552,6 +546,7 @@ def _fit(
self,
X,
y,
sample_weight,
seeds[starts[i] : starts[i + 1]],
total_n_estimators,
verbose=self.verbose,
@@ -596,6 +591,7 @@ def _get_estimators_indices(self):
self._n_samples,
self._max_features,
self._max_samples,
self._sample_weight,
)

yield feature_indices, sample_indices
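
`_get_estimators_indices` re-draws each estimator's indices from the stored per-estimator seeds (for example to compute out-of-bag estimates), which is why `_sample_weight` has to be kept on the fitted ensemble: the re-draw only reproduces the fit-time draw if it uses the same probabilities. A minimal illustration of that determinism:

import numpy as np

seed = 42
p = np.array([0.1, 0.7, 0.1, 0.1])

# Same seed and same probabilities give back the exact same indices.
a = np.random.RandomState(seed).choice(4, size=10, replace=True, p=p)
b = np.random.RandomState(seed).choice(4, size=10, replace=True, p=p)
assert np.array_equal(a, b)

# Dropping the weights would silently change the re-drawn indices.
c = np.random.RandomState(seed).choice(4, size=10, replace=True)
print(np.array_equal(a, c))  # almost surely False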
@@ -737,8 +733,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
- If float, then draw `max(1, int(max_features * n_features_in_))` features.

bootstrap : bool, default=True
Whether samples are drawn with replacement. If False, sampling
without replacement is performed.
Whether samples are drawn with replacement. If False, sampling without
replacement is performed. If fitting with `sample_weight`, it is
strongly recommended to choose True, as only drawing with replacement
will ensure the expected frequency semantics of `sample_weight`.

bootstrap_features : bool, default=False
Whether features are drawn with replacement.
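
To see why the `bootstrap` recommendation above matters: with replacement, a heavily weighted row can be drawn many times, matching the "weight k behaves like k copies" reading of `sample_weight`; without replacement each row appears at most once, so large weights saturate. A quick numerical illustration:

import numpy as np

rng = np.random.RandomState(0)
p = np.array([0.9, 0.05, 0.05])

# With replacement, row 0 is drawn about 2.7 times out of 3 draws on average.
draws = rng.choice(3, size=(10_000, 3), replace=True, p=p)
print((draws == 0).sum(axis=1).mean())  # ~2.7

# Without replacement, any draw of 3 distinct rows out of 3 contains row 0
# exactly once, so a dominant weight can never yield "about 2.7 copies".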
@@ -1245,8 +1243,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging):
- If float, then draw `max(1, int(max_features * n_features_in_))` features.

bootstrap : bool, default=True
Whether samples are drawn with replacement. If False, sampling
without replacement is performed.
Whether samples are drawn with replacement. If False, sampling without
replacement is performed. If fitting with `sample_weight`, it is
strongly recommended to choose True, as only drawing with replacement
will ensure the expected frequency semantics of `sample_weight`.

bootstrap_features : bool, default=False
Whether features are drawn with replacement.
sklearn/ensemble/_iforest.py (11 additions, 2 deletions)
@@ -20,7 +20,12 @@
from ..utils._chunking import get_chunk_n_rows
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils.parallel import Parallel, delayed
from ..utils.validation import _num_samples, check_is_fitted, validate_data
from ..utils.validation import (
_check_sample_weight,
_num_samples,
check_is_fitted,
validate_data,
)
from ._bagging import BaseBagging

__all__ = ["IsolationForest"]
@@ -317,6 +322,10 @@ def fit(self, X, y=None, sample_weight=None):
X = validate_data(
self, X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False
)

if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=None)

if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
# ensemble sorts the indices.
@@ -350,7 +359,7 @@ def fit(self, X, y=None, sample_weight=None):
super()._fit(
X,
y,
max_samples,
max_samples=max_samples,
max_depth=max_depth,
sample_weight=sample_weight,
check_input=False,
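End to end, the IsolationForest change means `sample_weight` now biases which rows enter each tree's subsample instead of being forwarded to the trees. A usage sketch, assuming this PR is applied:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = rng.randn(500, 2)
w = np.ones(500)
w[:25] = 20.0  # strongly upweight the first 25 rows

# Per-tree subsamples are drawn with probability proportional to w.
iso = IsolationForest(random_state=0).fit(X, sample_weight=w)
print(iso.score_samples(X)[:5])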
sklearn/ensemble/tests/test_bagging.py (22 additions, 22 deletions)
@@ -589,28 +589,6 @@ def test_bagging_with_pipeline():
assert isinstance(estimator[0].steps[-1][1].random_state, int)


class DummyZeroEstimator(BaseEstimator):
def fit(self, X, y):
self.classes_ = np.unique(y)
return self

def predict(self, X):
return self.classes_[np.zeros(X.shape[0], dtype=int)]


def test_bagging_sample_weight_unsupported_but_passed():
estimator = BaggingClassifier(DummyZeroEstimator())
rng = check_random_state(0)

estimator.fit(iris.data, iris.target).predict(iris.data)
with pytest.raises(ValueError):
estimator.fit(
iris.data,
iris.target,
sample_weight=rng.randint(10, size=(iris.data.shape[0])),
)


def test_warm_start(random_state=42):
# Test if fitting incrementally with warm start gives a forest of the
# right size and the same results as a normal fit.
@@ -692,6 +670,28 @@ def test_warm_start_with_oob_score_fails():
clf.fit(X, y)


def test_warning_bootstrap_sample_weight():
X, y = iris.data, iris.target
sample_weight = np.ones_like(y)
clf = BaggingClassifier(bootstrap=False)
warn_msg = (
"When fitting BaggingClassifier with sample_weight "
"it is recommended to use bootstrap=True"
)
with pytest.warns(UserWarning, match=warn_msg):
clf.fit(X, y, sample_weight=sample_weight)

X, y = diabetes.data, diabetes.target
sample_weight = np.ones_like(y)
reg = BaggingRegressor(bootstrap=False)
warn_msg = (
"When fitting BaggingRegressor with sample_weight "
"it is recommended to use bootstrap=True"
)
with pytest.warns(UserWarning, match=warn_msg):
reg.fit(X, y, sample_weight=sample_weight)

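The two nearly identical blocks in the new test could also be collapsed with `pytest.mark.parametrize`; a sketch under the same module-level fixtures (`iris`, `np`, and `pytest` are assumed imported, and the classification data is acceptable for the regressor here):

@pytest.mark.parametrize("Estimator", [BaggingClassifier, BaggingRegressor])
def test_warning_bootstrap_sample_weight_parametrized(Estimator):
    X, y = iris.data, iris.target
    sample_weight = np.ones_like(y, dtype=float)
    warn_msg = (
        f"When fitting {Estimator.__name__} with sample_weight "
        "it is recommended to use bootstrap=True"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        Estimator(bootstrap=False).fit(X, y, sample_weight=sample_weight)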

def test_oob_score_removed_on_warm_start():
X, y = make_hastie_10_2(n_samples=100, random_state=1)
