Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 9b26300

Browse filesBrowse files
DeaMariaLeonSeladusglemaitre
authored and committed
ENH add support for sample_weight in KBinsDiscretizer with strategy="quantile" (scikit-learn#24935)
Co-authored-by: seladus <clement.blancovolle@insa-rouen.fr> Co-authored-by: Seladus <71873495+Seladus@users.noreply.github.com> Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 6fb41e3 commit 9b26300
Copy full SHA for 9b26300

File tree

3 files changed

+123
-15
lines changed
Filter options

3 files changed

+123
-15
lines changed

‎doc/whats_new/v1.3.rst

Copy file name to clipboardExpand all lines: doc/whats_new/v1.3.rst
+10Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,16 @@ Changelog
4242
`feature_union["scalar"]`) to access transformers by name. :pr:`25093` by
4343
`Thomas Fan`_.
4444

45+
:mod:`sklearn.preprocessing`
46+
............................
47+
48+
- |Enhancement| Added support for `sample_weight` in
49+
:class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter
50+
`sample_weight` for each sample to be used while fitting. The option is only
51+
available when `strategy` is set to `quantile`.
52+
:pr:`24935` by :user:`Seladus <seladus>`, :user:`Guillaume Lemaitre <glemaitre>`, and
53+
:user:`Dea María Léon <deamarialeon>`.
54+
4555
Code and Documentation Contributors
4656
-----------------------------------
4757

‎sklearn/preprocessing/_discretization.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/_discretization.py
+29-3Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
from ..utils.validation import check_is_fitted
1717
from ..utils.validation import check_random_state
1818
from ..utils.validation import _check_feature_names_in
19+
from ..utils.validation import _check_sample_weight
20+
from ..utils.stats import _weighted_percentile
1921
from ..utils import _safe_indexing
2022

2123

@@ -182,7 +184,7 @@ def __init__(
182184
self.subsample = subsample
183185
self.random_state = random_state
184186

185-
def fit(self, X, y=None):
187+
def fit(self, X, y=None, sample_weight=None):
186188
"""
187189
Fit the estimator.
188190
@@ -195,6 +197,12 @@ def fit(self, X, y=None):
195197
Ignored. This parameter exists only for compatibility with
196198
:class:`~sklearn.pipeline.Pipeline`.
197199
200+
sample_weight : ndarray of shape (n_samples,)
201+
Contains weight values to be associated with each sample.
202+
Only possible when `strategy` is set to `"quantile"`.
203+
204+
.. versionadded:: 1.3
205+
198206
Returns
199207
-------
200208
self : object
@@ -233,9 +241,19 @@ def fit(self, X, y=None):
233241
'`subsample` must be used with `strategy="quantile"`.'
234242
)
235243

244+
elif sample_weight is not None and self.strategy != "quantile":
245+
raise ValueError(
246+
"`sample_weight` was provided but it can only be "
247+
"used with strategy='quantile'. Got strategy="
248+
f"{self.strategy!r} instead."
249+
)
250+
236251
n_features = X.shape[1]
237252
n_bins = self._validate_n_bins(n_features)
238253

254+
if sample_weight is not None:
255+
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
256+
239257
bin_edges = np.zeros(n_features, dtype=object)
240258
for jj in range(n_features):
241259
column = X[:, jj]
@@ -254,8 +272,16 @@ def fit(self, X, y=None):
254272

255273
elif self.strategy == "quantile":
256274
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
257-
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
258-
275+
if sample_weight is None:
276+
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
277+
else:
278+
bin_edges[jj] = np.asarray(
279+
[
280+
_weighted_percentile(column, sample_weight, q)
281+
for q in quantiles
282+
],
283+
dtype=np.float64,
284+
)
259285
elif self.strategy == "kmeans":
260286
from ..cluster import KMeans # fixes import loops
261287

‎sklearn/preprocessing/tests/test_discretization.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/tests/test_discretization.py
+84-12Lines changed: 84 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,38 @@
1010
assert_array_almost_equal,
1111
assert_array_equal,
1212
assert_allclose_dense_sparse,
13+
assert_allclose,
1314
)
1415

1516
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
1617

1718

1819
@pytest.mark.parametrize(
19-
"strategy, expected",
20+
"strategy, expected, sample_weight",
2021
[
21-
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
22-
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
23-
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
22+
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None),
23+
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None),
24+
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None),
25+
(
26+
"quantile",
27+
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
28+
[1, 1, 2, 1],
29+
),
30+
(
31+
"quantile",
32+
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
33+
[1, 1, 1, 1],
34+
),
35+
(
36+
"quantile",
37+
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
38+
[0, 1, 1, 1],
39+
),
2440
],
2541
)
26-
def test_fit_transform(strategy, expected):
42+
def test_fit_transform(strategy, expected, sample_weight):
2743
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
28-
est.fit(X)
44+
est.fit(X, sample_weight=sample_weight)
2945
assert_array_equal(expected, est.transform(X))
3046

3147

@@ -35,6 +51,18 @@ def test_valid_n_bins():
3551
assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
3652

3753

54+
@pytest.mark.parametrize("strategy", ["uniform", "kmeans"])
55+
def test_kbinsdiscretizer_wrong_strategy_with_weights(strategy):
56+
"""Check that we raise an error when the wrong strategy is used."""
57+
sample_weight = np.ones(shape=(len(X)))
58+
est = KBinsDiscretizer(n_bins=3, strategy=strategy)
59+
err_msg = (
60+
"`sample_weight` was provided but it can only be used with strategy='quantile'."
61+
)
62+
with pytest.raises(ValueError, match=err_msg):
63+
est.fit(X, sample_weight=sample_weight)
64+
65+
3866
def test_invalid_n_bins_array():
3967
# Bad shape
4068
n_bins = np.full((2, 4), 2.0)
@@ -74,17 +102,40 @@ def test_invalid_n_bins_array():
74102

75103

76104
@pytest.mark.parametrize(
77-
"strategy, expected",
105+
"strategy, expected, sample_weight",
78106
[
79-
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
80-
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
81-
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
107+
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None),
108+
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None),
109+
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None),
110+
(
111+
"quantile",
112+
[[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
113+
[1, 1, 3, 1],
114+
),
115+
(
116+
"quantile",
117+
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
118+
[0, 1, 3, 1],
119+
),
120+
# (
121+
# "quantile",
122+
# [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
123+
# [1, 1, 1, 1],
124+
# ),
125+
#
126+
# TODO: This test case above aims to test if the case where an array of
127+
# ones passed in sample_weight parameter is equal to the case when
128+
# sample_weight is None.
129+
# Unfortunately, the behavior of `_weighted_percentile` when
130+
# `sample_weight = [1, 1, 1, 1]` are currently not equivalent.
131+
# This problem has been addressed in issue:
132+
# https://github.com/scikit-learn/scikit-learn/issues/17370
82133
],
83134
)
84-
def test_fit_transform_n_bins_array(strategy, expected):
135+
def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
85136
est = KBinsDiscretizer(
86137
n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
87-
).fit(X)
138+
).fit(X, sample_weight=sample_weight)
88139
assert_array_equal(expected, est.transform(X))
89140

90141
# test the shape of bin_edges_
@@ -94,6 +145,27 @@ def test_fit_transform_n_bins_array(strategy, expected):
94145
assert bin_edges.shape == (n_bins + 1,)
95146

96147

148+
@pytest.mark.filterwarnings("ignore: Bins whose width are too small")
149+
def test_kbinsdiscretizer_effect_sample_weight():
150+
"""Check the impact of `sample_weight` one computed quantiles."""
151+
X = np.array([[-2], [-1], [1], [3], [500], [1000]])
152+
# add a large number of bins such that each sample with a non-null weight
153+
# will be used as bin edge
154+
est = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
155+
est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0])
156+
assert_allclose(est.bin_edges_[0], [-2, -1, 1, 3])
157+
assert_allclose(est.transform(X), [[0.0], [1.0], [2.0], [2.0], [2.0], [2.0]])
158+
159+
160+
def test_kbinsdiscretizer_no_mutating_sample_weight():
161+
"""Make sure that `sample_weight` is not changed in place."""
162+
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile")
163+
sample_weight = np.array([1, 3, 1, 2], dtype=np.float64)
164+
sample_weight_copy = np.copy(sample_weight)
165+
est.fit(X, sample_weight=sample_weight)
166+
assert_allclose(sample_weight, sample_weight_copy)
167+
168+
97169
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
98170
def test_same_min_max(strategy):
99171
warnings.simplefilter("always")

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.