Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 9b26300

Browse filesBrowse files
DeaMariaLeonSeladusglemaitre
authored and committed
ENH add support for sample_weight in KBinsDiscretizer with strategy="quantile" (scikit-learn#24935)
Co-authored-by: seladus <clement.blancovolle@insa-rouen.fr> Co-authored-by: Seladus <71873495+Seladus@users.noreply.github.com> Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 6fb41e3 commit 9b26300
Copy full SHA for 9b26300

File tree

3 files changed

+123
-15
lines changed
Filter options

3 files changed

+123
-15
lines changed

‎doc/whats_new/v1.3.rst

Copy file name to clipboardExpand all lines: doc/whats_new/v1.3.rst
+10Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,16 @@ Changelog
4242
`feature_union["scalar"]`) to access transformers by name. :pr:`25093` by
4343
`Thomas Fan`_.
4444

45+
:mod:`sklearn.preprocessing`
46+
............................
47+
48+
- |Enhancement| Added support for `sample_weight` in
49+
:class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter
50+
`sample_weight` for each sample to be used while fitting. The option is only
51+
available when `strategy` is set to `quantile`.
52+
:pr:`24935` by :user:`Seladus <seladus>`, :user:`Guillaume Lemaitre <glemaitre>`, and
53+
:user:`Dea María Léon <deamarialeon>`.
54+
4555
Code and Documentation Contributors
4656
-----------------------------------
4757

‎sklearn/preprocessing/_discretization.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/_discretization.py
+29-3Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
from ..utils.validation import check_is_fitted
1717
from ..utils.validation import check_random_state
1818
from ..utils.validation import _check_feature_names_in
19+
from ..utils.validation import _check_sample_weight
20+
from ..utils.stats import _weighted_percentile
1921
from ..utils import _safe_indexing
2022

2123

@@ -182,7 +184,7 @@ def __init__(
182184
self.subsample = subsample
183185
self.random_state = random_state
184186

185-
def fit(self, X, y=None):
187+
def fit(self, X, y=None, sample_weight=None):
186188
"""
187189
Fit the estimator.
188190
@@ -195,6 +197,12 @@ def fit(self, X, y=None):
195197
Ignored. This parameter exists only for compatibility with
196198
:class:`~sklearn.pipeline.Pipeline`.
197199
200+
sample_weight : ndarray of shape (n_samples,)
201+
Contains weight values to be associated with each sample.
202+
Only possible when `strategy` is set to `"quantile"`.
203+
204+
.. versionadded:: 1.3
205+
198206
Returns
199207
-------
200208
self : object
@@ -233,9 +241,19 @@ def fit(self, X, y=None):
233241
'`subsample` must be used with `strategy="quantile"`.'
234242
)
235243

244+
elif sample_weight is not None and self.strategy != "quantile":
245+
raise ValueError(
246+
"`sample_weight` was provided but it can only be "
247+
"used with strategy='quantile'. Got strategy="
248+
f"{self.strategy!r} instead."
249+
)
250+
236251
n_features = X.shape[1]
237252
n_bins = self._validate_n_bins(n_features)
238253

254+
if sample_weight is not None:
255+
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
256+
239257
bin_edges = np.zeros(n_features, dtype=object)
240258
for jj in range(n_features):
241259
column = X[:, jj]
@@ -254,8 +272,16 @@ def fit(self, X, y=None):
254272

255273
elif self.strategy == "quantile":
256274
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
257-
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
258-
275+
if sample_weight is None:
276+
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
277+
else:
278+
bin_edges[jj] = np.asarray(
279+
[
280+
_weighted_percentile(column, sample_weight, q)
281+
for q in quantiles
282+
],
283+
dtype=np.float64,
284+
)
259285
elif self.strategy == "kmeans":
260286
from ..cluster import KMeans # fixes import loops
261287

‎sklearn/preprocessing/tests/test_discretization.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/tests/test_discretization.py
+84-12Lines changed: 84 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,38 @@
1010
assert_array_almost_equal,
1111
assert_array_equal,
1212
assert_allclose_dense_sparse,
13+
assert_allclose,
1314
)
1415

1516
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
1617

1718

1819
@pytest.mark.parametrize(
19-
"strategy, expected",
20+
"strategy, expected, sample_weight",
2021
[
21-
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
22-
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
23-
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
22+
("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None),
23+
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None),
24+
("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None),
25+
(
26+
"quantile",
27+
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
28+
[1, 1, 2, 1],
29+
),
30+
(
31+
"quantile",
32+
[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
33+
[1, 1, 1, 1],
34+
),
35+
(
36+
"quantile",
37+
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
38+
[0, 1, 1, 1],
39+
),
2440
],
2541
)
26-
def test_fit_transform(strategy, expected):
42+
def test_fit_transform(strategy, expected, sample_weight):
2743
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
28-
est.fit(X)
44+
est.fit(X, sample_weight=sample_weight)
2945
assert_array_equal(expected, est.transform(X))
3046

3147

@@ -35,6 +51,18 @@ def test_valid_n_bins():
3551
assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
3652

3753

54+
@pytest.mark.parametrize("strategy", ["uniform", "kmeans"])
55+
def test_kbinsdiscretizer_wrong_strategy_with_weights(strategy):
56+
"""Check that we raise an error when the wrong strategy is used."""
57+
sample_weight = np.ones(shape=(len(X)))
58+
est = KBinsDiscretizer(n_bins=3, strategy=strategy)
59+
err_msg = (
60+
"`sample_weight` was provided but it can only be used with strategy='quantile'."
61+
)
62+
with pytest.raises(ValueError, match=err_msg):
63+
est.fit(X, sample_weight=sample_weight)
64+
65+
3866
def test_invalid_n_bins_array():
3967
# Bad shape
4068
n_bins = np.full((2, 4), 2.0)
@@ -74,17 +102,40 @@ def test_invalid_n_bins_array():
74102

75103

76104
@pytest.mark.parametrize(
77-
"strategy, expected",
105+
"strategy, expected, sample_weight",
78106
[
79-
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
80-
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
81-
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
107+
("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None),
108+
("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None),
109+
("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None),
110+
(
111+
"quantile",
112+
[[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
113+
[1, 1, 3, 1],
114+
),
115+
(
116+
"quantile",
117+
[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
118+
[0, 1, 3, 1],
119+
),
120+
# (
121+
# "quantile",
122+
# [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
123+
# [1, 1, 1, 1],
124+
# ),
125+
#
126+
# TODO: This test case above aims to test if the case where an array of
127+
# ones passed in sample_weight parameter is equal to the case when
128+
# sample_weight is None.
129+
# Unfortunately, the behavior of `_weighted_percentile` when
130+
# `sample_weight = [1, 1, 1, 1]` are currently not equivalent.
131+
# This problem has been addressed in issue:
132+
# https://github.com/scikit-learn/scikit-learn/issues/17370
82133
],
83134
)
84-
def test_fit_transform_n_bins_array(strategy, expected):
135+
def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
85136
est = KBinsDiscretizer(
86137
n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
87-
).fit(X)
138+
).fit(X, sample_weight=sample_weight)
88139
assert_array_equal(expected, est.transform(X))
89140

90141
# test the shape of bin_edges_
@@ -94,6 +145,27 @@ def test_fit_transform_n_bins_array(strategy, expected):
94145
assert bin_edges.shape == (n_bins + 1,)
95146

96147

148+
@pytest.mark.filterwarnings("ignore: Bins whose width are too small")
149+
def test_kbinsdiscretizer_effect_sample_weight():
150+
"""Check the impact of `sample_weight` one computed quantiles."""
151+
X = np.array([[-2], [-1], [1], [3], [500], [1000]])
152+
# add a large number of bins such that each sample with a non-null weight
153+
# will be used as bin edge
154+
est = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
155+
est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0])
156+
assert_allclose(est.bin_edges_[0], [-2, -1, 1, 3])
157+
assert_allclose(est.transform(X), [[0.0], [1.0], [2.0], [2.0], [2.0], [2.0]])
158+
159+
160+
def test_kbinsdiscretizer_no_mutating_sample_weight():
161+
"""Make sure that `sample_weight` is not changed in place."""
162+
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile")
163+
sample_weight = np.array([1, 3, 1, 2], dtype=np.float64)
164+
sample_weight_copy = np.copy(sample_weight)
165+
est.fit(X, sample_weight=sample_weight)
166+
assert_allclose(sample_weight, sample_weight_copy)
167+
168+
97169
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
98170
def test_same_min_max(strategy):
99171
warnings.simplefilter("always")

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.