From 779ce507d0ab501650603542cc1bf5ffc652e10e Mon Sep 17 00:00:00 2001 From: Carlo Date: Fri, 5 May 2023 02:53:26 -0300 Subject: [PATCH 01/12] add sample_weight to ohe draft --- sklearn/preprocessing/_encoders.py | 14 +++- sklearn/preprocessing/tests/test_encoders.py | 60 ++++++++++++++++ sklearn/utils/_encode.py | 74 +++++++++++++++++--- 3 files changed, 135 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index fd9941f5336ed..2e3c563a3876e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -70,6 +70,7 @@ def _check_X(self, X, force_all_finite=True): def _fit( self, X, + sample_weight=None, handle_unknown="error", force_all_finite=True, return_counts=False, @@ -98,7 +99,9 @@ def _fit( Xi = X_list[i] if self.categories == "auto": - result = _unique(Xi, return_counts=compute_counts) + result = _unique( + Xi, sample_weight=sample_weight, return_counts=compute_counts + ) if compute_counts: cats, counts = result category_counts.append(counts) @@ -147,7 +150,7 @@ def _fit( ) raise ValueError(msg) if compute_counts: - category_counts.append(_get_counts(Xi, cats)) + category_counts.append(_get_counts(Xi, cats, sample_weight)) self.categories_.append(cats) @@ -281,6 +284,8 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): If there are infrequent categories, indices of infrequent categories. Otherwise None. """ + # TODO ohe_sw: We would have to change this... + # But it really makes sense... :( if isinstance(self.min_frequency, numbers.Integral): infrequent_mask = category_count < self.min_frequency elif isinstance(self.min_frequency, numbers.Real): @@ -953,7 +958,7 @@ def _compute_n_features_outs(self): return output - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """ Fit OneHotEncoder to X. @@ -971,6 +976,8 @@ def fit(self, X, y=None): self Fitted encoder. """ + # TODO ohe_sw: Add to docstring that `sample_weight` is only used when + # `max_categories` or `min_frequency` are not default values. 
self._validate_params() if self.sparse != "deprecated": @@ -986,6 +993,7 @@ def fit(self, X, y=None): self._fit( X, + sample_weight=sample_weight, handle_unknown=self.handle_unknown, force_all_finite="allow-nan", ) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 42c66980bfeba..847838d1bed04 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2303,3 +2303,63 @@ def test_ordinal_encoder_missing_appears_infrequent(): ) X_trans = ordinal.transform(X_test) assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]) + + +@pytest.mark.parametrize( + "X, sample_weight, expected_shape", + [ + ( + [ + ["car", 3], + ["bike", 3], + ["car", 1], + ["bike", 3], + ["boat", 2], + ["airplane", 4], + ], + np.array([2, 2.5, 0.5, 0.1, 0, 0]), + (6, 5), + ), + ( + [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]], + np.array([5, 5, 0.1, 0.3, 4, 0.9]), + (6, 3), + ), + ], +) +def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape): + ohe = OneHotEncoder(min_frequency=2) + X_trans = ohe.fit_transform(X, sample_weight=sample_weight) + + print(X_trans.toarray()) + assert X_trans.shape == expected_shape + + +@pytest.mark.parametrize( + "X, sample_weight, expected_shape", + [ + ( + [ + ["car", 3], + ["bike", 3], + ["car", 1], + ["bike", 3], + ["boat", 2], + ["airplane", 4], + ], + np.array([2, 2.5, 0.5, 0.1, 0, 0]), + (6, 4), + ), + ( + [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]], + np.array([5, 5, 0.1, 0.3, 4, 0.9]), + (6, 2), + ), + ], +) +def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape): + ohe = OneHotEncoder(max_categories=2) + X_trans = ohe.fit_transform(X, sample_weight=sample_weight) + + print(X_trans.toarray()) + assert X_trans.shape == expected_shape diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index de48890fcaacf..df2b6c5151dd1 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -4,9 +4,10 @@ import numpy as np from . import is_scalar_nan +from .validation import _check_sample_weight -def _unique(values, *, return_inverse=False, return_counts=False): +def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=False): """Helper function to find unique values with support for python objects. 
Uses pure python method for object dtype, and numpy method for @@ -39,19 +40,70 @@ def _unique(values, *, return_inverse=False, return_counts=False): """ if values.dtype == object: return _unique_python( - values, return_inverse=return_inverse, return_counts=return_counts + values, + return_inverse=return_inverse, + return_counts=return_counts, + sample_weight=sample_weight, ) # numerical return _unique_np( - values, return_inverse=return_inverse, return_counts=return_counts + values, + sample_weight=sample_weight, + return_inverse=return_inverse, + return_counts=return_counts, ) -def _unique_np(values, return_inverse=False, return_counts=False): +def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False): + # TODO ohe_sw: add one line docstring + sample_weight = _check_sample_weight(sample_weight, arr) + + sorted_indices = np.argsort(arr) + sorted_arr = arr[sorted_indices] + sorted_sample_weight = sample_weight[sorted_indices] + + # TODO ohe_sw: Using two `np.unique` is certainly suboptimal, but for now I can't + # see how to build the `unique_inverse` of `arr` with the `unique_inverse` of + # `sorted_arr`. + unique_elements, unique_indices = np.unique(sorted_arr, return_index=True) + _, unique_inverse = np.unique(arr, return_inverse=True) + + unique_indices = np.append(unique_indices, len(arr)) + subarrays = np.split(sorted_sample_weight, unique_indices[1:]) + group_sums = np.array( + [np.sum(subarray.astype(float)) for subarray in subarrays[:-1]] + ) + + results = [unique_elements] + if return_inverse: + results.append(unique_inverse) + if return_counts: + results.append(group_sums) + + print(*results) + if len(results) > 1: + return tuple(results) + return results[0] + + +def _unique_np(values, sample_weight=None, return_inverse=False, return_counts=False): """Helper function to find unique values for numpy arrays that correctly accounts for nans. See `_unique` documentation for details.""" - uniques = np.unique( - values, return_inverse=return_inverse, return_counts=return_counts + # if sample_weight is None: + # uniques = np.unique( + # values, return_inverse=return_inverse, return_counts=return_counts + # ) + # else: + # TODO ohe_sw: _unique_groupby_sum is behaving like usual `np.unique` + # when `sample_weight=None`, ie, "`sample_weight=np.ones_like(X)`" because of + # `utils.validation._check_sample_weight`. + # Leaving the above lines comment for now because I want to show that the + # behaviour is the same. + uniques = _unique_groupby_sum( + values, + sample_weight=sample_weight, + return_inverse=return_inverse, + return_counts=return_counts, ) inverse, counts = None, None @@ -164,7 +216,7 @@ def _map_to_integer(values, uniques): return np.array([table[v] for v in values]) -def _unique_python(values, *, return_inverse, return_counts): +def _unique_python(values, *, return_inverse, return_counts, sample_weight=None): # Only used in `_uniques`, see docstring there for details try: uniques_set = set(values) @@ -185,7 +237,7 @@ def _unique_python(values, *, return_inverse, return_counts): ret += (_map_to_integer(values, uniques),) if return_counts: - ret += (_get_counts(values, uniques),) + ret += (_get_counts(values, uniques, sample_weight),) return ret[0] if len(ret) == 1 else ret @@ -339,7 +391,7 @@ def __missing__(self, key): raise KeyError(key) -def _get_counts(values, uniques): +def _get_counts(values, uniques, sample_weight=None): """Get the count of each of the `uniques` in `values`. The counts will use the order passed in by `uniques`. 
For non-object dtypes, @@ -353,7 +405,9 @@ def _get_counts(values, uniques): output[i] = counter[item] return output - unique_values, counts = _unique_np(values, return_counts=True) + unique_values, counts = _unique_np( + values, sample_weight=sample_weight, return_counts=True + ) # Recorder unique_values based on input: `uniques` uniques_in_values = np.isin(uniques, unique_values, assume_unique=True) From bf6951c0ccf248e87191fb1fc3ca7b0e6b18c32b Mon Sep 17 00:00:00 2001 From: Carlo Date: Fri, 5 May 2023 03:07:18 -0300 Subject: [PATCH 02/12] update changelog --- doc/whats_new/v1.3.rst | 4 ++++ sklearn/utils/_encode.py | 1 + 2 files changed, 5 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index bb245aa466152..b7e06ce3bd0da 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -467,6 +467,10 @@ Changelog The callable combines input arguments `(input_feature, category)` to a string. :pr:`22506` by :user:`Mario Kostelac `. +- |Enhancement| Adds support for `sample_weight` in + :class:`preprocessing.OneHotEncoder`. TODO ohe_sw. + :pr:`26330` by :user:`Carlo Lemos `. + - |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter `sample_weight` for each sample to be used while fitting. The option is only diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index df2b6c5151dd1..7de32034595ab 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -56,6 +56,7 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False): # TODO ohe_sw: add one line docstring + # TODO ohe_sw: create tests sample_weight = _check_sample_weight(sample_weight, arr) sorted_indices = np.argsort(arr) From 3eb2c8476e924c716af24fcd374b0486f75ba82b Mon Sep 17 00:00:00 2001 From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com> Date: Fri, 5 May 2023 14:34:37 -0300 Subject: [PATCH 03/12] removing extra debugging print --- sklearn/utils/_encode.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 7de32034595ab..0b453659c50b3 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -81,7 +81,6 @@ def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts= if return_counts: results.append(group_sums) - print(*results) if len(results) > 1: return tuple(results) return results[0] From a7a39404cfdd6c5a868a063df58cddd5c5cc8903 Mon Sep 17 00:00:00 2001 From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com> Date: Sat, 6 May 2023 19:13:28 -0300 Subject: [PATCH 04/12] make ci green with temporary sample_weight doc --- sklearn/preprocessing/_encoders.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 2e3c563a3876e..d37939b0fe35c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -971,6 +971,9 @@ def fit(self, X, y=None, sample_weight=None): Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
+
         Returns
         -------
         self
             Fitted encoder.
         """

From 8d827f08f9327b9740f79658ae21cf274ae2bfdb Mon Sep 17 00:00:00 2001
From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com>
Date: Sat, 6 May 2023 19:20:47 -0300
Subject: [PATCH 05/12] linting

---
 sklearn/preprocessing/_encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index d37939b0fe35c..45f18c341fe6d 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -971,7 +971,7 @@ def fit(self, X, y=None, sample_weight=None):
             Ignored. This parameter exists only for compatibility with
             :class:`~sklearn.pipeline.Pipeline`.
 
-        sample_weight : array-like of shape (n_samples,), default=None 
+        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
 
         Returns

From be69aecc95baaf659f464f60fec5e56b8c9b4efc Mon Sep 17 00:00:00 2001
From: vitaliset
Date: Fri, 23 Jun 2023 00:46:13 -0300
Subject: [PATCH 06/12] first version of ohe with sw

---
 sklearn/preprocessing/_encoders.py           | 31 +++++++-----
 sklearn/preprocessing/tests/test_encoders.py | 41 ++++++++++++--
 sklearn/utils/_encode.py                     | 50 ++++++++--------
 sklearn/utils/tests/test_encode.py           | 25 ++++++++
 4 files changed, 105 insertions(+), 42 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 45f18c341fe6d..9162d90af5ad4 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -83,6 +83,8 @@ def _fit(
             X, force_all_finite=force_all_finite
         )
         self.n_features_in_ = n_features
+        if sample_weight is not None:
+            n_samples = np.sum(sample_weight)
 
         if self.categories != "auto":
             if len(self.categories) != n_features:
@@ -269,11 +271,13 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):
 
         Parameters
         ----------
-        category_count : ndarray of shape (n_cardinality,)
-            Category counts.
+        category_count : array-like of shape (n_cardinality,)
+            Category counts or sum of `sample_weight` for the samples from the
+            category when `sample_weight` is different from `None`.
 
         n_samples : int
-            Number of samples.
+            Number of samples in training set or total sum of `sample_weight`
+            for all samples when `sample_weight` is different from `None`.
 
         col_idx : int
             Index of the current category. Only used for the error message.
@@ -284,8 +288,6 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):
             If there are infrequent categories, indices of infrequent
             categories. Otherwise None.
         """
-        # TODO ohe_sw: We would have to change this...
-        # But it really makes sense... :(
         if isinstance(self.min_frequency, numbers.Integral):
             infrequent_mask = category_count < self.min_frequency
         elif isinstance(self.min_frequency, numbers.Real):
@@ -338,7 +340,8 @@ def _fit_infrequent_category_mapping(
         Parameters
         ----------
         n_samples : int
-            Number of samples in training set.
+            Number of samples in training set or total sum of `sample_weight`
+            for all samples when `sample_weight` is different from `None`.
 
         category_counts: list of ndarray
             `category_counts[i]` is the category counts corresponding to
             `self.categories_[i]`.
@@ -563,13 +566,15 @@ class OneHotEncoder(_BaseEncoder):
 
     min_frequency : int or float, default=None
         Specifies the minimum frequency below which a category will be
-        considered infrequent.
+        considered infrequent. If `sample_weight` is passed to `fit`, the
+        count for a category is the sum of the weights of its samples.
 
         - If `int`, categories with a smaller cardinality will be
          considered infrequent.
 
        - If `float`, categories with a smaller cardinality than
-          `min_frequency * n_samples` will be considered infrequent.
+          `min_frequency * n_samples` will be considered infrequent. If
+          `sample_weight` is not `None`, then `n_samples = sum(sample_weight)`.
 
        .. versionadded:: 1.1
 
        Read more in the :ref:`User Guide `.
@@ -972,15 +977,17 @@ def fit(self, X, y=None, sample_weight=None):
             :class:`~sklearn.pipeline.Pipeline`.
 
         sample_weight : array-like of shape (n_samples,), default=None
-            Sample weights. If None, then samples are equally weighted.
+            Sample weights used when filtering categories with
+            `max_categories` and `min_frequency`. If `None`, then samples
+            are equally weighted. If both `max_categories` and
+            `min_frequency` are set to default values, then `sample_weight`
+            is ignored.
 
         Returns
         -------
         self
             Fitted encoder.
         """
-        # TODO ohe_sw: Add to docstring that `sample_weight` is only used when
-        # `max_categories` or `min_frequency` are not default values.
         self._validate_params()
 
         if self.sparse != "deprecated":
@@ -1300,7 +1307,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
           infrequent.
 
         - If `float`, categories with a smaller cardinality than
-          `min_frequency * n_samples` will be considered infrequent.
+          `min_frequency * n_samples` will be considered infrequent. 
 
        .. versionadded:: 1.3
 
        Read more in the :ref:`User Guide `.
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 847838d1bed04..304f8877e57cc 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2327,11 +2327,15 @@ def test_ordinal_encoder_missing_appears_infrequent():
         ),
     ],
 )
+@pytest.mark.parametrize(
+    "min_frequency",
+    [0.3, 0.9, 2],
+)
-def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape):
+def test_one_hot_encoder_sample_weight_min_frequency(
+    X, sample_weight, expected_shape, min_frequency
+):
     ohe = OneHotEncoder(min_frequency=2)
     X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
-
-    print(X_trans.toarray())
     assert X_trans.shape == expected_shape
@@ -2360,6 +2364,33 @@ def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_
 def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape):
     ohe = OneHotEncoder(max_categories=2)
     X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
-
-    print(X_trans.toarray())
     assert X_trans.shape == expected_shape
+
+
+@pytest.mark.parametrize(
+    "X",
+    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
+)
+@pytest.mark.parametrize(
+    "min_frequency",
+    [0.1, 0.3, 0.5, 0.9],
+)
+def test_one_hot_encoder_sample_weight_constant(X, min_frequency):
+    ohe = OneHotEncoder(min_frequency=min_frequency)
+    X_sw_None = ohe.fit_transform(X, sample_weight=None).toarray()
+    X_sw_constant1 = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()
+    X_sw_constant5 = ohe.fit_transform(X, sample_weight=5 * np.ones(len(X))).toarray()
+
+    assert_array_equal(X_sw_None, X_sw_constant1)
+    assert_array_equal(X_sw_None, X_sw_constant5)
+
+@pytest.mark.parametrize(
+    "X",
+    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
+)
+def test_one_hot_encoder_sample_weight_is_ignored(X):
+    ohe = OneHotEncoder()
+    X_sw_None = ohe.fit_transform(X).toarray()
+    X_sw_ones = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()
+
+    assert_array_equal(X_sw_None, X_sw_ones)
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
index 0b453659c50b3..b677348dccd57 100644
--- a/sklearn/utils/_encode.py
+++ b/sklearn/utils/_encode.py
@@ -48,24 +48,21 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F
     # numerical
     return _unique_np(
         values,
-        sample_weight=sample_weight,
         return_inverse=return_inverse,
         return_counts=return_counts,
+        sample_weight=sample_weight,
     )
 
 
 def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False):
-    # TODO ohe_sw: add one line docstring
-    # TODO ohe_sw: create tests
+    """This functions behaves like np.unique but it counts the values of `arr` taking
+    into acount `samplt_weight`."""
     sample_weight = _check_sample_weight(sample_weight, arr)
 
     sorted_indices = np.argsort(arr)
     sorted_arr = arr[sorted_indices]
     sorted_sample_weight = sample_weight[sorted_indices]
 
-    # TODO ohe_sw: Using two `np.unique` is certainly suboptimal, but for now I can't
-    # see how to build the `unique_inverse` of `arr` with the `unique_inverse` of
-    # `sorted_arr`.
     unique_elements, unique_indices = np.unique(sorted_arr, return_index=True)
     _, unique_inverse = np.unique(arr, return_inverse=True)
 
@@ -86,25 +83,20 @@ def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=
     return results[0]
 
 
-def _unique_np(values, sample_weight=None, return_inverse=False, return_counts=False):
+def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None):
     """Helper function to find unique values for numpy arrays that correctly
     accounts for nans. See `_unique` documentation for details."""
-    # if sample_weight is None:
-    #     uniques = np.unique(
-    #         values, return_inverse=return_inverse, return_counts=return_counts
-    #     )
-    # else:
-    # TODO ohe_sw: _unique_groupby_sum is behaving like usual `np.unique`
-    # when `sample_weight=None`, ie, "`sample_weight=np.ones_like(X)`" because of
-    # `utils.validation._check_sample_weight`.
-    # Leaving the above lines comment for now because I want to show that the
-    # behaviour is the same.
-    uniques = _unique_groupby_sum(
-        values,
-        sample_weight=sample_weight,
-        return_inverse=return_inverse,
-        return_counts=return_counts,
-    )
+    if sample_weight is None:
+        uniques = np.unique(
+            values, return_inverse=return_inverse, return_counts=return_counts
+        )
+    else:
+        uniques = _unique_groupby_sum(
+            values,
+            sample_weight=sample_weight,
+            return_inverse=return_inverse,
+            return_counts=return_counts,
+        )
 
     inverse, counts = None, None
 
@@ -394,6 +386,9 @@ def __missing__(self, key):
 def _get_counts(values, uniques, sample_weight=None):
     """Get the count of each of the `uniques` in `values`.
 
+    If `sample_weight` is not `None` then the count is actually the sum of
+    `sample_weight` for that unique value.
+
     The counts will use the order passed in by `uniques`. For non-object dtypes,
     `uniques` is assumed to be sorted and `np.nan` is at the end.
     """
@@ -402,11 +397,16 @@ def _get_counts(values, uniques, sample_weight=None):
         output = np.zeros(len(uniques), dtype=np.int64)
         for i, item in enumerate(uniques):
             with suppress(KeyError):
-                output[i] = counter[item]
+                if sample_weight is None:
+                    output[i] = counter[item]
+                else:
+                    # TODO ohe_sw: I need to create tests for this. Is this
+                    # values == item working for NaN items?
+ output[i] = np.sum(sample_weight[values == item]) return output unique_values, counts = _unique_np( - values, sample_weight=sample_weight, return_counts=True + values, return_counts=True, sample_weight=sample_weight ) # Recorder unique_values based on input: `uniques` diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index 083db25b7ca80..48da80e5a6a17 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -8,6 +8,7 @@ from sklearn.utils._encode import _encode from sklearn.utils._encode import _check_unknown from sklearn.utils._encode import _get_counts +from sklearn.utils._encode import _unique_groupby_sum @pytest.mark.parametrize( @@ -275,3 +276,27 @@ def test_check_unknown_with_both_missing_values(): def test_get_counts(values, uniques, expected_counts): counts = _get_counts(values, uniques) assert_array_equal(counts, expected_counts) + + +@pytest.mark.parametrize( + "arr, sample_weight, expected_unique, expected_sum", + [ + ( + np.array([1] * 3 + [2] * 2 + [3]), + np.array([0, 1, 2, 3, 4, 5]), + [1, 2, 3], + [3, 7, 5], + ), + ( + np.array([3] + [2] * 2 + [1] * 3), + np.array([5, 3, 4, 2, 1, 0]), + [1, 2, 3], + [3, 7, 5], + ), + ], +) +def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum): + # TODO ohe_sw: Do more parametrize scenarios + unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True) + assert_array_equal(groupby_sum, expected_sum) + assert_array_equal(unique, expected_unique) From 94871acf1eb1533ed88f7503c85c235a36e97377 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Fri, 23 Jun 2023 00:51:04 -0300 Subject: [PATCH 07/12] updating changelog description and small linting --- doc/whats_new/v1.3.rst | 5 +++-- sklearn/preprocessing/tests/test_encoders.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 4d5d4e919c3fa..c616a30afb2df 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -490,8 +490,9 @@ Changelog :pr:`22506` by :user:`Mario Kostelac `. - |Enhancement| Adds support for `sample_weight` in - :class:`preprocessing.OneHotEncoder`. TODO ohe_sw. - :pr:`26330` by :user:`Carlo Lemos `. + :class:`preprocessing.OneHotEncoder`. When using `sample_weight`, then `min_frequency` + and `max_categories` will filter according to sum samples' weight for that category + instead of count. :pr:`26330` by :user:`Carlo Lemos `. - |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`. 
This allows specifying the parameter diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 304f8877e57cc..e2ec08fa0021a 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2384,6 +2384,7 @@ def test_one_hot_encoder_sample_weight_constant(X, min_frequency): assert_array_equal(X_sw_None, X_sw_constant1) assert_array_equal(X_sw_None, X_sw_constant5) + @pytest.mark.parametrize( "X", [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]], From 8e6db11435dcbbc378ef591f6ed9a24f18533be9 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Fri, 23 Jun 2023 01:02:57 -0300 Subject: [PATCH 08/12] resolve linting error introduced in conflit --- sklearn/utils/tests/test_encode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index 342e049de5d36..f600073b45f20 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -5,7 +5,11 @@ from numpy.testing import assert_array_equal from sklearn.utils._encode import ( - _check_unknown, _encode, _get_counts, _unique, _unique_groupby_sum + _check_unknown, + _encode, + _get_counts, + _unique, + _unique_groupby_sum ) From 51f9eae4b1feea8314631374d5b24b49b79e7285 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Fri, 23 Jun 2023 01:07:48 -0300 Subject: [PATCH 09/12] black --- sklearn/utils/tests/test_encode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index f600073b45f20..e95e3ec5e0343 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -5,11 +5,11 @@ from numpy.testing import assert_array_equal from sklearn.utils._encode import ( - _check_unknown, - _encode, - _get_counts, - _unique, - _unique_groupby_sum + _check_unknown, + _encode, + _get_counts, + _unique, + _unique_groupby_sum, ) From 6fa547fe677753c29a04e7f332d4683ddc87c759 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Sun, 9 Jun 2024 18:05:21 -0300 Subject: [PATCH 10/12] updating PR to fit the array api introduced in 27381 --- sklearn/utils/_array_api.py | 78 +++++++++++++++++++++++---- sklearn/utils/_encode.py | 50 ++--------------- sklearn/utils/tests/test_array_api.py | 25 +++++++++ sklearn/utils/tests/test_encode.py | 32 +---------- 4 files changed, 98 insertions(+), 87 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 7bf9183c80772..76af192c0c360 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -397,19 +397,37 @@ def asarray(self, x, *, dtype=None, device=None, copy=None): # noqa else: return numpy.asarray(x, dtype=dtype) - def unique_inverse(self, x): - return numpy.unique(x, return_inverse=True) + def unique_inverse(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique(x, return_inverse=True) + else: + return _unique_groupby_sum(x, sample_weight, return_inverse=True) - def unique_counts(self, x): - return numpy.unique(x, return_counts=True) + def unique_counts(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique(x, return_counts=True) + else: + return _unique_groupby_sum(x, sample_weight, return_counts=True) - def unique_values(self, x): - return numpy.unique(x) + def unique_values(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique(x) + else: + return 
_unique_groupby_sum(x, sample_weight) - def unique_all(self, x): - return numpy.unique( - x, return_index=True, return_inverse=True, return_counts=True - ) + def unique_all(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique( + x, return_index=True, return_inverse=True, return_counts=True + ) + else: + return _unique_groupby_sum( + x, + sample_weight, + return_index=True, + return_inverse=True, + return_counts=True, + ) def concat(self, arrays, *, axis=None): return numpy.concatenate(arrays, axis=axis) @@ -967,3 +985,43 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): return ret[: ar1.shape[0]] else: return xp.take(ret, rev_idx, axis=0) + + +def _unique_groupby_sum( + arr, + sample_weight, + return_index=False, + return_inverse=False, + return_counts=False, +): + """This functions behaves like numpy.unique but it counts the values of `arr` + taking into acount `sample_weight`.""" + if sample_weight is None: + sample_weight = numpy.ones(len(arr)) + # TODO ohe_sw: sample_weight = _check_sample_weight(sample_weight, arr) + # gave me circular import. What should I do? + + sorted_indices = numpy.argsort(arr) + sorted_arr = arr[sorted_indices] + sorted_sample_weight = sample_weight[sorted_indices] + + unique_elements, unique_indices = numpy.unique(sorted_arr, return_index=True) + _, unique_inverse = numpy.unique(arr, return_inverse=True) + + unique_indices = numpy.append(unique_indices, len(arr)) + subarrays = numpy.split(sorted_sample_weight, unique_indices[1:]) + group_sums = numpy.array( + [numpy.sum(subarray.astype(float)) for subarray in subarrays[:-1]] + ) + + results = [unique_elements] + if return_index: + results.append(unique_indices) + if return_inverse: + results.append(unique_inverse) + if return_counts: + results.append(group_sums) + + if len(results) > 1: + return tuple(results) + return results[0] diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 608de23b88713..2d5ae33051529 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -4,8 +4,6 @@ import numpy as np -from . import is_scalar_nan -from .validation import _check_sample_weight from ._array_api import ( _isin, _searchsorted, @@ -63,61 +61,21 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F ) -def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False): - """This functions behaves like np.unique but it counts the values of `arr` taking - into acount `samplt_weight`.""" - sample_weight = _check_sample_weight(sample_weight, arr) - - sorted_indices = np.argsort(arr) - sorted_arr = arr[sorted_indices] - sorted_sample_weight = sample_weight[sorted_indices] - - unique_elements, unique_indices = np.unique(sorted_arr, return_index=True) - _, unique_inverse = np.unique(arr, return_inverse=True) - - unique_indices = np.append(unique_indices, len(arr)) - subarrays = np.split(sorted_sample_weight, unique_indices[1:]) - group_sums = np.array( - [np.sum(subarray.astype(float)) for subarray in subarrays[:-1]] - ) - - results = [unique_elements] - if return_inverse: - results.append(unique_inverse) - if return_counts: - results.append(group_sums) - - if len(results) > 1: - return tuple(results) - return results[0] - - def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None): """Helper function to find unique values for numpy arrays that correctly accounts for nans. 
See `_unique` documentation for details.""" -# if sample_weight is None: -# uniques = np.unique( -# values, return_inverse=return_inverse, return_counts=return_counts -# ) -# else: -# uniques = _unique_groupby_sum( -# values, -# sample_weight=sample_weight, -# return_inverse=return_inverse, -# return_counts=return_counts, -# ) xp, _ = get_namespace(values) inverse, counts = None, None if return_inverse and return_counts: - uniques, _, inverse, counts = xp.unique_all(values) + uniques, _, inverse, counts = xp.unique_all(values, sample_weight=sample_weight) elif return_inverse: - uniques, inverse = xp.unique_inverse(values) + uniques, inverse = xp.unique_inverse(values, sample_weight=sample_weight) elif return_counts: - uniques, counts = xp.unique_counts(values) + uniques, counts = xp.unique_counts(values, sample_weight=sample_weight) else: - uniques = xp.unique_values(values) + uniques = xp.unique_values(values, sample_weight=sample_weight) # np.unique will have duplicate missing values at the end of `uniques` # here we clip the nans and remove it from uniques diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 25913e7f54846..50caee99be36d 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -20,6 +20,7 @@ _nanmin, _NumPyAPIWrapper, _ravel, + _unique_groupby_sum, device, get_namespace, get_namespace_and_device, @@ -566,3 +567,27 @@ def test_get_namespace_and_device(): assert namespace is xp_torch assert is_array_api assert device == some_torch_tensor.device + + +@pytest.mark.parametrize( + "arr, sample_weight, expected_unique, expected_sum", + [ + ( + numpy.array([1] * 3 + [2] * 2 + [3]), + numpy.array([0, 1, 2, 3, 4, 5]), + [1, 2, 3], + [3, 7, 5], + ), + ( + numpy.array([3] + [2] * 2 + [1] * 3), + numpy.array([5, 3, 4, 2, 1, 0]), + [1, 2, 3], + [3, 7, 5], + ), + ], +) +def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum): + # TODO ohe_sw: Do more parametrize scenarios + unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True) + assert_array_equal(groupby_sum, expected_sum) + assert_array_equal(unique, expected_unique) diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index e95e3ec5e0343..9118eb56f0ba4 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -4,13 +4,7 @@ import pytest from numpy.testing import assert_array_equal -from sklearn.utils._encode import ( - _check_unknown, - _encode, - _get_counts, - _unique, - _unique_groupby_sum, -) +from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique @pytest.mark.parametrize( @@ -278,27 +272,3 @@ def test_check_unknown_with_both_missing_values(): def test_get_counts(values, uniques, expected_counts): counts = _get_counts(values, uniques) assert_array_equal(counts, expected_counts) - - -@pytest.mark.parametrize( - "arr, sample_weight, expected_unique, expected_sum", - [ - ( - np.array([1] * 3 + [2] * 2 + [3]), - np.array([0, 1, 2, 3, 4, 5]), - [1, 2, 3], - [3, 7, 5], - ), - ( - np.array([3] + [2] * 2 + [1] * 3), - np.array([5, 3, 4, 2, 1, 0]), - [1, 2, 3], - [3, 7, 5], - ), - ], -) -def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum): - # TODO ohe_sw: Do more parametrize scenarios - unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True) - assert_array_equal(groupby_sum, expected_sum) - assert_array_equal(unique, expected_unique) From 
7e0ea84faae4b80e6de9826534dbdfdd8b995605 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Mon, 10 Jun 2024 00:17:11 -0300 Subject: [PATCH 11/12] update new api logic properly, update tests on sample_weight and adjustments to sum(sample_weight)=0 scenario --- sklearn/preprocessing/_encoders.py | 11 ++- sklearn/preprocessing/tests/test_encoders.py | 10 +-- sklearn/utils/_array_api.py | 78 +++----------------- sklearn/utils/_encode.py | 65 ++++++++++++++-- sklearn/utils/tests/test_array_api.py | 25 ------- sklearn/utils/tests/test_encode.py | 37 +++++++++- 6 files changed, 118 insertions(+), 108 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8ed398b9352f8..f1e6d6833de32 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -16,7 +16,11 @@ from ..utils._missing import is_scalar_nan from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils._set_output import _get_output_config -from ..utils.validation import _check_feature_names_in, check_is_fitted +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + check_is_fitted, +) __all__ = ["OneHotEncoder", "OrdinalEncoder"] @@ -82,6 +86,11 @@ def _fit( ) self.n_features_in_ = n_features if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + # Filtering rows with sample_weight equals zero so we don't get extra dummy + # columns. + X_list = [Xi[sample_weight != 0] for Xi in X_list] + sample_weight = sample_weight[sample_weight != 0] n_samples = np.sum(sample_weight) if self.categories != "auto": diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 3bd466ebd1bfc..ccb4c96652b9d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2336,7 +2336,7 @@ def test_ordinal_encoder_missing_appears_infrequent(): ["airplane", 4], ], np.array([2, 2.5, 0.5, 0.1, 0, 0]), - (6, 5), + (6, 4), # columns: car, bike, 3, infrequent (1) ), ( [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]], @@ -2352,9 +2352,9 @@ def test_ordinal_encoder_missing_appears_infrequent(): def test_one_hot_encoder_sample_weight_min_frequency( X, sample_weight, expected_shape, min_frequency ): - ohe = OneHotEncoder(min_frequency=2) + ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist") X_trans = ohe.fit_transform(X, sample_weight=sample_weight) - assert X_trans.shape == expected_shape + assert_allclose(X_trans.shape, expected_shape) @pytest.mark.parametrize( @@ -2380,9 +2380,9 @@ def test_one_hot_encoder_sample_weight_min_frequency( ], ) def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape): - ohe = OneHotEncoder(max_categories=2) + ohe = OneHotEncoder(max_categories=2, handle_unknown="ignore") X_trans = ohe.fit_transform(X, sample_weight=sample_weight) - assert X_trans.shape == expected_shape + assert_allclose(X_trans.shape, expected_shape) @pytest.mark.parametrize( diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 76af192c0c360..7bf9183c80772 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -397,37 +397,19 @@ def asarray(self, x, *, dtype=None, device=None, copy=None): # noqa else: return numpy.asarray(x, dtype=dtype) - def unique_inverse(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique(x, return_inverse=True) - else: - return 
_unique_groupby_sum(x, sample_weight, return_inverse=True) + def unique_inverse(self, x): + return numpy.unique(x, return_inverse=True) - def unique_counts(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique(x, return_counts=True) - else: - return _unique_groupby_sum(x, sample_weight, return_counts=True) + def unique_counts(self, x): + return numpy.unique(x, return_counts=True) - def unique_values(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique(x) - else: - return _unique_groupby_sum(x, sample_weight) + def unique_values(self, x): + return numpy.unique(x) - def unique_all(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique( - x, return_index=True, return_inverse=True, return_counts=True - ) - else: - return _unique_groupby_sum( - x, - sample_weight, - return_index=True, - return_inverse=True, - return_counts=True, - ) + def unique_all(self, x): + return numpy.unique( + x, return_index=True, return_inverse=True, return_counts=True + ) def concat(self, arrays, *, axis=None): return numpy.concatenate(arrays, axis=axis) @@ -985,43 +967,3 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): return ret[: ar1.shape[0]] else: return xp.take(ret, rev_idx, axis=0) - - -def _unique_groupby_sum( - arr, - sample_weight, - return_index=False, - return_inverse=False, - return_counts=False, -): - """This functions behaves like numpy.unique but it counts the values of `arr` - taking into acount `sample_weight`.""" - if sample_weight is None: - sample_weight = numpy.ones(len(arr)) - # TODO ohe_sw: sample_weight = _check_sample_weight(sample_weight, arr) - # gave me circular import. What should I do? - - sorted_indices = numpy.argsort(arr) - sorted_arr = arr[sorted_indices] - sorted_sample_weight = sample_weight[sorted_indices] - - unique_elements, unique_indices = numpy.unique(sorted_arr, return_index=True) - _, unique_inverse = numpy.unique(arr, return_inverse=True) - - unique_indices = numpy.append(unique_indices, len(arr)) - subarrays = numpy.split(sorted_sample_weight, unique_indices[1:]) - group_sums = numpy.array( - [numpy.sum(subarray.astype(float)) for subarray in subarrays[:-1]] - ) - - results = [unique_elements] - if return_index: - results.append(unique_indices) - if return_inverse: - results.append(unique_inverse) - if return_counts: - results.append(group_sums) - - if len(results) > 1: - return tuple(results) - return results[0] diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 2d5ae33051529..fc5e97aad7d69 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -12,6 +12,7 @@ get_namespace, ) from ._missing import is_scalar_nan +from .validation import _check_sample_weight def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=False): @@ -61,21 +62,69 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F ) +def _xp_unique_groupby_sum( + xp, + arr, + sample_weight, + return_index=False, + return_inverse=False, + return_counts=False, +): + """This functions behaves like xp.unique_all but it counts the values of `arr` + taking into acount `sample_weight`.""" + sample_weight = _check_sample_weight(sample_weight, arr) + + sorted_indices = xp.argsort(arr) + sorted_arr = arr[sorted_indices] + sorted_sample_weight = sample_weight[sorted_indices] + + unique_elements, unique_indices, _, _ = xp.unique_all(sorted_arr) + _, unique_inverse = xp.unique_inverse(arr) + + # TODO ohe_sw: Update to xp-logic. 
These functions are not in the array API.
+    unique_indices = np.append(unique_indices, len(arr))
+    subarrays = np.split(sorted_sample_weight, unique_indices[1:])
+    group_sums = np.array(
+        [np.sum(subarray.astype(float)) for subarray in subarrays[:-1]]
+    )
+
+    results = [unique_elements]
+    if return_index:
+        results.append(unique_indices)
+    if return_inverse:
+        results.append(unique_inverse)
+    if return_counts:
+        results.append(group_sums)
+
+    if len(results) > 1:
+        return tuple(results)
+    return results[0]
+
+
 def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None):
     """Helper function to find unique values for numpy arrays that correctly
     accounts for nans. See `_unique` documentation for details."""
     xp, _ = get_namespace(values)
 
     inverse, counts = None, None
-
-    if return_inverse and return_counts:
-        uniques, _, inverse, counts = xp.unique_all(values, sample_weight=sample_weight)
-    elif return_inverse:
-        uniques, inverse = xp.unique_inverse(values, sample_weight=sample_weight)
-    elif return_counts:
-        uniques, counts = xp.unique_counts(values, sample_weight=sample_weight)
+    if sample_weight is None:
+        if return_inverse and return_counts:
+            uniques, _, inverse, counts = xp.unique_all(values)
+        elif return_inverse:
+            uniques, inverse = xp.unique_inverse(values)
+        elif return_counts:
+            uniques, counts = xp.unique_counts(values)
+        else:
+            uniques = xp.unique_values(values)
     else:
-        uniques = xp.unique_values(values, sample_weight=sample_weight)
+        uniques, _, inverse, counts = _xp_unique_groupby_sum(
+            xp,
+            values,
+            sample_weight,
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+        )
 
     # np.unique will have duplicate missing values at the end of `uniques`
     # here we clip the nans and remove it from uniques
diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py
index 50caee99be36d..25913e7f54846 100644
--- a/sklearn/utils/tests/test_array_api.py
+++ b/sklearn/utils/tests/test_array_api.py
@@ -20,7 +20,6 @@
     _nanmin,
     _NumPyAPIWrapper,
     _ravel,
-    _unique_groupby_sum,
     device,
     get_namespace,
     get_namespace_and_device,
@@ -567,27 +566,3 @@ def test_get_namespace_and_device():
     assert namespace is xp_torch
     assert is_array_api
     assert device == some_torch_tensor.device
-
-
-@pytest.mark.parametrize(
-    "arr, sample_weight, expected_unique, expected_sum",
-    [
-        (
-            numpy.array([1] * 3 + [2] * 2 + [3]),
-            numpy.array([0, 1, 2, 3, 4, 5]),
-            [1, 2, 3],
-            [3, 7, 5],
-        ),
-        (
-            numpy.array([3] + [2] * 2 + [1] * 3),
-            numpy.array([5, 3, 4, 2, 1, 0]),
-            [1, 2, 3],
-            [3, 7, 5],
-        ),
-    ],
-)
-def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum):
-    # TODO ohe_sw: Do more parametrize scenarios
-    unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True)
-    assert_array_equal(groupby_sum, expected_sum)
-    assert_array_equal(unique, expected_unique)
diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py
index 9118eb56f0ba4..c1c430ca82df9 100644
--- a/sklearn/utils/tests/test_encode.py
+++ b/sklearn/utils/tests/test_encode.py
@@ -4,7 +4,14 @@
 import pytest
 from numpy.testing import assert_array_equal
 
-from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
+from sklearn.utils._array_api import get_namespace
+from sklearn.utils._encode import (
+    _check_unknown,
+    _encode,
+    _get_counts,
+    _unique,
+    _xp_unique_groupby_sum,
+)
 
 
 @pytest.mark.parametrize(
@@ -272,3 +279,31 @@ def test_check_unknown_with_both_missing_values():
 def test_get_counts(values, uniques, expected_counts):
     counts = _get_counts(values, uniques)
     assert_array_equal(counts, expected_counts)
+
+
+@pytest.mark.parametrize(
+    "arr, sample_weight, expected_unique, expected_sum",
+    [
+        (
+            np.array([1] * 3 + [2] * 2 + [3]),
+            np.array([0, 1, 2, 3, 4, 5]),
+            [1, 2, 3],
+            [3, 7, 5],
+        ),
+        (
+            np.array([3] + [2] * 2 + [1] * 3),
+            np.array([5, 3, 4, 2, 1, 0]),
+            [1, 2, 3],
+            [3, 7, 5],
+        ),
+    ],
+)
+def test_xp_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum):
+    # TODO ohe_sw: Do more parametrize scenarios
+    # TODO ohe_sw: Test other array types
+    xp, _ = get_namespace(arr)
+    unique, groupby_sum = _xp_unique_groupby_sum(
+        xp, arr, sample_weight, return_counts=True
+    )
+    assert_array_equal(groupby_sum, expected_sum)
+    assert_array_equal(unique, expected_unique)

From 55159386b8ca0ea6eb5793ea61973ddca6ee7f8e Mon Sep 17 00:00:00 2001
From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com>
Date: Tue, 13 May 2025 01:57:35 +0000
Subject: [PATCH 12/12] update whats_new

---
 .../sklearn.preprocessing/26330.enhancement.rst | 4 ++++
 doc/whats_new/v1.3.rst                          | 5 -----
 2 files changed, 4 insertions(+), 5 deletions(-)
 create mode 100644 doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst

diff --git a/doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst
new file mode 100644
index 0000000000000..63fe86bebed33
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst
@@ -0,0 +1,4 @@
+- Adds support for `sample_weight` in :class:`preprocessing.OneHotEncoder`. When
+  `sample_weight` is used, `min_frequency` and `max_categories` filter on the
+  sum of the samples' weights for each category instead of the sample count. By
+  :user:`Carlo Lemos <vitaliset>`.
\ No newline at end of file
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 440b0a97f04cf..f523c02e14447 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -871,11 +871,6 @@ Changelog
   combines input arguments `(input_feature, category)` to a string. :pr:`22506`
   by :user:`Mario Kostelac `.
 
-- |Enhancement| Adds support for `sample_weight` in
-  :class:`preprocessing.OneHotEncoder`. When using `sample_weight`, then `min_frequency`
-  and `max_categories` will filter according to sum samples' weight for that category
-  instead of count. :pr:`26330` by :user:`Carlo Lemos `.
-
- |Enhancement| Added support for `sample_weight` in
  :class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter
  `sample_weight` for each sample to be used while fitting. The option is only
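
Note on the weighted counting used throughout this series: what `_unique_groupby_sum`
(and later `_xp_unique_groupby_sum`) computes is "np.unique plus per-value weight
sums". The snippet below is not part of the patches; it is a minimal sketch of the
same idea using `np.bincount`, which sums weights per group directly and avoids the
argsort/`np.split` bookkeeping flagged as suboptimal in the PATCH 01 TODO. The helper
name `weighted_value_counts` is hypothetical:

    import numpy as np

    def weighted_value_counts(values, sample_weight=None):
        # Unique values of `values` plus, per unique value, the sum of
        # sample_weight (a plain count when sample_weight is None).
        values = np.asarray(values)
        if sample_weight is None:
            sample_weight = np.ones(values.shape[0])
        uniques, inverse = np.unique(values, return_inverse=True)
        # bincount sums the weights that land in each inverse-index bucket.
        sums = np.bincount(inverse, weights=np.asarray(sample_weight, dtype=float))
        return uniques, sums

    uniques, sums = weighted_value_counts([1, 1, 1, 2, 2, 3], [0, 1, 2, 3, 4, 5])
    print(uniques, sums)  # [1 2 3] [3. 7. 5.] -- the pairs asserted in test_xp_unique_groupby_sum

The user-facing effect, on the data from `test_one_hot_encoder_sample_weight_min_frequency`
(the `sample_weight` argument to `fit`/`fit_transform` exists only with these patches
applied; it is not in released scikit-learn):

    from sklearn.preprocessing import OneHotEncoder

    X = [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]
    sw = np.array([5, 5, 0.1, 0.3, 4, 0.9])

    ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
    X_trans = ohe.fit_transform(X, sample_weight=sw)
    # Weighted counts: car=10 and boat=4 stay frequent, while bike=0.4 and
    # airplane=0.9 fall below min_frequency=2 and are merged into a single
    # infrequent column, giving X_trans.shape == (6, 3) as the test expects.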