ENH Add sample_weight parameter to OneHotEncoder's .fit #26330


Open · wants to merge 17 commits into base: main
@@ -0,0 +1,4 @@
- Adds support for `sample_weight` in :class:`preprocessing.OneHotEncoder`. When
`sample_weight` is provided, `min_frequency` and `max_categories` filter
categories by the sum of the samples' weights in each category instead of
the sample count. By :user:`Carlo Lemos <vitaliset>`.
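
A minimal usage sketch of the behaviour described above, assuming the `sample_weight` parameter proposed in this pull request (the data and expected categories are illustrative only):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["car"], ["car"], ["bike"], ["boat"]])
sample_weight = np.array([3.0, 2.0, 0.5, 0.0])

# With weights, `min_frequency` is compared against the summed weight per
# category: car -> 3.0 + 2.0 = 5.0, bike -> 0.5, boat -> 0.0 (zero-weight rows
# are dropped before the categories are determined).
enc = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
enc.fit(X, sample_weight=sample_weight)

# "car" stays frequent, "bike" becomes infrequent and "boat" is never seen.
print(enc.categories_)
print(enc.infrequent_categories_)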
44 changes: 34 additions & 10 deletions sklearn/preprocessing/_encoders.py
@@ -19,6 +19,7 @@
_check_feature_names,
_check_feature_names_in,
_check_n_features,
_check_sample_weight,
check_is_fitted,
)

@@ -72,6 +73,7 @@ def _check_X(self, X, ensure_all_finite=True):
def _fit(
self,
X,
sample_weight=None,
handle_unknown="error",
ensure_all_finite=True,
return_counts=False,
@@ -84,6 +86,13 @@
X, ensure_all_finite=ensure_all_finite
)
self.n_features_in_ = n_features
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
# Filter out rows whose sample_weight is zero so that their categories do
# not produce extra dummy columns.
X_list = [Xi[sample_weight != 0] for Xi in X_list]
sample_weight = sample_weight[sample_weight != 0]
n_samples = np.sum(sample_weight)

if self.categories != "auto":
if len(self.categories) != n_features:
@@ -100,7 +109,9 @@
Xi = X_list[i]

if self.categories == "auto":
result = _unique(Xi, return_counts=compute_counts)
result = _unique(
Xi, sample_weight=sample_weight, return_counts=compute_counts
)
if compute_counts:
cats, counts = result
category_counts.append(counts)
@@ -163,7 +174,7 @@ def _fit(
)
raise ValueError(msg)
if compute_counts:
category_counts.append(_get_counts(Xi, cats))
category_counts.append(_get_counts(Xi, cats, sample_weight))

self.categories_.append(cats)

@@ -281,11 +292,13 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):

Parameters
----------
category_count : ndarray of shape (n_cardinality,)
Category counts.
category_count : array-like of shape (n_cardinality,)
Category counts, or the sum of `sample_weight` over the samples in each
category when `sample_weight` is not `None`.

n_samples : int
Number of samples.
Number of samples in the training set, or the total sum of `sample_weight`
over all samples when `sample_weight` is not `None`.

col_idx : int
Index of the current category. Only used for the error message.
@@ -348,7 +361,8 @@ def _fit_infrequent_category_mapping(
Parameters
----------
n_samples : int
Number of samples in training set.
Number of samples in the training set, or the total sum of `sample_weight`
over all samples when `sample_weight` is not `None`.
category_counts: list of ndarray
`category_counts[i]` is the category counts corresponding to
`self.categories_[i]`.
@@ -578,13 +592,15 @@ class OneHotEncoder(_BaseEncoder):

min_frequency : int or float, default=None
Specifies the minimum frequency below which a category will be
considered infrequent.
considered infrequent. If `sample_weight` is passed to `fit`, the frequency
is computed as the sum of the samples' weights instead of the sample count.

- If `int`, categories with a smaller cardinality will be considered
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent. If
`sample_weight` is not `None`, then `n_samples = sum(sample_weight)`.

.. versionadded:: 1.1
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
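
A small worked illustration of the float `min_frequency` rule under the weighted semantics above (not part of the diff; the numbers are made up):

import numpy as np

# Per-category summed weights after zero-weight rows have been removed.
category_weight = {"car": 2.0 + 0.5, "bike": 2.5 + 0.1}
sample_weight = np.array([2.0, 2.5, 0.5, 0.1])

n_samples = sample_weight.sum()        # 5.1, the weight total, not the row count 4
min_frequency = 0.3
threshold = min_frequency * n_samples  # 1.53

infrequent = [c for c, w in category_weight.items() if w < threshold]
print(infrequent)                      # [] -- both categories stay frequent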
@@ -970,7 +986,7 @@ def _compute_n_features_outs(self):
return output

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""
Fit OneHotEncoder to X.

@@ -983,13 +999,21 @@ def fit(self, X, y=None):
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.

sample_weight : array-like of shape (n_samples,), default=None
Sample weights used to weight the categories when filtering them with
`max_categories` and `min_frequency`. If `None`, all samples are equally
weighted. If both `max_categories` and `min_frequency` are left at their
default values, then `sample_weight` is ignored.

Returns
-------
self
Fitted encoder.
"""
self._fit(
X,
sample_weight=sample_weight,
handle_unknown=self.handle_unknown,
ensure_all_finite="allow-nan",
)
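
A sketch (again assuming this PR's proposed API) of the note above that `sample_weight` only has an effect when `min_frequency` or `max_categories` is set:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["car"], ["car"], ["bike"], ["airplane"]])
w = np.array([10.0, 10.0, 0.1, 0.1])

# With the default parameters the weights do not change the fitted categories.
weighted = OneHotEncoder().fit(X, sample_weight=w)
unweighted = OneHotEncoder().fit(X)
assert all(
    np.array_equal(a, b)
    for a, b in zip(weighted.categories_, unweighted.categories_)
)

# With `max_categories` set, the weights decide which categories survive the cap.
capped = OneHotEncoder(max_categories=2, handle_unknown="ignore").fit(
    X, sample_weight=w
)
print(capped.infrequent_categories_)  # expected: bike and airplane grouped as infrequent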
@@ -1313,7 +1337,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent.

.. versionadded:: 1.3
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
92 changes: 92 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
@@ -2352,6 +2352,98 @@ def test_ordinal_encoder_missing_appears_infrequent():
assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])


@pytest.mark.parametrize(
"X, sample_weight, expected_shape",
[
(
[
["car", 3],
["bike", 3],
["car", 1],
["bike", 3],
["boat", 2],
["airplane", 4],
],
np.array([2, 2.5, 0.5, 0.1, 0, 0]),
(6, 4), # columns: car, bike, 3, infrequent (1)
),
(
[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
np.array([5, 5, 0.1, 0.3, 4, 0.9]),
(6, 3),
),
],
)
def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape):
# The expected shapes assume `min_frequency=2` applied to the weighted counts above.
ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
assert_allclose(X_trans.shape, expected_shape)


@pytest.mark.parametrize(
"X, sample_weight, expected_shape",
[
(
[
["car", 3],
["bike", 3],
["car", 1],
["bike", 3],
["boat", 2],
["airplane", 4],
],
np.array([2, 2.5, 0.5, 0.1, 0, 0]),
(6, 4),
),
(
[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
np.array([5, 5, 0.1, 0.3, 4, 0.9]),
(6, 2),
),
],
)
def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape):
ohe = OneHotEncoder(max_categories=2, handle_unknown="ignore")
X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
assert_allclose(X_trans.shape, expected_shape)


@pytest.mark.parametrize(
"X",
[[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
@pytest.mark.parametrize(
"min_frequency",
[0.1, 0.3, 0.5, 0.9],
)
def test_one_hot_encoder_sample_weight_constant(X, min_frequency):
ohe = OneHotEncoder(min_frequency=min_frequency)
X_sw_None = ohe.fit_transform(X, sample_weight=None).toarray()
X_sw_constant1 = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()
X_sw_constant5 = ohe.fit_transform(X, sample_weight=5 * np.ones(len(X))).toarray()

assert_array_equal(X_sw_None, X_sw_constant1)
assert_array_equal(X_sw_None, X_sw_constant5)


@pytest.mark.parametrize(
"X",
[[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
def test_one_hot_encoder_sample_weight_is_ignored(X):
ohe = OneHotEncoder()
X_sw_None = ohe.fit_transform(X).toarray()
X_sw_ones = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()

assert_array_equal(X_sw_None, X_sw_ones)


@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
def test_encoder_not_fitted(Encoder):
"""Check that we raise a `NotFittedError` by calling transform before fit with
99 changes: 82 additions & 17 deletions sklearn/utils/_encode.py
@@ -15,9 +15,10 @@
xpx,
)
from ._missing import is_scalar_nan
from .validation import _check_sample_weight


def _unique(values, *, return_inverse=False, return_counts=False):
def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=False):
"""Helper function to find unique values with support for python objects.

Uses pure python method for object dtype, and numpy method for
@@ -50,29 +51,83 @@
"""
if values.dtype == object:
return _unique_python(
values, return_inverse=return_inverse, return_counts=return_counts
values,
return_inverse=return_inverse,
return_counts=return_counts,
sample_weight=sample_weight,
)
# numerical
return _unique_np(
values, return_inverse=return_inverse, return_counts=return_counts
values,
return_inverse=return_inverse,
return_counts=return_counts,
sample_weight=sample_weight,
)


def _unique_np(values, return_inverse=False, return_counts=False):
def _xp_unique_groupby_sum(
xp,
arr,
sample_weight,
return_index=False,
return_inverse=False,
return_counts=False,
):
"""This functions behaves like xp.unique_all but it counts the values of `arr`
taking into acount `sample_weight`."""
sample_weight = _check_sample_weight(sample_weight, arr)

sorted_indices = xp.argsort(arr)
sorted_arr = arr[sorted_indices]
sorted_sample_weight = sample_weight[sorted_indices]

unique_elements, unique_indices, _, _ = xp.unique_all(sorted_arr)
_, unique_inverse = xp.unique_inverse(arr)

# TODO ohe_sw: Update to xp logic. These functions are not in the array API.
unique_indices = np.append(unique_indices, len(arr))
subarrays = np.split(sorted_sample_weight, unique_indices[1:])
group_sums = np.array(
[np.sum(subarray.astype(float)) for subarray in subarrays[:-1]]
)

results = [unique_elements]
if return_index:
results.append(unique_indices)
if return_inverse:
results.append(unique_inverse)
if return_counts:
results.append(group_sums)

if len(results) > 1:
return tuple(results)
return results[0]
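
For reference, a plain-NumPy sketch of the same "unique values plus per-group weight sums" idea used by the helper above (an illustration, not the implementation): `np.unique` with `return_inverse=True` yields group indices that `np.bincount` can sum weights over.

import numpy as np

arr = np.array([2, 1, 2, 3, 1, 1])
sample_weight = np.array([0.5, 1.0, 2.0, 0.0, 3.0, 0.5])

uniques, inverse = np.unique(arr, return_inverse=True)
weight_sums = np.bincount(inverse, weights=sample_weight, minlength=len(uniques))

print(uniques)      # [1 2 3]
print(weight_sums)  # [4.5 2.5 0. ]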

Codecov / codecov/patch warning: added line sklearn/utils/_encode.py#L104 was not covered by tests.


def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None):
"""Helper function to find unique values for numpy arrays that correctly
accounts for nans. See `_unique` documentation for details."""
xp, _ = get_namespace(values)

inverse, counts = None, None

if return_inverse and return_counts:
uniques, _, inverse, counts = xp.unique_all(values)
elif return_inverse:
uniques, inverse = xp.unique_inverse(values)
elif return_counts:
uniques, counts = xp.unique_counts(values)
if sample_weight is None:
if return_inverse and return_counts:
uniques, _, inverse, counts = xp.unique_all(values)
elif return_inverse:
uniques, inverse = xp.unique_inverse(values)
elif return_counts:
uniques, counts = xp.unique_counts(values)
else:
uniques = xp.unique_values(values)
else:
uniques = xp.unique_values(values)
uniques, _, inverse, counts = _xp_unique_groupby_sum(
xp,
values,
sample_weight,
return_index=True,
return_inverse=True,
return_counts=True,
)

# np.unique will have duplicate missing values at the end of `uniques`
# here we clip the nans and remove it from uniques
@@ -174,7 +229,7 @@
return xp.asarray([table[v] for v in values], device=device(values))


def _unique_python(values, *, return_inverse, return_counts):
def _unique_python(values, *, return_inverse, return_counts, sample_weight=None):
# Only used in `_uniques`, see docstring there for details
try:
uniques_set = set(values)
@@ -195,7 +250,7 @@
ret += (_map_to_integer(values, uniques),)

if return_counts:
ret += (_get_counts(values, uniques),)
ret += (_get_counts(values, uniques, sample_weight),)

return ret[0] if len(ret) == 1 else ret

@@ -349,9 +404,12 @@
raise KeyError(key)


def _get_counts(values, uniques):
def _get_counts(values, uniques, sample_weight=None):
"""Get the count of each of the `uniques` in `values`.

If `sample_weight` is not `None`, the count is the sum of `sample_weight`
over the samples with that unique value.

The counts will use the order passed in by `uniques`. For non-object dtypes,
`uniques` is assumed to be sorted and `np.nan` is at the end.
"""
@@ -360,10 +418,17 @@
output = np.zeros(len(uniques), dtype=np.int64)
for i, item in enumerate(uniques):
with suppress(KeyError):
output[i] = counter[item]
if sample_weight is None:
output[i] = counter[item]
else:
# TODO ohe_sw: This branch still needs tests. Does `values == item`
# work when `item` is NaN?
output[i] = np.sum(sample_weight[values == item])
return output

unique_values, counts = _unique_np(values, return_counts=True)
unique_values, counts = _unique_np(
values, return_counts=True, sample_weight=sample_weight
)

# Reorder unique_values based on input: `uniques`
uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
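
Regarding the TODO above about NaN handling: a quick standalone check showing that an equality mask misses NaN entries while an explicit NaN mask catches them (illustration only):

import numpy as np

values = np.array([1.0, np.nan, np.nan, 2.0])
sample_weight = np.array([1.0, 0.5, 2.0, 1.0])

item = np.nan
print(np.sum(sample_weight[values == item]))    # 0.0 -- NaN never compares equal
print(np.sum(sample_weight[np.isnan(values)]))  # 2.5 -- explicit NaN mask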