From 779ce507d0ab501650603542cc1bf5ffc652e10e Mon Sep 17 00:00:00 2001 From: Carlo Date: Fri, 5 May 2023 02:53:26 -0300 Subject: [PATCH 01/12] add sample_weight to ohe draft --- sklearn/preprocessing/_encoders.py | 14 +++- sklearn/preprocessing/tests/test_encoders.py | 60 ++++++++++++++++ sklearn/utils/_encode.py | 74 +++++++++++++++++--- 3 files changed, 135 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index fd9941f5336ed..2e3c563a3876e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -70,6 +70,7 @@ def _check_X(self, X, force_all_finite=True): def _fit( self, X, + sample_weight=None, handle_unknown="error", force_all_finite=True, return_counts=False, @@ -98,7 +99,9 @@ def _fit( Xi = X_list[i] if self.categories == "auto": - result = _unique(Xi, return_counts=compute_counts) + result = _unique( + Xi, sample_weight=sample_weight, return_counts=compute_counts + ) if compute_counts: cats, counts = result category_counts.append(counts) @@ -147,7 +150,7 @@ def _fit( ) raise ValueError(msg) if compute_counts: - category_counts.append(_get_counts(Xi, cats)) + category_counts.append(_get_counts(Xi, cats, sample_weight)) self.categories_.append(cats) @@ -281,6 +284,8 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): If there are infrequent categories, indices of infrequent categories. Otherwise None. """ + # TODO ohe_sw: We would have to change this... + # But it really makes sense... :( if isinstance(self.min_frequency, numbers.Integral): infrequent_mask = category_count < self.min_frequency elif isinstance(self.min_frequency, numbers.Real): @@ -953,7 +958,7 @@ def _compute_n_features_outs(self): return output - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """ Fit OneHotEncoder to X. @@ -971,6 +976,8 @@ def fit(self, X, y=None): self Fitted encoder. """ + # TODO ohe_sw: Add to docstring that `sample_weight` is only used when + # `max_categories` or `min_frequency` are not default values. 
self._validate_params() if self.sparse != "deprecated": @@ -986,6 +993,7 @@ def fit(self, X, y=None): self._fit( X, + sample_weight=sample_weight, handle_unknown=self.handle_unknown, force_all_finite="allow-nan", ) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 42c66980bfeba..847838d1bed04 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2303,3 +2303,63 @@ def test_ordinal_encoder_missing_appears_infrequent(): ) X_trans = ordinal.transform(X_test) assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]) + + +@pytest.mark.parametrize( + "X, sample_weight, expected_shape", + [ + ( + [ + ["car", 3], + ["bike", 3], + ["car", 1], + ["bike", 3], + ["boat", 2], + ["airplane", 4], + ], + np.array([2, 2.5, 0.5, 0.1, 0, 0]), + (6, 5), + ), + ( + [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]], + np.array([5, 5, 0.1, 0.3, 4, 0.9]), + (6, 3), + ), + ], +) +def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape): + ohe = OneHotEncoder(min_frequency=2) + X_trans = ohe.fit_transform(X, sample_weight=sample_weight) + + print(X_trans.toarray()) + assert X_trans.shape == expected_shape + + +@pytest.mark.parametrize( + "X, sample_weight, expected_shape", + [ + ( + [ + ["car", 3], + ["bike", 3], + ["car", 1], + ["bike", 3], + ["boat", 2], + ["airplane", 4], + ], + np.array([2, 2.5, 0.5, 0.1, 0, 0]), + (6, 4), + ), + ( + [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]], + np.array([5, 5, 0.1, 0.3, 4, 0.9]), + (6, 2), + ), + ], +) +def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape): + ohe = OneHotEncoder(max_categories=2) + X_trans = ohe.fit_transform(X, sample_weight=sample_weight) + + print(X_trans.toarray()) + assert X_trans.shape == expected_shape diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index de48890fcaacf..df2b6c5151dd1 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -4,9 +4,10 @@ import numpy as np from . import is_scalar_nan +from .validation import _check_sample_weight -def _unique(values, *, return_inverse=False, return_counts=False): +def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=False): """Helper function to find unique values with support for python objects. 
Uses pure python method for object dtype, and numpy method for @@ -39,19 +40,70 @@ def _unique(values, *, return_inverse=False, return_counts=False): """ if values.dtype == object: return _unique_python( - values, return_inverse=return_inverse, return_counts=return_counts + values, + return_inverse=return_inverse, + return_counts=return_counts, + sample_weight=sample_weight, ) # numerical return _unique_np( - values, return_inverse=return_inverse, return_counts=return_counts + values, + sample_weight=sample_weight, + return_inverse=return_inverse, + return_counts=return_counts, ) -def _unique_np(values, return_inverse=False, return_counts=False): +def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False): + # TODO ohe_sw: add one line docstring + sample_weight = _check_sample_weight(sample_weight, arr) + + sorted_indices = np.argsort(arr) + sorted_arr = arr[sorted_indices] + sorted_sample_weight = sample_weight[sorted_indices] + + # TODO ohe_sw: Using two `np.unique` is certainly suboptimal, but for now I can't + # see how to build the `unique_inverse` of `arr` with the `unique_inverse` of + # `sorted_arr`. + unique_elements, unique_indices = np.unique(sorted_arr, return_index=True) + _, unique_inverse = np.unique(arr, return_inverse=True) + + unique_indices = np.append(unique_indices, len(arr)) + subarrays = np.split(sorted_sample_weight, unique_indices[1:]) + group_sums = np.array( + [np.sum(subarray.astype(float)) for subarray in subarrays[:-1]] + ) + + results = [unique_elements] + if return_inverse: + results.append(unique_inverse) + if return_counts: + results.append(group_sums) + + print(*results) + if len(results) > 1: + return tuple(results) + return results[0] + + +def _unique_np(values, sample_weight=None, return_inverse=False, return_counts=False): """Helper function to find unique values for numpy arrays that correctly accounts for nans. See `_unique` documentation for details.""" - uniques = np.unique( - values, return_inverse=return_inverse, return_counts=return_counts + # if sample_weight is None: + # uniques = np.unique( + # values, return_inverse=return_inverse, return_counts=return_counts + # ) + # else: + # TODO ohe_sw: _unique_groupby_sum is behaving like usual `np.unique` + # when `sample_weight=None`, ie, "`sample_weight=np.ones_like(X)`" because of + # `utils.validation._check_sample_weight`. + # Leaving the above lines comment for now because I want to show that the + # behaviour is the same. + uniques = _unique_groupby_sum( + values, + sample_weight=sample_weight, + return_inverse=return_inverse, + return_counts=return_counts, ) inverse, counts = None, None @@ -164,7 +216,7 @@ def _map_to_integer(values, uniques): return np.array([table[v] for v in values]) -def _unique_python(values, *, return_inverse, return_counts): +def _unique_python(values, *, return_inverse, return_counts, sample_weight=None): # Only used in `_uniques`, see docstring there for details try: uniques_set = set(values) @@ -185,7 +237,7 @@ def _unique_python(values, *, return_inverse, return_counts): ret += (_map_to_integer(values, uniques),) if return_counts: - ret += (_get_counts(values, uniques),) + ret += (_get_counts(values, uniques, sample_weight),) return ret[0] if len(ret) == 1 else ret @@ -339,7 +391,7 @@ def __missing__(self, key): raise KeyError(key) -def _get_counts(values, uniques): +def _get_counts(values, uniques, sample_weight=None): """Get the count of each of the `uniques` in `values`. The counts will use the order passed in by `uniques`. 
For non-object dtypes, @@ -353,7 +405,9 @@ def _get_counts(values, uniques): output[i] = counter[item] return output - unique_values, counts = _unique_np(values, return_counts=True) + unique_values, counts = _unique_np( + values, sample_weight=sample_weight, return_counts=True + ) # Recorder unique_values based on input: `uniques` uniques_in_values = np.isin(uniques, unique_values, assume_unique=True) From bf6951c0ccf248e87191fb1fc3ca7b0e6b18c32b Mon Sep 17 00:00:00 2001 From: Carlo Date: Fri, 5 May 2023 03:07:18 -0300 Subject: [PATCH 02/12] update changelog --- doc/whats_new/v1.3.rst | 4 ++++ sklearn/utils/_encode.py | 1 + 2 files changed, 5 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index bb245aa466152..b7e06ce3bd0da 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -467,6 +467,10 @@ Changelog The callable combines input arguments `(input_feature, category)` to a string. :pr:`22506` by :user:`Mario Kostelac `. +- |Enhancement| Adds support for `sample_weight` in + :class:`preprocessing.OneHotEncoder`. TODO ohe_sw. + :pr:`26330` by :user:`Carlo Lemos `. + - |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter `sample_weight` for each sample to be used while fitting. The option is only diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index df2b6c5151dd1..7de32034595ab 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -56,6 +56,7 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False): # TODO ohe_sw: add one line docstring + # TODO ohe_sw: create tests sample_weight = _check_sample_weight(sample_weight, arr) sorted_indices = np.argsort(arr) From 3eb2c8476e924c716af24fcd374b0486f75ba82b Mon Sep 17 00:00:00 2001 From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com> Date: Fri, 5 May 2023 14:34:37 -0300 Subject: [PATCH 03/12] removing extra debugging print --- sklearn/utils/_encode.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 7de32034595ab..0b453659c50b3 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -81,7 +81,6 @@ def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts= if return_counts: results.append(group_sums) - print(*results) if len(results) > 1: return tuple(results) return results[0] From a7a39404cfdd6c5a868a063df58cddd5c5cc8903 Mon Sep 17 00:00:00 2001 From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com> Date: Sat, 6 May 2023 19:13:28 -0300 Subject: [PATCH 04/12] make ci green with temporary sample_weight doc --- sklearn/preprocessing/_encoders.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 2e3c563a3876e..d37939b0fe35c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -971,6 +971,9 @@ def fit(self, X, y=None, sample_weight=None): Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
+
         Returns
         -------
         self
             Fitted encoder.
         """

From 8d827f08f9327b9740f79658ae21cf274ae2bfdb Mon Sep 17 00:00:00 2001
From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com>
Date: Sat, 6 May 2023 19:20:47 -0300
Subject: [PATCH 05/12] linting

---
 sklearn/preprocessing/_encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index d37939b0fe35c..45f18c341fe6d 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -971,7 +971,7 @@ def fit(self, X, y=None, sample_weight=None):
             Ignored. This parameter exists only for compatibility with
             :class:`~sklearn.pipeline.Pipeline`.
 
-        sample_weight : array-like of shape (n_samples,), default=None 
+        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
 
         Returns

From be69aecc95baaf659f464f60fec5e56b8c9b4efc Mon Sep 17 00:00:00 2001
From: vitaliset
Date: Fri, 23 Jun 2023 00:46:13 -0300
Subject: [PATCH 06/12] first version of ohe with sw

---
 sklearn/preprocessing/_encoders.py           | 31 +++++++-----
 sklearn/preprocessing/tests/test_encoders.py | 41 ++++++++++++--
 sklearn/utils/_encode.py                     | 50 ++++++++--------
 sklearn/utils/tests/test_encode.py           | 25 ++++++++
 4 files changed, 105 insertions(+), 42 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 45f18c341fe6d..9162d90af5ad4 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -83,6 +83,8 @@ def _fit(
             X, force_all_finite=force_all_finite
         )
         self.n_features_in_ = n_features
+        if sample_weight is not None:
+            n_samples = np.sum(sample_weight)
 
         if self.categories != "auto":
             if len(self.categories) != n_features:
@@ -269,11 +271,13 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):
 
         Parameters
         ----------
-        category_count : ndarray of shape (n_cardinality,)
-            Category counts.
+        category_count : array-like of shape (n_cardinality,)
+            Category counts or sum of `sample_weight` for the samples from the
+            category when `sample_weight` is different from `None`.
 
         n_samples : int
-            Number of samples.
+            Number of samples in training set or total sum of `sample_weight`
+            for all samples when `sample_weight` is different from `None`.
 
         col_idx : int
             Index of the current category. Only used for the error message.
@@ -284,8 +288,6 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):
             If there are infrequent categories, indices of infrequent
             categories. Otherwise None.
         """
-        # TODO ohe_sw: We would have to change this...
-        # But it really makes sense... :(
         if isinstance(self.min_frequency, numbers.Integral):
             infrequent_mask = category_count < self.min_frequency
         elif isinstance(self.min_frequency, numbers.Real):
@@ -338,7 +340,8 @@ def _fit_infrequent_category_mapping(
         Parameters
         ----------
         n_samples : int
-            Number of samples in training set.
+            Number of samples in training set or total sum of `sample_weight`
+            for all samples when `sample_weight` is different from `None`.
 
         category_counts: list of ndarray
             `category_counts[i]` is the category counts corresponding to
             `self.categories_[i]`.
@@ -563,13 +566,15 @@ class OneHotEncoder(_BaseEncoder):
 
     min_frequency : int or float, default=None
         Specifies the minimum frequency below which a category will be
-        considered infrequent.
+        considered infrequent. If `sample_weight` is passed to `fit`, the
+        count for a category is the sum of the weights of its samples.
 
         - If `int`, categories with a smaller cardinality will be
          considered infrequent.
 
        - If `float`, categories with a smaller cardinality than
-          `min_frequency * n_samples` will be considered infrequent.
+          `min_frequency * n_samples` will be considered infrequent. If
+          `sample_weight` is not `None`, then `n_samples = sum(sample_weight)`.
 
        .. versionadded:: 1.1
 
        Read more in the :ref:`User Guide `.
@@ -972,15 +977,17 @@ def fit(self, X, y=None, sample_weight=None):
             :class:`~sklearn.pipeline.Pipeline`.
 
         sample_weight : array-like of shape (n_samples,), default=None
-            Sample weights. If None, then samples are equally weighted.
+            Sample weights used when filtering categories with
+            `max_categories` and `min_frequency`. If `None`, then samples
+            are equally weighted. If both `max_categories` and
+            `min_frequency` are set to default values, then `sample_weight`
+            is ignored.
 
         Returns
         -------
         self
             Fitted encoder.
         """
-        # TODO ohe_sw: Add to docstring that `sample_weight` is only used when
-        # `max_categories` or `min_frequency` are not default values.
         self._validate_params()
 
         if self.sparse != "deprecated":
@@ -1300,7 +1307,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
           infrequent.
 
         - If `float`, categories with a smaller cardinality than
-          `min_frequency * n_samples` will be considered infrequent.
+          `min_frequency * n_samples` will be considered infrequent. 
 
        .. versionadded:: 1.3
 
        Read more in the :ref:`User Guide `.
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 847838d1bed04..304f8877e57cc 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2327,11 +2327,15 @@ def test_ordinal_encoder_missing_appears_infrequent():
         ),
     ],
 )
+@pytest.mark.parametrize(
+    "min_frequency",
+    [0.3, 0.9, 2],
+)
-def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape):
+def test_one_hot_encoder_sample_weight_min_frequency(
+    X, sample_weight, expected_shape, min_frequency
+):
     ohe = OneHotEncoder(min_frequency=2)
     X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
-
-    print(X_trans.toarray())
     assert X_trans.shape == expected_shape
@@ -2360,6 +2364,33 @@ def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_
 def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape):
     ohe = OneHotEncoder(max_categories=2)
     X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
-
-    print(X_trans.toarray())
     assert X_trans.shape == expected_shape
+
+
+@pytest.mark.parametrize(
+    "X",
+    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
+)
+@pytest.mark.parametrize(
+    "min_frequency",
+    [0.1, 0.3, 0.5, 0.9],
+)
+def test_one_hot_encoder_sample_weight_constant(X, min_frequency):
+    ohe = OneHotEncoder(min_frequency=min_frequency)
+    X_sw_None = ohe.fit_transform(X, sample_weight=None).toarray()
+    X_sw_constant1 = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()
+    X_sw_constant5 = ohe.fit_transform(X, sample_weight=5 * np.ones(len(X))).toarray()
+
+    assert_array_equal(X_sw_None, X_sw_constant1)
+    assert_array_equal(X_sw_None, X_sw_constant5)
+
+@pytest.mark.parametrize(
+    "X",
+    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
+)
+def test_one_hot_encoder_sample_weight_is_ignored(X):
+    ohe = OneHotEncoder()
+    X_sw_None = ohe.fit_transform(X).toarray()
+    X_sw_ones = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()
+
+    assert_array_equal(X_sw_None, X_sw_ones)
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
index 0b453659c50b3..b677348dccd57 100644
--- a/sklearn/utils/_encode.py
+++ b/sklearn/utils/_encode.py
@@ -48,24 +48,21 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F
     # numerical
     return _unique_np(
         values,
-        sample_weight=sample_weight,
         return_inverse=return_inverse,
         return_counts=return_counts,
+        sample_weight=sample_weight,
     )
 
 
 def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False):
-    # TODO ohe_sw: add one line docstring
-    # TODO ohe_sw: create tests
+    """This functions behaves like np.unique but it counts the values of `arr` taking
+    into acount `samplt_weight`."""
     sample_weight = _check_sample_weight(sample_weight, arr)
 
     sorted_indices = np.argsort(arr)
     sorted_arr = arr[sorted_indices]
     sorted_sample_weight = sample_weight[sorted_indices]
 
-    # TODO ohe_sw: Using two `np.unique` is certainly suboptimal, but for now I can't
-    # see how to build the `unique_inverse` of `arr` with the `unique_inverse` of
-    # `sorted_arr`.
     unique_elements, unique_indices = np.unique(sorted_arr, return_index=True)
     _, unique_inverse = np.unique(arr, return_inverse=True)
 
@@ -86,25 +83,20 @@ def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=
     return results[0]
 
 
-def _unique_np(values, sample_weight=None, return_inverse=False, return_counts=False):
+def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None):
     """Helper function to find unique values for numpy arrays that correctly
     accounts for nans. See `_unique` documentation for details."""
-    # if sample_weight is None:
-    #     uniques = np.unique(
-    #         values, return_inverse=return_inverse, return_counts=return_counts
-    #     )
-    # else:
-    # TODO ohe_sw: _unique_groupby_sum is behaving like usual `np.unique`
-    # when `sample_weight=None`, ie, "`sample_weight=np.ones_like(X)`" because of
-    # `utils.validation._check_sample_weight`.
-    # Leaving the above lines comment for now because I want to show that the
-    # behaviour is the same.
-    uniques = _unique_groupby_sum(
-        values,
-        sample_weight=sample_weight,
-        return_inverse=return_inverse,
-        return_counts=return_counts,
-    )
+    if sample_weight is None:
+        uniques = np.unique(
+            values, return_inverse=return_inverse, return_counts=return_counts
+        )
+    else:
+        uniques = _unique_groupby_sum(
+            values,
+            sample_weight=sample_weight,
+            return_inverse=return_inverse,
+            return_counts=return_counts,
+        )
 
     inverse, counts = None, None
 
@@ -394,6 +386,9 @@ def __missing__(self, key):
 def _get_counts(values, uniques, sample_weight=None):
     """Get the count of each of the `uniques` in `values`.
 
+    If `sample_weight` is not `None` then the count is actually the sum of
+    `sample_weight` for that unique value.
+
     The counts will use the order passed in by `uniques`. For non-object dtypes,
     `uniques` is assumed to be sorted and `np.nan` is at the end.
     """
@@ -402,11 +397,16 @@ def _get_counts(values, uniques, sample_weight=None):
         output = np.zeros(len(uniques), dtype=np.int64)
         for i, item in enumerate(uniques):
             with suppress(KeyError):
-                output[i] = counter[item]
+                if sample_weight is None:
+                    output[i] = counter[item]
+                else:
+                    # TODO ohe_sw: I need to create tests for this. Is this
+                    # values == item working for NaN items?
+ output[i] = np.sum(sample_weight[values == item]) return output unique_values, counts = _unique_np( - values, sample_weight=sample_weight, return_counts=True + values, return_counts=True, sample_weight=sample_weight ) # Recorder unique_values based on input: `uniques` diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index 083db25b7ca80..48da80e5a6a17 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -8,6 +8,7 @@ from sklearn.utils._encode import _encode from sklearn.utils._encode import _check_unknown from sklearn.utils._encode import _get_counts +from sklearn.utils._encode import _unique_groupby_sum @pytest.mark.parametrize( @@ -275,3 +276,27 @@ def test_check_unknown_with_both_missing_values(): def test_get_counts(values, uniques, expected_counts): counts = _get_counts(values, uniques) assert_array_equal(counts, expected_counts) + + +@pytest.mark.parametrize( + "arr, sample_weight, expected_unique, expected_sum", + [ + ( + np.array([1] * 3 + [2] * 2 + [3]), + np.array([0, 1, 2, 3, 4, 5]), + [1, 2, 3], + [3, 7, 5], + ), + ( + np.array([3] + [2] * 2 + [1] * 3), + np.array([5, 3, 4, 2, 1, 0]), + [1, 2, 3], + [3, 7, 5], + ), + ], +) +def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum): + # TODO ohe_sw: Do more parametrize scenarios + unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True) + assert_array_equal(groupby_sum, expected_sum) + assert_array_equal(unique, expected_unique) From 94871acf1eb1533ed88f7503c85c235a36e97377 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Fri, 23 Jun 2023 00:51:04 -0300 Subject: [PATCH 07/12] updating changelog description and small linting --- doc/whats_new/v1.3.rst | 5 +++-- sklearn/preprocessing/tests/test_encoders.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 4d5d4e919c3fa..c616a30afb2df 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -490,8 +490,9 @@ Changelog :pr:`22506` by :user:`Mario Kostelac `. - |Enhancement| Adds support for `sample_weight` in - :class:`preprocessing.OneHotEncoder`. TODO ohe_sw. - :pr:`26330` by :user:`Carlo Lemos `. + :class:`preprocessing.OneHotEncoder`. When using `sample_weight`, then `min_frequency` + and `max_categories` will filter according to sum samples' weight for that category + instead of count. :pr:`26330` by :user:`Carlo Lemos `. - |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`. 
This allows specifying the parameter diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 304f8877e57cc..e2ec08fa0021a 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2384,6 +2384,7 @@ def test_one_hot_encoder_sample_weight_constant(X, min_frequency): assert_array_equal(X_sw_None, X_sw_constant1) assert_array_equal(X_sw_None, X_sw_constant5) + @pytest.mark.parametrize( "X", [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]], From 8e6db11435dcbbc378ef591f6ed9a24f18533be9 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Fri, 23 Jun 2023 01:02:57 -0300 Subject: [PATCH 08/12] resolve linting error introduced in conflit --- sklearn/utils/tests/test_encode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index 342e049de5d36..f600073b45f20 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -5,7 +5,11 @@ from numpy.testing import assert_array_equal from sklearn.utils._encode import ( - _check_unknown, _encode, _get_counts, _unique, _unique_groupby_sum + _check_unknown, + _encode, + _get_counts, + _unique, + _unique_groupby_sum ) From 51f9eae4b1feea8314631374d5b24b49b79e7285 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Fri, 23 Jun 2023 01:07:48 -0300 Subject: [PATCH 09/12] black --- sklearn/utils/tests/test_encode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index f600073b45f20..e95e3ec5e0343 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -5,11 +5,11 @@ from numpy.testing import assert_array_equal from sklearn.utils._encode import ( - _check_unknown, - _encode, - _get_counts, - _unique, - _unique_groupby_sum + _check_unknown, + _encode, + _get_counts, + _unique, + _unique_groupby_sum, ) From 6fa547fe677753c29a04e7f332d4683ddc87c759 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Sun, 9 Jun 2024 18:05:21 -0300 Subject: [PATCH 10/12] updating PR to fit the array api introduced in 27381 --- sklearn/utils/_array_api.py | 78 +++++++++++++++++++++++---- sklearn/utils/_encode.py | 50 ++--------------- sklearn/utils/tests/test_array_api.py | 25 +++++++++ sklearn/utils/tests/test_encode.py | 32 +---------- 4 files changed, 98 insertions(+), 87 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 7bf9183c80772..76af192c0c360 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -397,19 +397,37 @@ def asarray(self, x, *, dtype=None, device=None, copy=None): # noqa else: return numpy.asarray(x, dtype=dtype) - def unique_inverse(self, x): - return numpy.unique(x, return_inverse=True) + def unique_inverse(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique(x, return_inverse=True) + else: + return _unique_groupby_sum(x, sample_weight, return_inverse=True) - def unique_counts(self, x): - return numpy.unique(x, return_counts=True) + def unique_counts(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique(x, return_counts=True) + else: + return _unique_groupby_sum(x, sample_weight, return_counts=True) - def unique_values(self, x): - return numpy.unique(x) + def unique_values(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique(x) + else: + return 
_unique_groupby_sum(x, sample_weight) - def unique_all(self, x): - return numpy.unique( - x, return_index=True, return_inverse=True, return_counts=True - ) + def unique_all(self, x, *, sample_weight=None): + if sample_weight is None: + return numpy.unique( + x, return_index=True, return_inverse=True, return_counts=True + ) + else: + return _unique_groupby_sum( + x, + sample_weight, + return_index=True, + return_inverse=True, + return_counts=True, + ) def concat(self, arrays, *, axis=None): return numpy.concatenate(arrays, axis=axis) @@ -967,3 +985,43 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): return ret[: ar1.shape[0]] else: return xp.take(ret, rev_idx, axis=0) + + +def _unique_groupby_sum( + arr, + sample_weight, + return_index=False, + return_inverse=False, + return_counts=False, +): + """This functions behaves like numpy.unique but it counts the values of `arr` + taking into acount `sample_weight`.""" + if sample_weight is None: + sample_weight = numpy.ones(len(arr)) + # TODO ohe_sw: sample_weight = _check_sample_weight(sample_weight, arr) + # gave me circular import. What should I do? + + sorted_indices = numpy.argsort(arr) + sorted_arr = arr[sorted_indices] + sorted_sample_weight = sample_weight[sorted_indices] + + unique_elements, unique_indices = numpy.unique(sorted_arr, return_index=True) + _, unique_inverse = numpy.unique(arr, return_inverse=True) + + unique_indices = numpy.append(unique_indices, len(arr)) + subarrays = numpy.split(sorted_sample_weight, unique_indices[1:]) + group_sums = numpy.array( + [numpy.sum(subarray.astype(float)) for subarray in subarrays[:-1]] + ) + + results = [unique_elements] + if return_index: + results.append(unique_indices) + if return_inverse: + results.append(unique_inverse) + if return_counts: + results.append(group_sums) + + if len(results) > 1: + return tuple(results) + return results[0] diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 608de23b88713..2d5ae33051529 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -4,8 +4,6 @@ import numpy as np -from . import is_scalar_nan -from .validation import _check_sample_weight from ._array_api import ( _isin, _searchsorted, @@ -63,61 +61,21 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F ) -def _unique_groupby_sum(arr, sample_weight, return_inverse=False, return_counts=False): - """This functions behaves like np.unique but it counts the values of `arr` taking - into acount `samplt_weight`.""" - sample_weight = _check_sample_weight(sample_weight, arr) - - sorted_indices = np.argsort(arr) - sorted_arr = arr[sorted_indices] - sorted_sample_weight = sample_weight[sorted_indices] - - unique_elements, unique_indices = np.unique(sorted_arr, return_index=True) - _, unique_inverse = np.unique(arr, return_inverse=True) - - unique_indices = np.append(unique_indices, len(arr)) - subarrays = np.split(sorted_sample_weight, unique_indices[1:]) - group_sums = np.array( - [np.sum(subarray.astype(float)) for subarray in subarrays[:-1]] - ) - - results = [unique_elements] - if return_inverse: - results.append(unique_inverse) - if return_counts: - results.append(group_sums) - - if len(results) > 1: - return tuple(results) - return results[0] - - def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None): """Helper function to find unique values for numpy arrays that correctly accounts for nans. 
See `_unique` documentation for details.""" -# if sample_weight is None: -# uniques = np.unique( -# values, return_inverse=return_inverse, return_counts=return_counts -# ) -# else: -# uniques = _unique_groupby_sum( -# values, -# sample_weight=sample_weight, -# return_inverse=return_inverse, -# return_counts=return_counts, -# ) xp, _ = get_namespace(values) inverse, counts = None, None if return_inverse and return_counts: - uniques, _, inverse, counts = xp.unique_all(values) + uniques, _, inverse, counts = xp.unique_all(values, sample_weight=sample_weight) elif return_inverse: - uniques, inverse = xp.unique_inverse(values) + uniques, inverse = xp.unique_inverse(values, sample_weight=sample_weight) elif return_counts: - uniques, counts = xp.unique_counts(values) + uniques, counts = xp.unique_counts(values, sample_weight=sample_weight) else: - uniques = xp.unique_values(values) + uniques = xp.unique_values(values, sample_weight=sample_weight) # np.unique will have duplicate missing values at the end of `uniques` # here we clip the nans and remove it from uniques diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index 25913e7f54846..50caee99be36d 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -20,6 +20,7 @@ _nanmin, _NumPyAPIWrapper, _ravel, + _unique_groupby_sum, device, get_namespace, get_namespace_and_device, @@ -566,3 +567,27 @@ def test_get_namespace_and_device(): assert namespace is xp_torch assert is_array_api assert device == some_torch_tensor.device + + +@pytest.mark.parametrize( + "arr, sample_weight, expected_unique, expected_sum", + [ + ( + numpy.array([1] * 3 + [2] * 2 + [3]), + numpy.array([0, 1, 2, 3, 4, 5]), + [1, 2, 3], + [3, 7, 5], + ), + ( + numpy.array([3] + [2] * 2 + [1] * 3), + numpy.array([5, 3, 4, 2, 1, 0]), + [1, 2, 3], + [3, 7, 5], + ), + ], +) +def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum): + # TODO ohe_sw: Do more parametrize scenarios + unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True) + assert_array_equal(groupby_sum, expected_sum) + assert_array_equal(unique, expected_unique) diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index e95e3ec5e0343..9118eb56f0ba4 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -4,13 +4,7 @@ import pytest from numpy.testing import assert_array_equal -from sklearn.utils._encode import ( - _check_unknown, - _encode, - _get_counts, - _unique, - _unique_groupby_sum, -) +from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique @pytest.mark.parametrize( @@ -278,27 +272,3 @@ def test_check_unknown_with_both_missing_values(): def test_get_counts(values, uniques, expected_counts): counts = _get_counts(values, uniques) assert_array_equal(counts, expected_counts) - - -@pytest.mark.parametrize( - "arr, sample_weight, expected_unique, expected_sum", - [ - ( - np.array([1] * 3 + [2] * 2 + [3]), - np.array([0, 1, 2, 3, 4, 5]), - [1, 2, 3], - [3, 7, 5], - ), - ( - np.array([3] + [2] * 2 + [1] * 3), - np.array([5, 3, 4, 2, 1, 0]), - [1, 2, 3], - [3, 7, 5], - ), - ], -) -def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum): - # TODO ohe_sw: Do more parametrize scenarios - unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True) - assert_array_equal(groupby_sum, expected_sum) - assert_array_equal(unique, expected_unique) From 
7e0ea84faae4b80e6de9826534dbdfdd8b995605 Mon Sep 17 00:00:00 2001 From: vitaliset Date: Mon, 10 Jun 2024 00:17:11 -0300 Subject: [PATCH 11/12] update new api logic properly, update tests on sample_weight and adjustments to sum(sample_weight)=0 scenario --- sklearn/preprocessing/_encoders.py | 11 ++- sklearn/preprocessing/tests/test_encoders.py | 10 +-- sklearn/utils/_array_api.py | 78 +++----------------- sklearn/utils/_encode.py | 65 ++++++++++++++-- sklearn/utils/tests/test_array_api.py | 25 ------- sklearn/utils/tests/test_encode.py | 37 +++++++++- 6 files changed, 118 insertions(+), 108 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8ed398b9352f8..f1e6d6833de32 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -16,7 +16,11 @@ from ..utils._missing import is_scalar_nan from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils._set_output import _get_output_config -from ..utils.validation import _check_feature_names_in, check_is_fitted +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + check_is_fitted, +) __all__ = ["OneHotEncoder", "OrdinalEncoder"] @@ -82,6 +86,11 @@ def _fit( ) self.n_features_in_ = n_features if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + # Filtering rows with sample_weight equals zero so we don't get extra dummy + # columns. + X_list = [Xi[sample_weight != 0] for Xi in X_list] + sample_weight = sample_weight[sample_weight != 0] n_samples = np.sum(sample_weight) if self.categories != "auto": diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 3bd466ebd1bfc..ccb4c96652b9d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2336,7 +2336,7 @@ def test_ordinal_encoder_missing_appears_infrequent(): ["airplane", 4], ], np.array([2, 2.5, 0.5, 0.1, 0, 0]), - (6, 5), + (6, 4), # columns: car, bike, 3, infrequent (1) ), ( [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]], @@ -2352,9 +2352,9 @@ def test_ordinal_encoder_missing_appears_infrequent(): def test_one_hot_encoder_sample_weight_min_frequency( X, sample_weight, expected_shape, min_frequency ): - ohe = OneHotEncoder(min_frequency=2) + ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist") X_trans = ohe.fit_transform(X, sample_weight=sample_weight) - assert X_trans.shape == expected_shape + assert_allclose(X_trans.shape, expected_shape) @pytest.mark.parametrize( @@ -2380,9 +2380,9 @@ def test_one_hot_encoder_sample_weight_min_frequency( ], ) def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape): - ohe = OneHotEncoder(max_categories=2) + ohe = OneHotEncoder(max_categories=2, handle_unknown="ignore") X_trans = ohe.fit_transform(X, sample_weight=sample_weight) - assert X_trans.shape == expected_shape + assert_allclose(X_trans.shape, expected_shape) @pytest.mark.parametrize( diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 76af192c0c360..7bf9183c80772 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -397,37 +397,19 @@ def asarray(self, x, *, dtype=None, device=None, copy=None): # noqa else: return numpy.asarray(x, dtype=dtype) - def unique_inverse(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique(x, return_inverse=True) - else: - return 
_unique_groupby_sum(x, sample_weight, return_inverse=True) + def unique_inverse(self, x): + return numpy.unique(x, return_inverse=True) - def unique_counts(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique(x, return_counts=True) - else: - return _unique_groupby_sum(x, sample_weight, return_counts=True) + def unique_counts(self, x): + return numpy.unique(x, return_counts=True) - def unique_values(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique(x) - else: - return _unique_groupby_sum(x, sample_weight) + def unique_values(self, x): + return numpy.unique(x) - def unique_all(self, x, *, sample_weight=None): - if sample_weight is None: - return numpy.unique( - x, return_index=True, return_inverse=True, return_counts=True - ) - else: - return _unique_groupby_sum( - x, - sample_weight, - return_index=True, - return_inverse=True, - return_counts=True, - ) + def unique_all(self, x): + return numpy.unique( + x, return_index=True, return_inverse=True, return_counts=True + ) def concat(self, arrays, *, axis=None): return numpy.concatenate(arrays, axis=axis) @@ -985,43 +967,3 @@ def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): return ret[: ar1.shape[0]] else: return xp.take(ret, rev_idx, axis=0) - - -def _unique_groupby_sum( - arr, - sample_weight, - return_index=False, - return_inverse=False, - return_counts=False, -): - """This functions behaves like numpy.unique but it counts the values of `arr` - taking into acount `sample_weight`.""" - if sample_weight is None: - sample_weight = numpy.ones(len(arr)) - # TODO ohe_sw: sample_weight = _check_sample_weight(sample_weight, arr) - # gave me circular import. What should I do? - - sorted_indices = numpy.argsort(arr) - sorted_arr = arr[sorted_indices] - sorted_sample_weight = sample_weight[sorted_indices] - - unique_elements, unique_indices = numpy.unique(sorted_arr, return_index=True) - _, unique_inverse = numpy.unique(arr, return_inverse=True) - - unique_indices = numpy.append(unique_indices, len(arr)) - subarrays = numpy.split(sorted_sample_weight, unique_indices[1:]) - group_sums = numpy.array( - [numpy.sum(subarray.astype(float)) for subarray in subarrays[:-1]] - ) - - results = [unique_elements] - if return_index: - results.append(unique_indices) - if return_inverse: - results.append(unique_inverse) - if return_counts: - results.append(group_sums) - - if len(results) > 1: - return tuple(results) - return results[0] diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 2d5ae33051529..fc5e97aad7d69 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -12,6 +12,7 @@ get_namespace, ) from ._missing import is_scalar_nan +from .validation import _check_sample_weight def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=False): @@ -61,21 +62,69 @@ def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=F ) +def _xp_unique_groupby_sum( + xp, + arr, + sample_weight, + return_index=False, + return_inverse=False, + return_counts=False, +): + """This functions behaves like xp.unique_all but it counts the values of `arr` + taking into acount `sample_weight`.""" + sample_weight = _check_sample_weight(sample_weight, arr) + + sorted_indices = xp.argsort(arr) + sorted_arr = arr[sorted_indices] + sorted_sample_weight = sample_weight[sorted_indices] + + unique_elements, unique_indices, _, _ = xp.unique_all(sorted_arr) + _, unique_inverse = xp.unique_inverse(arr) + + # TODO ohe_sw: Update to xp-logic. 
These functions are not in the array API.
+    unique_indices = np.append(unique_indices, len(arr))
+    subarrays = np.split(sorted_sample_weight, unique_indices[1:])
+    group_sums = np.array(
+        [np.sum(subarray.astype(float)) for subarray in subarrays[:-1]]
+    )
+
+    results = [unique_elements]
+    if return_index:
+        results.append(unique_indices)
+    if return_inverse:
+        results.append(unique_inverse)
+    if return_counts:
+        results.append(group_sums)
+
+    if len(results) > 1:
+        return tuple(results)
+    return results[0]
+
+
 def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None):
     """Helper function to find unique values for numpy arrays that correctly
     accounts for nans. See `_unique` documentation for details."""
     xp, _ = get_namespace(values)
 
     inverse, counts = None, None
-
-    if return_inverse and return_counts:
-        uniques, _, inverse, counts = xp.unique_all(values, sample_weight=sample_weight)
-    elif return_inverse:
-        uniques, inverse = xp.unique_inverse(values, sample_weight=sample_weight)
-    elif return_counts:
-        uniques, counts = xp.unique_counts(values, sample_weight=sample_weight)
+    if sample_weight is None:
+        if return_inverse and return_counts:
+            uniques, _, inverse, counts = xp.unique_all(values)
+        elif return_inverse:
+            uniques, inverse = xp.unique_inverse(values)
+        elif return_counts:
+            uniques, counts = xp.unique_counts(values)
+        else:
+            uniques = xp.unique_values(values)
     else:
-        uniques = xp.unique_values(values, sample_weight=sample_weight)
+        uniques, _, inverse, counts = _xp_unique_groupby_sum(
+            xp,
+            values,
+            sample_weight,
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+        )
 
     # np.unique will have duplicate missing values at the end of `uniques`
     # here we clip the nans and remove it from uniques
diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py
index 50caee99be36d..25913e7f54846 100644
--- a/sklearn/utils/tests/test_array_api.py
+++ b/sklearn/utils/tests/test_array_api.py
@@ -20,7 +20,6 @@
     _nanmin,
     _NumPyAPIWrapper,
     _ravel,
-    _unique_groupby_sum,
     device,
     get_namespace,
     get_namespace_and_device,
@@ -567,27 +566,3 @@ def test_get_namespace_and_device():
     assert namespace is xp_torch
     assert is_array_api
     assert device == some_torch_tensor.device
-
-
-@pytest.mark.parametrize(
-    "arr, sample_weight, expected_unique, expected_sum",
-    [
-        (
-            numpy.array([1] * 3 + [2] * 2 + [3]),
-            numpy.array([0, 1, 2, 3, 4, 5]),
-            [1, 2, 3],
-            [3, 7, 5],
-        ),
-        (
-            numpy.array([3] + [2] * 2 + [1] * 3),
-            numpy.array([5, 3, 4, 2, 1, 0]),
-            [1, 2, 3],
-            [3, 7, 5],
-        ),
-    ],
-)
-def test_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum):
-    # TODO ohe_sw: Do more parametrize scenarios
-    unique, groupby_sum = _unique_groupby_sum(arr, sample_weight, return_counts=True)
-    assert_array_equal(groupby_sum, expected_sum)
-    assert_array_equal(unique, expected_unique)
diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py
index 9118eb56f0ba4..c1c430ca82df9 100644
--- a/sklearn/utils/tests/test_encode.py
+++ b/sklearn/utils/tests/test_encode.py
@@ -4,7 +4,14 @@
 import pytest
 from numpy.testing import assert_array_equal
 
-from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
+from sklearn.utils._array_api import get_namespace
+from sklearn.utils._encode import (
+    _check_unknown,
+    _encode,
+    _get_counts,
+    _unique,
+    _xp_unique_groupby_sum,
+)
 
 
 @pytest.mark.parametrize(
@@ -272,3 +279,31 @@ def test_check_unknown_with_both_missing_values():
 def test_get_counts(values, uniques, expected_counts):
     counts = _get_counts(values, uniques)
     assert_array_equal(counts, expected_counts)
+
+
+@pytest.mark.parametrize(
+    "arr, sample_weight, expected_unique, expected_sum",
+    [
+        (
+            np.array([1] * 3 + [2] * 2 + [3]),
+            np.array([0, 1, 2, 3, 4, 5]),
+            [1, 2, 3],
+            [3, 7, 5],
+        ),
+        (
+            np.array([3] + [2] * 2 + [1] * 3),
+            np.array([5, 3, 4, 2, 1, 0]),
+            [1, 2, 3],
+            [3, 7, 5],
+        ),
+    ],
+)
+def test_xp_unique_groupby_sum(arr, sample_weight, expected_unique, expected_sum):
+    # TODO ohe_sw: Do more parametrize scenarios
+    # TODO ohe_sw: Test other array types
+    xp, _ = get_namespace(arr)
+    unique, groupby_sum = _xp_unique_groupby_sum(
+        xp, arr, sample_weight, return_counts=True
+    )
+    assert_array_equal(groupby_sum, expected_sum)
+    assert_array_equal(unique, expected_unique)

From 55159386b8ca0ea6eb5793ea61973ddca6ee7f8e Mon Sep 17 00:00:00 2001
From: Carlo Lemos <55899543+vitaliset@users.noreply.github.com>
Date: Tue, 13 May 2025 01:57:35 +0000
Subject: [PATCH 12/12] update whats_new

---
 .../sklearn.preprocessing/26330.enhancement.rst | 4 ++++
 doc/whats_new/v1.3.rst                          | 5 -----
 2 files changed, 4 insertions(+), 5 deletions(-)
 create mode 100644 doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst

diff --git a/doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst
new file mode 100644
index 0000000000000..63fe86bebed33
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.preprocessing/26330.enhancement.rst
@@ -0,0 +1,4 @@
+- Adds support for `sample_weight` in :class:`preprocessing.OneHotEncoder`. When
+  `sample_weight` is used, `min_frequency` and `max_categories` filter on the
+  sum of the samples' weights for each category instead of the sample count. By
+  :user:`Carlo Lemos <vitaliset>`.
\ No newline at end of file
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 440b0a97f04cf..f523c02e14447 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -871,11 +871,6 @@ Changelog
   combines input arguments `(input_feature, category)` to a string. :pr:`22506`
   by :user:`Mario Kostelac `.
 
-- |Enhancement| Adds support for `sample_weight` in
-  :class:`preprocessing.OneHotEncoder`. When using `sample_weight`, then `min_frequency`
-  and `max_categories` will filter according to sum samples' weight for that category
-  instead of count. :pr:`26330` by :user:`Carlo Lemos `.
-
- |Enhancement| Added support for `sample_weight` in
  :class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter
  `sample_weight` for each sample to be used while fitting. The option is only
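
Note on the weighted counting used throughout this series: what `_unique_groupby_sum`
(and later `_xp_unique_groupby_sum`) computes is "np.unique plus per-value weight
sums". The snippet below is not part of the patches; it is a minimal sketch of the
same idea using `np.bincount`, which sums weights per group directly and avoids the
argsort/`np.split` bookkeeping flagged as suboptimal in the PATCH 01 TODO. The helper
name `weighted_value_counts` is hypothetical:

    import numpy as np

    def weighted_value_counts(values, sample_weight=None):
        # Unique values of `values` plus, per unique value, the sum of
        # sample_weight (a plain count when sample_weight is None).
        values = np.asarray(values)
        if sample_weight is None:
            sample_weight = np.ones(values.shape[0])
        uniques, inverse = np.unique(values, return_inverse=True)
        # bincount sums the weights that land in each inverse-index bucket.
        sums = np.bincount(inverse, weights=np.asarray(sample_weight, dtype=float))
        return uniques, sums

    uniques, sums = weighted_value_counts([1, 1, 1, 2, 2, 3], [0, 1, 2, 3, 4, 5])
    print(uniques, sums)  # [1 2 3] [3. 7. 5.] -- the pairs asserted in test_xp_unique_groupby_sum

The user-facing effect, on the data from `test_one_hot_encoder_sample_weight_min_frequency`
(the `sample_weight` argument to `fit`/`fit_transform` exists only with these patches
applied; it is not in released scikit-learn):

    from sklearn.preprocessing import OneHotEncoder

    X = [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]
    sw = np.array([5, 5, 0.1, 0.3, 4, 0.9])

    ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
    X_trans = ohe.fit_transform(X, sample_weight=sw)
    # Weighted counts: car=10 and boat=4 stay frequent, while bike=0.4 and
    # airplane=0.9 fall below min_frequency=2 and are merged into a single
    # infrequent column, giving X_trans.shape == (6, 3) as the test expects.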