ENH Add sample_weight parameter to OneHotEncoder's .fit #26330


Open · wants to merge 17 commits into base: main
@@ -0,0 +1,4 @@
- Adds support for `sample_weight` in :class:`preprocessing.OneHotEncoder`. When
`sample_weight` is provided, `min_frequency` and `max_categories` filter
categories by the sum of the samples' weights in each category instead of
the sample count. By :user:`Carlo Lemos <vitaliset>`.
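
A minimal usage sketch of the behaviour described above, assuming the `sample_weight` parameter proposed in this pull request (the data and expected categories are illustrative only):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["car"], ["car"], ["bike"], ["boat"]])
sample_weight = np.array([3.0, 2.0, 0.5, 0.0])

# With weights, `min_frequency` is compared against the summed weight per
# category: car -> 3.0 + 2.0 = 5.0, bike -> 0.5, boat -> 0.0 (zero-weight rows
# are dropped before the categories are determined).
enc = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
enc.fit(X, sample_weight=sample_weight)

# "car" stays frequent, "bike" becomes infrequent and "boat" is never seen.
print(enc.categories_)
print(enc.infrequent_categories_)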
44 changes: 34 additions & 10 deletions sklearn/preprocessing/_encoders.py
@@ -19,6 +19,7 @@
_check_feature_names,
_check_feature_names_in,
_check_n_features,
_check_sample_weight,
check_is_fitted,
)

@@ -72,6 +73,7 @@ def _check_X(self, X, ensure_all_finite=True):
def _fit(
self,
X,
sample_weight=None,
handle_unknown="error",
ensure_all_finite=True,
return_counts=False,
@@ -84,6 +86,13 @@
X, ensure_all_finite=ensure_all_finite
)
self.n_features_in_ = n_features
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
# Filter out rows whose sample_weight is zero so that their categories do
# not produce extra dummy columns.
X_list = [Xi[sample_weight != 0] for Xi in X_list]
sample_weight = sample_weight[sample_weight != 0]
n_samples = np.sum(sample_weight)

if self.categories != "auto":
if len(self.categories) != n_features:
@@ -100,7 +109,9 @@
Xi = X_list[i]

if self.categories == "auto":
result = _unique(Xi, return_counts=compute_counts)
result = _unique(
Xi, sample_weight=sample_weight, return_counts=compute_counts
)
if compute_counts:
cats, counts = result
category_counts.append(counts)
@@ -163,7 +174,7 @@ def _fit(
)
raise ValueError(msg)
if compute_counts:
category_counts.append(_get_counts(Xi, cats))
category_counts.append(_get_counts(Xi, cats, sample_weight))

self.categories_.append(cats)

@@ -281,11 +292,13 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):

Parameters
----------
category_count : ndarray of shape (n_cardinality,)
Category counts.
category_count : array-like of shape (n_cardinality,)
Category counts, or the sum of `sample_weight` over the samples in each
category when `sample_weight` is not `None`.

n_samples : int
Number of samples.
Number of samples in the training set, or the total sum of `sample_weight`
over all samples when `sample_weight` is not `None`.

col_idx : int
Index of the current category. Only used for the error message.
@@ -348,7 +361,8 @@ def _fit_infrequent_category_mapping(
Parameters
----------
n_samples : int
Number of samples in training set.
Number of samples in the training set, or the total sum of `sample_weight`
over all samples when `sample_weight` is not `None`.
category_counts: list of ndarray
`category_counts[i]` is the category counts corresponding to
`self.categories_[i]`.
@@ -578,13 +592,15 @@ class OneHotEncoder(_BaseEncoder):

min_frequency : int or float, default=None
Specifies the minimum frequency below which a category will be
considered infrequent.
considered infrequent. If `sample_weight` is passed to `fit`, the frequency
is computed as the sum of the samples' weights instead of the sample count.

- If `int`, categories with a smaller cardinality will be considered
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent. If
`sample_weight` is not `None`, then `n_samples = sum(sample_weight)`.

.. versionadded:: 1.1
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
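
A small worked illustration of the float `min_frequency` rule under the weighted semantics above (not part of the diff; the numbers are made up):

import numpy as np

# Per-category summed weights after zero-weight rows have been removed.
category_weight = {"car": 2.0 + 0.5, "bike": 2.5 + 0.1}
sample_weight = np.array([2.0, 2.5, 0.5, 0.1])

n_samples = sample_weight.sum()        # 5.1, the weight total, not the row count 4
min_frequency = 0.3
threshold = min_frequency * n_samples  # 1.53

infrequent = [c for c, w in category_weight.items() if w < threshold]
print(infrequent)                      # [] -- both categories stay frequent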
@@ -970,7 +986,7 @@ def _compute_n_features_outs(self):
return output

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""
Fit OneHotEncoder to X.

@@ -983,13 +999,21 @@ def fit(self, X, y=None):
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.

sample_weight : array-like of shape (n_samples,), default=None
Sample weights used to weight the categories when filtering them with
`max_categories` and `min_frequency`. If `None`, all samples are equally
weighted. If both `max_categories` and `min_frequency` are left at their
default values, then `sample_weight` is ignored.

Returns
-------
self
Fitted encoder.
"""
self._fit(
X,
sample_weight=sample_weight,
handle_unknown=self.handle_unknown,
ensure_all_finite="allow-nan",
)
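
A sketch (again assuming this PR's proposed API) of the note above that `sample_weight` only has an effect when `min_frequency` or `max_categories` is set:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["car"], ["car"], ["bike"], ["airplane"]])
w = np.array([10.0, 10.0, 0.1, 0.1])

# With the default parameters the weights do not change the fitted categories.
weighted = OneHotEncoder().fit(X, sample_weight=w)
unweighted = OneHotEncoder().fit(X)
assert all(
    np.array_equal(a, b)
    for a, b in zip(weighted.categories_, unweighted.categories_)
)

# With `max_categories` set, the weights decide which categories survive the cap.
capped = OneHotEncoder(max_categories=2, handle_unknown="ignore").fit(
    X, sample_weight=w
)
print(capped.infrequent_categories_)  # expected: bike and airplane grouped as infrequent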
@@ -1313,7 +1337,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent.

.. versionadded:: 1.3
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
92 changes: 92 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
@@ -2352,6 +2352,98 @@ def test_ordinal_encoder_missing_appears_infrequent():
assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])


@pytest.mark.parametrize(
"X, sample_weight, expected_shape",
[
(
[
["car", 3],
["bike", 3],
["car", 1],
["bike", 3],
["boat", 2],
["airplane", 4],
],
np.array([2, 2.5, 0.5, 0.1, 0, 0]),
(6, 4), # columns: car, bike, 3, infrequent (1)
),
(
[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
np.array([5, 5, 0.1, 0.3, 4, 0.9]),
(6, 3),
),
],
)
def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape):
# The expected shapes assume `min_frequency=2` applied to the weighted counts above.
ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
assert_allclose(X_trans.shape, expected_shape)


@pytest.mark.parametrize(
"X, sample_weight, expected_shape",
[
(
[
["car", 3],
["bike", 3],
["car", 1],
["bike", 3],
["boat", 2],
["airplane", 4],
],
np.array([2, 2.5, 0.5, 0.1, 0, 0]),
(6, 4),
),
(
[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
np.array([5, 5, 0.1, 0.3, 4, 0.9]),
(6, 2),
),
],
)
def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape):
ohe = OneHotEncoder(max_categories=2, handle_unknown="ignore")
X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
assert_allclose(X_trans.shape, expected_shape)


@pytest.mark.parametrize(
"X",
[[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
@pytest.mark.parametrize(
"min_frequency",
[0.1, 0.3, 0.5, 0.9],
)
def test_one_hot_encoder_sample_weight_constant(X, min_frequency):
ohe = OneHotEncoder(min_frequency=min_frequency)
X_sw_None = ohe.fit_transform(X, sample_weight=None).toarray()
X_sw_constant1 = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()
X_sw_constant5 = ohe.fit_transform(X, sample_weight=5 * np.ones(len(X))).toarray()

assert_array_equal(X_sw_None, X_sw_constant1)
assert_array_equal(X_sw_None, X_sw_constant5)


@pytest.mark.parametrize(
"X",
[[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
def test_one_hot_encoder_sample_weight_is_ignored(X):
ohe = OneHotEncoder()
X_sw_None = ohe.fit_transform(X).toarray()
X_sw_ones = ohe.fit_transform(X, sample_weight=np.ones(len(X))).toarray()

assert_array_equal(X_sw_None, X_sw_ones)


@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
def test_encoder_not_fitted(Encoder):
"""Check that we raise a `NotFittedError` by calling transform before fit with
99 changes: 82 additions & 17 deletions sklearn/utils/_encode.py
@@ -15,9 +15,10 @@
xpx,
)
from ._missing import is_scalar_nan
from .validation import _check_sample_weight


def _unique(values, *, return_inverse=False, return_counts=False):
def _unique(values, *, sample_weight=None, return_inverse=False, return_counts=False):
"""Helper function to find unique values with support for python objects.

Uses pure python method for object dtype, and numpy method for
@@ -50,29 +51,83 @@
"""
if values.dtype == object:
return _unique_python(
values, return_inverse=return_inverse, return_counts=return_counts
values,
return_inverse=return_inverse,
return_counts=return_counts,
sample_weight=sample_weight,
)
# numerical
return _unique_np(
values, return_inverse=return_inverse, return_counts=return_counts
values,
return_inverse=return_inverse,
return_counts=return_counts,
sample_weight=sample_weight,
)


def _unique_np(values, return_inverse=False, return_counts=False):
def _xp_unique_groupby_sum(
xp,
arr,
sample_weight,
return_index=False,
return_inverse=False,
return_counts=False,
):
"""This functions behaves like xp.unique_all but it counts the values of `arr`
taking into acount `sample_weight`."""
sample_weight = _check_sample_weight(sample_weight, arr)

sorted_indices = xp.argsort(arr)
sorted_arr = arr[sorted_indices]
sorted_sample_weight = sample_weight[sorted_indices]

unique_elements, unique_indices, _, _ = xp.unique_all(sorted_arr)
_, unique_inverse = xp.unique_inverse(arr)

# TODO ohe_sw: Update to xp logic. These functions are not in the array API.
unique_indices = np.append(unique_indices, len(arr))
subarrays = np.split(sorted_sample_weight, unique_indices[1:])
group_sums = np.array(
[np.sum(subarray.astype(float)) for subarray in subarrays[:-1]]
)

results = [unique_elements]
if return_index:
results.append(unique_indices)
if return_inverse:
results.append(unique_inverse)
if return_counts:
results.append(group_sums)

if len(results) > 1:
return tuple(results)
return results[0]
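
For reference, a plain-NumPy sketch of the same "unique values plus per-group weight sums" idea used by the helper above (an illustration, not the implementation): `np.unique` with `return_inverse=True` yields group indices that `np.bincount` can sum weights over.

import numpy as np

arr = np.array([2, 1, 2, 3, 1, 1])
sample_weight = np.array([0.5, 1.0, 2.0, 0.0, 3.0, 0.5])

uniques, inverse = np.unique(arr, return_inverse=True)
weight_sums = np.bincount(inverse, weights=sample_weight, minlength=len(uniques))

print(uniques)      # [1 2 3]
print(weight_sums)  # [4.5 2.5 0. ]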

Codecov / codecov/patch warning: added line sklearn/utils/_encode.py#L104 was not covered by tests.


def _unique_np(values, return_inverse=False, return_counts=False, sample_weight=None):
"""Helper function to find unique values for numpy arrays that correctly
accounts for nans. See `_unique` documentation for details."""
xp, _ = get_namespace(values)

inverse, counts = None, None

if return_inverse and return_counts:
uniques, _, inverse, counts = xp.unique_all(values)
elif return_inverse:
uniques, inverse = xp.unique_inverse(values)
elif return_counts:
uniques, counts = xp.unique_counts(values)
if sample_weight is None:
if return_inverse and return_counts:
uniques, _, inverse, counts = xp.unique_all(values)
elif return_inverse:
uniques, inverse = xp.unique_inverse(values)
elif return_counts:
uniques, counts = xp.unique_counts(values)
else:
uniques = xp.unique_values(values)
else:
uniques = xp.unique_values(values)
uniques, _, inverse, counts = _xp_unique_groupby_sum(
xp,
values,
sample_weight,
return_index=True,
return_inverse=True,
return_counts=True,
)

# np.unique will have duplicate missing values at the end of `uniques`
# here we clip the nans and remove it from uniques
@@ -174,7 +229,7 @@
return xp.asarray([table[v] for v in values], device=device(values))


def _unique_python(values, *, return_inverse, return_counts):
def _unique_python(values, *, return_inverse, return_counts, sample_weight=None):
# Only used in `_uniques`, see docstring there for details
try:
uniques_set = set(values)
@@ -195,7 +250,7 @@
ret += (_map_to_integer(values, uniques),)

if return_counts:
ret += (_get_counts(values, uniques),)
ret += (_get_counts(values, uniques, sample_weight),)

return ret[0] if len(ret) == 1 else ret

@@ -349,9 +404,12 @@
raise KeyError(key)


def _get_counts(values, uniques):
def _get_counts(values, uniques, sample_weight=None):
"""Get the count of each of the `uniques` in `values`.

If `sample_weight` is not `None`, the count is the sum of `sample_weight`
over the samples with that unique value.

The counts will use the order passed in by `uniques`. For non-object dtypes,
`uniques` is assumed to be sorted and `np.nan` is at the end.
"""
@@ -360,10 +418,17 @@
output = np.zeros(len(uniques), dtype=np.int64)
for i, item in enumerate(uniques):
with suppress(KeyError):
output[i] = counter[item]
if sample_weight is None:
output[i] = counter[item]
else:
# TODO ohe_sw: This branch still needs tests. Does `values == item`
# work when `item` is NaN?
output[i] = np.sum(sample_weight[values == item])
return output

unique_values, counts = _unique_np(values, return_counts=True)
unique_values, counts = _unique_np(
values, return_counts=True, sample_weight=sample_weight
)

# Reorder unique_values based on input: `uniques`
uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
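
Regarding the TODO above about NaN handling: a quick standalone check showing that an equality mask misses NaN entries while an explicit NaN mask catches them (illustration only):

import numpy as np

values = np.array([1.0, np.nan, np.nan, 2.0])
sample_weight = np.array([1.0, 0.5, 2.0, 1.0])

item = np.nan
print(np.sum(sample_weight[values == item]))    # 0.0 -- NaN never compares equal
print(np.sum(sample_weight[np.isnan(values)]))  # 2.5 -- explicit NaN mask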