Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 7e7e115

Browse filesBrowse files
authored
ENH Add 'if_binary' option to drop argument of OneHotEncoder (#16245)
1 parent 0c4252c commit 7e7e115
Copy full SHA for 7e7e115

File tree

Expand file treeCollapse file tree

3 files changed

+90
-16
lines changed
Filter options
Expand file treeCollapse file tree

3 files changed

+90
-16
lines changed

‎doc/whats_new/v0.23.rst

Copy file name to clipboardExpand all lines: doc/whats_new/v0.23.rst
+5Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,11 @@ Changelog
220220
- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at
221221
transforming. :pr:`15762` by `Thomas Fan`_.
222222

223+
- |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder`
224+
will now accept value 'if_binary' and will drop the first category of
225+
each feature with two categories. :pr:`16245`
226+
by :user:`Rushabh Vasani <rushabh-v>`.
227+
223228
:mod:`sklearn.svm`
224229
..................
225230

‎sklearn/preprocessing/_encoders.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/_encoders.py
+46-16Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,9 @@ class OneHotEncoder(_BaseEncoder):
195195
- None : retain all features (the default).
196196
- 'first' : drop the first category in each feature. If only one
197197
category is present, the feature will be dropped entirely.
198+
- 'if_binary' : drop the first category in each feature with two
199+
categories. Features with exactly 1 or more than 2 categories are
200+
left intact.
198201
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
199202
should be dropped.
200203
@@ -222,8 +225,12 @@ class OneHotEncoder(_BaseEncoder):
222225
223226
drop_idx_ : array of shape (n_features,)
224227
``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to
225-
be dropped for each feature. None if all the transformed features will
226-
be retained.
228+
be dropped for each feature.
229+
``drop_idx_[i] = -1`` if no category is to be dropped from the feature
230+
with index ``i``, e.g. when `drop='if_binary'` and the feature isn't
231+
binary.
232+
233+
``drop_idx_ = None`` if all the transformed features will be retained.
227234
228235
See Also
229236
--------
@@ -293,15 +300,28 @@ def _validate_keywords(self):
293300
def _compute_drop_idx(self):
294301
if self.drop is None:
295302
return None
296-
elif (isinstance(self.drop, str) and self.drop == 'first'):
297-
return np.zeros(len(self.categories_), dtype=np.int_)
298-
elif not isinstance(self.drop, str):
303+
elif isinstance(self.drop, str):
304+
if self.drop == 'first':
305+
return np.zeros(len(self.categories_), dtype=np.int_)
306+
elif self.drop == 'if_binary':
307+
return np.array([0 if len(cats) == 2 else -1
308+
for cats in self.categories_], dtype=np.int_)
309+
else:
310+
msg = (
311+
"Wrong input for parameter `drop`. Expected "
312+
"'first', 'if_binary', None or array of objects, got {}"
313+
)
314+
raise ValueError(msg.format(type(self.drop)))
315+
316+
else:
299317
try:
300318
self.drop = np.asarray(self.drop, dtype=object)
301319
droplen = len(self.drop)
302320
except (ValueError, TypeError):
303-
msg = ("Wrong input for parameter `drop`. Expected "
304-
"'first', None or array of objects, got {}")
321+
msg = (
322+
"Wrong input for parameter `drop`. Expected "
323+
"'first', 'if_binary', None or array of objects, got {}"
324+
)
305325
raise ValueError(msg.format(type(self.drop)))
306326
if droplen != len(self.categories_):
307327
msg = ("`drop` should have length equal to the number "
@@ -321,10 +341,6 @@ def _compute_drop_idx(self):
321341
return np.array([np.where(cat_list == val)[0][0]
322342
for (val, cat_list) in
323343
zip(self.drop, self.categories_)], dtype=np.int_)
324-
else:
325-
msg = ("Wrong input for parameter `drop`. Expected "
326-
"'first', None or array of objects, got {}")
327-
raise ValueError(msg.format(type(self.drop)))
328344

329345
def fit(self, X, y=None):
330346
"""
@@ -392,15 +408,25 @@ def transform(self, X):
392408
n_samples, n_features = X_int.shape
393409

394410
if self.drop is not None:
395-
to_drop = self.drop_idx_.reshape(1, -1)
396-
411+
to_drop = self.drop_idx_.copy()
397412
# We remove all the dropped categories from mask, and decrement all
398413
# categories that occur after them to avoid an empty column.
399-
400414
keep_cells = X_int != to_drop
401-
X_mask &= keep_cells
415+
n_values = []
416+
for i, cats in enumerate(self.categories_):
417+
n_cats = len(cats)
418+
419+
# drop='if_binary' but feature isn't binary
420+
if to_drop[i] == -1:
421+
# set to cardinality to not drop from X_int
422+
to_drop[i] = n_cats
423+
n_values.append(n_cats)
424+
else: # dropped
425+
n_values.append(n_cats - 1)
426+
427+
to_drop = to_drop.reshape(1, -1)
402428
X_int[X_int > to_drop] -= 1
403-
n_values = [len(cats) - 1 for cats in self.categories_]
429+
X_mask &= keep_cells
404430
else:
405431
n_values = [len(cats) for cats in self.categories_]
406432

@@ -447,6 +473,10 @@ def inverse_transform(self, X):
447473
if self.drop is None:
448474
n_transformed_features = sum(len(cats)
449475
for cats in self.categories_)
476+
elif isinstance(self.drop, str) and self.drop == 'if_binary':
477+
n_transformed_features = sum(1 if len(cats) == 2
478+
else len(cats)
479+
for cats in self.categories_)
450480
else:
451481
n_transformed_features = sum(len(cats) - 1
452482
for cats in self.categories_)

‎sklearn/preprocessing/tests/test_encoders.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/tests/test_encoders.py
+39Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,15 @@ def test_one_hot_encoder_inverse(sparse_, drop):
265265
enc.inverse_transform(X_tr)
266266

267267

268+
def test_one_hot_encoder_inverse_if_binary():
269+
X = np.array([['Male', 1],
270+
['Female', 3],
271+
['Female', 2]], dtype=object)
272+
ohe = OneHotEncoder(drop='if_binary', sparse=False)
273+
X_tr = ohe.fit_transform(X)
274+
assert_array_equal(ohe.inverse_transform(X_tr), X)
275+
276+
268277
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
269278
@pytest.mark.parametrize("X", [
270279
[1, 2],
@@ -398,6 +407,36 @@ def test_one_hot_encoder_feature_names_drop(drop, expected_names):
398407
assert_array_equal(expected_names, feature_names)
399408

400409

410+
def test_one_hot_encoder_drop_equals_if_binary():
411+
# Canonical case
412+
X = [[10, 'yes'],
413+
[20, 'no'],
414+
[30, 'yes']]
415+
expected = np.array([[1., 0., 0., 1.],
416+
[0., 1., 0., 0.],
417+
[0., 0., 1., 1.]])
418+
expected_drop_idx = np.array([-1, 0])
419+
420+
ohe = OneHotEncoder(drop='if_binary', sparse=False)
421+
result = ohe.fit_transform(X)
422+
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
423+
assert_allclose(result, expected)
424+
425+
# with only one cat, the behaviour is equivalent to drop=None
426+
X = [['true', 'a'],
427+
['false', 'a'],
428+
['false', 'a']]
429+
expected = np.array([[1., 1.],
430+
[0., 1.],
431+
[0., 1.]])
432+
expected_drop_idx = np.array([0, -1])
433+
434+
ohe = OneHotEncoder(drop='if_binary', sparse=False)
435+
result = ohe.fit_transform(X)
436+
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
437+
assert_allclose(result, expected)
438+
439+
401440
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
402441
np.array([['a', np.nan]], dtype=object).T],
403442
ids=['numeric', 'object'])

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.