Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 41a8d1f

Browse filesBrowse files
FIX Encoder should accept categories having dtype='S' (#19727)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 3f3dec0 commit 41a8d1f
Copy full SHA for 41a8d1f

File tree

Expand file treeCollapse file tree

6 files changed

+108
-38
lines changed
Filter options
Expand file treeCollapse file tree

6 files changed

+108
-38
lines changed

‎doc/whats_new/v0.24.rst

Copy file name to clipboardExpand all lines: doc/whats_new/v0.24.rst
+13-8Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,13 @@ Changelog
6262
:class:`model_selection.HalvingGridSearchCV` were not properly converted to
6363
numpy arrays. :pr:`19211` by `Nicolas Hug`_.
6464

65+
:mod:`sklearn.multioutput`
66+
..........................
67+
68+
- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
69+
that dynamically define `predict` during fitting, such as
70+
:class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
71+
6572
:mod:`sklearn.preprocessing`
6673
............................
6774

@@ -70,19 +77,17 @@ Changelog
7077
`'use_encoded_value'` strategies.
7178
:pr:`19234` by `Guillaume Lemaitre <glemaitre>`.
7279

80+
- |Fix| Fix encoder categories having dtype='S' in
81+
:class:`preprocessing.OneHotEncoder` and
82+
:class:`preprocessing.OrdinalEncoder`.
83+
:pr:`19727` by :user:`Andrew Delong <andrewdelong>`.
84+
7385
- |Fix| :meth:`preprocessing.OrdinalEncoder.transform` correctly handles
7486
unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_.
7587

7688
- |Fix| :meth:`preprocessing.OneHotEncoder.fit` no longer alters the `drop`
7789
parameter. :pr:`19924` by `Thomas Fan`_.
7890

79-
:mod:`sklearn.multioutput`
80-
..........................
81-
82-
- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
83-
that dynamically define `predict` during fitting, such as
84-
:class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
85-
8691
:mod:`sklearn.semi_supervised`
8792
..............................
8893

@@ -91,7 +96,7 @@ Changelog
9196
:pr:`19271` by :user:`Zhaowei Wang <ThuWangzw>`.
9297

9398
:mod:`sklearn.tree`
94-
.......................
99+
...................
95100

96101
- |Fix| Fix a bug in `fit` of :class:`tree.BaseDecisionTree` that caused
97102
segmentation faults under certain conditions. `fit` now deep copies the

‎sklearn/preprocessing/_encoders.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/_encoders.py
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
9090
cats = _unique(Xi)
9191
else:
9292
cats = np.array(self.categories[i], dtype=Xi.dtype)
93-
if Xi.dtype.kind not in 'OU':
93+
if Xi.dtype.kind not in 'OUS':
9494
sorted_cats = np.sort(cats)
9595
error_msg = ("Unsorted categories are not "
9696
"supported for numerical categories")

‎sklearn/preprocessing/tests/test_encoders.py

Copy file name to clipboardExpand all lines: sklearn/preprocessing/tests/test_encoders.py
+11-6Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -710,7 +710,8 @@ def test_encoder_dtypes():
710710

711711
for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
712712
np.array([[1, 2], [3, 4]], dtype='float64'),
713-
np.array([['a', 'b'], ['c', 'd']]), # string dtype
713+
np.array([['a', 'b'], ['c', 'd']]), # unicode dtype
714+
np.array([[b'a', b'b'], [b'c', b'd']]), # byte string dtype
714715
np.array([[1, 'a'], [3, 'b']], dtype='object')]:
715716
enc.fit(X)
716717
assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
@@ -847,21 +848,25 @@ def test_encoders_has_categorical_tags(Encoder):
847848
assert 'categorical' in Encoder()._get_tags()['X_types']
848849

849850

850-
@pytest.mark.parametrize('input_dtype', ['O', 'U'])
851-
@pytest.mark.parametrize('category_dtype', ['O', 'U'])
851+
# deliberately omit 'OS' as an invalid combo
852+
@pytest.mark.parametrize('input_dtype, category_dtype', ['OO', 'OU',
853+
'UO', 'UU', 'US',
854+
'SO', 'SU', 'SS'])
852855
@pytest.mark.parametrize('array_type', ['list', 'array', 'dataframe'])
853-
def test_encoders_unicode_categories(input_dtype, category_dtype, array_type):
854-
"""Check that encoding work with string and object dtypes.
856+
def test_encoders_string_categories(input_dtype, category_dtype, array_type):
857+
"""Check that encoding works with object, unicode, and byte string dtypes.
855858
Non-regression test for:
856859
https://github.com/scikit-learn/scikit-learn/issues/15616
857860
https://github.com/scikit-learn/scikit-learn/issues/15726
861+
https://github.com/scikit-learn/scikit-learn/issues/19677
858862
"""
859863

860864
X = np.array([['b'], ['a']], dtype=input_dtype)
861865
categories = [np.array(['b', 'a'], dtype=category_dtype)]
862866
ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)
863867

864-
X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type)
868+
X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type,
869+
dtype=input_dtype)
865870
X_trans = ohe.transform(X_test)
866871

867872
expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])

‎sklearn/utils/_encode.py

Copy file name to clipboardExpand all lines: sklearn/utils/_encode.py
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def _encode(values, *, uniques, check_unknown=True):
173173
encoded : ndarray
174174
Encoded values
175175
"""
176-
if values.dtype.kind in 'OU':
176+
if values.dtype.kind in 'OUS':
177177
try:
178178
return _map_to_integer(values, uniques)
179179
except KeyError as e:
@@ -214,7 +214,7 @@ def _check_unknown(values, known_values, return_mask=False):
214214
"""
215215
valid_mask = None
216216

217-
if values.dtype.kind in 'UO':
217+
if values.dtype.kind in 'OUS':
218218
values_set = set(values)
219219
values_set, missing_in_values = _extract_missing(values_set)
220220

‎sklearn/utils/_testing.py

Copy file name to clipboardExpand all lines: sklearn/utils/_testing.py
+38-10Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -758,30 +758,58 @@ def assert_run_python_script(source_code, timeout=60):
758758
os.unlink(source_file)
759759

760760

761-
def _convert_container(container, constructor_name, columns_name=None):
761+
def _convert_container(
762+
container, constructor_name, columns_name=None, dtype=None
763+
):
764+
"""Convert a given container to a specific array-like with a dtype.
765+
766+
Parameters
767+
----------
768+
container : array-like
769+
The container to convert.
770+
constructor_name : {"list", "tuple", "array", "sparse", "dataframe", \
771+
"series", "index", "slice", "sparse_csr", "sparse_csc"}
772+
The type of the returned container.
773+
columns_name : index or array-like, default=None
774+
For pandas containers that support column names, `columns_name` sets
775+
the names of the columns.
776+
dtype : dtype, default=None
777+
Force the dtype of the container. Does not apply to `"slice"`
778+
container.
779+
780+
Returns
781+
-------
782+
converted_container
783+
"""
762784
if constructor_name == 'list':
763-
return list(container)
785+
if dtype is None:
786+
return list(container)
787+
else:
788+
return np.asarray(container, dtype=dtype).tolist()
764789
elif constructor_name == 'tuple':
765-
return tuple(container)
790+
if dtype is None:
791+
return tuple(container)
792+
else:
793+
return tuple(np.asarray(container, dtype=dtype).tolist())
766794
elif constructor_name == 'array':
767-
return np.asarray(container)
795+
return np.asarray(container, dtype=dtype)
768796
elif constructor_name == 'sparse':
769-
return sp.sparse.csr_matrix(container)
797+
return sp.sparse.csr_matrix(container, dtype=dtype)
770798
elif constructor_name == 'dataframe':
771799
pd = pytest.importorskip('pandas')
772-
return pd.DataFrame(container, columns=columns_name)
800+
return pd.DataFrame(container, columns=columns_name, dtype=dtype)
773801
elif constructor_name == 'series':
774802
pd = pytest.importorskip('pandas')
775-
return pd.Series(container)
803+
return pd.Series(container, dtype=dtype)
776804
elif constructor_name == 'index':
777805
pd = pytest.importorskip('pandas')
778-
return pd.Index(container)
806+
return pd.Index(container, dtype=dtype)
779807
elif constructor_name == 'slice':
780808
return slice(container[0], container[1])
781809
elif constructor_name == 'sparse_csr':
782-
return sp.sparse.csr_matrix(container)
810+
return sp.sparse.csr_matrix(container, dtype=dtype)
783811
elif constructor_name == 'sparse_csc':
784-
return sp.sparse.csc_matrix(container)
812+
return sp.sparse.csc_matrix(container, dtype=dtype)
785813

786814

787815
def raises(expected_exc_type, match=None, may_pass=False, err_msg=None):

‎sklearn/utils/tests/test_testing.py

Copy file name to clipboardExpand all lines: sklearn/utils/tests/test_testing.py
+43-11Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -624,19 +624,51 @@ def test_create_memmap_backed_data(monkeypatch):
624624

625625
@pytest.mark.parametrize(
626626
"constructor_name, container_type",
627-
[('list', list),
628-
('tuple', tuple),
629-
('array', np.ndarray),
630-
('sparse', sparse.csr_matrix),
631-
('dataframe', pytest.importorskip('pandas').DataFrame),
632-
('series', pytest.importorskip('pandas').Series),
633-
('index', pytest.importorskip('pandas').Index),
634-
('slice', slice)]
627+
[
628+
('list', list),
629+
('tuple', tuple),
630+
('array', np.ndarray),
631+
('sparse', sparse.csr_matrix),
632+
('sparse_csr', sparse.csr_matrix),
633+
('sparse_csc', sparse.csc_matrix),
634+
('dataframe', lambda: pytest.importorskip('pandas').DataFrame),
635+
('series', lambda: pytest.importorskip('pandas').Series),
636+
('index', lambda: pytest.importorskip('pandas').Index),
637+
('slice', slice),
638+
]
635639
)
636-
def test_convert_container(constructor_name, container_type):
640+
@pytest.mark.parametrize(
641+
"dtype, superdtype",
642+
[
643+
(np.int32, np.integer),
644+
(np.int64, np.integer),
645+
(np.float32, np.floating),
646+
(np.float64, np.floating),
647+
]
648+
)
649+
def test_convert_container(
650+
constructor_name, container_type, dtype, superdtype,
651+
):
652+
"""Check that we convert the container to the right type of array with the
653+
right data type."""
654+
if constructor_name in ("dataframe", "series", "index"):
655+
# delay the import of pandas within the function to only skip this test
656+
# instead of the whole file
657+
container_type = container_type()
637658
container = [0, 1]
638-
assert isinstance(_convert_container(container, constructor_name),
639-
container_type)
659+
container_converted = _convert_container(
660+
container, constructor_name, dtype=dtype,
661+
)
662+
assert isinstance(container_converted, container_type)
663+
664+
if constructor_name in ("list", "tuple", "index"):
665+
# list and tuple will use Python class dtype: int, float
666+
# pandas index will always use high precision: np.int64 and np.float64
667+
assert np.issubdtype(type(container_converted[0]), superdtype)
668+
elif hasattr(container_converted, "dtype"):
669+
assert container_converted.dtype == dtype
670+
elif hasattr(container_converted, "dtypes"):
671+
assert container_converted.dtypes[0] == dtype
640672

641673

642674
def test_raises():

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.