scikit-learn
diff --git a/‎doc/whats_new/v0.24.rst
Copy file name to clipboardExpand all lines: doc/whats_new/v0.24.rst
+13-8Lines changed: 13 additions & 8 deletions b/‎doc/whats_new/v0.24.rst
Copy file name to clipboardExpand all lines: doc/whats_new/v0.24.rst
+13-8Lines changed: 13 additions & 8 deletions
diff --git a/‎sklearn/preprocessing/_encoders.py
Copy file name to clipboardExpand all lines: sklearn/preprocessing/_encoders.py
+1-1Lines changed: 1 addition & 1 deletion b/‎sklearn/preprocessing/_encoders.py
Copy file name to clipboardExpand all lines: sklearn/preprocessing/_encoders.py
+1-1Lines changed: 1 addition & 1 deletion
diff --git a/‎sklearn/preprocessing/tests/test_encoders.py
Copy file name to clipboardExpand all lines: sklearn/preprocessing/tests/test_encoders.py
+11-6Lines changed: 11 additions & 6 deletions b/‎sklearn/preprocessing/tests/test_encoders.py
Copy file name to clipboardExpand all lines: sklearn/preprocessing/tests/test_encoders.py
+11-6Lines changed: 11 additions & 6 deletions
diff --git a/‎sklearn/utils/_encode.py
Copy file name to clipboardExpand all lines: sklearn/utils/_encode.py
+2-2Lines changed: 2 additions & 2 deletions b/‎sklearn/utils/_encode.py
Copy file name to clipboardExpand all lines: sklearn/utils/_encode.py
+2-2Lines changed: 2 additions & 2 deletions
diff --git a/‎sklearn/utils/_testing.py
Copy file name to clipboardExpand all lines: sklearn/utils/_testing.py
+38-10Lines changed: 38 additions & 10 deletions b/‎sklearn/utils/_testing.py
Copy file name to clipboardExpand all lines: sklearn/utils/_testing.py
+38-10Lines changed: 38 additions & 10 deletions
diff --git a/‎sklearn/utils/tests/test_testing.py
Copy file name to clipboardExpand all lines: sklearn/utils/tests/test_testing.py
+43-11Lines changed: 43 additions & 11 deletions b/‎sklearn/utils/tests/test_testing.py
Copy file name to clipboardExpand all lines: sklearn/utils/tests/test_testing.py
+43-11Lines changed: 43 additions & 11 deletions
@@ -62,6 +62,13 @@ Changelog
   :class:`model_selection.HalvingGridSearchCV` were not properly converted to
   numpy arrays. :pr:`19211` by `Nicolas Hug`_.
 
+:mod:`sklearn.multioutput`
+..........................
+
+- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
+  that dynamically define `predict` during fitting, such as
+  :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
+
 :mod:`sklearn.preprocessing`
 ............................
 
@@ -70,19 +77,17 @@ Changelog
   `'use_encoded_value'` strategies.
   :pr:`19234` by `Guillaume Lemaitre <glemaitre>`.
 
+- |Fix| Fix support for encoder categories having dtype='S' in
+  :class:`preprocessing.OneHotEncoder` and
+  :class:`preprocessing.OrdinalEncoder`.
+  :pr:`19727` by :user:`Andrew Delong <andrewdelong>`.
+
 - |Fix| :meth:`preprocessing.OrdinalEncoder.transform` correctly handles
   unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_.
 
 - |Fix| :meth:`preprocessing.OneHotEncoder.fit` no longer alters the `drop`
   parameter. :pr:`19924` by `Thomas Fan`_.
 
-:mod:`sklearn.multioutput`
-..........................
-
-- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
-  that dynamically define `predict` during fitting, such as
-  :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
-
 :mod:`sklearn.semi_supervised`
 ..............................
 
@@ -91,7 +96,7 @@ Changelog
   :pr:`19271` by :user:`Zhaowei Wang <ThuWangzw>`.
 
 :mod:`sklearn.tree`
-.......................
+...................
 
 - |Fix| Fix a bug in `fit` of :class:`tree.BaseDecisionTree` that caused
   segmentation faults under certain conditions. `fit` now deep copies the
 
@@ -90,7 +90,7 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
                 cats = _unique(Xi)
             else:
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
-                if Xi.dtype.kind not in 'OU':
+                if Xi.dtype.kind not in 'OUS':
                     sorted_cats = np.sort(cats)
                     error_msg = ("Unsorted categories are not "
                                  "supported for numerical categories")
 
@@ -710,7 +710,8 @@ def test_encoder_dtypes():
 
     for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
               np.array([[1, 2], [3, 4]], dtype='float64'),
-              np.array([['a', 'b'], ['c', 'd']]),  # string dtype
+              np.array([['a', 'b'], ['c', 'd']]),      # unicode dtype
+              np.array([[b'a', b'b'], [b'c', b'd']]),  # string dtype
               np.array([[1, 'a'], [3, 'b']], dtype='object')]:
         enc.fit(X)
         assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
@@ -847,21 +848,25 @@ def test_encoders_has_categorical_tags(Encoder):
     assert 'categorical' in Encoder()._get_tags()['X_types']
 
 
-@pytest.mark.parametrize('input_dtype', ['O', 'U'])
-@pytest.mark.parametrize('category_dtype', ['O', 'U'])
+# deliberately omit 'OS' as an invalid combo
+@pytest.mark.parametrize('input_dtype, category_dtype', ['OO', 'OU',
+                                                         'UO', 'UU', 'US',
+                                                         'SO', 'SU', 'SS'])
 @pytest.mark.parametrize('array_type', ['list', 'array', 'dataframe'])
-def test_encoders_unicode_categories(input_dtype, category_dtype, array_type):
-    """Check that encoding work with string and object dtypes.
+def test_encoders_string_categories(input_dtype, category_dtype, array_type):
+    """Check that encoding works with object, unicode, and byte string dtypes.
     Non-regression test for:
     https://github.com/scikit-learn/scikit-learn/issues/15616
     https://github.com/scikit-learn/scikit-learn/issues/15726
+    https://github.com/scikit-learn/scikit-learn/issues/19677
     """
 
     X = np.array([['b'], ['a']], dtype=input_dtype)
     categories = [np.array(['b', 'a'], dtype=category_dtype)]
     ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)
 
-    X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type)
+    X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type,
+                                dtype=input_dtype)
     X_trans = ohe.transform(X_test)
 
     expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
 
@@ -173,7 +173,7 @@ def _encode(values, *, uniques, check_unknown=True):
     encoded : ndarray
         Encoded values
     """
-    if values.dtype.kind in 'OU':
+    if values.dtype.kind in 'OUS':
         try:
             return _map_to_integer(values, uniques)
         except KeyError as e:
@@ -214,7 +214,7 @@ def _check_unknown(values, known_values, return_mask=False):
     """
     valid_mask = None
 
-    if values.dtype.kind in 'UO':
+    if values.dtype.kind in 'OUS':
         values_set = set(values)
         values_set, missing_in_values = _extract_missing(values_set)
 
 
@@ -758,30 +758,58 @@ def assert_run_python_script(source_code, timeout=60):
         os.unlink(source_file)
 
 
-def _convert_container(container, constructor_name, columns_name=None):
+def _convert_container(
+    container, constructor_name, columns_name=None, dtype=None
+):
+    """Convert a given container to a specific array-like with a dtype.
+
+    Parameters
+    ----------
+    container : array-like
+        The container to convert.
+    constructor_name : {"list", "tuple", "array", "sparse", "dataframe", \
+            "series", "index", "slice", "sparse_csr", "sparse_csc"}
+        The type of the returned container.
+    columns_name : index or array-like, default=None
+        Column names assigned when building a `"dataframe"` container. It is
+        ignored by every other container type.
+    dtype : dtype, default=None
+        Force the dtype of the container. Does not apply to `"slice"`
+        container.
+
+    Returns
+    -------
+    converted_container
+    """
     if constructor_name == 'list':
-        return list(container)
+        if dtype is None:
+            return list(container)
+        else:
+            return np.asarray(container, dtype=dtype).tolist()
     elif constructor_name == 'tuple':
-        return tuple(container)
+        if dtype is None:
+            return tuple(container)
+        else:
+            return tuple(np.asarray(container, dtype=dtype).tolist())
     elif constructor_name == 'array':
-        return np.asarray(container)
+        return np.asarray(container, dtype=dtype)
     elif constructor_name == 'sparse':
-        return sp.sparse.csr_matrix(container)
+        return sp.sparse.csr_matrix(container, dtype=dtype)
     elif constructor_name == 'dataframe':
         pd = pytest.importorskip('pandas')
-        return pd.DataFrame(container, columns=columns_name)
+        return pd.DataFrame(container, columns=columns_name, dtype=dtype)
     elif constructor_name == 'series':
         pd = pytest.importorskip('pandas')
-        return pd.Series(container)
+        return pd.Series(container, dtype=dtype)
     elif constructor_name == 'index':
         pd = pytest.importorskip('pandas')
-        return pd.Index(container)
+        return pd.Index(container, dtype=dtype)
     elif constructor_name == 'slice':
         return slice(container[0], container[1])
     elif constructor_name == 'sparse_csr':
-        return sp.sparse.csr_matrix(container)
+        return sp.sparse.csr_matrix(container, dtype=dtype)
     elif constructor_name == 'sparse_csc':
-        return sp.sparse.csc_matrix(container)
+        return sp.sparse.csc_matrix(container, dtype=dtype)
 
 
 def raises(expected_exc_type, match=None, may_pass=False, err_msg=None):
 
@@ -624,19 +624,51 @@ def test_create_memmap_backed_data(monkeypatch):
 
 @pytest.mark.parametrize(
     "constructor_name, container_type",
-    [('list', list),
-     ('tuple', tuple),
-     ('array', np.ndarray),
-     ('sparse', sparse.csr_matrix),
-     ('dataframe', pytest.importorskip('pandas').DataFrame),
-     ('series', pytest.importorskip('pandas').Series),
-     ('index', pytest.importorskip('pandas').Index),
-     ('slice', slice)]
+    [
+        ('list', list),
+        ('tuple', tuple),
+        ('array', np.ndarray),
+        ('sparse', sparse.csr_matrix),
+        ('sparse_csr', sparse.csr_matrix),
+        ('sparse_csc', sparse.csc_matrix),
+        ('dataframe', lambda: pytest.importorskip('pandas').DataFrame),
+        ('series', lambda: pytest.importorskip('pandas').Series),
+        ('index', lambda: pytest.importorskip('pandas').Index),
+        ('slice', slice),
+    ]
 )
-def test_convert_container(constructor_name, container_type):
+@pytest.mark.parametrize(
+    "dtype, superdtype",
+    [
+        (np.int32, np.integer),
+        (np.int64, np.integer),
+        (np.float32, np.floating),
+        (np.float64, np.floating),
+    ]
+)
+def test_convert_container(
+    constructor_name, container_type, dtype, superdtype,
+):
+    """Check that we convert the container to the right type of array with the
+    right data type."""
+    if constructor_name in ("dataframe", "series", "index"):
+        # delay the import of pandas within the function to only skip this test
+        # instead of the whole file
+        container_type = container_type()
     container = [0, 1]
-    assert isinstance(_convert_container(container, constructor_name),
-                      container_type)
+    container_converted = _convert_container(
+        container, constructor_name, dtype=dtype,
+    )
+    assert isinstance(container_converted, container_type)
+
+    if constructor_name in ("list", "tuple", "index"):
+        # list and tuple will use Python class dtype: int, float
+        # pandas index will always use high precision: np.int64 and np.float64
+        assert np.issubdtype(type(container_converted[0]), superdtype)
+    elif hasattr(container_converted, "dtype"):
+        assert container_converted.dtype == dtype
+    elif hasattr(container_converted, "dtypes"):
+        assert container_converted.dtypes[0] == dtype
 
 
 def test_raises():