scikit-learn · adrinjalali · Feb 6, 2021 · Sep 8, 2020 · Sep 8, 2020 · Sep 11, 2020
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -234,10 +234,6 @@ Changelog
  files downloaded or cached to ensure data integrity.
  :pr:`14800` by :user:`Shashank Singh <shashanksingh28>` and `Joel Nothman`_.

- |Feature| :func:`datasets.fetch_openml` now validates md5checksum of arff
-  files downloaded or cached to ensure data integrity.
-  :pr:`14800` by :user:`Shashank Singh <shashanksingh28>` and `Joel Nothman`_.
-
 - |Enhancement| :func:`datasets.fetch_openml` now allows argument `as_frame`
  to be 'auto', which tries to convert returned data to pandas DataFrame
  unless data is sparse.

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -172,6 +172,14 @@ Changelog
  :class:`~sklearn.semi_supervised.LabelPropagation`.
  :pr:`19271` by :user:`Zhaowei Wang <ThuWangzw>`.

+:mod:`sklearn.datasets`
+.......................
+
+- |Enhancement| :func:`datasets.fetch_openml` now supports categories with
+  missing values when returning a pandas DataFrame. :pr:`19365` by
+  `Thomas Fan`_, :user:`Amanda Dsouza <amy12xx>` and
+  :user:`EL-ATEIF Sara <elateifsara>`.
+
 Code and Documentation Contributors
 -----------------------------------


diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py
@@ -23,6 +23,7 @@
 from . import get_data_home
 from urllib.error import HTTPError
 from ..utils import Bunch
+from ..utils import is_scalar_nan
 from ..utils import get_chunk_n_rows
 from ..utils import _chunk_generator
 from ..utils import check_pandas_support  # noqa
@@ -357,7 +358,10 @@ def _convert_arff_data_dataframe(
    for column in columns_to_keep:
        dtype = _feature_to_dtype(features_dict[column])
        if dtype == 'category':
-            dtype = pd.api.types.CategoricalDtype(attributes[column])
+            cats_without_missing = [cat for cat in attributes[column]
+                                    if cat is not None and
+                                    not is_scalar_nan(cat)]
+            dtype = pd.api.types.CategoricalDtype(cats_without_missing)
        df[column] = df[column].astype(dtype, copy=False)
    return (df, )


diff --git a/sklearn/datasets/tests/data/openml/42585/api-v1-json-data-42585.json.gz b/sklearn/datasets/tests/data/openml/42585/api-v1-json-data-42585.json.gz
diff --git a/sklearn/datasets/tests/data/openml/42585/api-v1-json-data-features-42585.json.gz b/sklearn/datasets/tests/data/openml/42585/api-v1-json-data-features-42585.json.gz
diff --git a/sklearn/datasets/tests/data/openml/42585/api-v1-json-data-qualities-42585.json.gz b/sklearn/datasets/tests/data/openml/42585/api-v1-json-data-qualities-42585.json.gz
diff --git a/sklearn/datasets/tests/data/openml/42585/data-v1-download-21854866.arff.gz b/sklearn/datasets/tests/data/openml/42585/data-v1-download-21854866.arff.gz
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
@@ -1311,3 +1311,18 @@ def test_convert_arff_data_type():
    msg = r"arff\['data'\] must be a generator when converting to pd.DataFrame"
    with pytest.raises(ValueError, match=msg):
        _convert_arff_data_dataframe(arff, ['a'], {})
+
+
+def test_missing_values_pandas(monkeypatch):
+    """Check that fetch_openml with as_frame=True yields a pandas categorical
+    column that contains NaN entries without listing NaN as a category."""
+    pytest.importorskip('pandas')
+
+    data_id = 42585
+    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
+    penguins = fetch_openml(data_id=data_id, cache=False, as_frame=True)
+
+    cat_dtype = penguins.data.dtypes['sex']
+    # the column holds NaNs, but NaN is not one of the categories below
+    assert penguins.data['sex'].isna().any()
+    assert_array_equal(cat_dtype.categories, ['FEMALE', 'MALE', '_'])