Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit d92fba7

Browse files
authored
FIX add retry mechanism to handle quotechar in read_csv (#25511)
1 parent eae3f29 commit d92fba7
Copy full SHA for d92fba7

File tree

9 files changed

+116
-30
lines changed
Filter options

9 files changed

+116
-30
lines changed

‎doc/whats_new/v1.2.rst

Copy file name to clipboardExpand all lines: doc/whats_new/v1.2.rst
+5-1Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,11 +146,15 @@ Changelog
146146
:mod:`sklearn.datasets`
147147
.......................
148148

149-
- |Fix| Fix an inconsistency in :func:`datasets.fetch_openml` between liac-arff
149+
- |Fix| Fixes an inconsistency in :func:`datasets.fetch_openml` between liac-arff
150150
and pandas parser when a leading space is introduced after the delimiter.
151151
The ARFF specs require the leading space to be ignored.
152152
:pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.
153153

154+
- |Fix| Fixes a bug in :func:`datasets.fetch_openml` when using `parser="pandas"`
155+
where single quote and backslash escape characters were not properly handled.
156+
:pr:`25511` by :user:`Guillaume Lemaitre <glemaitre>`.
157+
154158
:mod:`sklearn.decomposition`
155159
............................
156160

‎sklearn/datasets/_arff_parser.py

Copy file name to clipboardExpand all lines: sklearn/datasets/_arff_parser.py
+41-11Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ def _pandas_arff_parser(
302302
openml_columns_info,
303303
feature_names_to_select,
304304
target_names_to_select,
305+
read_csv_kwargs=None,
305306
):
306307
"""ARFF parser using `pandas.read_csv`.
307308
@@ -331,6 +332,10 @@ def _pandas_arff_parser(
331332
target_names_to_select : list of str
332333
A list of the target names to be selected to build `y`.
333334
335+
read_csv_kwargs : dict, default=None
336+
Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
337+
the default options.
338+
334339
Returns
335340
-------
336341
X : {ndarray, sparse matrix, dataframe}
@@ -363,18 +368,37 @@ def _pandas_arff_parser(
363368
dtypes[name] = "Int64"
364369
elif column_dtype.lower() == "nominal":
365370
dtypes[name] = "category"
371+
# since we will not pass `names` when reading the ARFF file, we need to translate
372+
# `dtypes` from column names to column indices to pass to `pandas.read_csv`
373+
dtypes_positional = {
374+
col_idx: dtypes[name]
375+
for col_idx, name in enumerate(openml_columns_info)
376+
if name in dtypes
377+
}
366378

367-
# ARFF represents missing values with "?"
368-
frame = pd.read_csv(
369-
gzip_file,
370-
header=None,
371-
na_values=["?"], # missing values are represented by `?`
372-
comment="%", # skip line starting by `%` since they are comments
373-
quotechar='"', # delimiter to use for quoted strings
374-
names=[name for name in openml_columns_info],
375-
dtype=dtypes,
376-
skipinitialspace=True, # skip spaces after delimiter to follow ARFF specs
377-
)
379+
default_read_csv_kwargs = {
380+
"header": None,
381+
"index_col": False, # always force pandas to not use the first column as index
382+
"na_values": ["?"], # missing values are represented by `?`
383+
"comment": "%", # skip line starting by `%` since they are comments
384+
"quotechar": '"', # delimiter to use for quoted strings
385+
"skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs
386+
"escapechar": "\\",
387+
"dtype": dtypes_positional,
388+
}
389+
read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
390+
frame = pd.read_csv(gzip_file, **read_csv_kwargs)
391+
try:
392+
# Setting the columns while reading the file will select the N first columns
393+
# and not raise a ParserError. Instead, we set the columns after reading the
394+
# file and raise a ParserError if the number of columns does not match the
395+
# number of columns in the metadata given by OpenML.
396+
frame.columns = [name for name in openml_columns_info]
397+
except ValueError as exc:
398+
raise pd.errors.ParserError(
399+
"The number of columns provided by OpenML does not match the number of "
400+
"columns inferred by pandas when reading the file."
401+
) from exc
378402

379403
columns_to_select = feature_names_to_select + target_names_to_select
380404
columns_to_keep = [col for col in frame.columns if col in columns_to_select]
@@ -431,6 +455,7 @@ def load_arff_from_gzip_file(
431455
feature_names_to_select,
432456
target_names_to_select,
433457
shape=None,
458+
read_csv_kwargs=None,
434459
):
435460
"""Load a compressed ARFF file using a given parser.
436461
@@ -461,6 +486,10 @@ def load_arff_from_gzip_file(
461486
target_names_to_select : list of str
462487
A list of the target names to be selected.
463488
489+
read_csv_kwargs : dict, default=None
490+
Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
491+
the default options.
492+
464493
Returns
465494
-------
466495
X : {ndarray, sparse matrix, dataframe}
@@ -493,6 +522,7 @@ def load_arff_from_gzip_file(
493522
openml_columns_info,
494523
feature_names_to_select,
495524
target_names_to_select,
525+
read_csv_kwargs,
496526
)
497527
else:
498528
raise ValueError(

‎sklearn/datasets/_openml.py

Copy file name to clipboardExpand all lines: sklearn/datasets/_openml.py
+54-18Lines changed: 54 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,15 @@ def _get_local_path(openml_path: str, data_home: str) -> str:
3737
return os.path.join(data_home, "openml.org", openml_path + ".gz")
3838

3939

40-
def _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable:
40+
def _retry_with_clean_cache(
41+
openml_path: str,
42+
data_home: Optional[str],
43+
no_retry_exception: Optional[Exception] = None,
44+
) -> Callable:
4145
"""If the first call to the decorated function fails, the local cached
4246
file is removed, and the function is called again. If ``data_home`` is
43-
``None``, then the function is called once.
47+
``None``, then the function is called once. We can provide a specific
48+
exception to not retry on using the `no_retry_exception` parameter.
4449
"""
4550

4651
def decorator(f):
@@ -52,7 +57,11 @@ def wrapper(*args, **kw):
5257
return f(*args, **kw)
5358
except URLError:
5459
raise
55-
except Exception:
60+
except Exception as exc:
61+
if no_retry_exception is not None and isinstance(
62+
exc, no_retry_exception
63+
):
64+
raise
5665
warn("Invalid cache, redownloading file", RuntimeWarning)
5766
local_path = _get_local_path(openml_path, data_home)
5867
if os.path.exists(local_path):
@@ -216,7 +225,7 @@ def _get_json_content_from_openml_api(
216225
An exception otherwise.
217226
"""
218227

219-
@_retry_with_clean_cache(url, data_home)
228+
@_retry_with_clean_cache(url, data_home=data_home)
220229
def _load_json():
221230
with closing(
222231
_open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
@@ -492,20 +501,39 @@ def _load_arff_response(
492501
"and retry..."
493502
)
494503

495-
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
496-
with closing(gzip_file):
504+
def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
505+
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
506+
with closing(gzip_file):
507+
return load_arff_from_gzip_file(gzip_file, **arff_params)
497508

498-
X, y, frame, categories = load_arff_from_gzip_file(
499-
gzip_file,
500-
parser=parser,
501-
output_type=output_type,
502-
openml_columns_info=openml_columns_info,
503-
feature_names_to_select=feature_names_to_select,
504-
target_names_to_select=target_names_to_select,
505-
shape=shape,
509+
arff_params = dict(
510+
parser=parser,
511+
output_type=output_type,
512+
openml_columns_info=openml_columns_info,
513+
feature_names_to_select=feature_names_to_select,
514+
target_names_to_select=target_names_to_select,
515+
shape=shape,
516+
)
517+
try:
518+
X, y, frame, categories = _open_url_and_load_gzip_file(
519+
url, data_home, n_retries, delay, arff_params
506520
)
521+
except Exception as exc:
522+
if parser == "pandas":
523+
from pandas.errors import ParserError
524+
525+
if isinstance(exc, ParserError):
526+
# A parsing error could come from providing the wrong quotechar
527+
# to pandas. By default, we use a double quote. Thus, we retry
528+
# with a single quote before raising the error.
529+
arff_params["read_csv_kwargs"] = {"quotechar": "'"}
530+
X, y, frame, categories = _open_url_and_load_gzip_file(
531+
url, data_home, n_retries, delay, arff_params
532+
)
533+
else:
534+
raise
507535

508-
return X, y, frame, categories
536+
return X, y, frame, categories
509537

510538

511539
def _download_data_to_bunch(
@@ -605,9 +633,17 @@ def _download_data_to_bunch(
605633
"values. Missing values are not supported for target columns."
606634
)
607635

608-
X, y, frame, categories = _retry_with_clean_cache(url, data_home)(
609-
_load_arff_response
610-
)(
636+
no_retry_exception = None
637+
if parser == "pandas":
638+
# If we get a ParserError with pandas, then we don't want to retry and we raise
639+
# early.
640+
from pandas.errors import ParserError
641+
642+
no_retry_exception = ParserError
643+
644+
X, y, frame, categories = _retry_with_clean_cache(
645+
url, data_home, no_retry_exception
646+
)(_load_arff_response)(
611647
url,
612648
data_home,
613649
parser=parser,

‎sklearn/datasets/tests/data/openml/id_42074/__init__.py

Copy file name to clipboardExpand all lines: sklearn/datasets/tests/data/openml/id_42074/__init__.py
Whitespace-only changes.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

‎sklearn/datasets/tests/test_openml.py

Copy file name to clipboardExpand all lines: sklearn/datasets/tests/test_openml.py
+16Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1617,6 +1617,22 @@ def test_fetch_openml_leading_whitespace(monkeypatch):
16171617
)
16181618

16191619

1620+
def test_fetch_openml_quotechar_escapechar(monkeypatch):
1621+
"""Check that we can handle escapechar and single/double quotechar.
1622+
1623+
Non-regression test for:
1624+
https://github.com/scikit-learn/scikit-learn/issues/25478
1625+
"""
1626+
pd = pytest.importorskip("pandas")
1627+
data_id = 42074
1628+
_monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)
1629+
1630+
common_params = {"as_frame": True, "cache": False, "data_id": data_id}
1631+
adult_pandas = fetch_openml(parser="pandas", **common_params)
1632+
adult_liac_arff = fetch_openml(parser="liac-arff", **common_params)
1633+
pd.testing.assert_frame_equal(adult_pandas.frame, adult_liac_arff.frame)
1634+
1635+
16201636
###############################################################################
16211637
# Deprecation-changed parameters
16221638

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.