Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit e403721

Browse files
authored
fix: issue a warning if buggy pyarrow is detected (#787)
Some pyarrow versions can cause issues when loading data from a dataframe. This commit detects whether such a pyarrow version is installed and, if so, warns the user.
1 parent d1cbc38 commit e403721
Copy full SHA for e403721

File tree

Expand file treeCollapse file tree

2 files changed

+52
-0
lines changed
Filter options
Expand file treeCollapse file tree

2 files changed

+52
-0
lines changed

‎google/cloud/bigquery/client.py

Copy file name to clipboardExpand all lines: google/cloud/bigquery/client.py
+15Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,16 @@
2727
import json
2828
import math
2929
import os
30+
import packaging.version
3031
import tempfile
3132
from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union
3233
import uuid
3334
import warnings
3435

3536
try:
3637
import pyarrow
38+
39+
_PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)
3740
except ImportError: # pragma: NO COVER
3841
pyarrow = None
3942

@@ -118,6 +121,9 @@
118121
# https://github.com/googleapis/python-bigquery/issues/438
119122
_MIN_GET_QUERY_RESULTS_TIMEOUT = 120
120123

124+
# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
125+
_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
126+
121127

122128
class Project(object):
123129
"""Wrapper for resource describing a BigQuery project.
@@ -2609,6 +2615,15 @@ def load_table_from_dataframe(
26092615
try:
26102616

26112617
if job_config.source_format == job.SourceFormat.PARQUET:
2618+
if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:
2619+
msg = (
2620+
"Loading dataframe data in PARQUET format with pyarrow "
2621+
f"{_PYARROW_VERSION} can result in data corruption. It is "
2622+
"therefore *strongly* advised to use a different pyarrow "
2623+
"version or a different source format. "
2624+
"See: https://github.com/googleapis/python-bigquery/issues/781"
2625+
)
2626+
warnings.warn(msg, category=RuntimeWarning)
26122627

26132628
if job_config.schema:
26142629
if parquet_compression == "snappy": # adjust the default value

‎tests/unit/test_client.py

Copy file name to clipboardExpand all lines: tests/unit/test_client.py
+37Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import warnings
2828

2929
import mock
30+
import packaging
3031
import requests
3132
import pytest
3233
import pytz
@@ -7510,6 +7511,42 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self):
75107511
parquet_compression="gzip",
75117512
)
75127513

7514+
def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self):
7515+
pytest.importorskip("pandas", reason="Requires `pandas`")
7516+
pytest.importorskip("pyarrow", reason="Requires `pyarrow`")
7517+
7518+
client = self._make_client()
7519+
records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
7520+
dataframe = pandas.DataFrame(records)
7521+
7522+
pyarrow_version_patch = mock.patch(
7523+
"google.cloud.bigquery.client._PYARROW_VERSION",
7524+
packaging.version.parse("2.0.0"), # A known bad version of pyarrow.
7525+
)
7526+
get_table_patch = mock.patch(
7527+
"google.cloud.bigquery.client.Client.get_table",
7528+
autospec=True,
7529+
side_effect=google.api_core.exceptions.NotFound("Table not found"),
7530+
)
7531+
load_patch = mock.patch(
7532+
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
7533+
)
7534+
7535+
with load_patch, get_table_patch, pyarrow_version_patch:
7536+
with warnings.catch_warnings(record=True) as warned:
7537+
client.load_table_from_dataframe(
7538+
dataframe, self.TABLE_REF, location=self.LOCATION,
7539+
)
7540+
7541+
expected_warnings = [
7542+
warning for warning in warned if "pyarrow" in str(warning).lower()
7543+
]
7544+
assert len(expected_warnings) == 1
7545+
assert issubclass(expected_warnings[0].category, RuntimeWarning)
7546+
msg = str(expected_warnings[0].message)
7547+
assert "pyarrow 2.0.0" in msg
7548+
assert "data corruption" in msg
7549+
75137550
@unittest.skipIf(pandas is None, "Requires `pandas`")
75147551
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
75157552
def test_load_table_from_dataframe_w_nulls(self):

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.