Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit b2d6e0a

Browse filesBrowse files
tswastgcf-owl-bot[bot]chalmerlowe
committed
deps: use pandas-gbq to determine schema in load_table_from_dataframe (#2095)
* feat: use pandas-gbq to determine schema in `load_table_from_dataframe` * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix some unit tests * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * bump minimum pandas-gbq to 0.26.1 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * drop pandas-gbq from python 3.7 extras * relax warning message text assertion * use consistent time zone presence/absence in time datetime system test * Update google/cloud/bigquery/_pandas_helpers.py * Update google/cloud/bigquery/_pandas_helpers.py Co-authored-by: Chalmer Lowe <chalmerlowe@google.com> * remove pandas-gbq from at least 1 unit test and system test session --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Chalmer Lowe <chalmerlowe@google.com>
1 parent cddded1 commit b2d6e0a
Copy full SHA for b2d6e0a

File tree

Expand file treeCollapse file tree

7 files changed

+138
-22
lines changed
Filter options
Expand file treeCollapse file tree

7 files changed

+138
-22
lines changed

‎google/cloud/bigquery/_pandas_helpers.py

Copy file name to clipboardExpand all lines: google/cloud/bigquery/_pandas_helpers.py
+34-1Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""Shared helper functions for connecting BigQuery and pandas."""
15+
"""Shared helper functions for connecting BigQuery and pandas.
16+
17+
NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
18+
instead. See: go/pandas-gbq-and-bigframes-redundancy and
19+
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pandas_to_bigquery.py
20+
"""
1621

1722
import concurrent.futures
1823
from datetime import datetime
@@ -40,6 +45,16 @@
4045
else:
4146
import numpy
4247

48+
49+
try:
50+
import pandas_gbq.schema.pandas_to_bigquery # type: ignore
51+
52+
pandas_gbq_import_exception = None
53+
except ImportError as exc:
54+
pandas_gbq = None
55+
pandas_gbq_import_exception = exc
56+
57+
4358
try:
4459
import db_dtypes # type: ignore
4560

@@ -450,6 +465,10 @@ def _first_array_valid(series):
450465
def dataframe_to_bq_schema(dataframe, bq_schema):
451466
"""Convert a pandas DataFrame schema to a BigQuery schema.
452467
468+
DEPRECATED: Use
469+
pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(),
470+
instead. See: go/pandas-gbq-and-bigframes-redundancy.
471+
453472
Args:
454473
dataframe (pandas.DataFrame):
455474
DataFrame for which the client determines the BigQuery schema.
@@ -465,6 +484,20 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
465484
The automatically determined schema. Returns None if the type of
466485
any column cannot be determined.
467486
"""
487+
if pandas_gbq is None:
488+
warnings.warn(
489+
"Loading pandas DataFrame into BigQuery will require pandas-gbq "
490+
"package version 0.26.1 or greater in the future. "
491+
f"Tried to import pandas-gbq and got: {pandas_gbq_import_exception}",
492+
category=FutureWarning,
493+
)
494+
else:
495+
return pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(
496+
dataframe,
497+
override_bigquery_fields=bq_schema,
498+
index=True,
499+
)
500+
468501
if bq_schema:
469502
bq_schema = schema._to_schema_fields(bq_schema)
470503
bq_schema_index = {field.name: field for field in bq_schema}

‎google/cloud/bigquery/_pyarrow_helpers.py

Copy file name to clipboardExpand all lines: google/cloud/bigquery/_pyarrow_helpers.py
+6-1Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""Shared helper functions for connecting BigQuery and pyarrow."""
15+
"""Shared helper functions for connecting BigQuery and pyarrow.
16+
17+
NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
18+
instead. See: go/pandas-gbq-and-bigframes-redundancy and
19+
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
20+
"""
1621

1722
from typing import Any
1823

‎noxfile.py

Copy file name to clipboardExpand all lines: noxfile.py
+15Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,14 @@ def default(session, install_extras=True):
110110
else:
111111
install_target = "."
112112
session.install("-e", install_target, "-c", constraints_path)
113+
114+
# Test with some broken "extras" in case the user didn't install the extra
115+
# directly. For example, pandas-gbq is recommended for pandas features, but
116+
# we want to test that we fall back to the previous behavior. For context,
117+
# see internal document go/pandas-gbq-and-bigframes-redundancy.
118+
if session.python == UNIT_TEST_PYTHON_VERSIONS[0]:
119+
session.run("python", "-m", "pip", "uninstall", "pandas-gbq", "-y")
120+
113121
session.run("python", "-m", "pip", "freeze")
114122

115123
# Run py.test against the unit tests.
@@ -228,6 +236,13 @@ def system(session):
228236
extras = "[all]"
229237
session.install("-e", f".{extras}", "-c", constraints_path)
230238

239+
# Test with some broken "extras" in case the user didn't install the extra
240+
# directly. For example, pandas-gbq is recommended for pandas features, but
241+
# we want to test that we fall back to the previous behavior. For context,
242+
# see internal document go/pandas-gbq-and-bigframes-redundancy.
243+
if session.python == SYSTEM_TEST_PYTHON_VERSIONS[0]:
244+
session.run("python", "-m", "pip", "uninstall", "pandas-gbq", "-y")
245+
231246
# print versions of all dependencies
232247
session.run("python", "-m", "pip", "freeze")
233248

‎pyproject.toml

Copy file name to clipboardExpand all lines: pyproject.toml
+3Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ bqstorage = [
7272
]
7373
pandas = [
7474
"pandas >= 1.1.0",
75+
"pandas-gbq >= 0.26.1; python_version >= '3.8'",
76+
"grpcio >= 1.47.0, < 2.0dev",
77+
"grpcio >= 1.49.1, < 2.0dev; python_version >= '3.11'",
7578
"pyarrow >= 3.0.0",
7679
"db-dtypes >= 0.3.0, < 2.0.0dev",
7780
]

‎tests/system/test_pandas.py

Copy file name to clipboardExpand all lines: tests/system/test_pandas.py
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1259,7 +1259,7 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id):
12591259
df = pandas.DataFrame(
12601260
dict(
12611261
dt=[
1262-
datetime.datetime(2020, 1, 8, 8, 0, 0),
1262+
datetime.datetime(2020, 1, 8, 8, 0, 0, tzinfo=datetime.timezone.utc),
12631263
datetime.datetime(
12641264
2020,
12651265
1,

‎tests/unit/test__pandas_helpers.py

Copy file name to clipboardExpand all lines: tests/unit/test__pandas_helpers.py
+54-11Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@
3232
except ImportError:
3333
pandas = None
3434

35+
try:
36+
import pandas_gbq.schema.pandas_to_bigquery
37+
except ImportError:
38+
pandas_gbq = None
39+
3540
try:
3641
import geopandas
3742
except ImportError:
@@ -1241,7 +1246,21 @@ def test_dataframe_to_parquet_compression_method(module_under_test):
12411246

12421247

12431248
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1244-
def test_dataframe_to_bq_schema_w_named_index(module_under_test):
1249+
@pytest.mark.skipif(pandas_gbq is None, reason="Requires `pandas-gbq`")
1250+
def test_dataframe_to_bq_schema_returns_schema_with_pandas_gbq(
1251+
module_under_test, monkeypatch
1252+
):
1253+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1254+
dataframe = pandas.DataFrame({"field00": ["foo", "bar"]})
1255+
got = module_under_test.dataframe_to_bq_schema(dataframe, [])
1256+
# Don't assert beyond this, since pandas-gbq is now the source of truth.
1257+
assert got is not None
1258+
1259+
1260+
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1261+
def test_dataframe_to_bq_schema_w_named_index(module_under_test, monkeypatch):
1262+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1263+
12451264
df_data = collections.OrderedDict(
12461265
[
12471266
("str_column", ["hello", "world"]),
@@ -1252,7 +1271,8 @@ def test_dataframe_to_bq_schema_w_named_index(module_under_test):
12521271
index = pandas.Index(["a", "b"], name="str_index")
12531272
dataframe = pandas.DataFrame(df_data, index=index)
12541273

1255-
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
1274+
with pytest.warns(FutureWarning, match="pandas-gbq"):
1275+
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
12561276

12571277
expected_schema = (
12581278
schema.SchemaField("str_index", "STRING", "NULLABLE"),
@@ -1264,7 +1284,9 @@ def test_dataframe_to_bq_schema_w_named_index(module_under_test):
12641284

12651285

12661286
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1267-
def test_dataframe_to_bq_schema_w_multiindex(module_under_test):
1287+
def test_dataframe_to_bq_schema_w_multiindex(module_under_test, monkeypatch):
1288+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1289+
12681290
df_data = collections.OrderedDict(
12691291
[
12701292
("str_column", ["hello", "world"]),
@@ -1281,7 +1303,8 @@ def test_dataframe_to_bq_schema_w_multiindex(module_under_test):
12811303
)
12821304
dataframe = pandas.DataFrame(df_data, index=index)
12831305

1284-
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
1306+
with pytest.warns(FutureWarning, match="pandas-gbq"):
1307+
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
12851308

12861309
expected_schema = (
12871310
schema.SchemaField("str_index", "STRING", "NULLABLE"),
@@ -1295,7 +1318,9 @@ def test_dataframe_to_bq_schema_w_multiindex(module_under_test):
12951318

12961319

12971320
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1298-
def test_dataframe_to_bq_schema_w_bq_schema(module_under_test):
1321+
def test_dataframe_to_bq_schema_w_bq_schema(module_under_test, monkeypatch):
1322+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1323+
12991324
df_data = collections.OrderedDict(
13001325
[
13011326
("str_column", ["hello", "world"]),
@@ -1310,7 +1335,10 @@ def test_dataframe_to_bq_schema_w_bq_schema(module_under_test):
13101335
{"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"},
13111336
]
13121337

1313-
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, dict_schema)
1338+
with pytest.warns(FutureWarning, match="pandas-gbq"):
1339+
returned_schema = module_under_test.dataframe_to_bq_schema(
1340+
dataframe, dict_schema
1341+
)
13141342

13151343
expected_schema = (
13161344
schema.SchemaField("str_column", "STRING", "NULLABLE"),
@@ -1321,7 +1349,11 @@ def test_dataframe_to_bq_schema_w_bq_schema(module_under_test):
13211349

13221350

13231351
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1324-
def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test):
1352+
def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(
1353+
module_under_test, monkeypatch
1354+
):
1355+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1356+
13251357
dataframe = pandas.DataFrame(
13261358
data=[
13271359
{"id": 10, "status": "FOO", "execution_date": datetime.date(2019, 5, 10)},
@@ -1349,7 +1381,11 @@ def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test):
13491381

13501382
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
13511383
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
1352-
def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test):
1384+
def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(
1385+
module_under_test, monkeypatch
1386+
):
1387+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1388+
13531389
dataframe = pandas.DataFrame(
13541390
data=[
13551391
{"id": 10, "status": "FOO", "created_at": datetime.date(2019, 5, 10)},
@@ -1379,7 +1415,9 @@ def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test):
13791415

13801416
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
13811417
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
1382-
def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test):
1418+
def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test, monkeypatch):
1419+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1420+
13831421
dataframe = pandas.DataFrame(
13841422
data=[
13851423
{"struct_field": {"one": 2}, "status": "FOO"},
@@ -1403,9 +1441,11 @@ def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test):
14031441

14041442

14051443
@pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`")
1406-
def test_dataframe_to_bq_schema_geography(module_under_test):
1444+
def test_dataframe_to_bq_schema_geography(module_under_test, monkeypatch):
14071445
from shapely import wkt
14081446

1447+
monkeypatch.setattr(module_under_test, "pandas_gbq", None)
1448+
14091449
df = geopandas.GeoDataFrame(
14101450
pandas.DataFrame(
14111451
dict(
@@ -1416,7 +1456,10 @@ def test_dataframe_to_bq_schema_geography(module_under_test):
14161456
),
14171457
geometry="geo1",
14181458
)
1419-
bq_schema = module_under_test.dataframe_to_bq_schema(df, [])
1459+
1460+
with pytest.warns(FutureWarning, match="pandas-gbq"):
1461+
bq_schema = module_under_test.dataframe_to_bq_schema(df, [])
1462+
14201463
assert bq_schema == (
14211464
schema.SchemaField("name", "STRING"),
14221465
schema.SchemaField("geo1", "GEOGRAPHY"),

‎tests/unit/test_client.py

Copy file name to clipboardExpand all lines: tests/unit/test_client.py
+25-8Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8391,8 +8391,12 @@ def test_load_table_from_dataframe_w_automatic_schema_detection_fails(self):
83918391
autospec=True,
83928392
side_effect=google.api_core.exceptions.NotFound("Table not found"),
83938393
)
8394+
pandas_gbq_patch = mock.patch(
8395+
"google.cloud.bigquery._pandas_helpers.pandas_gbq",
8396+
new=None,
8397+
)
83948398

8395-
with load_patch as load_table_from_file, get_table_patch:
8399+
with load_patch as load_table_from_file, get_table_patch, pandas_gbq_patch:
83968400
with warnings.catch_warnings(record=True) as warned:
83978401
client.load_table_from_dataframe(
83988402
dataframe, self.TABLE_REF, location=self.LOCATION
@@ -8448,7 +8452,6 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self):
84488452
load_patch = mock.patch(
84498453
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
84508454
)
8451-
84528455
get_table_patch = mock.patch(
84538456
"google.cloud.bigquery.client.Client.get_table",
84548457
autospec=True,
@@ -8460,6 +8463,7 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self):
84608463
]
84618464
),
84628465
)
8466+
84638467
with load_patch as load_table_from_file, get_table_patch:
84648468
client.load_table_from_dataframe(
84658469
dataframe, self.TABLE_REF, location=self.LOCATION
@@ -8580,10 +8584,10 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se
85808584

85818585
client = self._make_client()
85828586
dataframe = pandas.DataFrame({"x": [1, 2, None, 4]}, dtype="Int64")
8587+
85838588
load_patch = mock.patch(
85848589
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
85858590
)
8586-
85878591
get_table_patch = mock.patch(
85888592
"google.cloud.bigquery.client.Client.get_table",
85898593
autospec=True,
@@ -8612,8 +8616,11 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se
86128616

86138617
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
86148618
assert sent_config.source_format == job.SourceFormat.PARQUET
8615-
assert tuple(sent_config.schema) == (
8616-
SchemaField("x", "INT64", "NULLABLE", None),
8619+
assert (
8620+
# Accept either the GoogleSQL or legacy SQL type name from pandas-gbq.
8621+
tuple(sent_config.schema) == (SchemaField("x", "INT64", "NULLABLE", None),)
8622+
or tuple(sent_config.schema)
8623+
== (SchemaField("x", "INTEGER", "NULLABLE", None),)
86178624
)
86188625

86198626
def test_load_table_from_dataframe_struct_fields(self):
@@ -8759,14 +8766,22 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self):
87598766
data=records, columns=["float_column", "array_column"]
87608767
)
87618768

8762-
expected_schema = [
8769+
expected_schema_googlesql = [
87638770
SchemaField("float_column", "FLOAT"),
87648771
SchemaField(
87658772
"array_column",
87668773
"INT64",
87678774
mode="REPEATED",
87688775
),
87698776
]
8777+
expected_schema_legacy_sql = [
8778+
SchemaField("float_column", "FLOAT"),
8779+
SchemaField(
8780+
"array_column",
8781+
"INTEGER",
8782+
mode="REPEATED",
8783+
),
8784+
]
87708785

87718786
load_patch = mock.patch(
87728787
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
@@ -8802,7 +8817,10 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self):
88028817

88038818
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
88048819
assert sent_config.source_format == job.SourceFormat.PARQUET
8805-
assert sent_config.schema == expected_schema
8820+
assert (
8821+
sent_config.schema == expected_schema_googlesql
8822+
or sent_config.schema == expected_schema_legacy_sql
8823+
)
88068824

88078825
def test_load_table_from_dataframe_w_partial_schema(self):
88088826
pandas = pytest.importorskip("pandas")
@@ -8922,7 +8940,6 @@ def test_load_table_from_dataframe_w_partial_schema_extra_types(self):
89228940

89238941
load_table_from_file.assert_not_called()
89248942
message = str(exc_context.value)
8925-
assert "bq_schema contains fields not present in dataframe" in message
89268943
assert "unknown_col" in message
89278944

89288945
def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self):

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.