Commit f05dc69

fix: load_table_from_dataframe now assumes there may be local null values (#1735)

Even if the remote schema is REQUIRED.

Fixes #1692 🦕

1 parent 5573579 commit f05dc69
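In practice this means a load from pandas no longer silently corrupts rows with missing values. Below is a minimal sketch of the user-facing behavior after this fix; the project, dataset, and table names are hypothetical placeholders, not from the commit.

    # Hypothetical example: the project, dataset, and table names are placeholders.
    import pandas
    from google.cloud import bigquery

    client = bigquery.Client()
    table_id = "your-project.your_dataset.users"  # table with REQUIRED columns

    dataframe = pandas.DataFrame(
        [{"name": "Chip", "age": 2}, {"name": None, "age": None}],
        columns=["name", "age"],
    )

    # Before this fix, the Arrow fields were marked non-nullable to match the
    # REQUIRED schema, so local NULLs could be serialized as arbitrary values.
    # After the fix, real NULLs are sent and the BigQuery backend rejects the
    # load with a BadRequest error for the REQUIRED columns.
    client.load_table_from_dataframe(dataframe, table_id).result()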

File tree (3 files changed: +81 −23)

google/cloud/bigquery/_pandas_helpers.py
tests/system/test_pandas.py
tests/unit/test__pandas_helpers.py
google/cloud/bigquery/_pandas_helpers.py (+8 −2)

@@ -178,12 +178,18 @@ def bq_to_arrow_field(bq_field, array_type=None):
     if arrow_type is not None:
         if array_type is not None:
             arrow_type = array_type  # For GEOGRAPHY, at least initially
-        is_nullable = bq_field.mode.upper() == "NULLABLE"
         metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
             bq_field.field_type.upper() if bq_field.field_type else ""
         )
         return pyarrow.field(
-            bq_field.name, arrow_type, nullable=is_nullable, metadata=metadata
+            bq_field.name,
+            arrow_type,
+            # Even if the remote schema is REQUIRED, there's a chance there's
+            # local NULL values. Arrow will gladly interpret these NULL values
+            # as non-NULL and give you an arbitrary value. See:
+            # https://github.com/googleapis/python-bigquery/issues/1692
+            nullable=True,
+            metadata=metadata,
         )
 
     warnings.warn("Unable to determine type for field '{}'.".format(bq_field.name))
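Why nullable=True matters here, as a minimal pyarrow sketch (not part of the commit): a nullable Arrow field carries a validity bitmap alongside its values buffer, so local NULLs survive serialization instead of being read back as arbitrary values.

    import pyarrow

    # A nullable field keeps a validity bitmap alongside the values buffer.
    field = pyarrow.field("age", pyarrow.int64(), nullable=True)
    array = pyarrow.array([2, None, 3], type=field.type)

    assert array.null_count == 1  # the None survives as a true NULL
    print(array.to_pylist())  # [2, None, 3]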
tests/system/test_pandas.py (+40 −7)

@@ -428,8 +428,7 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id):
 
 
 def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
-    """Test that a DataFrame with required columns can be uploaded if a
-    BigQuery schema is specified.
+    """Test that a DataFrame can be uploaded to a table with required columns.
 
     See: https://github.com/googleapis/google-cloud-python/issues/8093
     """
@@ -440,7 +439,6 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
 
     records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}]
     dataframe = pandas.DataFrame(records, columns=["name", "age"])
-    job_config = bigquery.LoadJobConfig(schema=table_schema)
     table_id = "{}.{}.load_table_from_dataframe_w_required".format(
         bigquery_client.project, dataset_id
     )
@@ -451,15 +449,50 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
         bigquery.Table(table_id, schema=table_schema)
     )
 
-    job_config = bigquery.LoadJobConfig(schema=table_schema)
-    load_job = bigquery_client.load_table_from_dataframe(
-        dataframe, table_id, job_config=job_config
-    )
+    load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id)
    load_job.result()
 
     table = bigquery_client.get_table(table)
     assert tuple(table.schema) == table_schema
     assert table.num_rows == 2
+    for field in table.schema:
+        assert field.mode == "REQUIRED"
+
+
+def test_load_table_from_dataframe_w_required_but_local_nulls_fails(
+    bigquery_client, dataset_id
+):
+    """Test that a DataFrame with nulls can't be uploaded to a table with
+    required columns.
+
+    See: https://github.com/googleapis/python-bigquery/issues/1692
+    """
+    table_schema = (
+        bigquery.SchemaField("name", "STRING", mode="REQUIRED"),
+        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
+    )
+
+    records = [
+        {"name": "Chip", "age": 2},
+        {"name": "Dale", "age": 3},
+        {"name": None, "age": None},
+        {"name": "Alvin", "age": 4},
+    ]
+    dataframe = pandas.DataFrame(records, columns=["name", "age"])
+    table_id = (
+        "{}.{}.load_table_from_dataframe_w_required_but_local_nulls_fails".format(
+            bigquery_client.project, dataset_id
+        )
+    )
+
+    # Create the table before loading so that schema mismatch errors are
+    # identified.
+    helpers.retry_403(bigquery_client.create_table)(
+        bigquery.Table(table_id, schema=table_schema)
+    )
+
+    with pytest.raises(google.api_core.exceptions.BadRequest, match="null"):
+        bigquery_client.load_table_from_dataframe(dataframe, table_id).result()
 
 
 def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id):
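For application code that hits the same condition as this new system test, here is a hedged sketch (not part of the commit, reusing the client, dataframe, and table_id placeholders from the first example above) of catching the backend error instead of letting it propagate:

    import google.api_core.exceptions

    try:
        client.load_table_from_dataframe(dataframe, table_id).result()
    except google.api_core.exceptions.BadRequest as exc:
        # The backend error message mentions the NULL value in a REQUIRED field.
        print("Load rejected: {}".format(exc))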
tests/unit/test__pandas_helpers.py (+33 −14)

@@ -1017,30 +1017,41 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
     )
 
     data = {
-        "field01": ["hello", "world"],
-        "field02": [b"abd", b"efg"],
-        "field03": [1, 2],
-        "field04": [3, 4],
-        "field05": [1.25, 9.75],
-        "field06": [-1.75, -3.5],
-        "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")],
+        "field01": ["hello", None, "world"],
+        "field02": [b"abd", b"efg", b"hij"],
+        "field03": [1, 2, 3],
+        "field04": [4, None, 5],
+        "field05": [1.25, 0.0, 9.75],
+        "field06": [-1.75, None, -3.5],
+        "field07": [
+            decimal.Decimal("1.2345"),
+            decimal.Decimal("6.7891"),
+            -decimal.Decimal("10.111213"),
+        ],
         "field08": [
             decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)),
+            None,
             decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)),
         ],
-        "field09": [True, False],
-        "field10": [False, True],
+        "field09": [True, False, True],
+        "field10": [False, True, None],
         "field11": [
             datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
             datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=datetime.timezone.utc),
+            datetime.datetime(2022, 7, 14, 23, 59, 59, tzinfo=datetime.timezone.utc),
         ],
-        "field12": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)],
-        "field13": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)],
+        "field12": [datetime.date(9999, 12, 31), None, datetime.date(1970, 1, 1)],
+        "field13": [datetime.time(23, 59, 59, 999999), None, datetime.time(12, 0, 0)],
         "field14": [
             datetime.datetime(1970, 1, 1, 0, 0, 0),
+            None,
             datetime.datetime(2012, 12, 21, 9, 7, 42),
         ],
-        "field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"],
+        "field15": [
+            None,
+            "POINT(30 10)",
+            "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
+        ],
     }
     dataframe = pandas.DataFrame(data)
 
@@ -1049,7 +1060,11 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
 
     assert len(arrow_schema) == len(bq_schema)
     for arrow_field in arrow_schema:
-        assert not arrow_field.nullable
+        # Even if the remote schema is REQUIRED, there's a chance there's
+        # local NULL values. Arrow will gladly interpret these NULL values
+        # as non-NULL and give you an arbitrary value. See:
+        # https://github.com/googleapis/python-bigquery/issues/1692
+        assert arrow_field.nullable
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1101,7 +1116,11 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
     arrow_schema = arrow_table.schema
 
     expected_fields = [
-        pyarrow.field("field01", "string", nullable=False),
+        # Even if the remote schema is REQUIRED, there's a chance there's
+        # local NULL values. Arrow will gladly interpret these NULL values
+        # as non-NULL and give you an arbitrary value. See:
+        # https://github.com/googleapis/python-bigquery/issues/1692
+        pyarrow.field("field01", "string", nullable=True),
         pyarrow.field("field02", "bool", nullable=True),
     ]
     assert list(arrow_schema) == expected_fields
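To observe the changed helper directly, a small sketch (not from the commit; `_pandas_helpers` is a private module, so this import path is an implementation detail that may change between releases):

    from google.cloud import bigquery
    from google.cloud.bigquery import _pandas_helpers

    bq_field = bigquery.SchemaField("age", "INTEGER", mode="REQUIRED")
    arrow_field = _pandas_helpers.bq_to_arrow_field(bq_field)

    # After this commit the Arrow field is nullable even though the BigQuery
    # mode is REQUIRED, so local NULLs are preserved end to end.
    assert arrow_field.nullable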
