Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit 3d8b17f

Browse files
authored
fix: support results with STRUCT and ARRAY columns containing JSON subfields in to_pandas_batches() (#2216)
* Correctly display DataFrames with JSON columns in anywidget * Improve JSON type handling for to_gbq and to_pandas_batches * Revert "Correctly display DataFrames with JSON columns in anywidget" This reverts commit 8c34512. * Remove unnecessary comment * code refactor * testcase update * Fix testcase * function call updated in bigframes/core/blocks.py, unused function removed from bigframes/dtypes.py * revert the code refactor in loader.py, I will use a separate PR for this refactor * replace the manual construction of the empty DataFrame with the more robust try...except block that leverages to_pyarrow and empty_table * fix testcase * use the existing arrow_to_pandas() helper that properly handles dtype conversion * testcase update * refactor testcase * Add pyarrow id to comments
1 parent 94c8b3c commit 3d8b17f
Copy full SHA for 3d8b17f

2 files changed

+96-6Lines changed: 96 additions & 6 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎bigframes/core/blocks.py‎

Copy file name to clipboardExpand all lines: bigframes/core/blocks.py
+10-6Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
import bigframes.operations.aggregations as agg_ops
6969
from bigframes.session import dry_runs, execution_spec
7070
from bigframes.session import executor as executors
71+
from bigframes.session._io import pandas as io_pandas
7172

7273
# Type constraint for wherever column labels are used
7374
Label = typing.Hashable
@@ -711,12 +712,15 @@ def to_pandas_batches(
711712
# To reduce the number of edge cases to consider when working with the
712713
# results of this, always return at least one DataFrame. See:
713714
# b/428918844.
714-
empty_val = pd.DataFrame(
715-
{
716-
col: pd.Series([], dtype=self.expr.get_column_type(col))
717-
for col in itertools.chain(self.value_columns, self.index_columns)
718-
}
719-
)
715+
try:
716+
empty_arrow_table = self.expr.schema.to_pyarrow().empty_table()
717+
except pa.ArrowNotImplementedError:
718+
# Bug with some pyarrow versions (https://github.com/apache/arrow/issues/45262):
719+
# empty_table only supports base storage types, not extension types.
720+
empty_arrow_table = self.expr.schema.to_pyarrow(
721+
use_storage_types=True
722+
).empty_table()
723+
empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema)
720724
dfs = map(
721725
lambda a: a[0],
722726
itertools.zip_longest(
Collapse file

‎tests/system/small/test_dataframe_io.py‎

Copy file name to clipboardExpand all lines: tests/system/small/test_dataframe_io.py
+86Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,92 @@ def test_to_pandas_batches_w_empty_dataframe(session):
376376
pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)
377377

378378

379+
@pytest.mark.skipif(
380+
bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
381+
reason="Test for pandas 1.x behavior only",
382+
)
383+
def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session):
384+
"""Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 1.x."""
385+
sql = """
386+
SELECT
387+
0 AS id,
388+
[JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
389+
STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
390+
"""
391+
df = session.read_gbq(sql, index_col="id")
392+
batches = list(df.to_pandas_batches())
393+
394+
assert batches[0].dtypes["json_array"] == "object"
395+
assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
396+
397+
398+
@pytest.mark.skipif(
399+
not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
400+
reason="Test for pandas 2.x behavior only",
401+
)
402+
def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session):
403+
"""Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 2.x."""
404+
sql = """
405+
SELECT
406+
0 AS id,
407+
[JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
408+
STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
409+
"""
410+
df = session.read_gbq(sql, index_col="id")
411+
batches = list(df.to_pandas_batches())
412+
413+
assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
414+
assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType)
415+
assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
416+
417+
418+
@pytest.mark.skipif(
419+
bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
420+
reason="Test for pandas 1.x behavior only",
421+
)
422+
def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session):
423+
"""Verify to_pandas_batches() works with empty nested JSON types in pandas 1.x."""
424+
425+
sql = """
426+
SELECT
427+
1 AS id,
428+
[] AS json_array,
429+
STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
430+
"""
431+
df = session.read_gbq(sql, index_col="id")
432+
433+
# The main point: this should not raise an error
434+
batches = list(df.to_pandas_batches())
435+
assert sum(len(b) for b in batches) == 1
436+
437+
assert batches[0].dtypes["json_array"] == "object"
438+
assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
439+
440+
441+
@pytest.mark.skipif(
442+
not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
443+
reason="Test for pandas 2.x behavior only",
444+
)
445+
def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session):
446+
"""Verify to_pandas_batches() works with empty nested JSON types in pandas 2.x."""
447+
448+
sql = """
449+
SELECT
450+
1 AS id,
451+
[] AS json_array,
452+
STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
453+
"""
454+
df = session.read_gbq(sql, index_col="id")
455+
456+
# The main point: this should not raise an error
457+
batches = list(df.to_pandas_batches())
458+
assert sum(len(b) for b in batches) == 1
459+
460+
assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
461+
assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
462+
assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType)
463+
464+
379465
@pytest.mark.parametrize("allow_large_results", (True, False))
380466
def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
381467
"""Verify to_pandas_batches() APIs returns the expected page size.

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.