From 3d955f8b5e89c7ab5959049f5fb5f9072d05c4f6 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 25 Sep 2024 17:54:44 +0000 Subject: [PATCH 1/5] feat: allow access of struct fields with dot operators for Series --- bigframes/series.py | 14 +++++++++ tests/data/nested_structs.jsonl | 2 ++ tests/data/nested_structs_schema.json | 39 ++++++++++++++++++++++++ tests/system/conftest.py | 44 +++++++++++++++++++++++++++ tests/system/small/test_series.py | 35 +++++++++++++++++++++ 5 files changed, 134 insertions(+) create mode 100644 tests/data/nested_structs.jsonl create mode 100644 tests/data/nested_structs_schema.json diff --git a/bigframes/series.py b/bigframes/series.py index 193eea7ee3..99690370e8 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -30,6 +30,7 @@ import numpy import pandas import pandas.core.dtypes.common +import pyarrow as pa import typing_extensions import bigframes.core @@ -181,6 +182,14 @@ def _info_axis(self) -> indexes.Index: def _session(self) -> bigframes.Session: return self._get_block().expr.session + @property + def _struct_fields(self) -> list[str]: + if not bigframes.dtypes.is_struct_like(self.dtype): + return [] + + struct_type = typing.cast(pa.StructType, self._dtype.pyarrow_dtype) + return [struct_type.field(i).name for i in range(struct_type.num_fields)] + @validations.requires_ordering() def transpose(self) -> Series: return self @@ -1096,6 +1105,9 @@ def __pos__(self) -> Series: def __neg__(self) -> Series: return self._apply_unary_op(ops.neg_op) + def __dir__(self) -> list[str]: + return dir(type(self)) + self._struct_fields + def eq(self, other: object) -> Series: # TODO: enforce stricter alignment return self._apply_binary_op(other, ops.eq_op) @@ -1249,6 +1261,8 @@ def __getattr__(self, key: str): """ ) ) + elif key in self._struct_fields: + return self.struct.field(key) else: raise AttributeError(key) diff --git a/tests/data/nested_structs.jsonl b/tests/data/nested_structs.jsonl new file mode 100644 index 0000000000..f57214b0b3 --- /dev/null +++ b/tests/data/nested_structs.jsonl @@ -0,0 +1,2 @@ +{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}} +{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}} \ No newline at end of file diff --git a/tests/data/nested_structs_schema.json b/tests/data/nested_structs_schema.json new file mode 100644 index 0000000000..6692615cef --- /dev/null +++ b/tests/data/nested_structs_schema.json @@ -0,0 +1,39 @@ +[ + { + "name": "id", + "type": "INTEGER", + "mode": "REQUIRED" + }, + { + "name": "person", + "type": "RECORD", + "fields": [ + { + "name": "name", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "age", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "RECORD", + "fields": [ + { + "name": "city", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "country", + "type": "STRING", + "mode": "NULLABLE" + } + ] + } + ] + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index d9246eecfb..217cf71e0c 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -32,6 +32,7 @@ import ibis.backends import numpy as np import pandas as pd +import pyarrow as pa import pytest import pytz import test_utils.prefixer @@ -290,6 +291,7 @@ def load_test_data_tables( ("scalars", "scalars_schema.json", "scalars.jsonl"), ("scalars_too", "scalars_schema.json", "scalars.jsonl"), ("nested", "nested_schema.json", "nested.jsonl"), + ("nested_structs", "nested_structs_schema.json", "nested_structs.jsonl"), ("repeated", "repeated_schema.json", "repeated.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), @@ -367,6 +369,11 @@ def nested_table_id(test_data_tables) -> str: return test_data_tables["nested"] +@pytest.fixture(scope="session") +def nested_structs_table_id(test_data_tables) -> str: + return test_data_tables["nested_structs"] + + @pytest.fixture(scope="session") def repeated_table_id(test_data_tables) -> str: return test_data_tables["repeated"] @@ -412,6 +419,43 @@ def nested_pandas_df() -> pd.DataFrame: return df +@pytest.fixture(scope="session") +def nested_structs_df( + nested_structs_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq(nested_structs_table_id, index_col="id") + + +@pytest.fixture(scope="session") +def nested_structs_pandas_df() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "nested_structs.jsonl", + lines=True, + ) + df = df.set_index("id") + return df + + +@pytest.fixture(scope="session") +def nested_structs_pandas_type() -> pd.ArrowDtype: + address_struct_schema = pa.struct( + [pa.field("city", pa.string()), pa.field("country", pa.string())] + ) + + person_struct_schema = pa.struct( + [ + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + pa.field("address", address_struct_schema), + ] + ) + + return pd.ArrowDtype(person_struct_schema) + + @pytest.fixture(scope="session") def repeated_df( repeated_table_id: str, session: bigframes.Session diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 793a4062c5..e4a4fcbc22 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3912,3 +3912,38 @@ def test_series_explode_null(data): s.to_pandas().explode(), check_dtype=False, ) + + +def test_series_struct_get_field_by_attribute( + nested_structs_df, nested_structs_pandas_df, nested_structs_pandas_type +): + bf_series = nested_structs_df["person"] + df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type) + + pd.testing.assert_series_equal( + bf_series.address.city.to_pandas(), + df_series.struct.field("address").struct.field("city"), + check_dtype=False, + check_index=False, + ) + pd.testing.assert_series_equal( + bf_series.address.country.to_pandas(), + df_series.struct.field("address").struct.field("country"), + check_dtype=False, + check_index=False, + ) + + +def test_series_struct_fields_in_dir(nested_structs_df): + series = nested_structs_df["person"] + + assert "age" in dir(series) + assert "address" in dir(series) + assert "city" in dir(series.address) + assert "country" in dir(series.address) + + +def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df): + series = nested_structs_df["person"] + + assert series.name == "person" From b83d61aa7bc577540c1b87e57b733e4ad7f291b1 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 25 Sep 2024 18:48:55 +0000 Subject: [PATCH 2/5] fix infinite recursion of __getattr__() --- bigframes/series.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 99690370e8..6fd82e31a0 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -181,10 +181,10 @@ def _info_axis(self) -> indexes.Index: @property def _session(self) -> bigframes.Session: return self._get_block().expr.session - + @property def _struct_fields(self) -> list[str]: - if not bigframes.dtypes.is_struct_like(self.dtype): + if not bigframes.dtypes.is_struct_like(self._dtype): return [] struct_type = typing.cast(pa.StructType, self._dtype.pyarrow_dtype) @@ -1252,7 +1252,15 @@ def __getitem__(self, indexer): __getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__) def __getattr__(self, key: str): - if hasattr(pandas.Series, key): + # Protect against recursion errors with uninitialized Series objects. + # We use "_block" attribute to check whether the instance is initialized. + # See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + # and + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html + if key == "_block": + raise AttributeError(key) + elif hasattr(pandas.Series, key): raise AttributeError( textwrap.dedent( f""" @@ -1266,6 +1274,7 @@ def __getattr__(self, key: str): else: raise AttributeError(key) + def _apply_aggregation( self, op: agg_ops.UnaryAggregateOp | agg_ops.NullaryAggregateOp ) -> Any: From 3392ac06511c00bd1b15843ff784ce108d2ae13d Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 25 Sep 2024 18:53:33 +0000 Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/series.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 6fd82e31a0..364fd30062 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -181,7 +181,7 @@ def _info_axis(self) -> indexes.Index: @property def _session(self) -> bigframes.Session: return self._get_block().expr.session - + @property def _struct_fields(self) -> list[str]: if not bigframes.dtypes.is_struct_like(self._dtype): @@ -1252,7 +1252,7 @@ def __getitem__(self, indexer): __getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__) def __getattr__(self, key: str): - # Protect against recursion errors with uninitialized Series objects. + # Protect against recursion errors with uninitialized Series objects. # We use "_block" attribute to check whether the instance is initialized. # See: # https://github.com/googleapis/python-bigquery-dataframes/issues/728 @@ -1274,7 +1274,6 @@ def __getattr__(self, key: str): else: raise AttributeError(key) - def _apply_aggregation( self, op: agg_ops.UnaryAggregateOp | agg_ops.NullaryAggregateOp ) -> Any: From dabf6eac9ea3722c4fe2962b45f5d63736bc6d81 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 25 Sep 2024 20:22:31 +0000 Subject: [PATCH 4/5] fix typing and version --- bigframes/series.py | 6 +++--- tests/system/small/test_series.py | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 364fd30062..e0413b1b61 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -22,7 +22,7 @@ import numbers import textwrap import typing -from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, cast, List, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.series as vendored_pandas_series @@ -183,7 +183,7 @@ def _session(self) -> bigframes.Session: return self._get_block().expr.session @property - def _struct_fields(self) -> list[str]: + def _struct_fields(self) -> List[str]: if not bigframes.dtypes.is_struct_like(self._dtype): return [] @@ -1105,7 +1105,7 @@ def __pos__(self) -> Series: def __neg__(self) -> Series: return self._apply_unary_op(ops.neg_op) - def __dir__(self) -> list[str]: + def __dir__(self) -> List[str]: return dir(type(self)) + self._struct_fields def eq(self, other: object) -> Series: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index e4a4fcbc22..ceaa5ff8f3 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -16,6 +16,7 @@ import math import re import tempfile +from packaging.version import Version import geopandas as gpd # type: ignore import numpy @@ -3917,6 +3918,9 @@ def test_series_explode_null(data): def test_series_struct_get_field_by_attribute( nested_structs_df, nested_structs_pandas_df, nested_structs_pandas_type ): + if Version(pd.__version__) < Version("2.2.0"): + pytest.skip("struct accessor is not supported before pandas 2.2") + bf_series = nested_structs_df["person"] df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type) From 37423b3356d42fe58c4ed2809f5bd1806af680e8 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 25 Sep 2024 20:24:52 +0000 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/small/test_series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index ceaa5ff8f3..aa70b7c655 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -16,10 +16,10 @@ import math import re import tempfile -from packaging.version import Version import geopandas as gpd # type: ignore import numpy +from packaging.version import Version import pandas as pd import pyarrow as pa # type: ignore import pytest @@ -3920,7 +3920,7 @@ def test_series_struct_get_field_by_attribute( ): if Version(pd.__version__) < Version("2.2.0"): pytest.skip("struct accessor is not supported before pandas 2.2") - + bf_series = nested_structs_df["person"] df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type)