From e0f065fec9ccf4656838924619f0b954a9a9f667 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 9 Jun 2025 15:52:36 -0700 Subject: [PATCH 01/23] test: Engine tests for selection ops (#1800) --- tests/system/small/engines/conftest.py | 9 ++- tests/system/small/engines/engine_utils.py | 31 ++++++++++ tests/system/small/engines/test_read_local.py | 30 +++------- tests/system/small/engines/test_selection.py | 60 +++++++++++++++++++ 4 files changed, 107 insertions(+), 23 deletions(-) create mode 100644 tests/system/small/engines/engine_utils.py create mode 100644 tests/system/small/engines/test_selection.py diff --git a/tests/system/small/engines/conftest.py b/tests/system/small/engines/conftest.py index 2a72cb2196..249bd59260 100644 --- a/tests/system/small/engines/conftest.py +++ b/tests/system/small/engines/conftest.py @@ -19,7 +19,7 @@ import pytest import bigframes -from bigframes.core import local_data +from bigframes.core import ArrayValue, local_data from bigframes.session import ( direct_gbq_execution, local_scan_executor, @@ -62,6 +62,13 @@ def managed_data_source( return local_data.ManagedArrowTable.from_pandas(scalars_pandas_df_index) +@pytest.fixture(scope="module") +def scalars_array_value( + managed_data_source: local_data.ManagedArrowTable, fake_session: bigframes.Session +): + return ArrayValue.from_managed(managed_data_source, fake_session) + + @pytest.fixture(scope="module") def zero_row_source() -> local_data.ManagedArrowTable: return local_data.ManagedArrowTable.from_pandas(pd.DataFrame({"a": [], "b": []})) diff --git a/tests/system/small/engines/engine_utils.py b/tests/system/small/engines/engine_utils.py new file mode 100644 index 0000000000..f58e5951a1 --- /dev/null +++ b/tests/system/small/engines/engine_utils.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from bigframes.core import nodes +from bigframes.session import semi_executor + + +def assert_equivalence_execution( + node: nodes.BigFrameNode, + engine1: semi_executor.SemiExecutor, + engine2: semi_executor.SemiExecutor, +): + e1_result = engine1.execute(node, ordered=True) + e2_result = engine2.execute(node, ordered=True) + assert e1_result is not None + assert e2_result is not None + # Schemas might have extra nullity markers, normalize to node expected schema, which should be looser + e1_table = e1_result.to_arrow_table().cast(node.schema.to_pyarrow()) + e2_table = e2_result.to_arrow_table().cast(node.schema.to_pyarrow()) + assert e1_table.equals(e2_table), f"{e1_table} is not equal to {e2_table}" diff --git a/tests/system/small/engines/test_read_local.py b/tests/system/small/engines/test_read_local.py index 7bf1316a44..0517f45f1a 100644 --- a/tests/system/small/engines/test_read_local.py +++ b/tests/system/small/engines/test_read_local.py @@ -16,7 +16,8 @@ import bigframes from bigframes.core import identifiers, local_data, nodes -from bigframes.session import polars_executor, semi_executor +from bigframes.session import polars_executor +from tests.system.small.engines.engine_utils import assert_equivalence_execution pytest.importorskip("polars") @@ -24,21 +25,6 @@ REFERENCE_ENGINE = polars_executor.PolarsExecutor() -def ensure_equivalence( - node: nodes.BigFrameNode, - engine1: semi_executor.SemiExecutor, - engine2: semi_executor.SemiExecutor, -): - e1_result = engine1.execute(node, ordered=True) - e2_result = engine2.execute(node, ordered=True) - assert e1_result is not None - assert e2_result is not None - # Schemas might have extra nullity markers, normalize to node expected schema, which should be looser - e1_table = e1_result.to_arrow_table().cast(node.schema.to_pyarrow()) - e2_table = e2_result.to_arrow_table().cast(node.schema.to_pyarrow()) - assert e1_table.equals(e2_table), f"{e1_table} is not equal to {e2_table}" - - def test_engines_read_local( fake_session: bigframes.Session, managed_data_source: local_data.ManagedArrowTable, @@ -51,7 +37,7 @@ def test_engines_read_local( local_node = nodes.ReadLocalNode( managed_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_offsets( @@ -69,7 +55,7 @@ def test_engines_read_local_w_offsets( fake_session, offsets_col=identifiers.ColumnId("offsets"), ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_col_subset( @@ -84,7 +70,7 @@ def test_engines_read_local_w_col_subset( local_node = nodes.ReadLocalNode( managed_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_zero_row_source( @@ -99,7 +85,7 @@ def test_engines_read_local_w_zero_row_source( local_node = nodes.ReadLocalNode( zero_row_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_nested_source( @@ -114,7 +100,7 @@ def test_engines_read_local_w_nested_source( local_node = nodes.ReadLocalNode( nested_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, 
REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_repeated_source( @@ -129,4 +115,4 @@ def test_engines_read_local_w_repeated_source( local_node = nodes.ReadLocalNode( repeated_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/engines/test_selection.py b/tests/system/small/engines/test_selection.py new file mode 100644 index 0000000000..6350e79403 --- /dev/null +++ b/tests/system/small/engines/test_selection.py @@ -0,0 +1,60 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.core import array_value, expression, identifiers, nodes +from bigframes.session import polars_executor +from tests.system.small.engines.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. +REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +def test_engines_select_identity( + scalars_array_value: array_value.ArrayValue, + engine, +): + selection = tuple( + nodes.AliasedRef(expression.deref(col), identifiers.ColumnId(col)) + for col in scalars_array_value.column_ids + ) + node = nodes.SelectionNode(scalars_array_value.node, selection) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +def test_engines_select_rename( + scalars_array_value: array_value.ArrayValue, + engine, +): + selection = tuple( + nodes.AliasedRef(expression.deref(col), identifiers.ColumnId(f"renamed_{col}")) + for col in scalars_array_value.column_ids + ) + node = nodes.SelectionNode(scalars_array_value.node, selection) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +def test_engines_select_reorder_rename_drop( + scalars_array_value: array_value.ArrayValue, + engine, +): + selection = tuple( + nodes.AliasedRef(expression.deref(col), identifiers.ColumnId(f"renamed_{col}")) + for col in scalars_array_value.column_ids[::-2] + ) + node = nodes.SelectionNode(scalars_array_value.node, selection) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) From 080eb7be3cde591e08cad0d5c52c68cc0b25ade8 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 10 Jun 2025 11:03:48 -0700 Subject: [PATCH 02/23] fix: Fix single row broadcast with null index (#1803) --- bigframes/core/blocks.py | 2 +- tests/system/small/test_dataframe.py | 31 ++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 35cb7d41ae..acfa399d75 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2895,7 +2895,7 @@ def join_with_single_row( combined_expr, index_columns=index_cols_post_join, column_labels=left.column_labels.append(single_row_block.column_labels), - index_labels=[left.index.name], + 
index_labels=left.index.names, ) return ( block, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index c80ced45a5..18d8fed7dc 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2657,16 +2657,16 @@ def test_listlike_binop_axis_1_bf_index(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) -def test_binop_with_self_aggregate(session, scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered df_columns = ["int64_col", "float64_col", "int64_too"] # Ensure that this takes the optimized single-query path by counting executions - execution_count_before = session._metrics.execution_count + execution_count_before = scalars_df._session._metrics.execution_count bf_df = scalars_df[df_columns] bf_result = (bf_df - bf_df.mean()).to_pandas() - execution_count_after = session._metrics.execution_count + execution_count_after = scalars_df._session._metrics.execution_count pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() @@ -2677,6 +2677,29 @@ def test_binop_with_self_aggregate(session, scalars_dfs): assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) +def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + df_columns = ["int64_col", "float64_col", "int64_too"] + + # Ensure that this takes the optimized single-query path by counting executions + execution_count_before = scalars_df._session._metrics.execution_count + bf_df = scalars_df[df_columns].reset_index(drop=True) + bf_result = (bf_df - bf_df.mean()).to_pandas() + execution_count_after = scalars_df._session._metrics.execution_count + + pd_df = scalars_pandas_df[df_columns].reset_index(drop=True) + pd_result = pd_df - pd_df.mean() + + executions = execution_count_after - execution_count_before + + assert executions == 1 + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("left_labels", "right_labels"), [ From 3edc313307d753396a333570d7952984128c694b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 10 Jun 2025 17:14:16 -0500 Subject: [PATCH 03/23] chore: prevent location warning when `_read_gbq_colab` can determine the location (#1802) * chore: prevent location warning when _read_gbq_colab can determine the location I've made some updates to prevent a location warning when the system can determine the location for `bigframes.pandas.io.api._read_gbq_colab`. I've updated `bigframes.pandas.io.api._read_gbq_colab` so it correctly calls `bigframes.session.Session._read_gbq_colab` and adjusted its arguments. The `_read_gbq_colab` function in the pandas API layer now has a simpler signature, accepting `query_or_table`, `pyformat_args`, and `dry_run`. It will continue to call `_set_default_session_location_if_possible` to prevent location warnings. I've also updated the unit tests to reflect these changes, making sure that the correct session-level function is called and that arguments are passed through as expected. I've also moved the tests to `tests/unit/pandas/io/test_api.py` and converted them to pytest style to follow our repository conventions. 
* Update tests/unit/pandas/io/test_api.py * remove unused import * format the query first --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/pandas/io/api.py | 61 ++++++++++++++++++++++++++++++++ tests/unit/pandas/io/test_api.py | 48 +++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tests/unit/pandas/io/test_api.py diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index b2ce5f211e..e1fd7218bd 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -216,6 +216,67 @@ def read_gbq( read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) +@overload +def _read_gbq_colab( # type: ignore[overload-overlap] + query_or_table: str, + *, + pyformat_args: Optional[Dict[str, Any]] = ..., + dry_run: Literal[False] = ..., +) -> bigframes.dataframe.DataFrame: + ... + + +@overload +def _read_gbq_colab( + query_or_table: str, + *, + pyformat_args: Optional[Dict[str, Any]] = ..., + dry_run: Literal[True] = ..., +) -> pandas.Series: + ... + + +def _read_gbq_colab( + query_or_table: str, + *, + pyformat_args: Optional[Dict[str, Any]] = None, + dry_run: bool = False, +) -> bigframes.dataframe.DataFrame | pandas.Series: + """A Colab-specific version of read_gbq. + + Calls `_set_default_session_location_if_possible` and then delegates + to `bigframes.session.Session._read_gbq_colab`. + + Args: + query_or_table (str): + SQL query or table ID (table ID not yet supported). + pyformat_args (Optional[Dict[str, Any]]): + Parameters to format into the query string. + dry_run (bool): + If True, estimates the query results size without returning data. + The return will be a pandas Series with query metadata. + + Returns: + Union[bigframes.dataframe.DataFrame, pandas.Series]: + A BigQuery DataFrame if `dry_run` is False, otherwise a pandas Series. + """ + if pyformat_args is None: + pyformat_args = {} + + query = bigframes.core.pyformat.pyformat( + query_or_table, + pyformat_args=pyformat_args, + ) + _set_default_session_location_if_possible(query) + + return global_session.with_default_session( + bigframes.session.Session._read_gbq_colab, + query_or_table, + pyformat_args=pyformat_args, + dry_run=dry_run, + ) + + def read_gbq_model(model_name: str): return global_session.with_default_session( bigframes.session.Session.read_gbq_model, diff --git a/tests/unit/pandas/io/test_api.py b/tests/unit/pandas/io/test_api.py new file mode 100644 index 0000000000..fbc9027552 --- /dev/null +++ b/tests/unit/pandas/io/test_api.py @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest import mock + +import bigframes.dataframe +import bigframes.pandas.io.api as bf_io_api +import bigframes.session + + +@mock.patch("bigframes.pandas.io.api._set_default_session_location_if_possible") +@mock.patch("bigframes.core.global_session.with_default_session") +def test_read_gbq_colab_calls_set_location( + mock_with_default_session, mock_set_location +): + # Configure the mock for with_default_session to return a DataFrame mock + mock_df = mock.create_autospec(bigframes.dataframe.DataFrame) + mock_with_default_session.return_value = mock_df + + query_or_table = "SELECT {param1} AS param1" + sample_pyformat_args = {"param1": "value1"} + result = bf_io_api._read_gbq_colab( + query_or_table, pyformat_args=sample_pyformat_args, dry_run=False + ) + + # Make sure that we format the SQL first to prevent syntax errors. + formatted_query = "SELECT 'value1' AS param1" + mock_set_location.assert_called_once_with(formatted_query) + mock_with_default_session.assert_called_once() + + # Check the actual arguments passed to with_default_session + args, kwargs = mock_with_default_session.call_args + assert args[0] == bigframes.session.Session._read_gbq_colab + assert args[1] == query_or_table + assert kwargs["pyformat_args"] == sample_pyformat_args + assert not kwargs["dry_run"] + assert isinstance(result, bigframes.dataframe.DataFrame) From 18f43e8b58e03a27b021bce07566a3d006ac3679 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 10 Jun 2025 17:29:13 -0700 Subject: [PATCH 04/23] feat: add groupby cumcount (#1798) --- bigframes/core/array_value.py | 22 +++++++-- bigframes/core/blocks.py | 32 ++++++++++--- bigframes/core/groupby/dataframe_group_by.py | 29 ++++++++++-- tests/system/small/test_groupby.py | 46 +++++++++++++++---- .../pandas/core/groupby/__init__.py | 1 - 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index a6c700a485..4b05781cb7 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -403,8 +403,23 @@ def project_window_op( never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ + + return self.project_window_expr( + ex.UnaryAggregation(op, ex.deref(column_name)), + window_spec, + never_skip_nulls, + skip_reproject_unsafe, + ) + + def project_window_expr( + self, + expression: ex.Aggregation, + window: WindowSpec, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ): # TODO: Support non-deterministic windowing - if window_spec.is_row_bounded or not op.order_independent: + if window.is_row_bounded or not expression.op.order_independent: if self.node.order_ambiguous and not self.session._strictly_ordered: if not self.session._allows_ambiguity: raise ValueError( @@ -415,14 +430,13 @@ def project_window_op( "Window ordering may be ambiguous, this can cause unstable results." 
) warnings.warn(msg, category=bfe.AmbiguousWindowWarning) - output_name = self._gen_namespaced_uid() return ( ArrayValue( nodes.WindowOpNode( child=self.node, - expression=ex.UnaryAggregation(op, ex.deref(column_name)), - window_spec=window_spec, + expression=expression, + window_spec=window, output_name=ids.ColumnId(output_name), never_skip_nulls=never_skip_nulls, skip_reproject_unsafe=skip_reproject_unsafe, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index acfa399d75..4607928b78 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1012,16 +1012,34 @@ def apply_window_op( skip_null_groups: bool = False, skip_reproject_unsafe: bool = False, never_skip_nulls: bool = False, + ) -> typing.Tuple[Block, str]: + agg_expr = ex.UnaryAggregation(op, ex.deref(column)) + return self.apply_analytic( + agg_expr, + window_spec, + result_label, + skip_reproject_unsafe=skip_reproject_unsafe, + never_skip_nulls=never_skip_nulls, + skip_null_groups=skip_null_groups, + ) + + def apply_analytic( + self, + agg_expr: ex.Aggregation, + window: windows.WindowSpec, + result_label: Label, + *, + skip_reproject_unsafe: bool = False, + never_skip_nulls: bool = False, + skip_null_groups: bool = False, ) -> typing.Tuple[Block, str]: block = self if skip_null_groups: - for key in window_spec.grouping_keys: - block, not_null_id = block.apply_unary_op(key.id.name, ops.notnull_op) - block = block.filter_by_id(not_null_id).drop_columns([not_null_id]) - expr, result_id = block._expr.project_window_op( - column, - op, - window_spec, + for key in window.grouping_keys: + block = block.filter(ops.notnull_op.as_expr(key.id.name)) + expr, result_id = block._expr.project_window_expr( + agg_expr, + window, skip_reproject_unsafe=skip_reproject_unsafe, never_skip_nulls=never_skip_nulls, ) diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index f234bad126..a2c4cf2867 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -275,6 +275,27 @@ def count(self) -> df.DataFrame: def nunique(self) -> df.DataFrame: return self._aggregate_all(agg_ops.nunique_op) + @validations.requires_ordering() + def cumcount(self, ascending: bool = True) -> series.Series: + window_spec = ( + window_specs.cumulative_rows(grouping_keys=tuple(self._by_col_ids)) + if ascending + else window_specs.inverse_cumulative_rows( + grouping_keys=tuple(self._by_col_ids) + ) + ) + block, result_id = self._block.apply_analytic( + ex.NullaryAggregation(agg_ops.size_op), + window=window_spec, + result_label=None, + ) + result = series.Series(block.select_column(result_id)) - 1 + if self._dropna and (len(self._by_col_ids) == 1): + result = result.mask( + series.Series(block.select_column(self._by_col_ids[0])).isna() + ) + return result + @validations.requires_ordering() def cumsum(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: if not numeric_only: @@ -546,10 +567,12 @@ def _apply_window_op( ) columns, _ = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( - columns, op, window_spec=window_spec + columns, + op, + window_spec=window_spec, ) - block = block.select_columns(result_ids) - return df.DataFrame(block) + result = df.DataFrame(block.select_columns(result_ids)) + return result def _resolve_label(self, label: blocks.Label) -> str: """Resolve label to column id.""" diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py 
index f1d2bacf08..bc2e9cc385 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -383,14 +383,14 @@ def test_dataframe_groupby_multi_sum( @pytest.mark.parametrize( - ("operator"), + ("operator", "dropna"), [ - (lambda x: x.cumsum(numeric_only=True)), - (lambda x: x.cummax(numeric_only=True)), - (lambda x: x.cummin(numeric_only=True)), + (lambda x: x.cumsum(numeric_only=True), True), + (lambda x: x.cummax(numeric_only=True), True), + (lambda x: x.cummin(numeric_only=True), False), # Pre-pandas 2.2 doesn't always proeduce float. - (lambda x: x.cumprod().astype("Float64")), - (lambda x: x.shift(periods=2)), + (lambda x: x.cumprod().astype("Float64"), False), + (lambda x: x.shift(periods=2), True), ], ids=[ "cumsum", @@ -401,16 +401,44 @@ def test_dataframe_groupby_multi_sum( ], ) def test_dataframe_groupby_analytic( - scalars_df_index, scalars_pandas_df_index, operator + scalars_df_index, + scalars_pandas_df_index, + operator, + dropna, ): col_names = ["float64_col", "int64_col", "bool_col", "string_col"] - bf_result = operator(scalars_df_index[col_names].groupby("string_col")) - pd_result = operator(scalars_pandas_df_index[col_names].groupby("string_col")) + bf_result = operator( + scalars_df_index[col_names].groupby("string_col", dropna=dropna) + ) + pd_result = operator( + scalars_pandas_df_index[col_names].groupby("string_col", dropna=dropna) + ) bf_result_computed = bf_result.to_pandas() pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +@pytest.mark.parametrize( + ("ascending", "dropna"), + [ + (True, True), + (False, False), + ], +) +def test_dataframe_groupby_cumcount( + scalars_df_index, scalars_pandas_df_index, ascending, dropna +): + bf_result = scalars_df_index.groupby("string_col", dropna=dropna).cumcount( + ascending + ) + pd_result = scalars_pandas_df_index.groupby("string_col", dropna=dropna).cumcount( + ascending + ) + bf_result_computed = bf_result.to_pandas() + + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) + + def test_dataframe_groupby_size_as_index_false( scalars_df_index, scalars_pandas_df_index ): diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 4fb8498932..ebfbfa8830 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -718,7 +718,6 @@ def max( def cumcount(self, ascending: bool = True): """ Number each item in each group from 0 to the length of that group - 1. - (DataFrameGroupBy functionality is not yet available.) 
**Examples:** From b3db5197444262b487532b4c7d5fcc4f50ee1404 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 10 Jun 2025 20:34:00 -0700 Subject: [PATCH 05/23] chore: Move remaining test utils to bigframes.testing module (#1810) Co-authored-by: Shenyang Cai --- .../testing}/compiler_session.py | 0 .../engines => bigframes/testing}/engine_utils.py | 0 {tests/system => bigframes/testing}/utils.py | 0 pytest.ini | 1 + scripts/__init__.py | 13 ------------- scripts/conftest.py | 8 ++++++++ scripts/test_publish_api_coverage.py | 5 ++--- tests/system/conftest.py | 10 +++++----- .../system/large/functions/test_managed_function.py | 2 +- .../system/large/functions/test_remote_function.py | 2 +- tests/system/large/ml/test_cluster.py | 2 +- tests/system/large/ml/test_compose.py | 2 +- tests/system/large/ml/test_core.py | 2 +- tests/system/large/ml/test_decomposition.py | 2 +- tests/system/large/ml/test_ensemble.py | 2 +- tests/system/large/ml/test_forecasting.py | 2 +- tests/system/large/ml/test_linear_model.py | 2 +- tests/system/large/ml/test_model_selection.py | 2 +- tests/system/large/ml/test_pipeline.py | 2 +- tests/system/load/test_llm.py | 2 +- tests/system/small/bigquery/test_vector_search.py | 2 +- tests/system/small/engines/test_read_local.py | 2 +- tests/system/small/engines/test_selection.py | 2 +- .../system/small/functions/test_remote_function.py | 2 +- tests/system/small/geopandas/test_geoseries.py | 2 +- tests/system/small/ml/test_cluster.py | 2 +- tests/system/small/ml/test_core.py | 2 +- tests/system/small/ml/test_decomposition.py | 10 +++++----- tests/system/small/ml/test_llm.py | 2 +- tests/system/small/ml/test_multimodal_llm.py | 2 +- tests/system/small/ml/test_preprocessing.py | 2 +- tests/system/small/operations/test_datetimes.py | 2 +- tests/system/small/operations/test_lists.py | 2 +- tests/system/small/operations/test_strings.py | 3 +-- .../regression/test_issue355_merge_after_filter.py | 2 +- tests/system/small/test_dataframe.py | 2 +- tests/system/small/test_dataframe_io.py | 2 +- tests/system/small/test_encryption.py | 2 +- tests/system/small/test_groupby.py | 2 +- tests/system/small/test_index.py | 2 +- tests/system/small/test_large_local_data.py | 2 +- tests/system/small/test_multiindex.py | 2 +- tests/system/small/test_pandas.py | 2 +- tests/system/small/test_series.py | 2 +- tests/system/small/test_session.py | 2 +- tests/system/small/test_unordered.py | 2 +- tests/unit/core/compile/sqlglot/conftest.py | 6 +++--- tests/unit/test_dataframe_polars.py | 2 +- 48 files changed, 62 insertions(+), 68 deletions(-) rename {tests/unit/core/compile/sqlglot => bigframes/testing}/compiler_session.py (100%) rename {tests/system/small/engines => bigframes/testing}/engine_utils.py (100%) rename {tests/system => bigframes/testing}/utils.py (100%) delete mode 100644 scripts/__init__.py create mode 100644 scripts/conftest.py diff --git a/tests/unit/core/compile/sqlglot/compiler_session.py b/bigframes/testing/compiler_session.py similarity index 100% rename from tests/unit/core/compile/sqlglot/compiler_session.py rename to bigframes/testing/compiler_session.py diff --git a/tests/system/small/engines/engine_utils.py b/bigframes/testing/engine_utils.py similarity index 100% rename from tests/system/small/engines/engine_utils.py rename to bigframes/testing/engine_utils.py diff --git a/tests/system/utils.py b/bigframes/testing/utils.py similarity index 100% rename from tests/system/utils.py rename to bigframes/testing/utils.py diff --git a/pytest.ini b/pytest.ini index 
204c743bbf..75b69ce435 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,3 +2,4 @@ doctest_optionflags = NORMALIZE_WHITESPACE filterwarnings = ignore::pandas.errors.SettingWithCopyWarning +addopts = "--import-mode=importlib" diff --git a/scripts/__init__.py b/scripts/__init__.py deleted file mode 100644 index 6d5e14bcf4..0000000000 --- a/scripts/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/scripts/conftest.py b/scripts/conftest.py new file mode 100644 index 0000000000..83fd2b19af --- /dev/null +++ b/scripts/conftest.py @@ -0,0 +1,8 @@ +from pathlib import Path +import sys + +# inserts scripts into path so that tests can import +project_root = Path(__file__).parent.parent +scripts_dir = project_root / "scripts" + +sys.path.insert(0, str(scripts_dir)) diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 6dea10b608..6e366b6854 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -15,16 +15,15 @@ import sys import pandas +from publish_api_coverage import build_api_coverage_table import pytest -from . import publish_api_coverage - pytest.importorskip("sklearn") @pytest.fixture def api_coverage_df(): - return publish_api_coverage.build_api_coverage_table("my_bf_ver", "my_release_ver") + return build_api_coverage_table("my_bf_ver", "my_release_ver") @pytest.mark.skipif( diff --git a/tests/system/conftest.py b/tests/system/conftest.py index a4bab1bcfe..4605d9ddbc 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -41,7 +41,7 @@ import bigframes.dataframe import bigframes.pandas as bpd import bigframes.series -import tests.system.utils +import bigframes.testing.utils # Use this to control the number of cloud functions being deleted in a single # test session. 
This should help soften the spike of the number of mutations per @@ -615,7 +615,7 @@ def scalars_pandas_df_default_index() -> pd.DataFrame: DATA_DIR / "scalars.jsonl", lines=True, ) - tests.system.utils.convert_pandas_dtypes(df, bytes_col=True) + bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) df = df.set_index("rowindex", drop=False) df.index.name = None @@ -1422,12 +1422,12 @@ def use_fast_query_path(): @pytest.fixture(scope="session", autouse=True) def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent): """Clean up stale cloud functions.""" - permanent_endpoints = tests.system.utils.get_remote_function_endpoints( + permanent_endpoints = bigframes.testing.utils.get_remote_function_endpoints( session.bqclient, dataset_id_permanent ) delete_count = 0 try: - for cloud_function in tests.system.utils.get_cloud_functions( + for cloud_function in bigframes.testing.utils.get_cloud_functions( cloudfunctions_client, session.bqclient.project, session.bqclient.location, @@ -1447,7 +1447,7 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent # Go ahead and delete try: - tests.system.utils.delete_cloud_function( + bigframes.testing.utils.delete_cloud_function( cloudfunctions_client, cloud_function.name ) delete_count += 1 diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 9eba1907e6..5cb54a00c1 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -21,7 +21,7 @@ import bigframes import bigframes.exceptions as bfe import bigframes.pandas as bpd -from tests.system.utils import cleanup_function_assets +from bigframes.testing.utils import cleanup_function_assets prefixer = test_utils.prefixer.Prefixer("bigframes", "") diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 426813b0ff..9e0dcfe4d7 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -36,7 +36,7 @@ import bigframes.functions._utils as bff_utils import bigframes.pandas as bpd import bigframes.series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_pandas_df_equal, cleanup_function_assets, delete_cloud_function, diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index 39368f490b..9736199b17 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -15,7 +15,7 @@ import pandas as pd from bigframes.ml import cluster -from tests.system import utils +from bigframes.testing import utils def test_cluster_configure_fit_score_predict( diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index cbc702018a..9279324b3c 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -13,7 +13,7 @@ # limitations under the License. from bigframes.ml import compose, preprocessing -from tests.system import utils +from bigframes.testing import utils def test_columntransformer_standalone_fit_and_transform( diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index c1e1cc19d9..6f0551b1ef 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -13,7 +13,7 @@ # limitations under the License. 
from bigframes.ml import globals -from tests.system import utils +from bigframes.testing import utils def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df): diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index e0e4b79c6f..c36e873816 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -16,7 +16,7 @@ import pandas.testing from bigframes.ml import decomposition -from tests.system import utils +from bigframes.testing import utils def test_decomposition_configure_fit_score_predict( diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 706cbfdfaf..c2e9036eed 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -15,7 +15,7 @@ import pytest import bigframes.ml.ensemble -from tests.system import utils +from bigframes.testing import utils @pytest.mark.flaky(retries=2) diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 56b93e5338..72a0ee469b 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -15,7 +15,7 @@ import pytest from bigframes.ml import forecasting -from tests.system import utils +from bigframes.testing import utils ARIMA_EVALUATE_OUTPUT_COL = [ "non_seasonal_p", diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index be98902007..f0e2892ba8 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -16,7 +16,7 @@ from bigframes.ml import model_selection import bigframes.ml.linear_model -from tests.system import utils +from bigframes.testing import utils def test_linear_regression_configure_fit_score(penguins_df_default_index, dataset_id): diff --git a/tests/system/large/ml/test_model_selection.py b/tests/system/large/ml/test_model_selection.py index c1856a1537..26174b7ee9 100644 --- a/tests/system/large/ml/test_model_selection.py +++ b/tests/system/large/ml/test_model_selection.py @@ -15,7 +15,7 @@ import pytest from bigframes.ml import linear_model, model_selection -from tests.system import utils +from bigframes.testing import utils @pytest.mark.parametrize( diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 84a6b11ff2..6c51a11a11 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -25,7 +25,7 @@ pipeline, preprocessing, ) -from tests.system import utils +from bigframes.testing import utils def test_pipeline_linear_regression_fit_score_predict( diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 5cf9621ef9..fc04956749 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -16,7 +16,7 @@ import pytest from bigframes.ml import llm -from tests.system import utils +from bigframes.testing import utils @pytest.fixture(scope="session") diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 6297d729ea..a282135fa6 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -23,7 +23,7 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal # Need at least 5,000 rows 
to create a vector index. VECTOR_DF = pd.DataFrame( diff --git a/tests/system/small/engines/test_read_local.py b/tests/system/small/engines/test_read_local.py index 0517f45f1a..82af7c984d 100644 --- a/tests/system/small/engines/test_read_local.py +++ b/tests/system/small/engines/test_read_local.py @@ -17,7 +17,7 @@ import bigframes from bigframes.core import identifiers, local_data, nodes from bigframes.session import polars_executor -from tests.system.small.engines.engine_utils import assert_equivalence_execution +from bigframes.testing.engine_utils import assert_equivalence_execution pytest.importorskip("polars") diff --git a/tests/system/small/engines/test_selection.py b/tests/system/small/engines/test_selection.py index 6350e79403..94c8a6463c 100644 --- a/tests/system/small/engines/test_selection.py +++ b/tests/system/small/engines/test_selection.py @@ -16,7 +16,7 @@ from bigframes.core import array_value, expression, identifiers, nodes from bigframes.session import polars_executor -from tests.system.small.engines.engine_utils import assert_equivalence_execution +from bigframes.testing.engine_utils import assert_equivalence_execution pytest.importorskip("polars") diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 7fc7caf2fc..47ab6e2174 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -31,7 +31,7 @@ from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff import bigframes.session._io.bigquery -from tests.system.utils import assert_pandas_df_equal, get_function_name +from bigframes.testing.utils import assert_pandas_df_equal, get_function_name _prefixer = test_utils.prefixer.Prefixer("bigframes", "") diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index 36dd070ef5..51344edcbd 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -31,7 +31,7 @@ import bigframes.geopandas import bigframes.pandas import bigframes.series -from tests.system.utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal @pytest.fixture(scope="session") diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index 96066e5fbe..4840329cda 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -16,7 +16,7 @@ from bigframes.ml import cluster import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal _PD_NEW_PENGUINS = pd.DataFrame.from_dict( { diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 3c5ba9bb18..ef62e5ddd3 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -23,7 +23,7 @@ import bigframes import bigframes.features from bigframes.ml import core -from tests.system import utils +from bigframes.testing import utils def test_model_eval( diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 9eb9b25ea1..10255003a1 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -16,7 +16,7 @@ from bigframes.ml import decomposition import bigframes.pandas as bpd -import tests.system.utils +import bigframes.testing.utils def 
test_pca_predict( @@ -33,7 +33,7 @@ def test_pca_predict( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - tests.system.utils.assert_pandas_df_equal_pca( + bigframes.testing.utils.assert_pandas_df_equal_pca( predictions, expected, check_exact=False, rtol=0.1 ) @@ -161,7 +161,7 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA): .reset_index(drop=True) ) - tests.system.utils.assert_pandas_df_equal_pca_components( + bigframes.testing.utils.assert_pandas_df_equal_pca_components( result, expected, check_exact=False, @@ -180,7 +180,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 1.125354], }, ) - tests.system.utils.assert_pandas_df_equal( + bigframes.testing.utils.assert_pandas_df_equal( result, expected, check_exact=False, @@ -200,7 +200,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): "explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - tests.system.utils.assert_pandas_df_equal( + bigframes.testing.utils.assert_pandas_df_equal( result, expected, check_exact=False, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 3d5453099d..11425400bf 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -22,7 +22,7 @@ from bigframes import exceptions from bigframes.ml import core, llm import bigframes.pandas as bpd -from tests.system import utils +from bigframes.testing import utils @pytest.mark.parametrize( diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py index beee95636f..48a69f522c 100644 --- a/tests/system/small/ml/test_multimodal_llm.py +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -18,7 +18,7 @@ from bigframes.ml import llm import bigframes.pandas as bpd -from tests.system import utils +from bigframes.testing import utils @pytest.mark.flaky(retries=2) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 16b153ab45..34be48be1e 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -19,7 +19,7 @@ import bigframes.features from bigframes.ml import preprocessing -from tests.system import utils +from bigframes.testing import utils ONE_HOT_ENCODED_DTYPE = ( pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index bbecf40e0b..4e2beb9c19 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -20,7 +20,7 @@ import pytest import bigframes.series -from tests.system.utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] DATE_COLUMNS = [ diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 7b39bdebd5..fda01a5dae 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -18,7 +18,7 @@ import pyarrow as pa import pytest -from ...utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal @pytest.mark.parametrize( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 8801faf657..209bc87f9b 100644 --- 
a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -20,8 +20,7 @@ import bigframes.dtypes as dtypes import bigframes.pandas as bpd - -from ...utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal def test_find(scalars_dfs): diff --git a/tests/system/small/regression/test_issue355_merge_after_filter.py b/tests/system/small/regression/test_issue355_merge_after_filter.py index 24ee01cb7f..1c3b6e4fe3 100644 --- a/tests/system/small/regression/test_issue355_merge_after_filter.py +++ b/tests/system/small/regression/test_issue355_merge_after_filter.py @@ -15,7 +15,7 @@ import pandas as pd import pytest -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal @pytest.mark.parametrize( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 18d8fed7dc..946df79cbf 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -33,7 +33,7 @@ import bigframes.dtypes as dtypes import bigframes.pandas as bpd import bigframes.series as series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_dfs_equivalent, assert_pandas_df_equal, assert_series_equal, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 5df7283e3c..afe3b53d6d 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -23,7 +23,7 @@ import pytest import bigframes.dtypes as dtypes -from tests.system import utils +from bigframes.testing import utils try: import pandas_gbq # type: ignore diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 1ba8ed7e09..1f30df451d 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -21,7 +21,7 @@ import bigframes import bigframes.ml.linear_model -from tests.system import utils +from bigframes.testing import utils @pytest.fixture(scope="module") diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index bc2e9cc385..0af173adc8 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal # ================= # DataFrame.groupby diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 7643f5701b..3b9854be26 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -19,7 +19,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_index_equal_ignore_index_type +from bigframes.testing.utils import assert_pandas_index_equal_ignore_index_type def test_index_construct_from_list(): diff --git a/tests/system/small/test_large_local_data.py b/tests/system/small/test_large_local_data.py index eddec37132..0c03a8b6a3 100644 --- a/tests/system/small/test_large_local_data.py +++ b/tests/system/small/test_large_local_data.py @@ -17,7 +17,7 @@ import pytest import bigframes -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal large_dataframe = pd.DataFrame(np.random.rand(10000, 10), dtype="Float64") large_dataframe.index = large_dataframe.index.astype("Int64") diff --git a/tests/system/small/test_multiindex.py 
b/tests/system/small/test_multiindex.py index a01b7aab92..b63468d311 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -17,7 +17,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal def test_multi_index_from_arrays(): diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 491b56d5fc..4e8d3d20f7 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -21,7 +21,7 @@ import pytz import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal @pytest.mark.parametrize( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 10671720af..6760d63a20 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -32,7 +32,7 @@ import bigframes.features import bigframes.pandas import bigframes.series as series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_pandas_df_equal, assert_series_equal, get_first_file_from_wildcard, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 9febb0da42..cbb441e5aa 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -36,7 +36,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model -from tests.system import utils +from bigframes.testing import utils all_write_engines = pytest.mark.parametrize( "write_engine", diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index f6a56af7ff..0825b78037 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -19,7 +19,7 @@ import bigframes.exceptions import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from bigframes.testing.utils import assert_pandas_df_equal, assert_series_equal def test_unordered_mode_sql_no_hash(unordered_session): diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 4d871fd707..83d6a2b881 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -19,7 +19,7 @@ import pytest from bigframes import dtypes -import tests.system.utils +import bigframes.testing.utils CURRENT_DIR = pathlib.Path(__file__).parent DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" @@ -27,7 +27,7 @@ @pytest.fixture(scope="session") def compiler_session(): - from . 
import compiler_session + from bigframes.testing import compiler_session return compiler_session.SQLCompilerSession() @@ -41,7 +41,7 @@ def scalars_types_pandas_df() -> pd.DataFrame: DATA_DIR / "scalars.jsonl", lines=True, ) - tests.system.utils.convert_pandas_dtypes(df, bytes_col=True) + bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) df = df.set_index("rowindex", drop=False) return df diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index 2bda563418..b434e473e9 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -30,7 +30,7 @@ import bigframes.dataframe as dataframe import bigframes.pandas as bpd import bigframes.series as series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_dfs_equivalent, assert_pandas_df_equal, assert_series_equal, From 582bbaf0bf27c8387eae35c663789713184cdf89 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 11 Jun 2025 13:08:07 -0700 Subject: [PATCH 06/23] chore: implement compile_readtable (#1809) * use mocks.create_bigquery_session * chore: implement compile_readtable --- bigframes/core/compile/sqlglot/compiler.py | 11 ++++++ .../core/compile/sqlglot/scalar_compiler.py | 4 ++- bigframes/core/compile/sqlglot/sqlglot_ir.py | 29 +++++++++++++-- bigframes/testing/compiler_session.py | 35 ------------------- bigframes/testing/mocks.py | 8 ++--- tests/unit/core/compile/sqlglot/conftest.py | 21 +++++++++-- .../test_compile_projection/out.sql | 27 ++++++++++---- .../test_compile_readtable/out.sql | 16 +++++++++ .../sqlglot/test_compile_projection.py | 10 ++---- .../compile/sqlglot/test_compile_readtable.py | 24 +++++++++++++ 10 files changed, 125 insertions(+), 60 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_readtable.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 50169d1a8b..7e55c0285f 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -158,6 +158,17 @@ def compile_readlocal(self, node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema, uid_gen=self.uid_gen) + @_compile_node.register + def compile_readtable(self, node: nodes.ReadTableNode, *args): + table = node.source.table + return ir.SQLGlotIR.from_table( + table.project_id, + table.dataset_id, + table.table_id, + col_names=[col.source_id for col in node.scan_list.items], + alias_names=[col.id.sql for col in node.scan_list.items], + ) + @_compile_node.register def compile_selection( self, node: nodes.SelectionNode, child: ir.SQLGlotIR diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 0f059d482c..18d709732a 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -79,6 +79,8 @@ def compile_op_expression(expr: expression.OpExpression): # TODO: add parenthesize for operators -def compile_addop(op: ops.AddOp, left: sge.Expression, right: sge.Expression): +def compile_addop( + op: ops.AddOp, left: sge.Expression, right: sge.Expression +) -> sge.Expression: # TODO: support addop for string dtype. 
return sge.Add(this=left, expression=right) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 23b441591b..fc1a687c71 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -106,6 +106,30 @@ def from_pyarrow( ) return cls(expr=sg.select(sge.Star()).from_(expr), uid_gen=uid_gen) + @classmethod + def from_table( + cls, + project_id: str, + dataset_id: str, + table_id: str, + col_names: typing.Sequence[str], + alias_names: typing.Sequence[str], + ) -> SQLGlotIR: + selections = [ + sge.Alias( + this=sge.to_identifier(col_name, quoted=cls.quoted), + alias=sge.to_identifier(alias_name, quoted=cls.quoted), + ) + for col_name, alias_name in zip(col_names, alias_names) + ] + table_expr = sge.Table( + this=sg.to_identifier(table_id, quoted=cls.quoted), + db=sg.to_identifier(dataset_id, quoted=cls.quoted), + catalog=sg.to_identifier(project_id, quoted=cls.quoted), + ) + select_expr = sge.Select().select(*selections).from_(table_expr) + return cls(expr=select_expr) + @classmethod def from_query_string( cls, @@ -156,9 +180,8 @@ def project( ) for id, expr in projected_cols ] - # TODO: some columns are not able to be projected into the same select. - select_expr = self.expr.select(*projected_cols_expr, append=True) - return SQLGlotIR(expr=select_expr) + new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=False) + return SQLGlotIR(expr=new_expr) def insert( self, diff --git a/bigframes/testing/compiler_session.py b/bigframes/testing/compiler_session.py index 7309349681..35114d95d0 100644 --- a/bigframes/testing/compiler_session.py +++ b/bigframes/testing/compiler_session.py @@ -14,13 +14,10 @@ import dataclasses import typing -import weakref import bigframes.core import bigframes.core.compile.sqlglot as sqlglot -import bigframes.dataframe import bigframes.session.executor -import bigframes.session.metrics @dataclasses.dataclass @@ -44,35 +41,3 @@ def to_sql( return self.compiler.SQLGlotCompiler().compile( array_value.node, ordered=ordered ) - - -class SQLCompilerSession(bigframes.session.Session): - """Session for SQL compilation using sqlglot.""" - - def __init__(self): - # TODO: remove unused attributes. 
- self._location = None # type: ignore - self._bq_kms_key_name = None # type: ignore - self._clients_provider = None # type: ignore - self.ibis_client = None # type: ignore - self._bq_connection = None # type: ignore - self._skip_bq_connection_check = True - self._objects: list[ - weakref.ReferenceType[ - typing.Union[ - bigframes.core.indexes.Index, - bigframes.series.Series, - bigframes.dataframe.DataFrame, - ] - ] - ] = [] - self._strictly_ordered: bool = True - self._allow_ambiguity = False # type: ignore - self._default_index_type = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 - self._metrics = bigframes.session.metrics.ExecutionMetrics() - self._remote_function_session = None # type: ignore - self._temp_storage_manager = None # type: ignore - self._loader = None # type: ignore - - self._session_id: str = "sqlglot_unit_tests_session" - self._executor = SQLCompilerExecutor() diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index 7ddc2e2e6e..25f1f90fe7 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -64,7 +64,7 @@ def create_bigquery_session( if bqclient is None: bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) - bqclient.project = "test-project" + bqclient.project = anonymous_dataset.project bqclient.location = location # Mock the location. @@ -74,9 +74,9 @@ def create_bigquery_session( type(table).created = mock.PropertyMock(return_value=table_time) type(table).location = mock.PropertyMock(return_value=location) type(table).schema = mock.PropertyMock(return_value=table_schema) - type(table).reference = mock.PropertyMock( - return_value=anonymous_dataset.table("test_table") - ) + type(table).project = anonymous_dataset.project + type(table).dataset_id = anonymous_dataset.dataset_id + type(table).table_id = "test_table" type(table).num_rows = mock.PropertyMock(return_value=1000000000) bqclient.get_table.return_value = table diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 83d6a2b881..d9a13ae53f 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -13,12 +13,15 @@ # limitations under the License. import pathlib +import typing +from google.cloud import bigquery import pandas as pd import pyarrow as pa import pytest from bigframes import dtypes +import bigframes.testing.mocks as mocks import bigframes.testing.utils CURRENT_DIR = pathlib.Path(__file__).parent @@ -26,10 +29,24 @@ @pytest.fixture(scope="session") -def compiler_session(): +def compiler_session(basic_types_table_schema): from bigframes.testing import compiler_session - return compiler_session.SQLCompilerSession() + # TODO: Check if ordering mode is needed for the tests. 
+ session = mocks.create_bigquery_session(table_schema=basic_types_table_schema) + session._executor = compiler_session.SQLCompilerExecutor() + return session + + +@pytest.fixture(scope="session") +def basic_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("int64_col", "INTEGER"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + ] @pytest.fixture(scope="session") diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index f5182a380b..8a24b01a25 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -1,11 +1,24 @@ WITH `bfcte_0` AS ( SELECT - *, - `bfcol_0` AS `bfcol_3`, - `bfcol_1` + 1 AS `bfcol_4` - FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0), STRUCT(1, -987654321, 1), STRUCT(2, 314159, 2), STRUCT(3, CAST(NULL AS INT64), 3), STRUCT(4, -234892, 4), STRUCT(5, 55555, 5), STRUCT(6, 101202303, 6), STRUCT(7, -214748367, 7), STRUCT(8, 2, 8)]) + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_5`, + `bfcol_2` AS `bfcol_6`, + `bfcol_3` AS `bfcol_7`, + `bfcol_4` AS `bfcol_8`, + `bfcol_1` + 1 AS `bfcol_9` + FROM `bfcte_0` ) SELECT - `bfcol_3` AS `rowindex`, - `bfcol_4` AS `int64_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_5` AS `rowindex`, + `bfcol_9` AS `int64_col`, + `bfcol_6` AS `string_col`, + `bfcol_7` AS `float64_col`, + `bfcol_8` AS `bool_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql new file mode 100644 index 0000000000..f010f77bf1 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_2` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int64_col`, + `bfcol_2` AS `string_col`, + `bfcol_3` AS `float64_col`, + `bfcol_4` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_projection.py b/tests/unit/core/compile/sqlglot/test_compile_projection.py index be74255649..82e6c60668 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_projection.py +++ b/tests/unit/core/compile/sqlglot/test_compile_projection.py @@ -12,20 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas as pd import pytest import bigframes -import bigframes.pandas as bpd pytest.importorskip("pytest_snapshot") -def test_compile_projection( - scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot -): - bf_df = bpd.DataFrame( - scalars_types_pandas_df[["int64_col"]], session=compiler_session - ) +def test_compile_projection(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") bf_df["int64_col"] = bf_df["int64_col"] + 1 snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py new file mode 100644 index 0000000000..848ace58f3 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes + +pytest.importorskip("pytest_snapshot") + + +def test_compile_readtable(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + snapshot.assert_match(bf_df.sql, "out.sql") From b5867464a5bf30300dcfc069eda546b11f03146c Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 11 Jun 2025 13:25:32 -0700 Subject: [PATCH 07/23] docs: document how to use ai.map() for information extraction (#1808) * doc: document how to use ai.map() for information extraction * fix lint --- bigframes/operations/ai.py | 19 +++++- notebooks/experimental/ai_operators.ipynb | 78 +++++++++++++---------- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 87245d104e..f7a9e6358e 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -117,7 +117,8 @@ def map( attach_logprobs=False, ): """ - Maps the DataFrame with the semantics of the user instruction. + Maps the DataFrame with the semantics of the user instruction. The name of the keys in the output_schema parameter carry + semantic meaning, and can be used for information extraction. **Examples:** @@ -139,6 +140,22 @@ def map( [2 rows x 3 columns] + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + + >>> df = bpd.DataFrame({"text": ["Elmo lives at 123 Sesame Street."]}) + >>> df.ai.map("{text}", model=model, output_schema={"person": "string", "address": "string"}) + text person address + 0 Elmo lives at 123 Sesame Street. Elmo 123 Sesame Street + + [1 rows x 3 columns] + Args: instruction (str): An instruction on how to map the data. 
This value must contain diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index 49a9d798e2..f830787801 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -264,7 +264,7 @@ "id": "hQft3o3OiouS" }, "source": [ - "# API Samples" + "# API Examples" ] }, { @@ -403,7 +403,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -594,7 +594,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -676,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -685,12 +685,30 @@ "id": "PpL24AQFiouS", "outputId": "e7aff038-bf4b-4833-def8-fe2648e8885b" }, + "outputs": [], + "source": [ + "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AI Extraction\n", + "\n", + "AI mapping is also able to extract multiple pieces of information based on your prompt, because the output schema keys can carry semantic meanings:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -716,54 +734,50 @@ " \n", " \n", " \n", - " ingredient_1\n", - " ingredient_2\n", - " food\n", + " text\n", + " person\n", + " address\n", " \n", " \n", " \n", " \n", " 0\n", - " Bun\n", - " Beef Patty\n", - " Burger\n", + " Elmo lives at 123 Sesame Street.\n", + " Elmo\n", + " 123 Sesame Street\n", " \n", " \n", " 1\n", - " Soy Bean\n", - " Bittern\n", - " Tofu\n", - " \n", - " \n", - " 2\n", - " Sausage\n", - " Long Bread\n", - " Hotdog\n", + " 124 Conch Street is SpongeBob's home\n", + " SpongeBob\n", + " 124 Conch Street\n", " \n", " \n", "\n", - "

3 rows × 3 columns

\n", - "[3 rows x 3 columns in total]" + "

2 rows × 3 columns

\n", + "[2 rows x 3 columns in total]" ], "text/plain": [ - " ingredient_1 ingredient_2 food\n", - "0 Bun Beef Patty Burger\n", - "\n", - "1 Soy Bean Bittern Tofu\n", - "\n", - "2 Sausage Long Bread Hotdog\n", - "\n", + " text person address\n", + "0 Elmo lives at 123 Sesame Street. Elmo 123 Sesame Street\n", + "1 124 Conch Street is SpongeBob's home SpongeBob 124 Conch Street\n", "\n", - "[3 rows x 3 columns]" + "[2 rows x 3 columns]" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + "df = bpd.DataFrame({\n", + " \"text\": [\n", + " \"Elmo lives at 123 Sesame Street.\", \n", + " \"124 Conch Street is SpongeBob's home\",\n", + " ]\n", + "})\n", + "df.ai.map(\"{text}\", model=gemini_model, output_schema={\"person\": \"string\", \"address\": \"string\"})" ] }, { From 63205f2565bdfe3833d6b20b912a88ef0599d955 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 11 Jun 2025 14:19:12 -0700 Subject: [PATCH 08/23] refactor: Refactor polars scalar op compiler (#1807) --- bigframes/core/compile/polars/compiler.py | 208 ++++++++++++++-------- bigframes/operations/base_ops.py | 10 +- bigframes/operations/blob_ops.py | 3 +- bigframes/operations/bool_ops.py | 9 +- bigframes/operations/comparison_ops.py | 21 ++- bigframes/operations/date_ops.py | 27 ++- bigframes/operations/datetime_ops.py | 6 +- bigframes/operations/distance_ops.py | 9 +- bigframes/operations/generic_ops.py | 26 +-- bigframes/operations/geo_ops.py | 30 ++-- bigframes/operations/numeric_ops.py | 83 ++++++--- bigframes/operations/string_ops.py | 39 ++-- bigframes/operations/time_ops.py | 12 +- 13 files changed, 317 insertions(+), 166 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index a0e85d8c69..62654c1518 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -29,6 +29,10 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.operations.bool_ops as bool_ops +import bigframes.operations.comparison_ops as comp_ops +import bigframes.operations.generic_ops as gen_ops +import bigframes.operations.numeric_ops as num_ops polars_installed = True if TYPE_CHECKING: @@ -123,84 +127,146 @@ def _( self, expression: ex.OpExpression, ) -> pl.Expr: - # TODO: Complete the implementation, convert to hash dispatch + # TODO: Complete the implementation op = expression.op args = tuple(map(self.compile_expression, expression.inputs)) - if isinstance(op, ops.invert_op.__class__): - return ~args[0] - if isinstance(op, ops.and_op.__class__): - return args[0] & args[1] - if isinstance(op, ops.or_op.__class__): - return args[0] | args[1] - if isinstance(op, ops.add_op.__class__): - return args[0] + args[1] - if isinstance(op, ops.sub_op.__class__): - return args[0] - args[1] - if isinstance(op, ops.mul_op.__class__): - return args[0] * args[1] - if isinstance(op, ops.div_op.__class__): - return args[0] / args[1] - if isinstance(op, ops.floordiv_op.__class__): - # TODO: Handle int // 0 - return args[0] // args[1] - if isinstance(op, (ops.pow_op.__class__, ops.unsafe_pow_op.__class__)): - return args[0] ** args[1] - if isinstance(op, ops.abs_op.__class__): - return args[0].abs() - if isinstance(op, ops.neg_op.__class__): - return args[0].neg() - if 
isinstance(op, ops.pos_op.__class__): - return args[0] - if isinstance(op, ops.ge_op.__class__): - return args[0] >= args[1] - if isinstance(op, ops.gt_op.__class__): - return args[0] > args[1] - if isinstance(op, ops.le_op.__class__): - return args[0] <= args[1] - if isinstance(op, ops.lt_op.__class__): - return args[0] < args[1] - if isinstance(op, ops.eq_op.__class__): - return args[0].eq(args[1]) - if isinstance(op, ops.eq_null_match_op.__class__): - return args[0].eq_missing(args[1]) - if isinstance(op, ops.ne_op.__class__): - return args[0].ne(args[1]) - if isinstance(op, ops.IsInOp): - # TODO: Filter out types that can't be coerced to right type - if op.match_nulls or not any(map(pd.isna, op.values)): - # newer polars version have nulls_equal arg - return args[0].is_in(op.values) - else: - return args[0].is_in(op.values) or args[0].is_null() - if isinstance(op, ops.mod_op.__class__): - return args[0] % args[1] - if isinstance(op, ops.coalesce_op.__class__): - return pl.coalesce(*args) - if isinstance(op, ops.fillna_op.__class__): - return pl.coalesce(*args) - if isinstance(op, ops.isnull_op.__class__): - return args[0].is_null() - if isinstance(op, ops.notnull_op.__class__): - return args[0].is_not_null() - if isinstance(op, ops.CaseWhenOp): - expr = pl.when(args[0]).then(args[1]) - for pred, result in zip(args[2::2], args[3::2]): - expr = expr.when(pred).then(result) # type: ignore - return expr - if isinstance(op, ops.where_op.__class__): - original, condition, otherwise = args - return pl.when(condition).then(original).otherwise(otherwise) - if isinstance(op, ops.AsTypeOp): - return self.astype(args[0], op.to_type, safe=op.safe) + return self.compile_op(op, *args) + @functools.singledispatchmethod + def compile_op(self, op: ops.ScalarOp, *args: pl.Expr) -> pl.Expr: raise NotImplementedError(f"Polars compiler hasn't implemented {op}") - def astype( - self, col: pl.Expr, dtype: bigframes.dtypes.Dtype, safe: bool + @compile_op.register(gen_ops.InvertOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return ~input + + @compile_op.register(num_ops.AbsOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.abs() + + @compile_op.register(num_ops.PosOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.__pos__() + + @compile_op.register(num_ops.NegOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.__neg__() + + @compile_op.register(bool_ops.AndOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input & r_input + + @compile_op.register(bool_ops.OrOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input | r_input + + @compile_op.register(num_ops.AddOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input + r_input + + @compile_op.register(num_ops.SubOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input - r_input + + @compile_op.register(num_ops.MulOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input * r_input + + @compile_op.register(num_ops.DivOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input / r_input + + @compile_op.register(num_ops.FloorDivOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input // r_input + + @compile_op.register(num_ops.FloorDivOp) + def _(self, op: ops.ScalarOp, 
l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input // r_input + + @compile_op.register(num_ops.ModOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input % r_input + + @compile_op.register(num_ops.PowOp) + @compile_op.register(num_ops.UnsafePowOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input**r_input + + @compile_op.register(comp_ops.EqOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input.eq(r_input) + + @compile_op.register(comp_ops.EqNullsMatchOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input.eq_missing(r_input) + + @compile_op.register(comp_ops.NeOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input.ne(r_input) + + @compile_op.register(comp_ops.GtOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input > r_input + + @compile_op.register(comp_ops.GeOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input >= r_input + + @compile_op.register(comp_ops.LtOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input < r_input + + @compile_op.register(comp_ops.LeOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input <= r_input + + @compile_op.register(gen_ops.IsInOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + # TODO: Filter out types that can't be coerced to right type + assert isinstance(op, gen_ops.IsInOp) + if op.match_nulls or not any(map(pd.isna, op.values)): + # newer polars version have nulls_equal arg + return input.is_in(op.values) + else: + return input.is_in(op.values) or input.is_null() + + @compile_op.register(gen_ops.IsNullOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.is_null() + + @compile_op.register(gen_ops.NotNullOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.is_not_null() + + @compile_op.register(gen_ops.FillNaOp) + @compile_op.register(gen_ops.CoalesceOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return pl.coalesce(l_input, r_input) + + @compile_op.register(gen_ops.CaseWhenOp) + def _(self, op: ops.ScalarOp, *inputs: pl.Expr) -> pl.Expr: + expr = pl.when(inputs[0]).then(inputs[1]) + for pred, result in zip(inputs[2::2], inputs[3::2]): + expr = expr.when(pred).then(result) # type: ignore + return expr + + @compile_op.register(gen_ops.WhereOp) + def _( + self, + op: ops.ScalarOp, + original: pl.Expr, + condition: pl.Expr, + otherwise: pl.Expr, ) -> pl.Expr: + return pl.when(condition).then(original).otherwise(otherwise) + + @compile_op.register(gen_ops.AsTypeOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + assert isinstance(op, gen_ops.AsTypeOp) # TODO: Polars casting works differently, need to lower instead to specific conversion ops. - # eg. We want "True" instead of "true" for bool to string. - return col.cast(_DTYPE_MAPPING[dtype], strict=not safe) + # eg. 
We want "True" instead of "true" for bool to strin + return input.cast(_DTYPE_MAPPING[op.to_type], strict=not op.safe) @dataclasses.dataclass(frozen=True) class PolarsAggregateCompiler: diff --git a/bigframes/operations/base_ops.py b/bigframes/operations/base_ops.py index fc92ffe760..c0145a6711 100644 --- a/bigframes/operations/base_ops.py +++ b/bigframes/operations/base_ops.py @@ -180,7 +180,9 @@ def _convert_expr_input( # Operation Factories -def create_unary_op(name: str, type_signature: op_typing.UnaryTypeSignature) -> UnaryOp: +def create_unary_op( + name: str, type_signature: op_typing.UnaryTypeSignature +) -> type[UnaryOp]: return dataclasses.make_dataclass( name, [ @@ -189,12 +191,12 @@ def create_unary_op(name: str, type_signature: op_typing.UnaryTypeSignature) -> ], bases=(UnaryOp,), frozen=True, - )() + ) def create_binary_op( name: str, type_signature: op_typing.BinaryTypeSignature -) -> BinaryOp: +) -> type[BinaryOp]: return dataclasses.make_dataclass( name, [ @@ -203,4 +205,4 @@ def create_binary_op( ], bases=(BinaryOp,), frozen=True, - )() + ) diff --git a/bigframes/operations/blob_ops.py b/bigframes/operations/blob_ops.py index b17d1b1215..2936e0f14f 100644 --- a/bigframes/operations/blob_ops.py +++ b/bigframes/operations/blob_ops.py @@ -19,9 +19,10 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -obj_fetch_metadata_op = base_ops.create_unary_op( +ObjFetchMetadataOp = base_ops.create_unary_op( name="obj_fetch_metadata", type_signature=op_typing.BLOB_TRANSFORM ) +obj_fetch_metadata_op = ObjFetchMetadataOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/bool_ops.py b/bigframes/operations/bool_ops.py index c8cd08efe5..003318f822 100644 --- a/bigframes/operations/bool_ops.py +++ b/bigframes/operations/bool_ops.py @@ -16,8 +16,11 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -and_op = base_ops.create_binary_op(name="and", type_signature=op_typing.LOGICAL) +AndOp = base_ops.create_binary_op(name="and", type_signature=op_typing.LOGICAL) +and_op = AndOp() -or_op = base_ops.create_binary_op(name="or", type_signature=op_typing.LOGICAL) +OrOp = base_ops.create_binary_op(name="or", type_signature=op_typing.LOGICAL) +or_op = OrOp() -xor_op = base_ops.create_binary_op(name="xor", type_signature=op_typing.LOGICAL) +XorOp = base_ops.create_binary_op(name="xor", type_signature=op_typing.LOGICAL) +xor_op = XorOp() diff --git a/bigframes/operations/comparison_ops.py b/bigframes/operations/comparison_ops.py index b109a85d18..4c2911808d 100644 --- a/bigframes/operations/comparison_ops.py +++ b/bigframes/operations/comparison_ops.py @@ -16,18 +16,25 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -eq_op = base_ops.create_binary_op(name="eq", type_signature=op_typing.COMPARISON) +EqOp = base_ops.create_binary_op(name="eq", type_signature=op_typing.COMPARISON) +eq_op = EqOp() -eq_null_match_op = base_ops.create_binary_op( +EqNullsMatchOp = base_ops.create_binary_op( name="eq_nulls_match", type_signature=op_typing.COMPARISON ) +eq_null_match_op = EqNullsMatchOp() -ne_op = base_ops.create_binary_op(name="ne", type_signature=op_typing.COMPARISON) +NeOp = base_ops.create_binary_op(name="ne", type_signature=op_typing.COMPARISON) +ne_op = NeOp() -lt_op = base_ops.create_binary_op(name="lt", type_signature=op_typing.COMPARISON) +LtOp = base_ops.create_binary_op(name="lt", type_signature=op_typing.COMPARISON) +lt_op = LtOp() -gt_op = 
base_ops.create_binary_op(name="gt", type_signature=op_typing.COMPARISON) +GtOp = base_ops.create_binary_op(name="gt", type_signature=op_typing.COMPARISON) +gt_op = GtOp() -le_op = base_ops.create_binary_op(name="le", type_signature=op_typing.COMPARISON) +LeOp = base_ops.create_binary_op(name="le", type_signature=op_typing.COMPARISON) +le_op = LeOp() -ge_op = base_ops.create_binary_op(name="ge", type_signature=op_typing.COMPARISON) +GeOp = base_ops.create_binary_op(name="ge", type_signature=op_typing.COMPARISON) +ge_op = GeOp() diff --git a/bigframes/operations/date_ops.py b/bigframes/operations/date_ops.py index 0b91c86b11..352bc9f93e 100644 --- a/bigframes/operations/date_ops.py +++ b/bigframes/operations/date_ops.py @@ -19,49 +19,58 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -day_op = base_ops.create_unary_op( +DayOp = base_ops.create_unary_op( name="day", type_signature=op_typing.DATELIKE_ACCESSOR, ) +day_op = DayOp() -month_op = base_ops.create_unary_op( +MonthOp = base_ops.create_unary_op( name="month", type_signature=op_typing.DATELIKE_ACCESSOR, ) +month_op = MonthOp() -year_op = base_ops.create_unary_op( +YearOp = base_ops.create_unary_op( name="year", type_signature=op_typing.DATELIKE_ACCESSOR, ) +year_op = YearOp() -iso_day_op = base_ops.create_unary_op( +IsoDayOp = base_ops.create_unary_op( name="iso_day", type_signature=op_typing.DATELIKE_ACCESSOR ) +iso_day_op = IsoDayOp() -iso_week_op = base_ops.create_unary_op( +IsoWeekOp = base_ops.create_unary_op( name="iso_weeek", type_signature=op_typing.DATELIKE_ACCESSOR, ) +iso_week_op = IsoWeekOp() -iso_year_op = base_ops.create_unary_op( +IsoYearOp = base_ops.create_unary_op( name="iso_year", type_signature=op_typing.DATELIKE_ACCESSOR, ) +iso_year_op = IsoYearOp() -dayofweek_op = base_ops.create_unary_op( +DayOfWeekOp = base_ops.create_unary_op( name="dayofweek", type_signature=op_typing.DATELIKE_ACCESSOR, ) +dayofweek_op = DayOfWeekOp() -dayofyear_op = base_ops.create_unary_op( +DayOfYearOp = base_ops.create_unary_op( name="dayofyear", type_signature=op_typing.DATELIKE_ACCESSOR, ) +dayofyear_op = DayOfYearOp() -quarter_op = base_ops.create_unary_op( +QuarterOp = base_ops.create_unary_op( name="quarter", type_signature=op_typing.DATELIKE_ACCESSOR, ) +quarter_op = QuarterOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py index 6e7fb32941..7c760b689b 100644 --- a/bigframes/operations/datetime_ops.py +++ b/bigframes/operations/datetime_ops.py @@ -22,19 +22,21 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -date_op = base_ops.create_unary_op( +DateOp = base_ops.create_unary_op( name="date", type_signature=op_typing.FixedOutputType( dtypes.is_date_like, dtypes.DATE_DTYPE, description="date-like" ), ) +date_op = DateOp() -time_op = base_ops.create_unary_op( +TimeOp = base_ops.create_unary_op( name="time", type_signature=op_typing.FixedOutputType( dtypes.is_time_like, dtypes.TIME_DTYPE, description="time-like" ), ) +time_op = TimeOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/distance_ops.py b/bigframes/operations/distance_ops.py index 74595b561a..ac0863b9e6 100644 --- a/bigframes/operations/distance_ops.py +++ b/bigframes/operations/distance_ops.py @@ -16,14 +16,17 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -cosine_distance_op = base_ops.create_binary_op( +CosineDistanceOp = 
base_ops.create_binary_op( name="ml_cosine_distance", type_signature=op_typing.VECTOR_METRIC ) +cosine_distance_op = CosineDistanceOp() -manhattan_distance_op = base_ops.create_binary_op( +ManhattanDistanceOp = base_ops.create_binary_op( name="ml_manhattan_distance", type_signature=op_typing.VECTOR_METRIC ) +manhattan_distance_op = ManhattanDistanceOp() -euclidean_distance_op = base_ops.create_binary_op( +EuclidDistanceOp = base_ops.create_binary_op( name="ml_euclidean_distance", type_signature=op_typing.VECTOR_METRIC ) +euclidean_distance_op = EuclidDistanceOp() diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops.py index b90a43b091..3c3f9653b4 100644 --- a/bigframes/operations/generic_ops.py +++ b/bigframes/operations/generic_ops.py @@ -20,34 +20,38 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -invert_op = base_ops.create_unary_op( +InvertOp = base_ops.create_unary_op( name="invert", type_signature=op_typing.TypePreserving( dtypes.is_binary_like, description="binary-like", ), ) +invert_op = InvertOp() -isnull_op = base_ops.create_unary_op( +IsNullOp = base_ops.create_unary_op( name="isnull", type_signature=op_typing.FixedOutputType( lambda x: True, dtypes.BOOL_DTYPE, description="nullable" ), ) +isnull_op = IsNullOp() -notnull_op = base_ops.create_unary_op( +NotNullOp = base_ops.create_unary_op( name="notnull", type_signature=op_typing.FixedOutputType( lambda x: True, dtypes.BOOL_DTYPE, description="nullable" ), ) +notnull_op = NotNullOp() -hash_op = base_ops.create_unary_op( +HashOp = base_ops.create_unary_op( name="hash", type_signature=op_typing.FixedOutputType( dtypes.is_string_like, dtypes.INT_DTYPE, description="string-like" ), ) +hash_op = HashOp() @dataclasses.dataclass(frozen=True) @@ -80,15 +84,17 @@ def output_type(self, *input_types): return input_types[0] -fillna_op = base_ops.create_binary_op(name="fillna", type_signature=op_typing.COERCE) +FillNaOp = base_ops.create_binary_op(name="fillna", type_signature=op_typing.COERCE) +fillna_op = FillNaOp() -maximum_op = base_ops.create_binary_op(name="maximum", type_signature=op_typing.COERCE) +MaximumOp = base_ops.create_binary_op(name="maximum", type_signature=op_typing.COERCE) +maximum_op = MaximumOp() -minimum_op = base_ops.create_binary_op(name="minimum", type_signature=op_typing.COERCE) +MinimumOp = base_ops.create_binary_op(name="minimum", type_signature=op_typing.COERCE) +minimum_op = MinimumOp() -coalesce_op = base_ops.create_binary_op( - name="coalesce", type_signature=op_typing.COERCE -) +CoalesceOp = base_ops.create_binary_op(name="coalesce", type_signature=op_typing.COERCE) +coalesce_op = CoalesceOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 1b99e47ab1..0268c63249 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -18,66 +18,76 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -geo_area_op = base_ops.create_unary_op( +GeoAreaOp = base_ops.create_unary_op( name="geo_area", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) +geo_area_op = GeoAreaOp() -geo_st_astext_op = base_ops.create_unary_op( +GeoStAstextOp = base_ops.create_unary_op( name="geo_st_astext", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.STRING_DTYPE, description="geo-like" ), ) +geo_st_astext_op = GeoStAstextOp() -geo_st_boundary_op = 
base_ops.create_unary_op( +GeoStBoundaryOp = base_ops.create_unary_op( name="geo_st_boundary", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.GEO_DTYPE, description="geo-like" ), ) +geo_st_boundary_op = GeoStBoundaryOp() -geo_st_difference_op = base_ops.create_binary_op( +GeoStDifferenceOp = base_ops.create_binary_op( name="geo_st_difference", type_signature=op_typing.BinaryGeo() ) +geo_st_difference_op = GeoStDifferenceOp() -geo_st_geogfromtext_op = base_ops.create_unary_op( +GeoStGeogfromtextOp = base_ops.create_unary_op( name="geo_st_geogfromtext", type_signature=op_typing.FixedOutputType( dtypes.is_string_like, dtypes.GEO_DTYPE, description="string-like" ), ) +geo_st_geogfromtext_op = GeoStGeogfromtextOp() -geo_st_geogpoint_op = base_ops.create_binary_op( +GeoStGeogpointOp = base_ops.create_binary_op( name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() ) +geo_st_geogpoint_op = GeoStGeogpointOp() -geo_st_isclosed_op = base_ops.create_unary_op( +GeoStIsclosedOp = base_ops.create_unary_op( name="geo_st_isclosed", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.BOOL_DTYPE, description="geo-like" ), ) +geo_st_isclosed_op = GeoStIsclosedOp() -geo_x_op = base_ops.create_unary_op( +GeoXOp = base_ops.create_unary_op( name="geo_x", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) +geo_x_op = GeoXOp() -geo_y_op = base_ops.create_unary_op( +GeoYOp = base_ops.create_unary_op( name="geo_y", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) +geo_y_op = GeoYOp() -geo_st_intersection_op = base_ops.create_binary_op( +GeoStIntersectionOp = base_ops.create_binary_op( name="geo_st_intersection", type_signature=op_typing.BinaryGeo() ) +geo_st_intersection_op = GeoStIntersectionOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index b9820cd0ea..64eec9d8a1 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -19,97 +19,118 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -sin_op = base_ops.create_unary_op( +SinOp = base_ops.create_unary_op( name="sin", type_signature=op_typing.UNARY_REAL_NUMERIC ) +sin_op = SinOp() -cos_op = base_ops.create_unary_op( +CosOp = base_ops.create_unary_op( name="cos", type_signature=op_typing.UNARY_REAL_NUMERIC ) +cos_op = CosOp() -tan_op = base_ops.create_unary_op( +TanOp = base_ops.create_unary_op( name="tan", type_signature=op_typing.UNARY_REAL_NUMERIC ) +tan_op = TanOp() -arcsin_op = base_ops.create_unary_op( +ArcsinOp = base_ops.create_unary_op( name="arcsin", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arcsin_op = ArcsinOp() -arccos_op = base_ops.create_unary_op( +ArccosOp = base_ops.create_unary_op( name="arccos", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arccos_op = ArccosOp() -arctan_op = base_ops.create_unary_op( +ArctanOp = base_ops.create_unary_op( name="arctan", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arctan_op = ArctanOp() -sinh_op = base_ops.create_unary_op( +SinhOp = base_ops.create_unary_op( name="sinh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +sinh_op = SinhOp() -cosh_op = base_ops.create_unary_op( +CoshOp = base_ops.create_unary_op( name="cosh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +cosh_op = CoshOp() -tanh_op = base_ops.create_unary_op( +TanhOp = base_ops.create_unary_op( name="tanh", 
type_signature=op_typing.UNARY_REAL_NUMERIC ) +tanh_op = TanhOp() -arcsinh_op = base_ops.create_unary_op( +ArcsinhOp = base_ops.create_unary_op( name="arcsinh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arcsinh_op = ArcsinhOp() -arccosh_op = base_ops.create_unary_op( +ArccoshOp = base_ops.create_unary_op( name="arccosh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arccosh_op = ArccoshOp() -arctanh_op = base_ops.create_unary_op( +ArctanhOp = base_ops.create_unary_op( name="arctanh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arctanh_op = ArctanhOp() -floor_op = base_ops.create_unary_op( +FloorOp = base_ops.create_unary_op( name="floor", type_signature=op_typing.UNARY_REAL_NUMERIC ) +floor_op = FloorOp() -ceil_op = base_ops.create_unary_op( +CeilOp = base_ops.create_unary_op( name="ceil", type_signature=op_typing.UNARY_REAL_NUMERIC ) +ceil_op = CeilOp() -abs_op = base_ops.create_unary_op( +AbsOp = base_ops.create_unary_op( name="abs", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA ) +abs_op = AbsOp() -pos_op = base_ops.create_unary_op( +PosOp = base_ops.create_unary_op( name="pos", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA ) +pos_op = PosOp() -neg_op = base_ops.create_unary_op( +NegOp = base_ops.create_unary_op( name="neg", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA ) +neg_op = NegOp() -exp_op = base_ops.create_unary_op( +ExpOp = base_ops.create_unary_op( name="exp", type_signature=op_typing.UNARY_REAL_NUMERIC ) +exp_op = ExpOp() -expm1_op = base_ops.create_unary_op( +Expm1Op = base_ops.create_unary_op( name="expm1", type_signature=op_typing.UNARY_REAL_NUMERIC ) +expm1_op = Expm1Op() -ln_op = base_ops.create_unary_op( - name="log", type_signature=op_typing.UNARY_REAL_NUMERIC -) +LnOp = base_ops.create_unary_op(name="log", type_signature=op_typing.UNARY_REAL_NUMERIC) +ln_op = LnOp() -log10_op = base_ops.create_unary_op( +Log10Op = base_ops.create_unary_op( name="log10", type_signature=op_typing.UNARY_REAL_NUMERIC ) +log10_op = Log10Op() -log1p_op = base_ops.create_unary_op( +Log1pOp = base_ops.create_unary_op( name="log1p", type_signature=op_typing.UNARY_REAL_NUMERIC ) +log1p_op = Log1pOp() -sqrt_op = base_ops.create_unary_op( +SqrtOp = base_ops.create_unary_op( name="sqrt", type_signature=op_typing.UNARY_REAL_NUMERIC ) +sqrt_op = SqrtOp() @dataclasses.dataclass(frozen=True) @@ -282,16 +303,20 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT mod_op = ModOp() -pow_op = base_ops.create_binary_op(name="pow", type_signature=op_typing.BINARY_NUMERIC) +PowOp = base_ops.create_binary_op(name="pow", type_signature=op_typing.BINARY_NUMERIC) +pow_op = PowOp() -arctan2_op = base_ops.create_binary_op( +Arctan2Op = base_ops.create_binary_op( name="arctan2", type_signature=op_typing.BINARY_REAL_NUMERIC ) +arctan2_op = Arctan2Op() -round_op = base_ops.create_binary_op( +RoundOp = base_ops.create_binary_op( name="round", type_signature=op_typing.BINARY_NUMERIC ) +round_op = RoundOp() -unsafe_pow_op = base_ops.create_binary_op( +UnsafePowOp = base_ops.create_binary_op( name="unsafe_pow_op", type_signature=op_typing.BINARY_REAL_NUMERIC ) +unsafe_pow_op = UnsafePowOp() diff --git a/bigframes/operations/string_ops.py b/bigframes/operations/string_ops.py index a2755f6654..f937ed23b6 100644 --- a/bigframes/operations/string_ops.py +++ b/bigframes/operations/string_ops.py @@ -22,60 +22,73 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -len_op = base_ops.create_unary_op( +LenOp = 
base_ops.create_unary_op( name="len", type_signature=op_typing.FixedOutputType( dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable" ), ) +len_op = LenOp() -reverse_op = base_ops.create_unary_op( +ReverseOp = base_ops.create_unary_op( name="reverse", type_signature=op_typing.STRING_TRANSFORM ) +reverse_op = ReverseOp() -lower_op = base_ops.create_unary_op( +LowerOp = base_ops.create_unary_op( name="lower", type_signature=op_typing.STRING_TRANSFORM ) +lower_op = LowerOp() -upper_op = base_ops.create_unary_op( +UpperOp = base_ops.create_unary_op( name="upper", type_signature=op_typing.STRING_TRANSFORM ) +upper_op = UpperOp() -isalnum_op = base_ops.create_unary_op( +IsAlnumOp = base_ops.create_unary_op( name="isalnum", type_signature=op_typing.STRING_PREDICATE ) +isalnum_op = IsAlnumOp() -isalpha_op = base_ops.create_unary_op( +IsAlphaOp = base_ops.create_unary_op( name="isalpha", type_signature=op_typing.STRING_PREDICATE ) +isalpha_op = IsAlphaOp() -isdecimal_op = base_ops.create_unary_op( +IsDecimalOp = base_ops.create_unary_op( name="isdecimal", type_signature=op_typing.STRING_PREDICATE ) +isdecimal_op = IsDecimalOp() -isdigit_op = base_ops.create_unary_op( +IsDigitOp = base_ops.create_unary_op( name="isdigit", type_signature=op_typing.STRING_PREDICATE ) +isdigit_op = IsDigitOp() -isnumeric_op = base_ops.create_unary_op( +IsNumericOp = base_ops.create_unary_op( name="isnumeric", type_signature=op_typing.STRING_PREDICATE ) +isnumeric_op = IsNumericOp() -isspace_op = base_ops.create_unary_op( +IsSpaceOp = base_ops.create_unary_op( name="isspace", type_signature=op_typing.STRING_PREDICATE ) +isspace_op = IsSpaceOp() -islower_op = base_ops.create_unary_op( +IsLowerOp = base_ops.create_unary_op( name="islower", type_signature=op_typing.STRING_PREDICATE ) +islower_op = IsLowerOp() -isupper_op = base_ops.create_unary_op( +IsUpperOp = base_ops.create_unary_op( name="isupper", type_signature=op_typing.STRING_PREDICATE ) +isupper_op = IsUpperOp() -capitalize_op = base_ops.create_unary_op( +CapitalizeOp = base_ops.create_unary_op( name="capitalize", type_signature=op_typing.STRING_TRANSFORM ) +capitalize_op = CapitalizeOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/time_ops.py b/bigframes/operations/time_ops.py index a6a65ad80e..bf6fa3e7d1 100644 --- a/bigframes/operations/time_ops.py +++ b/bigframes/operations/time_ops.py @@ -16,25 +16,29 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -hour_op = base_ops.create_unary_op( +HourOp = base_ops.create_unary_op( name="hour", type_signature=op_typing.TIMELIKE_ACCESSOR, ) +hour_op = HourOp() -minute_op = base_ops.create_unary_op( +MinuteOp = base_ops.create_unary_op( name="minute", type_signature=op_typing.TIMELIKE_ACCESSOR, ) +minute_op = MinuteOp() -second_op = base_ops.create_unary_op( +SecondOp = base_ops.create_unary_op( name="second", type_signature=op_typing.TIMELIKE_ACCESSOR, ) +second_op = SecondOp() -normalize_op = base_ops.create_unary_op( +NormalizeOp = base_ops.create_unary_op( name="normalize", type_signature=op_typing.TypePreserving( dtypes.is_time_like, description="time-like", ), ) +normalize_op = NormalizeOp() From e403528b9241e4bd0ad9a09dc0c1cd8e8f8437d8 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 11 Jun 2025 14:58:10 -0700 Subject: [PATCH 09/23] chore: add snippet tests for type system doc (#1783) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: add snippet tests for type system doc * fix format * 
fix more lint * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * add tests for snippets * fix lint * try to fix tests with typo * restore project in set_options test * use options.reset(): * put global options setting in a try-finally block * warn about json type and remove json type output from the comment * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * polish comments * Update samples/snippets/type_system_test.py Co-authored-by: Tim Sweña (Swast) * remove json samples * remove json samples --------- Co-authored-by: Owl Bot Co-authored-by: Tim Sweña (Swast) --- samples/snippets/set_options_test.py | 44 ++--- samples/snippets/type_system_test.py | 235 +++++++++++++++++++++++++++ 2 files changed, 259 insertions(+), 20 deletions(-) create mode 100644 samples/snippets/type_system_test.py diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py index 3dea524a17..6007dcbb38 100644 --- a/samples/snippets/set_options_test.py +++ b/samples/snippets/set_options_test.py @@ -19,23 +19,27 @@ def test_bigquery_dataframes_set_options() -> None: bpd.close_session() - # [START bigquery_dataframes_set_options] - import bigframes.pandas as bpd - - PROJECT_ID = "bigframes-dec" # @param {type:"string"} - REGION = "US" # @param {type:"string"} - - # Set BigQuery DataFrames options - # Note: The project option is not required in all environments. - # On BigQuery Studio, the project ID is automatically detected. - bpd.options.bigquery.project = PROJECT_ID - - # Note: The location option is not required. - # It defaults to the location of the first table or query - # passed to read_gbq(). For APIs where a location can't be - # auto-detected, the location defaults to the "US" location. - bpd.options.bigquery.location = REGION - - # [END bigquery_dataframes_set_options] - assert bpd.options.bigquery.project == PROJECT_ID - assert bpd.options.bigquery.location == REGION + try: + # [START bigquery_dataframes_set_options] + import bigframes.pandas as bpd + + PROJECT_ID = "bigframes-dev" # @param {type:"string"} + REGION = "US" # @param {type:"string"} + + # Set BigQuery DataFrames options + # Note: The project option is not required in all environments. + # On BigQuery Studio, the project ID is automatically detected. + bpd.options.bigquery.project = PROJECT_ID + + # Note: The location option is not required. + # It defaults to the location of the first table or query + # passed to read_gbq(). For APIs where a location can't be + # auto-detected, the location defaults to the "US" location. + bpd.options.bigquery.location = REGION + + # [END bigquery_dataframes_set_options] + assert bpd.options.bigquery.project == PROJECT_ID + assert bpd.options.bigquery.location == REGION + finally: + bpd.close_session() + bpd.options.reset() diff --git a/samples/snippets/type_system_test.py b/samples/snippets/type_system_test.py new file mode 100644 index 0000000000..88b9e74742 --- /dev/null +++ b/samples/snippets/type_system_test.py @@ -0,0 +1,235 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas.testing + +from bigframes import dtypes + + +def test_type_system_examples() -> None: + # [START bigquery_dataframes_type_sytem_timestamp_local_type_conversion] + import pandas as pd + + import bigframes.pandas as bpd + + s = pd.Series([pd.Timestamp("20250101")]) + assert s.dtype == "datetime64[ns]" + assert bpd.read_pandas(s).dtype == "timestamp[us][pyarrow]" + # [END bigquery_dataframes_type_sytem_timestamp_local_type_conversion] + + # [START bigquery_dataframes_type_system_pyarrow_preference] + import datetime + + import pandas as pd + + import bigframes.pandas as bpd + + s = pd.Series([datetime.date(2025, 1, 1)]) + s + pd.Timedelta(hours=12) + # 0 2025-01-01 + # dtype: object + + bpd.read_pandas(s) + pd.Timedelta(hours=12) + # 0 2025-01-01 12:00:00 + # dtype: timestamp[us][pyarrow] + # [END bigquery_dataframes_type_system_pyarrow_preference] + pandas.testing.assert_series_equal( + s + pd.Timedelta(hours=12), pd.Series([datetime.date(2025, 1, 1)]) + ) + pandas.testing.assert_series_equal( + (bpd.read_pandas(s) + pd.Timedelta(hours=12)).to_pandas(), + pd.Series([pd.Timestamp(2025, 1, 1, 12)], dtype=dtypes.DATETIME_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_load_timedelta] + import pandas as pd + + import bigframes.pandas as bpd + + s = pd.Series([pd.Timedelta("1s"), pd.Timedelta("2m")]) + bpd.read_pandas(s) + # 0 0 days 00:00:01 + # 1 0 days 00:02:00 + # dtype: duration[us][pyarrow] + # [END bigquery_dataframes_type_system_load_timedelta] + pandas.testing.assert_series_equal( + bpd.read_pandas(s).to_pandas(), + s.astype(dtypes.TIMEDELTA_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_timedelta_precision] + import pandas as pd + + s = pd.Series([pd.Timedelta("999ns")]) + bpd.read_pandas(s.dt.round("us")) + # 0 0 days 00:00:00.000001 + # dtype: duration[us][pyarrow] + # [END bigquery_dataframes_type_system_timedelta_precision] + pandas.testing.assert_series_equal( + bpd.read_pandas(s.dt.round("us")).to_pandas(), + s.dt.round("us").astype(dtypes.TIMEDELTA_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_cast_timedelta] + import bigframes.pandas as bpd + + bpd.to_timedelta([1, 2, 3], unit="s") + # 0 0 days 00:00:01 + # 1 0 days 00:00:02 + # 2 0 days 00:00:03 + # dtype: duration[us][pyarrow] + # [END bigquery_dataframes_type_system_cast_timedelta] + pandas.testing.assert_series_equal( + bpd.to_timedelta([1, 2, 3], unit="s").to_pandas(), + pd.Series(pd.to_timedelta([1, 2, 3], unit="s"), dtype=dtypes.TIMEDELTA_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_list_accessor] + import bigframes.pandas as bpd + + s = bpd.Series([[1, 2, 3], [4, 5], [6]]) # dtype: list[pyarrow] + + # Access the first elements of each list + s.list[0] + # 0 1 + # 1 4 + # 2 6 + # dtype: Int64 + + # Get the lengths of each list + s.list.len() + # 0 3 + # 1 2 + # 2 1 + # dtype: Int64 + # [END bigquery_dataframes_type_system_list_accessor] + pandas.testing.assert_series_equal( + s.list[0].to_pandas(), + pd.Series([1, 4, 6], dtype="Int64"), + 
check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.list.len().to_pandas(), + pd.Series([3, 2, 1], dtype="Int64"), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_struct_accessor] + import bigframes.pandas as bpd + + structs = [ + {"id": 101, "category": "A"}, + {"id": 102, "category": "B"}, + {"id": 103, "category": "C"}, + ] + s = bpd.Series(structs) + # Get the 'id' field of each struct + s.struct.field("id") + # 0 101 + # 1 102 + # 2 103 + # Name: id, dtype: Int64 + # [END bigquery_dataframes_type_system_struct_accessor] + + # [START bigquery_dataframes_type_system_struct_accessor_shortcut] + import bigframes.pandas as bpd + + structs = [ + {"id": 101, "category": "A"}, + {"id": 102, "category": "B"}, + {"id": 103, "category": "C"}, + ] + s = bpd.Series(structs) + + # not explicitly using the "struct" property + s.id + # 0 101 + # 1 102 + # 2 103 + # Name: id, dtype: Int64 + # [END bigquery_dataframes_type_system_struct_accessor_shortcut] + pandas.testing.assert_series_equal( + s.struct.field("id").to_pandas(), + pd.Series([101, 102, 103], dtype="Int64", name="id"), + check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.id.to_pandas(), + pd.Series([101, 102, 103], dtype="Int64", name="id"), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_string_accessor] + import bigframes.pandas as bpd + + s = bpd.Series(["abc", "de", "1"]) # dtype: string[pyarrow] + + # Get the first character of each string + s.str[0] + # 0 a + # 1 d + # 2 1 + # dtype: string + + # Check whether there are only alphabetic characters in each string + s.str.isalpha() + # 0 True + # 1 True + # 2 False + # dtype: boolean + + # Cast the alphabetic characters to their upper cases for each string + s.str.upper() + # 0 ABC + # 1 DE + # 2 1 + # dtype: string + # [END bigquery_dataframes_type_system_string_accessor] + pandas.testing.assert_series_equal( + s.str[0].to_pandas(), + pd.Series(["a", "d", "1"], dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.str.isalpha().to_pandas(), + pd.Series([True, True, False], dtype=dtypes.BOOL_DTYPE), + check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.str.upper().to_pandas(), + pd.Series(["ABC", "DE", "1"], dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_geo_accessor] + from shapely.geometry import Point + + import bigframes.pandas as bpd + + s = bpd.Series([Point(1, 0), Point(2, 1)]) # dtype: geometry + + s.geo.y + # 0 0.0 + # 1 1.0 + # dtype: Float64 + # [END bigquery_dataframes_type_system_geo_accessor] + pandas.testing.assert_series_equal( + s.geo.y.to_pandas(), + pd.Series([0.0, 1.0], dtype=dtypes.FLOAT_DTYPE), + check_index_type=False, + ) From 855031a316a6957731a5d1c5e59dedb9757d9f7a Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 12 Jun 2025 09:58:01 -0700 Subject: [PATCH 10/23] fix: correct read_csv behaviours with use_cols, names, index_col (#1804) * fix: correct read_csv behaviours with use_cols, names, index_col parameters * fix test_default_index_warning_not_raised_by_read_gbq_primary_key * refactor read_gbq_table for more readable * fix presubmit --- .../session/_io/bigquery/read_gbq_table.py | 18 +- bigframes/session/loader.py | 245 +++++++++++------- tests/system/small/test_session.py | 130 +++++++++- 3 files changed, 283 insertions(+), 110 deletions(-) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py 
b/bigframes/session/_io/bigquery/read_gbq_table.py index 2dff16933f..6322040428 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -243,25 +243,17 @@ def get_index_cols( | int | bigframes.enums.DefaultIndexKind, *, - names: Optional[Iterable[str]] = None, + rename_to_schema: Optional[Dict[str, str]] = None, ) -> List[str]: """ If we can get a total ordering from the table, such as via primary key column(s), then return those too so that ordering generation can be avoided. """ - # Transform index_col -> index_cols so we have a variable that is # always a list of column names (possibly empty). schema_len = len(table.schema) - # If the `names` is provided, the index_col provided by the user is the new - # name, so we need to rename it to the original name in the table schema. - renamed_schema: Optional[Dict[str, str]] = None - if names is not None: - assert len(list(names)) == schema_len - renamed_schema = {name: field.name for name, field in zip(names, table.schema)} - index_cols: List[str] = [] if isinstance(index_col, bigframes.enums.DefaultIndexKind): if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: @@ -278,8 +270,8 @@ def get_index_cols( f"Got unexpected index_col {repr(index_col)}. {constants.FEEDBACK_LINK}" ) elif isinstance(index_col, str): - if renamed_schema is not None: - index_col = renamed_schema.get(index_col, index_col) + if rename_to_schema is not None: + index_col = rename_to_schema.get(index_col, index_col) index_cols = [index_col] elif isinstance(index_col, int): if not 0 <= index_col < schema_len: @@ -291,8 +283,8 @@ def get_index_cols( elif isinstance(index_col, Iterable): for item in index_col: if isinstance(item, str): - if renamed_schema is not None: - item = renamed_schema.get(item, item) + if rename_to_schema is not None: + item = rename_to_schema.get(item, item) index_cols.append(item) elif isinstance(item, int): if not 0 <= item < schema_len: diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 814d44292e..add4efb6ab 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -96,22 +96,35 @@ def _to_index_cols( return index_cols -def _check_column_duplicates( - index_cols: Iterable[str], columns: Iterable[str], index_col_in_columns: bool -) -> Iterable[str]: - """Validates and processes index and data columns for duplicates and overlap. +def _check_duplicates(name: str, columns: Optional[Iterable[str]] = None): + """Check for duplicate column names in the provided iterable.""" + if columns is None: + return + columns_list = list(columns) + set_columns = set(columns_list) + if len(columns_list) > len(set_columns): + raise ValueError( + f"The '{name}' argument contains duplicate names. " + f"All column names specified in '{name}' must be unique." + ) - This function performs two main tasks: - 1. Ensures there are no duplicate column names within the `index_cols` list - or within the `columns` list. - 2. Based on the `index_col_in_columns` flag, it validates the relationship - between `index_cols` and `columns`. + +def _check_index_col_param( + index_cols: Iterable[str], + columns: Iterable[str], + *, + table_columns: Optional[Iterable[str]] = None, + index_col_in_columns: Optional[bool] = False, +): + """Checks for duplicates in `index_cols` and resolves overlap with `columns`. Args: index_cols (Iterable[str]): - An iterable of column names designated as the index. + Column names designated as the index columns. 
columns (Iterable[str]): - An iterable of column names designated as the data columns. + Used column names from table_columns. + table_columns (Iterable[str]): + A full list of column names in the table schema. index_col_in_columns (bool): A flag indicating how to handle overlap between `index_cols` and `columns`. @@ -121,40 +134,97 @@ def _check_column_duplicates( `columns`. An error is raised if an index column is not found in the `columns` list. """ - index_cols_list = list(index_cols) if index_cols is not None else [] - columns_list = list(columns) if columns is not None else [] - set_index = set(index_cols_list) - set_columns = set(columns_list) + _check_duplicates("index_col", index_cols) - if len(index_cols_list) > len(set_index): - raise ValueError( - "The 'index_col' argument contains duplicate names. " - "All column names specified in 'index_col' must be unique." - ) + if columns is not None and len(list(columns)) > 0: + set_index = set(list(index_cols) if index_cols is not None else []) + set_columns = set(list(columns) if columns is not None else []) - if len(columns_list) == 0: - return columns + if index_col_in_columns: + if not set_index.issubset(set_columns): + raise ValueError( + f"The specified index column(s) were not found: {set_index - set_columns}. " + f"Available columns are: {set_columns}" + ) + else: + if not set_index.isdisjoint(set_columns): + raise ValueError( + "Found column names that exist in both 'index_col' and 'columns' arguments. " + "These arguments must specify distinct sets of columns." + ) - if len(columns_list) > len(set_columns): - raise ValueError( - "The 'columns' argument contains duplicate names. " - "All column names specified in 'columns' must be unique." - ) + if not index_col_in_columns and table_columns is not None: + for key in index_cols: + if key not in table_columns: + possibility = min( + table_columns, + key=lambda item: bigframes._tools.strings.levenshtein_distance( + key, item + ), + ) + raise ValueError( + f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?" + ) - if index_col_in_columns: - if not set_index.issubset(set_columns): - raise ValueError( - f"The specified index column(s) were not found: {set_index - set_columns}. " - f"Available columns are: {set_columns}" + +def _check_columns_param(columns: Iterable[str], table_columns: Iterable[str]): + """Validates that the specified columns are present in the table columns. + + Args: + columns (Iterable[str]): + Used column names from table_columns. + table_columns (Iterable[str]): + A full list of column names in the table schema. + Raises: + ValueError: If any column in `columns` is not found in the table columns. + """ + for column_name in columns: + if column_name not in table_columns: + possibility = min( + table_columns, + key=lambda item: bigframes._tools.strings.levenshtein_distance( + column_name, item + ), ) - return [col for col in columns if col not in set_index] - else: - if not set_index.isdisjoint(set_columns): raise ValueError( - "Found column names that exist in both 'index_col' and 'columns' arguments. " - "These arguments must specify distinct sets of columns." + f"Column '{column_name}' is not found. Did you mean '{possibility}'?" 
) - return columns + + +def _check_names_param( + names: Iterable[str], + index_col: Iterable[str] + | str + | Iterable[int] + | int + | bigframes.enums.DefaultIndexKind, + columns: Iterable[str], + table_columns: Iterable[str], +): + len_names = len(list(names)) + len_table_columns = len(list(table_columns)) + len_columns = len(list(columns)) + if len_names > len_table_columns: + raise ValueError( + f"Too many columns specified: expected {len_table_columns}" + f" and found {len_names}" + ) + elif len_names < len_table_columns: + if isinstance(index_col, bigframes.enums.DefaultIndexKind) or index_col != (): + raise KeyError( + "When providing both `index_col` and `names`, ensure the " + "number of `names` matches the number of columns in your " + "data." + ) + if len_columns != 0: + # The 'columns' must be identical to the 'names'. If not, raise an error. + if len_columns != len_names: + raise ValueError( + "Number of passed names did not match number of header " + "fields in the file" + ) + if set(list(names)) != set(list(columns)): + raise ValueError("Usecols do not match columns") @dataclasses.dataclass @@ -545,11 +615,14 @@ def read_gbq_table( f"`max_results` should be a positive number, got {max_results}." ) + _check_duplicates("columns", columns) + table_ref = google.cloud.bigquery.table.TableReference.from_string( table_id, default_project=self._bqclient.project ) columns = list(columns) + include_all_columns = columns is None or len(columns) == 0 filters = typing.cast(list, list(filters)) # --------------------------------- @@ -563,72 +636,58 @@ def read_gbq_table( cache=self._df_snapshot, use_cache=use_cache, ) - table_column_names = {field.name for field in table.schema} if table.location.casefold() != self._storage_manager.location.casefold(): raise ValueError( f"Current session is in {self._storage_manager.location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" ) - for key in columns: - if key not in table_column_names: - possibility = min( - table_column_names, - key=lambda item: bigframes._tools.strings.levenshtein_distance( - key, item - ), - ) - raise ValueError( - f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?" - ) - - # TODO(b/408499371): check `names` work with `use_cols` for read_csv method. + table_column_names = [field.name for field in table.schema] + rename_to_schema: Optional[Dict[str, str]] = None if names is not None: + _check_names_param(names, index_col, columns, table_column_names) + + # Additional unnamed columns is going to set as index columns len_names = len(list(names)) - len_columns = len(table.schema) - if len_names > len_columns: - raise ValueError( - f"Too many columns specified: expected {len_columns}" - f" and found {len_names}" - ) - elif len_names < len_columns: - if ( - isinstance(index_col, bigframes.enums.DefaultIndexKind) - or index_col != () - ): - raise KeyError( - "When providing both `index_col` and `names`, ensure the " - "number of `names` matches the number of columns in your " - "data." 
- ) - index_col = range(len_columns - len_names) + len_schema = len(table.schema) + if len(columns) == 0 and len_names < len_schema: + index_col = range(len_schema - len_names) names = [ - field.name for field in table.schema[: len_columns - len_names] + field.name for field in table.schema[: len_schema - len_names] ] + list(names) + assert len_schema >= len_names + assert len_names >= len(columns) + + table_column_names = table_column_names[: len(list(names))] + rename_to_schema = dict(zip(list(names), table_column_names)) + + if len(columns) != 0: + if names is None: + _check_columns_param(columns, table_column_names) + else: + _check_columns_param(columns, names) + names = columns + assert rename_to_schema is not None + columns = [rename_to_schema[renamed_name] for renamed_name in columns] + # Converting index_col into a list of column names requires # the table metadata because we might use the primary keys # when constructing the index. index_cols = bf_read_gbq_table.get_index_cols( table=table, index_col=index_col, - names=names, + rename_to_schema=rename_to_schema, ) - columns = list( - _check_column_duplicates(index_cols, columns, index_col_in_columns) + _check_index_col_param( + index_cols, + columns, + table_columns=table_column_names, + index_col_in_columns=index_col_in_columns, ) - - for key in index_cols: - if key not in table_column_names: - possibility = min( - table_column_names, - key=lambda item: bigframes._tools.strings.levenshtein_distance( - key, item - ), - ) - raise ValueError( - f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?" - ) + if index_col_in_columns and not include_all_columns: + set_index = set(list(index_cols) if index_cols is not None else []) + columns = [col for col in columns if col not in set_index] # ----------------------------- # Optionally, execute the query @@ -715,7 +774,7 @@ def read_gbq_table( metadata_only=not self._scan_index_uniqueness, ) schema = schemata.ArraySchema.from_bq_table(table) - if columns: + if not include_all_columns: schema = schema.select(index_cols + columns) array_value = core.ArrayValue.from_table( table, @@ -767,14 +826,14 @@ def read_gbq_table( value_columns = [col for col in array_value.column_ids if col not in index_cols] if names is not None: - renamed_cols: Dict[str, str] = { - col: new_name for col, new_name in zip(array_value.column_ids, names) - } + assert rename_to_schema is not None + schema_to_rename = {value: key for key, value in rename_to_schema.items()} if index_col != bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: index_names = [ - renamed_cols.get(index_col, index_col) for index_col in index_cols + schema_to_rename.get(index_col, index_col) + for index_col in index_cols ] - value_columns = [renamed_cols.get(col, col) for col in value_columns] + value_columns = [schema_to_rename.get(col, col) for col in value_columns] block = blocks.Block( array_value, @@ -898,9 +957,7 @@ def read_gbq_query( ) index_cols = _to_index_cols(index_col) - columns = _check_column_duplicates( - index_cols, columns, index_col_in_columns=False - ) + _check_index_col_param(index_cols, columns) filters_copy1, filters_copy2 = itertools.tee(filters) has_filters = len(list(filters_copy1)) != 0 diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index cbb441e5aa..809d08c6c1 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -146,9 +146,7 @@ def test_read_gbq_w_unknown_column( ): with pytest.raises( ValueError, - 
match=re.escape( - "Column 'int63_col' of `columns` not found in this table. Did you mean 'int64_col'?" - ), + match=re.escape("Column 'int63_col' is not found. Did you mean 'int64_col'?"), ): session.read_gbq( scalars_table_id, @@ -1365,6 +1363,132 @@ def test_read_csv_for_names_and_index_col( ) +@pytest.mark.parametrize( + "usecols", + [ + pytest.param(["a", "b", "c"], id="same"), + pytest.param(["a", "c"], id="less_than_names"), + ], +) +def test_read_csv_for_names_and_usecols( + session, usecols, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + bf_df = session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + path, names=names, usecols=usecols, dtype=bf_df.dtypes.to_dict() + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index(names[0]).sort_index() + pd_df = pd_df.set_index(names[0]) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_and_invalid_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + usecols = ["a", "X"] + with pytest.raises( + ValueError, + match=re.escape("Column 'X' is not found. "), + ): + session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + +@pytest.mark.parametrize( + ("usecols", "index_col"), + [ + pytest.param(["a", "b", "c"], "a", id="same"), + pytest.param(["a", "b", "c"], ["a", "b"], id="same_two_index"), + pytest.param(["a", "c"], 0, id="less_than_names"), + ], +) +def test_read_csv_for_names_and_usecols_and_indexcol( + session, usecols, index_col, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + bf_df = session.read_csv( + path, engine="bigquery", names=names, usecols=usecols, index_col=index_col + ) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + path, + names=names, + usecols=usecols, + index_col=index_col, + dtype=bf_df.reset_index().dtypes.to_dict(), + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_less_than_columns_and_same_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "c"] + usecols = ["a", "c"] + bf_df = session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + path, names=names, usecols=usecols, dtype=bf_df.dtypes.to_dict() + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. 
+ bf_df = bf_df.set_index(names[0]).sort_index() + pd_df = pd_df.set_index(names[0]) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_less_than_columns_and_mismatched_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "b"] + usecols = ["a"] + with pytest.raises( + ValueError, + match=re.escape("Number of passed names did not match number"), + ): + session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + +def test_read_csv_for_names_less_than_columns_and_different_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "b"] + usecols = ["a", "c"] + with pytest.raises( + ValueError, + match=re.escape("Usecols do not match columns"), + ): + session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + def test_read_csv_for_dtype(session, df_and_gcs_csv_for_two_columns): _, path = df_and_gcs_csv_for_two_columns From e586151df81917b49f702ae496aaacbd02931636 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 12 Jun 2025 15:15:44 -0700 Subject: [PATCH 11/23] feat: support custom build service account in `remote_function` (#1796) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Regarding the commit to refactor the system test for `cloud_build_service_account`: This commit refactors the system test `test_remote_function_via_session_custom_build_sa` in `tests/system/large/functions/test_remote_function.py` to align with the structure and validation approach of `test_remote_function_via_session_custom_sa`. The test now: - Uses the project "bigframes-dev-perf". - Sets `cloud_build_service_account` to "bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com". - Sets `cloud_function_service_account` to the same value for simplicity in this test. - Uses `cloud_function_ingress_settings="all"`. - Validates that `gcf.build_config.service_account` matches the provided `cloud_build_service_account`. - Employs a dedicated session for the test and ensures proper cleanup. 
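For illustration, a minimal sketch of how the new parameter is exercised by that test (the project, service account, and function below simply mirror the test setup described above and are not a usage recommendation):

    import bigframes

    session = bigframes.Session(
        context=bigframes.BigQueryOptions(project="bigframes-dev-perf")
    )

    @session.remote_function(
        input_types=[int],
        output_type=int,
        reuse=False,
        cloud_function_service_account="default",
        # Bare email form; the fully qualified
        # "projects/PROJECT_ID/serviceAccounts/SERVICE_ACCOUNT_EMAIL" form is also accepted.
        cloud_build_service_account="bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com",
        cloud_function_ingress_settings="all",
    )
    def square_num(x):
        if x is None:
            return x
        return x * x
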
* 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * add proper test, improve documentation * nit rewording for readability --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Owl Bot --- bigframes/functions/_function_client.py | 13 ++++ bigframes/functions/_function_session.py | 12 +++ bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 12 +++ .../large/functions/test_remote_function.py | 76 ++++++++++++++++++- 5 files changed, 114 insertions(+), 1 deletion(-) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index d03021dd23..e818015a9b 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -77,6 +77,7 @@ def __init__( cloud_function_service_account=None, cloud_function_kms_key_name=None, cloud_function_docker_repository=None, + cloud_build_service_account=None, *, session: Session, ): @@ -94,6 +95,7 @@ def __init__( self._cloud_function_service_account = cloud_function_service_account self._cloud_function_kms_key_name = cloud_function_kms_key_name self._cloud_function_docker_repository = cloud_function_docker_repository + self._cloud_build_service_account = cloud_build_service_account def _create_bq_connection(self) -> None: if self._bq_connection_manager: @@ -452,6 +454,17 @@ def create_cloud_function( function.build_config.docker_repository = ( self._cloud_function_docker_repository ) + + if self._cloud_build_service_account: + canonical_cloud_build_service_account = ( + self._cloud_build_service_account + if "/" in self._cloud_build_service_account + else f"projects/{self._gcp_project_id}/serviceAccounts/{self._cloud_build_service_account}" + ) + function.build_config.service_account = ( + canonical_cloud_build_service_account + ) + function.service_config = functions_v2.ServiceConfig() if memory_mib is not None: function.service_config.available_memory = f"{memory_mib}Mi" diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index e18f7084db..2fb3480d6c 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -263,6 +263,7 @@ def remote_function( cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", + cloud_build_service_account: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -453,6 +454,16 @@ def remote_function( If no setting is provided, `internal-only` will be used by default. See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. + cloud_build_service_account (str, Optional): + Service account in the fully qualified format + `projects/PROJECT_ID/serviceAccounts/SERVICE_ACCOUNT_EMAIL`, or + just the SERVICE_ACCOUNT_EMAIL. The latter would be interpreted + as belonging to the BigQuery DataFrames session project. This is + to be used by Cloud Build to build the function source code into + a deployable artifact. If not provided, the default Cloud Build + service account is used. See + https://cloud.google.com/build/docs/cloud-build-service-account + for more details. """ # Some defaults may be used from the session if not provided otherwise. 
session = self._resolve_session(session) @@ -599,6 +610,7 @@ def wrapper(func): else cloud_function_service_account, cloud_function_kms_key_name, cloud_function_docker_repository, + cloud_build_service_account=cloud_build_service_account, session=session, # type: ignore ) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index d08ef4e91d..e8253769be 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -89,6 +89,7 @@ def remote_function( cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", + cloud_build_service_account: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -108,6 +109,7 @@ def remote_function( cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_ingress_settings=cloud_function_ingress_settings, + cloud_build_service_account=cloud_build_service_account, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ab09230c99..b6066daed3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1378,6 +1378,7 @@ def remote_function( cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", + cloud_build_service_account: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1553,6 +1554,16 @@ def remote_function( If no setting is provided, `internal-only` will be used by default. See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. + cloud_build_service_account (str, Optional): + Service account in the fully qualified format + `projects/PROJECT_ID/serviceAccounts/SERVICE_ACCOUNT_EMAIL`, or + just the SERVICE_ACCOUNT_EMAIL. The latter would be interpreted + as belonging to the BigQuery DataFrames session project. This is + to be used by Cloud Build to build the function source code into + a deployable artifact. If not provided, the default Cloud Build + service account is used. See + https://cloud.google.com/build/docs/cloud-build-service-account + for more details. Returns: collections.abc.Callable: A remote function object pointing to the cloud assets created @@ -1581,6 +1592,7 @@ def remote_function( cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_ingress_settings=cloud_function_ingress_settings, + cloud_build_service_account=cloud_build_service_account, ) def udf( diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 9e0dcfe4d7..172fff3010 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -1342,7 +1342,7 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): # For upfront convenience, the following set up has been statically created # in the project bigfrmames-dev-perf via cloud console: # - # 1. Create a service account as per + # 1. Create a service account bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com as per # https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console # 2. 
Give necessary roles as per # https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration @@ -1395,6 +1395,80 @@ def square_num(x): ) +@pytest.mark.parametrize( + ("set_build_service_account"), + [ + pytest.param( + "projects/bigframes-dev-perf/serviceAccounts/bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com", + id="fully-qualified-sa", + ), + pytest.param( + "bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com", + id="just-sa-email", + ), + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_via_session_custom_build_sa( + scalars_dfs, set_build_service_account +): + # TODO(shobs): Automate the following set-up during testing in the test project. + # + # For upfront convenience, the following set up has been statically created + # in the project bigfrmames-dev-perf via cloud console: + # + # 1. Create a service account bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com as per + # https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console + # 2. Give "Cloud Build Service Account (roles/cloudbuild.builds.builder)" role as per + # https://cloud.google.com/build/docs/cloud-build-service-account#default_permissions_of_the_legacy_service_account + # + project = "bigframes-dev-perf" + expected_build_service_account = "projects/bigframes-dev-perf/serviceAccounts/bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com" + + rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) + + try: + + # TODO(shobs): Figure out why the default ingress setting + # (internal-only) does not work here + @rf_session.remote_function( + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + cloud_build_service_account=set_build_service_account, + cloud_function_ingress_settings="all", + ) + def square_num(x): + if x is None: + return x + return x * x + + # assert that the GCF is created with the intended SA + gcf = rf_session.cloudfunctionsclient.get_function( + name=square_num.bigframes_cloud_function + ) + assert gcf.build_config.service_account == expected_build_service_account + + # assert that the function works as expected on data + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_result_col = bf_int64_col.apply(square_num) + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x * x) + pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_function_assets( + square_num, rf_session.bqclient, rf_session.cloudfunctionsclient + ) + + def test_remote_function_throws_none_cloud_function_service_account(session): with pytest.raises( ValueError, From f6265dbb8e22de81bb59c7def175cd325e85c041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 13 Jun 2025 10:07:05 -0500 Subject: [PATCH 12/23] docs: rearrange README.rst to include a short code sample (#1812) * docs: rearrange README.rst to include a short code sample Towards internal issue 424443170 * change to BigFrames --- README.rst | 72 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index 
7f487b9077..9288f2e6a5 100644
--- a/README.rst
+++ b/README.rst
@@ -1,16 +1,60 @@
-BigQuery DataFrames
-===================
+BigQuery DataFrames (BigFrames)
+===============================
 
 |GA| |pypi| |versions|
 
-BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API
-powered by the BigQuery engine.
+BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame
+and machine learning (ML) API powered by the BigQuery engine.
 
 * ``bigframes.pandas`` provides a pandas-compatible API for analytics.
 * ``bigframes.ml`` provides a scikit-learn-like API for ML.
 
-BigQuery DataFrames is an open-source package. You can run
-``pip install --upgrade bigframes`` to install the latest version.
+BigQuery DataFrames is an open-source package.
+
+**Version 2.0 introduces breaking changes for improved security and performance. See below for details.**
+
+Getting started with BigQuery DataFrames
+----------------------------------------
+
+The easiest way to get started is to try the
+`BigFrames quickstart `_
+in a `notebook in BigQuery Studio `_.
+
+To use BigFrames in your local development environment:
+
+1. Run ``pip install --upgrade bigframes`` to install the latest version.
+
+2. Set up `Application default credentials `_
+   for your local development environment.
+
+3. Create a `GCP project with the BigQuery API enabled `_.
+
+4. Use the ``bigframes`` package to query data.
+
+.. code-block:: python
+
+    import bigframes.pandas as bpd
+
+    bpd.options.bigquery.project = your_gcp_project_id
+    df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")
+    print(
+        df.groupby("name")
+        .agg({"number": "sum"})
+        .sort_values("number", ascending=False)
+        .head(10)
+        .to_pandas()
+    )
+
+
+Documentation
+-------------
+
+To learn more about BigQuery DataFrames, visit these pages:
+
+* `Introduction to BigQuery DataFrames (BigFrames) `_
+* `Sample notebooks `_
+* `API reference `_
+* `Source code (GitHub) `_
 
 ⚠️ Warning: Breaking Changes in BigQuery DataFrames v2.0
 --------------------------------------------------------
@@ -44,22 +88,6 @@ To learn about these changes and how to migrate to version 2.0, see the
 .. |versions| image:: https://img.shields.io/pypi/pyversions/bigframes.svg
     :target: https://pypi.org/project/bigframes/
 
-Documentation
--------------
-
-* `BigQuery DataFrames source code (GitHub) `_
-* `BigQuery DataFrames sample notebooks `_
-* `BigQuery DataFrames API reference `_
-* `BigQuery DataFrames supported pandas APIs `_
-
-
-Getting started with BigQuery DataFrames
-----------------------------------------
-Read `Introduction to BigQuery DataFrames `_
-and try the `BigQuery DataFrames quickstart `_
-to get up and running in just a few minutes.
-
-
 License
 -------
 
From 0562a374418d7025793dfd21dc1a4f37cb938fec Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Fri, 13 Jun 2025 11:21:34 -0700
Subject: [PATCH 13/23] test: Add unit tests for SequentialUIDGenerator (#1813)

This commit introduces unit tests for the `get_uid_stream` method in the
`SequentialUIDGenerator` class.
Fixes internal issue 416487613 Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/core/compile/sqlglot/compiler.py | 1 + bigframes/core/compile/sqlglot/sqlglot_ir.py | 9 ++-- .../test_compile_readtable/out.sql | 4 +- tests/unit/core/test_guid.py | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 tests/unit/core/test_guid.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 7e55c0285f..ebe2a64699 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -167,6 +167,7 @@ def compile_readtable(self, node: nodes.ReadTableNode, *args): table.table_id, col_names=[col.source_id for col in node.scan_list.items], alias_names=[col.id.sql for col in node.scan_list.items], + uid_gen=self.uid_gen, ) @_compile_node.register diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index fc1a687c71..95e4f90118 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -114,6 +114,7 @@ def from_table( table_id: str, col_names: typing.Sequence[str], alias_names: typing.Sequence[str], + uid_gen: guid.SequentialUIDGenerator, ) -> SQLGlotIR: selections = [ sge.Alias( @@ -128,7 +129,7 @@ def from_table( catalog=sg.to_identifier(project_id, quoted=cls.quoted), ) select_expr = sge.Select().select(*selections).from_(table_expr) - return cls(expr=select_expr) + return cls(expr=select_expr, uid_gen=uid_gen) @classmethod def from_query_string( @@ -164,10 +165,10 @@ def select( squashed_selections = _squash_selections(self.expr.expressions, selections) if squashed_selections != []: new_expr = self.expr.select(*squashed_selections, append=False) - return SQLGlotIR(expr=new_expr) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) else: new_expr = self._encapsulate_as_cte().select(*selections, append=False) - return SQLGlotIR(expr=new_expr) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def project( self, @@ -181,7 +182,7 @@ def project( for id, expr in projected_cols ] new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=False) - return SQLGlotIR(expr=new_expr) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def insert( self, diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql index f010f77bf1..a5cb399b40 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql @@ -1,4 +1,4 @@ -WITH `bfcte_2` AS ( +WITH `bfcte_0` AS ( SELECT `rowindex` AS `bfcol_0`, `int64_col` AS `bfcol_1`, @@ -13,4 +13,4 @@ SELECT `bfcol_2` AS `string_col`, `bfcol_3` AS `float64_col`, `bfcol_4` AS `bool_col` -FROM `bfcte_2` \ No newline at end of file +FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/test_guid.py b/tests/unit/core/test_guid.py new file mode 100644 index 0000000000..c7334848ee --- /dev/null +++ b/tests/unit/core/test_guid.py @@ -0,0 +1,41 @@ +import types +import unittest + +from bigframes.core.guid import SequentialUIDGenerator + + +class TestSequentialUIDGenerator(unittest.TestCase): + def test_get_uid_stream_returns_generator(self): + generator = 
SequentialUIDGenerator() + stream = generator.get_uid_stream("prefix") + self.assertIsInstance(stream, types.GeneratorType) + + def test_generator_yields_correct_uids(self): + generator = SequentialUIDGenerator() + stream = generator.get_uid_stream("prefix") + self.assertEqual(next(stream), "prefix0") + self.assertEqual(next(stream), "prefix1") + self.assertEqual(next(stream), "prefix2") + + def test_generator_yields_different_uids_for_different_prefixes(self): + generator = SequentialUIDGenerator() + stream_a = generator.get_uid_stream("prefixA") + stream_b = generator.get_uid_stream("prefixB") + self.assertEqual(next(stream_a), "prefixA0") + self.assertEqual(next(stream_b), "prefixB0") + self.assertEqual(next(stream_a), "prefixA1") + self.assertEqual(next(stream_b), "prefixB1") + + def test_multiple_calls_continue_generation(self): + generator = SequentialUIDGenerator() + stream1 = generator.get_uid_stream("prefix") + self.assertEqual(next(stream1), "prefix0") + self.assertEqual(next(stream1), "prefix1") + + stream2 = generator.get_uid_stream("prefix") + self.assertEqual(next(stream2), "prefix2") + self.assertEqual(next(stream2), "prefix3") + + +if __name__ == "__main__": + unittest.main() From dc9eb27fa75e90c2c95a0619551bf67aea6ef63b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 13 Jun 2025 11:55:36 -0700 Subject: [PATCH 14/23] feat: add bbq.json_query_array and warn bbq.json_extract_array deprecated (#1811) * feat: add bbq.json_query_array and warn bbq.json_extract_array deprecated * complete features --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/bigquery/__init__.py | 2 + bigframes/bigquery/_operations/json.py | 59 ++++++++++++++++++ bigframes/core/compile/scalar_op_compiler.py | 13 ++++ bigframes/operations/__init__.py | 2 + bigframes/operations/json_ops.py | 17 ++++++ tests/system/small/bigquery/test_json.py | 63 +++++++++++++++++++- 6 files changed, 155 insertions(+), 1 deletion(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 22bcfb1407..cdc3718893 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -40,6 +40,7 @@ json_extract_array, json_extract_string_array, json_query, + json_query_array, json_set, json_value, parse_json, @@ -67,6 +68,7 @@ "json_extract_array", "json_extract_string_array", "json_query", + "json_query_array", "json_set", "json_value", "parse_json", diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 561fb57348..00d230d684 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -133,6 +133,10 @@ def json_extract_array( `STRING` or `JSON` values. This function uses single quotes and brackets to escape invalid JSONPath characters in JSON keys. + .. deprecated:: 2.5.0 + The ``json_extract_array`` is deprecated and will be removed in a future version. + Use ``json_query_array`` instead. + **Examples:** >>> import bigframes.pandas as bpd @@ -172,6 +176,11 @@ def json_extract_array( Returns: bigframes.series.Series: A new Series with the parsed arrays from the input. """ + msg = ( + "The `json_extract_array` is deprecated and will be removed in a future version. " + "Use `json_query_array` instead." 
+ ) + warnings.warn(bfe.format_message(msg), category=UserWarning) return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) @@ -273,6 +282,56 @@ def json_query( return input._apply_unary_op(ops.JSONQuery(json_path=json_path)) +def json_query_array( + input: series.Series, + json_path: str = "$", +) -> series.Series: + """Extracts a JSON array and converts it to a SQL array of JSON-formatted + `STRING` or `JSON` values. This function uses double quotes to escape invalid + JSONPath characters in JSON keys. For example: `"a.b"`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_query_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}', + ... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}' + ... ]) + >>> bbq.json_query_array(s, "$.fruits") + 0 ['{"name":"apple"}' '{"name":"cherry"}'] + 1 ['{"name":"guava"}' '{"name":"grapes"}'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}', + ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}' + ... ]) + >>> bbq.json_query_array(s, "$.fruits.names") + 0 ['"apple"' '"cherry"'] + 1 ['"guava"' '"grapes"'] + dtype: list[pyarrow] + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the parsed arrays from the input. + """ + return input._apply_unary_op(ops.JSONQueryArray(json_path=json_path)) + + def json_value( input: series.Series, json_path: str, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a1fc995159..908f3082c3 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1379,6 +1379,19 @@ def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore return json_query_op(json_or_json_string=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONQueryArray, pass_op=True) +def json_query_array_op_impl(x: ibis_types.Value, op: ops.JSONQueryArray): + # Define a user-defined function whose returned type is dynamically matching the input. + def json_query_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... 
+ + return_type = x.type() + json_query_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore + json_query_op = ibis_udf.scalar.builtin(json_query_array) + return json_query_op(json_or_json_string=x, json_path=op.json_path) + + @scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True) def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): return parse_json(json_str=x) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index faf4e18d5e..291bf17fa5 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -109,6 +109,7 @@ JSONExtractArray, JSONExtractStringArray, JSONQuery, + JSONQueryArray, JSONSet, JSONValue, ParseJSON, @@ -359,6 +360,7 @@ "JSONExtractArray", "JSONExtractStringArray", "JSONQuery", + "JSONQueryArray", "JSONSet", "JSONValue", "ParseJSON", diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index b083035d38..95a47dcadb 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -37,6 +37,23 @@ def output_type(self, *input_types): return input_type +@dataclasses.dataclass(frozen=True) +class JSONQueryArray(base_ops.UnaryOp): + name: typing.ClassVar[str] = "json_query_array" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return pd.ArrowDtype( + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type)) + ) + + @dataclasses.dataclass(frozen=True) class JSONExtractArray(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_extract_array" diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 3d155b5f16..4ad16d6cc8 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -128,7 +128,8 @@ def test_json_extract_array_from_json(): ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], dtype=dtypes.JSON_DTYPE, ) - actual = bbq.json_extract_array(s, "$.a") + with pytest.warns(UserWarning, match="The `json_extract_array` is deprecated"): + actual = bbq.json_extract_array(s, "$.a") # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType()) @@ -241,6 +242,66 @@ def test_json_query_w_invalid_series_type(): bbq.json_query(s, "$.a") +def test_json_query_array_from_json(): + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], + dtype=dtypes.JSON_DTYPE, + ) + actual = bbq.json_query_array(s, "$.a") + + # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, + # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType()) + sql = """ + SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data, + UNION ALL + SELECT 1, [], + UNION ALL + SELECT 2, [JSON '"4"', JSON '"5"'], + UNION ALL + SELECT 3, null, + """ + df = bpd.read_gbq(sql).set_index("id").sort_index() + expected = df["data"] + expected.index.name = None + expected.name = None + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_from_json_strings(): + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"], + 
dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query_array(s, "$.a") + expected = bpd.Series( + [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], + dtype=pd.ArrowDtype(pa.list_(pa.string())), + ) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_from_json_array_strings(): + s = bpd.Series( + ["[1, 2, 3]", "[]", "[4,5]"], + dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query_array(s) + expected = bpd.Series( + [["1", "2", "3"], [], ["4", "5"]], + dtype=pd.ArrowDtype(pa.list_(pa.string())), + ) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_w_invalid_series_type(): + s = bpd.Series([1, 2]) + with pytest.raises(TypeError): + bbq.json_query_array(s) + + def test_json_value_from_json(): s = bpd.Series( ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], From 1e8a2f1b9fa8cbb40bad638db32afbf4043297e3 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 13 Jun 2025 11:58:27 -0700 Subject: [PATCH 15/23] chore!: Remove attach_logprobs parameter from AI operations (#1816) * Refactor: Remove attach_logprobs parameter from AI operations This commit removes the `attach_logprobs` parameter from the `filter`, `map`, `classify`, and `join` methods within the `AIAccessor` class in `bigframes/operations/ai.py`. The associated logic for calculating and attaching the 'logprob' column has also been removed from the `map` method. System tests in `tests/system/large/operations/test_ai.py` that specifically tested the `attach_logprobs` functionality have been updated by: - Removing the `attach_logprobs=True` argument from method calls. - Removing assertions for the 'logprob' column. - Renaming the test methods to reflect their updated scope (e.g., `test_filter_attach_logprob` to `test_filter_functionality_formerly_attach_logprob`). The small system tests and experimental notebooks were not affected as they did not utilize this parameter. * polish tests --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/operations/ai.py | 38 ------------ tests/system/large/operations/test_ai.py | 78 ------------------------ 2 files changed, 116 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index f7a9e6358e..30192695ac 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -41,7 +41,6 @@ def filter( instruction: str, model, ground_with_google_search: bool = False, - attach_logprobs: bool = False, ): """ Filters the DataFrame with the semantics of the user instruction. @@ -82,10 +81,6 @@ def filter( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - Returns: bigframes.pandas.DataFrame: DataFrame filtered by the instruction. 
@@ -103,7 +98,6 @@ def filter( model, output_schema, ground_with_google_search, - attach_logprobs, ) return result[result[answer_col]].drop(answer_col, axis=1) @@ -114,7 +108,6 @@ def map( model, output_schema: Dict[str, str] | None = None, ground_with_google_search: bool = False, - attach_logprobs=False, ): """ Maps the DataFrame with the semantics of the user instruction. The name of the keys in the output_schema parameter carry @@ -180,11 +173,6 @@ def map( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - - Returns: bigframes.pandas.DataFrame: DataFrame with attached mapping results. @@ -258,19 +246,6 @@ def map( attach_columns = [results[col] for col, _ in output_schema.items()] - def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series: - from bigframes import bigquery as bbq - - logprob_jsons = bbq.json_extract_array(s, "$.candidates").list[0] - logprobs = bbq.json_extract(logprob_jsons, "$.avg_logprobs").astype( - "Float64" - ) - logprobs.name = "logprob" - return logprobs - - if attach_logprobs: - attach_columns.append(extract_logprob(results["full_response"])) - from bigframes.core.reshape.api import concat return concat([self._df, *attach_columns], axis=1) @@ -282,7 +257,6 @@ def classify( labels: Sequence[str], output_column: str = "result", ground_with_google_search: bool = False, - attach_logprobs=False, ): """ Classifies the rows of dataframes based on user instruction into the provided labels. @@ -337,11 +311,6 @@ def classify( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - - Returns: bigframes.pandas.DataFrame: DataFrame with classification result. @@ -367,7 +336,6 @@ def classify( model, output_schema={output_column: "string"}, ground_with_google_search=ground_with_google_search, - attach_logprobs=attach_logprobs, ) def join( @@ -376,7 +344,6 @@ def join( instruction: str, model, ground_with_google_search: bool = False, - attach_logprobs=False, ): """ Joines two dataframes by applying the instruction over each pair of rows from @@ -428,10 +395,6 @@ def join( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - Returns: bigframes.pandas.DataFrame: The joined dataframe. 
@@ -510,7 +473,6 @@ def join( instruction, model, ground_with_google_search=ground_with_google_search, - attach_logprobs=attach_logprobs, ).reset_index(drop=True) def search( diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index c0716220b1..afd135591f 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -66,31 +66,6 @@ def test_filter(session, gemini_flash_model): ) -def test_filter_attach_logprob(session, gemini_flash_model): - df = dataframe.DataFrame( - data={ - "number_1": [1, 2], - "number_2": [2, 1], - "col": [0, 0], - }, - session=session, - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - actual_df = df.ai.filter( - "{number_1} is greater than {number_2}", - gemini_flash_model, - attach_logprobs=True, - ).to_pandas() - - assert "logprob" in actual_df.columns - - def test_filter_multi_model(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -259,31 +234,6 @@ def test_map(session, gemini_flash_model, output_schema, output_col): ) -def test_map_attach_logprob(session, gemini_flash_model): - df = dataframe.DataFrame( - data={ - "ingredient_1": ["Burger Bun", "Soy Bean"], - "ingredient_2": ["Beef Patty", "Bittern"], - "gluten-free": [True, True], - }, - session=session, - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - actual_df = df.ai.map( - "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", - gemini_flash_model, - attach_logprobs=True, - ).to_pandas() - - assert "logprob" in actual_df.columns - - def test_map_multimodel(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -478,34 +428,6 @@ def test_join(instruction, session, gemini_flash_model): ) -def test_join_attach_logprob(session, gemini_flash_model): - cities = dataframe.DataFrame( - data={ - "city": ["Seattle", "Berlin"], - }, - session=session, - ) - countries = dataframe.DataFrame( - data={"country": ["USA", "UK", "Germany"]}, - session=session, - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - actual_df = cities.ai.join( - countries, - "{city} is in {country}", - gemini_flash_model, - attach_logprobs=True, - ).to_pandas() - - assert "logprob" in actual_df.columns - - @pytest.mark.parametrize( ("reply"), [ From f984381dee56b3dc4a96a59703696d8535cab783 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 13 Jun 2025 11:59:04 -0700 Subject: [PATCH 16/23] refactor: Refactor udf definitions (#1814) --- bigframes/core/compile/ibis_types.py | 48 --- bigframes/core/compile/scalar_op_compiler.py | 83 ++-- bigframes/dataframe.py | 40 +- bigframes/dtypes.py | 28 -- bigframes/functions/__init__.py | 9 + bigframes/functions/_function_client.py | 34 +- bigframes/functions/_function_session.py | 355 ++++++----------- bigframes/functions/_utils.py | 61 +-- bigframes/functions/function.py | 372 +++++++++++------- bigframes/functions/function_typing.py | 122 ++++++ bigframes/functions/udf_def.py | 173 ++++++++ bigframes/operations/remote_function_ops.py | 29 +- bigframes/series.py | 43 +- bigframes/session/__init__.py | 12 - bigframes/testing/polars_session.py | 1 - tests/system/conftest.py | 6 - .../large/functions/test_managed_function.py | 128 +----- .../large/functions/test_remote_function.py | 120 ------ .../small/functions/test_remote_function.py | 5 +- tests/unit/core/test_dtypes.py | 
12 - tests/unit/functions/test_remote_function.py | 23 -- .../functions/test_remote_function_utils.py | 5 +- .../ibis/expr/operations/udf.py | 3 +- 23 files changed, 791 insertions(+), 921 deletions(-) create mode 100644 bigframes/functions/function_typing.py create mode 100644 bigframes/functions/udf_def.py diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index d5f9b5c5f9..0a61be716a 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -13,20 +13,14 @@ # limitations under the License. from __future__ import annotations -import typing from typing import cast, Dict, Iterable, Optional, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.ibis -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -from bigframes_vendored.ibis.expr.datatypes.core import ( - dtype as python_type_to_ibis_type, -) import bigframes_vendored.ibis.expr.types as ibis_types import db_dtypes # type: ignore import geopandas as gpd # type: ignore -import google.cloud.bigquery as bigquery import pandas as pd import pyarrow as pa @@ -439,45 +433,3 @@ def literal_to_ibis_scalar( ) return scalar_expr - - -class UnsupportedTypeError(ValueError): - def __init__(self, type_, supported_types): - self.type = type_ - self.supported_types = supported_types - super().__init__( - f"'{type_}' is not one of the supported types {supported_types}" - ) - - -def ibis_type_from_python_type(t: type) -> ibis_dtypes.DataType: - if t not in bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES: - raise UnsupportedTypeError(t, bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES) - return python_type_to_ibis_type(t) - - -def ibis_array_output_type_from_python_type(t: type) -> ibis_dtypes.DataType: - array_of = typing.get_args(t)[0] - if array_of not in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: - raise UnsupportedTypeError( - array_of, bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES - ) - return python_type_to_ibis_type(t) - - -def ibis_type_from_bigquery_type( - type_: bigquery.StandardSqlDataType, -) -> ibis_dtypes.DataType: - """Convert bq type to ibis. 
Only to be used for remote functions, does not handle all types.""" - if type_.type_kind not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: - raise UnsupportedTypeError( - type_.type_kind, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS - ) - elif type_.type_kind == "ARRAY": - return ibis_dtypes.Array( - value_type=ibis_type_from_bigquery_type( - typing.cast(bigquery.StandardSqlDataType, type_.array_element_type) - ) - ) - else: - return third_party_ibis_bqtypes.BigQueryType.to_ibis(type_.type_kind) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 908f3082c3..b819b1c4e2 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -17,7 +17,6 @@ import functools import typing -import bigframes_vendored.constants as constants import bigframes_vendored.ibis.expr.api as ibis_api import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.operations.generic as ibis_generic @@ -30,6 +29,7 @@ import bigframes.core.compile.default_ordering import bigframes.core.compile.ibis_types import bigframes.core.expression as ex +import bigframes.dtypes import bigframes.operations as ops _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) @@ -1284,17 +1284,58 @@ def timedelta_floor_op_impl(x: ibis_types.NumericValue): @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): - ibis_node = getattr(op.func, "ibis_node", None) - if ibis_node is None: - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) - x_transformed = ibis_node(x) + udf_sig = op.function_def.signature + ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + + @ibis_udf.scalar.builtin( + name=str(op.function_def.routine_ref), signature=ibis_py_sig + ) + def udf(input): + ... + + x_transformed = udf(x) if not op.apply_on_null: - x_transformed = ibis_api.case().when(x.isnull(), x).else_(x_transformed).end() + return ibis_api.case().when(x.isnull(), x).else_(x_transformed).end() return x_transformed +@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) +def binary_remote_function_op_impl( + x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp +): + udf_sig = op.function_def.signature + ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + + @ibis_udf.scalar.builtin( + name=str(op.function_def.routine_ref), signature=ibis_py_sig + ) + def udf(input1, input2): + ... + + x_transformed = udf(x, y) + return x_transformed + + +@scalar_op_compiler.register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True) +def nary_remote_function_op_impl( + *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp +): + udf_sig = op.function_def.signature + ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + arg_names = tuple(arg.name for arg in udf_sig.input_types) + + @ibis_udf.scalar.builtin( + name=str(op.function_def.routine_ref), + signature=ibis_py_sig, + param_name_overrides=arg_names, + ) + def udf(*inputs): + ... 
+ + result = udf(*operands) + return result + + @scalar_op_compiler.register_unary_op(ops.MapOp, pass_op=True) def map_op_impl(x: ibis_types.Value, op: ops.MapOp): case = ibis_api.case() @@ -1931,19 +1972,6 @@ def manhattan_distance_impl( return vector_distance(vector1, vector2, "MANHATTAN") -@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) -def binary_remote_function_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp -): - ibis_node = getattr(op.func, "ibis_node", None) - if ibis_node is None: - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) - x_transformed = ibis_node(x, y) - return x_transformed - - # Blob Ops @scalar_op_compiler.register_binary_op(ops.obj_make_ref_op) def obj_make_ref_op(x: ibis_types.Value, y: ibis_types.Value): @@ -2005,19 +2033,6 @@ def case_when_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value: return case_val.end() # type: ignore -@scalar_op_compiler.register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True) -def nary_remote_function_op_impl( - *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp -): - ibis_node = getattr(op.func, "ibis_node", None) - if ibis_node is None: - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) - result = ibis_node(*operands) - return result - - @scalar_op_compiler.register_nary_op(ops.SqlScalarOp, pass_op=True) def sql_scalar_op_impl(*operands: ibis_types.Value, op: ops.SqlScalarOp): return ibis_generic.SqlScalar( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1d0d485392..7e5bb3049a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -74,6 +74,7 @@ import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as formatter +import bigframes.functions import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.ai @@ -4470,7 +4471,7 @@ def _prepare_export( return array_value, id_overrides def map(self, func, na_action: Optional[str] = None) -> DataFrame: - if not callable(func): + if not isinstance(func, bigframes.functions.BigqueryCallableRoutine): raise TypeError("the first argument must be callable") if na_action not in {None, "ignore"}: @@ -4478,7 +4479,9 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: # TODO(shobs): Support **kwargs return self._apply_unary_op( - ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None)) + ops.RemoteFunctionOp( + function_def=func.udf_def, apply_on_null=(na_action is None) + ) ) def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): @@ -4492,13 +4495,18 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) warnings.warn(msg, category=bfe.FunctionAxisOnePreviewWarning) - if not hasattr(func, "bigframes_bigquery_function"): + if not isinstance( + func, + ( + bigframes.functions.BigqueryCallableRoutine, + bigframes.functions.BigqueryCallableRowRoutine, + ), + ): raise ValueError( "For axis=1 a BigFrames BigQuery function must be used." 
) - is_row_processor = getattr(func, "is_row_processor") - if is_row_processor: + if func.is_row_processor: # Early check whether the dataframe dtypes are currently supported # in the bigquery function # NOTE: Keep in sync with the value converters used in the gcf code @@ -4552,7 +4560,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # Apply the function result_series = rows_as_json_series._apply_unary_op( - ops.RemoteFunctionOp(func=func, apply_on_null=True) + ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True) ) else: # This is a special case where we are providing not-pandas-like @@ -4567,7 +4575,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # compatible with the data types of the input params # 3. The order of the columns in the dataframe must correspond # to the order of the input params in the function - udf_input_dtypes = getattr(func, "input_dtypes") + udf_input_dtypes = func.udf_def.signature.bf_input_types if len(udf_input_dtypes) != len(self.columns): raise ValueError( f"BigFrames BigQuery function takes {len(udf_input_dtypes)}" @@ -4581,25 +4589,11 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): series_list = [self[col] for col in self.columns] result_series = series_list[0]._apply_nary_op( - ops.NaryRemoteFunctionOp(func=func), series_list[1:] + ops.NaryRemoteFunctionOp(function_def=func.udf_def), series_list[1:] ) result_series.name = None - # If the result type is string but the function output is intended - # to be an array, reconstruct the array from the string assuming it - # is a json serialized form of the array. - if bigframes.dtypes.is_string_like( - result_series.dtype - ) and bigframes.dtypes.is_array_like(func.output_dtype): - import bigframes.bigquery as bbq - - result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - func.output_dtype.pyarrow_dtype.value_type - ) - result_series = bbq.json_extract_string_array( - result_series, value_dtype=result_dtype - ) - + result_series = func._post_process_series(result_series) return result_series # At this point column-wise or element-wise bigquery function operation will diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 2c5df89665..e0c3e39ac9 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -870,32 +870,4 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: return result -### Remote functions use only -# TODO: Refactor into remote function module - -# Input and output types supported by BigQuery DataFrames remote functions. -# TODO(shobs): Extend the support to all types supported by BQ remote functions -# https://cloud.google.com/bigquery/docs/remote-functions#limitations -RF_SUPPORTED_IO_PYTHON_TYPES = {bool, bytes, float, int, str} - -# Support array output types in BigQuery DataFrames remote functions even though -# it is not currently (2024-10-06) supported in BigQuery remote functions. -# https://cloud.google.com/bigquery/docs/remote-functions#limitations -# TODO(b/284515241): remove this special handling when BigQuery remote functions -# support array. 
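For context on the DataFrame changes above, a minimal usage sketch of the new calling convention, assuming an existing session and a hypothetical routine path: DataFrame.map, DataFrame.apply(axis=1) and Series.apply now expect the callable-routine wrapper objects instead of a plain Python function with ad-hoc attributes attached.

import bigframes.pandas as bpd

# read_gbq_function now returns a BigqueryCallableRoutine (or a
# BigqueryCallableRowRoutine for row processors), not a bare callable.
func = bpd.read_gbq_function("my-project.my_dataset.double_it")  # hypothetical routine

df = bpd.DataFrame({"x": [1, 2, 3]})

# Series.apply / DataFrame.map build RemoteFunctionOp(function_def=func.udf_def);
# any JSON-array output reconstruction happens via func._post_process_series.
result = df["x"].apply(func)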
-RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES = {bool, float, int, str} - -RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS = { - "BOOLEAN", - "BOOL", - "BYTES", - "FLOAT", - "FLOAT64", - "INT64", - "INTEGER", - "STRING", - "ARRAY", -} - - TIMEDELTA_DESCRIPTION_TAG = "#microseconds" diff --git a/bigframes/functions/__init__.py b/bigframes/functions/__init__.py index 6d5e14bcf4..5f87956a61 100644 --- a/bigframes/functions/__init__.py +++ b/bigframes/functions/__init__.py @@ -11,3 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from bigframes.functions.function import ( + BigqueryCallableRoutine, + BigqueryCallableRowRoutine, +) + +__all__ = [ + "BigqueryCallableRoutine", + "BigqueryCallableRowRoutine", +] diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index e818015a9b..1833ac489c 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -25,7 +25,7 @@ import tempfile import textwrap import types -from typing import cast, Tuple, TYPE_CHECKING +from typing import Any, cast, Optional, Sequence, Tuple, TYPE_CHECKING import requests @@ -39,8 +39,6 @@ import google.api_core.retry from google.cloud import bigquery, functions_v2 -import bigframes.session._io.bigquery - from . import _utils logger = logging.getLogger(__name__) @@ -126,6 +124,8 @@ def _ensure_dataset_exists(self) -> None: def _create_bq_function(self, create_function_ddl: str) -> None: # TODO(swast): plumb through the original, user-facing api_name. + import bigframes.session._io.bigquery + _, query_job = bigframes.session._io.bigquery.start_query_with_client( cast(bigquery.Client, self._session.bqclient), create_function_ddl, @@ -149,13 +149,13 @@ def _format_function_options(self, function_options: dict) -> str: def create_bq_remote_function( self, - input_args, - input_types, - output_type, - endpoint, - bq_function_name, - max_batching_rows, - metadata, + input_args: Sequence[str], + input_types: Sequence[str], + output_type: str, + endpoint: str, + bq_function_name: str, + max_batching_rows: int, + metadata: str, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" @@ -198,14 +198,14 @@ def create_bq_remote_function( def provision_bq_managed_function( self, func, - input_types, - output_type, - name, - packages, - is_row_processor, + input_types: Sequence[str], + output_type: str, + name: Optional[str], + packages: Optional[Sequence[str]], + is_row_processor: bool, bq_connection_id, *, - capture_references=False, + capture_references: bool = False, ): """Create a BigQuery managed function.""" @@ -230,7 +230,7 @@ def provision_bq_managed_function( for name_, type_ in zip(input_args, input_types): bq_function_args.append(f"{name_} {type_}") - managed_function_options = { + managed_function_options: dict[str, Any] = { "runtime_version": _MANAGED_FUNC_PYTHON_VERSION, "entry_point": "bigframes_handler", } diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 2fb3480d6c..9e7555431a 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -16,14 +16,15 @@ from __future__ import annotations import collections.abc +import functools import inspect import sys import threading from typing import ( Any, - Callable, cast, Dict, + get_origin, 
Literal, Mapping, Optional, @@ -33,10 +34,6 @@ ) import warnings -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.operations.udf as ibis_udf -import cloudpickle import google.api_core.exceptions from google.cloud import ( bigquery, @@ -46,17 +43,17 @@ ) from bigframes import clients -import bigframes.core.compile.ibis_types import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting -import bigframes.series as bf_series +from bigframes.functions import function as bq_functions +from bigframes.functions import udf_def if TYPE_CHECKING: from bigframes.session import Session import pandas -from . import _function_client, _utils +from bigframes.functions import _function_client, _utils class FunctionSession: @@ -220,17 +217,6 @@ def clean_up( self._temp_artifacts.clear() - def _try_delattr(self, func: Callable, attr: str) -> None: - """Attempts to delete an attribute from a bigframes function.""" - # In the unlikely case where the user is trying to re-deploy the same - # function, cleanup the attributes we add in bigframes functions, first. - # This prevents the pickle from having dependencies that might not - # otherwise be present such as ibis or pandas. - try: - delattr(func, attr) - except AttributeError: - pass - # Inspired by @udf decorator implemented in ibis-bigquery package # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py # which has moved as @js to the ibis package @@ -543,58 +529,32 @@ def wrapper(func): else: signature_kwargs = {} # type: ignore - signature = inspect.signature( + py_sig = inspect.signature( func, **signature_kwargs, ) + if input_types is not None: + if not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + py_sig = py_sig.replace( + parameters=[ + par.replace(annotation=itype) + for par, itype in zip(py_sig.parameters.values(), input_types) + ] + ) + if output_type: + py_sig = py_sig.replace(return_annotation=output_type) # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function.", - ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - - if output_type is None: - if ( - output_type := signature.return_annotation - ) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use " - "@remote_function.", - ) # The function will actually be receiving a pandas Series, but allow both # BigQuery DataFrames and pandas object types for compatibility. + # The function will actually be receiving a pandas Series, but allow + # both BigQuery DataFrames and pandas object types for compatibility. 
is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - msg = bfe.format_message("input_types=Series is in preview.") - warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) - - # we will model the row as a json serialized string containing the data - # and the metadata representing the row. - input_types = [str] + if new_sig := _convert_row_processor_sig(py_sig): + py_sig = new_sig is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] - - # TODO(b/340898611): fix type error. - ibis_signature = _utils.ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) remote_function_client = _function_client.FunctionClient( dataset_ref.project, @@ -614,37 +574,25 @@ def wrapper(func): session=session, # type: ignore ) - # To respect the user code/environment let's use a copy of the - # original udf, especially since we would be setting some properties - # on it. - func = cloudpickle.loads(cloudpickle.dumps(func)) - - self._try_delattr(func, "bigframes_cloud_function") - self._try_delattr(func, "bigframes_remote_function") - self._try_delattr(func, "bigframes_bigquery_function") - self._try_delattr(func, "bigframes_bigquery_function_output_dtype") - self._try_delattr(func, "input_dtypes") - self._try_delattr(func, "output_dtype") - self._try_delattr(func, "is_row_processor") - self._try_delattr(func, "ibis_node") - # resolve the output type that can be supported in the bigframes, # ibis, BQ remote functions and cloud functions integration. - ibis_output_type_for_bqrf = ibis_signature.output_type bqrf_metadata = None - if isinstance(ibis_signature.output_type, ibis_dtypes.Array): + post_process_routine = None + if get_origin(py_sig.return_annotation) is list: # TODO(b/284515241): remove this special handling to support # array output types once BQ remote functions support ARRAY. # Until then, use json serialized strings at the cloud function # and BQ level, and parse that to the intended output type at # the bigframes level. - ibis_output_type_for_bqrf = ibis_dtypes.String() bqrf_metadata = _utils.get_bigframes_metadata( - python_output_type=output_type + python_output_type=py_sig.return_annotation ) - bqrf_output_type = third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_output_type_for_bqrf - ) + post_process_routine = _utils._build_unnest_post_routine( + py_sig.return_annotation + ) + py_sig = py_sig.replace(return_annotation=str) + + udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) ( rf_name, @@ -652,12 +600,8 @@ def wrapper(func): created_new, ) = remote_function_client.provision_bq_remote_function( func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - if type_ is not None - ), - output_type=bqrf_output_type, + input_types=udf_sig.sql_input_types, + output_type=udf_sig.sql_output_type, reuse=reuse, name=name, package_requirements=packages, @@ -671,56 +615,14 @@ def wrapper(func): bq_metadata=bqrf_metadata, ) - # TODO(shobs): Find a better way to support udfs with param named "name". - # This causes an issue in the ibis compilation. - func.__signature__ = inspect.signature(func).replace( # type: ignore - parameters=[ - inspect.Parameter( - f"bigframes_{param.name}", - param.kind, - ) - for param in inspect.signature(func).parameters.values() - ] - ) - - # TODO: Move ibis logic to compiler step. 
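The ibis node construction removed just below no longer happens at definition time; the scalar op compiler (see remote_function_op_impl earlier in this patch) rebuilds an equivalent ibis builtin UDF at compile time from the routine reference and the Python-level signature stored on the BigqueryUdf. A condensed sketch of that mapping, restating the patch's own compiler code rather than introducing new API:

from bigframes_vendored.ibis.expr.operations import udf as ibis_udf

def compile_remote_function(function_def, x):
    # function_def is a udf_def.BigqueryUdf; its signature exposes the Python
    # view of input/output types that ibis expects for a builtin scalar UDF.
    sig = function_def.signature
    ibis_py_sig = (sig.py_input_types, sig.py_output_type)

    @ibis_udf.scalar.builtin(name=str(function_def.routine_ref), signature=ibis_py_sig)
    def udf(input):
        ...

    return udf(x)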
- node = ibis_udf.scalar.builtin( - func, - name=rf_name, - catalog=dataset_ref.project, - database=dataset_ref.dataset_id, - signature=(ibis_signature.input_types, ibis_output_type_for_bqrf), - ) # type: ignore - func.bigframes_cloud_function = ( + bigframes_cloud_function = ( remote_function_client.get_cloud_function_fully_qualified_name(cf_name) ) - func.bigframes_bigquery_function = ( + bigframes_bigquery_function = ( remote_function_client.get_remote_function_fully_qualilfied_name( rf_name ) ) - func.bigframes_remote_function = func.bigframes_bigquery_function - func.input_dtypes = tuple( - [ - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - input_type - ) - for input_type in ibis_signature.input_types - if input_type is not None - ] - ) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) - ) - func.bigframes_bigquery_function_output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_output_type_for_bqrf - ) - ) - func.is_row_processor = is_row_processor - func.ibis_node = node # If a new remote function was created, update the cloud artifacts # created in the session. This would be used to clean up any @@ -731,9 +633,38 @@ def wrapper(func): # with that name and would directly manage their lifecycle. if created_new and (not name): self._update_temp_artifacts( - func.bigframes_bigquery_function, func.bigframes_cloud_function + bigframes_bigquery_function, bigframes_cloud_function + ) + + udf_definition = udf_def.BigqueryUdf( + routine_ref=bigquery.RoutineReference.from_string( + bigframes_bigquery_function + ), + signature=udf_sig, + ) + decorator = functools.wraps(func) + if is_row_processor: + return decorator( + bq_functions.BigqueryCallableRowRoutine( + udf_definition, + session, + post_routine=post_process_routine, + cloud_function_ref=bigframes_cloud_function, + local_func=func, + is_managed=False, + ) + ) + else: + return decorator( + bq_functions.BigqueryCallableRoutine( + udf_definition, + session, + post_routine=post_process_routine, + cloud_function_ref=bigframes_cloud_function, + local_func=func, + is_managed=False, + ) ) - return func return wrapper @@ -858,57 +789,30 @@ def wrapper(func): else: signature_kwargs = {} # type: ignore - signature = inspect.signature( + py_sig = inspect.signature( func, **signature_kwargs, ) + if input_types is not None: + if not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + py_sig = py_sig.replace( + parameters=[ + par.replace(annotation=itype) + for par, itype in zip(py_sig.parameters.values(), input_types) + ] + ) + if output_type: + py_sig = py_sig.replace(return_annotation=output_type) - # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. 
" - "Types are required to use udf.", - ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - - if output_type is None: - if ( - output_type := signature.return_annotation - ) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use udf", - ) + udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) # The function will actually be receiving a pandas Series, but allow # both BigQuery DataFrames and pandas object types for compatibility. is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - msg = bfe.format_message("input_types=Series is in preview.") - warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) - - # we will model the row as a json serialized string containing - # the data and the metadata representing the row. - input_types = [str] + if new_sig := _convert_row_processor_sig(py_sig): + py_sig = new_sig is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] - - # TODO(b/340898611): fix type error. - ibis_signature = _utils.ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) managed_function_client = _function_client.FunctionClient( dataset_ref.project, @@ -920,80 +824,59 @@ def wrapper(func): session=session, # type: ignore ) - func = cloudpickle.loads(cloudpickle.dumps(func)) - - self._try_delattr(func, "bigframes_bigquery_function") - self._try_delattr(func, "bigframes_bigquery_function_output_dtype") - self._try_delattr(func, "input_dtypes") - self._try_delattr(func, "output_dtype") - self._try_delattr(func, "is_row_processor") - self._try_delattr(func, "ibis_node") - bq_function_name = managed_function_client.provision_bq_managed_function( func=func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - if type_ is not None - ), - output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_signature.output_type - ), + input_types=udf_sig.sql_input_types, + output_type=udf_sig.sql_output_type, name=name, packages=packages, is_row_processor=is_row_processor, bq_connection_id=bq_connection_id, ) - - # TODO(shobs): Find a better way to support udfs with param named - # "name". This causes an issue in the ibis compilation. - func.__signature__ = inspect.signature(func).replace( # type: ignore - parameters=[ - inspect.Parameter( - f"bigframes_{param.name}", - param.kind, - ) - for param in inspect.signature(func).parameters.values() - ] - ) - - # TODO: Move ibis logic to compiler step. 
- node = ibis_udf.scalar.builtin( - func, - name=bq_function_name, - catalog=dataset_ref.project, - database=dataset_ref.dataset_id, - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) # type: ignore - func.bigframes_bigquery_function = ( + full_rf_name = ( managed_function_client.get_remote_function_fully_qualilfied_name( bq_function_name ) ) - func.input_dtypes = tuple( - [ - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - input_type - ) - for input_type in ibis_signature.input_types - if input_type is not None - ] - ) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) + + udf_definition = udf_def.BigqueryUdf( + routine_ref=bigquery.RoutineReference.from_string(full_rf_name), + signature=udf_sig, ) - # Managed function directly supports certain output types which are - # not supported in remote function (e.g. list output). Thus no more - # processing for 'bigframes_bigquery_function_output_dtype'. - func.bigframes_bigquery_function_output_dtype = func.output_dtype - func.is_row_processor = is_row_processor - func.ibis_node = node if not name: - self._update_temp_artifacts(func.bigframes_bigquery_function, "") + self._update_temp_artifacts(full_rf_name, "") - return func + decorator = functools.wraps(func) + if is_row_processor: + return decorator( + bq_functions.BigqueryCallableRowRoutine( + udf_definition, session, local_func=func, is_managed=True + ) + ) + else: + return decorator( + bq_functions.BigqueryCallableRoutine( + udf_definition, + session, + local_func=func, + is_managed=True, + ) + ) return wrapper + + +def _convert_row_processor_sig( + signature: inspect.Signature, +) -> Optional[inspect.Signature]: + import bigframes.series as bf_series + + if len(signature.parameters) == 1: + only_param = next(iter(signature.parameters.values())) + param_type = only_param.annotation + if (param_type == bf_series.Series) or (param_type == pandas.Series): + msg = bfe.format_message("input_types=Series is in preview.") + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) + return signature.replace(parameters=[only_param.replace(annotation=str)]) + return None diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index 1d930a280d..69cf74ada0 100644 --- a/bigframes/functions/_utils.py +++ b/bigframes/functions/_utils.py @@ -14,13 +14,11 @@ import hashlib -import inspect import json import sys import typing -from typing import cast, List, NamedTuple, Optional, Sequence, Set +from typing import cast, Optional, Set -import bigframes_vendored.ibis.expr.datatypes.core as ibis_dtypes import cloudpickle import google.api_core.exceptions from google.cloud import bigquery, functions_v2 @@ -28,9 +26,8 @@ import pandas import pyarrow -import bigframes.core.compile.ibis_types -import bigframes.dtypes import bigframes.formatting_helpers as bf_formatting +from bigframes.functions import function_typing # Naming convention for the function artifacts _BIGFRAMES_FUNCTION_PREFIX = "bigframes" @@ -198,42 +195,6 @@ def get_bigframes_function_name(function_hash, session_id, uniq_suffix=None): return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) -class IbisSignature(NamedTuple): - parameter_names: List[str] - input_types: List[Optional[ibis_dtypes.DataType]] - output_type: ibis_dtypes.DataType - output_type_override: Optional[ibis_dtypes.DataType] = None - - -def ibis_signature_from_python_signature( - signature: inspect.Signature, - input_types: Sequence[type], - output_type: 
type, -) -> IbisSignature: - - ibis_input_types: List[Optional[ibis_dtypes.DataType]] = [ - bigframes.core.compile.ibis_types.ibis_type_from_python_type(t) - for t in input_types - ] - - if typing.get_origin(output_type) is list: - ibis_output_type = ( - bigframes.core.compile.ibis_types.ibis_array_output_type_from_python_type( - output_type - ) - ) - else: - ibis_output_type = bigframes.core.compile.ibis_types.ibis_type_from_python_type( - output_type - ) - - return IbisSignature( - parameter_names=list(signature.parameters.keys()), - input_types=ibis_input_types, - output_type=ibis_output_type, - ) - - def get_python_output_type_from_bigframes_metadata( metadata_text: str, ) -> Optional[type]: @@ -249,7 +210,7 @@ def get_python_output_type_from_bigframes_metadata( for ( python_output_array_type - ) in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: + ) in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: if python_output_array_type.__name__ == output_type: return list[python_output_array_type] # type: ignore @@ -266,7 +227,7 @@ def get_bigframes_metadata(*, python_output_type: Optional[type] = None) -> str: python_output_array_type = typing.get_args(python_output_type)[0] if ( python_output_array_type - in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES + in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES ): inner_metadata[ "python_array_output_type" @@ -294,3 +255,17 @@ def get_python_version(is_compat: bool = False) -> str: major = sys.version_info.major minor = sys.version_info.minor return f"python{major}{minor}" if is_compat else f"python-{major}.{minor}" + + +def _build_unnest_post_routine(py_list_type: type[list]): + sdk_type = function_typing.sdk_array_output_type_from_python_type(py_list_type) + assert sdk_type.array_element_type is not None + inner_sdk_type = sdk_type.array_element_type + result_dtype = function_typing.sdk_type_to_bf_type(inner_sdk_type) + + def post_process(input): + import bigframes.bigquery as bbq + + return bbq.json_extract_string_array(input, value_dtype=result_dtype) + + return post_process diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 858c25fada..b695bcd250 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -14,28 +14,19 @@ from __future__ import annotations -import inspect import logging -import typing -from typing import cast, Optional, TYPE_CHECKING -import warnings - -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.operations.udf as ibis_udf +from typing import Callable, cast, get_origin, Optional, TYPE_CHECKING if TYPE_CHECKING: from bigframes.session import Session + import bigframes.series import google.api_core.exceptions from google.cloud import bigquery -import bigframes.core.compile.ibis_types -import bigframes.dtypes -import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting - -from . import _function_session as bff_session -from . 
import _utils +from bigframes.functions import _function_session as bff_session +from bigframes.functions import _utils, function_typing, udf_def logger = logging.getLogger(__name__) @@ -46,55 +37,6 @@ def __init__(self, type_, supported_types): self.supported_types = supported_types -class ReturnTypeMissingError(ValueError): - pass - - -# TODO: Move this to compile folder -def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: - if routine.return_type: - ibis_output_type = ( - bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( - routine.return_type - ) - ) - else: - raise ReturnTypeMissingError - - ibis_output_type_override: Optional[ibis_dtypes.DataType] = None - if python_output_type := _utils.get_python_output_type_from_bigframes_metadata( - routine.description - ): - if not isinstance(ibis_output_type, ibis_dtypes.String): - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "An explicit output_type should be provided only for a BigQuery function with STRING output.", - ) - if typing.get_origin(python_output_type) is list: - ibis_output_type_override = bigframes.core.compile.ibis_types.ibis_array_output_type_from_python_type( - cast(type, python_output_type) - ) - else: - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "Currently only list of a type is supported as python output type.", - ) - - return _utils.IbisSignature( - parameter_names=[arg.name for arg in routine.arguments], - input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( - arg.data_type - ) - if arg.data_type - else None - for arg in routine.arguments - ], - output_type=ibis_output_type, - output_type_override=ibis_output_type_override, - ) - - class DatasetMissingError(ValueError): pass @@ -136,6 +78,78 @@ def udf(*args, **kwargs): udf.__doc__ = bff_session.FunctionSession.udf.__doc__ +def _try_import_routine( + routine: bigquery.Routine, session: bigframes.Session +) -> BigqueryCallableRoutine: + udf_def = _routine_as_udf_def(routine) + override_type = _get_output_type_override(routine) + is_remote = ( + hasattr(routine, "remote_function_options") and routine.remote_function_options + ) + if override_type is not None: + return BigqueryCallableRoutine( + udf_def, + session, + post_routine=_utils._build_unnest_post_routine(override_type), + ) + return BigqueryCallableRoutine(udf_def, session, is_managed=not is_remote) + + +def _try_import_row_routine( + routine: bigquery.Routine, session: bigframes.Session +) -> BigqueryCallableRowRoutine: + udf_def = _routine_as_udf_def(routine) + override_type = _get_output_type_override(routine) + is_remote = ( + hasattr(routine, "remote_function_options") and routine.remote_function_options + ) + if override_type is not None: + return BigqueryCallableRowRoutine( + udf_def, + session, + post_routine=_utils._build_unnest_post_routine(override_type), + ) + return BigqueryCallableRowRoutine(udf_def, session, is_managed=not is_remote) + + +def _routine_as_udf_def(routine: bigquery.Routine) -> udf_def.BigqueryUdf: + try: + return udf_def.BigqueryUdf.from_routine(routine) + except udf_def.ReturnTypeMissingError: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "Function return type must be specified." 
+ ) + except function_typing.UnsupportedTypeError as e: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + f"Type {e.type} not supported, supported types are {e.supported_types}.", + ) + + +def _get_output_type_override(routine: bigquery.Routine) -> Optional[type[list]]: + if routine.description is not None and isinstance(routine.description, str): + if python_output_type := _utils.get_python_output_type_from_bigframes_metadata( + routine.description + ): + bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) + + if bq_return_type is None or bq_return_type.type_kind != "STRING": + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "An explicit output_type should be provided only for a BigQuery function with STRING output.", + ) + if get_origin(python_output_type) is list: + return python_output_type + else: + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "Currently only list of " + "a type is supported as python output type.", + ) + + return None + + # TODO(b/399894805): Support managed function. def read_gbq_function( function_name: str, @@ -147,7 +161,6 @@ def read_gbq_function( Read an existing BigQuery function and prepare it for use in future queries. """ bigquery_client = session.bqclient - ibis_client = session.ibis_client try: routine_ref = get_routine_reference(function_name, bigquery_client, session) @@ -172,86 +185,163 @@ def read_gbq_function( "takes in a single input representing the row.", ) - try: - ibis_signature = ibis_signature_from_routine(routine) - except ReturnTypeMissingError: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, "Function return type must be specified." - ) - except bigframes.core.compile.ibis_types.UnsupportedTypeError as e: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - f"Type {e.type} not supported, supported types are {e.supported_types}.", - ) + if is_row_processor: + return _try_import_row_routine(routine, session) + else: + return _try_import_routine(routine, session) - # The name "args" conflicts with the Ibis operator, so we use - # non-standard names for the arguments here. - def func(*bigframes_args, **bigframes_kwargs): - f"""Bigframes function {str(routine_ref)}.""" - nonlocal node # type: ignore - - expr = node(*bigframes_args, **bigframes_kwargs) # type: ignore - return ibis_client.execute(expr) - - func.__signature__ = inspect.signature(func).replace( # type: ignore - parameters=[ - # TODO(shobs): Find a better way to support functions with param - # named "name". This causes an issue in the ibis compilation. - inspect.Parameter( - f"bigframes_{name}", - inspect.Parameter.POSITIONAL_OR_KEYWORD, - ) - for name in ibis_signature.parameter_names - ] - ) - # TODO: Move ibis logic to compiler step - - func.__name__ = routine_ref.routine_id - - node = ibis_udf.scalar.builtin( - func, - name=routine_ref.routine_id, - catalog=routine_ref.project, - database=routine_ref.dataset_id, - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) # type: ignore - func.bigframes_bigquery_function = str(routine_ref) # type: ignore - - # We will keep the "bigframes_remote_function" attr for remote function. 
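A minimal sketch of the list-output override path, assuming a deployed routine that declares STRING output but whose description carries bigframes metadata naming list[float] as the intended Python output type (this is the case _get_output_type_override detects):

from bigframes.functions import _utils

python_output_type = list[float]            # recovered from the routine description
post_routine = _utils._build_unnest_post_routine(python_output_type)

# post_routine is roughly equivalent to:
def decode_array(series):
    import bigframes.bigquery as bbq
    import bigframes.dtypes

    # The STRING column holds a JSON-serialized array; rebuild a list<FLOAT64> column.
    return bbq.json_extract_string_array(series, value_dtype=bigframes.dtypes.FLOAT_DTYPE)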
- if hasattr(routine, "remote_function_options") and routine.remote_function_options: - func.bigframes_remote_function = func.bigframes_bigquery_function # type: ignore - - # set input bigframes data types - has_unknown_dtypes = False - function_input_dtypes = [] - for ibis_type in ibis_signature.input_types: - input_dtype = cast(bigframes.dtypes.Dtype, bigframes.dtypes.DEFAULT_DTYPE) - if ibis_type is None: - has_unknown_dtypes = True - else: - input_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_type - ) - ) - function_input_dtypes.append(input_dtype) - if has_unknown_dtypes: - msg = bfe.format_message( - "The function has one or more missing input data types. BigQuery DataFrames " - f"will assume default data type {bigframes.dtypes.DEFAULT_DTYPE} for them." - ) - warnings.warn(msg, category=bfe.UnknownDataTypeWarning) - func.input_dtypes = tuple(function_input_dtypes) # type: ignore +class BigqueryCallableRoutine: + """ + A reference to a routine in the context of a session. - func.output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( # type: ignore - ibis_signature.output_type_override - if ibis_signature.output_type_override - else ibis_signature.output_type - ) + Can be used both directly as a callable, or as an input to dataframe ops that take a callable. + """ - func.bigframes_bigquery_function_output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_signature.output_type) # type: ignore + def __init__( + self, + udf_def: udf_def.BigqueryUdf, + session: bigframes.Session, + *, + local_func: Optional[Callable] = None, + cloud_function_ref: Optional[str] = None, + post_routine: Optional[ + Callable[[bigframes.series.Series], bigframes.series.Series] + ] = None, + is_managed: bool = False, + ): + self._udf_def = udf_def + self._session = session + self._post_routine = post_routine + self._local_fun = local_func + self._cloud_function = cloud_function_ref + self._is_managed = is_managed + + def __call__(self, *args, **kwargs): + if self._local_fun: + return self._local_fun(*args, **kwargs) + # avoid circular imports + import bigframes.core.sql as bf_sql + import bigframes.session._io.bigquery as bf_io_bigquery + + args_string = ", ".join(map(bf_sql.simple_literal, args)) + sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" + iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + return list(iter.to_arrow().to_pydict().values())[0][0] + + @property + def bigframes_bigquery_function(self) -> str: + return str(self._udf_def.routine_ref) + + @property + def bigframes_remote_function(self): + return None if self._is_managed else str(self._udf_def.routine_ref) + + @property + def is_row_processor(self) -> bool: + return False + + @property + def udf_def(self) -> udf_def.BigqueryUdf: + return self._udf_def + + @property + def bigframes_cloud_function(self) -> Optional[str]: + return self._cloud_function + + @property + def input_dtypes(self): + return self.udf_def.signature.bf_input_types + + @property + def output_dtype(self): + return self.udf_def.signature.bf_output_type + + @property + def bigframes_bigquery_function_output_dtype(self): + return self.output_dtype + + def _post_process_series( + self, series: bigframes.series.Series + ) -> bigframes.series.Series: + if self._post_routine is not None: + return self._post_routine(series) + return series + + +class 
BigqueryCallableRowRoutine: + """ + A reference to a routine in the context of a session. - func.is_row_processor = is_row_processor # type: ignore - func.ibis_node = node # type: ignore - return func + Can be used both directly as a callable, or as an input to dataframe ops that take a callable. + """ + + def __init__( + self, + udf_def: udf_def.BigqueryUdf, + session: bigframes.Session, + *, + local_func: Optional[Callable] = None, + cloud_function_ref: Optional[str] = None, + post_routine: Optional[ + Callable[[bigframes.series.Series], bigframes.series.Series] + ] = None, + is_managed: bool = False, + ): + self._udf_def = udf_def + self._session = session + self._post_routine = post_routine + self._local_fun = local_func + self._cloud_function = cloud_function_ref + self._is_managed = is_managed + + def __call__(self, *args, **kwargs): + if self._local_fun: + return self._local_fun(*args, **kwargs) + # avoid circular imports + import bigframes.core.sql as bf_sql + import bigframes.session._io.bigquery as bf_io_bigquery + + args_string = ", ".join(map(bf_sql.simple_literal, args)) + sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" + iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + return list(iter.to_arrow().to_pydict().values())[0][0] + + @property + def bigframes_bigquery_function(self) -> str: + return str(self._udf_def.routine_ref) + + @property + def bigframes_remote_function(self): + return None if self._is_managed else str(self._udf_def.routine_ref) + + @property + def is_row_processor(self) -> bool: + return True + + @property + def udf_def(self) -> udf_def.BigqueryUdf: + return self._udf_def + + @property + def bigframes_cloud_function(self) -> Optional[str]: + return self._cloud_function + + @property + def input_dtypes(self): + return self.udf_def.signature.bf_input_types + + @property + def output_dtype(self): + return self.udf_def.signature.bf_output_type + + @property + def bigframes_bigquery_function_output_dtype(self): + return self.output_dtype + + def _post_process_series( + self, series: bigframes.series.Series + ) -> bigframes.series.Series: + if self._post_routine is not None: + return self._post_routine(series) + return series diff --git a/bigframes/functions/function_typing.py b/bigframes/functions/function_typing.py new file mode 100644 index 0000000000..f2fa794456 --- /dev/null +++ b/bigframes/functions/function_typing.py @@ -0,0 +1,122 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, get_args, get_origin, Type + +from google.cloud import bigquery + +import bigframes.dtypes + +# Input and output types supported by BigQuery DataFrames remote functions. 
+# TODO(shobs): Extend the support to all types supported by BQ remote functions +# https://cloud.google.com/bigquery/docs/remote-functions#limitations +RF_SUPPORTED_IO_PYTHON_TYPES = { + bool: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.BOOL), + bytes: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.BYTES), + float: bigquery.StandardSqlDataType( + type_kind=bigquery.StandardSqlTypeNames.FLOAT64 + ), + int: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.INT64), + str: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.STRING), +} + +# Support array output types in BigQuery DataFrames remote functions even though +# it is not currently (2024-10-06) supported in BigQuery remote functions. +# https://cloud.google.com/bigquery/docs/remote-functions#limitations +# TODO(b/284515241): remove this special handling when BigQuery remote functions +# support array. +RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES = {bool, float, int, str} + +DEFAULT_RF_TYPE = RF_SUPPORTED_IO_PYTHON_TYPES[float] + +RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS = { + "BOOLEAN", + "BOOL", + "BYTES", + "FLOAT", + "FLOAT64", + "INT64", + "INTEGER", + "STRING", + "ARRAY", +} + + +TIMEDELTA_DESCRIPTION_TAG = "#microseconds" + + +class UnsupportedTypeError(ValueError): + def __init__(self, type_, supported_types): + self.type = type_ + self.supported_types = supported_types + super().__init__( + f"'{type_}' is not one of the supported types {supported_types}" + ) + + +def sdk_type_from_python_type( + t: type, allow_lists: bool = False +) -> bigquery.StandardSqlDataType: + if (get_origin(t) is list) and allow_lists: + return sdk_array_output_type_from_python_type(t) + if t not in RF_SUPPORTED_IO_PYTHON_TYPES: + raise UnsupportedTypeError(t, RF_SUPPORTED_IO_PYTHON_TYPES) + return RF_SUPPORTED_IO_PYTHON_TYPES[t] + + +def sdk_array_output_type_from_python_type(t: type) -> bigquery.StandardSqlDataType: + array_of = get_args(t)[0] + if array_of not in RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: + raise UnsupportedTypeError(array_of, RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES) + inner_type = RF_SUPPORTED_IO_PYTHON_TYPES[array_of] + return bigquery.StandardSqlDataType( + type_kind=bigquery.StandardSqlTypeNames.ARRAY, array_element_type=inner_type + ) + + +def sdk_type_to_bf_type( + sdk_type: bigquery.StandardSqlDataType, +) -> bigframes.dtypes.Dtype: + if sdk_type.array_element_type is not None: + return bigframes.dtypes.list_type( + sdk_type_to_bf_type(sdk_type.array_element_type) + ) + if sdk_type.struct_type is not None: + raise ValueError("Cannot handle struct types in remote function") + assert sdk_type.type_kind is not None + return bigframes.dtypes._TK_TO_BIGFRAMES[sdk_type.type_kind.name] + + +def sdk_type_to_py_type( + sdk_type: bigquery.StandardSqlDataType, +) -> Type[Any]: + if sdk_type.array_element_type is not None: + return list[sdk_type_to_py_type(sdk_type.array_element_type)] # type: ignore + if sdk_type.struct_type is not None: + raise ValueError("Cannot handle struct types in remote function") + for key, value in RF_SUPPORTED_IO_PYTHON_TYPES.items(): + if value == sdk_type: + return key + raise ValueError(f"Cannot handle {sdk_type} in remote function") + + +def sdk_type_to_sql_string( + sdk_type: bigquery.StandardSqlDataType, +) -> str: + if sdk_type.array_element_type is not None: + return f"ARRAY<{sdk_type_to_sql_string(sdk_type.array_element_type)}>" + if sdk_type.struct_type is not None: + raise ValueError("Cannot handle struct types in remote function") 
+ assert sdk_type.type_kind is not None + return sdk_type.type_kind.name diff --git a/bigframes/functions/udf_def.py b/bigframes/functions/udf_def.py new file mode 100644 index 0000000000..078e45f32d --- /dev/null +++ b/bigframes/functions/udf_def.py @@ -0,0 +1,173 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import dataclasses +import inspect +from typing import cast, Optional +import warnings + +from google.cloud import bigquery + +import bigframes.dtypes +import bigframes.exceptions as bfe +import bigframes.formatting_helpers as bf_formatting +from bigframes.functions import function_typing + + +class ReturnTypeMissingError(ValueError): + pass + + +@dataclasses.dataclass(frozen=True) +class UdfField: + name: str = dataclasses.field() + dtype: bigquery.StandardSqlDataType = dataclasses.field(hash=False, compare=False) + + @classmethod + def from_sdk(cls, arg: bigquery.RoutineArgument) -> UdfField: + assert arg.name is not None + assert arg.data_type is not None + return cls(arg.name, arg.data_type) + + +@dataclasses.dataclass(frozen=True) +class UdfSignature: + input_types: tuple[UdfField, ...] = dataclasses.field() + output_bq_type: bigquery.StandardSqlDataType = dataclasses.field( + hash=False, compare=False + ) + + @property + def bf_input_types(self) -> tuple[bigframes.dtypes.Dtype, ...]: + return tuple( + function_typing.sdk_type_to_bf_type(arg.dtype) for arg in self.input_types + ) + + @property + def bf_output_type(self) -> bigframes.dtypes.Dtype: + return function_typing.sdk_type_to_bf_type(self.output_bq_type) + + @property + def py_input_types(self) -> tuple[type, ...]: + return tuple( + function_typing.sdk_type_to_py_type(arg.dtype) for arg in self.input_types + ) + + @property + def py_output_type(self) -> type: + return function_typing.sdk_type_to_py_type(self.output_bq_type) + + @property + def sql_input_types(self) -> tuple[str, ...]: + return tuple( + function_typing.sdk_type_to_sql_string(arg.dtype) + for arg in self.input_types + ) + + @property + def sql_output_type(self) -> str: + return function_typing.sdk_type_to_sql_string(self.output_bq_type) + + @classmethod + def from_routine(cls, routine: bigquery.Routine) -> UdfSignature: + if routine.return_type is None: + raise ReturnTypeMissingError + bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) + + if ( + bq_return_type.type_kind is None + or bq_return_type.type_kind + not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ): + raise ValueError( + f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" + ) + + udf_fields = [] + for argument in routine.arguments: + if argument.data_type is None: + msg = bfe.format_message( + "The function has one or more missing input data types. BigQuery DataFrames " + f"will assume default data type {function_typing.DEFAULT_RF_TYPE} for them." 
+ ) + warnings.warn(msg, category=bfe.UnknownDataTypeWarning) + assert argument.name is not None + udf_fields.append( + UdfField(argument.name, function_typing.DEFAULT_RF_TYPE) + ) + else: + udf_fields.append(UdfField.from_sdk(argument)) + + return cls( + input_types=tuple(udf_fields), + output_bq_type=bq_return_type, + ) + + @classmethod + def from_py_signature(cls, signature: inspect.Signature): + input_types: list[UdfField] = [] + for parameter in signature.parameters.values(): + if parameter.annotation is inspect.Signature.empty: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + "'input_types' was not set and parameter " + f"'{parameter.name}' is missing a type annotation. " + "Types are required to use @remote_function.", + ) + bq_type = function_typing.sdk_type_from_python_type(parameter.annotation) + input_types.append(UdfField(parameter.name, bq_type)) + + if signature.return_annotation is inspect.Signature.empty: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + "'output_type' was not set and function is missing a " + "return type annotation. Types are required to use " + "@remote_function.", + ) + output_bq_type = function_typing.sdk_type_from_python_type( + signature.return_annotation, + allow_lists=True, + ) + return cls(tuple(input_types), output_bq_type) + + +@dataclasses.dataclass(frozen=True) +class BigqueryUdf: + routine_ref: bigquery.RoutineReference = dataclasses.field() + signature: UdfSignature + # Used to provide alternative interpretations of output bq type, eg interpret int as timestamp + output_type_override: Optional[bigframes.dtypes.Dtype] = dataclasses.field( + default=None + ) + + @property + def bigframes_output_type(self) -> bigframes.dtypes.Dtype: + return self.output_type_override or function_typing.sdk_type_to_bf_type( + self.signature.output_bq_type + ) + + @classmethod + def from_routine(cls, routine: bigquery.Routine) -> BigqueryUdf: + signature = UdfSignature.from_routine(routine) + + if ( + signature.output_bq_type.type_kind is None + or signature.output_bq_type.type_kind + not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ): + raise ValueError( + f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" + ) + return cls(routine.reference, signature=signature) diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 51cfccbc41..e610ce61d6 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -15,13 +15,15 @@ import dataclasses import typing +from bigframes.functions import udf_def from bigframes.operations import base_ops +# TODO: Enforce input type constraints from function def @dataclasses.dataclass(frozen=True) class RemoteFunctionOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "remote_function" - func: typing.Callable + function_def: udf_def.BigqueryUdf apply_on_null: bool @property @@ -29,45 +31,30 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - # The output dtype should be set to a valid Dtype by @udf decorator, - # @remote_function decorator, or read_gbq_function method. 
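A rough sketch of how the new dataclasses fit together for a simple annotated function, using the classes introduced above; the routine path is hypothetical:

import inspect
from google.cloud import bigquery
from bigframes.functions import udf_def

def add_one(x: int) -> float:
    return x + 1.0

# Python annotations -> BigQuery SQL types (int -> INT64 input, float -> FLOAT64 output)
# via function_typing.sdk_type_from_python_type.
sig = udf_def.UdfSignature.from_py_signature(inspect.signature(add_one))

udf = udf_def.BigqueryUdf(
    routine_ref=bigquery.RoutineReference.from_string("my-project.my_dataset.add_one"),
    signature=sig,
)

# RemoteFunctionOp and friends no longer probe attributes on a callable; they read
# the output dtype straight from the definition.
assert udf.bigframes_output_type == sig.bf_output_type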
- if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): - return self.func.bigframes_bigquery_function_output_dtype - - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + return self.function_def.bigframes_output_type @dataclasses.dataclass(frozen=True) class BinaryRemoteFunctionOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "binary_remote_function" - func: typing.Callable + function_def: udf_def.BigqueryUdf @property def expensive(self) -> bool: return True def output_type(self, *input_types): - # The output dtype should be set to a valid Dtype by @udf decorator, - # @remote_function decorator, or read_gbq_function method. - if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): - return self.func.bigframes_bigquery_function_output_dtype - - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + return self.function_def.bigframes_output_type @dataclasses.dataclass(frozen=True) class NaryRemoteFunctionOp(base_ops.NaryOp): name: typing.ClassVar[str] = "nary_remote_function" - func: typing.Callable + function_def: udf_def.BigqueryUdf @property def expensive(self) -> bool: return True def output_type(self, *input_types): - # The output dtype should be set to a valid Dtype by @udf decorator, - # @remote_function decorator, or read_gbq_function method. - if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): - return self.func.bigframes_bigquery_function_output_dtype - - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + return self.function_def.bigframes_output_type diff --git a/bigframes/series.py b/bigframes/series.py index 1bb0c1e0dc..7a318c4c70 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -66,6 +66,7 @@ import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as formatter +import bigframes.functions import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.base @@ -1841,7 +1842,7 @@ def apply( " are supported." ) - if not hasattr(func, "bigframes_bigquery_function"): + if not isinstance(func, bigframes.functions.BigqueryCallableRoutine): # It is neither a remote function nor a managed function. # Then it must be a vectorized function that applies to the Series # as a whole. @@ -1873,24 +1874,9 @@ def apply( # We are working with bigquery function at this point result_series = self._apply_unary_op( - ops.RemoteFunctionOp(func=func, apply_on_null=True) + ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True) ) - - # If the result type is string but the function output is intended to - # be an array, reconstruct the array from the string assuming it is a - # json serialized form of the array. - if bigframes.dtypes.is_string_like( - result_series.dtype - ) and bigframes.dtypes.is_array_like(func.output_dtype): - import bigframes.bigquery as bbq - - result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - func.output_dtype.pyarrow_dtype.value_type - ) - result_series = bbq.json_extract_string_array( - result_series, value_dtype=result_dtype - ) - + result_series = func._post_process_series(result_series) return result_series def combine( @@ -1905,7 +1891,7 @@ def combine( " are supported." 
) - if not hasattr(func, "bigframes_bigquery_function"): + if not isinstance(func, bigframes.functions.BigqueryCallableRoutine): # Keep this in sync with .apply try: return func(self, other) @@ -1918,24 +1904,9 @@ def combine( raise result_series = self._apply_binary_op( - other, ops.BinaryRemoteFunctionOp(func=func) + other, ops.BinaryRemoteFunctionOp(function_def=func.udf_def) ) - - # If the result type is string but the function output is intended to - # be an array, reconstruct the array from the string assuming it is a - # json serialized form of the array. - if bigframes.dtypes.is_string_like( - result_series.dtype - ) and bigframes.dtypes.is_array_like(func.output_dtype): - import bigframes.bigquery as bbq - - result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - func.output_dtype.pyarrow_dtype.value_type - ) - result_series = bbq.json_extract_string_array( - result_series, value_dtype=result_dtype - ) - + result_series = func._post_process_series(result_series) return result_series @validations.requires_index diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b6066daed3..7597f8eeed 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -181,18 +181,6 @@ def __init__( # the ibis client has been created original_default_query_job_config = self.bqclient.default_query_job_config - # Only used to fetch remote function metadata. - # TODO: Remove in favor of raw bq client - - self.ibis_client = typing.cast( - ibis_bigquery.Backend, - ibis_bigquery.Backend().connect( - project_id=context.project, - client=self.bqclient, - storage_client=self.bqstoragereadclient, - ), - ) - self.bqclient.default_query_job_config = original_default_query_job_config # Resolve the BQ connection for remote function and Vertex AI integration diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 5e5de2d0b2..723841a672 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -89,7 +89,6 @@ def __init__(self): self._location = None # type: ignore self._bq_kms_key_name = None # type: ignore self._clients_provider = None # type: ignore - self.ibis_client = None # type: ignore self._bq_connection = None # type: ignore self._skip_bq_connection_check = True self._session_id: str = "test_session" diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4605d9ddbc..a75918ed23 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -22,7 +22,6 @@ import typing from typing import Dict, Generator, Optional -import bigframes_vendored.ibis.backends as ibis_backends import google.api_core.exceptions import google.cloud.bigquery as bigquery import google.cloud.bigquery_connection_v1 as bigquery_connection_v1 @@ -109,11 +108,6 @@ def bigquery_client_tokyo(session_tokyo: bigframes.Session) -> bigquery.Client: return session_tokyo.bqclient -@pytest.fixture(scope="session") -def ibis_client(session: bigframes.Session) -> ibis_backends.BaseBackend: - return session.ibis_client - - @pytest.fixture(scope="session") def bigqueryconnection_client( session: bigframes.Session, diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 5cb54a00c1..ad5849eb2f 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -19,6 +19,8 @@ import test_utils.prefixer import bigframes +import bigframes.dataframe +import bigframes.dtypes 
import bigframes.exceptions as bfe import bigframes.pandas as bpd from bigframes.testing.utils import cleanup_function_assets @@ -26,105 +28,6 @@ prefixer = test_utils.prefixer.Prefixer("bigframes", "") -def test_managed_function_multiply_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, -): - - try: - - @session.udf( - input_types=[int, int], - output_type=int, - dataset=dataset_id, - name=prefixer.create_prefix(), - ) - def multiply(x, y): - return x * y - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = multiply(col, 2).name("int64_col_2x") - col_square = multiply(col, col).name("int64_col_square") - table = table.mutate([col_2x, col_square]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * 2, - pandas_df_new["int64_col_2x"], - check_names=False, - ) - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * pandas_df_orig[col_name], - pandas_df_new["int64_col_square"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the managed function. - cleanup_function_assets(multiply, bigquery_client, ignore_failures=False) - - -def test_managed_function_stringify_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, -): - try: - - @session.udf( - input_types=[int], - output_type=str, - dataset=dataset_id, - name=prefixer.create_prefix(), - ) - def stringify(x): - return f"I got {x}" - - # Function should work locally. - assert stringify(8912) == "I got 8912" - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = stringify.ibis_node(col).name("int64_str_col") - table = table.mutate([col_2x]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name].apply(lambda x: f"I got {x}"), - pandas_df_new["int64_str_col"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the managed function. 
- cleanup_function_assets(stringify, bigquery_client, ignore_failures=False) - - def test_managed_function_array_output(session, scalars_dfs, dataset_id): try: @@ -150,7 +53,7 @@ def featurize(x: int) -> list[float]: featurize_ref = session.read_gbq_function(featurize.bigframes_bigquery_function) assert hasattr(featurize_ref, "bigframes_bigquery_function") - assert not hasattr(featurize_ref, "bigframes_remote_function") + assert featurize_ref.bigframes_remote_function is None assert ( featurize_ref.bigframes_bigquery_function == featurize.bigframes_bigquery_function @@ -184,7 +87,6 @@ def foo(x: int) -> bytes: assert foo(-2) == bytes(2) assert hasattr(foo, "bigframes_bigquery_function") - assert hasattr(foo, "ibis_node") assert hasattr(foo, "input_dtypes") assert hasattr(foo, "output_dtype") assert hasattr(foo, "bigframes_bigquery_function_output_dtype") @@ -208,7 +110,7 @@ def foo(x: int) -> bytes: function_name=foo.bigframes_bigquery_function, # type: ignore ) assert hasattr(foo_ref, "bigframes_bigquery_function") - assert not hasattr(foo_ref, "bigframes_remote_function") + assert foo_ref.bigframes_remote_function is None assert foo.bigframes_bigquery_function == foo_ref.bigframes_bigquery_function # type: ignore bf_result_col_gbq = scalars_df["int64_too"].apply(foo_ref) @@ -358,7 +260,7 @@ def add_list(x: int, y: int) -> list[int]: ) assert hasattr(add_list_managed_func_ref, "bigframes_bigquery_function") - assert not hasattr(add_list_managed_func_ref, "bigframes_remote_function") + assert add_list_managed_func_ref.bigframes_remote_function is None assert ( add_list_managed_func_ref.bigframes_bigquery_function == add_list_managed_func.bigframes_bigquery_function @@ -515,16 +417,16 @@ def test_managed_function_dataframe_apply_axis_1_array_output(session, dataset_i # Assert the dataframe dtypes. assert tuple(bf_df.dtypes) == expected_dtypes - try: + @session.udf( + input_types=[int, float, str], + output_type=list[str], + dataset=dataset_id, + name=prefixer.create_prefix(), + ) + def foo(x, y, z): + return [str(x), str(y), z] - @session.udf( - input_types=[int, float, str], - output_type=list[str], - dataset=dataset_id, - name=prefixer.create_prefix(), - ) - def foo(x, y, z): - return [str(x), str(y), z] + try: assert getattr(foo, "is_row_processor") is False assert getattr(foo, "input_dtypes") == expected_dtypes @@ -585,7 +487,7 @@ def foo(x, y, z): foo_ref = session.read_gbq_function(foo.bigframes_bigquery_function) assert hasattr(foo_ref, "bigframes_bigquery_function") - assert not hasattr(foo_ref, "bigframes_remote_function") + assert foo_ref.bigframes_remote_function is None assert foo_ref.bigframes_bigquery_function == foo.bigframes_bigquery_function # Test on the function from read_gbq_function. 
diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 172fff3010..5e60f3ed9f 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -25,7 +25,6 @@ import google.api_core.exceptions from google.cloud import bigquery, functions_v2, storage import pandas -import pyarrow import pytest import test_utils.prefixer @@ -97,118 +96,6 @@ def bq_cf_connection() -> str: return "bigframes-rf-conn" -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_multiply_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, - bq_cf_connection, -): - try: - - @session.remote_function( - # Make sure that the input/output types can be used positionally. - # This avoids the worst of the breaking change from 1.x to 2.x. - [int, int], - int, - dataset_id, - bigquery_connection=bq_cf_connection, - reuse=False, - cloud_function_service_account="default", - ) - def multiply(x, y): - return x * y - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = multiply(col, 2).name("int64_col_2x") - col_square = multiply(col, col).name("int64_col_square") - table = table.mutate([col_2x, col_square]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * 2, - pandas_df_new["int64_col_2x"], - check_names=False, - ) - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * pandas_df_orig[col_name], - pandas_df_new["int64_col_square"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the remote function - cleanup_function_assets(multiply, bigquery_client, session.cloudfunctionsclient) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_stringify_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, - bq_cf_connection, -): - try: - - @session.remote_function( - # Make sure that the input/output types can be used positionally. - # This avoids the worst of the breaking change from 1.x to 2.x. - [int], - str, - dataset_id, - bigquery_connection=bq_cf_connection, - reuse=False, - cloud_function_service_account="default", - ) - def stringify(x): - return f"I got {x}" - - # Function should work locally. 
- assert stringify(42) == "I got 42" - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = stringify.ibis_node(col).name("int64_str_col") - table = table.mutate([col_2x]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name].apply(lambda x: f"I got {x}"), - pandas_df_new["int64_str_col"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the remote function - cleanup_function_assets( - stringify, bigquery_client, session.cloudfunctionsclient - ) - - @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_binop(session, scalars_dfs, dataset_id, bq_cf_connection): try: @@ -2365,13 +2252,6 @@ def foo(x, y, z): assert getattr(foo, "is_row_processor") is False assert getattr(foo, "input_dtypes") == expected_dtypes - assert getattr(foo, "output_dtype") == pandas.ArrowDtype( - pyarrow.list_( - bigframes.dtypes.bigframes_dtype_to_arrow_dtype( - bigframes.dtypes.STRING_DTYPE - ) - ) - ) assert ( getattr(foo, "bigframes_bigquery_function_output_dtype") == bigframes.dtypes.STRING_DTYPE diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 47ab6e2174..d5d8b29786 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -99,7 +99,7 @@ def get_bq_connection_id_path_format(connection_id_dot_format): return f"projects/{fields[0]}/locations/{fields[1]}/connections/{fields[2]}" -@pytest.mark.flaky(retries=2, delay=120) +# @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, bigqueryconnection_client, @@ -134,7 +134,6 @@ def square(x): assert hasattr(square, "bigframes_remote_function") assert hasattr(square, "bigframes_bigquery_function") assert hasattr(square, "bigframes_cloud_function") - assert hasattr(square, "ibis_node") scalars_df, scalars_pandas_df = scalars_dfs @@ -718,7 +717,7 @@ def square1(x): assert square2.bigframes_remote_function assert square2.bigframes_bigquery_function - assert not hasattr(square2, "bigframes_cloud_function") + assert square2.bigframes_cloud_function is None # They should point to the same function. 
assert square1.bigframes_remote_function == square2.bigframes_remote_function # type: ignore diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 37658bc436..77392bea2f 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -272,15 +272,3 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): ValueError, ): bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"}) - - -def test_remote_function_io_types_are_supported_bigframes_types(): - from bigframes_vendored.ibis.expr.datatypes.core import ( - dtype as python_type_to_ibis_type, - ) - - from bigframes.dtypes import RF_SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types - - for python_type in rf_supported_io_types: - ibis_type = python_type_to_ibis_type(python_type) - assert ibis_type in bigframes.core.compile.ibis_types.IBIS_TO_BIGFRAMES diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 259a4390bc..978281e5c9 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -14,12 +14,9 @@ import re -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -from bigframes_vendored.ibis.expr import datatypes as ibis_types import pandas import pytest -import bigframes.dtypes import bigframes.functions.function as bff import bigframes.series from bigframes.testing import mocks @@ -56,26 +53,6 @@ def axis_1_function(myparam: series_type) -> str: # type: ignore # Still works as a normal function. assert axis_1_function(pandas.Series({"str_col": "World"})) == "Hello, World!" - assert axis_1_function.ibis_node is not None - - -def test_supported_types_correspond(): - # The same types should be representable by the supported Python and BigQuery types. - ibis_types_from_python = { - ibis_types.dtype(t) for t in bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES - } - ibis_types_from_bigquery = { - third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) - for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS - # TODO(b/284515241): ARRAY is the only exception because it is supported - # as an output type of the BQ routine in the read_gbq_function path but - # not in the remote function path. Remove this handline once BQ remote - # functions supports ARRAY output and the bigframes remote functions - # utilizes that to support array output. 
- if tk != "ARRAY" - } - - assert ibis_types_from_python == ibis_types_from_bigquery def test_missing_input_types(): diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index 3eceb99331..9743297e99 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -15,8 +15,7 @@ import bigframes_vendored.constants as constants import pytest -import bigframes.dtypes -from bigframes.functions import _utils +from bigframes.functions import _utils, function_typing @pytest.mark.parametrize( @@ -133,7 +132,7 @@ def test_get_python_output_type_from_bigframes_metadata( def test_metadata_roundtrip_supported_array_types(): - for array_of in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: + for array_of in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: ser = _utils.get_bigframes_metadata(python_output_type=list[array_of]) # type: ignore deser = _utils.get_python_output_type_from_bigframes_metadata(ser) assert deser == list[array_of] # type: ignore diff --git a/third_party/bigframes_vendored/ibis/expr/operations/udf.py b/third_party/bigframes_vendored/ibis/expr/operations/udf.py index 4fb25a9d34..91366cace8 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/udf.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/udf.py @@ -109,6 +109,7 @@ def _make_node( database: str | None = None, catalog: str | None = None, signature: tuple[tuple, Any] | None = None, + param_name_overrides: tuple[str, ...] | None = None, **kwargs, ) -> type[S]: """Construct a scalar user-defined function that is built-in to the backend.""" @@ -133,7 +134,7 @@ def _make_node( else: arg_types, return_annotation = signature - arg_names = list(inspect.signature(fn).parameters) + arg_names = param_name_overrides or list(inspect.signature(fn).parameters) fields = { arg_name: Argument(pattern=rlz.ValueOf(typ), typehint=typ) for arg_name, typ in zip(arg_names, arg_types) From 8ebfa57602ce02573384232f978304eb7cf4abdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 13 Jun 2025 16:54:14 -0500 Subject: [PATCH 17/23] chore: `_read_gbq_colab` supports querying a pandas DataFrame (#1801) * chore: `_read_gbq_colab` supports querying a pandas DataFrame * make more unit test * add session and dry_run arguments * add dry_run to to_view * initial pandas support with slow dry_run * speed up dry run * test with inline sql and load jobs * Update bigframes/core/pyformat.py * remove redundant test * Update tests/unit/session/test_read_gbq_colab.py * add dry run that works without a session * fix unit test * add unit tests for sessionless dry run * avoid binding to a location too early * dont try to set the default location unless its not a dry run * dont try to run any assertion on the response type * add support for small ints and floats * don't cast from float16 in earlier versions of arrow * rename _to_view to _to_placeholder_table * deduplicate column names in dry run * only allow lossless conversion if explicitly requested --- bigframes/core/blocks.py | 72 +++++-- bigframes/core/local_data.py | 4 +- bigframes/core/pyformat.py | 64 +++++- bigframes/core/tools/bigquery_schema.py | 48 +++++ bigframes/dataframe.py | 8 +- bigframes/dtypes.py | 29 ++- bigframes/pandas/io/api.py | 85 ++++++-- bigframes/session/__init__.py | 12 +- .../small/session/test_read_gbq_colab.py | 146 +++++++++++++- tests/unit/core/test_pyformat.py | 115 +++++++++-- 
tests/unit/core/tools/test_bigquery_schema.py | 187 ++++++++++++++++++ tests/unit/pandas/io/test_api.py | 31 ++- tests/unit/session/test_read_gbq_colab.py | 55 ++++-- tests/unit/test_local_data.py | 66 +++++++ 14 files changed, 848 insertions(+), 74 deletions(-) create mode 100644 bigframes/core/tools/bigquery_schema.py create mode 100644 tests/unit/core/tools/test_bigquery_schema.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 4607928b78..675e8c8b7a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -154,6 +154,7 @@ def __init__( self._stats_cache[" ".join(self.index_columns)] = {} self._transpose_cache: Optional[Block] = transpose_cache self._view_ref: Optional[bigquery.TableReference] = None + self._view_ref_dry_run: Optional[bigquery.TableReference] = None @classmethod def from_local( @@ -2459,19 +2460,19 @@ def is_monotonic_decreasing( ) -> bool: return self._is_monotonic(column_id, increasing=False) - def to_sql_query( - self, include_index: bool, enable_cache: bool = True - ) -> typing.Tuple[str, list[str], list[Label]]: + def _array_value_for_output( + self, *, include_index: bool + ) -> Tuple[bigframes.core.ArrayValue, list[str], list[Label]]: """ - Compiles this DataFrame's expression tree to SQL, optionally - including index columns. + Creates the expression tree with user-visible column names, such as for + SQL output. Args: include_index (bool): whether to include index columns. Returns: - a tuple of (sql_string, index_column_id_list, index_column_label_list). + a tuple of (ArrayValue, index_column_id_list, index_column_label_list). If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. """ @@ -2494,25 +2495,72 @@ def to_sql_query( # the BigQuery unicode column name feature? substitutions[old_id] = new_id + return ( + array_value.rename_columns(substitutions), + new_ids[: len(idx_labels)], + idx_labels, + ) + + def to_sql_query( + self, include_index: bool, enable_cache: bool = True + ) -> Tuple[str, list[str], list[Label]]: + """ + Compiles this DataFrame's expression tree to SQL, optionally + including index columns. + + Args: + include_index (bool): + whether to include index columns. + + Returns: + a tuple of (sql_string, index_column_id_list, index_column_label_list). + If include_index is set to False, index_column_id_list and index_column_label_list + return empty lists. + """ + array_value, idx_ids, idx_labels = self._array_value_for_output( + include_index=include_index + ) + # Note: this uses the sql from the executor, so is coupled tightly to execution # implementaton. It will reference cached tables instead of original data sources. # Maybe should just compile raw BFET? Depends on user intent. - sql = self.session._executor.to_sql( - array_value.rename_columns(substitutions), enable_cache=enable_cache - ) + sql = self.session._executor.to_sql(array_value, enable_cache=enable_cache) return ( sql, - new_ids[: len(idx_labels)], + idx_ids, idx_labels, ) - def to_view(self, include_index: bool) -> bigquery.TableReference: + def to_placeholder_table( + self, include_index: bool, *, dry_run: bool = False + ) -> bigquery.TableReference: """ - Creates a temporary BigQuery VIEW with the SQL corresponding to this block. + Creates a temporary BigQuery VIEW (or empty table if dry_run) with the + SQL corresponding to this block. 
""" if self._view_ref is not None: return self._view_ref + # Prefer the real view if it exists, but since dry_run might be called + # many times before the real query, we cache that empty table reference + # with the correct schema too. + if dry_run: + if self._view_ref_dry_run is not None: + return self._view_ref_dry_run + + # Create empty temp table with the right schema. + array_value, _, _ = self._array_value_for_output( + include_index=include_index + ) + temp_table_schema = array_value.schema.to_bigquery() + self._view_ref_dry_run = self.session._create_temp_table( + schema=temp_table_schema + ) + return self._view_ref_dry_run + + # We shouldn't run `to_sql_query` if we have a `dry_run`, because it + # could cause us to make unnecessary API calls to upload local node + # data. sql, _, _ = self.to_sql_query(include_index=include_index) self._view_ref = self.session._create_temp_view(sql) return self._view_ref diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index da1c174bc4..a99366ad4c 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -336,7 +336,9 @@ def _adapt_arrow_array(array: pa.Array) -> tuple[pa.Array, bigframes.dtypes.Dtyp if target_type != array.type: # TODO: Maybe warn if lossy conversion? array = array.cast(target_type) - bf_type = bigframes.dtypes.arrow_dtype_to_bigframes_dtype(target_type) + bf_type = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( + target_type, allow_lossless_cast=True + ) storage_type = _get_managed_storage_type(bf_type) if storage_type != array.type: diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 59ccdf1f5f..eab86dc629 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -21,10 +21,15 @@ import string import typing -from typing import Any, Union +from typing import Any, Optional, Union import google.cloud.bigquery -import google.cloud.bigquery.table +import pandas + +from bigframes.core import utils +import bigframes.core.local_data +from bigframes.core.tools import bigquery_schema +import bigframes.session _BQ_TABLE_TYPES = Union[ google.cloud.bigquery.Table, @@ -37,9 +42,51 @@ def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" +def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str: + # Ensure there are no duplicate column labels. + # + # Please make sure this stays in sync with the logic used to_gbq(). See + # bigframes.dataframe.DataFrame._prepare_export(). + new_col_labels, new_idx_labels = utils.get_standardized_ids( + pd_df.columns, pd_df.index.names + ) + pd_copy = pd_df.copy() + pd_copy.columns = pandas.Index(new_col_labels) + pd_copy.index.names = new_idx_labels + + managed_table = bigframes.core.local_data.ManagedArrowTable.from_pandas(pd_copy) + bqschema = managed_table.schema.to_bigquery() + return bigquery_schema.to_sql_dry_run(bqschema) + + +def _pandas_df_to_sql( + df_pd: pandas.DataFrame, + *, + name: str, + session: Optional[bigframes.session.Session] = None, + dry_run: bool = False, +) -> str: + if session is None: + if not dry_run: + message = ( + f"Can't embed pandas DataFrame {name} in a SQL " + "string without a bigframes session except if for a dry run." + ) + raise ValueError(message) + + return _pandas_df_to_sql_dry_run(df_pd) + + # Use the _deferred engine to avoid loading data too often during dry run. 
+ df = session.read_pandas(df_pd, write_engine="_deferred") + return _table_to_sql(df._to_placeholder_table(dry_run=dry_run)) + + def _field_to_template_value( name: str, value: Any, + *, + session: Optional[bigframes.session.Session] = None, + dry_run: bool = False, ) -> str: """Convert value to something embeddable in a SQL string.""" import bigframes.core.sql # Avoid circular imports @@ -51,9 +98,11 @@ def _field_to_template_value( if isinstance(value, table_types): return _table_to_sql(value) - # TODO(tswast): convert pandas DataFrame objects to gbq tables or a literals subquery. + if isinstance(value, pandas.DataFrame): + return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name) + if isinstance(value, bigframes.dataframe.DataFrame): - return _table_to_sql(value._to_view()) + return _table_to_sql(value._to_placeholder_table(dry_run=dry_run)) return bigframes.core.sql.simple_literal(value) @@ -70,6 +119,7 @@ def _validate_type(name: str, value: Any): typing.get_args(_BQ_TABLE_TYPES) + typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES) + (bigframes.dataframe.DataFrame,) + + (pandas.DataFrame,) ) if not isinstance(value, supported_types): @@ -91,6 +141,8 @@ def pyformat( sql_template: str, *, pyformat_args: dict, + session: Optional[bigframes.session.Session] = None, + dry_run: bool = False, ) -> str: """Unsafe Python-style string formatting of SQL string. @@ -115,6 +167,8 @@ def pyformat( format_kwargs = {} for name in fields: value = pyformat_args[name] - format_kwargs[name] = _field_to_template_value(name, value) + format_kwargs[name] = _field_to_template_value( + name, value, session=session, dry_run=dry_run + ) return sql_template.format(**format_kwargs) diff --git a/bigframes/core/tools/bigquery_schema.py b/bigframes/core/tools/bigquery_schema.py new file mode 100644 index 0000000000..227a69e0f7 --- /dev/null +++ b/bigframes/core/tools/bigquery_schema.py @@ -0,0 +1,48 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for working with BigQuery SchemaFields.""" + +from typing import Tuple + +import google.cloud.bigquery + + +def _type_to_sql(field: google.cloud.bigquery.SchemaField): + """Turn the type information of the field into SQL. + + Ignores the mode, since this has already been handled by _field_to_sql. + """ + if field.field_type.casefold() in ("record", "struct"): + return _to_struct(field.fields) + return field.field_type + + +def _field_to_sql(field: google.cloud.bigquery.SchemaField): + if field.mode == "REPEATED": + # Unlike other types, ARRAY are represented as mode="REPEATED". To get + # the array type, we use SchemaField object but ignore the mode. 
+ return f"`{field.name}` ARRAY<{_type_to_sql(field)}>" + + return f"`{field.name}` {_type_to_sql(field)}" + + +def _to_struct(bqschema: Tuple[google.cloud.bigquery.SchemaField, ...]): + fields = [_field_to_sql(field) for field in bqschema] + return f"STRUCT<{', '.join(fields)}>" + + +def to_sql_dry_run(bqschema: Tuple[google.cloud.bigquery.SchemaField, ...]): + """Create an empty table expression with the correct schema.""" + return f"UNNEST(ARRAY<{_to_struct(bqschema)}>[])" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7e5bb3049a..38879d3ec0 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -404,11 +404,13 @@ def _should_sql_have_index(self) -> bool: self.index.name is not None or len(self.index.names) > 1 ) - def _to_view(self) -> bigquery.TableReference: + def _to_placeholder_table(self, dry_run: bool = False) -> bigquery.TableReference: """Compiles this DataFrame's expression tree to SQL and saves it to a - (temporary) view. + (temporary) view or table (in the case of a dry run). """ - return self._block.to_view(include_index=self._should_sql_have_index()) + return self._block.to_placeholder_table( + include_index=self._should_sql_have_index(), dry_run=dry_run + ) def _to_sql_query( self, include_index: bool, enable_cache: bool = True diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index e0c3e39ac9..b0a31595e5 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -444,8 +444,35 @@ def dtype_for_etype(etype: ExpressionType) -> Dtype: if mapping.arrow_dtype is not None } +# Include types that aren't 1:1 to BigQuery but allowed to be loaded in to BigQuery: +_ARROW_TO_BIGFRAMES_LOSSLESS = { + pa.int8(): INT_DTYPE, + pa.int16(): INT_DTYPE, + pa.int32(): INT_DTYPE, + pa.uint8(): INT_DTYPE, + pa.uint16(): INT_DTYPE, + pa.uint32(): INT_DTYPE, + # uint64 is omitted because uint64 -> BigQuery INT64 is a lossy conversion. + pa.float16(): FLOAT_DTYPE, + pa.float32(): FLOAT_DTYPE, + # TODO(tswast): Can we support datetime/timestamp/time with units larger + # than microseconds? +} + + +def arrow_dtype_to_bigframes_dtype( + arrow_dtype: pa.DataType, allow_lossless_cast: bool = False +) -> Dtype: + """ + Convert an arrow type into the pandas-y type used to represent it in BigFrames. + + Args: + arrow_dtype: Arrow data type. + allow_lossless_cast: Allow lossless conversions, such as int32 to int64. + """ + if allow_lossless_cast and arrow_dtype in _ARROW_TO_BIGFRAMES_LOSSLESS: + return _ARROW_TO_BIGFRAMES_LOSSLESS[arrow_dtype] -def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: if arrow_dtype in _ARROW_TO_BIGFRAMES: return _ARROW_TO_BIGFRAMES[arrow_dtype] diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index e1fd7218bd..608eaf5a82 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -14,6 +14,7 @@ from __future__ import annotations +import functools import inspect import threading import typing @@ -51,6 +52,7 @@ import bigframes.enums import bigframes.series import bigframes.session +from bigframes.session import dry_runs import bigframes.session._io.bigquery import bigframes.session.clients @@ -216,6 +218,27 @@ def read_gbq( read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) +def _try_read_gbq_colab_sessionless_dry_run( + create_query: Callable[[], str], +) -> Optional[pandas.Series]: + """Run a dry_run without a session, only if the session hasn't yet started.""" + + global _default_location_lock + + # Avoid creating a session just for dry run. 
We don't want to bind to a + # location too early. This is especially important if the query only refers + # to local data and not any BigQuery tables. + with _default_location_lock: + if not config.options.bigquery._session_started: + bqclient = _get_bqclient() + query = create_query() + job = _dry_run(query, bqclient) + return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ()) + + # Explicitly return None to indicate that we didn't run the dry run query. + return None + + @overload def _read_gbq_colab( # type: ignore[overload-overlap] query_or_table: str, @@ -263,11 +286,30 @@ def _read_gbq_colab( if pyformat_args is None: pyformat_args = {} - query = bigframes.core.pyformat.pyformat( + # Delay formatting the query with the special "session-less" logic. This + # avoids doing unnecessary work if the session already has a location or has + # already started. + create_query = functools.partial( + bigframes.core.pyformat.pyformat, query_or_table, pyformat_args=pyformat_args, + dry_run=True, ) - _set_default_session_location_if_possible(query) + + # Only try to set the global location if it's not a dry run. We don't want + # to bind to a location too early. This is especially important if the query + # only refers to local data and not any BigQuery tables. + if dry_run: + result = _try_read_gbq_colab_sessionless_dry_run(create_query) + + if result is not None: + return result + + # If we made it this far, we must have a session that has already + # started. That means we can safely call the "real" _read_gbq_colab, + # which generates slightly nicer SQL. + else: + _set_default_session_location_if_possible_deferred_query(create_query) return global_session.with_default_session( bigframes.session.Session._read_gbq_colab, @@ -530,7 +572,30 @@ def from_glob_path( _default_location_lock = threading.Lock() +def _get_bqclient() -> bigquery.Client: + clients_provider = bigframes.session.clients.ClientsProvider( + project=config.options.bigquery.project, + location=config.options.bigquery.location, + use_regional_endpoints=config.options.bigquery.use_regional_endpoints, + credentials=config.options.bigquery.credentials, + application_name=config.options.bigquery.application_name, + bq_kms_key_name=config.options.bigquery.kms_key_name, + client_endpoints_override=config.options.bigquery.client_endpoints_override, + requests_transport_adapters=config.options.bigquery.requests_transport_adapters, + ) + return clients_provider.bqclient + + +def _dry_run(query, bqclient) -> bigquery.QueryJob: + job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) + return job + + def _set_default_session_location_if_possible(query): + _set_default_session_location_if_possible_deferred_query(lambda: query) + + +def _set_default_session_location_if_possible_deferred_query(create_query): # Set the location as per the query if this is the first query the user is # running and: # (1) Default session has not started yet, and @@ -549,24 +614,14 @@ def _set_default_session_location_if_possible(query): ): return - clients_provider = bigframes.session.clients.ClientsProvider( - project=config.options.bigquery.project, - location=config.options.bigquery.location, - use_regional_endpoints=config.options.bigquery.use_regional_endpoints, - credentials=config.options.bigquery.credentials, - application_name=config.options.bigquery.application_name, - bq_kms_key_name=config.options.bigquery.kms_key_name, - client_endpoints_override=config.options.bigquery.client_endpoints_override, - 
requests_transport_adapters=config.options.bigquery.requests_transport_adapters, - ) - - bqclient = clients_provider.bqclient + query = create_query() + bqclient = _get_bqclient() if bigframes.session._io.bigquery.is_query(query): # Intentionally run outside of the session so that we can detect the # location before creating the session. Since it's a dry_run, labels # aren't necessary. - job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) + job = _dry_run(query, bqclient) config.options.bigquery.location = job.location else: table = bqclient.get_table(query) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7597f8eeed..c06233bad3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -518,6 +518,8 @@ def _read_gbq_colab( query = bigframes.core.pyformat.pyformat( query, pyformat_args=pyformat_args, + session=self, + dry_run=dry_run, ) return self._loader.read_gbq_query( @@ -1965,9 +1967,17 @@ def _create_object_table(self, path: str, connection: str) -> str: return table def _create_temp_view(self, sql: str) -> bigquery.TableReference: - """Create a random id Object Table from the input path and connection.""" + """Create a random id view from the sql string.""" return self._anon_dataset_manager.create_temp_view(sql) + def _create_temp_table( + self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] + ) -> bigquery.TableReference: + """Allocate a random temporary table with the desired schema.""" + return self._temp_storage_manager.create_temp_table( + schema=schema, cluster_cols=cluster_cols + ) + def from_glob_path( self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None ) -> dataframe.DataFrame: diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 0992a10055..af78117262 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -14,8 +14,10 @@ """System tests for read_gbq_colab helper functions.""" +import numpy import pandas import pandas.testing +import pytest def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): @@ -140,33 +142,163 @@ def test_read_gbq_colab_includes_formatted_scalars(session): ) -def test_read_gbq_colab_includes_formatted_bigframes_dataframe( +@pytest.mark.skipif( + pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x" +) +def test_read_gbq_colab_includes_formatted_dataframes( session, scalars_df_index, scalars_pandas_df_index ): + pd_df = pandas.DataFrame( + { + "rowindex": [0, 1, 2, 3, 4, 5], + "value": [0, 100, 200, 300, 400, 500], + } + ) + + # Make sure we test with some data that is too large to inline as SQL. + pd_df_large = pandas.DataFrame( + { + "rowindex": numpy.arange(100_000), + "large_value": numpy.arange(100_000), + } + ) + pyformat_args = { # Apply some operations to make sure the columns aren't renamed. - "some_dataframe": scalars_df_index[scalars_df_index["int64_col"] > 0].assign( + "bf_df": scalars_df_index[scalars_df_index["int64_col"] > 0].assign( int64_col=scalars_df_index["int64_too"] ), + "pd_df": pd_df, + "pd_df_large": pd_df_large, # This is not a supported type, but ignored if not referenced. 
"some_object": object(), } + sql = """ + SELECT bf_df.int64_col + pd_df.value + pd_df_large.large_value AS int64_col, + COALESCE(bf_df.rowindex, pd_df.rowindex, pd_df_large.rowindex) AS rowindex + FROM {bf_df} AS bf_df + FULL OUTER JOIN {pd_df} AS pd_df + ON bf_df.rowindex = pd_df.rowindex + LEFT JOIN {pd_df_large} AS pd_df_large + ON bf_df.rowindex = pd_df_large.rowindex + ORDER BY rowindex ASC + """ + + # Do the dry run first so that we don't re-use the uploaded data from the + # real query. + dry_run_output = session._read_gbq_colab( + sql, + pyformat_args=pyformat_args, + dry_run=True, + ) + df = session._read_gbq_colab( - """ - SELECT int64_col, rowindex - FROM {some_dataframe} - ORDER BY rowindex ASC - """, + sql, pyformat_args=pyformat_args, ) + + # Confirm that dry_run was accurate. + pandas.testing.assert_series_equal( + pandas.Series(dry_run_output["columnDtypes"]), + df.dtypes, + ) + result = df.to_pandas() expected = ( scalars_pandas_df_index[scalars_pandas_df_index["int64_col"] > 0] .assign(int64_col=scalars_pandas_df_index["int64_too"]) .reset_index(drop=False)[["int64_col", "rowindex"]] + .merge( + pd_df, + on="rowindex", + how="outer", + ) + .merge( + pd_df_large, + on="rowindex", + how="left", + ) + .assign( + int64_col=lambda df: ( + df["int64_col"] + df["value"] + df["large_value"] + ).astype("Int64") + ) + .drop(columns=["value", "large_value"]) + .sort_values(by="rowindex") + .reset_index(drop=True) ) pandas.testing.assert_frame_equal( result, expected, check_index_type=False, # int64 vs Int64 ) + + +@pytest.mark.parametrize( + ("pd_df",), + ( + pytest.param( + pandas.DataFrame( + { + "rowindex": [0, 1, 2, 3, 4, 5], + "value": [0, 100, 200, 300, 400, 500], + "value2": [-1, -2, -3, -4, -5, -6], + } + ), + id="inline-df", + ), + pytest.param( + pandas.DataFrame( + { + # Make sure we test with some data that is too large to + # inline as SQL. + "rowindex": numpy.arange(100_000), + "value": numpy.arange(100_000), + "value2": numpy.arange(100_000), + } + ), + id="large-df", + ), + ), +) +def test_read_gbq_colab_with_formatted_dataframe_deduplicates_column_names_just_like_to_gbq( + session, + pd_df, +): + # Create duplicate column names. + pd_df.columns = ["rowindex", "value", "value"] + + pyformat_args = { + "pd_df": pd_df, + } + sql = """ + SELECT rowindex, value, value_1 + FROM {pd_df} + """ + + # Do the dry run first so that we don't re-use the uploaded data from the + # real query. + dry_run_output = session._read_gbq_colab( + sql, + pyformat_args=pyformat_args, + dry_run=True, + ) + + df = session._read_gbq_colab( + sql, + pyformat_args=pyformat_args, + ) + + # Confirm that dry_run was accurate. + pandas.testing.assert_series_equal( + pandas.Series(dry_run_output["columnDtypes"]), + df.dtypes, + ) + + # Make sure the query doesn't fail. 
+ df.to_pandas_batches() + + # Make sure the + table_id = session.read_pandas(pd_df).to_gbq() + table = session.bqclient.get_table(table_id) + assert [field.name for field in table.schema] == ["rowindex", "value", "value_1"] diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 466f3d6116..05110d8485 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -19,13 +19,21 @@ from __future__ import annotations +import decimal from typing import Any, Dict, List import google.cloud.bigquery import google.cloud.bigquery.table +import pandas import pytest -import bigframes.core.pyformat as pyformat +from bigframes.core import pyformat +from bigframes.testing import mocks + + +@pytest.fixture +def session(): + return mocks.create_bigquery_session() @pytest.mark.parametrize( @@ -48,31 +56,116 @@ def test_parse_fields(sql_template: str, expected: List[str]): assert fields == expected -def test_pyformat_with_unsupported_type_raises_typeerror(): +def test_pyformat_with_unsupported_type_raises_typeerror(session): pyformat_args = {"my_object": object()} sql = "SELECT {my_object}" with pytest.raises(TypeError, match="my_object has unsupported type: "): - pyformat.pyformat(sql, pyformat_args=pyformat_args) + pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) -def test_pyformat_with_missing_variable_raises_keyerror(): +def test_pyformat_with_missing_variable_raises_keyerror(session): pyformat_args: Dict[str, Any] = {} sql = "SELECT {my_object}" with pytest.raises(KeyError, match="my_object"): - pyformat.pyformat(sql, pyformat_args=pyformat_args) + pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) -def test_pyformat_with_no_variables(): +def test_pyformat_with_no_variables(session): pyformat_args: Dict[str, Any] = {} sql = "SELECT '{{escaped curly brackets}}'" expected_sql = "SELECT '{escaped curly brackets}'" - got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args) + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) assert got_sql == expected_sql -def test_pyformat_with_query_string_replaces_variables(): +@pytest.mark.parametrize( + ("df_pd", "expected_struct"), + ( + pytest.param( + pandas.DataFrame(), + "STRUCT<>", + id="empty", + ), + pytest.param( + # Empty columns default to floating point, just like pandas. 
+            pandas.DataFrame({"empty column": []}),
+            "STRUCT<`empty column` FLOAT>",
+            id="empty column",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {
+                    "col1": [1, 2, 3],
+                    "col2": ["a", "b", "c"],
+                    "col3": [
+                        decimal.Decimal(1),
+                        decimal.Decimal(2),
+                        decimal.Decimal(3),
+                    ],
+                }
+            ),
+            "STRUCT<`col1` INTEGER, `col2` STRING, `col3` NUMERIC>",
+            id="scalars",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {"array col": [[1, 2, 3]], "another array": [["a", "b", "c"]]}
+            ),
+            "STRUCT<`array col` ARRAY<INTEGER>, `another array` ARRAY<STRING>>",
+            id="arrays",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {
+                    "struct col": [
+                        {"subfield": {"subsubfield": 1}, "subfield2": 2},
+                    ],
+                }
+            ),
+            "STRUCT<`struct col` STRUCT<`subfield` STRUCT<`subsubfield` INTEGER>, `subfield2` INTEGER>>",
+            id="structs",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {
+                    "array of struct col": [
+                        [{"subfield": {"subsubfield": 1}, "subfield2": 2}],
+                    ],
+                }
+            ),
+            "STRUCT<`array of struct col` ARRAY<STRUCT<`subfield` STRUCT<`subsubfield` INTEGER>, `subfield2` INTEGER>>>",
+            id="array_of_structs",
+        ),
+        pytest.param(
+            pandas.DataFrame({"c1": [1, 2, 3], "c2": ["a", "b", "c"]}).rename(
+                columns={"c1": "c", "c2": "c"}
+            ),
+            "STRUCT<`c` INTEGER, `c_1` STRING>",
+            id="duplicate_column_names",
+        ),
+    ),
+)
+def test_pyformat_with_pandas_dataframe_dry_run_no_session(df_pd, expected_struct):
+    pyformat_args: Dict[str, Any] = {"my_pandas_df": df_pd}
+    sql = "SELECT * FROM {my_pandas_df}"
+    expected_sql = f"SELECT * FROM UNNEST(ARRAY<{expected_struct}>[])"
+    got_sql = pyformat.pyformat(
+        sql, pyformat_args=pyformat_args, dry_run=True, session=None
+    )
+    assert got_sql == expected_sql
+
+
+def test_pyformat_with_pandas_dataframe_not_dry_run_no_session_raises_valueerror():
+    pyformat_args: Dict[str, Any] = {"my_pandas_df": pandas.DataFrame()}
+    sql = "SELECT * FROM {my_pandas_df}"
+
+    with pytest.raises(ValueError, match="my_pandas_df"):
+        pyformat.pyformat(sql, pyformat_args=pyformat_args)
+
+
+def test_pyformat_with_query_string_replaces_variables(session):
     pyformat_args = {
         "my_string": "some string value",
         "max_value": 2.25,
@@ -102,7 +195,7 @@ def test_pyformat_with_query_string_replaces_variables():
         WHERE height < 2.25
     """.strip()
 
-    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args)
+    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
 
     assert got_sql == expected_sql
 
@@ -134,12 +227,12 @@
        ),
    ),
)
-def test_pyformat_with_table_replaces_variables(table, expected_sql):
+def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session):
     pyformat_args = {
         "table": table,
         # Unreferenced values of unsupported type shouldn't cause issues.
         "my_object": object(),
     }
     sql = "SELECT * FROM {table}"
 
-    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args)
+    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
 
     assert got_sql == expected_sql
diff --git a/tests/unit/core/tools/test_bigquery_schema.py b/tests/unit/core/tools/test_bigquery_schema.py
new file mode 100644
index 0000000000..a5b0087801
--- /dev/null
+++ b/tests/unit/core/tools/test_bigquery_schema.py
@@ -0,0 +1,187 @@
+from google.cloud import bigquery
+import pytest
+
+from bigframes.core.tools import bigquery_schema
+
+
+# --- Tests for _type_to_sql ---
+@pytest.mark.parametrize(
+    "field, expected_sql",
+    [
+        # Simple types
+        (bigquery.SchemaField("test_field", "INTEGER"), "INTEGER"),
+        (bigquery.SchemaField("test_field", "STRING"), "STRING"),
+        (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOLEAN"),
+        # RECORD/STRUCT types with nested fields directly
+        (
+            bigquery.SchemaField(
+                "test_field",
+                "RECORD",
+                fields=(bigquery.SchemaField("sub_field", "STRING"),),
+            ),
+            "STRUCT<`sub_field` STRING>",
+        ),
+        (
+            bigquery.SchemaField(
+                "test_field",
+                "STRUCT",
+                fields=(
+                    bigquery.SchemaField("sub_field", "INTEGER"),
+                    bigquery.SchemaField("another", "BOOLEAN"),
+                ),
+            ),
+            "STRUCT<`sub_field` INTEGER, `another` BOOLEAN>",
+        ),
+        # Array is handled by _field_to_sql, instead.
+        (bigquery.SchemaField("test_field", "NUMERIC", mode="REPEATED"), "NUMERIC"),
+        (
+            bigquery.SchemaField(
+                "test_field",
+                "RECORD",
+                mode="REPEATED",
+                fields=(bigquery.SchemaField("sub_field", "STRING"),),
+            ),
+            "STRUCT<`sub_field` STRING>",
+        ),
+    ],
+)
+def test_type_to_sql(field, expected_sql):
+    assert bigquery_schema._type_to_sql(field) == expected_sql
+
+
+# --- Tests for _field_to_sql ---
+@pytest.mark.parametrize(
+    "field, expected_sql",
+    [
+        # Simple field
+        (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INTEGER"),
+        (bigquery.SchemaField("name", "STRING", "NULLABLE"), "`name` STRING"),
+        # Repeated field
+        (bigquery.SchemaField("tags", "STRING", "REPEATED"), "`tags` ARRAY<STRING>"),
+        # Repeated RECORD
+        (
+            bigquery.SchemaField(
+                "addresses",
+                "RECORD",
+                "REPEATED",
+                fields=(
+                    bigquery.SchemaField("street", "STRING"),
+                    bigquery.SchemaField("zip", "INTEGER"),
+                ),
+            ),
+            "`addresses` ARRAY<STRUCT<`street` STRING, `zip` INTEGER>>",
+        ),
+        # Simple STRUCT
+        (
+            bigquery.SchemaField(
+                "person",
+                "STRUCT",
+                "NULLABLE",
+                fields=(
+                    bigquery.SchemaField("age", "INTEGER"),
+                    bigquery.SchemaField("city", "STRING"),
+                ),
+            ),
+            "`person` STRUCT<`age` INTEGER, `city` STRING>",
+        ),
+    ],
+)
+def test_field_to_sql(field, expected_sql):
+    assert bigquery_schema._field_to_sql(field) == expected_sql
+
+
+# --- Tests for _to_struct ---
+@pytest.mark.parametrize(
+    "bqschema, expected_sql",
+    [
+        # Empty schema
+        ((), "STRUCT<>"),
+        # Simple fields
+        (
+            (
+                bigquery.SchemaField("id", "INTEGER"),
+                bigquery.SchemaField("name", "STRING"),
+            ),
+            "STRUCT<`id` INTEGER, `name` STRING>",
+        ),
+        # Nested RECORD/STRUCT
+        (
+            (
+                bigquery.SchemaField("item_id", "INTEGER"),
+                bigquery.SchemaField(
+                    "details",
+                    "RECORD",
+                    "NULLABLE",
+                    fields=(
+                        bigquery.SchemaField("price", "NUMERIC"),
+                        bigquery.SchemaField("currency", "STRING"),
+                    ),
+                ),
+            ),
+            "STRUCT<`item_id` INTEGER, `details` STRUCT<`price` NUMERIC, `currency` STRING>>",
+        ),
+        # Repeated field
+        (
+            (
+                bigquery.SchemaField("user_id", "STRING"),
+                bigquery.SchemaField("emails", "STRING", "REPEATED"),
+            ),
+            "STRUCT<`user_id` STRING, `emails` ARRAY<STRING>>",
+        ),
+        # Mixed types including complex nested repeated
+        (
+            (
+                bigquery.SchemaField("event_name", "STRING"),
+                bigquery.SchemaField(
+                    "participants",
+                    "RECORD",
+                    "REPEATED",
+                    fields=(
+                        bigquery.SchemaField("p_id", "INTEGER"),
+                        bigquery.SchemaField("roles", "STRING", "REPEATED"),
+                    ),
+                ),
+                bigquery.SchemaField("timestamp", "TIMESTAMP"),
+            ),
+            "STRUCT<`event_name` STRING, `participants` ARRAY<STRUCT<`p_id` INTEGER, `roles` ARRAY<STRING>>>, `timestamp` TIMESTAMP>",
+        ),
+    ],
+)
+def test_to_struct(bqschema, expected_sql):
+    assert bigquery_schema._to_struct(bqschema) == expected_sql
+
+
+# --- Tests for to_sql_dry_run ---
+@pytest.mark.parametrize(
+    "bqschema, expected_sql",
+    [
+        # Empty schema
+        ((), "UNNEST(ARRAY<STRUCT<>>[])"),
+        # Simple schema
+        (
+            (
+                bigquery.SchemaField("id", "INTEGER"),
+                bigquery.SchemaField("name", "STRING"),
+            ),
+            "UNNEST(ARRAY<STRUCT<`id` INTEGER, `name` STRING>>[])",
+        ),
+        # Complex schema with nested and repeated fields
+        (
+            (
+                bigquery.SchemaField("order_id", "STRING"),
+                bigquery.SchemaField(
+                    "items",
+                    "RECORD",
+                    "REPEATED",
+                    fields=(
+                        bigquery.SchemaField("item_name", "STRING"),
+                        bigquery.SchemaField("quantity", "INTEGER"),
+                    ),
+                ),
+            ),
+            "UNNEST(ARRAY<STRUCT<`order_id` STRING, `items` ARRAY<STRUCT<`item_name` STRING, `quantity` INTEGER>>>>[])",
+        ),
+    ],
+)
+def test_to_sql_dry_run(bqschema, expected_sql):
+    assert bigquery_schema.to_sql_dry_run(bqschema) == expected_sql
diff --git a/tests/unit/pandas/io/test_api.py b/tests/unit/pandas/io/test_api.py
index fbc9027552..24ef51ad47 100644
--- a/tests/unit/pandas/io/test_api.py
+++ b/tests/unit/pandas/io/test_api.py
@@ -19,7 +19,32 @@
 import bigframes.session
 
 
-@mock.patch("bigframes.pandas.io.api._set_default_session_location_if_possible")
+@mock.patch(
+    "bigframes.pandas.io.api._set_default_session_location_if_possible_deferred_query"
+)
+@mock.patch("bigframes.core.global_session.with_default_session")
+def test_read_gbq_colab_dry_run_doesnt_call_set_location(
+    mock_with_default_session, mock_set_location
+):
+    """
+    Ensure that we don't bind to a location too early. If it's a dry run, the
+    user might not be done typing.
+    """
+    mock_df = mock.create_autospec(bigframes.dataframe.DataFrame)
+    mock_with_default_session.return_value = mock_df
+
+    query_or_table = "SELECT {param1} AS param1"
+    sample_pyformat_args = {"param1": "value1"}
+    bf_io_api._read_gbq_colab(
+        query_or_table, pyformat_args=sample_pyformat_args, dry_run=True
+    )
+
+    mock_set_location.assert_not_called()
+
+
+@mock.patch(
+    "bigframes.pandas.io.api._set_default_session_location_if_possible_deferred_query"
+)
 @mock.patch("bigframes.core.global_session.with_default_session")
 def test_read_gbq_colab_calls_set_location(
     mock_with_default_session, mock_set_location
@@ -36,7 +61,9 @@ def test_read_gbq_colab_calls_set_location(
     # Make sure that we format the SQL first to prevent syntax errors.
formatted_query = "SELECT 'value1' AS param1" - mock_set_location.assert_called_once_with(formatted_query) + mock_set_location.assert_called_once() + args, _ = mock_set_location.call_args + assert formatted_query == args[0]() mock_with_default_session.assert_called_once() # Check the actual arguments passed to with_default_session diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index c4635f85a9..52b091c045 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -15,8 +15,12 @@ """Unit tests for read_gbq_colab helper functions.""" import textwrap +from unittest import mock from google.cloud import bigquery +import numpy +import pandas +import pytest from bigframes.testing import mocks @@ -36,15 +40,29 @@ def test_read_gbq_colab_includes_label(): assert "session-read_gbq_colab" in label_values -def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): - session = mocks.create_bigquery_session() +@pytest.mark.parametrize("dry_run", [True, False]) +def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch, dry_run): + bqclient = mock.create_autospec(bigquery.Client, instance=True) + bqclient.project = "proj" + session = mocks.create_bigquery_session(bqclient=bqclient) bf_df = mocks.create_dataframe(monkeypatch, session=session) - bf_df._to_view = lambda: bigquery.TableReference.from_string("my-project.my_dataset.some_view") # type: ignore + session._create_temp_table = mock.Mock( # type: ignore + return_value=bigquery.TableReference.from_string("proj.dset.temp_table") + ) + session._create_temp_view = mock.Mock( # type: ignore + return_value=bigquery.TableReference.from_string("proj.dset.temp_view") + ) + + # To avoid trouble with get_table() calls getting out of sync with mock + # "uploaded" data, make sure this is small enough to inline in the SQL as a + # view. + pd_df = pandas.DataFrame({"rowindex": numpy.arange(3), "value": numpy.arange(3)}) pyformat_args = { "some_integer": 123, "some_string": "This could be dangerous, but we escape it", "bf_df": bf_df, + "pd_df": pd_df, # This is not a supported type, but ignored if not referenced. "some_object": object(), } @@ -55,30 +73,35 @@ def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): SELECT {some_integer} as some_integer, {some_string} as some_string, '{{escaped}}' as escaped - FROM {bf_df} + FROM {bf_df} AS bf_df + FULL OUTER JOIN {pd_df} AS pd_df + ON bf_df.rowindex = pd_df.rowindex """ ), pyformat_args=pyformat_args, - dry_run=True, + dry_run=dry_run, ) expected = textwrap.dedent( - """ + f""" SELECT 123 as some_integer, 'This could be dangerous, but we escape it' as some_string, - '{escaped}' as escaped - FROM `my-project`.`my_dataset`.`some_view` + '{{escaped}}' as escaped + FROM `proj`.`dset`.`temp_{"table" if dry_run else "view"}` AS bf_df + FULL OUTER JOIN `proj`.`dset`.`temp_{"table" if dry_run else "view"}` AS pd_df + ON bf_df.rowindex = pd_df.rowindex """ ) - queries = session._queries # type: ignore - configs = session._job_configs # type: ignore - for query, config in zip(queries, configs): - if config is None: - continue - if config.dry_run: - break + # This should be the most recent query. + query = session._queries[-1] # type: ignore + config = session._job_configs[-1] # type: ignore + + if dry_run: + assert config.dry_run + else: + # Allow for any "False-y" value. 
+ assert not config.dry_run - assert config.dry_run assert query.strip() == expected.strip() diff --git a/tests/unit/test_local_data.py b/tests/unit/test_local_data.py index 71479e89d4..dfd1cd622f 100644 --- a/tests/unit/test_local_data.py +++ b/tests/unit/test_local_data.py @@ -46,6 +46,72 @@ def test_local_data_well_formed_round_trip(): pandas.testing.assert_frame_equal(pd_data_normalized, result, check_dtype=False) +def test_local_data_small_sizes_round_trip(): + pyarrow_version = int(pa.__version__.split(".")[0]) + + int8s = [126, 127, -127, -128, 0, 1, -1] + uint8s = [254, 255, 1, 0, 128, 129, 127] + int16s = [32766, 32767, -32766, -32767, 0, 1, -1] + uint16s = [65534, 65535, 1, 0, 32768, 32769, 32767] + int32s = [2**31 - 2, 2**31 - 1, -(2**31) + 1, -(2**31), 0, 1, -1] + uint32s = [2**32 - 2, 2**32 - 1, 1, 0, 2**31, 2**31 + 1, 2**31 - 1] + float16s = [ + # Test some edge cases from: + # https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations + float.fromhex("0x1.0p-24"), # (2 ** -24).hex() + float.fromhex("-0x1.0p-24"), + float.fromhex("0x1.ffcp-13"), # ((2 ** -12) - (2 ** -23)).hex() + float.fromhex("-0x1.ffcp-13"), + 0, + float.fromhex("0x1.ffcp+14"), # (32768.0 - 16).hex() + float.fromhex("-0x1.ffcp+14"), + ] + float32s = [ + # Test some edge cases from: + # https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Notable_single-precision_cases + # and + # https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Precision_limitations_on_decimal_values_(between_1_and_16777216) + float.fromhex("0x1.0p-149"), # (2 ** -149).hex() + float.fromhex("-0x1.0p-149"), # (2 ** -149).hex() + float.fromhex("0x1.fffffep-1"), # (1.0 - (2 ** -24)).hex() + float.fromhex("-0x1.fffffep-1"), + 0, + float.fromhex("0x1.fffffcp-127"), # ((2 ** -126) * (1 - 2 ** -23)).hex() + float.fromhex("-0x1.fffffcp-127"), # ((2 ** -126) * (1 - 2 ** -23)).hex() + ] + small_data = { + "int8": pd.Series(int8s, dtype=pd.Int8Dtype()), + "int16": pd.Series(int16s, dtype=pd.Int16Dtype()), + "int32": pd.Series(int32s, dtype=pd.Int32Dtype()), + "uint8": pd.Series(uint8s, dtype=pd.UInt8Dtype()), + "uint16": pd.Series(uint16s, dtype=pd.UInt16Dtype()), + "uint32": pd.Series(uint32s, dtype=pd.UInt32Dtype()), + "float32": pd.Series(float32s, dtype="float32"), + } + expected_data = { + "int8": pd.Series(int8s, dtype=pd.Int64Dtype()), + "int16": pd.Series(int16s, dtype=pd.Int64Dtype()), + "int32": pd.Series(int32s, dtype=pd.Int64Dtype()), + "uint8": pd.Series(uint8s, dtype=pd.Int64Dtype()), + "uint16": pd.Series(uint16s, dtype=pd.Int64Dtype()), + "uint32": pd.Series(uint32s, dtype=pd.Int64Dtype()), + "float32": pd.Series(float32s, dtype=pd.Float64Dtype()), + } + + # Casting from float16 added in version 16. 
+ # https://arrow.apache.org/blog/2024/04/20/16.0.0-release/#:~:text=Enhancements,New%20Features + if pyarrow_version >= 16: + small_data["float16"] = pd.Series(float16s, dtype="float16") + expected_data["float16"] = pd.Series(float16s, dtype=pd.Float64Dtype()) + + small_pd = pd.DataFrame(small_data) + local_entry = local_data.ManagedArrowTable.from_pandas(small_pd) + result = pd.DataFrame(local_entry.itertuples(), columns=small_pd.columns) + + expected = pd.DataFrame(expected_data) + pandas.testing.assert_frame_equal(expected, result, check_dtype=False) + + def test_local_data_well_formed_round_trip_chunked(): pa_table = pa.Table.from_pandas(pd_data, preserve_index=False) as_rechunked_pyarrow = pa.Table.from_batches(pa_table.to_batches(max_chunksize=2)) From 019051e453d81769891aa398475ebd04d1826e81 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 16 Jun 2025 11:53:08 -0700 Subject: [PATCH 18/23] feat: Add bbq.json_value_array and deprecate bbq.json_extract_string_array (#1818) This commit introduces the `bbq.json_value_array` method, which provides similar functionality to `JSON_VALUE_ARRAY` in BigQuery Standard SQL. The `bbq.json_extract_string_array` method has been marked as deprecated and will be removed in a future version. You should migrate to `bbq.json_value_array` for equivalent functionality. Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/bigquery/__init__.py | 2 + bigframes/bigquery/_operations/json.py | 66 +++++++++++++++++++- bigframes/core/compile/scalar_op_compiler.py | 12 ++++ bigframes/operations/__init__.py | 2 + bigframes/operations/json_ops.py | 17 +++++ tests/system/small/bigquery/test_json.py | 52 ++++++++++++++- 6 files changed, 149 insertions(+), 2 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index cdc3718893..7ca7fb693b 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -43,6 +43,7 @@ json_query_array, json_set, json_value, + json_value_array, parse_json, ) from bigframes.bigquery._operations.search import create_vector_index, vector_search @@ -71,6 +72,7 @@ "json_query_array", "json_set", "json_value", + "json_value_array", "parse_json", # search ops "create_vector_index", diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 00d230d684..7ad7855dba 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -196,6 +196,10 @@ def json_extract_string_array( values in the array. This function uses single quotes and brackets to escape invalid JSONPath characters in JSON keys. + .. deprecated:: 2.6.0 + The ``json_extract_string_array`` is deprecated and will be removed in a future version. + Use ``json_value_array`` instead. + **Examples:** >>> import bigframes.pandas as bpd @@ -233,6 +237,11 @@ def json_extract_string_array( Returns: bigframes.series.Series: A new Series with the parsed arrays from the input. """ + msg = ( + "The `json_extract_string_array` is deprecated and will be removed in a future version. " + "Use `json_value_array` instead." + ) + warnings.warn(bfe.format_message(msg), category=UserWarning) array_series = input._apply_unary_op( ops.JSONExtractStringArray(json_path=json_path) ) @@ -334,7 +343,7 @@ def json_query_array( def json_value( input: series.Series, - json_path: str, + json_path: str = "$", ) -> series.Series: """Extracts a JSON scalar value and converts it to a SQL ``STRING`` value. 
In addtion, this function: @@ -366,6 +375,61 @@ def json_value( return input._apply_unary_op(ops.JSONValue(json_path=json_path)) +def json_value_array( + input: series.Series, + json_path: str = "$", +) -> series.Series: + """ + Extracts a JSON array of scalar values and converts it to a SQL ``ARRAY`` + value. In addition, this function: + + - Removes the outermost quotes and unescapes the values. + - Returns a SQL ``NULL`` if the selected value isn't an array or not an array + containing only scalar values. + - Uses double quotes to escape invalid ``JSON_PATH`` characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_value_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": ["apples", "oranges", "grapes"]', + ... '{"fruits": ["guava", "grapes"]}' + ... ]) + >>> bbq.json_value_array(s, "$.fruits") + 0 ['apples' 'oranges' 'grapes'] + 1 ['guava' 'grapes'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}', + ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}' + ... ]) + >>> bbq.json_value_array(s, "$.fruits.names") + 0 ['apple' 'cherry'] + 1 ['guava' 'grapes'] + dtype: list[pyarrow] + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the parsed arrays from the input. + """ + return input._apply_unary_op(ops.JSONValueArray(json_path=json_path)) + + @utils.preview(name="The JSON-related API `parse_json`") def parse_json( input: series.Series, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index b819b1c4e2..075089bb7a 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1448,6 +1448,11 @@ def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue): return json_value(json_obj=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONValueArray, pass_op=True) +def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray): + return json_value_array(json_obj=x, json_path=op.json_path) + + # Blob Ops @scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op) def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value): @@ -2157,6 +2162,13 @@ def json_value( # type: ignore[empty-body] """Retrieve value of a JSON field as plain STRING.""" +@ibis_udf.scalar.builtin(name="json_value_array") +def json_value_array( # type: ignore[empty-body] + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String +) -> ibis_dtypes.Array[ibis_dtypes.String]: + """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.""" + + @ibis_udf.scalar.builtin(name="INT64") def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] """Converts a JSON number to a SQL INT64 value.""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 291bf17fa5..86098d47cf 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -112,6 +112,7 @@ JSONQueryArray, JSONSet, JSONValue, + JSONValueArray, ParseJSON, ToJSONString, ) @@ -363,6 +364,7 @@ 
"JSONQueryArray", "JSONSet", "JSONValue", + "JSONValueArray", "ParseJSON", "ToJSONString", # Bool ops diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 95a47dcadb..81f00c39ce 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -153,6 +153,23 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE +@dataclasses.dataclass(frozen=True) +class JSONValueArray(base_ops.UnaryOp): + name: typing.ClassVar[str] = "json_value_array" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return pd.ArrowDtype( + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)) + ) + + @dataclasses.dataclass(frozen=True) class JSONQuery(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_query" diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 4ad16d6cc8..4ecbd01318 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -186,7 +186,10 @@ def test_json_extract_array_w_invalid_series_type(): def test_json_extract_string_array_from_json_strings(): s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) - actual = bbq.json_extract_string_array(s, "$.a") + with pytest.warns( + UserWarning, match="The `json_extract_string_array` is deprecated" + ): + actual = bbq.json_extract_string_array(s, "$.a") expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]]) pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) @@ -214,6 +217,53 @@ def test_json_extract_string_array_w_invalid_series_type(): bbq.json_extract_string_array(s) +def test_json_value_array_from_json_strings(): + s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) + actual = bbq.json_value_array(s, "$.a") + expected_data = [["ab", "2", "3 xy"], [], ["4", "5"]] + # Expected dtype after JSON_VALUE_ARRAY is ARRAY + expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string()))) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_value_array_from_array_strings(): + s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) + actual = bbq.json_value_array(s) + expected_data = [["1", "2", "3"], [], ["4", "5"]] + expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string()))) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_value_array_w_invalid_series_type(): + s = bpd.Series([1, 2], dtype=dtypes.INT_DTYPE) # Not a JSON-like string + with pytest.raises(TypeError): + bbq.json_value_array(s) + + +def test_json_value_array_from_json_native(): + json_data = [ + '{"key": ["hello", "world"]}', + '{"key": ["123", "45.6"]}', + '{"key": []}', + "{}", # case with missing key + ] + s = bpd.Series(json_data, dtype=dtypes.JSON_DTYPE) + actual = bbq.json_value_array(s, json_path="$.key") + + expected_data_pandas = [["hello", "world"], ["123", "45.6"], [], None] + expected = bpd.Series( + expected_data_pandas, dtype=pd.ArrowDtype(pa.list_(pa.string())) + ).fillna(pd.NA) + result_pd = actual.to_pandas().fillna(pd.NA) + pd.testing.assert_series_equal(result_pd, expected.to_pandas()) + + def test_json_query_from_json(): s = bpd.Series( ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": 
{"b": 0}}'], From aa323694e161f558bc5e60490c2f21008961e2ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 16 Jun 2025 14:29:55 -0500 Subject: [PATCH 19/23] docs: use pandas API instead of pandas-like or pandas-compatible (#1825) --- README.rst | 3 ++- notebooks/getting_started/bq_dataframes_template.ipynb | 3 ++- third_party/bigframes_vendored/pandas/io/gbq.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9288f2e6a5..36d3c2ca20 100644 --- a/README.rst +++ b/README.rst @@ -6,7 +6,8 @@ BigQuery DataFrames (BigFrames) BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine. -* ``bigframes.pandas`` provides a pandas-compatible API for analytics. +* `bigframes.pandas` provides a pandas API for analytics. Many workloads can be + migrated from pandas to bigframes by just changing a few imports. * ``bigframes.ml`` provides a scikit-learn-like API for ML. BigQuery DataFrames is an open-source package. diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 68c5e9f74d..ae772d035e 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -81,7 +81,8 @@ "\n", "BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine.\n", "\n", - "* `bigframes.pandas` provides a pandas-like API for analytics.\n", + "* `bigframes.pandas` provides a pandas API for analytics. Many workloads can be\n", + " migrated from pandas to bigframes by just changing a few imports.\n", "* `bigframes.ml` provides a scikit-learn-like API for ML.\n", "* `bigframes.ml.llm` provides API for large language models including Gemini.\n", "\n", diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index a0d4092571..3dae2b6bbe 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -45,7 +45,7 @@ def read_gbq( * (Recommended) Set the ``index_col`` argument to one or more columns. Unique values for the row labels are recommended. Duplicate labels are possible, but note that joins on a non-unique index can duplicate - rows via pandas-like outer join behavior. + rows via pandas-compatible outer join behavior. .. 
note:: By default, even SQL query inputs with an ORDER BY clause create a From 72076c76a6ebc3efe59834d39861fdd37dbbdcab Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 16 Jun 2025 13:23:51 -0700 Subject: [PATCH 20/23] chore: compile concat nodes by sqlglot (#1824) * chore: compile concat node * chore: compile concat nodes by sqlglot --- bigframes/core/compile/sqlglot/compiler.py | 11 ++ bigframes/core/compile/sqlglot/sqlglot_ir.py | 53 ++++++++- .../test_compile_concat/out.sql | 107 ++++++++++++++++++ .../test_compile_projection/out.sql | 1 + .../compile/sqlglot/test_compile_concat.py | 32 ++++++ 5 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_concat.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index ebe2a64699..d2b796b0aa 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -190,6 +190,17 @@ def compile_projection( ) return child.project(projected_cols) + @_compile_node.register + def compile_concat( + self, node: nodes.ConcatNode, *children: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + output_ids = [id.sql for id in node.output_ids] + return ir.SQLGlotIR.from_union( + [child.expr for child in children], + output_ids=output_ids, + uid_gen=self.uid_gen, + ) + def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 95e4f90118..43bdc6b06b 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -149,6 +149,57 @@ def from_query_string( select_expr.set("with", sge.With(expressions=[cte])) return cls(expr=select_expr, uid_gen=uid_gen) + @classmethod + def from_union( + cls, + selects: typing.Sequence[sge.Select], + output_ids: typing.Sequence[str], + uid_gen: guid.SequentialUIDGenerator, + ) -> SQLGlotIR: + """Builds SQLGlot expression by union of multiple select expressions.""" + assert ( + len(list(selects)) >= 2 + ), f"At least two select expressions must be provided, but got {selects}." 
+ + existing_ctes: list[sge.CTE] = [] + union_selects: list[sge.Select] = [] + for select in selects: + assert isinstance( + select, sge.Select + ), f"All provided expressions must be of type sge.Select, but got {type(select)}" + + select_expr = select.copy() + existing_ctes = [*existing_ctes, *select_expr.args.pop("with", [])] + + new_cte_name = sge.to_identifier( + next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted + ) + new_cte = sge.CTE( + this=select_expr, + alias=new_cte_name, + ) + existing_ctes = [*existing_ctes, new_cte] + + selections = [ + sge.Alias( + this=expr.alias_or_name, + alias=sge.to_identifier(output_id, quoted=cls.quoted), + ) + for expr, output_id in zip(select_expr.expressions, output_ids) + ] + union_selects.append( + sge.Select().select(*selections).from_(sge.Table(this=new_cte_name)) + ) + + union_expr = sg.union( + *union_selects, + distinct=False, + copy=False, + ) + final_select_expr = sge.Select().select(sge.Star()).from_(union_expr.subquery()) + final_select_expr.set("with", sge.With(expressions=existing_ctes)) + return cls(expr=final_select_expr, uid_gen=uid_gen) + def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], @@ -181,7 +232,7 @@ def project( ) for id, expr in projected_cols ] - new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=False) + new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=True) return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def insert( diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql new file mode 100644 index 0000000000..4b6b2617ac --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -0,0 +1,107 @@ +WITH `bfcte_1` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)]) +), `bfcte_3` AS ( + SELECT + `bfcol_0` AS `bfcol_5`, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_3` AS `bfcol_8`, + `bfcol_4` AS `bfcol_9` + FROM `bfcte_1` +), `bfcte_5` AS ( + SELECT + *, + `bfcol_9` AS `bfcol_10` + FROM `bfcte_3` +), `bfcte_7` AS ( + SELECT + `bfcol_5` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14`, + `bfcol_10` AS `bfcol_15` + FROM `bfcte_5` +), `bfcte_9` AS ( + SELECT + *, + 0 AS `bfcol_16` + FROM `bfcte_7` +), `bfcte_10` AS ( + SELECT + `bfcol_11` AS `bfcol_17`, + `bfcol_12` AS `bfcol_18`, + `bfcol_13` AS `bfcol_19`, + `bfcol_14` AS `bfcol_20`, + `bfcol_16` AS `bfcol_21`, + `bfcol_15` AS `bfcol_22` + FROM `bfcte_9` +), `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! 
', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)]) +), `bfcte_2` AS ( + SELECT + `bfcol_23` AS `bfcol_28`, + `bfcol_25` AS `bfcol_29`, + `bfcol_24` AS `bfcol_30`, + `bfcol_26` AS `bfcol_31`, + `bfcol_27` AS `bfcol_32` + FROM `bfcte_0` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_32` AS `bfcol_33` + FROM `bfcte_2` +), `bfcte_6` AS ( + SELECT + `bfcol_28` AS `bfcol_34`, + `bfcol_29` AS `bfcol_35`, + `bfcol_30` AS `bfcol_36`, + `bfcol_31` AS `bfcol_37`, + `bfcol_33` AS `bfcol_38` + FROM `bfcte_4` +), `bfcte_8` AS ( + SELECT + *, + 1 AS `bfcol_39` + FROM `bfcte_6` +), `bfcte_11` AS ( + SELECT + `bfcol_34` AS `bfcol_40`, + `bfcol_35` AS `bfcol_41`, + `bfcol_36` AS `bfcol_42`, + `bfcol_37` AS `bfcol_43`, + `bfcol_39` AS `bfcol_44`, + `bfcol_38` AS `bfcol_45` + FROM `bfcte_8` +), `bfcte_12` AS ( + SELECT + * + FROM ( + SELECT + bfcol_17 AS `bfcol_46`, + bfcol_18 AS `bfcol_47`, + bfcol_19 AS `bfcol_48`, + bfcol_20 AS `bfcol_49`, + bfcol_21 AS `bfcol_50`, + bfcol_22 AS `bfcol_51` + FROM `bfcte_10` + UNION ALL + SELECT + bfcol_40 AS `bfcol_46`, + bfcol_41 AS `bfcol_47`, + bfcol_42 AS `bfcol_48`, + bfcol_43 AS `bfcol_49`, + bfcol_44 AS `bfcol_50`, + bfcol_45 AS `bfcol_51` + FROM `bfcte_11` + ) +) +SELECT + `bfcol_46` AS `rowindex`, + `bfcol_47` AS `rowindex_1`, + `bfcol_48` AS `int64_col`, + `bfcol_49` AS `string_col` +FROM `bfcte_12` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index 8a24b01a25..db470e3ba3 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -8,6 +8,7 @@ WITH `bfcte_0` AS ( FROM `test-project`.`test_dataset`.`test_table` ), `bfcte_1` AS ( SELECT + *, `bfcol_0` AS `bfcol_5`, `bfcol_2` AS `bfcol_6`, `bfcol_3` AS `bfcol_7`, diff --git a/tests/unit/core/compile/sqlglot/test_compile_concat.py b/tests/unit/core/compile/sqlglot/test_compile_concat.py new file mode 100644 index 0000000000..ec7e83a4b0 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_concat.py @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +import bigframes +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +def test_compile_concat( + scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot +): + # TODO: concat two same dataframes, which SQL does not get reused. + # TODO: concat dataframes from a gbq table but trigger a windows compiler. 
+ df1 = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + df1 = df1[["rowindex", "int64_col", "string_col"]] + concat_df = bpd.concat([df1, df1]) + snapshot.assert_match(concat_df.sql, "out.sql") From 3abc02e893e3f1a7a5f463c84594f31312680772 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 16 Jun 2025 13:52:04 -0700 Subject: [PATCH 21/23] chore: enable order_by and limit for new compiler (#1815) * chore: enable order_by and limit for new compiler * fix tests after merge main --- bigframes/core/compile/sqlglot/compiler.py | 20 ++++++- bigframes/core/compile/sqlglot/sqlglot_ir.py | 46 ++++++++++++---- .../test_compile_concat/out.sql | 5 +- .../test_compile_projection/out.sql | 20 ++++--- .../test_compile_readlocal/out.sql | 53 +++++++++++++------ .../test_compile_readlocal_w_json_df/out.sql | 11 +++- .../test_compile_readlocal_w_lists_df/out.sql | 32 +++++++---- .../out.sql | 14 +++-- .../test_compile_readtable_w_limit/out.sql | 24 +++++++++ .../test_compile_readtable_w_ordering/out.sql | 40 ++++++++++++++ .../compile/sqlglot/test_compile_readtable.py | 12 +++++ 11 files changed, 228 insertions(+), 49 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index d2b796b0aa..68b572f911 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -125,9 +125,25 @@ def _compile_result_node(self, root: nodes.ResultNode) -> str: (name, scalar_compiler.compile_scalar_expression(ref)) for ref, name in root.output_cols ) - sqlglot_ir = sqlglot_ir.select(selected_cols) + # Skip squashing selections to ensure the right ordering and limit keys + sqlglot_ir = sqlglot_ir.select(selected_cols, squash_selections=False) + + if root.order_by is not None: + ordering_cols = tuple( + sge.Ordered( + this=scalar_compiler.compile_scalar_expression( + ordering.scalar_expression + ), + desc=ordering.direction.is_ascending is False, + nulls_first=ordering.na_last is False, + ) + for ordering in root.order_by.all_ordering_columns + ) + sqlglot_ir = sqlglot_ir.order_by(ordering_cols) + + if root.limit is not None: + sqlglot_ir = sqlglot_ir.limit(root.limit) - # TODO: add order_by, limit to sqlglot_expr return sqlglot_ir.sql @functools.lru_cache(maxsize=5000) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 43bdc6b06b..77ee0ccb78 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -28,7 +28,7 @@ from bigframes.core import guid import bigframes.core.compile.sqlglot.sqlglot_types as sgt import bigframes.core.local_data as local_data -import bigframes.core.schema as schemata +import bigframes.core.schema as bf_schema # shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. 
try: @@ -67,7 +67,7 @@ def sql(self) -> str: def from_pyarrow( cls, pa_table: pa.Table, - schema: schemata.ArraySchema, + schema: bf_schema.ArraySchema, uid_gen: guid.SequentialUIDGenerator, ) -> SQLGlotIR: """Builds SQLGlot expression from pyarrow table.""" @@ -203,6 +203,7 @@ def from_union( def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], + squash_selections: bool = True, ) -> SQLGlotIR: selections = [ sge.Alias( @@ -211,15 +212,39 @@ def select( ) for id, expr in selected_cols ] - # Attempts to simplify selected columns when the original and new column - # names are simply aliases of each other. - squashed_selections = _squash_selections(self.expr.expressions, selections) - if squashed_selections != []: - new_expr = self.expr.select(*squashed_selections, append=False) - return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + # If squashing is enabled, we try to simplify the selections + # by checking if the new selections are simply aliases of the + # original columns. + if squash_selections: + new_selections = _squash_selections(self.expr.expressions, selections) + if new_selections != []: + new_expr = self.expr.select(*new_selections, append=False) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + new_expr = self._encapsulate_as_cte().select(*selections, append=False) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + def order_by( + self, + ordering: tuple[sge.Ordered, ...], + ) -> SQLGlotIR: + """Adds ORDER BY clause to the query.""" + if len(ordering) == 0: + return SQLGlotIR(expr=self.expr.copy(), uid_gen=self.uid_gen) + new_expr = self.expr.order_by(*ordering) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + def limit( + self, + limit: int | None, + ) -> SQLGlotIR: + """Adds LIMIT clause to the query.""" + if limit is not None: + new_expr = self.expr.limit(limit) else: - new_expr = self._encapsulate_as_cte().select(*selections, append=False) - return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + new_expr = self.expr.copy() + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def project( self, @@ -342,6 +367,7 @@ def _squash_selections( old_expr: list[sge.Expression], new_expr: list[sge.Alias] ) -> list[sge.Alias]: """ + TODO: Reanble this function to optimize the SQL. Simplifies the select column expressions if existing (old_expr) and new (new_expr) selected columns are both simple aliases of column definitions. 
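For readers following the new SQLGlot compiler: below is a minimal sketch of the CTE-plus-UNION ALL shape that `from_union` assembles, using only sqlglot calls that appear in the hunk above (`sg.union`, `.subquery()`, `sge.Star()`). The table names `t1`/`t2` and column `a` are placeholders for illustration, not identifiers from this change.

    import sqlglot as sg
    import sqlglot.expressions as sge

    # Two stand-in child selects; the compiler derives these from the concatenated children.
    left = sg.select("a").from_("t1")
    right = sg.select("a").from_("t2")

    # distinct=False yields UNION ALL, matching concat semantics.
    union_expr = sg.union(left, right, distinct=False)
    final = sge.Select().select(sge.Star()).from_(union_expr.subquery())
    print(final.sql(dialect="bigquery"))
    # Roughly: SELECT * FROM (SELECT a FROM t1 UNION ALL SELECT a FROM t2)

In the actual implementation each child select is additionally hoisted into its own named CTE and its output columns are re-aliased to shared `output_ids`, which is what produces the `bfcte_*`/`bfcol_*` names seen in the snapshot SQL that follows.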
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql index 4b6b2617ac..855e5874c2 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -104,4 +104,7 @@ SELECT `bfcol_47` AS `rowindex_1`, `bfcol_48` AS `int64_col`, `bfcol_49` AS `string_col` -FROM `bfcte_12` \ No newline at end of file +FROM `bfcte_12` +ORDER BY + `bfcol_50` ASC NULLS LAST, + `bfcol_51` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index db470e3ba3..2804925b2d 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -15,11 +15,19 @@ WITH `bfcte_0` AS ( `bfcol_4` AS `bfcol_8`, `bfcol_1` + 1 AS `bfcol_9` FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + `bfcol_5` AS `bfcol_10`, + `bfcol_9` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14` + FROM `bfcte_1` ) SELECT - `bfcol_5` AS `rowindex`, - `bfcol_9` AS `int64_col`, - `bfcol_6` AS `string_col`, - `bfcol_7` AS `float64_col`, - `bfcol_8` AS `bool_col` -FROM `bfcte_1` \ No newline at end of file + `bfcol_10` AS `rowindex`, + `bfcol_11` AS `int64_col`, + `bfcol_12` AS `string_col`, + `bfcol_13` AS `float64_col`, + `bfcol_14` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index a34f3526d6..89c51b346d 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -155,21 +155,42 @@ WITH `bfcte_0` AS ( CAST(NULL AS TIMESTAMP), 8 )]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_16`, + `bfcol_1` AS `bfcol_17`, + `bfcol_2` AS `bfcol_18`, + `bfcol_3` AS `bfcol_19`, + `bfcol_4` AS `bfcol_20`, + `bfcol_5` AS `bfcol_21`, + `bfcol_6` AS `bfcol_22`, + `bfcol_7` AS `bfcol_23`, + `bfcol_8` AS `bfcol_24`, + `bfcol_9` AS `bfcol_25`, + `bfcol_10` AS `bfcol_26`, + `bfcol_11` AS `bfcol_27`, + `bfcol_12` AS `bfcol_28`, + `bfcol_13` AS `bfcol_29`, + `bfcol_14` AS `bfcol_30`, + `bfcol_15` AS `bfcol_31` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_1` AS `bool_col`, - `bfcol_2` AS `bytes_col`, - `bfcol_3` AS `date_col`, - `bfcol_4` AS `datetime_col`, - `bfcol_5` AS `geography_col`, - `bfcol_6` AS `int64_col`, - `bfcol_7` AS `int64_too`, - `bfcol_8` AS `numeric_col`, - `bfcol_9` AS `float64_col`, - `bfcol_10` AS `rowindex_1`, - `bfcol_11` AS `rowindex_2`, - `bfcol_12` AS `string_col`, - `bfcol_13` AS `time_col`, - `bfcol_14` AS `timestamp_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_16` AS `rowindex`, + `bfcol_17` AS `bool_col`, + `bfcol_18` AS `bytes_col`, + `bfcol_19` AS `date_col`, + `bfcol_20` AS `datetime_col`, + `bfcol_21` AS `geography_col`, + `bfcol_22` AS `int64_col`, + `bfcol_23` AS `int64_too`, + `bfcol_24` 
AS `numeric_col`, + `bfcol_25` AS `float64_col`, + `bfcol_26` AS `rowindex_1`, + `bfcol_27` AS `rowindex_2`, + `bfcol_28` AS `string_col`, + `bfcol_29` AS `time_col`, + `bfcol_30` AS `timestamp_col` +FROM `bfcte_1` +ORDER BY + `bfcol_31` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql index 31b46e6c70..76cbde7c64 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -2,7 +2,14 @@ WITH `bfcte_0` AS ( SELECT * FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_2`, + `bfcol_1` AS `bfcol_3` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `json_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_2` AS `json_col` +FROM `bfcte_1` +ORDER BY + `bfcol_3` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql index 1ba602f205..6363739d9d 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql @@ -32,14 +32,28 @@ WITH `bfcte_0` AS ( ['', 'a'], 2 )]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_9`, + `bfcol_1` AS `bfcol_10`, + `bfcol_2` AS `bfcol_11`, + `bfcol_3` AS `bfcol_12`, + `bfcol_4` AS `bfcol_13`, + `bfcol_5` AS `bfcol_14`, + `bfcol_6` AS `bfcol_15`, + `bfcol_7` AS `bfcol_16`, + `bfcol_8` AS `bfcol_17` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_1` AS `int_list_col`, - `bfcol_2` AS `bool_list_col`, - `bfcol_3` AS `float_list_col`, - `bfcol_4` AS `date_list_col`, - `bfcol_5` AS `date_time_list_col`, - `bfcol_6` AS `numeric_list_col`, - `bfcol_7` AS `string_list_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_9` AS `rowindex`, + `bfcol_10` AS `int_list_col`, + `bfcol_11` AS `bool_list_col`, + `bfcol_12` AS `float_list_col`, + `bfcol_13` AS `date_list_col`, + `bfcol_14` AS `date_time_list_col`, + `bfcol_15` AS `numeric_list_col`, + `bfcol_16` AS `string_list_col` +FROM `bfcte_1` +ORDER BY + `bfcol_17` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql index 54d1a1bb2b..af7206b759 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql +++ 
b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -18,8 +18,16 @@ WITH `bfcte_0` AS ( ), 1 )]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_3`, + `bfcol_1` AS `bfcol_4`, + `bfcol_2` AS `bfcol_5` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `id`, - `bfcol_1` AS `person` -FROM `bfcte_0` \ No newline at end of file + `bfcol_3` AS `id`, + `bfcol_4` AS `person` +FROM `bfcte_1` +ORDER BY + `bfcol_5` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql new file mode 100644 index 0000000000..837b805ca4 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql @@ -0,0 +1,24 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int64_col`, + `bfcol_2` AS `string_col`, + `bfcol_3` AS `float64_col`, + `bfcol_4` AS `bool_col` +FROM `bfcte_1` +ORDER BY + `bfcol_5` ASC NULLS LAST +LIMIT 10 \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql new file mode 100644 index 0000000000..9376691572 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql @@ -0,0 +1,40 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_5`, + `bfcol_1` AS `bfcol_6`, + `bfcol_2` AS `bfcol_7`, + `bfcol_3` AS `bfcol_8`, + `bfcol_4` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_5` AS `bfcol_10` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + `bfcol_5` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14`, + `bfcol_9` AS `bfcol_15`, + `bfcol_10` AS `bfcol_16` + FROM `bfcte_2` +) +SELECT + `bfcol_11` AS `rowindex`, + `bfcol_12` AS `int64_col`, + `bfcol_13` AS `string_col`, + `bfcol_14` AS `float64_col`, + `bfcol_15` AS `bool_col` +FROM `bfcte_3` +ORDER BY + `bfcol_16` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py index 848ace58f3..41e01e9b25 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readtable.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -22,3 +22,15 @@ def test_compile_readtable(compiler_session: bigframes.Session, snapshot): bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readtable_w_ordering(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df = bf_df.set_index("rowindex").sort_index() + 
snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readtable_w_limit(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df = bf_df.sort_values("int64_col").head(10) + snapshot.assert_match(bf_df.sql, "out.sql") From 33ab2b85dca8d358a022137ca49b2ee0d6f4192e Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Jun 2025 14:40:14 -0700 Subject: [PATCH 22/23] chore: inject dtypes to SQLGlot scalar expr compiler (#1821) * chore: inject dtypes to SQLGlot scalar expr compiler * fix format --- bigframes/core/compile/sqlglot/compiler.py | 9 ++--- .../core/compile/sqlglot/scalar_compiler.py | 29 +++++++++++---- bigframes/core/rewrite/schema_binding.py | 8 ++++- .../test_compile_readtable_w_limit/out.sql | 9 ++--- .../test_compile_readtable_w_ordering/out.sql | 36 ++++--------------- .../test_compile_numerical_add/out.sql | 33 +++++++++++++++++ .../test_compile_string_add/out.sql | 33 +++++++++++++++++ .../sqlglot/test_compile_scalar_expr.py | 31 ++++++++++++++++ 8 files changed, 140 insertions(+), 48 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 68b572f911..84fd7124ba 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -119,14 +119,16 @@ def _remap_variables(self, node: nodes.ResultNode) -> nodes.ResultNode: return typing.cast(nodes.ResultNode, result_node) def _compile_result_node(self, root: nodes.ResultNode) -> str: - sqlglot_ir = self.compile_node(root.child) - + # Have to bind schema as the final step before compilation. + root = typing.cast(nodes.ResultNode, schema_binding.bind_schema_to_tree(root)) selected_cols: tuple[tuple[str, sge.Expression], ...] = tuple( (name, scalar_compiler.compile_scalar_expression(ref)) for ref, name in root.output_cols ) # Skip squashing selections to ensure the right ordering and limit keys - sqlglot_ir = sqlglot_ir.select(selected_cols, squash_selections=False) + sqlglot_ir = self.compile_node(root.child).select( + selected_cols, squash_selections=False + ) if root.order_by is not None: ordering_cols = tuple( @@ -220,6 +222,5 @@ def compile_concat( def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) - node = nodes.bottom_up(node, schema_binding.bind_schema_to_expressions) node = nodes.bottom_up(node, rewrite.rewrite_range_rolling) return node diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 18d709732a..00ec892620 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -13,15 +13,25 @@ # limitations under the License. 
from __future__ import annotations +import dataclasses import functools import sqlglot.expressions as sge +from bigframes import dtypes from bigframes.core import expression import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.operations as ops +@dataclasses.dataclass(frozen=True) +class TypedExpr: + """SQLGlot expression with type.""" + + expr: sge.Expression + dtype: dtypes.ExpressionType + + @functools.singledispatch def compile_scalar_expression( expression: expression.Expression, @@ -50,9 +60,12 @@ def compile_constant_expression( @compile_scalar_expression.register -def compile_op_expression(expr: expression.OpExpression): +def compile_op_expression(expr: expression.OpExpression) -> sge.Expression: # Non-recursively compiles the children scalar expressions. - args = tuple(map(compile_scalar_expression, expr.inputs)) + args = tuple( + TypedExpr(compile_scalar_expression(input), input.output_type) + for input in expr.inputs + ) op = expr.op op_name = expr.op.__class__.__name__ @@ -79,8 +92,10 @@ def compile_op_expression(expr: expression.OpExpression): # TODO: add parenthesize for operators -def compile_addop( - op: ops.AddOp, left: sge.Expression, right: sge.Expression -) -> sge.Expression: - # TODO: support addop for string dtype. - return sge.Add(this=left, expression=right) +def compile_addop(op: ops.AddOp, left: TypedExpr, right: TypedExpr) -> sge.Expression: + if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: + # String addition + return sge.Concat(expressions=[left.expr, right.expr]) + + # Numerical addition + return sge.Add(this=left.expr, expression=right.expr) diff --git a/bigframes/core/rewrite/schema_binding.py b/bigframes/core/rewrite/schema_binding.py index f3c313233b..aa5cb986b9 100644 --- a/bigframes/core/rewrite/schema_binding.py +++ b/bigframes/core/rewrite/schema_binding.py @@ -19,7 +19,13 @@ from bigframes.core import nodes -def bind_schema_to_expressions( +def bind_schema_to_tree( + node: bigframe_node.BigFrameNode, +) -> bigframe_node.BigFrameNode: + return nodes.bottom_up(node, bind_schema_to_node) + + +def bind_schema_to_node( node: bigframe_node.BigFrameNode, ) -> bigframe_node.BigFrameNode: if isinstance(node, nodes.ProjectionNode): diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql index 837b805ca4..c5724c8442 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql @@ -6,11 +6,6 @@ WITH `bfcte_0` AS ( `float64_col` AS `bfcol_3`, `bool_col` AS `bfcol_4` FROM `test-project`.`test_dataset`.`test_table` -), `bfcte_1` AS ( - SELECT - *, - `bfcol_1` AS `bfcol_5` - FROM `bfcte_0` ) SELECT `bfcol_0` AS `rowindex`, @@ -18,7 +13,7 @@ SELECT `bfcol_2` AS `string_col`, `bfcol_3` AS `float64_col`, `bfcol_4` AS `bool_col` -FROM `bfcte_1` +FROM `bfcte_0` ORDER BY - `bfcol_5` ASC NULLS LAST + `bfcol_1` ASC NULLS LAST LIMIT 10 \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql index 9376691572..238659cc01 100644 --- 
a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql @@ -6,35 +6,13 @@ WITH `bfcte_0` AS ( `float64_col` AS `bfcol_3`, `bool_col` AS `bfcol_4` FROM `test-project`.`test_dataset`.`test_table` -), `bfcte_1` AS ( - SELECT - `bfcol_0` AS `bfcol_5`, - `bfcol_1` AS `bfcol_6`, - `bfcol_2` AS `bfcol_7`, - `bfcol_3` AS `bfcol_8`, - `bfcol_4` AS `bfcol_9` - FROM `bfcte_0` -), `bfcte_2` AS ( - SELECT - *, - `bfcol_5` AS `bfcol_10` - FROM `bfcte_1` -), `bfcte_3` AS ( - SELECT - `bfcol_5` AS `bfcol_11`, - `bfcol_6` AS `bfcol_12`, - `bfcol_7` AS `bfcol_13`, - `bfcol_8` AS `bfcol_14`, - `bfcol_9` AS `bfcol_15`, - `bfcol_10` AS `bfcol_16` - FROM `bfcte_2` ) SELECT - `bfcol_11` AS `rowindex`, - `bfcol_12` AS `int64_col`, - `bfcol_13` AS `string_col`, - `bfcol_14` AS `float64_col`, - `bfcol_15` AS `bool_col` -FROM `bfcte_3` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int64_col`, + `bfcol_2` AS `string_col`, + `bfcol_3` AS `float64_col`, + `bfcol_4` AS `bool_col` +FROM `bfcte_0` ORDER BY - `bfcol_16` ASC NULLS LAST \ No newline at end of file + `bfcol_0` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql new file mode 100644 index 0000000000..405b02d897 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql @@ -0,0 +1,33 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_5`, + `bfcol_2` AS `bfcol_6`, + `bfcol_3` AS `bfcol_7`, + `bfcol_4` AS `bfcol_8`, + `bfcol_1` + `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + `bfcol_5` AS `bfcol_10`, + `bfcol_9` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14` + FROM `bfcte_1` +) +SELECT + `bfcol_10` AS `rowindex`, + `bfcol_11` AS `int64_col`, + `bfcol_12` AS `string_col`, + `bfcol_13` AS `float64_col`, + `bfcol_14` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql new file mode 100644 index 0000000000..49ec5435f9 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql @@ -0,0 +1,33 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_5`, + `bfcol_1` AS `bfcol_6`, + `bfcol_3` AS `bfcol_7`, + `bfcol_4` AS `bfcol_8`, + CONCAT(`bfcol_2`, 'a') AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + `bfcol_5` AS `bfcol_10`, + `bfcol_6` AS `bfcol_11`, + `bfcol_9` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14` + FROM `bfcte_1` +) +SELECT + `bfcol_10` AS `rowindex`, + `bfcol_11` AS `int64_col`, + `bfcol_12` AS `string_col`, + 
`bfcol_13` AS `float64_col`, + `bfcol_14` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py b/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py new file mode 100644 index 0000000000..ebdb82477f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes + +pytest.importorskip("pytest_snapshot") + + +def test_compile_numerical_add(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df["int64_col"] = bf_df["int64_col"] + bf_df["int64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_string_add(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df["string_col"] = bf_df["string_col"] + "a" + snapshot.assert_match(bf_df.sql, "out.sql") From eef158b7143868131154e2643eac5cd542aac0ff Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 16 Jun 2025 15:28:48 -0700 Subject: [PATCH 23/23] chore(main): release 2.7.0 (#1805) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> Co-authored-by: Shuowei Li --- CHANGELOG.md | 23 +++++++++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0393ad944c..46b97c2210 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,29 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.7.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.6.0...v2.7.0) (2025-06-16) + + +### Features + +* Add bbq.json_query_array and warn bbq.json_extract_array deprecated ([#1811](https://github.com/googleapis/python-bigquery-dataframes/issues/1811)) ([dc9eb27](https://github.com/googleapis/python-bigquery-dataframes/commit/dc9eb27fa75e90c2c95a0619551bf67aea6ef63b)) +* Add bbq.json_value_array and deprecate bbq.json_extract_string_array ([#1818](https://github.com/googleapis/python-bigquery-dataframes/issues/1818)) ([019051e](https://github.com/googleapis/python-bigquery-dataframes/commit/019051e453d81769891aa398475ebd04d1826e81)) +* Add groupby cumcount ([#1798](https://github.com/googleapis/python-bigquery-dataframes/issues/1798)) ([18f43e8](https://github.com/googleapis/python-bigquery-dataframes/commit/18f43e8b58e03a27b021bce07566a3d006ac3679)) +* Support custom build service account in `remote_function` ([#1796](https://github.com/googleapis/python-bigquery-dataframes/issues/1796)) ([e586151](https://github.com/googleapis/python-bigquery-dataframes/commit/e586151df81917b49f702ae496aaacbd02931636)) + + +### Bug Fixes + +* Correct read_csv behaviours with use_cols, names, index_col 
([#1804](https://github.com/googleapis/python-bigquery-dataframes/issues/1804)) ([855031a](https://github.com/googleapis/python-bigquery-dataframes/commit/855031a316a6957731a5d1c5e59dedb9757d9f7a)) +* Fix single row broadcast with null index ([#1803](https://github.com/googleapis/python-bigquery-dataframes/issues/1803)) ([080eb7b](https://github.com/googleapis/python-bigquery-dataframes/commit/080eb7be3cde591e08cad0d5c52c68cc0b25ade8)) + + +### Documentation + +* Document how to use ai.map() for information extraction ([#1808](https://github.com/googleapis/python-bigquery-dataframes/issues/1808)) ([b586746](https://github.com/googleapis/python-bigquery-dataframes/commit/b5867464a5bf30300dcfc069eda546b11f03146c)) +* Rearrange README.rst to include a short code sample ([#1812](https://github.com/googleapis/python-bigquery-dataframes/issues/1812)) ([f6265db](https://github.com/googleapis/python-bigquery-dataframes/commit/f6265dbb8e22de81bb59c7def175cd325e85c041)) +* Use pandas API instead of pandas-like or pandas-compatible ([#1825](https://github.com/googleapis/python-bigquery-dataframes/issues/1825)) ([aa32369](https://github.com/googleapis/python-bigquery-dataframes/commit/aa323694e161f558bc5e60490c2f21008961e2ca)) + ## [2.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.5.0...v2.6.0) (2025-06-09) diff --git a/bigframes/version.py b/bigframes/version.py index e41364d4d1..138c380d0c 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.6.0" +__version__ = "2.7.0" # {x-release-please-start-date} -__release_date__ = "2025-06-09" +__release_date__ = "2025-06-16" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index e41364d4d1..138c380d0c 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.6.0" +__version__ = "2.7.0" # {x-release-please-start-date} -__release_date__ = "2025-06-09" +__release_date__ = "2025-06-16" # {x-release-please-end}