From e0f065fec9ccf4656838924619f0b954a9a9f667 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 9 Jun 2025 15:52:36 -0700 Subject: [PATCH 01/23] test: Engine tests for selection ops (#1800) --- tests/system/small/engines/conftest.py | 9 ++- tests/system/small/engines/engine_utils.py | 31 ++++++++++ tests/system/small/engines/test_read_local.py | 30 +++------- tests/system/small/engines/test_selection.py | 60 +++++++++++++++++++ 4 files changed, 107 insertions(+), 23 deletions(-) create mode 100644 tests/system/small/engines/engine_utils.py create mode 100644 tests/system/small/engines/test_selection.py diff --git a/tests/system/small/engines/conftest.py b/tests/system/small/engines/conftest.py index 2a72cb2196..249bd59260 100644 --- a/tests/system/small/engines/conftest.py +++ b/tests/system/small/engines/conftest.py @@ -19,7 +19,7 @@ import pytest import bigframes -from bigframes.core import local_data +from bigframes.core import ArrayValue, local_data from bigframes.session import ( direct_gbq_execution, local_scan_executor, @@ -62,6 +62,13 @@ def managed_data_source( return local_data.ManagedArrowTable.from_pandas(scalars_pandas_df_index) +@pytest.fixture(scope="module") +def scalars_array_value( + managed_data_source: local_data.ManagedArrowTable, fake_session: bigframes.Session +): + return ArrayValue.from_managed(managed_data_source, fake_session) + + @pytest.fixture(scope="module") def zero_row_source() -> local_data.ManagedArrowTable: return local_data.ManagedArrowTable.from_pandas(pd.DataFrame({"a": [], "b": []})) diff --git a/tests/system/small/engines/engine_utils.py b/tests/system/small/engines/engine_utils.py new file mode 100644 index 0000000000..f58e5951a1 --- /dev/null +++ b/tests/system/small/engines/engine_utils.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from bigframes.core import nodes +from bigframes.session import semi_executor + + +def assert_equivalence_execution( + node: nodes.BigFrameNode, + engine1: semi_executor.SemiExecutor, + engine2: semi_executor.SemiExecutor, +): + e1_result = engine1.execute(node, ordered=True) + e2_result = engine2.execute(node, ordered=True) + assert e1_result is not None + assert e2_result is not None + # Schemas might have extra nullity markers, normalize to node expected schema, which should be looser + e1_table = e1_result.to_arrow_table().cast(node.schema.to_pyarrow()) + e2_table = e2_result.to_arrow_table().cast(node.schema.to_pyarrow()) + assert e1_table.equals(e2_table), f"{e1_table} is not equal to {e2_table}" diff --git a/tests/system/small/engines/test_read_local.py b/tests/system/small/engines/test_read_local.py index 7bf1316a44..0517f45f1a 100644 --- a/tests/system/small/engines/test_read_local.py +++ b/tests/system/small/engines/test_read_local.py @@ -16,7 +16,8 @@ import bigframes from bigframes.core import identifiers, local_data, nodes -from bigframes.session import polars_executor, semi_executor +from bigframes.session import polars_executor +from tests.system.small.engines.engine_utils import assert_equivalence_execution pytest.importorskip("polars") @@ -24,21 +25,6 @@ REFERENCE_ENGINE = polars_executor.PolarsExecutor() -def ensure_equivalence( - node: nodes.BigFrameNode, - engine1: semi_executor.SemiExecutor, - engine2: semi_executor.SemiExecutor, -): - e1_result = engine1.execute(node, ordered=True) - e2_result = engine2.execute(node, ordered=True) - assert e1_result is not None - assert e2_result is not None - # Schemas might have extra nullity markers, normalize to node expected schema, which should be looser - e1_table = e1_result.to_arrow_table().cast(node.schema.to_pyarrow()) - e2_table = e2_result.to_arrow_table().cast(node.schema.to_pyarrow()) - assert e1_table.equals(e2_table), f"{e1_table} is not equal to {e2_table}" - - def test_engines_read_local( fake_session: bigframes.Session, managed_data_source: local_data.ManagedArrowTable, @@ -51,7 +37,7 @@ def test_engines_read_local( local_node = nodes.ReadLocalNode( managed_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_offsets( @@ -69,7 +55,7 @@ def test_engines_read_local_w_offsets( fake_session, offsets_col=identifiers.ColumnId("offsets"), ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_col_subset( @@ -84,7 +70,7 @@ def test_engines_read_local_w_col_subset( local_node = nodes.ReadLocalNode( managed_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_zero_row_source( @@ -99,7 +85,7 @@ def test_engines_read_local_w_zero_row_source( local_node = nodes.ReadLocalNode( zero_row_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_nested_source( @@ -114,7 +100,7 @@ def test_engines_read_local_w_nested_source( local_node = nodes.ReadLocalNode( nested_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, 
REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) def test_engines_read_local_w_repeated_source( @@ -129,4 +115,4 @@ def test_engines_read_local_w_repeated_source( local_node = nodes.ReadLocalNode( repeated_data_source, scan_list, fake_session, offsets_col=None ) - ensure_equivalence(local_node, REFERENCE_ENGINE, engine) + assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/engines/test_selection.py b/tests/system/small/engines/test_selection.py new file mode 100644 index 0000000000..6350e79403 --- /dev/null +++ b/tests/system/small/engines/test_selection.py @@ -0,0 +1,60 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.core import array_value, expression, identifiers, nodes +from bigframes.session import polars_executor +from tests.system.small.engines.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. +REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +def test_engines_select_identity( + scalars_array_value: array_value.ArrayValue, + engine, +): + selection = tuple( + nodes.AliasedRef(expression.deref(col), identifiers.ColumnId(col)) + for col in scalars_array_value.column_ids + ) + node = nodes.SelectionNode(scalars_array_value.node, selection) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +def test_engines_select_rename( + scalars_array_value: array_value.ArrayValue, + engine, +): + selection = tuple( + nodes.AliasedRef(expression.deref(col), identifiers.ColumnId(f"renamed_{col}")) + for col in scalars_array_value.column_ids + ) + node = nodes.SelectionNode(scalars_array_value.node, selection) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +def test_engines_select_reorder_rename_drop( + scalars_array_value: array_value.ArrayValue, + engine, +): + selection = tuple( + nodes.AliasedRef(expression.deref(col), identifiers.ColumnId(f"renamed_{col}")) + for col in scalars_array_value.column_ids[::-2] + ) + node = nodes.SelectionNode(scalars_array_value.node, selection) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) From 080eb7be3cde591e08cad0d5c52c68cc0b25ade8 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 10 Jun 2025 11:03:48 -0700 Subject: [PATCH 02/23] fix: Fix single row broadcast with null index (#1803) --- bigframes/core/blocks.py | 2 +- tests/system/small/test_dataframe.py | 31 ++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 35cb7d41ae..acfa399d75 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2895,7 +2895,7 @@ def join_with_single_row( combined_expr, index_columns=index_cols_post_join, column_labels=left.column_labels.append(single_row_block.column_labels), - index_labels=[left.index.name], + 
index_labels=left.index.names, ) return ( block, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index c80ced45a5..18d8fed7dc 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2657,16 +2657,16 @@ def test_listlike_binop_axis_1_bf_index(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) -def test_binop_with_self_aggregate(session, scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered df_columns = ["int64_col", "float64_col", "int64_too"] # Ensure that this takes the optimized single-query path by counting executions - execution_count_before = session._metrics.execution_count + execution_count_before = scalars_df._session._metrics.execution_count bf_df = scalars_df[df_columns] bf_result = (bf_df - bf_df.mean()).to_pandas() - execution_count_after = session._metrics.execution_count + execution_count_after = scalars_df._session._metrics.execution_count pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() @@ -2677,6 +2677,29 @@ def test_binop_with_self_aggregate(session, scalars_dfs): assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) +def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + df_columns = ["int64_col", "float64_col", "int64_too"] + + # Ensure that this takes the optimized single-query path by counting executions + execution_count_before = scalars_df._session._metrics.execution_count + bf_df = scalars_df[df_columns].reset_index(drop=True) + bf_result = (bf_df - bf_df.mean()).to_pandas() + execution_count_after = scalars_df._session._metrics.execution_count + + pd_df = scalars_pandas_df[df_columns].reset_index(drop=True) + pd_result = pd_df - pd_df.mean() + + executions = execution_count_after - execution_count_before + + assert executions == 1 + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("left_labels", "right_labels"), [ From 3edc313307d753396a333570d7952984128c694b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 10 Jun 2025 17:14:16 -0500 Subject: [PATCH 03/23] chore: prevent location warning when `_read_gbq_colab` can determine the location (#1802) * chore: prevent location warning when _read_gbq_colab can determine the location I've made some updates to prevent a location warning when the system can determine the location for `bigframes.pandas.io.api._read_gbq_colab`. I've updated `bigframes.pandas.io.api._read_gbq_colab` so it correctly calls `bigframes.session.Session._read_gbq_colab` and adjusted its arguments. The `_read_gbq_colab` function in the pandas API layer now has a simpler signature, accepting `query_or_table`, `pyformat_args`, and `dry_run`. It will continue to call `_set_default_session_location_if_possible` to prevent location warnings. I've also updated the unit tests to reflect these changes, making sure that the correct session-level function is called and that arguments are passed through as expected. I've also moved the tests to `tests/unit/pandas/io/test_api.py` and converted them to pytest style to follow our repository conventions. 
* Update tests/unit/pandas/io/test_api.py * remove unused import * format the query first --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/pandas/io/api.py | 61 ++++++++++++++++++++++++++++++++ tests/unit/pandas/io/test_api.py | 48 +++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tests/unit/pandas/io/test_api.py diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index b2ce5f211e..e1fd7218bd 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -216,6 +216,67 @@ def read_gbq( read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) +@overload +def _read_gbq_colab( # type: ignore[overload-overlap] + query_or_table: str, + *, + pyformat_args: Optional[Dict[str, Any]] = ..., + dry_run: Literal[False] = ..., +) -> bigframes.dataframe.DataFrame: + ... + + +@overload +def _read_gbq_colab( + query_or_table: str, + *, + pyformat_args: Optional[Dict[str, Any]] = ..., + dry_run: Literal[True] = ..., +) -> pandas.Series: + ... + + +def _read_gbq_colab( + query_or_table: str, + *, + pyformat_args: Optional[Dict[str, Any]] = None, + dry_run: bool = False, +) -> bigframes.dataframe.DataFrame | pandas.Series: + """A Colab-specific version of read_gbq. + + Calls `_set_default_session_location_if_possible` and then delegates + to `bigframes.session.Session._read_gbq_colab`. + + Args: + query_or_table (str): + SQL query or table ID (table ID not yet supported). + pyformat_args (Optional[Dict[str, Any]]): + Parameters to format into the query string. + dry_run (bool): + If True, estimates the query results size without returning data. + The return will be a pandas Series with query metadata. + + Returns: + Union[bigframes.dataframe.DataFrame, pandas.Series]: + A BigQuery DataFrame if `dry_run` is False, otherwise a pandas Series. + """ + if pyformat_args is None: + pyformat_args = {} + + query = bigframes.core.pyformat.pyformat( + query_or_table, + pyformat_args=pyformat_args, + ) + _set_default_session_location_if_possible(query) + + return global_session.with_default_session( + bigframes.session.Session._read_gbq_colab, + query_or_table, + pyformat_args=pyformat_args, + dry_run=dry_run, + ) + + def read_gbq_model(model_name: str): return global_session.with_default_session( bigframes.session.Session.read_gbq_model, diff --git a/tests/unit/pandas/io/test_api.py b/tests/unit/pandas/io/test_api.py new file mode 100644 index 0000000000..fbc9027552 --- /dev/null +++ b/tests/unit/pandas/io/test_api.py @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest import mock + +import bigframes.dataframe +import bigframes.pandas.io.api as bf_io_api +import bigframes.session + + +@mock.patch("bigframes.pandas.io.api._set_default_session_location_if_possible") +@mock.patch("bigframes.core.global_session.with_default_session") +def test_read_gbq_colab_calls_set_location( + mock_with_default_session, mock_set_location +): + # Configure the mock for with_default_session to return a DataFrame mock + mock_df = mock.create_autospec(bigframes.dataframe.DataFrame) + mock_with_default_session.return_value = mock_df + + query_or_table = "SELECT {param1} AS param1" + sample_pyformat_args = {"param1": "value1"} + result = bf_io_api._read_gbq_colab( + query_or_table, pyformat_args=sample_pyformat_args, dry_run=False + ) + + # Make sure that we format the SQL first to prevent syntax errors. + formatted_query = "SELECT 'value1' AS param1" + mock_set_location.assert_called_once_with(formatted_query) + mock_with_default_session.assert_called_once() + + # Check the actual arguments passed to with_default_session + args, kwargs = mock_with_default_session.call_args + assert args[0] == bigframes.session.Session._read_gbq_colab + assert args[1] == query_or_table + assert kwargs["pyformat_args"] == sample_pyformat_args + assert not kwargs["dry_run"] + assert isinstance(result, bigframes.dataframe.DataFrame) From 18f43e8b58e03a27b021bce07566a3d006ac3679 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 10 Jun 2025 17:29:13 -0700 Subject: [PATCH 04/23] feat: add groupby cumcount (#1798) --- bigframes/core/array_value.py | 22 +++++++-- bigframes/core/blocks.py | 32 ++++++++++--- bigframes/core/groupby/dataframe_group_by.py | 29 ++++++++++-- tests/system/small/test_groupby.py | 46 +++++++++++++++---- .../pandas/core/groupby/__init__.py | 1 - 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index a6c700a485..4b05781cb7 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -403,8 +403,23 @@ def project_window_op( never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ + + return self.project_window_expr( + ex.UnaryAggregation(op, ex.deref(column_name)), + window_spec, + never_skip_nulls, + skip_reproject_unsafe, + ) + + def project_window_expr( + self, + expression: ex.Aggregation, + window: WindowSpec, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ): # TODO: Support non-deterministic windowing - if window_spec.is_row_bounded or not op.order_independent: + if window.is_row_bounded or not expression.op.order_independent: if self.node.order_ambiguous and not self.session._strictly_ordered: if not self.session._allows_ambiguity: raise ValueError( @@ -415,14 +430,13 @@ def project_window_op( "Window ordering may be ambiguous, this can cause unstable results." 
) warnings.warn(msg, category=bfe.AmbiguousWindowWarning) - output_name = self._gen_namespaced_uid() return ( ArrayValue( nodes.WindowOpNode( child=self.node, - expression=ex.UnaryAggregation(op, ex.deref(column_name)), - window_spec=window_spec, + expression=expression, + window_spec=window, output_name=ids.ColumnId(output_name), never_skip_nulls=never_skip_nulls, skip_reproject_unsafe=skip_reproject_unsafe, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index acfa399d75..4607928b78 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1012,16 +1012,34 @@ def apply_window_op( skip_null_groups: bool = False, skip_reproject_unsafe: bool = False, never_skip_nulls: bool = False, + ) -> typing.Tuple[Block, str]: + agg_expr = ex.UnaryAggregation(op, ex.deref(column)) + return self.apply_analytic( + agg_expr, + window_spec, + result_label, + skip_reproject_unsafe=skip_reproject_unsafe, + never_skip_nulls=never_skip_nulls, + skip_null_groups=skip_null_groups, + ) + + def apply_analytic( + self, + agg_expr: ex.Aggregation, + window: windows.WindowSpec, + result_label: Label, + *, + skip_reproject_unsafe: bool = False, + never_skip_nulls: bool = False, + skip_null_groups: bool = False, ) -> typing.Tuple[Block, str]: block = self if skip_null_groups: - for key in window_spec.grouping_keys: - block, not_null_id = block.apply_unary_op(key.id.name, ops.notnull_op) - block = block.filter_by_id(not_null_id).drop_columns([not_null_id]) - expr, result_id = block._expr.project_window_op( - column, - op, - window_spec, + for key in window.grouping_keys: + block = block.filter(ops.notnull_op.as_expr(key.id.name)) + expr, result_id = block._expr.project_window_expr( + agg_expr, + window, skip_reproject_unsafe=skip_reproject_unsafe, never_skip_nulls=never_skip_nulls, ) diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index f234bad126..a2c4cf2867 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -275,6 +275,27 @@ def count(self) -> df.DataFrame: def nunique(self) -> df.DataFrame: return self._aggregate_all(agg_ops.nunique_op) + @validations.requires_ordering() + def cumcount(self, ascending: bool = True) -> series.Series: + window_spec = ( + window_specs.cumulative_rows(grouping_keys=tuple(self._by_col_ids)) + if ascending + else window_specs.inverse_cumulative_rows( + grouping_keys=tuple(self._by_col_ids) + ) + ) + block, result_id = self._block.apply_analytic( + ex.NullaryAggregation(agg_ops.size_op), + window=window_spec, + result_label=None, + ) + result = series.Series(block.select_column(result_id)) - 1 + if self._dropna and (len(self._by_col_ids) == 1): + result = result.mask( + series.Series(block.select_column(self._by_col_ids[0])).isna() + ) + return result + @validations.requires_ordering() def cumsum(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: if not numeric_only: @@ -546,10 +567,12 @@ def _apply_window_op( ) columns, _ = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( - columns, op, window_spec=window_spec + columns, + op, + window_spec=window_spec, ) - block = block.select_columns(result_ids) - return df.DataFrame(block) + result = df.DataFrame(block.select_columns(result_ids)) + return result def _resolve_label(self, label: blocks.Label) -> str: """Resolve label to column id.""" diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py 
index f1d2bacf08..bc2e9cc385 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -383,14 +383,14 @@ def test_dataframe_groupby_multi_sum( @pytest.mark.parametrize( - ("operator"), + ("operator", "dropna"), [ - (lambda x: x.cumsum(numeric_only=True)), - (lambda x: x.cummax(numeric_only=True)), - (lambda x: x.cummin(numeric_only=True)), + (lambda x: x.cumsum(numeric_only=True), True), + (lambda x: x.cummax(numeric_only=True), True), + (lambda x: x.cummin(numeric_only=True), False), # Pre-pandas 2.2 doesn't always proeduce float. - (lambda x: x.cumprod().astype("Float64")), - (lambda x: x.shift(periods=2)), + (lambda x: x.cumprod().astype("Float64"), False), + (lambda x: x.shift(periods=2), True), ], ids=[ "cumsum", @@ -401,16 +401,44 @@ def test_dataframe_groupby_multi_sum( ], ) def test_dataframe_groupby_analytic( - scalars_df_index, scalars_pandas_df_index, operator + scalars_df_index, + scalars_pandas_df_index, + operator, + dropna, ): col_names = ["float64_col", "int64_col", "bool_col", "string_col"] - bf_result = operator(scalars_df_index[col_names].groupby("string_col")) - pd_result = operator(scalars_pandas_df_index[col_names].groupby("string_col")) + bf_result = operator( + scalars_df_index[col_names].groupby("string_col", dropna=dropna) + ) + pd_result = operator( + scalars_pandas_df_index[col_names].groupby("string_col", dropna=dropna) + ) bf_result_computed = bf_result.to_pandas() pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +@pytest.mark.parametrize( + ("ascending", "dropna"), + [ + (True, True), + (False, False), + ], +) +def test_dataframe_groupby_cumcount( + scalars_df_index, scalars_pandas_df_index, ascending, dropna +): + bf_result = scalars_df_index.groupby("string_col", dropna=dropna).cumcount( + ascending + ) + pd_result = scalars_pandas_df_index.groupby("string_col", dropna=dropna).cumcount( + ascending + ) + bf_result_computed = bf_result.to_pandas() + + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) + + def test_dataframe_groupby_size_as_index_false( scalars_df_index, scalars_pandas_df_index ): diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 4fb8498932..ebfbfa8830 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -718,7 +718,6 @@ def max( def cumcount(self, ascending: bool = True): """ Number each item in each group from 0 to the length of that group - 1. - (DataFrameGroupBy functionality is not yet available.) 
**Examples:** From b3db5197444262b487532b4c7d5fcc4f50ee1404 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 10 Jun 2025 20:34:00 -0700 Subject: [PATCH 05/23] chore: Move remaining test utils to bigframes.testing module (#1810) Co-authored-by: Shenyang Cai --- .../testing}/compiler_session.py | 0 .../engines => bigframes/testing}/engine_utils.py | 0 {tests/system => bigframes/testing}/utils.py | 0 pytest.ini | 1 + scripts/__init__.py | 13 ------------- scripts/conftest.py | 8 ++++++++ scripts/test_publish_api_coverage.py | 5 ++--- tests/system/conftest.py | 10 +++++----- .../system/large/functions/test_managed_function.py | 2 +- .../system/large/functions/test_remote_function.py | 2 +- tests/system/large/ml/test_cluster.py | 2 +- tests/system/large/ml/test_compose.py | 2 +- tests/system/large/ml/test_core.py | 2 +- tests/system/large/ml/test_decomposition.py | 2 +- tests/system/large/ml/test_ensemble.py | 2 +- tests/system/large/ml/test_forecasting.py | 2 +- tests/system/large/ml/test_linear_model.py | 2 +- tests/system/large/ml/test_model_selection.py | 2 +- tests/system/large/ml/test_pipeline.py | 2 +- tests/system/load/test_llm.py | 2 +- tests/system/small/bigquery/test_vector_search.py | 2 +- tests/system/small/engines/test_read_local.py | 2 +- tests/system/small/engines/test_selection.py | 2 +- .../system/small/functions/test_remote_function.py | 2 +- tests/system/small/geopandas/test_geoseries.py | 2 +- tests/system/small/ml/test_cluster.py | 2 +- tests/system/small/ml/test_core.py | 2 +- tests/system/small/ml/test_decomposition.py | 10 +++++----- tests/system/small/ml/test_llm.py | 2 +- tests/system/small/ml/test_multimodal_llm.py | 2 +- tests/system/small/ml/test_preprocessing.py | 2 +- tests/system/small/operations/test_datetimes.py | 2 +- tests/system/small/operations/test_lists.py | 2 +- tests/system/small/operations/test_strings.py | 3 +-- .../regression/test_issue355_merge_after_filter.py | 2 +- tests/system/small/test_dataframe.py | 2 +- tests/system/small/test_dataframe_io.py | 2 +- tests/system/small/test_encryption.py | 2 +- tests/system/small/test_groupby.py | 2 +- tests/system/small/test_index.py | 2 +- tests/system/small/test_large_local_data.py | 2 +- tests/system/small/test_multiindex.py | 2 +- tests/system/small/test_pandas.py | 2 +- tests/system/small/test_series.py | 2 +- tests/system/small/test_session.py | 2 +- tests/system/small/test_unordered.py | 2 +- tests/unit/core/compile/sqlglot/conftest.py | 6 +++--- tests/unit/test_dataframe_polars.py | 2 +- 48 files changed, 62 insertions(+), 68 deletions(-) rename {tests/unit/core/compile/sqlglot => bigframes/testing}/compiler_session.py (100%) rename {tests/system/small/engines => bigframes/testing}/engine_utils.py (100%) rename {tests/system => bigframes/testing}/utils.py (100%) delete mode 100644 scripts/__init__.py create mode 100644 scripts/conftest.py diff --git a/tests/unit/core/compile/sqlglot/compiler_session.py b/bigframes/testing/compiler_session.py similarity index 100% rename from tests/unit/core/compile/sqlglot/compiler_session.py rename to bigframes/testing/compiler_session.py diff --git a/tests/system/small/engines/engine_utils.py b/bigframes/testing/engine_utils.py similarity index 100% rename from tests/system/small/engines/engine_utils.py rename to bigframes/testing/engine_utils.py diff --git a/tests/system/utils.py b/bigframes/testing/utils.py similarity index 100% rename from tests/system/utils.py rename to bigframes/testing/utils.py diff --git a/pytest.ini b/pytest.ini index 
204c743bbf..75b69ce435 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,3 +2,4 @@ doctest_optionflags = NORMALIZE_WHITESPACE filterwarnings = ignore::pandas.errors.SettingWithCopyWarning +addopts = "--import-mode=importlib" diff --git a/scripts/__init__.py b/scripts/__init__.py deleted file mode 100644 index 6d5e14bcf4..0000000000 --- a/scripts/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/scripts/conftest.py b/scripts/conftest.py new file mode 100644 index 0000000000..83fd2b19af --- /dev/null +++ b/scripts/conftest.py @@ -0,0 +1,8 @@ +from pathlib import Path +import sys + +# inserts scripts into path so that tests can import +project_root = Path(__file__).parent.parent +scripts_dir = project_root / "scripts" + +sys.path.insert(0, str(scripts_dir)) diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 6dea10b608..6e366b6854 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -15,16 +15,15 @@ import sys import pandas +from publish_api_coverage import build_api_coverage_table import pytest -from . import publish_api_coverage - pytest.importorskip("sklearn") @pytest.fixture def api_coverage_df(): - return publish_api_coverage.build_api_coverage_table("my_bf_ver", "my_release_ver") + return build_api_coverage_table("my_bf_ver", "my_release_ver") @pytest.mark.skipif( diff --git a/tests/system/conftest.py b/tests/system/conftest.py index a4bab1bcfe..4605d9ddbc 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -41,7 +41,7 @@ import bigframes.dataframe import bigframes.pandas as bpd import bigframes.series -import tests.system.utils +import bigframes.testing.utils # Use this to control the number of cloud functions being deleted in a single # test session. 
This should help soften the spike of the number of mutations per @@ -615,7 +615,7 @@ def scalars_pandas_df_default_index() -> pd.DataFrame: DATA_DIR / "scalars.jsonl", lines=True, ) - tests.system.utils.convert_pandas_dtypes(df, bytes_col=True) + bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) df = df.set_index("rowindex", drop=False) df.index.name = None @@ -1422,12 +1422,12 @@ def use_fast_query_path(): @pytest.fixture(scope="session", autouse=True) def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent): """Clean up stale cloud functions.""" - permanent_endpoints = tests.system.utils.get_remote_function_endpoints( + permanent_endpoints = bigframes.testing.utils.get_remote_function_endpoints( session.bqclient, dataset_id_permanent ) delete_count = 0 try: - for cloud_function in tests.system.utils.get_cloud_functions( + for cloud_function in bigframes.testing.utils.get_cloud_functions( cloudfunctions_client, session.bqclient.project, session.bqclient.location, @@ -1447,7 +1447,7 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent # Go ahead and delete try: - tests.system.utils.delete_cloud_function( + bigframes.testing.utils.delete_cloud_function( cloudfunctions_client, cloud_function.name ) delete_count += 1 diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 9eba1907e6..5cb54a00c1 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -21,7 +21,7 @@ import bigframes import bigframes.exceptions as bfe import bigframes.pandas as bpd -from tests.system.utils import cleanup_function_assets +from bigframes.testing.utils import cleanup_function_assets prefixer = test_utils.prefixer.Prefixer("bigframes", "") diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 426813b0ff..9e0dcfe4d7 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -36,7 +36,7 @@ import bigframes.functions._utils as bff_utils import bigframes.pandas as bpd import bigframes.series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_pandas_df_equal, cleanup_function_assets, delete_cloud_function, diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index 39368f490b..9736199b17 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -15,7 +15,7 @@ import pandas as pd from bigframes.ml import cluster -from tests.system import utils +from bigframes.testing import utils def test_cluster_configure_fit_score_predict( diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index cbc702018a..9279324b3c 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -13,7 +13,7 @@ # limitations under the License. from bigframes.ml import compose, preprocessing -from tests.system import utils +from bigframes.testing import utils def test_columntransformer_standalone_fit_and_transform( diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index c1e1cc19d9..6f0551b1ef 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -13,7 +13,7 @@ # limitations under the License. 
from bigframes.ml import globals -from tests.system import utils +from bigframes.testing import utils def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df): diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index e0e4b79c6f..c36e873816 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -16,7 +16,7 @@ import pandas.testing from bigframes.ml import decomposition -from tests.system import utils +from bigframes.testing import utils def test_decomposition_configure_fit_score_predict( diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 706cbfdfaf..c2e9036eed 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -15,7 +15,7 @@ import pytest import bigframes.ml.ensemble -from tests.system import utils +from bigframes.testing import utils @pytest.mark.flaky(retries=2) diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 56b93e5338..72a0ee469b 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -15,7 +15,7 @@ import pytest from bigframes.ml import forecasting -from tests.system import utils +from bigframes.testing import utils ARIMA_EVALUATE_OUTPUT_COL = [ "non_seasonal_p", diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index be98902007..f0e2892ba8 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -16,7 +16,7 @@ from bigframes.ml import model_selection import bigframes.ml.linear_model -from tests.system import utils +from bigframes.testing import utils def test_linear_regression_configure_fit_score(penguins_df_default_index, dataset_id): diff --git a/tests/system/large/ml/test_model_selection.py b/tests/system/large/ml/test_model_selection.py index c1856a1537..26174b7ee9 100644 --- a/tests/system/large/ml/test_model_selection.py +++ b/tests/system/large/ml/test_model_selection.py @@ -15,7 +15,7 @@ import pytest from bigframes.ml import linear_model, model_selection -from tests.system import utils +from bigframes.testing import utils @pytest.mark.parametrize( diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 84a6b11ff2..6c51a11a11 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -25,7 +25,7 @@ pipeline, preprocessing, ) -from tests.system import utils +from bigframes.testing import utils def test_pipeline_linear_regression_fit_score_predict( diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 5cf9621ef9..fc04956749 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -16,7 +16,7 @@ import pytest from bigframes.ml import llm -from tests.system import utils +from bigframes.testing import utils @pytest.fixture(scope="session") diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 6297d729ea..a282135fa6 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -23,7 +23,7 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal # Need at least 5,000 rows 
to create a vector index. VECTOR_DF = pd.DataFrame( diff --git a/tests/system/small/engines/test_read_local.py b/tests/system/small/engines/test_read_local.py index 0517f45f1a..82af7c984d 100644 --- a/tests/system/small/engines/test_read_local.py +++ b/tests/system/small/engines/test_read_local.py @@ -17,7 +17,7 @@ import bigframes from bigframes.core import identifiers, local_data, nodes from bigframes.session import polars_executor -from tests.system.small.engines.engine_utils import assert_equivalence_execution +from bigframes.testing.engine_utils import assert_equivalence_execution pytest.importorskip("polars") diff --git a/tests/system/small/engines/test_selection.py b/tests/system/small/engines/test_selection.py index 6350e79403..94c8a6463c 100644 --- a/tests/system/small/engines/test_selection.py +++ b/tests/system/small/engines/test_selection.py @@ -16,7 +16,7 @@ from bigframes.core import array_value, expression, identifiers, nodes from bigframes.session import polars_executor -from tests.system.small.engines.engine_utils import assert_equivalence_execution +from bigframes.testing.engine_utils import assert_equivalence_execution pytest.importorskip("polars") diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 7fc7caf2fc..47ab6e2174 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -31,7 +31,7 @@ from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff import bigframes.session._io.bigquery -from tests.system.utils import assert_pandas_df_equal, get_function_name +from bigframes.testing.utils import assert_pandas_df_equal, get_function_name _prefixer = test_utils.prefixer.Prefixer("bigframes", "") diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index 36dd070ef5..51344edcbd 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -31,7 +31,7 @@ import bigframes.geopandas import bigframes.pandas import bigframes.series -from tests.system.utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal @pytest.fixture(scope="session") diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index 96066e5fbe..4840329cda 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -16,7 +16,7 @@ from bigframes.ml import cluster import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal _PD_NEW_PENGUINS = pd.DataFrame.from_dict( { diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 3c5ba9bb18..ef62e5ddd3 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -23,7 +23,7 @@ import bigframes import bigframes.features from bigframes.ml import core -from tests.system import utils +from bigframes.testing import utils def test_model_eval( diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 9eb9b25ea1..10255003a1 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -16,7 +16,7 @@ from bigframes.ml import decomposition import bigframes.pandas as bpd -import tests.system.utils +import bigframes.testing.utils def 
test_pca_predict( @@ -33,7 +33,7 @@ def test_pca_predict( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - tests.system.utils.assert_pandas_df_equal_pca( + bigframes.testing.utils.assert_pandas_df_equal_pca( predictions, expected, check_exact=False, rtol=0.1 ) @@ -161,7 +161,7 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA): .reset_index(drop=True) ) - tests.system.utils.assert_pandas_df_equal_pca_components( + bigframes.testing.utils.assert_pandas_df_equal_pca_components( result, expected, check_exact=False, @@ -180,7 +180,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 1.125354], }, ) - tests.system.utils.assert_pandas_df_equal( + bigframes.testing.utils.assert_pandas_df_equal( result, expected, check_exact=False, @@ -200,7 +200,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): "explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - tests.system.utils.assert_pandas_df_equal( + bigframes.testing.utils.assert_pandas_df_equal( result, expected, check_exact=False, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 3d5453099d..11425400bf 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -22,7 +22,7 @@ from bigframes import exceptions from bigframes.ml import core, llm import bigframes.pandas as bpd -from tests.system import utils +from bigframes.testing import utils @pytest.mark.parametrize( diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py index beee95636f..48a69f522c 100644 --- a/tests/system/small/ml/test_multimodal_llm.py +++ b/tests/system/small/ml/test_multimodal_llm.py @@ -18,7 +18,7 @@ from bigframes.ml import llm import bigframes.pandas as bpd -from tests.system import utils +from bigframes.testing import utils @pytest.mark.flaky(retries=2) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 16b153ab45..34be48be1e 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -19,7 +19,7 @@ import bigframes.features from bigframes.ml import preprocessing -from tests.system import utils +from bigframes.testing import utils ONE_HOT_ENCODED_DTYPE = ( pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index bbecf40e0b..4e2beb9c19 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -20,7 +20,7 @@ import pytest import bigframes.series -from tests.system.utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] DATE_COLUMNS = [ diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py index 7b39bdebd5..fda01a5dae 100644 --- a/tests/system/small/operations/test_lists.py +++ b/tests/system/small/operations/test_lists.py @@ -18,7 +18,7 @@ import pyarrow as pa import pytest -from ...utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal @pytest.mark.parametrize( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 8801faf657..209bc87f9b 100644 --- 
a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -20,8 +20,7 @@ import bigframes.dtypes as dtypes import bigframes.pandas as bpd - -from ...utils import assert_series_equal +from bigframes.testing.utils import assert_series_equal def test_find(scalars_dfs): diff --git a/tests/system/small/regression/test_issue355_merge_after_filter.py b/tests/system/small/regression/test_issue355_merge_after_filter.py index 24ee01cb7f..1c3b6e4fe3 100644 --- a/tests/system/small/regression/test_issue355_merge_after_filter.py +++ b/tests/system/small/regression/test_issue355_merge_after_filter.py @@ -15,7 +15,7 @@ import pandas as pd import pytest -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal @pytest.mark.parametrize( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 18d8fed7dc..946df79cbf 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -33,7 +33,7 @@ import bigframes.dtypes as dtypes import bigframes.pandas as bpd import bigframes.series as series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_dfs_equivalent, assert_pandas_df_equal, assert_series_equal, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 5df7283e3c..afe3b53d6d 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -23,7 +23,7 @@ import pytest import bigframes.dtypes as dtypes -from tests.system import utils +from bigframes.testing import utils try: import pandas_gbq # type: ignore diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 1ba8ed7e09..1f30df451d 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -21,7 +21,7 @@ import bigframes import bigframes.ml.linear_model -from tests.system import utils +from bigframes.testing import utils @pytest.fixture(scope="module") diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index bc2e9cc385..0af173adc8 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal # ================= # DataFrame.groupby diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 7643f5701b..3b9854be26 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -19,7 +19,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_index_equal_ignore_index_type +from bigframes.testing.utils import assert_pandas_index_equal_ignore_index_type def test_index_construct_from_list(): diff --git a/tests/system/small/test_large_local_data.py b/tests/system/small/test_large_local_data.py index eddec37132..0c03a8b6a3 100644 --- a/tests/system/small/test_large_local_data.py +++ b/tests/system/small/test_large_local_data.py @@ -17,7 +17,7 @@ import pytest import bigframes -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal large_dataframe = pd.DataFrame(np.random.rand(10000, 10), dtype="Float64") large_dataframe.index = large_dataframe.index.astype("Int64") diff --git a/tests/system/small/test_multiindex.py 
b/tests/system/small/test_multiindex.py index a01b7aab92..b63468d311 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -17,7 +17,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal def test_multi_index_from_arrays(): diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 491b56d5fc..4e8d3d20f7 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -21,7 +21,7 @@ import pytz import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_pandas_df_equal @pytest.mark.parametrize( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 10671720af..6760d63a20 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -32,7 +32,7 @@ import bigframes.features import bigframes.pandas import bigframes.series as series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_pandas_df_equal, assert_series_equal, get_first_file_from_wildcard, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 9febb0da42..cbb441e5aa 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -36,7 +36,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model -from tests.system import utils +from bigframes.testing import utils all_write_engines = pytest.mark.parametrize( "write_engine", diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index f6a56af7ff..0825b78037 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -19,7 +19,7 @@ import bigframes.exceptions import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from bigframes.testing.utils import assert_pandas_df_equal, assert_series_equal def test_unordered_mode_sql_no_hash(unordered_session): diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 4d871fd707..83d6a2b881 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -19,7 +19,7 @@ import pytest from bigframes import dtypes -import tests.system.utils +import bigframes.testing.utils CURRENT_DIR = pathlib.Path(__file__).parent DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" @@ -27,7 +27,7 @@ @pytest.fixture(scope="session") def compiler_session(): - from . 
import compiler_session + from bigframes.testing import compiler_session return compiler_session.SQLCompilerSession() @@ -41,7 +41,7 @@ def scalars_types_pandas_df() -> pd.DataFrame: DATA_DIR / "scalars.jsonl", lines=True, ) - tests.system.utils.convert_pandas_dtypes(df, bytes_col=True) + bigframes.testing.utils.convert_pandas_dtypes(df, bytes_col=True) df = df.set_index("rowindex", drop=False) return df diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index 2bda563418..b434e473e9 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -30,7 +30,7 @@ import bigframes.dataframe as dataframe import bigframes.pandas as bpd import bigframes.series as series -from tests.system.utils import ( +from bigframes.testing.utils import ( assert_dfs_equivalent, assert_pandas_df_equal, assert_series_equal, From 582bbaf0bf27c8387eae35c663789713184cdf89 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 11 Jun 2025 13:08:07 -0700 Subject: [PATCH 06/23] chore: implement compile_readtable (#1809) * use mocks.create_bigquery_session * chore: implement compile_readtable --- bigframes/core/compile/sqlglot/compiler.py | 11 ++++++ .../core/compile/sqlglot/scalar_compiler.py | 4 ++- bigframes/core/compile/sqlglot/sqlglot_ir.py | 29 +++++++++++++-- bigframes/testing/compiler_session.py | 35 ------------------- bigframes/testing/mocks.py | 8 ++--- tests/unit/core/compile/sqlglot/conftest.py | 21 +++++++++-- .../test_compile_projection/out.sql | 27 ++++++++++---- .../test_compile_readtable/out.sql | 16 +++++++++ .../sqlglot/test_compile_projection.py | 10 ++---- .../compile/sqlglot/test_compile_readtable.py | 24 +++++++++++++ 10 files changed, 125 insertions(+), 60 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_readtable.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 50169d1a8b..7e55c0285f 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -158,6 +158,17 @@ def compile_readlocal(self, node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema, uid_gen=self.uid_gen) + @_compile_node.register + def compile_readtable(self, node: nodes.ReadTableNode, *args): + table = node.source.table + return ir.SQLGlotIR.from_table( + table.project_id, + table.dataset_id, + table.table_id, + col_names=[col.source_id for col in node.scan_list.items], + alias_names=[col.id.sql for col in node.scan_list.items], + ) + @_compile_node.register def compile_selection( self, node: nodes.SelectionNode, child: ir.SQLGlotIR diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 0f059d482c..18d709732a 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -79,6 +79,8 @@ def compile_op_expression(expr: expression.OpExpression): # TODO: add parenthesize for operators -def compile_addop(op: ops.AddOp, left: sge.Expression, right: sge.Expression): +def compile_addop( + op: ops.AddOp, left: sge.Expression, right: sge.Expression +) -> sge.Expression: # TODO: support addop for string dtype. 
return sge.Add(this=left, expression=right) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 23b441591b..fc1a687c71 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -106,6 +106,30 @@ def from_pyarrow( ) return cls(expr=sg.select(sge.Star()).from_(expr), uid_gen=uid_gen) + @classmethod + def from_table( + cls, + project_id: str, + dataset_id: str, + table_id: str, + col_names: typing.Sequence[str], + alias_names: typing.Sequence[str], + ) -> SQLGlotIR: + selections = [ + sge.Alias( + this=sge.to_identifier(col_name, quoted=cls.quoted), + alias=sge.to_identifier(alias_name, quoted=cls.quoted), + ) + for col_name, alias_name in zip(col_names, alias_names) + ] + table_expr = sge.Table( + this=sg.to_identifier(table_id, quoted=cls.quoted), + db=sg.to_identifier(dataset_id, quoted=cls.quoted), + catalog=sg.to_identifier(project_id, quoted=cls.quoted), + ) + select_expr = sge.Select().select(*selections).from_(table_expr) + return cls(expr=select_expr) + @classmethod def from_query_string( cls, @@ -156,9 +180,8 @@ def project( ) for id, expr in projected_cols ] - # TODO: some columns are not able to be projected into the same select. - select_expr = self.expr.select(*projected_cols_expr, append=True) - return SQLGlotIR(expr=select_expr) + new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=False) + return SQLGlotIR(expr=new_expr) def insert( self, diff --git a/bigframes/testing/compiler_session.py b/bigframes/testing/compiler_session.py index 7309349681..35114d95d0 100644 --- a/bigframes/testing/compiler_session.py +++ b/bigframes/testing/compiler_session.py @@ -14,13 +14,10 @@ import dataclasses import typing -import weakref import bigframes.core import bigframes.core.compile.sqlglot as sqlglot -import bigframes.dataframe import bigframes.session.executor -import bigframes.session.metrics @dataclasses.dataclass @@ -44,35 +41,3 @@ def to_sql( return self.compiler.SQLGlotCompiler().compile( array_value.node, ordered=ordered ) - - -class SQLCompilerSession(bigframes.session.Session): - """Session for SQL compilation using sqlglot.""" - - def __init__(self): - # TODO: remove unused attributes. 
- self._location = None # type: ignore - self._bq_kms_key_name = None # type: ignore - self._clients_provider = None # type: ignore - self.ibis_client = None # type: ignore - self._bq_connection = None # type: ignore - self._skip_bq_connection_check = True - self._objects: list[ - weakref.ReferenceType[ - typing.Union[ - bigframes.core.indexes.Index, - bigframes.series.Series, - bigframes.dataframe.DataFrame, - ] - ] - ] = [] - self._strictly_ordered: bool = True - self._allow_ambiguity = False # type: ignore - self._default_index_type = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 - self._metrics = bigframes.session.metrics.ExecutionMetrics() - self._remote_function_session = None # type: ignore - self._temp_storage_manager = None # type: ignore - self._loader = None # type: ignore - - self._session_id: str = "sqlglot_unit_tests_session" - self._executor = SQLCompilerExecutor() diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index 7ddc2e2e6e..25f1f90fe7 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -64,7 +64,7 @@ def create_bigquery_session( if bqclient is None: bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) - bqclient.project = "test-project" + bqclient.project = anonymous_dataset.project bqclient.location = location # Mock the location. @@ -74,9 +74,9 @@ def create_bigquery_session( type(table).created = mock.PropertyMock(return_value=table_time) type(table).location = mock.PropertyMock(return_value=location) type(table).schema = mock.PropertyMock(return_value=table_schema) - type(table).reference = mock.PropertyMock( - return_value=anonymous_dataset.table("test_table") - ) + type(table).project = anonymous_dataset.project + type(table).dataset_id = anonymous_dataset.dataset_id + type(table).table_id = "test_table" type(table).num_rows = mock.PropertyMock(return_value=1000000000) bqclient.get_table.return_value = table diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 83d6a2b881..d9a13ae53f 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -13,12 +13,15 @@ # limitations under the License. import pathlib +import typing +from google.cloud import bigquery import pandas as pd import pyarrow as pa import pytest from bigframes import dtypes +import bigframes.testing.mocks as mocks import bigframes.testing.utils CURRENT_DIR = pathlib.Path(__file__).parent @@ -26,10 +29,24 @@ @pytest.fixture(scope="session") -def compiler_session(): +def compiler_session(basic_types_table_schema): from bigframes.testing import compiler_session - return compiler_session.SQLCompilerSession() + # TODO: Check if ordering mode is needed for the tests. 
+ session = mocks.create_bigquery_session(table_schema=basic_types_table_schema) + session._executor = compiler_session.SQLCompilerExecutor() + return session + + +@pytest.fixture(scope="session") +def basic_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("int64_col", "INTEGER"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + ] @pytest.fixture(scope="session") diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index f5182a380b..8a24b01a25 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -1,11 +1,24 @@ WITH `bfcte_0` AS ( SELECT - *, - `bfcol_0` AS `bfcol_3`, - `bfcol_1` + 1 AS `bfcol_4` - FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0), STRUCT(1, -987654321, 1), STRUCT(2, 314159, 2), STRUCT(3, CAST(NULL AS INT64), 3), STRUCT(4, -234892, 4), STRUCT(5, 55555, 5), STRUCT(6, 101202303, 6), STRUCT(7, -214748367, 7), STRUCT(8, 2, 8)]) + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_5`, + `bfcol_2` AS `bfcol_6`, + `bfcol_3` AS `bfcol_7`, + `bfcol_4` AS `bfcol_8`, + `bfcol_1` + 1 AS `bfcol_9` + FROM `bfcte_0` ) SELECT - `bfcol_3` AS `rowindex`, - `bfcol_4` AS `int64_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_5` AS `rowindex`, + `bfcol_9` AS `int64_col`, + `bfcol_6` AS `string_col`, + `bfcol_7` AS `float64_col`, + `bfcol_8` AS `bool_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql new file mode 100644 index 0000000000..f010f77bf1 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_2` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int64_col`, + `bfcol_2` AS `string_col`, + `bfcol_3` AS `float64_col`, + `bfcol_4` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_projection.py b/tests/unit/core/compile/sqlglot/test_compile_projection.py index be74255649..82e6c60668 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_projection.py +++ b/tests/unit/core/compile/sqlglot/test_compile_projection.py @@ -12,20 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas as pd import pytest import bigframes -import bigframes.pandas as bpd pytest.importorskip("pytest_snapshot") -def test_compile_projection( - scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot -): - bf_df = bpd.DataFrame( - scalars_types_pandas_df[["int64_col"]], session=compiler_session - ) +def test_compile_projection(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") bf_df["int64_col"] = bf_df["int64_col"] + 1 snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py new file mode 100644 index 0000000000..848ace58f3 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes + +pytest.importorskip("pytest_snapshot") + + +def test_compile_readtable(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + snapshot.assert_match(bf_df.sql, "out.sql") From b5867464a5bf30300dcfc069eda546b11f03146c Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 11 Jun 2025 13:25:32 -0700 Subject: [PATCH 07/23] docs: document how to use ai.map() for information extraction (#1808) * doc: document how to use ai.map() for information extraction * fix lint --- bigframes/operations/ai.py | 19 +++++- notebooks/experimental/ai_operators.ipynb | 78 +++++++++++++---------- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 87245d104e..f7a9e6358e 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -117,7 +117,8 @@ def map( attach_logprobs=False, ): """ - Maps the DataFrame with the semantics of the user instruction. + Maps the DataFrame with the semantics of the user instruction. The name of the keys in the output_schema parameter carry + semantic meaning, and can be used for information extraction. **Examples:** @@ -139,6 +140,22 @@ def map( [2 rows x 3 columns] + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + + >>> df = bpd.DataFrame({"text": ["Elmo lives at 123 Sesame Street."]}) + >>> df.ai.map("{text}", model=model, output_schema={"person": "string", "address": "string"}) + text person address + 0 Elmo lives at 123 Sesame Street. Elmo 123 Sesame Street + + [1 rows x 3 columns] + Args: instruction (str): An instruction on how to map the data. 
This value must contain diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index 49a9d798e2..f830787801 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -264,7 +264,7 @@ "id": "hQft3o3OiouS" }, "source": [ - "# API Samples" + "# API Examples" ] }, { @@ -403,7 +403,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -594,7 +594,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -676,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -685,12 +685,30 @@ "id": "PpL24AQFiouS", "outputId": "e7aff038-bf4b-4833-def8-fe2648e8885b" }, + "outputs": [], + "source": [ + "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AI Extraction\n", + "\n", + "AI mapping is also able to extract multiple pieces of information based on your prompt, because the output schema keys can carry semantic meanings:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\n", " warnings.warn(msg, bfe.PreviewWarning)\n" ] @@ -716,54 +734,50 @@ " \n", " \n", " \n", - " ingredient_1\n", - " ingredient_2\n", - " food\n", + " text\n", + " person\n", + " address\n", " \n", " \n", " \n", " \n", " 0\n", - " Bun\n", - " Beef Patty\n", - " Burger\n", + " Elmo lives at 123 Sesame Street.\n", + " Elmo\n", + " 123 Sesame Street\n", " \n", " \n", " 1\n", - " Soy Bean\n", - " Bittern\n", - " Tofu\n", - " \n", - " \n", - " 2\n", - " Sausage\n", - " Long Bread\n", - " Hotdog\n", + " 124 Conch Street is SpongeBob's home\n", + " SpongeBob\n", + " 124 Conch Street\n", " \n", " \n", "\n", - "

3 rows × 3 columns

\n", - "[3 rows x 3 columns in total]" + "

2 rows × 3 columns

\n", + "[2 rows x 3 columns in total]" ], "text/plain": [ - " ingredient_1 ingredient_2 food\n", - "0 Bun Beef Patty Burger\n", - "\n", - "1 Soy Bean Bittern Tofu\n", - "\n", - "2 Sausage Long Bread Hotdog\n", - "\n", + " text person address\n", + "0 Elmo lives at 123 Sesame Street. Elmo 123 Sesame Street\n", + "1 124 Conch Street is SpongeBob's home SpongeBob 124 Conch Street\n", "\n", - "[3 rows x 3 columns]" + "[2 rows x 3 columns]" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + "df = bpd.DataFrame({\n", + " \"text\": [\n", + " \"Elmo lives at 123 Sesame Street.\", \n", + " \"124 Conch Street is SpongeBob's home\",\n", + " ]\n", + "})\n", + "df.ai.map(\"{text}\", model=gemini_model, output_schema={\"person\": \"string\", \"address\": \"string\"})" ] }, { From 63205f2565bdfe3833d6b20b912a88ef0599d955 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 11 Jun 2025 14:19:12 -0700 Subject: [PATCH 08/23] refactor: Refactor polars scalar op compiler (#1807) --- bigframes/core/compile/polars/compiler.py | 208 ++++++++++++++-------- bigframes/operations/base_ops.py | 10 +- bigframes/operations/blob_ops.py | 3 +- bigframes/operations/bool_ops.py | 9 +- bigframes/operations/comparison_ops.py | 21 ++- bigframes/operations/date_ops.py | 27 ++- bigframes/operations/datetime_ops.py | 6 +- bigframes/operations/distance_ops.py | 9 +- bigframes/operations/generic_ops.py | 26 +-- bigframes/operations/geo_ops.py | 30 ++-- bigframes/operations/numeric_ops.py | 83 ++++++--- bigframes/operations/string_ops.py | 39 ++-- bigframes/operations/time_ops.py | 12 +- 13 files changed, 317 insertions(+), 166 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index a0e85d8c69..62654c1518 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -29,6 +29,10 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.operations.bool_ops as bool_ops +import bigframes.operations.comparison_ops as comp_ops +import bigframes.operations.generic_ops as gen_ops +import bigframes.operations.numeric_ops as num_ops polars_installed = True if TYPE_CHECKING: @@ -123,84 +127,146 @@ def _( self, expression: ex.OpExpression, ) -> pl.Expr: - # TODO: Complete the implementation, convert to hash dispatch + # TODO: Complete the implementation op = expression.op args = tuple(map(self.compile_expression, expression.inputs)) - if isinstance(op, ops.invert_op.__class__): - return ~args[0] - if isinstance(op, ops.and_op.__class__): - return args[0] & args[1] - if isinstance(op, ops.or_op.__class__): - return args[0] | args[1] - if isinstance(op, ops.add_op.__class__): - return args[0] + args[1] - if isinstance(op, ops.sub_op.__class__): - return args[0] - args[1] - if isinstance(op, ops.mul_op.__class__): - return args[0] * args[1] - if isinstance(op, ops.div_op.__class__): - return args[0] / args[1] - if isinstance(op, ops.floordiv_op.__class__): - # TODO: Handle int // 0 - return args[0] // args[1] - if isinstance(op, (ops.pow_op.__class__, ops.unsafe_pow_op.__class__)): - return args[0] ** args[1] - if isinstance(op, ops.abs_op.__class__): - return args[0].abs() - if isinstance(op, ops.neg_op.__class__): - return args[0].neg() - if 
isinstance(op, ops.pos_op.__class__): - return args[0] - if isinstance(op, ops.ge_op.__class__): - return args[0] >= args[1] - if isinstance(op, ops.gt_op.__class__): - return args[0] > args[1] - if isinstance(op, ops.le_op.__class__): - return args[0] <= args[1] - if isinstance(op, ops.lt_op.__class__): - return args[0] < args[1] - if isinstance(op, ops.eq_op.__class__): - return args[0].eq(args[1]) - if isinstance(op, ops.eq_null_match_op.__class__): - return args[0].eq_missing(args[1]) - if isinstance(op, ops.ne_op.__class__): - return args[0].ne(args[1]) - if isinstance(op, ops.IsInOp): - # TODO: Filter out types that can't be coerced to right type - if op.match_nulls or not any(map(pd.isna, op.values)): - # newer polars version have nulls_equal arg - return args[0].is_in(op.values) - else: - return args[0].is_in(op.values) or args[0].is_null() - if isinstance(op, ops.mod_op.__class__): - return args[0] % args[1] - if isinstance(op, ops.coalesce_op.__class__): - return pl.coalesce(*args) - if isinstance(op, ops.fillna_op.__class__): - return pl.coalesce(*args) - if isinstance(op, ops.isnull_op.__class__): - return args[0].is_null() - if isinstance(op, ops.notnull_op.__class__): - return args[0].is_not_null() - if isinstance(op, ops.CaseWhenOp): - expr = pl.when(args[0]).then(args[1]) - for pred, result in zip(args[2::2], args[3::2]): - expr = expr.when(pred).then(result) # type: ignore - return expr - if isinstance(op, ops.where_op.__class__): - original, condition, otherwise = args - return pl.when(condition).then(original).otherwise(otherwise) - if isinstance(op, ops.AsTypeOp): - return self.astype(args[0], op.to_type, safe=op.safe) + return self.compile_op(op, *args) + @functools.singledispatchmethod + def compile_op(self, op: ops.ScalarOp, *args: pl.Expr) -> pl.Expr: raise NotImplementedError(f"Polars compiler hasn't implemented {op}") - def astype( - self, col: pl.Expr, dtype: bigframes.dtypes.Dtype, safe: bool + @compile_op.register(gen_ops.InvertOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return ~input + + @compile_op.register(num_ops.AbsOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.abs() + + @compile_op.register(num_ops.PosOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.__pos__() + + @compile_op.register(num_ops.NegOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.__neg__() + + @compile_op.register(bool_ops.AndOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input & r_input + + @compile_op.register(bool_ops.OrOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input | r_input + + @compile_op.register(num_ops.AddOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input + r_input + + @compile_op.register(num_ops.SubOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input - r_input + + @compile_op.register(num_ops.MulOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input * r_input + + @compile_op.register(num_ops.DivOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input / r_input + + @compile_op.register(num_ops.FloorDivOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input // r_input + + @compile_op.register(num_ops.FloorDivOp) + def _(self, op: ops.ScalarOp, 
l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input // r_input + + @compile_op.register(num_ops.ModOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input % r_input + + @compile_op.register(num_ops.PowOp) + @compile_op.register(num_ops.UnsafePowOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input**r_input + + @compile_op.register(comp_ops.EqOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input.eq(r_input) + + @compile_op.register(comp_ops.EqNullsMatchOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input.eq_missing(r_input) + + @compile_op.register(comp_ops.NeOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input.ne(r_input) + + @compile_op.register(comp_ops.GtOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input > r_input + + @compile_op.register(comp_ops.GeOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input >= r_input + + @compile_op.register(comp_ops.LtOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input < r_input + + @compile_op.register(comp_ops.LeOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return l_input <= r_input + + @compile_op.register(gen_ops.IsInOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + # TODO: Filter out types that can't be coerced to right type + assert isinstance(op, gen_ops.IsInOp) + if op.match_nulls or not any(map(pd.isna, op.values)): + # newer polars version have nulls_equal arg + return input.is_in(op.values) + else: + return input.is_in(op.values) or input.is_null() + + @compile_op.register(gen_ops.IsNullOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.is_null() + + @compile_op.register(gen_ops.NotNullOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.is_not_null() + + @compile_op.register(gen_ops.FillNaOp) + @compile_op.register(gen_ops.CoalesceOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + return pl.coalesce(l_input, r_input) + + @compile_op.register(gen_ops.CaseWhenOp) + def _(self, op: ops.ScalarOp, *inputs: pl.Expr) -> pl.Expr: + expr = pl.when(inputs[0]).then(inputs[1]) + for pred, result in zip(inputs[2::2], inputs[3::2]): + expr = expr.when(pred).then(result) # type: ignore + return expr + + @compile_op.register(gen_ops.WhereOp) + def _( + self, + op: ops.ScalarOp, + original: pl.Expr, + condition: pl.Expr, + otherwise: pl.Expr, ) -> pl.Expr: + return pl.when(condition).then(original).otherwise(otherwise) + + @compile_op.register(gen_ops.AsTypeOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + assert isinstance(op, gen_ops.AsTypeOp) # TODO: Polars casting works differently, need to lower instead to specific conversion ops. - # eg. We want "True" instead of "true" for bool to string. - return col.cast(_DTYPE_MAPPING[dtype], strict=not safe) + # eg. 
We want "True" instead of "true" for bool to strin + return input.cast(_DTYPE_MAPPING[op.to_type], strict=not op.safe) @dataclasses.dataclass(frozen=True) class PolarsAggregateCompiler: diff --git a/bigframes/operations/base_ops.py b/bigframes/operations/base_ops.py index fc92ffe760..c0145a6711 100644 --- a/bigframes/operations/base_ops.py +++ b/bigframes/operations/base_ops.py @@ -180,7 +180,9 @@ def _convert_expr_input( # Operation Factories -def create_unary_op(name: str, type_signature: op_typing.UnaryTypeSignature) -> UnaryOp: +def create_unary_op( + name: str, type_signature: op_typing.UnaryTypeSignature +) -> type[UnaryOp]: return dataclasses.make_dataclass( name, [ @@ -189,12 +191,12 @@ def create_unary_op(name: str, type_signature: op_typing.UnaryTypeSignature) -> ], bases=(UnaryOp,), frozen=True, - )() + ) def create_binary_op( name: str, type_signature: op_typing.BinaryTypeSignature -) -> BinaryOp: +) -> type[BinaryOp]: return dataclasses.make_dataclass( name, [ @@ -203,4 +205,4 @@ def create_binary_op( ], bases=(BinaryOp,), frozen=True, - )() + ) diff --git a/bigframes/operations/blob_ops.py b/bigframes/operations/blob_ops.py index b17d1b1215..2936e0f14f 100644 --- a/bigframes/operations/blob_ops.py +++ b/bigframes/operations/blob_ops.py @@ -19,9 +19,10 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -obj_fetch_metadata_op = base_ops.create_unary_op( +ObjFetchMetadataOp = base_ops.create_unary_op( name="obj_fetch_metadata", type_signature=op_typing.BLOB_TRANSFORM ) +obj_fetch_metadata_op = ObjFetchMetadataOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/bool_ops.py b/bigframes/operations/bool_ops.py index c8cd08efe5..003318f822 100644 --- a/bigframes/operations/bool_ops.py +++ b/bigframes/operations/bool_ops.py @@ -16,8 +16,11 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -and_op = base_ops.create_binary_op(name="and", type_signature=op_typing.LOGICAL) +AndOp = base_ops.create_binary_op(name="and", type_signature=op_typing.LOGICAL) +and_op = AndOp() -or_op = base_ops.create_binary_op(name="or", type_signature=op_typing.LOGICAL) +OrOp = base_ops.create_binary_op(name="or", type_signature=op_typing.LOGICAL) +or_op = OrOp() -xor_op = base_ops.create_binary_op(name="xor", type_signature=op_typing.LOGICAL) +XorOp = base_ops.create_binary_op(name="xor", type_signature=op_typing.LOGICAL) +xor_op = XorOp() diff --git a/bigframes/operations/comparison_ops.py b/bigframes/operations/comparison_ops.py index b109a85d18..4c2911808d 100644 --- a/bigframes/operations/comparison_ops.py +++ b/bigframes/operations/comparison_ops.py @@ -16,18 +16,25 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -eq_op = base_ops.create_binary_op(name="eq", type_signature=op_typing.COMPARISON) +EqOp = base_ops.create_binary_op(name="eq", type_signature=op_typing.COMPARISON) +eq_op = EqOp() -eq_null_match_op = base_ops.create_binary_op( +EqNullsMatchOp = base_ops.create_binary_op( name="eq_nulls_match", type_signature=op_typing.COMPARISON ) +eq_null_match_op = EqNullsMatchOp() -ne_op = base_ops.create_binary_op(name="ne", type_signature=op_typing.COMPARISON) +NeOp = base_ops.create_binary_op(name="ne", type_signature=op_typing.COMPARISON) +ne_op = NeOp() -lt_op = base_ops.create_binary_op(name="lt", type_signature=op_typing.COMPARISON) +LtOp = base_ops.create_binary_op(name="lt", type_signature=op_typing.COMPARISON) +lt_op = LtOp() -gt_op = 
base_ops.create_binary_op(name="gt", type_signature=op_typing.COMPARISON) +GtOp = base_ops.create_binary_op(name="gt", type_signature=op_typing.COMPARISON) +gt_op = GtOp() -le_op = base_ops.create_binary_op(name="le", type_signature=op_typing.COMPARISON) +LeOp = base_ops.create_binary_op(name="le", type_signature=op_typing.COMPARISON) +le_op = LeOp() -ge_op = base_ops.create_binary_op(name="ge", type_signature=op_typing.COMPARISON) +GeOp = base_ops.create_binary_op(name="ge", type_signature=op_typing.COMPARISON) +ge_op = GeOp() diff --git a/bigframes/operations/date_ops.py b/bigframes/operations/date_ops.py index 0b91c86b11..352bc9f93e 100644 --- a/bigframes/operations/date_ops.py +++ b/bigframes/operations/date_ops.py @@ -19,49 +19,58 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -day_op = base_ops.create_unary_op( +DayOp = base_ops.create_unary_op( name="day", type_signature=op_typing.DATELIKE_ACCESSOR, ) +day_op = DayOp() -month_op = base_ops.create_unary_op( +MonthOp = base_ops.create_unary_op( name="month", type_signature=op_typing.DATELIKE_ACCESSOR, ) +month_op = MonthOp() -year_op = base_ops.create_unary_op( +YearOp = base_ops.create_unary_op( name="year", type_signature=op_typing.DATELIKE_ACCESSOR, ) +year_op = YearOp() -iso_day_op = base_ops.create_unary_op( +IsoDayOp = base_ops.create_unary_op( name="iso_day", type_signature=op_typing.DATELIKE_ACCESSOR ) +iso_day_op = IsoDayOp() -iso_week_op = base_ops.create_unary_op( +IsoWeekOp = base_ops.create_unary_op( name="iso_weeek", type_signature=op_typing.DATELIKE_ACCESSOR, ) +iso_week_op = IsoWeekOp() -iso_year_op = base_ops.create_unary_op( +IsoYearOp = base_ops.create_unary_op( name="iso_year", type_signature=op_typing.DATELIKE_ACCESSOR, ) +iso_year_op = IsoYearOp() -dayofweek_op = base_ops.create_unary_op( +DayOfWeekOp = base_ops.create_unary_op( name="dayofweek", type_signature=op_typing.DATELIKE_ACCESSOR, ) +dayofweek_op = DayOfWeekOp() -dayofyear_op = base_ops.create_unary_op( +DayOfYearOp = base_ops.create_unary_op( name="dayofyear", type_signature=op_typing.DATELIKE_ACCESSOR, ) +dayofyear_op = DayOfYearOp() -quarter_op = base_ops.create_unary_op( +QuarterOp = base_ops.create_unary_op( name="quarter", type_signature=op_typing.DATELIKE_ACCESSOR, ) +quarter_op = QuarterOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py index 6e7fb32941..7c760b689b 100644 --- a/bigframes/operations/datetime_ops.py +++ b/bigframes/operations/datetime_ops.py @@ -22,19 +22,21 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -date_op = base_ops.create_unary_op( +DateOp = base_ops.create_unary_op( name="date", type_signature=op_typing.FixedOutputType( dtypes.is_date_like, dtypes.DATE_DTYPE, description="date-like" ), ) +date_op = DateOp() -time_op = base_ops.create_unary_op( +TimeOp = base_ops.create_unary_op( name="time", type_signature=op_typing.FixedOutputType( dtypes.is_time_like, dtypes.TIME_DTYPE, description="time-like" ), ) +time_op = TimeOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/distance_ops.py b/bigframes/operations/distance_ops.py index 74595b561a..ac0863b9e6 100644 --- a/bigframes/operations/distance_ops.py +++ b/bigframes/operations/distance_ops.py @@ -16,14 +16,17 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -cosine_distance_op = base_ops.create_binary_op( +CosineDistanceOp = 
base_ops.create_binary_op( name="ml_cosine_distance", type_signature=op_typing.VECTOR_METRIC ) +cosine_distance_op = CosineDistanceOp() -manhattan_distance_op = base_ops.create_binary_op( +ManhattanDistanceOp = base_ops.create_binary_op( name="ml_manhattan_distance", type_signature=op_typing.VECTOR_METRIC ) +manhattan_distance_op = ManhattanDistanceOp() -euclidean_distance_op = base_ops.create_binary_op( +EuclidDistanceOp = base_ops.create_binary_op( name="ml_euclidean_distance", type_signature=op_typing.VECTOR_METRIC ) +euclidean_distance_op = EuclidDistanceOp() diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops.py index b90a43b091..3c3f9653b4 100644 --- a/bigframes/operations/generic_ops.py +++ b/bigframes/operations/generic_ops.py @@ -20,34 +20,38 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -invert_op = base_ops.create_unary_op( +InvertOp = base_ops.create_unary_op( name="invert", type_signature=op_typing.TypePreserving( dtypes.is_binary_like, description="binary-like", ), ) +invert_op = InvertOp() -isnull_op = base_ops.create_unary_op( +IsNullOp = base_ops.create_unary_op( name="isnull", type_signature=op_typing.FixedOutputType( lambda x: True, dtypes.BOOL_DTYPE, description="nullable" ), ) +isnull_op = IsNullOp() -notnull_op = base_ops.create_unary_op( +NotNullOp = base_ops.create_unary_op( name="notnull", type_signature=op_typing.FixedOutputType( lambda x: True, dtypes.BOOL_DTYPE, description="nullable" ), ) +notnull_op = NotNullOp() -hash_op = base_ops.create_unary_op( +HashOp = base_ops.create_unary_op( name="hash", type_signature=op_typing.FixedOutputType( dtypes.is_string_like, dtypes.INT_DTYPE, description="string-like" ), ) +hash_op = HashOp() @dataclasses.dataclass(frozen=True) @@ -80,15 +84,17 @@ def output_type(self, *input_types): return input_types[0] -fillna_op = base_ops.create_binary_op(name="fillna", type_signature=op_typing.COERCE) +FillNaOp = base_ops.create_binary_op(name="fillna", type_signature=op_typing.COERCE) +fillna_op = FillNaOp() -maximum_op = base_ops.create_binary_op(name="maximum", type_signature=op_typing.COERCE) +MaximumOp = base_ops.create_binary_op(name="maximum", type_signature=op_typing.COERCE) +maximum_op = MaximumOp() -minimum_op = base_ops.create_binary_op(name="minimum", type_signature=op_typing.COERCE) +MinimumOp = base_ops.create_binary_op(name="minimum", type_signature=op_typing.COERCE) +minimum_op = MinimumOp() -coalesce_op = base_ops.create_binary_op( - name="coalesce", type_signature=op_typing.COERCE -) +CoalesceOp = base_ops.create_binary_op(name="coalesce", type_signature=op_typing.COERCE) +coalesce_op = CoalesceOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 1b99e47ab1..0268c63249 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -18,66 +18,76 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -geo_area_op = base_ops.create_unary_op( +GeoAreaOp = base_ops.create_unary_op( name="geo_area", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) +geo_area_op = GeoAreaOp() -geo_st_astext_op = base_ops.create_unary_op( +GeoStAstextOp = base_ops.create_unary_op( name="geo_st_astext", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.STRING_DTYPE, description="geo-like" ), ) +geo_st_astext_op = GeoStAstextOp() -geo_st_boundary_op = 
base_ops.create_unary_op( +GeoStBoundaryOp = base_ops.create_unary_op( name="geo_st_boundary", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.GEO_DTYPE, description="geo-like" ), ) +geo_st_boundary_op = GeoStBoundaryOp() -geo_st_difference_op = base_ops.create_binary_op( +GeoStDifferenceOp = base_ops.create_binary_op( name="geo_st_difference", type_signature=op_typing.BinaryGeo() ) +geo_st_difference_op = GeoStDifferenceOp() -geo_st_geogfromtext_op = base_ops.create_unary_op( +GeoStGeogfromtextOp = base_ops.create_unary_op( name="geo_st_geogfromtext", type_signature=op_typing.FixedOutputType( dtypes.is_string_like, dtypes.GEO_DTYPE, description="string-like" ), ) +geo_st_geogfromtext_op = GeoStGeogfromtextOp() -geo_st_geogpoint_op = base_ops.create_binary_op( +GeoStGeogpointOp = base_ops.create_binary_op( name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() ) +geo_st_geogpoint_op = GeoStGeogpointOp() -geo_st_isclosed_op = base_ops.create_unary_op( +GeoStIsclosedOp = base_ops.create_unary_op( name="geo_st_isclosed", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.BOOL_DTYPE, description="geo-like" ), ) +geo_st_isclosed_op = GeoStIsclosedOp() -geo_x_op = base_ops.create_unary_op( +GeoXOp = base_ops.create_unary_op( name="geo_x", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) +geo_x_op = GeoXOp() -geo_y_op = base_ops.create_unary_op( +GeoYOp = base_ops.create_unary_op( name="geo_y", type_signature=op_typing.FixedOutputType( dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) +geo_y_op = GeoYOp() -geo_st_intersection_op = base_ops.create_binary_op( +GeoStIntersectionOp = base_ops.create_binary_op( name="geo_st_intersection", type_signature=op_typing.BinaryGeo() ) +geo_st_intersection_op = GeoStIntersectionOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index b9820cd0ea..64eec9d8a1 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -19,97 +19,118 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -sin_op = base_ops.create_unary_op( +SinOp = base_ops.create_unary_op( name="sin", type_signature=op_typing.UNARY_REAL_NUMERIC ) +sin_op = SinOp() -cos_op = base_ops.create_unary_op( +CosOp = base_ops.create_unary_op( name="cos", type_signature=op_typing.UNARY_REAL_NUMERIC ) +cos_op = CosOp() -tan_op = base_ops.create_unary_op( +TanOp = base_ops.create_unary_op( name="tan", type_signature=op_typing.UNARY_REAL_NUMERIC ) +tan_op = TanOp() -arcsin_op = base_ops.create_unary_op( +ArcsinOp = base_ops.create_unary_op( name="arcsin", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arcsin_op = ArcsinOp() -arccos_op = base_ops.create_unary_op( +ArccosOp = base_ops.create_unary_op( name="arccos", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arccos_op = ArccosOp() -arctan_op = base_ops.create_unary_op( +ArctanOp = base_ops.create_unary_op( name="arctan", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arctan_op = ArctanOp() -sinh_op = base_ops.create_unary_op( +SinhOp = base_ops.create_unary_op( name="sinh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +sinh_op = SinhOp() -cosh_op = base_ops.create_unary_op( +CoshOp = base_ops.create_unary_op( name="cosh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +cosh_op = CoshOp() -tanh_op = base_ops.create_unary_op( +TanhOp = base_ops.create_unary_op( name="tanh", 
type_signature=op_typing.UNARY_REAL_NUMERIC ) +tanh_op = TanhOp() -arcsinh_op = base_ops.create_unary_op( +ArcsinhOp = base_ops.create_unary_op( name="arcsinh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arcsinh_op = ArcsinhOp() -arccosh_op = base_ops.create_unary_op( +ArccoshOp = base_ops.create_unary_op( name="arccosh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arccosh_op = ArccoshOp() -arctanh_op = base_ops.create_unary_op( +ArctanhOp = base_ops.create_unary_op( name="arctanh", type_signature=op_typing.UNARY_REAL_NUMERIC ) +arctanh_op = ArctanhOp() -floor_op = base_ops.create_unary_op( +FloorOp = base_ops.create_unary_op( name="floor", type_signature=op_typing.UNARY_REAL_NUMERIC ) +floor_op = FloorOp() -ceil_op = base_ops.create_unary_op( +CeilOp = base_ops.create_unary_op( name="ceil", type_signature=op_typing.UNARY_REAL_NUMERIC ) +ceil_op = CeilOp() -abs_op = base_ops.create_unary_op( +AbsOp = base_ops.create_unary_op( name="abs", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA ) +abs_op = AbsOp() -pos_op = base_ops.create_unary_op( +PosOp = base_ops.create_unary_op( name="pos", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA ) +pos_op = PosOp() -neg_op = base_ops.create_unary_op( +NegOp = base_ops.create_unary_op( name="neg", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA ) +neg_op = NegOp() -exp_op = base_ops.create_unary_op( +ExpOp = base_ops.create_unary_op( name="exp", type_signature=op_typing.UNARY_REAL_NUMERIC ) +exp_op = ExpOp() -expm1_op = base_ops.create_unary_op( +Expm1Op = base_ops.create_unary_op( name="expm1", type_signature=op_typing.UNARY_REAL_NUMERIC ) +expm1_op = Expm1Op() -ln_op = base_ops.create_unary_op( - name="log", type_signature=op_typing.UNARY_REAL_NUMERIC -) +LnOp = base_ops.create_unary_op(name="log", type_signature=op_typing.UNARY_REAL_NUMERIC) +ln_op = LnOp() -log10_op = base_ops.create_unary_op( +Log10Op = base_ops.create_unary_op( name="log10", type_signature=op_typing.UNARY_REAL_NUMERIC ) +log10_op = Log10Op() -log1p_op = base_ops.create_unary_op( +Log1pOp = base_ops.create_unary_op( name="log1p", type_signature=op_typing.UNARY_REAL_NUMERIC ) +log1p_op = Log1pOp() -sqrt_op = base_ops.create_unary_op( +SqrtOp = base_ops.create_unary_op( name="sqrt", type_signature=op_typing.UNARY_REAL_NUMERIC ) +sqrt_op = SqrtOp() @dataclasses.dataclass(frozen=True) @@ -282,16 +303,20 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT mod_op = ModOp() -pow_op = base_ops.create_binary_op(name="pow", type_signature=op_typing.BINARY_NUMERIC) +PowOp = base_ops.create_binary_op(name="pow", type_signature=op_typing.BINARY_NUMERIC) +pow_op = PowOp() -arctan2_op = base_ops.create_binary_op( +Arctan2Op = base_ops.create_binary_op( name="arctan2", type_signature=op_typing.BINARY_REAL_NUMERIC ) +arctan2_op = Arctan2Op() -round_op = base_ops.create_binary_op( +RoundOp = base_ops.create_binary_op( name="round", type_signature=op_typing.BINARY_NUMERIC ) +round_op = RoundOp() -unsafe_pow_op = base_ops.create_binary_op( +UnsafePowOp = base_ops.create_binary_op( name="unsafe_pow_op", type_signature=op_typing.BINARY_REAL_NUMERIC ) +unsafe_pow_op = UnsafePowOp() diff --git a/bigframes/operations/string_ops.py b/bigframes/operations/string_ops.py index a2755f6654..f937ed23b6 100644 --- a/bigframes/operations/string_ops.py +++ b/bigframes/operations/string_ops.py @@ -22,60 +22,73 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -len_op = base_ops.create_unary_op( +LenOp = 
base_ops.create_unary_op( name="len", type_signature=op_typing.FixedOutputType( dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable" ), ) +len_op = LenOp() -reverse_op = base_ops.create_unary_op( +ReverseOp = base_ops.create_unary_op( name="reverse", type_signature=op_typing.STRING_TRANSFORM ) +reverse_op = ReverseOp() -lower_op = base_ops.create_unary_op( +LowerOp = base_ops.create_unary_op( name="lower", type_signature=op_typing.STRING_TRANSFORM ) +lower_op = LowerOp() -upper_op = base_ops.create_unary_op( +UpperOp = base_ops.create_unary_op( name="upper", type_signature=op_typing.STRING_TRANSFORM ) +upper_op = UpperOp() -isalnum_op = base_ops.create_unary_op( +IsAlnumOp = base_ops.create_unary_op( name="isalnum", type_signature=op_typing.STRING_PREDICATE ) +isalnum_op = IsAlnumOp() -isalpha_op = base_ops.create_unary_op( +IsAlphaOp = base_ops.create_unary_op( name="isalpha", type_signature=op_typing.STRING_PREDICATE ) +isalpha_op = IsAlphaOp() -isdecimal_op = base_ops.create_unary_op( +IsDecimalOp = base_ops.create_unary_op( name="isdecimal", type_signature=op_typing.STRING_PREDICATE ) +isdecimal_op = IsDecimalOp() -isdigit_op = base_ops.create_unary_op( +IsDigitOp = base_ops.create_unary_op( name="isdigit", type_signature=op_typing.STRING_PREDICATE ) +isdigit_op = IsDigitOp() -isnumeric_op = base_ops.create_unary_op( +IsNumericOp = base_ops.create_unary_op( name="isnumeric", type_signature=op_typing.STRING_PREDICATE ) +isnumeric_op = IsNumericOp() -isspace_op = base_ops.create_unary_op( +IsSpaceOp = base_ops.create_unary_op( name="isspace", type_signature=op_typing.STRING_PREDICATE ) +isspace_op = IsSpaceOp() -islower_op = base_ops.create_unary_op( +IsLowerOp = base_ops.create_unary_op( name="islower", type_signature=op_typing.STRING_PREDICATE ) +islower_op = IsLowerOp() -isupper_op = base_ops.create_unary_op( +IsUpperOp = base_ops.create_unary_op( name="isupper", type_signature=op_typing.STRING_PREDICATE ) +isupper_op = IsUpperOp() -capitalize_op = base_ops.create_unary_op( +CapitalizeOp = base_ops.create_unary_op( name="capitalize", type_signature=op_typing.STRING_TRANSFORM ) +capitalize_op = CapitalizeOp() @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/time_ops.py b/bigframes/operations/time_ops.py index a6a65ad80e..bf6fa3e7d1 100644 --- a/bigframes/operations/time_ops.py +++ b/bigframes/operations/time_ops.py @@ -16,25 +16,29 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -hour_op = base_ops.create_unary_op( +HourOp = base_ops.create_unary_op( name="hour", type_signature=op_typing.TIMELIKE_ACCESSOR, ) +hour_op = HourOp() -minute_op = base_ops.create_unary_op( +MinuteOp = base_ops.create_unary_op( name="minute", type_signature=op_typing.TIMELIKE_ACCESSOR, ) +minute_op = MinuteOp() -second_op = base_ops.create_unary_op( +SecondOp = base_ops.create_unary_op( name="second", type_signature=op_typing.TIMELIKE_ACCESSOR, ) +second_op = SecondOp() -normalize_op = base_ops.create_unary_op( +NormalizeOp = base_ops.create_unary_op( name="normalize", type_signature=op_typing.TypePreserving( dtypes.is_time_like, description="time-like", ), ) +normalize_op = NormalizeOp() From e403528b9241e4bd0ad9a09dc0c1cd8e8f8437d8 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 11 Jun 2025 14:58:10 -0700 Subject: [PATCH 09/23] chore: add snippet tests for type system doc (#1783) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: add snippet tests for type system doc * fix format * 
fix more lint * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * add tests for snippets * fix lint * try to fix tests with typo * restore project in set_options test * use options.reset(): * put global options setting in a try-finally block * warn about json type and remove json type output from the comment * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * polish comments * Update samples/snippets/type_system_test.py Co-authored-by: Tim Sweña (Swast) * remove json samples * remove json samples --------- Co-authored-by: Owl Bot Co-authored-by: Tim Sweña (Swast) --- samples/snippets/set_options_test.py | 44 ++--- samples/snippets/type_system_test.py | 235 +++++++++++++++++++++++++++ 2 files changed, 259 insertions(+), 20 deletions(-) create mode 100644 samples/snippets/type_system_test.py diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py index 3dea524a17..6007dcbb38 100644 --- a/samples/snippets/set_options_test.py +++ b/samples/snippets/set_options_test.py @@ -19,23 +19,27 @@ def test_bigquery_dataframes_set_options() -> None: bpd.close_session() - # [START bigquery_dataframes_set_options] - import bigframes.pandas as bpd - - PROJECT_ID = "bigframes-dec" # @param {type:"string"} - REGION = "US" # @param {type:"string"} - - # Set BigQuery DataFrames options - # Note: The project option is not required in all environments. - # On BigQuery Studio, the project ID is automatically detected. - bpd.options.bigquery.project = PROJECT_ID - - # Note: The location option is not required. - # It defaults to the location of the first table or query - # passed to read_gbq(). For APIs where a location can't be - # auto-detected, the location defaults to the "US" location. - bpd.options.bigquery.location = REGION - - # [END bigquery_dataframes_set_options] - assert bpd.options.bigquery.project == PROJECT_ID - assert bpd.options.bigquery.location == REGION + try: + # [START bigquery_dataframes_set_options] + import bigframes.pandas as bpd + + PROJECT_ID = "bigframes-dev" # @param {type:"string"} + REGION = "US" # @param {type:"string"} + + # Set BigQuery DataFrames options + # Note: The project option is not required in all environments. + # On BigQuery Studio, the project ID is automatically detected. + bpd.options.bigquery.project = PROJECT_ID + + # Note: The location option is not required. + # It defaults to the location of the first table or query + # passed to read_gbq(). For APIs where a location can't be + # auto-detected, the location defaults to the "US" location. + bpd.options.bigquery.location = REGION + + # [END bigquery_dataframes_set_options] + assert bpd.options.bigquery.project == PROJECT_ID + assert bpd.options.bigquery.location == REGION + finally: + bpd.close_session() + bpd.options.reset() diff --git a/samples/snippets/type_system_test.py b/samples/snippets/type_system_test.py new file mode 100644 index 0000000000..88b9e74742 --- /dev/null +++ b/samples/snippets/type_system_test.py @@ -0,0 +1,235 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas.testing + +from bigframes import dtypes + + +def test_type_system_examples() -> None: + # [START bigquery_dataframes_type_sytem_timestamp_local_type_conversion] + import pandas as pd + + import bigframes.pandas as bpd + + s = pd.Series([pd.Timestamp("20250101")]) + assert s.dtype == "datetime64[ns]" + assert bpd.read_pandas(s).dtype == "timestamp[us][pyarrow]" + # [END bigquery_dataframes_type_sytem_timestamp_local_type_conversion] + + # [START bigquery_dataframes_type_system_pyarrow_preference] + import datetime + + import pandas as pd + + import bigframes.pandas as bpd + + s = pd.Series([datetime.date(2025, 1, 1)]) + s + pd.Timedelta(hours=12) + # 0 2025-01-01 + # dtype: object + + bpd.read_pandas(s) + pd.Timedelta(hours=12) + # 0 2025-01-01 12:00:00 + # dtype: timestamp[us][pyarrow] + # [END bigquery_dataframes_type_system_pyarrow_preference] + pandas.testing.assert_series_equal( + s + pd.Timedelta(hours=12), pd.Series([datetime.date(2025, 1, 1)]) + ) + pandas.testing.assert_series_equal( + (bpd.read_pandas(s) + pd.Timedelta(hours=12)).to_pandas(), + pd.Series([pd.Timestamp(2025, 1, 1, 12)], dtype=dtypes.DATETIME_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_load_timedelta] + import pandas as pd + + import bigframes.pandas as bpd + + s = pd.Series([pd.Timedelta("1s"), pd.Timedelta("2m")]) + bpd.read_pandas(s) + # 0 0 days 00:00:01 + # 1 0 days 00:02:00 + # dtype: duration[us][pyarrow] + # [END bigquery_dataframes_type_system_load_timedelta] + pandas.testing.assert_series_equal( + bpd.read_pandas(s).to_pandas(), + s.astype(dtypes.TIMEDELTA_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_timedelta_precision] + import pandas as pd + + s = pd.Series([pd.Timedelta("999ns")]) + bpd.read_pandas(s.dt.round("us")) + # 0 0 days 00:00:00.000001 + # dtype: duration[us][pyarrow] + # [END bigquery_dataframes_type_system_timedelta_precision] + pandas.testing.assert_series_equal( + bpd.read_pandas(s.dt.round("us")).to_pandas(), + s.dt.round("us").astype(dtypes.TIMEDELTA_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_cast_timedelta] + import bigframes.pandas as bpd + + bpd.to_timedelta([1, 2, 3], unit="s") + # 0 0 days 00:00:01 + # 1 0 days 00:00:02 + # 2 0 days 00:00:03 + # dtype: duration[us][pyarrow] + # [END bigquery_dataframes_type_system_cast_timedelta] + pandas.testing.assert_series_equal( + bpd.to_timedelta([1, 2, 3], unit="s").to_pandas(), + pd.Series(pd.to_timedelta([1, 2, 3], unit="s"), dtype=dtypes.TIMEDELTA_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_list_accessor] + import bigframes.pandas as bpd + + s = bpd.Series([[1, 2, 3], [4, 5], [6]]) # dtype: list[pyarrow] + + # Access the first elements of each list + s.list[0] + # 0 1 + # 1 4 + # 2 6 + # dtype: Int64 + + # Get the lengths of each list + s.list.len() + # 0 3 + # 1 2 + # 2 1 + # dtype: Int64 + # [END bigquery_dataframes_type_system_list_accessor] + pandas.testing.assert_series_equal( + s.list[0].to_pandas(), + pd.Series([1, 4, 6], dtype="Int64"), + 
check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.list.len().to_pandas(), + pd.Series([3, 2, 1], dtype="Int64"), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_struct_accessor] + import bigframes.pandas as bpd + + structs = [ + {"id": 101, "category": "A"}, + {"id": 102, "category": "B"}, + {"id": 103, "category": "C"}, + ] + s = bpd.Series(structs) + # Get the 'id' field of each struct + s.struct.field("id") + # 0 101 + # 1 102 + # 2 103 + # Name: id, dtype: Int64 + # [END bigquery_dataframes_type_system_struct_accessor] + + # [START bigquery_dataframes_type_system_struct_accessor_shortcut] + import bigframes.pandas as bpd + + structs = [ + {"id": 101, "category": "A"}, + {"id": 102, "category": "B"}, + {"id": 103, "category": "C"}, + ] + s = bpd.Series(structs) + + # not explicitly using the "struct" property + s.id + # 0 101 + # 1 102 + # 2 103 + # Name: id, dtype: Int64 + # [END bigquery_dataframes_type_system_struct_accessor_shortcut] + pandas.testing.assert_series_equal( + s.struct.field("id").to_pandas(), + pd.Series([101, 102, 103], dtype="Int64", name="id"), + check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.id.to_pandas(), + pd.Series([101, 102, 103], dtype="Int64", name="id"), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_string_accessor] + import bigframes.pandas as bpd + + s = bpd.Series(["abc", "de", "1"]) # dtype: string[pyarrow] + + # Get the first character of each string + s.str[0] + # 0 a + # 1 d + # 2 1 + # dtype: string + + # Check whether there are only alphabetic characters in each string + s.str.isalpha() + # 0 True + # 1 True + # 2 False + # dtype: boolean + + # Cast the alphabetic characters to their upper cases for each string + s.str.upper() + # 0 ABC + # 1 DE + # 2 1 + # dtype: string + # [END bigquery_dataframes_type_system_string_accessor] + pandas.testing.assert_series_equal( + s.str[0].to_pandas(), + pd.Series(["a", "d", "1"], dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.str.isalpha().to_pandas(), + pd.Series([True, True, False], dtype=dtypes.BOOL_DTYPE), + check_index_type=False, + ) + pandas.testing.assert_series_equal( + s.str.upper().to_pandas(), + pd.Series(["ABC", "DE", "1"], dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + + # [START bigquery_dataframes_type_system_geo_accessor] + from shapely.geometry import Point + + import bigframes.pandas as bpd + + s = bpd.Series([Point(1, 0), Point(2, 1)]) # dtype: geometry + + s.geo.y + # 0 0.0 + # 1 1.0 + # dtype: Float64 + # [END bigquery_dataframes_type_system_geo_accessor] + pandas.testing.assert_series_equal( + s.geo.y.to_pandas(), + pd.Series([0.0, 1.0], dtype=dtypes.FLOAT_DTYPE), + check_index_type=False, + ) From 855031a316a6957731a5d1c5e59dedb9757d9f7a Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 12 Jun 2025 09:58:01 -0700 Subject: [PATCH 10/23] fix: correct read_csv behaviours with use_cols, names, index_col (#1804) * fix: correct read_csv behaviours with use_cols, names, index_col parameters * fix test_default_index_warning_not_raised_by_read_gbq_primary_key * refactor read_gbq_table for more readable * fix presubmit --- .../session/_io/bigquery/read_gbq_table.py | 18 +- bigframes/session/loader.py | 245 +++++++++++------- tests/system/small/test_session.py | 130 +++++++++- 3 files changed, 283 insertions(+), 110 deletions(-) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py 
b/bigframes/session/_io/bigquery/read_gbq_table.py index 2dff16933f..6322040428 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -243,25 +243,17 @@ def get_index_cols( | int | bigframes.enums.DefaultIndexKind, *, - names: Optional[Iterable[str]] = None, + rename_to_schema: Optional[Dict[str, str]] = None, ) -> List[str]: """ If we can get a total ordering from the table, such as via primary key column(s), then return those too so that ordering generation can be avoided. """ - # Transform index_col -> index_cols so we have a variable that is # always a list of column names (possibly empty). schema_len = len(table.schema) - # If the `names` is provided, the index_col provided by the user is the new - # name, so we need to rename it to the original name in the table schema. - renamed_schema: Optional[Dict[str, str]] = None - if names is not None: - assert len(list(names)) == schema_len - renamed_schema = {name: field.name for name, field in zip(names, table.schema)} - index_cols: List[str] = [] if isinstance(index_col, bigframes.enums.DefaultIndexKind): if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: @@ -278,8 +270,8 @@ def get_index_cols( f"Got unexpected index_col {repr(index_col)}. {constants.FEEDBACK_LINK}" ) elif isinstance(index_col, str): - if renamed_schema is not None: - index_col = renamed_schema.get(index_col, index_col) + if rename_to_schema is not None: + index_col = rename_to_schema.get(index_col, index_col) index_cols = [index_col] elif isinstance(index_col, int): if not 0 <= index_col < schema_len: @@ -291,8 +283,8 @@ def get_index_cols( elif isinstance(index_col, Iterable): for item in index_col: if isinstance(item, str): - if renamed_schema is not None: - item = renamed_schema.get(item, item) + if rename_to_schema is not None: + item = rename_to_schema.get(item, item) index_cols.append(item) elif isinstance(item, int): if not 0 <= item < schema_len: diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 814d44292e..add4efb6ab 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -96,22 +96,35 @@ def _to_index_cols( return index_cols -def _check_column_duplicates( - index_cols: Iterable[str], columns: Iterable[str], index_col_in_columns: bool -) -> Iterable[str]: - """Validates and processes index and data columns for duplicates and overlap. +def _check_duplicates(name: str, columns: Optional[Iterable[str]] = None): + """Check for duplicate column names in the provided iterable.""" + if columns is None: + return + columns_list = list(columns) + set_columns = set(columns_list) + if len(columns_list) > len(set_columns): + raise ValueError( + f"The '{name}' argument contains duplicate names. " + f"All column names specified in '{name}' must be unique." + ) - This function performs two main tasks: - 1. Ensures there are no duplicate column names within the `index_cols` list - or within the `columns` list. - 2. Based on the `index_col_in_columns` flag, it validates the relationship - between `index_cols` and `columns`. + +def _check_index_col_param( + index_cols: Iterable[str], + columns: Iterable[str], + *, + table_columns: Optional[Iterable[str]] = None, + index_col_in_columns: Optional[bool] = False, +): + """Checks for duplicates in `index_cols` and resolves overlap with `columns`. Args: index_cols (Iterable[str]): - An iterable of column names designated as the index. + Column names designated as the index columns. 
columns (Iterable[str]): - An iterable of column names designated as the data columns. + Used column names from table_columns. + table_columns (Iterable[str]): + A full list of column names in the table schema. index_col_in_columns (bool): A flag indicating how to handle overlap between `index_cols` and `columns`. @@ -121,40 +134,97 @@ def _check_column_duplicates( `columns`. An error is raised if an index column is not found in the `columns` list. """ - index_cols_list = list(index_cols) if index_cols is not None else [] - columns_list = list(columns) if columns is not None else [] - set_index = set(index_cols_list) - set_columns = set(columns_list) + _check_duplicates("index_col", index_cols) - if len(index_cols_list) > len(set_index): - raise ValueError( - "The 'index_col' argument contains duplicate names. " - "All column names specified in 'index_col' must be unique." - ) + if columns is not None and len(list(columns)) > 0: + set_index = set(list(index_cols) if index_cols is not None else []) + set_columns = set(list(columns) if columns is not None else []) - if len(columns_list) == 0: - return columns + if index_col_in_columns: + if not set_index.issubset(set_columns): + raise ValueError( + f"The specified index column(s) were not found: {set_index - set_columns}. " + f"Available columns are: {set_columns}" + ) + else: + if not set_index.isdisjoint(set_columns): + raise ValueError( + "Found column names that exist in both 'index_col' and 'columns' arguments. " + "These arguments must specify distinct sets of columns." + ) - if len(columns_list) > len(set_columns): - raise ValueError( - "The 'columns' argument contains duplicate names. " - "All column names specified in 'columns' must be unique." - ) + if not index_col_in_columns and table_columns is not None: + for key in index_cols: + if key not in table_columns: + possibility = min( + table_columns, + key=lambda item: bigframes._tools.strings.levenshtein_distance( + key, item + ), + ) + raise ValueError( + f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?" + ) - if index_col_in_columns: - if not set_index.issubset(set_columns): - raise ValueError( - f"The specified index column(s) were not found: {set_index - set_columns}. " - f"Available columns are: {set_columns}" + +def _check_columns_param(columns: Iterable[str], table_columns: Iterable[str]): + """Validates that the specified columns are present in the table columns. + + Args: + columns (Iterable[str]): + Used column names from table_columns. + table_columns (Iterable[str]): + A full list of column names in the table schema. + Raises: + ValueError: If any column in `columns` is not found in the table columns. + """ + for column_name in columns: + if column_name not in table_columns: + possibility = min( + table_columns, + key=lambda item: bigframes._tools.strings.levenshtein_distance( + column_name, item + ), ) - return [col for col in columns if col not in set_index] - else: - if not set_index.isdisjoint(set_columns): raise ValueError( - "Found column names that exist in both 'index_col' and 'columns' arguments. " - "These arguments must specify distinct sets of columns." + f"Column '{column_name}' is not found. Did you mean '{possibility}'?" 
) - return columns + + +def _check_names_param( + names: Iterable[str], + index_col: Iterable[str] + | str + | Iterable[int] + | int + | bigframes.enums.DefaultIndexKind, + columns: Iterable[str], + table_columns: Iterable[str], +): + len_names = len(list(names)) + len_table_columns = len(list(table_columns)) + len_columns = len(list(columns)) + if len_names > len_table_columns: + raise ValueError( + f"Too many columns specified: expected {len_table_columns}" + f" and found {len_names}" + ) + elif len_names < len_table_columns: + if isinstance(index_col, bigframes.enums.DefaultIndexKind) or index_col != (): + raise KeyError( + "When providing both `index_col` and `names`, ensure the " + "number of `names` matches the number of columns in your " + "data." + ) + if len_columns != 0: + # The 'columns' must be identical to the 'names'. If not, raise an error. + if len_columns != len_names: + raise ValueError( + "Number of passed names did not match number of header " + "fields in the file" + ) + if set(list(names)) != set(list(columns)): + raise ValueError("Usecols do not match columns") @dataclasses.dataclass @@ -545,11 +615,14 @@ def read_gbq_table( f"`max_results` should be a positive number, got {max_results}." ) + _check_duplicates("columns", columns) + table_ref = google.cloud.bigquery.table.TableReference.from_string( table_id, default_project=self._bqclient.project ) columns = list(columns) + include_all_columns = columns is None or len(columns) == 0 filters = typing.cast(list, list(filters)) # --------------------------------- @@ -563,72 +636,58 @@ def read_gbq_table( cache=self._df_snapshot, use_cache=use_cache, ) - table_column_names = {field.name for field in table.schema} if table.location.casefold() != self._storage_manager.location.casefold(): raise ValueError( f"Current session is in {self._storage_manager.location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" ) - for key in columns: - if key not in table_column_names: - possibility = min( - table_column_names, - key=lambda item: bigframes._tools.strings.levenshtein_distance( - key, item - ), - ) - raise ValueError( - f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?" - ) - - # TODO(b/408499371): check `names` work with `use_cols` for read_csv method. + table_column_names = [field.name for field in table.schema] + rename_to_schema: Optional[Dict[str, str]] = None if names is not None: + _check_names_param(names, index_col, columns, table_column_names) + + # Additional unnamed columns is going to set as index columns len_names = len(list(names)) - len_columns = len(table.schema) - if len_names > len_columns: - raise ValueError( - f"Too many columns specified: expected {len_columns}" - f" and found {len_names}" - ) - elif len_names < len_columns: - if ( - isinstance(index_col, bigframes.enums.DefaultIndexKind) - or index_col != () - ): - raise KeyError( - "When providing both `index_col` and `names`, ensure the " - "number of `names` matches the number of columns in your " - "data." 
- ) - index_col = range(len_columns - len_names) + len_schema = len(table.schema) + if len(columns) == 0 and len_names < len_schema: + index_col = range(len_schema - len_names) names = [ - field.name for field in table.schema[: len_columns - len_names] + field.name for field in table.schema[: len_schema - len_names] ] + list(names) + assert len_schema >= len_names + assert len_names >= len(columns) + + table_column_names = table_column_names[: len(list(names))] + rename_to_schema = dict(zip(list(names), table_column_names)) + + if len(columns) != 0: + if names is None: + _check_columns_param(columns, table_column_names) + else: + _check_columns_param(columns, names) + names = columns + assert rename_to_schema is not None + columns = [rename_to_schema[renamed_name] for renamed_name in columns] + # Converting index_col into a list of column names requires # the table metadata because we might use the primary keys # when constructing the index. index_cols = bf_read_gbq_table.get_index_cols( table=table, index_col=index_col, - names=names, + rename_to_schema=rename_to_schema, ) - columns = list( - _check_column_duplicates(index_cols, columns, index_col_in_columns) + _check_index_col_param( + index_cols, + columns, + table_columns=table_column_names, + index_col_in_columns=index_col_in_columns, ) - - for key in index_cols: - if key not in table_column_names: - possibility = min( - table_column_names, - key=lambda item: bigframes._tools.strings.levenshtein_distance( - key, item - ), - ) - raise ValueError( - f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?" - ) + if index_col_in_columns and not include_all_columns: + set_index = set(list(index_cols) if index_cols is not None else []) + columns = [col for col in columns if col not in set_index] # ----------------------------- # Optionally, execute the query @@ -715,7 +774,7 @@ def read_gbq_table( metadata_only=not self._scan_index_uniqueness, ) schema = schemata.ArraySchema.from_bq_table(table) - if columns: + if not include_all_columns: schema = schema.select(index_cols + columns) array_value = core.ArrayValue.from_table( table, @@ -767,14 +826,14 @@ def read_gbq_table( value_columns = [col for col in array_value.column_ids if col not in index_cols] if names is not None: - renamed_cols: Dict[str, str] = { - col: new_name for col, new_name in zip(array_value.column_ids, names) - } + assert rename_to_schema is not None + schema_to_rename = {value: key for key, value in rename_to_schema.items()} if index_col != bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: index_names = [ - renamed_cols.get(index_col, index_col) for index_col in index_cols + schema_to_rename.get(index_col, index_col) + for index_col in index_cols ] - value_columns = [renamed_cols.get(col, col) for col in value_columns] + value_columns = [schema_to_rename.get(col, col) for col in value_columns] block = blocks.Block( array_value, @@ -898,9 +957,7 @@ def read_gbq_query( ) index_cols = _to_index_cols(index_col) - columns = _check_column_duplicates( - index_cols, columns, index_col_in_columns=False - ) + _check_index_col_param(index_cols, columns) filters_copy1, filters_copy2 = itertools.tee(filters) has_filters = len(list(filters_copy1)) != 0 diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index cbb441e5aa..809d08c6c1 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -146,9 +146,7 @@ def test_read_gbq_w_unknown_column( ): with pytest.raises( ValueError, - 
match=re.escape( - "Column 'int63_col' of `columns` not found in this table. Did you mean 'int64_col'?" - ), + match=re.escape("Column 'int63_col' is not found. Did you mean 'int64_col'?"), ): session.read_gbq( scalars_table_id, @@ -1365,6 +1363,132 @@ def test_read_csv_for_names_and_index_col( ) +@pytest.mark.parametrize( + "usecols", + [ + pytest.param(["a", "b", "c"], id="same"), + pytest.param(["a", "c"], id="less_than_names"), + ], +) +def test_read_csv_for_names_and_usecols( + session, usecols, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + bf_df = session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + path, names=names, usecols=usecols, dtype=bf_df.dtypes.to_dict() + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index(names[0]).sort_index() + pd_df = pd_df.set_index(names[0]) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_and_invalid_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + usecols = ["a", "X"] + with pytest.raises( + ValueError, + match=re.escape("Column 'X' is not found. "), + ): + session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + +@pytest.mark.parametrize( + ("usecols", "index_col"), + [ + pytest.param(["a", "b", "c"], "a", id="same"), + pytest.param(["a", "b", "c"], ["a", "b"], id="same_two_index"), + pytest.param(["a", "c"], 0, id="less_than_names"), + ], +) +def test_read_csv_for_names_and_usecols_and_indexcol( + session, usecols, index_col, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + + names = ["a", "b", "c"] + bf_df = session.read_csv( + path, engine="bigquery", names=names, usecols=usecols, index_col=index_col + ) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + path, + names=names, + usecols=usecols, + index_col=index_col, + dtype=bf_df.reset_index().dtypes.to_dict(), + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_less_than_columns_and_same_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "c"] + usecols = ["a", "c"] + bf_df = session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + path, names=names, usecols=usecols, dtype=bf_df.dtypes.to_dict() + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. 
+ bf_df = bf_df.set_index(names[0]).sort_index() + pd_df = pd_df.set_index(names[0]) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_for_names_less_than_columns_and_mismatched_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "b"] + usecols = ["a"] + with pytest.raises( + ValueError, + match=re.escape("Number of passed names did not match number"), + ): + session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + +def test_read_csv_for_names_less_than_columns_and_different_usecols( + session, df_and_gcs_csv_for_two_columns +): + _, path = df_and_gcs_csv_for_two_columns + names = ["a", "b"] + usecols = ["a", "c"] + with pytest.raises( + ValueError, + match=re.escape("Usecols do not match columns"), + ): + session.read_csv(path, engine="bigquery", names=names, usecols=usecols) + + def test_read_csv_for_dtype(session, df_and_gcs_csv_for_two_columns): _, path = df_and_gcs_csv_for_two_columns From e586151df81917b49f702ae496aaacbd02931636 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 12 Jun 2025 15:15:44 -0700 Subject: [PATCH 11/23] feat: support custom build service account in `remote_function` (#1796) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Regarding the commit to refactor the system test for `cloud_build_service_account`: This commit refactors the system test `test_remote_function_via_session_custom_build_sa` in `tests/system/large/functions/test_remote_function.py` to align with the structure and validation approach of `test_remote_function_via_session_custom_sa`. The test now: - Uses the project "bigframes-dev-perf". - Sets `cloud_build_service_account` to "bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com". - Sets `cloud_function_service_account` to the same value for simplicity in this test. - Uses `cloud_function_ingress_settings="all"`. - Validates that `gcf.build_config.service_account` matches the provided `cloud_build_service_account`. - Employs a dedicated session for the test and ensures proper cleanup. 
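For illustration, a minimal sketch of how the new parameter is exercised by that test (the project, service account, and function below simply mirror the test setup described above and are not a usage recommendation):

    import bigframes

    session = bigframes.Session(
        context=bigframes.BigQueryOptions(project="bigframes-dev-perf")
    )

    @session.remote_function(
        input_types=[int],
        output_type=int,
        reuse=False,
        cloud_function_service_account="default",
        # Bare email form; the fully qualified
        # "projects/PROJECT_ID/serviceAccounts/SERVICE_ACCOUNT_EMAIL" form is also accepted.
        cloud_build_service_account="bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com",
        cloud_function_ingress_settings="all",
    )
    def square_num(x):
        if x is None:
            return x
        return x * x
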
* 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * add proper test, improve documentation * nit rewording for readability --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Owl Bot --- bigframes/functions/_function_client.py | 13 ++++ bigframes/functions/_function_session.py | 12 +++ bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 12 +++ .../large/functions/test_remote_function.py | 76 ++++++++++++++++++- 5 files changed, 114 insertions(+), 1 deletion(-) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index d03021dd23..e818015a9b 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -77,6 +77,7 @@ def __init__( cloud_function_service_account=None, cloud_function_kms_key_name=None, cloud_function_docker_repository=None, + cloud_build_service_account=None, *, session: Session, ): @@ -94,6 +95,7 @@ def __init__( self._cloud_function_service_account = cloud_function_service_account self._cloud_function_kms_key_name = cloud_function_kms_key_name self._cloud_function_docker_repository = cloud_function_docker_repository + self._cloud_build_service_account = cloud_build_service_account def _create_bq_connection(self) -> None: if self._bq_connection_manager: @@ -452,6 +454,17 @@ def create_cloud_function( function.build_config.docker_repository = ( self._cloud_function_docker_repository ) + + if self._cloud_build_service_account: + canonical_cloud_build_service_account = ( + self._cloud_build_service_account + if "/" in self._cloud_build_service_account + else f"projects/{self._gcp_project_id}/serviceAccounts/{self._cloud_build_service_account}" + ) + function.build_config.service_account = ( + canonical_cloud_build_service_account + ) + function.service_config = functions_v2.ServiceConfig() if memory_mib is not None: function.service_config.available_memory = f"{memory_mib}Mi" diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index e18f7084db..2fb3480d6c 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -263,6 +263,7 @@ def remote_function( cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", + cloud_build_service_account: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -453,6 +454,16 @@ def remote_function( If no setting is provided, `internal-only` will be used by default. See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. + cloud_build_service_account (str, Optional): + Service account in the fully qualified format + `projects/PROJECT_ID/serviceAccounts/SERVICE_ACCOUNT_EMAIL`, or + just the SERVICE_ACCOUNT_EMAIL. The latter would be interpreted + as belonging to the BigQuery DataFrames session project. This is + to be used by Cloud Build to build the function source code into + a deployable artifact. If not provided, the default Cloud Build + service account is used. See + https://cloud.google.com/build/docs/cloud-build-service-account + for more details. """ # Some defaults may be used from the session if not provided otherwise. 
session = self._resolve_session(session) @@ -599,6 +610,7 @@ def wrapper(func): else cloud_function_service_account, cloud_function_kms_key_name, cloud_function_docker_repository, + cloud_build_service_account=cloud_build_service_account, session=session, # type: ignore ) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index d08ef4e91d..e8253769be 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -89,6 +89,7 @@ def remote_function( cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", + cloud_build_service_account: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -108,6 +109,7 @@ def remote_function( cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_ingress_settings=cloud_function_ingress_settings, + cloud_build_service_account=cloud_build_service_account, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ab09230c99..b6066daed3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1378,6 +1378,7 @@ def remote_function( cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" ] = "internal-only", + cloud_build_service_account: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1553,6 +1554,16 @@ def remote_function( If no setting is provided, `internal-only` will be used by default. See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. + cloud_build_service_account (str, Optional): + Service account in the fully qualified format + `projects/PROJECT_ID/serviceAccounts/SERVICE_ACCOUNT_EMAIL`, or + just the SERVICE_ACCOUNT_EMAIL. The latter would be interpreted + as belonging to the BigQuery DataFrames session project. This is + to be used by Cloud Build to build the function source code into + a deployable artifact. If not provided, the default Cloud Build + service account is used. See + https://cloud.google.com/build/docs/cloud-build-service-account + for more details. Returns: collections.abc.Callable: A remote function object pointing to the cloud assets created @@ -1581,6 +1592,7 @@ def remote_function( cloud_function_vpc_connector=cloud_function_vpc_connector, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_ingress_settings=cloud_function_ingress_settings, + cloud_build_service_account=cloud_build_service_account, ) def udf( diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 9e0dcfe4d7..172fff3010 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -1342,7 +1342,7 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): # For upfront convenience, the following set up has been statically created # in the project bigfrmames-dev-perf via cloud console: # - # 1. Create a service account as per + # 1. Create a service account bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com as per # https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console # 2. 
Give necessary roles as per # https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration @@ -1395,6 +1395,80 @@ def square_num(x): ) +@pytest.mark.parametrize( + ("set_build_service_account"), + [ + pytest.param( + "projects/bigframes-dev-perf/serviceAccounts/bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com", + id="fully-qualified-sa", + ), + pytest.param( + "bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com", + id="just-sa-email", + ), + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_via_session_custom_build_sa( + scalars_dfs, set_build_service_account +): + # TODO(shobs): Automate the following set-up during testing in the test project. + # + # For upfront convenience, the following set up has been statically created + # in the project bigfrmames-dev-perf via cloud console: + # + # 1. Create a service account bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com as per + # https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console + # 2. Give "Cloud Build Service Account (roles/cloudbuild.builds.builder)" role as per + # https://cloud.google.com/build/docs/cloud-build-service-account#default_permissions_of_the_legacy_service_account + # + project = "bigframes-dev-perf" + expected_build_service_account = "projects/bigframes-dev-perf/serviceAccounts/bigframes-dev-perf-1@bigframes-dev-perf.iam.gserviceaccount.com" + + rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) + + try: + + # TODO(shobs): Figure out why the default ingress setting + # (internal-only) does not work here + @rf_session.remote_function( + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + cloud_build_service_account=set_build_service_account, + cloud_function_ingress_settings="all", + ) + def square_num(x): + if x is None: + return x + return x * x + + # assert that the GCF is created with the intended SA + gcf = rf_session.cloudfunctionsclient.get_function( + name=square_num.bigframes_cloud_function + ) + assert gcf.build_config.service_account == expected_build_service_account + + # assert that the function works as expected on data + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_result_col = bf_int64_col.apply(square_num) + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x * x) + pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_function_assets( + square_num, rf_session.bqclient, rf_session.cloudfunctionsclient + ) + + def test_remote_function_throws_none_cloud_function_service_account(session): with pytest.raises( ValueError, From f6265dbb8e22de81bb59c7def175cd325e85c041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 13 Jun 2025 10:07:05 -0500 Subject: [PATCH 12/23] docs: rearrange README.rst to include a short code sample (#1812) * docs: rearrange README.rst to include a short code sample Towards internal issue 424443170 * change to BigFrames --- README.rst | 72 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index 
7f487b9077..9288f2e6a5 100644
--- a/README.rst
+++ b/README.rst
@@ -1,16 +1,60 @@
-BigQuery DataFrames
-===================
+BigQuery DataFrames (BigFrames)
+===============================
 
 |GA| |pypi| |versions|
 
-BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API
-powered by the BigQuery engine.
+BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame
+and machine learning (ML) API powered by the BigQuery engine.
 
 * ``bigframes.pandas`` provides a pandas-compatible API for analytics.
 * ``bigframes.ml`` provides a scikit-learn-like API for ML.
 
-BigQuery DataFrames is an open-source package. You can run
-``pip install --upgrade bigframes`` to install the latest version.
+BigQuery DataFrames is an open-source package.
+
+**Version 2.0 introduces breaking changes for improved security and performance. See below for details.**
+
+Getting started with BigQuery DataFrames
+----------------------------------------
+
+The easiest way to get started is to try the
+`BigFrames quickstart `_
+in a `notebook in BigQuery Studio `_.
+
+To use BigFrames in your local development environment:
+
+1. Run ``pip install --upgrade bigframes`` to install the latest version.
+
+2. Set up `Application default credentials `_
+   for your local development environment.
+
+3. Create a `GCP project with the BigQuery API enabled `_.
+
+4. Use the ``bigframes`` package to query data.
+
+.. code-block:: python
+
+    import bigframes.pandas as bpd
+
+    bpd.options.bigquery.project = your_gcp_project_id
+    df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")
+    print(
+        df.groupby("name")
+        .agg({"number": "sum"})
+        .sort_values("number", ascending=False)
+        .head(10)
+        .to_pandas()
+    )
+
+
+Documentation
+-------------
+
+To learn more about BigQuery DataFrames, visit these pages:
+
+* `Introduction to BigQuery DataFrames (BigFrames) `_
+* `Sample notebooks `_
+* `API reference `_
+* `Source code (GitHub) `_
 
 ⚠️ Warning: Breaking Changes in BigQuery DataFrames v2.0
 --------------------------------------------------------
@@ -44,22 +88,6 @@ To learn about these changes and how to migrate to version 2.0, see the
 .. |versions| image:: https://img.shields.io/pypi/pyversions/bigframes.svg
     :target: https://pypi.org/project/bigframes/
 
-Documentation
--------------
-
-* `BigQuery DataFrames source code (GitHub) `_
-* `BigQuery DataFrames sample notebooks `_
-* `BigQuery DataFrames API reference `_
-* `BigQuery DataFrames supported pandas APIs `_
-
-
-Getting started with BigQuery DataFrames
-----------------------------------------
-Read `Introduction to BigQuery DataFrames `_
-and try the `BigQuery DataFrames quickstart `_
-to get up and running in just a few minutes.
-
-
 License
 -------
 
From 0562a374418d7025793dfd21dc1a4f37cb938fec Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Fri, 13 Jun 2025 11:21:34 -0700
Subject: [PATCH 13/23] test: Add unit tests for SequentialUIDGenerator (#1813)

This commit introduces unit tests for the `get_uid_stream` method in the
`SequentialUIDGenerator` class.
Fixes internal issue 416487613 Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/core/compile/sqlglot/compiler.py | 1 + bigframes/core/compile/sqlglot/sqlglot_ir.py | 9 ++-- .../test_compile_readtable/out.sql | 4 +- tests/unit/core/test_guid.py | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 tests/unit/core/test_guid.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 7e55c0285f..ebe2a64699 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -167,6 +167,7 @@ def compile_readtable(self, node: nodes.ReadTableNode, *args): table.table_id, col_names=[col.source_id for col in node.scan_list.items], alias_names=[col.id.sql for col in node.scan_list.items], + uid_gen=self.uid_gen, ) @_compile_node.register diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index fc1a687c71..95e4f90118 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -114,6 +114,7 @@ def from_table( table_id: str, col_names: typing.Sequence[str], alias_names: typing.Sequence[str], + uid_gen: guid.SequentialUIDGenerator, ) -> SQLGlotIR: selections = [ sge.Alias( @@ -128,7 +129,7 @@ def from_table( catalog=sg.to_identifier(project_id, quoted=cls.quoted), ) select_expr = sge.Select().select(*selections).from_(table_expr) - return cls(expr=select_expr) + return cls(expr=select_expr, uid_gen=uid_gen) @classmethod def from_query_string( @@ -164,10 +165,10 @@ def select( squashed_selections = _squash_selections(self.expr.expressions, selections) if squashed_selections != []: new_expr = self.expr.select(*squashed_selections, append=False) - return SQLGlotIR(expr=new_expr) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) else: new_expr = self._encapsulate_as_cte().select(*selections, append=False) - return SQLGlotIR(expr=new_expr) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def project( self, @@ -181,7 +182,7 @@ def project( for id, expr in projected_cols ] new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=False) - return SQLGlotIR(expr=new_expr) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def insert( self, diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql index f010f77bf1..a5cb399b40 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql @@ -1,4 +1,4 @@ -WITH `bfcte_2` AS ( +WITH `bfcte_0` AS ( SELECT `rowindex` AS `bfcol_0`, `int64_col` AS `bfcol_1`, @@ -13,4 +13,4 @@ SELECT `bfcol_2` AS `string_col`, `bfcol_3` AS `float64_col`, `bfcol_4` AS `bool_col` -FROM `bfcte_2` \ No newline at end of file +FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/test_guid.py b/tests/unit/core/test_guid.py new file mode 100644 index 0000000000..c7334848ee --- /dev/null +++ b/tests/unit/core/test_guid.py @@ -0,0 +1,41 @@ +import types +import unittest + +from bigframes.core.guid import SequentialUIDGenerator + + +class TestSequentialUIDGenerator(unittest.TestCase): + def test_get_uid_stream_returns_generator(self): + generator = 
SequentialUIDGenerator() + stream = generator.get_uid_stream("prefix") + self.assertIsInstance(stream, types.GeneratorType) + + def test_generator_yields_correct_uids(self): + generator = SequentialUIDGenerator() + stream = generator.get_uid_stream("prefix") + self.assertEqual(next(stream), "prefix0") + self.assertEqual(next(stream), "prefix1") + self.assertEqual(next(stream), "prefix2") + + def test_generator_yields_different_uids_for_different_prefixes(self): + generator = SequentialUIDGenerator() + stream_a = generator.get_uid_stream("prefixA") + stream_b = generator.get_uid_stream("prefixB") + self.assertEqual(next(stream_a), "prefixA0") + self.assertEqual(next(stream_b), "prefixB0") + self.assertEqual(next(stream_a), "prefixA1") + self.assertEqual(next(stream_b), "prefixB1") + + def test_multiple_calls_continue_generation(self): + generator = SequentialUIDGenerator() + stream1 = generator.get_uid_stream("prefix") + self.assertEqual(next(stream1), "prefix0") + self.assertEqual(next(stream1), "prefix1") + + stream2 = generator.get_uid_stream("prefix") + self.assertEqual(next(stream2), "prefix2") + self.assertEqual(next(stream2), "prefix3") + + +if __name__ == "__main__": + unittest.main() From dc9eb27fa75e90c2c95a0619551bf67aea6ef63b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 13 Jun 2025 11:55:36 -0700 Subject: [PATCH 14/23] feat: add bbq.json_query_array and warn bbq.json_extract_array deprecated (#1811) * feat: add bbq.json_query_array and warn bbq.json_extract_array deprecated * complete features --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/bigquery/__init__.py | 2 + bigframes/bigquery/_operations/json.py | 59 ++++++++++++++++++ bigframes/core/compile/scalar_op_compiler.py | 13 ++++ bigframes/operations/__init__.py | 2 + bigframes/operations/json_ops.py | 17 ++++++ tests/system/small/bigquery/test_json.py | 63 +++++++++++++++++++- 6 files changed, 155 insertions(+), 1 deletion(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 22bcfb1407..cdc3718893 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -40,6 +40,7 @@ json_extract_array, json_extract_string_array, json_query, + json_query_array, json_set, json_value, parse_json, @@ -67,6 +68,7 @@ "json_extract_array", "json_extract_string_array", "json_query", + "json_query_array", "json_set", "json_value", "parse_json", diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 561fb57348..00d230d684 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -133,6 +133,10 @@ def json_extract_array( `STRING` or `JSON` values. This function uses single quotes and brackets to escape invalid JSONPath characters in JSON keys. + .. deprecated:: 2.5.0 + The ``json_extract_array`` is deprecated and will be removed in a future version. + Use ``json_query_array`` instead. + **Examples:** >>> import bigframes.pandas as bpd @@ -172,6 +176,11 @@ def json_extract_array( Returns: bigframes.series.Series: A new Series with the parsed arrays from the input. """ + msg = ( + "The `json_extract_array` is deprecated and will be removed in a future version. " + "Use `json_query_array` instead." 
+ ) + warnings.warn(bfe.format_message(msg), category=UserWarning) return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) @@ -273,6 +282,56 @@ def json_query( return input._apply_unary_op(ops.JSONQuery(json_path=json_path)) +def json_query_array( + input: series.Series, + json_path: str = "$", +) -> series.Series: + """Extracts a JSON array and converts it to a SQL array of JSON-formatted + `STRING` or `JSON` values. This function uses double quotes to escape invalid + JSONPath characters in JSON keys. For example: `"a.b"`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_query_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}', + ... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}' + ... ]) + >>> bbq.json_query_array(s, "$.fruits") + 0 ['{"name":"apple"}' '{"name":"cherry"}'] + 1 ['{"name":"guava"}' '{"name":"grapes"}'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}', + ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}' + ... ]) + >>> bbq.json_query_array(s, "$.fruits.names") + 0 ['"apple"' '"cherry"'] + 1 ['"guava"' '"grapes"'] + dtype: list[pyarrow] + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the parsed arrays from the input. + """ + return input._apply_unary_op(ops.JSONQueryArray(json_path=json_path)) + + def json_value( input: series.Series, json_path: str, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a1fc995159..908f3082c3 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1379,6 +1379,19 @@ def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore return json_query_op(json_or_json_string=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONQueryArray, pass_op=True) +def json_query_array_op_impl(x: ibis_types.Value, op: ops.JSONQueryArray): + # Define a user-defined function whose returned type is dynamically matching the input. + def json_query_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... 
+ + return_type = x.type() + json_query_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore + json_query_op = ibis_udf.scalar.builtin(json_query_array) + return json_query_op(json_or_json_string=x, json_path=op.json_path) + + @scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True) def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): return parse_json(json_str=x) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index faf4e18d5e..291bf17fa5 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -109,6 +109,7 @@ JSONExtractArray, JSONExtractStringArray, JSONQuery, + JSONQueryArray, JSONSet, JSONValue, ParseJSON, @@ -359,6 +360,7 @@ "JSONExtractArray", "JSONExtractStringArray", "JSONQuery", + "JSONQueryArray", "JSONSet", "JSONValue", "ParseJSON", diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index b083035d38..95a47dcadb 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -37,6 +37,23 @@ def output_type(self, *input_types): return input_type +@dataclasses.dataclass(frozen=True) +class JSONQueryArray(base_ops.UnaryOp): + name: typing.ClassVar[str] = "json_query_array" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return pd.ArrowDtype( + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type)) + ) + + @dataclasses.dataclass(frozen=True) class JSONExtractArray(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_extract_array" diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 3d155b5f16..4ad16d6cc8 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -128,7 +128,8 @@ def test_json_extract_array_from_json(): ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], dtype=dtypes.JSON_DTYPE, ) - actual = bbq.json_extract_array(s, "$.a") + with pytest.warns(UserWarning, match="The `json_extract_array` is deprecated"): + actual = bbq.json_extract_array(s, "$.a") # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType()) @@ -241,6 +242,66 @@ def test_json_query_w_invalid_series_type(): bbq.json_query(s, "$.a") +def test_json_query_array_from_json(): + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], + dtype=dtypes.JSON_DTYPE, + ) + actual = bbq.json_query_array(s, "$.a") + + # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, + # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType()) + sql = """ + SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data, + UNION ALL + SELECT 1, [], + UNION ALL + SELECT 2, [JSON '"4"', JSON '"5"'], + UNION ALL + SELECT 3, null, + """ + df = bpd.read_gbq(sql).set_index("id").sort_index() + expected = df["data"] + expected.index.name = None + expected.name = None + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_from_json_strings(): + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"], + 
dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query_array(s, "$.a") + expected = bpd.Series( + [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], + dtype=pd.ArrowDtype(pa.list_(pa.string())), + ) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_from_json_array_strings(): + s = bpd.Series( + ["[1, 2, 3]", "[]", "[4,5]"], + dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query_array(s) + expected = bpd.Series( + [["1", "2", "3"], [], ["4", "5"]], + dtype=pd.ArrowDtype(pa.list_(pa.string())), + ) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_w_invalid_series_type(): + s = bpd.Series([1, 2]) + with pytest.raises(TypeError): + bbq.json_query_array(s) + + def test_json_value_from_json(): s = bpd.Series( ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], From 1e8a2f1b9fa8cbb40bad638db32afbf4043297e3 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 13 Jun 2025 11:58:27 -0700 Subject: [PATCH 15/23] chore!: Remove attach_logprobs parameter from AI operations (#1816) * Refactor: Remove attach_logprobs parameter from AI operations This commit removes the `attach_logprobs` parameter from the `filter`, `map`, `classify`, and `join` methods within the `AIAccessor` class in `bigframes/operations/ai.py`. The associated logic for calculating and attaching the 'logprob' column has also been removed from the `map` method. System tests in `tests/system/large/operations/test_ai.py` that specifically tested the `attach_logprobs` functionality have been updated by: - Removing the `attach_logprobs=True` argument from method calls. - Removing assertions for the 'logprob' column. - Renaming the test methods to reflect their updated scope (e.g., `test_filter_attach_logprob` to `test_filter_functionality_formerly_attach_logprob`). The small system tests and experimental notebooks were not affected as they did not utilize this parameter. * polish tests --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/operations/ai.py | 38 ------------ tests/system/large/operations/test_ai.py | 78 ------------------------ 2 files changed, 116 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index f7a9e6358e..30192695ac 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -41,7 +41,6 @@ def filter( instruction: str, model, ground_with_google_search: bool = False, - attach_logprobs: bool = False, ): """ Filters the DataFrame with the semantics of the user instruction. @@ -82,10 +81,6 @@ def filter( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - Returns: bigframes.pandas.DataFrame: DataFrame filtered by the instruction. 
@@ -103,7 +98,6 @@ def filter( model, output_schema, ground_with_google_search, - attach_logprobs, ) return result[result[answer_col]].drop(answer_col, axis=1) @@ -114,7 +108,6 @@ def map( model, output_schema: Dict[str, str] | None = None, ground_with_google_search: bool = False, - attach_logprobs=False, ): """ Maps the DataFrame with the semantics of the user instruction. The name of the keys in the output_schema parameter carry @@ -180,11 +173,6 @@ def map( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - - Returns: bigframes.pandas.DataFrame: DataFrame with attached mapping results. @@ -258,19 +246,6 @@ def map( attach_columns = [results[col] for col, _ in output_schema.items()] - def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series: - from bigframes import bigquery as bbq - - logprob_jsons = bbq.json_extract_array(s, "$.candidates").list[0] - logprobs = bbq.json_extract(logprob_jsons, "$.avg_logprobs").astype( - "Float64" - ) - logprobs.name = "logprob" - return logprobs - - if attach_logprobs: - attach_columns.append(extract_logprob(results["full_response"])) - from bigframes.core.reshape.api import concat return concat([self._df, *attach_columns], axis=1) @@ -282,7 +257,6 @@ def classify( labels: Sequence[str], output_column: str = "result", ground_with_google_search: bool = False, - attach_logprobs=False, ): """ Classifies the rows of dataframes based on user instruction into the provided labels. @@ -337,11 +311,6 @@ def classify( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - - Returns: bigframes.pandas.DataFrame: DataFrame with classification result. @@ -367,7 +336,6 @@ def classify( model, output_schema={output_column: "string"}, ground_with_google_search=ground_with_google_search, - attach_logprobs=attach_logprobs, ) def join( @@ -376,7 +344,6 @@ def join( instruction: str, model, ground_with_google_search: bool = False, - attach_logprobs=False, ): """ Joines two dataframes by applying the instruction over each pair of rows from @@ -428,10 +395,6 @@ def join( page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models The default is `False`. - attach_logprobs (bool, default False): - Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level - of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. - Returns: bigframes.pandas.DataFrame: The joined dataframe. 
@@ -510,7 +473,6 @@ def join( instruction, model, ground_with_google_search=ground_with_google_search, - attach_logprobs=attach_logprobs, ).reset_index(drop=True) def search( diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index c0716220b1..afd135591f 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -66,31 +66,6 @@ def test_filter(session, gemini_flash_model): ) -def test_filter_attach_logprob(session, gemini_flash_model): - df = dataframe.DataFrame( - data={ - "number_1": [1, 2], - "number_2": [2, 1], - "col": [0, 0], - }, - session=session, - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - actual_df = df.ai.filter( - "{number_1} is greater than {number_2}", - gemini_flash_model, - attach_logprobs=True, - ).to_pandas() - - assert "logprob" in actual_df.columns - - def test_filter_multi_model(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -259,31 +234,6 @@ def test_map(session, gemini_flash_model, output_schema, output_col): ) -def test_map_attach_logprob(session, gemini_flash_model): - df = dataframe.DataFrame( - data={ - "ingredient_1": ["Burger Bun", "Soy Bean"], - "ingredient_2": ["Beef Patty", "Bittern"], - "gluten-free": [True, True], - }, - session=session, - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - actual_df = df.ai.map( - "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", - gemini_flash_model, - attach_logprobs=True, - ).to_pandas() - - assert "logprob" in actual_df.columns - - def test_map_multimodel(session, gemini_flash_model): with bigframes.option_context( AI_OP_EXP_OPTION, @@ -478,34 +428,6 @@ def test_join(instruction, session, gemini_flash_model): ) -def test_join_attach_logprob(session, gemini_flash_model): - cities = dataframe.DataFrame( - data={ - "city": ["Seattle", "Berlin"], - }, - session=session, - ) - countries = dataframe.DataFrame( - data={"country": ["USA", "UK", "Germany"]}, - session=session, - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - actual_df = cities.ai.join( - countries, - "{city} is in {country}", - gemini_flash_model, - attach_logprobs=True, - ).to_pandas() - - assert "logprob" in actual_df.columns - - @pytest.mark.parametrize( ("reply"), [ From f984381dee56b3dc4a96a59703696d8535cab783 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 13 Jun 2025 11:59:04 -0700 Subject: [PATCH 16/23] refactor: Refactor udf definitions (#1814) --- bigframes/core/compile/ibis_types.py | 48 --- bigframes/core/compile/scalar_op_compiler.py | 83 ++-- bigframes/dataframe.py | 40 +- bigframes/dtypes.py | 28 -- bigframes/functions/__init__.py | 9 + bigframes/functions/_function_client.py | 34 +- bigframes/functions/_function_session.py | 355 ++++++----------- bigframes/functions/_utils.py | 61 +-- bigframes/functions/function.py | 372 +++++++++++------- bigframes/functions/function_typing.py | 122 ++++++ bigframes/functions/udf_def.py | 173 ++++++++ bigframes/operations/remote_function_ops.py | 29 +- bigframes/series.py | 43 +- bigframes/session/__init__.py | 12 - bigframes/testing/polars_session.py | 1 - tests/system/conftest.py | 6 - .../large/functions/test_managed_function.py | 128 +----- .../large/functions/test_remote_function.py | 120 ------ .../small/functions/test_remote_function.py | 5 +- tests/unit/core/test_dtypes.py | 
12 - tests/unit/functions/test_remote_function.py | 23 -- .../functions/test_remote_function_utils.py | 5 +- .../ibis/expr/operations/udf.py | 3 +- 23 files changed, 791 insertions(+), 921 deletions(-) create mode 100644 bigframes/functions/function_typing.py create mode 100644 bigframes/functions/udf_def.py diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index d5f9b5c5f9..0a61be716a 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -13,20 +13,14 @@ # limitations under the License. from __future__ import annotations -import typing from typing import cast, Dict, Iterable, Optional, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.ibis -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -from bigframes_vendored.ibis.expr.datatypes.core import ( - dtype as python_type_to_ibis_type, -) import bigframes_vendored.ibis.expr.types as ibis_types import db_dtypes # type: ignore import geopandas as gpd # type: ignore -import google.cloud.bigquery as bigquery import pandas as pd import pyarrow as pa @@ -439,45 +433,3 @@ def literal_to_ibis_scalar( ) return scalar_expr - - -class UnsupportedTypeError(ValueError): - def __init__(self, type_, supported_types): - self.type = type_ - self.supported_types = supported_types - super().__init__( - f"'{type_}' is not one of the supported types {supported_types}" - ) - - -def ibis_type_from_python_type(t: type) -> ibis_dtypes.DataType: - if t not in bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES: - raise UnsupportedTypeError(t, bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES) - return python_type_to_ibis_type(t) - - -def ibis_array_output_type_from_python_type(t: type) -> ibis_dtypes.DataType: - array_of = typing.get_args(t)[0] - if array_of not in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: - raise UnsupportedTypeError( - array_of, bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES - ) - return python_type_to_ibis_type(t) - - -def ibis_type_from_bigquery_type( - type_: bigquery.StandardSqlDataType, -) -> ibis_dtypes.DataType: - """Convert bq type to ibis. 
Only to be used for remote functions, does not handle all types.""" - if type_.type_kind not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: - raise UnsupportedTypeError( - type_.type_kind, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS - ) - elif type_.type_kind == "ARRAY": - return ibis_dtypes.Array( - value_type=ibis_type_from_bigquery_type( - typing.cast(bigquery.StandardSqlDataType, type_.array_element_type) - ) - ) - else: - return third_party_ibis_bqtypes.BigQueryType.to_ibis(type_.type_kind) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 908f3082c3..b819b1c4e2 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -17,7 +17,6 @@ import functools import typing -import bigframes_vendored.constants as constants import bigframes_vendored.ibis.expr.api as ibis_api import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.operations.generic as ibis_generic @@ -30,6 +29,7 @@ import bigframes.core.compile.default_ordering import bigframes.core.compile.ibis_types import bigframes.core.expression as ex +import bigframes.dtypes import bigframes.operations as ops _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) @@ -1284,17 +1284,58 @@ def timedelta_floor_op_impl(x: ibis_types.NumericValue): @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): - ibis_node = getattr(op.func, "ibis_node", None) - if ibis_node is None: - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) - x_transformed = ibis_node(x) + udf_sig = op.function_def.signature + ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + + @ibis_udf.scalar.builtin( + name=str(op.function_def.routine_ref), signature=ibis_py_sig + ) + def udf(input): + ... + + x_transformed = udf(x) if not op.apply_on_null: - x_transformed = ibis_api.case().when(x.isnull(), x).else_(x_transformed).end() + return ibis_api.case().when(x.isnull(), x).else_(x_transformed).end() return x_transformed +@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) +def binary_remote_function_op_impl( + x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp +): + udf_sig = op.function_def.signature + ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + + @ibis_udf.scalar.builtin( + name=str(op.function_def.routine_ref), signature=ibis_py_sig + ) + def udf(input1, input2): + ... + + x_transformed = udf(x, y) + return x_transformed + + +@scalar_op_compiler.register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True) +def nary_remote_function_op_impl( + *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp +): + udf_sig = op.function_def.signature + ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) + arg_names = tuple(arg.name for arg in udf_sig.input_types) + + @ibis_udf.scalar.builtin( + name=str(op.function_def.routine_ref), + signature=ibis_py_sig, + param_name_overrides=arg_names, + ) + def udf(*inputs): + ... 
+ + result = udf(*operands) + return result + + @scalar_op_compiler.register_unary_op(ops.MapOp, pass_op=True) def map_op_impl(x: ibis_types.Value, op: ops.MapOp): case = ibis_api.case() @@ -1931,19 +1972,6 @@ def manhattan_distance_impl( return vector_distance(vector1, vector2, "MANHATTAN") -@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) -def binary_remote_function_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp -): - ibis_node = getattr(op.func, "ibis_node", None) - if ibis_node is None: - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) - x_transformed = ibis_node(x, y) - return x_transformed - - # Blob Ops @scalar_op_compiler.register_binary_op(ops.obj_make_ref_op) def obj_make_ref_op(x: ibis_types.Value, y: ibis_types.Value): @@ -2005,19 +2033,6 @@ def case_when_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value: return case_val.end() # type: ignore -@scalar_op_compiler.register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True) -def nary_remote_function_op_impl( - *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp -): - ibis_node = getattr(op.func, "ibis_node", None) - if ibis_node is None: - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) - result = ibis_node(*operands) - return result - - @scalar_op_compiler.register_nary_op(ops.SqlScalarOp, pass_op=True) def sql_scalar_op_impl(*operands: ibis_types.Value, op: ops.SqlScalarOp): return ibis_generic.SqlScalar( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1d0d485392..7e5bb3049a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -74,6 +74,7 @@ import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as formatter +import bigframes.functions import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.ai @@ -4470,7 +4471,7 @@ def _prepare_export( return array_value, id_overrides def map(self, func, na_action: Optional[str] = None) -> DataFrame: - if not callable(func): + if not isinstance(func, bigframes.functions.BigqueryCallableRoutine): raise TypeError("the first argument must be callable") if na_action not in {None, "ignore"}: @@ -4478,7 +4479,9 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: # TODO(shobs): Support **kwargs return self._apply_unary_op( - ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None)) + ops.RemoteFunctionOp( + function_def=func.udf_def, apply_on_null=(na_action is None) + ) ) def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): @@ -4492,13 +4495,18 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) warnings.warn(msg, category=bfe.FunctionAxisOnePreviewWarning) - if not hasattr(func, "bigframes_bigquery_function"): + if not isinstance( + func, + ( + bigframes.functions.BigqueryCallableRoutine, + bigframes.functions.BigqueryCallableRowRoutine, + ), + ): raise ValueError( "For axis=1 a BigFrames BigQuery function must be used." 
) - is_row_processor = getattr(func, "is_row_processor") - if is_row_processor: + if func.is_row_processor: # Early check whether the dataframe dtypes are currently supported # in the bigquery function # NOTE: Keep in sync with the value converters used in the gcf code @@ -4552,7 +4560,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # Apply the function result_series = rows_as_json_series._apply_unary_op( - ops.RemoteFunctionOp(func=func, apply_on_null=True) + ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True) ) else: # This is a special case where we are providing not-pandas-like @@ -4567,7 +4575,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # compatible with the data types of the input params # 3. The order of the columns in the dataframe must correspond # to the order of the input params in the function - udf_input_dtypes = getattr(func, "input_dtypes") + udf_input_dtypes = func.udf_def.signature.bf_input_types if len(udf_input_dtypes) != len(self.columns): raise ValueError( f"BigFrames BigQuery function takes {len(udf_input_dtypes)}" @@ -4581,25 +4589,11 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): series_list = [self[col] for col in self.columns] result_series = series_list[0]._apply_nary_op( - ops.NaryRemoteFunctionOp(func=func), series_list[1:] + ops.NaryRemoteFunctionOp(function_def=func.udf_def), series_list[1:] ) result_series.name = None - # If the result type is string but the function output is intended - # to be an array, reconstruct the array from the string assuming it - # is a json serialized form of the array. - if bigframes.dtypes.is_string_like( - result_series.dtype - ) and bigframes.dtypes.is_array_like(func.output_dtype): - import bigframes.bigquery as bbq - - result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - func.output_dtype.pyarrow_dtype.value_type - ) - result_series = bbq.json_extract_string_array( - result_series, value_dtype=result_dtype - ) - + result_series = func._post_process_series(result_series) return result_series # At this point column-wise or element-wise bigquery function operation will diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 2c5df89665..e0c3e39ac9 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -870,32 +870,4 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: return result -### Remote functions use only -# TODO: Refactor into remote function module - -# Input and output types supported by BigQuery DataFrames remote functions. -# TODO(shobs): Extend the support to all types supported by BQ remote functions -# https://cloud.google.com/bigquery/docs/remote-functions#limitations -RF_SUPPORTED_IO_PYTHON_TYPES = {bool, bytes, float, int, str} - -# Support array output types in BigQuery DataFrames remote functions even though -# it is not currently (2024-10-06) supported in BigQuery remote functions. -# https://cloud.google.com/bigquery/docs/remote-functions#limitations -# TODO(b/284515241): remove this special handling when BigQuery remote functions -# support array. 
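For context on the DataFrame changes above, a minimal usage sketch of the new calling convention, assuming an existing session and a hypothetical routine path: DataFrame.map, DataFrame.apply(axis=1) and Series.apply now expect the callable-routine wrapper objects instead of a plain Python function with ad-hoc attributes attached.

import bigframes.pandas as bpd

# read_gbq_function now returns a BigqueryCallableRoutine (or a
# BigqueryCallableRowRoutine for row processors), not a bare callable.
func = bpd.read_gbq_function("my-project.my_dataset.double_it")  # hypothetical routine

df = bpd.DataFrame({"x": [1, 2, 3]})

# Series.apply / DataFrame.map build RemoteFunctionOp(function_def=func.udf_def);
# any JSON-array output reconstruction happens via func._post_process_series.
result = df["x"].apply(func)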
-RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES = {bool, float, int, str} - -RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS = { - "BOOLEAN", - "BOOL", - "BYTES", - "FLOAT", - "FLOAT64", - "INT64", - "INTEGER", - "STRING", - "ARRAY", -} - - TIMEDELTA_DESCRIPTION_TAG = "#microseconds" diff --git a/bigframes/functions/__init__.py b/bigframes/functions/__init__.py index 6d5e14bcf4..5f87956a61 100644 --- a/bigframes/functions/__init__.py +++ b/bigframes/functions/__init__.py @@ -11,3 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from bigframes.functions.function import ( + BigqueryCallableRoutine, + BigqueryCallableRowRoutine, +) + +__all__ = [ + "BigqueryCallableRoutine", + "BigqueryCallableRowRoutine", +] diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index e818015a9b..1833ac489c 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -25,7 +25,7 @@ import tempfile import textwrap import types -from typing import cast, Tuple, TYPE_CHECKING +from typing import Any, cast, Optional, Sequence, Tuple, TYPE_CHECKING import requests @@ -39,8 +39,6 @@ import google.api_core.retry from google.cloud import bigquery, functions_v2 -import bigframes.session._io.bigquery - from . import _utils logger = logging.getLogger(__name__) @@ -126,6 +124,8 @@ def _ensure_dataset_exists(self) -> None: def _create_bq_function(self, create_function_ddl: str) -> None: # TODO(swast): plumb through the original, user-facing api_name. + import bigframes.session._io.bigquery + _, query_job = bigframes.session._io.bigquery.start_query_with_client( cast(bigquery.Client, self._session.bqclient), create_function_ddl, @@ -149,13 +149,13 @@ def _format_function_options(self, function_options: dict) -> str: def create_bq_remote_function( self, - input_args, - input_types, - output_type, - endpoint, - bq_function_name, - max_batching_rows, - metadata, + input_args: Sequence[str], + input_types: Sequence[str], + output_type: str, + endpoint: str, + bq_function_name: str, + max_batching_rows: int, + metadata: str, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" @@ -198,14 +198,14 @@ def create_bq_remote_function( def provision_bq_managed_function( self, func, - input_types, - output_type, - name, - packages, - is_row_processor, + input_types: Sequence[str], + output_type: str, + name: Optional[str], + packages: Optional[Sequence[str]], + is_row_processor: bool, bq_connection_id, *, - capture_references=False, + capture_references: bool = False, ): """Create a BigQuery managed function.""" @@ -230,7 +230,7 @@ def provision_bq_managed_function( for name_, type_ in zip(input_args, input_types): bq_function_args.append(f"{name_} {type_}") - managed_function_options = { + managed_function_options: dict[str, Any] = { "runtime_version": _MANAGED_FUNC_PYTHON_VERSION, "entry_point": "bigframes_handler", } diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 2fb3480d6c..9e7555431a 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -16,14 +16,15 @@ from __future__ import annotations import collections.abc +import functools import inspect import sys import threading from typing import ( Any, - Callable, cast, Dict, + get_origin, 
Literal, Mapping, Optional, @@ -33,10 +34,6 @@ ) import warnings -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.operations.udf as ibis_udf -import cloudpickle import google.api_core.exceptions from google.cloud import ( bigquery, @@ -46,17 +43,17 @@ ) from bigframes import clients -import bigframes.core.compile.ibis_types import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting -import bigframes.series as bf_series +from bigframes.functions import function as bq_functions +from bigframes.functions import udf_def if TYPE_CHECKING: from bigframes.session import Session import pandas -from . import _function_client, _utils +from bigframes.functions import _function_client, _utils class FunctionSession: @@ -220,17 +217,6 @@ def clean_up( self._temp_artifacts.clear() - def _try_delattr(self, func: Callable, attr: str) -> None: - """Attempts to delete an attribute from a bigframes function.""" - # In the unlikely case where the user is trying to re-deploy the same - # function, cleanup the attributes we add in bigframes functions, first. - # This prevents the pickle from having dependencies that might not - # otherwise be present such as ibis or pandas. - try: - delattr(func, attr) - except AttributeError: - pass - # Inspired by @udf decorator implemented in ibis-bigquery package # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py # which has moved as @js to the ibis package @@ -543,58 +529,32 @@ def wrapper(func): else: signature_kwargs = {} # type: ignore - signature = inspect.signature( + py_sig = inspect.signature( func, **signature_kwargs, ) + if input_types is not None: + if not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + py_sig = py_sig.replace( + parameters=[ + par.replace(annotation=itype) + for par, itype in zip(py_sig.parameters.values(), input_types) + ] + ) + if output_type: + py_sig = py_sig.replace(return_annotation=output_type) # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function.", - ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - - if output_type is None: - if ( - output_type := signature.return_annotation - ) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use " - "@remote_function.", - ) # The function will actually be receiving a pandas Series, but allow both # BigQuery DataFrames and pandas object types for compatibility. + # The function will actually be receiving a pandas Series, but allow + # both BigQuery DataFrames and pandas object types for compatibility. 
is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - msg = bfe.format_message("input_types=Series is in preview.") - warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) - - # we will model the row as a json serialized string containing the data - # and the metadata representing the row. - input_types = [str] + if new_sig := _convert_row_processor_sig(py_sig): + py_sig = new_sig is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] - - # TODO(b/340898611): fix type error. - ibis_signature = _utils.ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) remote_function_client = _function_client.FunctionClient( dataset_ref.project, @@ -614,37 +574,25 @@ def wrapper(func): session=session, # type: ignore ) - # To respect the user code/environment let's use a copy of the - # original udf, especially since we would be setting some properties - # on it. - func = cloudpickle.loads(cloudpickle.dumps(func)) - - self._try_delattr(func, "bigframes_cloud_function") - self._try_delattr(func, "bigframes_remote_function") - self._try_delattr(func, "bigframes_bigquery_function") - self._try_delattr(func, "bigframes_bigquery_function_output_dtype") - self._try_delattr(func, "input_dtypes") - self._try_delattr(func, "output_dtype") - self._try_delattr(func, "is_row_processor") - self._try_delattr(func, "ibis_node") - # resolve the output type that can be supported in the bigframes, # ibis, BQ remote functions and cloud functions integration. - ibis_output_type_for_bqrf = ibis_signature.output_type bqrf_metadata = None - if isinstance(ibis_signature.output_type, ibis_dtypes.Array): + post_process_routine = None + if get_origin(py_sig.return_annotation) is list: # TODO(b/284515241): remove this special handling to support # array output types once BQ remote functions support ARRAY. # Until then, use json serialized strings at the cloud function # and BQ level, and parse that to the intended output type at # the bigframes level. - ibis_output_type_for_bqrf = ibis_dtypes.String() bqrf_metadata = _utils.get_bigframes_metadata( - python_output_type=output_type + python_output_type=py_sig.return_annotation ) - bqrf_output_type = third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_output_type_for_bqrf - ) + post_process_routine = _utils._build_unnest_post_routine( + py_sig.return_annotation + ) + py_sig = py_sig.replace(return_annotation=str) + + udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) ( rf_name, @@ -652,12 +600,8 @@ def wrapper(func): created_new, ) = remote_function_client.provision_bq_remote_function( func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - if type_ is not None - ), - output_type=bqrf_output_type, + input_types=udf_sig.sql_input_types, + output_type=udf_sig.sql_output_type, reuse=reuse, name=name, package_requirements=packages, @@ -671,56 +615,14 @@ def wrapper(func): bq_metadata=bqrf_metadata, ) - # TODO(shobs): Find a better way to support udfs with param named "name". - # This causes an issue in the ibis compilation. - func.__signature__ = inspect.signature(func).replace( # type: ignore - parameters=[ - inspect.Parameter( - f"bigframes_{param.name}", - param.kind, - ) - for param in inspect.signature(func).parameters.values() - ] - ) - - # TODO: Move ibis logic to compiler step. 
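The ibis node construction removed just below no longer happens at definition time; the scalar op compiler (see remote_function_op_impl earlier in this patch) rebuilds an equivalent ibis builtin UDF at compile time from the routine reference and the Python-level signature stored on the BigqueryUdf. A condensed sketch of that mapping, restating the patch's own compiler code rather than introducing new API:

from bigframes_vendored.ibis.expr.operations import udf as ibis_udf

def compile_remote_function(function_def, x):
    # function_def is a udf_def.BigqueryUdf; its signature exposes the Python
    # view of input/output types that ibis expects for a builtin scalar UDF.
    sig = function_def.signature
    ibis_py_sig = (sig.py_input_types, sig.py_output_type)

    @ibis_udf.scalar.builtin(name=str(function_def.routine_ref), signature=ibis_py_sig)
    def udf(input):
        ...

    return udf(x)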
- node = ibis_udf.scalar.builtin( - func, - name=rf_name, - catalog=dataset_ref.project, - database=dataset_ref.dataset_id, - signature=(ibis_signature.input_types, ibis_output_type_for_bqrf), - ) # type: ignore - func.bigframes_cloud_function = ( + bigframes_cloud_function = ( remote_function_client.get_cloud_function_fully_qualified_name(cf_name) ) - func.bigframes_bigquery_function = ( + bigframes_bigquery_function = ( remote_function_client.get_remote_function_fully_qualilfied_name( rf_name ) ) - func.bigframes_remote_function = func.bigframes_bigquery_function - func.input_dtypes = tuple( - [ - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - input_type - ) - for input_type in ibis_signature.input_types - if input_type is not None - ] - ) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) - ) - func.bigframes_bigquery_function_output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_output_type_for_bqrf - ) - ) - func.is_row_processor = is_row_processor - func.ibis_node = node # If a new remote function was created, update the cloud artifacts # created in the session. This would be used to clean up any @@ -731,9 +633,38 @@ def wrapper(func): # with that name and would directly manage their lifecycle. if created_new and (not name): self._update_temp_artifacts( - func.bigframes_bigquery_function, func.bigframes_cloud_function + bigframes_bigquery_function, bigframes_cloud_function + ) + + udf_definition = udf_def.BigqueryUdf( + routine_ref=bigquery.RoutineReference.from_string( + bigframes_bigquery_function + ), + signature=udf_sig, + ) + decorator = functools.wraps(func) + if is_row_processor: + return decorator( + bq_functions.BigqueryCallableRowRoutine( + udf_definition, + session, + post_routine=post_process_routine, + cloud_function_ref=bigframes_cloud_function, + local_func=func, + is_managed=False, + ) + ) + else: + return decorator( + bq_functions.BigqueryCallableRoutine( + udf_definition, + session, + post_routine=post_process_routine, + cloud_function_ref=bigframes_cloud_function, + local_func=func, + is_managed=False, + ) ) - return func return wrapper @@ -858,57 +789,30 @@ def wrapper(func): else: signature_kwargs = {} # type: ignore - signature = inspect.signature( + py_sig = inspect.signature( func, **signature_kwargs, ) + if input_types is not None: + if not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + py_sig = py_sig.replace( + parameters=[ + par.replace(annotation=itype) + for par, itype in zip(py_sig.parameters.values(), input_types) + ] + ) + if output_type: + py_sig = py_sig.replace(return_annotation=output_type) - # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. 
" - "Types are required to use udf.", - ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - - if output_type is None: - if ( - output_type := signature.return_annotation - ) is inspect.Signature.empty: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use udf", - ) + udf_sig = udf_def.UdfSignature.from_py_signature(py_sig) # The function will actually be receiving a pandas Series, but allow # both BigQuery DataFrames and pandas object types for compatibility. is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - msg = bfe.format_message("input_types=Series is in preview.") - warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) - - # we will model the row as a json serialized string containing - # the data and the metadata representing the row. - input_types = [str] + if new_sig := _convert_row_processor_sig(py_sig): + py_sig = new_sig is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] - - # TODO(b/340898611): fix type error. - ibis_signature = _utils.ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) managed_function_client = _function_client.FunctionClient( dataset_ref.project, @@ -920,80 +824,59 @@ def wrapper(func): session=session, # type: ignore ) - func = cloudpickle.loads(cloudpickle.dumps(func)) - - self._try_delattr(func, "bigframes_bigquery_function") - self._try_delattr(func, "bigframes_bigquery_function_output_dtype") - self._try_delattr(func, "input_dtypes") - self._try_delattr(func, "output_dtype") - self._try_delattr(func, "is_row_processor") - self._try_delattr(func, "ibis_node") - bq_function_name = managed_function_client.provision_bq_managed_function( func=func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - if type_ is not None - ), - output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_signature.output_type - ), + input_types=udf_sig.sql_input_types, + output_type=udf_sig.sql_output_type, name=name, packages=packages, is_row_processor=is_row_processor, bq_connection_id=bq_connection_id, ) - - # TODO(shobs): Find a better way to support udfs with param named - # "name". This causes an issue in the ibis compilation. - func.__signature__ = inspect.signature(func).replace( # type: ignore - parameters=[ - inspect.Parameter( - f"bigframes_{param.name}", - param.kind, - ) - for param in inspect.signature(func).parameters.values() - ] - ) - - # TODO: Move ibis logic to compiler step. 
- node = ibis_udf.scalar.builtin( - func, - name=bq_function_name, - catalog=dataset_ref.project, - database=dataset_ref.dataset_id, - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) # type: ignore - func.bigframes_bigquery_function = ( + full_rf_name = ( managed_function_client.get_remote_function_fully_qualilfied_name( bq_function_name ) ) - func.input_dtypes = tuple( - [ - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - input_type - ) - for input_type in ibis_signature.input_types - if input_type is not None - ] - ) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) + + udf_definition = udf_def.BigqueryUdf( + routine_ref=bigquery.RoutineReference.from_string(full_rf_name), + signature=udf_sig, ) - # Managed function directly supports certain output types which are - # not supported in remote function (e.g. list output). Thus no more - # processing for 'bigframes_bigquery_function_output_dtype'. - func.bigframes_bigquery_function_output_dtype = func.output_dtype - func.is_row_processor = is_row_processor - func.ibis_node = node if not name: - self._update_temp_artifacts(func.bigframes_bigquery_function, "") + self._update_temp_artifacts(full_rf_name, "") - return func + decorator = functools.wraps(func) + if is_row_processor: + return decorator( + bq_functions.BigqueryCallableRowRoutine( + udf_definition, session, local_func=func, is_managed=True + ) + ) + else: + return decorator( + bq_functions.BigqueryCallableRoutine( + udf_definition, + session, + local_func=func, + is_managed=True, + ) + ) return wrapper + + +def _convert_row_processor_sig( + signature: inspect.Signature, +) -> Optional[inspect.Signature]: + import bigframes.series as bf_series + + if len(signature.parameters) == 1: + only_param = next(iter(signature.parameters.values())) + param_type = only_param.annotation + if (param_type == bf_series.Series) or (param_type == pandas.Series): + msg = bfe.format_message("input_types=Series is in preview.") + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) + return signature.replace(parameters=[only_param.replace(annotation=str)]) + return None diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index 1d930a280d..69cf74ada0 100644 --- a/bigframes/functions/_utils.py +++ b/bigframes/functions/_utils.py @@ -14,13 +14,11 @@ import hashlib -import inspect import json import sys import typing -from typing import cast, List, NamedTuple, Optional, Sequence, Set +from typing import cast, Optional, Set -import bigframes_vendored.ibis.expr.datatypes.core as ibis_dtypes import cloudpickle import google.api_core.exceptions from google.cloud import bigquery, functions_v2 @@ -28,9 +26,8 @@ import pandas import pyarrow -import bigframes.core.compile.ibis_types -import bigframes.dtypes import bigframes.formatting_helpers as bf_formatting +from bigframes.functions import function_typing # Naming convention for the function artifacts _BIGFRAMES_FUNCTION_PREFIX = "bigframes" @@ -198,42 +195,6 @@ def get_bigframes_function_name(function_hash, session_id, uniq_suffix=None): return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) -class IbisSignature(NamedTuple): - parameter_names: List[str] - input_types: List[Optional[ibis_dtypes.DataType]] - output_type: ibis_dtypes.DataType - output_type_override: Optional[ibis_dtypes.DataType] = None - - -def ibis_signature_from_python_signature( - signature: inspect.Signature, - input_types: Sequence[type], - output_type: 
type, -) -> IbisSignature: - - ibis_input_types: List[Optional[ibis_dtypes.DataType]] = [ - bigframes.core.compile.ibis_types.ibis_type_from_python_type(t) - for t in input_types - ] - - if typing.get_origin(output_type) is list: - ibis_output_type = ( - bigframes.core.compile.ibis_types.ibis_array_output_type_from_python_type( - output_type - ) - ) - else: - ibis_output_type = bigframes.core.compile.ibis_types.ibis_type_from_python_type( - output_type - ) - - return IbisSignature( - parameter_names=list(signature.parameters.keys()), - input_types=ibis_input_types, - output_type=ibis_output_type, - ) - - def get_python_output_type_from_bigframes_metadata( metadata_text: str, ) -> Optional[type]: @@ -249,7 +210,7 @@ def get_python_output_type_from_bigframes_metadata( for ( python_output_array_type - ) in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: + ) in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: if python_output_array_type.__name__ == output_type: return list[python_output_array_type] # type: ignore @@ -266,7 +227,7 @@ def get_bigframes_metadata(*, python_output_type: Optional[type] = None) -> str: python_output_array_type = typing.get_args(python_output_type)[0] if ( python_output_array_type - in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES + in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES ): inner_metadata[ "python_array_output_type" @@ -294,3 +255,17 @@ def get_python_version(is_compat: bool = False) -> str: major = sys.version_info.major minor = sys.version_info.minor return f"python{major}{minor}" if is_compat else f"python-{major}.{minor}" + + +def _build_unnest_post_routine(py_list_type: type[list]): + sdk_type = function_typing.sdk_array_output_type_from_python_type(py_list_type) + assert sdk_type.array_element_type is not None + inner_sdk_type = sdk_type.array_element_type + result_dtype = function_typing.sdk_type_to_bf_type(inner_sdk_type) + + def post_process(input): + import bigframes.bigquery as bbq + + return bbq.json_extract_string_array(input, value_dtype=result_dtype) + + return post_process diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 858c25fada..b695bcd250 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -14,28 +14,19 @@ from __future__ import annotations -import inspect import logging -import typing -from typing import cast, Optional, TYPE_CHECKING -import warnings - -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.operations.udf as ibis_udf +from typing import Callable, cast, get_origin, Optional, TYPE_CHECKING if TYPE_CHECKING: from bigframes.session import Session + import bigframes.series import google.api_core.exceptions from google.cloud import bigquery -import bigframes.core.compile.ibis_types -import bigframes.dtypes -import bigframes.exceptions as bfe import bigframes.formatting_helpers as bf_formatting - -from . import _function_session as bff_session -from . 
import _utils +from bigframes.functions import _function_session as bff_session +from bigframes.functions import _utils, function_typing, udf_def logger = logging.getLogger(__name__) @@ -46,55 +37,6 @@ def __init__(self, type_, supported_types): self.supported_types = supported_types -class ReturnTypeMissingError(ValueError): - pass - - -# TODO: Move this to compile folder -def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: - if routine.return_type: - ibis_output_type = ( - bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( - routine.return_type - ) - ) - else: - raise ReturnTypeMissingError - - ibis_output_type_override: Optional[ibis_dtypes.DataType] = None - if python_output_type := _utils.get_python_output_type_from_bigframes_metadata( - routine.description - ): - if not isinstance(ibis_output_type, ibis_dtypes.String): - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "An explicit output_type should be provided only for a BigQuery function with STRING output.", - ) - if typing.get_origin(python_output_type) is list: - ibis_output_type_override = bigframes.core.compile.ibis_types.ibis_array_output_type_from_python_type( - cast(type, python_output_type) - ) - else: - raise bf_formatting.create_exception_with_feedback_link( - TypeError, - "Currently only list of a type is supported as python output type.", - ) - - return _utils.IbisSignature( - parameter_names=[arg.name for arg in routine.arguments], - input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( - arg.data_type - ) - if arg.data_type - else None - for arg in routine.arguments - ], - output_type=ibis_output_type, - output_type_override=ibis_output_type_override, - ) - - class DatasetMissingError(ValueError): pass @@ -136,6 +78,78 @@ def udf(*args, **kwargs): udf.__doc__ = bff_session.FunctionSession.udf.__doc__ +def _try_import_routine( + routine: bigquery.Routine, session: bigframes.Session +) -> BigqueryCallableRoutine: + udf_def = _routine_as_udf_def(routine) + override_type = _get_output_type_override(routine) + is_remote = ( + hasattr(routine, "remote_function_options") and routine.remote_function_options + ) + if override_type is not None: + return BigqueryCallableRoutine( + udf_def, + session, + post_routine=_utils._build_unnest_post_routine(override_type), + ) + return BigqueryCallableRoutine(udf_def, session, is_managed=not is_remote) + + +def _try_import_row_routine( + routine: bigquery.Routine, session: bigframes.Session +) -> BigqueryCallableRowRoutine: + udf_def = _routine_as_udf_def(routine) + override_type = _get_output_type_override(routine) + is_remote = ( + hasattr(routine, "remote_function_options") and routine.remote_function_options + ) + if override_type is not None: + return BigqueryCallableRowRoutine( + udf_def, + session, + post_routine=_utils._build_unnest_post_routine(override_type), + ) + return BigqueryCallableRowRoutine(udf_def, session, is_managed=not is_remote) + + +def _routine_as_udf_def(routine: bigquery.Routine) -> udf_def.BigqueryUdf: + try: + return udf_def.BigqueryUdf.from_routine(routine) + except udf_def.ReturnTypeMissingError: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "Function return type must be specified." 
+ ) + except function_typing.UnsupportedTypeError as e: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + f"Type {e.type} not supported, supported types are {e.supported_types}.", + ) + + +def _get_output_type_override(routine: bigquery.Routine) -> Optional[type[list]]: + if routine.description is not None and isinstance(routine.description, str): + if python_output_type := _utils.get_python_output_type_from_bigframes_metadata( + routine.description + ): + bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) + + if bq_return_type is None or bq_return_type.type_kind != "STRING": + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "An explicit output_type should be provided only for a BigQuery function with STRING output.", + ) + if get_origin(python_output_type) is list: + return python_output_type + else: + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "Currently only list of " + "a type is supported as python output type.", + ) + + return None + + # TODO(b/399894805): Support managed function. def read_gbq_function( function_name: str, @@ -147,7 +161,6 @@ def read_gbq_function( Read an existing BigQuery function and prepare it for use in future queries. """ bigquery_client = session.bqclient - ibis_client = session.ibis_client try: routine_ref = get_routine_reference(function_name, bigquery_client, session) @@ -172,86 +185,163 @@ def read_gbq_function( "takes in a single input representing the row.", ) - try: - ibis_signature = ibis_signature_from_routine(routine) - except ReturnTypeMissingError: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, "Function return type must be specified." - ) - except bigframes.core.compile.ibis_types.UnsupportedTypeError as e: - raise bf_formatting.create_exception_with_feedback_link( - ValueError, - f"Type {e.type} not supported, supported types are {e.supported_types}.", - ) + if is_row_processor: + return _try_import_row_routine(routine, session) + else: + return _try_import_routine(routine, session) - # The name "args" conflicts with the Ibis operator, so we use - # non-standard names for the arguments here. - def func(*bigframes_args, **bigframes_kwargs): - f"""Bigframes function {str(routine_ref)}.""" - nonlocal node # type: ignore - - expr = node(*bigframes_args, **bigframes_kwargs) # type: ignore - return ibis_client.execute(expr) - - func.__signature__ = inspect.signature(func).replace( # type: ignore - parameters=[ - # TODO(shobs): Find a better way to support functions with param - # named "name". This causes an issue in the ibis compilation. - inspect.Parameter( - f"bigframes_{name}", - inspect.Parameter.POSITIONAL_OR_KEYWORD, - ) - for name in ibis_signature.parameter_names - ] - ) - # TODO: Move ibis logic to compiler step - - func.__name__ = routine_ref.routine_id - - node = ibis_udf.scalar.builtin( - func, - name=routine_ref.routine_id, - catalog=routine_ref.project, - database=routine_ref.dataset_id, - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) # type: ignore - func.bigframes_bigquery_function = str(routine_ref) # type: ignore - - # We will keep the "bigframes_remote_function" attr for remote function. 
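A minimal sketch of the list-output override path, assuming a deployed routine that declares STRING output but whose description carries bigframes metadata naming list[float] as the intended Python output type (this is the case _get_output_type_override detects):

from bigframes.functions import _utils

python_output_type = list[float]            # recovered from the routine description
post_routine = _utils._build_unnest_post_routine(python_output_type)

# post_routine is roughly equivalent to:
def decode_array(series):
    import bigframes.bigquery as bbq
    import bigframes.dtypes

    # The STRING column holds a JSON-serialized array; rebuild a list<FLOAT64> column.
    return bbq.json_extract_string_array(series, value_dtype=bigframes.dtypes.FLOAT_DTYPE)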
- if hasattr(routine, "remote_function_options") and routine.remote_function_options: - func.bigframes_remote_function = func.bigframes_bigquery_function # type: ignore - - # set input bigframes data types - has_unknown_dtypes = False - function_input_dtypes = [] - for ibis_type in ibis_signature.input_types: - input_dtype = cast(bigframes.dtypes.Dtype, bigframes.dtypes.DEFAULT_DTYPE) - if ibis_type is None: - has_unknown_dtypes = True - else: - input_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_type - ) - ) - function_input_dtypes.append(input_dtype) - if has_unknown_dtypes: - msg = bfe.format_message( - "The function has one or more missing input data types. BigQuery DataFrames " - f"will assume default data type {bigframes.dtypes.DEFAULT_DTYPE} for them." - ) - warnings.warn(msg, category=bfe.UnknownDataTypeWarning) - func.input_dtypes = tuple(function_input_dtypes) # type: ignore +class BigqueryCallableRoutine: + """ + A reference to a routine in the context of a session. - func.output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( # type: ignore - ibis_signature.output_type_override - if ibis_signature.output_type_override - else ibis_signature.output_type - ) + Can be used both directly as a callable, or as an input to dataframe ops that take a callable. + """ - func.bigframes_bigquery_function_output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_signature.output_type) # type: ignore + def __init__( + self, + udf_def: udf_def.BigqueryUdf, + session: bigframes.Session, + *, + local_func: Optional[Callable] = None, + cloud_function_ref: Optional[str] = None, + post_routine: Optional[ + Callable[[bigframes.series.Series], bigframes.series.Series] + ] = None, + is_managed: bool = False, + ): + self._udf_def = udf_def + self._session = session + self._post_routine = post_routine + self._local_fun = local_func + self._cloud_function = cloud_function_ref + self._is_managed = is_managed + + def __call__(self, *args, **kwargs): + if self._local_fun: + return self._local_fun(*args, **kwargs) + # avoid circular imports + import bigframes.core.sql as bf_sql + import bigframes.session._io.bigquery as bf_io_bigquery + + args_string = ", ".join(map(bf_sql.simple_literal, args)) + sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" + iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + return list(iter.to_arrow().to_pydict().values())[0][0] + + @property + def bigframes_bigquery_function(self) -> str: + return str(self._udf_def.routine_ref) + + @property + def bigframes_remote_function(self): + return None if self._is_managed else str(self._udf_def.routine_ref) + + @property + def is_row_processor(self) -> bool: + return False + + @property + def udf_def(self) -> udf_def.BigqueryUdf: + return self._udf_def + + @property + def bigframes_cloud_function(self) -> Optional[str]: + return self._cloud_function + + @property + def input_dtypes(self): + return self.udf_def.signature.bf_input_types + + @property + def output_dtype(self): + return self.udf_def.signature.bf_output_type + + @property + def bigframes_bigquery_function_output_dtype(self): + return self.output_dtype + + def _post_process_series( + self, series: bigframes.series.Series + ) -> bigframes.series.Series: + if self._post_routine is not None: + return self._post_routine(series) + return series + + +class 
BigqueryCallableRowRoutine: + """ + A reference to a routine in the context of a session. - func.is_row_processor = is_row_processor # type: ignore - func.ibis_node = node # type: ignore - return func + Can be used both directly as a callable, or as an input to dataframe ops that take a callable. + """ + + def __init__( + self, + udf_def: udf_def.BigqueryUdf, + session: bigframes.Session, + *, + local_func: Optional[Callable] = None, + cloud_function_ref: Optional[str] = None, + post_routine: Optional[ + Callable[[bigframes.series.Series], bigframes.series.Series] + ] = None, + is_managed: bool = False, + ): + self._udf_def = udf_def + self._session = session + self._post_routine = post_routine + self._local_fun = local_func + self._cloud_function = cloud_function_ref + self._is_managed = is_managed + + def __call__(self, *args, **kwargs): + if self._local_fun: + return self._local_fun(*args, **kwargs) + # avoid circular imports + import bigframes.core.sql as bf_sql + import bigframes.session._io.bigquery as bf_io_bigquery + + args_string = ", ".join(map(bf_sql.simple_literal, args)) + sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" + iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + return list(iter.to_arrow().to_pydict().values())[0][0] + + @property + def bigframes_bigquery_function(self) -> str: + return str(self._udf_def.routine_ref) + + @property + def bigframes_remote_function(self): + return None if self._is_managed else str(self._udf_def.routine_ref) + + @property + def is_row_processor(self) -> bool: + return True + + @property + def udf_def(self) -> udf_def.BigqueryUdf: + return self._udf_def + + @property + def bigframes_cloud_function(self) -> Optional[str]: + return self._cloud_function + + @property + def input_dtypes(self): + return self.udf_def.signature.bf_input_types + + @property + def output_dtype(self): + return self.udf_def.signature.bf_output_type + + @property + def bigframes_bigquery_function_output_dtype(self): + return self.output_dtype + + def _post_process_series( + self, series: bigframes.series.Series + ) -> bigframes.series.Series: + if self._post_routine is not None: + return self._post_routine(series) + return series diff --git a/bigframes/functions/function_typing.py b/bigframes/functions/function_typing.py new file mode 100644 index 0000000000..f2fa794456 --- /dev/null +++ b/bigframes/functions/function_typing.py @@ -0,0 +1,122 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, get_args, get_origin, Type + +from google.cloud import bigquery + +import bigframes.dtypes + +# Input and output types supported by BigQuery DataFrames remote functions. 
+# TODO(shobs): Extend the support to all types supported by BQ remote functions +# https://cloud.google.com/bigquery/docs/remote-functions#limitations +RF_SUPPORTED_IO_PYTHON_TYPES = { + bool: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.BOOL), + bytes: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.BYTES), + float: bigquery.StandardSqlDataType( + type_kind=bigquery.StandardSqlTypeNames.FLOAT64 + ), + int: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.INT64), + str: bigquery.StandardSqlDataType(type_kind=bigquery.StandardSqlTypeNames.STRING), +} + +# Support array output types in BigQuery DataFrames remote functions even though +# it is not currently (2024-10-06) supported in BigQuery remote functions. +# https://cloud.google.com/bigquery/docs/remote-functions#limitations +# TODO(b/284515241): remove this special handling when BigQuery remote functions +# support array. +RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES = {bool, float, int, str} + +DEFAULT_RF_TYPE = RF_SUPPORTED_IO_PYTHON_TYPES[float] + +RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS = { + "BOOLEAN", + "BOOL", + "BYTES", + "FLOAT", + "FLOAT64", + "INT64", + "INTEGER", + "STRING", + "ARRAY", +} + + +TIMEDELTA_DESCRIPTION_TAG = "#microseconds" + + +class UnsupportedTypeError(ValueError): + def __init__(self, type_, supported_types): + self.type = type_ + self.supported_types = supported_types + super().__init__( + f"'{type_}' is not one of the supported types {supported_types}" + ) + + +def sdk_type_from_python_type( + t: type, allow_lists: bool = False +) -> bigquery.StandardSqlDataType: + if (get_origin(t) is list) and allow_lists: + return sdk_array_output_type_from_python_type(t) + if t not in RF_SUPPORTED_IO_PYTHON_TYPES: + raise UnsupportedTypeError(t, RF_SUPPORTED_IO_PYTHON_TYPES) + return RF_SUPPORTED_IO_PYTHON_TYPES[t] + + +def sdk_array_output_type_from_python_type(t: type) -> bigquery.StandardSqlDataType: + array_of = get_args(t)[0] + if array_of not in RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: + raise UnsupportedTypeError(array_of, RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES) + inner_type = RF_SUPPORTED_IO_PYTHON_TYPES[array_of] + return bigquery.StandardSqlDataType( + type_kind=bigquery.StandardSqlTypeNames.ARRAY, array_element_type=inner_type + ) + + +def sdk_type_to_bf_type( + sdk_type: bigquery.StandardSqlDataType, +) -> bigframes.dtypes.Dtype: + if sdk_type.array_element_type is not None: + return bigframes.dtypes.list_type( + sdk_type_to_bf_type(sdk_type.array_element_type) + ) + if sdk_type.struct_type is not None: + raise ValueError("Cannot handle struct types in remote function") + assert sdk_type.type_kind is not None + return bigframes.dtypes._TK_TO_BIGFRAMES[sdk_type.type_kind.name] + + +def sdk_type_to_py_type( + sdk_type: bigquery.StandardSqlDataType, +) -> Type[Any]: + if sdk_type.array_element_type is not None: + return list[sdk_type_to_py_type(sdk_type.array_element_type)] # type: ignore + if sdk_type.struct_type is not None: + raise ValueError("Cannot handle struct types in remote function") + for key, value in RF_SUPPORTED_IO_PYTHON_TYPES.items(): + if value == sdk_type: + return key + raise ValueError(f"Cannot handle {sdk_type} in remote function") + + +def sdk_type_to_sql_string( + sdk_type: bigquery.StandardSqlDataType, +) -> str: + if sdk_type.array_element_type is not None: + return f"ARRAY<{sdk_type_to_sql_string(sdk_type.array_element_type)}>" + if sdk_type.struct_type is not None: + raise ValueError("Cannot handle struct types in remote function") 
+ assert sdk_type.type_kind is not None + return sdk_type.type_kind.name diff --git a/bigframes/functions/udf_def.py b/bigframes/functions/udf_def.py new file mode 100644 index 0000000000..078e45f32d --- /dev/null +++ b/bigframes/functions/udf_def.py @@ -0,0 +1,173 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import dataclasses +import inspect +from typing import cast, Optional +import warnings + +from google.cloud import bigquery + +import bigframes.dtypes +import bigframes.exceptions as bfe +import bigframes.formatting_helpers as bf_formatting +from bigframes.functions import function_typing + + +class ReturnTypeMissingError(ValueError): + pass + + +@dataclasses.dataclass(frozen=True) +class UdfField: + name: str = dataclasses.field() + dtype: bigquery.StandardSqlDataType = dataclasses.field(hash=False, compare=False) + + @classmethod + def from_sdk(cls, arg: bigquery.RoutineArgument) -> UdfField: + assert arg.name is not None + assert arg.data_type is not None + return cls(arg.name, arg.data_type) + + +@dataclasses.dataclass(frozen=True) +class UdfSignature: + input_types: tuple[UdfField, ...] = dataclasses.field() + output_bq_type: bigquery.StandardSqlDataType = dataclasses.field( + hash=False, compare=False + ) + + @property + def bf_input_types(self) -> tuple[bigframes.dtypes.Dtype, ...]: + return tuple( + function_typing.sdk_type_to_bf_type(arg.dtype) for arg in self.input_types + ) + + @property + def bf_output_type(self) -> bigframes.dtypes.Dtype: + return function_typing.sdk_type_to_bf_type(self.output_bq_type) + + @property + def py_input_types(self) -> tuple[type, ...]: + return tuple( + function_typing.sdk_type_to_py_type(arg.dtype) for arg in self.input_types + ) + + @property + def py_output_type(self) -> type: + return function_typing.sdk_type_to_py_type(self.output_bq_type) + + @property + def sql_input_types(self) -> tuple[str, ...]: + return tuple( + function_typing.sdk_type_to_sql_string(arg.dtype) + for arg in self.input_types + ) + + @property + def sql_output_type(self) -> str: + return function_typing.sdk_type_to_sql_string(self.output_bq_type) + + @classmethod + def from_routine(cls, routine: bigquery.Routine) -> UdfSignature: + if routine.return_type is None: + raise ReturnTypeMissingError + bq_return_type = cast(bigquery.StandardSqlDataType, routine.return_type) + + if ( + bq_return_type.type_kind is None + or bq_return_type.type_kind + not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ): + raise ValueError( + f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" + ) + + udf_fields = [] + for argument in routine.arguments: + if argument.data_type is None: + msg = bfe.format_message( + "The function has one or more missing input data types. BigQuery DataFrames " + f"will assume default data type {function_typing.DEFAULT_RF_TYPE} for them." 
+ ) + warnings.warn(msg, category=bfe.UnknownDataTypeWarning) + assert argument.name is not None + udf_fields.append( + UdfField(argument.name, function_typing.DEFAULT_RF_TYPE) + ) + else: + udf_fields.append(UdfField.from_sdk(argument)) + + return cls( + input_types=tuple(udf_fields), + output_bq_type=bq_return_type, + ) + + @classmethod + def from_py_signature(cls, signature: inspect.Signature): + input_types: list[UdfField] = [] + for parameter in signature.parameters.values(): + if parameter.annotation is inspect.Signature.empty: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + "'input_types' was not set and parameter " + f"'{parameter.name}' is missing a type annotation. " + "Types are required to use @remote_function.", + ) + bq_type = function_typing.sdk_type_from_python_type(parameter.annotation) + input_types.append(UdfField(parameter.name, bq_type)) + + if signature.return_annotation is inspect.Signature.empty: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + "'output_type' was not set and function is missing a " + "return type annotation. Types are required to use " + "@remote_function.", + ) + output_bq_type = function_typing.sdk_type_from_python_type( + signature.return_annotation, + allow_lists=True, + ) + return cls(tuple(input_types), output_bq_type) + + +@dataclasses.dataclass(frozen=True) +class BigqueryUdf: + routine_ref: bigquery.RoutineReference = dataclasses.field() + signature: UdfSignature + # Used to provide alternative interpretations of output bq type, eg interpret int as timestamp + output_type_override: Optional[bigframes.dtypes.Dtype] = dataclasses.field( + default=None + ) + + @property + def bigframes_output_type(self) -> bigframes.dtypes.Dtype: + return self.output_type_override or function_typing.sdk_type_to_bf_type( + self.signature.output_bq_type + ) + + @classmethod + def from_routine(cls, routine: bigquery.Routine) -> BigqueryUdf: + signature = UdfSignature.from_routine(routine) + + if ( + signature.output_bq_type.type_kind is None + or signature.output_bq_type.type_kind + not in function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ): + raise ValueError( + f"Remote function must have one of the following supported output types: {function_typing.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS}" + ) + return cls(routine.reference, signature=signature) diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 51cfccbc41..e610ce61d6 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -15,13 +15,15 @@ import dataclasses import typing +from bigframes.functions import udf_def from bigframes.operations import base_ops +# TODO: Enforce input type constraints from function def @dataclasses.dataclass(frozen=True) class RemoteFunctionOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "remote_function" - func: typing.Callable + function_def: udf_def.BigqueryUdf apply_on_null: bool @property @@ -29,45 +31,30 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - # The output dtype should be set to a valid Dtype by @udf decorator, - # @remote_function decorator, or read_gbq_function method. 
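A rough sketch of how the new dataclasses fit together for a simple annotated function, using the classes introduced above; the routine path is hypothetical:

import inspect
from google.cloud import bigquery
from bigframes.functions import udf_def

def add_one(x: int) -> float:
    return x + 1.0

# Python annotations -> BigQuery SQL types (int -> INT64 input, float -> FLOAT64 output)
# via function_typing.sdk_type_from_python_type.
sig = udf_def.UdfSignature.from_py_signature(inspect.signature(add_one))

udf = udf_def.BigqueryUdf(
    routine_ref=bigquery.RoutineReference.from_string("my-project.my_dataset.add_one"),
    signature=sig,
)

# RemoteFunctionOp and friends no longer probe attributes on a callable; they read
# the output dtype straight from the definition.
assert udf.bigframes_output_type == sig.bf_output_type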
- if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): - return self.func.bigframes_bigquery_function_output_dtype - - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + return self.function_def.bigframes_output_type @dataclasses.dataclass(frozen=True) class BinaryRemoteFunctionOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "binary_remote_function" - func: typing.Callable + function_def: udf_def.BigqueryUdf @property def expensive(self) -> bool: return True def output_type(self, *input_types): - # The output dtype should be set to a valid Dtype by @udf decorator, - # @remote_function decorator, or read_gbq_function method. - if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): - return self.func.bigframes_bigquery_function_output_dtype - - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + return self.function_def.bigframes_output_type @dataclasses.dataclass(frozen=True) class NaryRemoteFunctionOp(base_ops.NaryOp): name: typing.ClassVar[str] = "nary_remote_function" - func: typing.Callable + function_def: udf_def.BigqueryUdf @property def expensive(self) -> bool: return True def output_type(self, *input_types): - # The output dtype should be set to a valid Dtype by @udf decorator, - # @remote_function decorator, or read_gbq_function method. - if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): - return self.func.bigframes_bigquery_function_output_dtype - - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + return self.function_def.bigframes_output_type diff --git a/bigframes/series.py b/bigframes/series.py index 1bb0c1e0dc..7a318c4c70 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -66,6 +66,7 @@ import bigframes.dtypes import bigframes.exceptions as bfe import bigframes.formatting_helpers as formatter +import bigframes.functions import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.base @@ -1841,7 +1842,7 @@ def apply( " are supported." ) - if not hasattr(func, "bigframes_bigquery_function"): + if not isinstance(func, bigframes.functions.BigqueryCallableRoutine): # It is neither a remote function nor a managed function. # Then it must be a vectorized function that applies to the Series # as a whole. @@ -1873,24 +1874,9 @@ def apply( # We are working with bigquery function at this point result_series = self._apply_unary_op( - ops.RemoteFunctionOp(func=func, apply_on_null=True) + ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True) ) - - # If the result type is string but the function output is intended to - # be an array, reconstruct the array from the string assuming it is a - # json serialized form of the array. - if bigframes.dtypes.is_string_like( - result_series.dtype - ) and bigframes.dtypes.is_array_like(func.output_dtype): - import bigframes.bigquery as bbq - - result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - func.output_dtype.pyarrow_dtype.value_type - ) - result_series = bbq.json_extract_string_array( - result_series, value_dtype=result_dtype - ) - + result_series = func._post_process_series(result_series) return result_series def combine( @@ -1905,7 +1891,7 @@ def combine( " are supported." 
) - if not hasattr(func, "bigframes_bigquery_function"): + if not isinstance(func, bigframes.functions.BigqueryCallableRoutine): # Keep this in sync with .apply try: return func(self, other) @@ -1918,24 +1904,9 @@ def combine( raise result_series = self._apply_binary_op( - other, ops.BinaryRemoteFunctionOp(func=func) + other, ops.BinaryRemoteFunctionOp(function_def=func.udf_def) ) - - # If the result type is string but the function output is intended to - # be an array, reconstruct the array from the string assuming it is a - # json serialized form of the array. - if bigframes.dtypes.is_string_like( - result_series.dtype - ) and bigframes.dtypes.is_array_like(func.output_dtype): - import bigframes.bigquery as bbq - - result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - func.output_dtype.pyarrow_dtype.value_type - ) - result_series = bbq.json_extract_string_array( - result_series, value_dtype=result_dtype - ) - + result_series = func._post_process_series(result_series) return result_series @validations.requires_index diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b6066daed3..7597f8eeed 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -181,18 +181,6 @@ def __init__( # the ibis client has been created original_default_query_job_config = self.bqclient.default_query_job_config - # Only used to fetch remote function metadata. - # TODO: Remove in favor of raw bq client - - self.ibis_client = typing.cast( - ibis_bigquery.Backend, - ibis_bigquery.Backend().connect( - project_id=context.project, - client=self.bqclient, - storage_client=self.bqstoragereadclient, - ), - ) - self.bqclient.default_query_job_config = original_default_query_job_config # Resolve the BQ connection for remote function and Vertex AI integration diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 5e5de2d0b2..723841a672 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -89,7 +89,6 @@ def __init__(self): self._location = None # type: ignore self._bq_kms_key_name = None # type: ignore self._clients_provider = None # type: ignore - self.ibis_client = None # type: ignore self._bq_connection = None # type: ignore self._skip_bq_connection_check = True self._session_id: str = "test_session" diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4605d9ddbc..a75918ed23 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -22,7 +22,6 @@ import typing from typing import Dict, Generator, Optional -import bigframes_vendored.ibis.backends as ibis_backends import google.api_core.exceptions import google.cloud.bigquery as bigquery import google.cloud.bigquery_connection_v1 as bigquery_connection_v1 @@ -109,11 +108,6 @@ def bigquery_client_tokyo(session_tokyo: bigframes.Session) -> bigquery.Client: return session_tokyo.bqclient -@pytest.fixture(scope="session") -def ibis_client(session: bigframes.Session) -> ibis_backends.BaseBackend: - return session.ibis_client - - @pytest.fixture(scope="session") def bigqueryconnection_client( session: bigframes.Session, diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 5cb54a00c1..ad5849eb2f 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -19,6 +19,8 @@ import test_utils.prefixer import bigframes +import bigframes.dataframe +import bigframes.dtypes 
import bigframes.exceptions as bfe import bigframes.pandas as bpd from bigframes.testing.utils import cleanup_function_assets @@ -26,105 +28,6 @@ prefixer = test_utils.prefixer.Prefixer("bigframes", "") -def test_managed_function_multiply_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, -): - - try: - - @session.udf( - input_types=[int, int], - output_type=int, - dataset=dataset_id, - name=prefixer.create_prefix(), - ) - def multiply(x, y): - return x * y - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = multiply(col, 2).name("int64_col_2x") - col_square = multiply(col, col).name("int64_col_square") - table = table.mutate([col_2x, col_square]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * 2, - pandas_df_new["int64_col_2x"], - check_names=False, - ) - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * pandas_df_orig[col_name], - pandas_df_new["int64_col_square"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the managed function. - cleanup_function_assets(multiply, bigquery_client, ignore_failures=False) - - -def test_managed_function_stringify_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, -): - try: - - @session.udf( - input_types=[int], - output_type=str, - dataset=dataset_id, - name=prefixer.create_prefix(), - ) - def stringify(x): - return f"I got {x}" - - # Function should work locally. - assert stringify(8912) == "I got 8912" - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = stringify.ibis_node(col).name("int64_str_col") - table = table.mutate([col_2x]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name].apply(lambda x: f"I got {x}"), - pandas_df_new["int64_str_col"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the managed function. 
- cleanup_function_assets(stringify, bigquery_client, ignore_failures=False) - - def test_managed_function_array_output(session, scalars_dfs, dataset_id): try: @@ -150,7 +53,7 @@ def featurize(x: int) -> list[float]: featurize_ref = session.read_gbq_function(featurize.bigframes_bigquery_function) assert hasattr(featurize_ref, "bigframes_bigquery_function") - assert not hasattr(featurize_ref, "bigframes_remote_function") + assert featurize_ref.bigframes_remote_function is None assert ( featurize_ref.bigframes_bigquery_function == featurize.bigframes_bigquery_function @@ -184,7 +87,6 @@ def foo(x: int) -> bytes: assert foo(-2) == bytes(2) assert hasattr(foo, "bigframes_bigquery_function") - assert hasattr(foo, "ibis_node") assert hasattr(foo, "input_dtypes") assert hasattr(foo, "output_dtype") assert hasattr(foo, "bigframes_bigquery_function_output_dtype") @@ -208,7 +110,7 @@ def foo(x: int) -> bytes: function_name=foo.bigframes_bigquery_function, # type: ignore ) assert hasattr(foo_ref, "bigframes_bigquery_function") - assert not hasattr(foo_ref, "bigframes_remote_function") + assert foo_ref.bigframes_remote_function is None assert foo.bigframes_bigquery_function == foo_ref.bigframes_bigquery_function # type: ignore bf_result_col_gbq = scalars_df["int64_too"].apply(foo_ref) @@ -358,7 +260,7 @@ def add_list(x: int, y: int) -> list[int]: ) assert hasattr(add_list_managed_func_ref, "bigframes_bigquery_function") - assert not hasattr(add_list_managed_func_ref, "bigframes_remote_function") + assert add_list_managed_func_ref.bigframes_remote_function is None assert ( add_list_managed_func_ref.bigframes_bigquery_function == add_list_managed_func.bigframes_bigquery_function @@ -515,16 +417,16 @@ def test_managed_function_dataframe_apply_axis_1_array_output(session, dataset_i # Assert the dataframe dtypes. assert tuple(bf_df.dtypes) == expected_dtypes - try: + @session.udf( + input_types=[int, float, str], + output_type=list[str], + dataset=dataset_id, + name=prefixer.create_prefix(), + ) + def foo(x, y, z): + return [str(x), str(y), z] - @session.udf( - input_types=[int, float, str], - output_type=list[str], - dataset=dataset_id, - name=prefixer.create_prefix(), - ) - def foo(x, y, z): - return [str(x), str(y), z] + try: assert getattr(foo, "is_row_processor") is False assert getattr(foo, "input_dtypes") == expected_dtypes @@ -585,7 +487,7 @@ def foo(x, y, z): foo_ref = session.read_gbq_function(foo.bigframes_bigquery_function) assert hasattr(foo_ref, "bigframes_bigquery_function") - assert not hasattr(foo_ref, "bigframes_remote_function") + assert foo_ref.bigframes_remote_function is None assert foo_ref.bigframes_bigquery_function == foo.bigframes_bigquery_function # Test on the function from read_gbq_function. 
diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 172fff3010..5e60f3ed9f 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -25,7 +25,6 @@ import google.api_core.exceptions from google.cloud import bigquery, functions_v2, storage import pandas -import pyarrow import pytest import test_utils.prefixer @@ -97,118 +96,6 @@ def bq_cf_connection() -> str: return "bigframes-rf-conn" -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_multiply_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, - bq_cf_connection, -): - try: - - @session.remote_function( - # Make sure that the input/output types can be used positionally. - # This avoids the worst of the breaking change from 1.x to 2.x. - [int, int], - int, - dataset_id, - bigquery_connection=bq_cf_connection, - reuse=False, - cloud_function_service_account="default", - ) - def multiply(x, y): - return x * y - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = multiply(col, 2).name("int64_col_2x") - col_square = multiply(col, col).name("int64_col_square") - table = table.mutate([col_2x, col_square]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * 2, - pandas_df_new["int64_col_2x"], - check_names=False, - ) - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name] * pandas_df_orig[col_name], - pandas_df_new["int64_col_square"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the remote function - cleanup_function_assets(multiply, bigquery_client, session.cloudfunctionsclient) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_stringify_with_ibis( - session, - scalars_table_id, - bigquery_client, - ibis_client, - dataset_id, - bq_cf_connection, -): - try: - - @session.remote_function( - # Make sure that the input/output types can be used positionally. - # This avoids the worst of the breaking change from 1.x to 2.x. - [int], - str, - dataset_id, - bigquery_connection=bq_cf_connection, - reuse=False, - cloud_function_service_account="default", - ) - def stringify(x): - return f"I got {x}" - - # Function should work locally. 
- assert stringify(42) == "I got 42" - - _, dataset_name, table_name = scalars_table_id.split(".") - if not ibis_client.dataset: - ibis_client.dataset = dataset_name - - col_name = "int64_col" - table = ibis_client.tables[table_name] - table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) - sql = table.compile() - pandas_df_orig = bigquery_client.query(sql).to_dataframe() - - col = table[col_name] - col_2x = stringify.ibis_node(col).name("int64_str_col") - table = table.mutate([col_2x]) - sql = table.compile() - pandas_df_new = bigquery_client.query(sql).to_dataframe() - - pandas.testing.assert_series_equal( - pandas_df_orig[col_name].apply(lambda x: f"I got {x}"), - pandas_df_new["int64_str_col"], - check_names=False, - ) - finally: - # clean up the gcp assets created for the remote function - cleanup_function_assets( - stringify, bigquery_client, session.cloudfunctionsclient - ) - - @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_binop(session, scalars_dfs, dataset_id, bq_cf_connection): try: @@ -2365,13 +2252,6 @@ def foo(x, y, z): assert getattr(foo, "is_row_processor") is False assert getattr(foo, "input_dtypes") == expected_dtypes - assert getattr(foo, "output_dtype") == pandas.ArrowDtype( - pyarrow.list_( - bigframes.dtypes.bigframes_dtype_to_arrow_dtype( - bigframes.dtypes.STRING_DTYPE - ) - ) - ) assert ( getattr(foo, "bigframes_bigquery_function_output_dtype") == bigframes.dtypes.STRING_DTYPE diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 47ab6e2174..d5d8b29786 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -99,7 +99,7 @@ def get_bq_connection_id_path_format(connection_id_dot_format): return f"projects/{fields[0]}/locations/{fields[1]}/connections/{fields[2]}" -@pytest.mark.flaky(retries=2, delay=120) +# @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, bigqueryconnection_client, @@ -134,7 +134,6 @@ def square(x): assert hasattr(square, "bigframes_remote_function") assert hasattr(square, "bigframes_bigquery_function") assert hasattr(square, "bigframes_cloud_function") - assert hasattr(square, "ibis_node") scalars_df, scalars_pandas_df = scalars_dfs @@ -718,7 +717,7 @@ def square1(x): assert square2.bigframes_remote_function assert square2.bigframes_bigquery_function - assert not hasattr(square2, "bigframes_cloud_function") + assert square2.bigframes_cloud_function is None # They should point to the same function. 
assert square1.bigframes_remote_function == square2.bigframes_remote_function # type: ignore diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 37658bc436..77392bea2f 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -272,15 +272,3 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): ValueError, ): bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"}) - - -def test_remote_function_io_types_are_supported_bigframes_types(): - from bigframes_vendored.ibis.expr.datatypes.core import ( - dtype as python_type_to_ibis_type, - ) - - from bigframes.dtypes import RF_SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types - - for python_type in rf_supported_io_types: - ibis_type = python_type_to_ibis_type(python_type) - assert ibis_type in bigframes.core.compile.ibis_types.IBIS_TO_BIGFRAMES diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 259a4390bc..978281e5c9 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -14,12 +14,9 @@ import re -import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -from bigframes_vendored.ibis.expr import datatypes as ibis_types import pandas import pytest -import bigframes.dtypes import bigframes.functions.function as bff import bigframes.series from bigframes.testing import mocks @@ -56,26 +53,6 @@ def axis_1_function(myparam: series_type) -> str: # type: ignore # Still works as a normal function. assert axis_1_function(pandas.Series({"str_col": "World"})) == "Hello, World!" - assert axis_1_function.ibis_node is not None - - -def test_supported_types_correspond(): - # The same types should be representable by the supported Python and BigQuery types. - ibis_types_from_python = { - ibis_types.dtype(t) for t in bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES - } - ibis_types_from_bigquery = { - third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) - for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS - # TODO(b/284515241): ARRAY is the only exception because it is supported - # as an output type of the BQ routine in the read_gbq_function path but - # not in the remote function path. Remove this handline once BQ remote - # functions supports ARRAY output and the bigframes remote functions - # utilizes that to support array output. 
- if tk != "ARRAY" - } - - assert ibis_types_from_python == ibis_types_from_bigquery def test_missing_input_types(): diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index 3eceb99331..9743297e99 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -15,8 +15,7 @@ import bigframes_vendored.constants as constants import pytest -import bigframes.dtypes -from bigframes.functions import _utils +from bigframes.functions import _utils, function_typing @pytest.mark.parametrize( @@ -133,7 +132,7 @@ def test_get_python_output_type_from_bigframes_metadata( def test_metadata_roundtrip_supported_array_types(): - for array_of in bigframes.dtypes.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: + for array_of in function_typing.RF_SUPPORTED_ARRAY_OUTPUT_PYTHON_TYPES: ser = _utils.get_bigframes_metadata(python_output_type=list[array_of]) # type: ignore deser = _utils.get_python_output_type_from_bigframes_metadata(ser) assert deser == list[array_of] # type: ignore diff --git a/third_party/bigframes_vendored/ibis/expr/operations/udf.py b/third_party/bigframes_vendored/ibis/expr/operations/udf.py index 4fb25a9d34..91366cace8 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/udf.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/udf.py @@ -109,6 +109,7 @@ def _make_node( database: str | None = None, catalog: str | None = None, signature: tuple[tuple, Any] | None = None, + param_name_overrides: tuple[str, ...] | None = None, **kwargs, ) -> type[S]: """Construct a scalar user-defined function that is built-in to the backend.""" @@ -133,7 +134,7 @@ def _make_node( else: arg_types, return_annotation = signature - arg_names = list(inspect.signature(fn).parameters) + arg_names = param_name_overrides or list(inspect.signature(fn).parameters) fields = { arg_name: Argument(pattern=rlz.ValueOf(typ), typehint=typ) for arg_name, typ in zip(arg_names, arg_types) From 8ebfa57602ce02573384232f978304eb7cf4abdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 13 Jun 2025 16:54:14 -0500 Subject: [PATCH 17/23] chore: `_read_gbq_colab` supports querying a pandas DataFrame (#1801) * chore: `_read_gbq_colab` supports querying a pandas DataFrame * make more unit test * add session and dry_run arguments * add dry_run to to_view * initial pandas support with slow dry_run * speed up dry run * test with inline sql and load jobs * Update bigframes/core/pyformat.py * remove redundant test * Update tests/unit/session/test_read_gbq_colab.py * add dry run that works without a session * fix unit test * add unit tests for sessionless dry run * avoid binding to a location too early * dont try to set the default location unless its not a dry run * dont try to run any assertion on the response type * add support for small ints and floats * don't cast from float16 in earlier versions of arrow * rename _to_view to _to_placeholder_table * deduplicate column names in dry run * only allow lossless conversion if explicitly requested --- bigframes/core/blocks.py | 72 +++++-- bigframes/core/local_data.py | 4 +- bigframes/core/pyformat.py | 64 +++++- bigframes/core/tools/bigquery_schema.py | 48 +++++ bigframes/dataframe.py | 8 +- bigframes/dtypes.py | 29 ++- bigframes/pandas/io/api.py | 85 ++++++-- bigframes/session/__init__.py | 12 +- .../small/session/test_read_gbq_colab.py | 146 +++++++++++++- tests/unit/core/test_pyformat.py | 115 +++++++++-- 
tests/unit/core/tools/test_bigquery_schema.py | 187 ++++++++++++++++++ tests/unit/pandas/io/test_api.py | 31 ++- tests/unit/session/test_read_gbq_colab.py | 55 ++++-- tests/unit/test_local_data.py | 66 +++++++ 14 files changed, 848 insertions(+), 74 deletions(-) create mode 100644 bigframes/core/tools/bigquery_schema.py create mode 100644 tests/unit/core/tools/test_bigquery_schema.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 4607928b78..675e8c8b7a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -154,6 +154,7 @@ def __init__( self._stats_cache[" ".join(self.index_columns)] = {} self._transpose_cache: Optional[Block] = transpose_cache self._view_ref: Optional[bigquery.TableReference] = None + self._view_ref_dry_run: Optional[bigquery.TableReference] = None @classmethod def from_local( @@ -2459,19 +2460,19 @@ def is_monotonic_decreasing( ) -> bool: return self._is_monotonic(column_id, increasing=False) - def to_sql_query( - self, include_index: bool, enable_cache: bool = True - ) -> typing.Tuple[str, list[str], list[Label]]: + def _array_value_for_output( + self, *, include_index: bool + ) -> Tuple[bigframes.core.ArrayValue, list[str], list[Label]]: """ - Compiles this DataFrame's expression tree to SQL, optionally - including index columns. + Creates the expression tree with user-visible column names, such as for + SQL output. Args: include_index (bool): whether to include index columns. Returns: - a tuple of (sql_string, index_column_id_list, index_column_label_list). + a tuple of (ArrayValue, index_column_id_list, index_column_label_list). If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. """ @@ -2494,25 +2495,72 @@ def to_sql_query( # the BigQuery unicode column name feature? substitutions[old_id] = new_id + return ( + array_value.rename_columns(substitutions), + new_ids[: len(idx_labels)], + idx_labels, + ) + + def to_sql_query( + self, include_index: bool, enable_cache: bool = True + ) -> Tuple[str, list[str], list[Label]]: + """ + Compiles this DataFrame's expression tree to SQL, optionally + including index columns. + + Args: + include_index (bool): + whether to include index columns. + + Returns: + a tuple of (sql_string, index_column_id_list, index_column_label_list). + If include_index is set to False, index_column_id_list and index_column_label_list + return empty lists. + """ + array_value, idx_ids, idx_labels = self._array_value_for_output( + include_index=include_index + ) + # Note: this uses the sql from the executor, so is coupled tightly to execution # implementaton. It will reference cached tables instead of original data sources. # Maybe should just compile raw BFET? Depends on user intent. - sql = self.session._executor.to_sql( - array_value.rename_columns(substitutions), enable_cache=enable_cache - ) + sql = self.session._executor.to_sql(array_value, enable_cache=enable_cache) return ( sql, - new_ids[: len(idx_labels)], + idx_ids, idx_labels, ) - def to_view(self, include_index: bool) -> bigquery.TableReference: + def to_placeholder_table( + self, include_index: bool, *, dry_run: bool = False + ) -> bigquery.TableReference: """ - Creates a temporary BigQuery VIEW with the SQL corresponding to this block. + Creates a temporary BigQuery VIEW (or empty table if dry_run) with the + SQL corresponding to this block. 
""" if self._view_ref is not None: return self._view_ref + # Prefer the real view if it exists, but since dry_run might be called + # many times before the real query, we cache that empty table reference + # with the correct schema too. + if dry_run: + if self._view_ref_dry_run is not None: + return self._view_ref_dry_run + + # Create empty temp table with the right schema. + array_value, _, _ = self._array_value_for_output( + include_index=include_index + ) + temp_table_schema = array_value.schema.to_bigquery() + self._view_ref_dry_run = self.session._create_temp_table( + schema=temp_table_schema + ) + return self._view_ref_dry_run + + # We shouldn't run `to_sql_query` if we have a `dry_run`, because it + # could cause us to make unnecessary API calls to upload local node + # data. sql, _, _ = self.to_sql_query(include_index=include_index) self._view_ref = self.session._create_temp_view(sql) return self._view_ref diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index da1c174bc4..a99366ad4c 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -336,7 +336,9 @@ def _adapt_arrow_array(array: pa.Array) -> tuple[pa.Array, bigframes.dtypes.Dtyp if target_type != array.type: # TODO: Maybe warn if lossy conversion? array = array.cast(target_type) - bf_type = bigframes.dtypes.arrow_dtype_to_bigframes_dtype(target_type) + bf_type = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( + target_type, allow_lossless_cast=True + ) storage_type = _get_managed_storage_type(bf_type) if storage_type != array.type: diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 59ccdf1f5f..eab86dc629 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -21,10 +21,15 @@ import string import typing -from typing import Any, Union +from typing import Any, Optional, Union import google.cloud.bigquery -import google.cloud.bigquery.table +import pandas + +from bigframes.core import utils +import bigframes.core.local_data +from bigframes.core.tools import bigquery_schema +import bigframes.session _BQ_TABLE_TYPES = Union[ google.cloud.bigquery.Table, @@ -37,9 +42,51 @@ def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" +def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str: + # Ensure there are no duplicate column labels. + # + # Please make sure this stays in sync with the logic used to_gbq(). See + # bigframes.dataframe.DataFrame._prepare_export(). + new_col_labels, new_idx_labels = utils.get_standardized_ids( + pd_df.columns, pd_df.index.names + ) + pd_copy = pd_df.copy() + pd_copy.columns = pandas.Index(new_col_labels) + pd_copy.index.names = new_idx_labels + + managed_table = bigframes.core.local_data.ManagedArrowTable.from_pandas(pd_copy) + bqschema = managed_table.schema.to_bigquery() + return bigquery_schema.to_sql_dry_run(bqschema) + + +def _pandas_df_to_sql( + df_pd: pandas.DataFrame, + *, + name: str, + session: Optional[bigframes.session.Session] = None, + dry_run: bool = False, +) -> str: + if session is None: + if not dry_run: + message = ( + f"Can't embed pandas DataFrame {name} in a SQL " + "string without a bigframes session except if for a dry run." + ) + raise ValueError(message) + + return _pandas_df_to_sql_dry_run(df_pd) + + # Use the _deferred engine to avoid loading data too often during dry run. 
+ df = session.read_pandas(df_pd, write_engine="_deferred") + return _table_to_sql(df._to_placeholder_table(dry_run=dry_run)) + + def _field_to_template_value( name: str, value: Any, + *, + session: Optional[bigframes.session.Session] = None, + dry_run: bool = False, ) -> str: """Convert value to something embeddable in a SQL string.""" import bigframes.core.sql # Avoid circular imports @@ -51,9 +98,11 @@ def _field_to_template_value( if isinstance(value, table_types): return _table_to_sql(value) - # TODO(tswast): convert pandas DataFrame objects to gbq tables or a literals subquery. + if isinstance(value, pandas.DataFrame): + return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name) + if isinstance(value, bigframes.dataframe.DataFrame): - return _table_to_sql(value._to_view()) + return _table_to_sql(value._to_placeholder_table(dry_run=dry_run)) return bigframes.core.sql.simple_literal(value) @@ -70,6 +119,7 @@ def _validate_type(name: str, value: Any): typing.get_args(_BQ_TABLE_TYPES) + typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES) + (bigframes.dataframe.DataFrame,) + + (pandas.DataFrame,) ) if not isinstance(value, supported_types): @@ -91,6 +141,8 @@ def pyformat( sql_template: str, *, pyformat_args: dict, + session: Optional[bigframes.session.Session] = None, + dry_run: bool = False, ) -> str: """Unsafe Python-style string formatting of SQL string. @@ -115,6 +167,8 @@ def pyformat( format_kwargs = {} for name in fields: value = pyformat_args[name] - format_kwargs[name] = _field_to_template_value(name, value) + format_kwargs[name] = _field_to_template_value( + name, value, session=session, dry_run=dry_run + ) return sql_template.format(**format_kwargs) diff --git a/bigframes/core/tools/bigquery_schema.py b/bigframes/core/tools/bigquery_schema.py new file mode 100644 index 0000000000..227a69e0f7 --- /dev/null +++ b/bigframes/core/tools/bigquery_schema.py @@ -0,0 +1,48 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for working with BigQuery SchemaFields.""" + +from typing import Tuple + +import google.cloud.bigquery + + +def _type_to_sql(field: google.cloud.bigquery.SchemaField): + """Turn the type information of the field into SQL. + + Ignores the mode, since this has already been handled by _field_to_sql. + """ + if field.field_type.casefold() in ("record", "struct"): + return _to_struct(field.fields) + return field.field_type + + +def _field_to_sql(field: google.cloud.bigquery.SchemaField): + if field.mode == "REPEATED": + # Unlike other types, ARRAY are represented as mode="REPEATED". To get + # the array type, we use SchemaField object but ignore the mode. 
+ return f"`{field.name}` ARRAY<{_type_to_sql(field)}>" + + return f"`{field.name}` {_type_to_sql(field)}" + + +def _to_struct(bqschema: Tuple[google.cloud.bigquery.SchemaField, ...]): + fields = [_field_to_sql(field) for field in bqschema] + return f"STRUCT<{', '.join(fields)}>" + + +def to_sql_dry_run(bqschema: Tuple[google.cloud.bigquery.SchemaField, ...]): + """Create an empty table expression with the correct schema.""" + return f"UNNEST(ARRAY<{_to_struct(bqschema)}>[])" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7e5bb3049a..38879d3ec0 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -404,11 +404,13 @@ def _should_sql_have_index(self) -> bool: self.index.name is not None or len(self.index.names) > 1 ) - def _to_view(self) -> bigquery.TableReference: + def _to_placeholder_table(self, dry_run: bool = False) -> bigquery.TableReference: """Compiles this DataFrame's expression tree to SQL and saves it to a - (temporary) view. + (temporary) view or table (in the case of a dry run). """ - return self._block.to_view(include_index=self._should_sql_have_index()) + return self._block.to_placeholder_table( + include_index=self._should_sql_have_index(), dry_run=dry_run + ) def _to_sql_query( self, include_index: bool, enable_cache: bool = True diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index e0c3e39ac9..b0a31595e5 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -444,8 +444,35 @@ def dtype_for_etype(etype: ExpressionType) -> Dtype: if mapping.arrow_dtype is not None } +# Include types that aren't 1:1 to BigQuery but allowed to be loaded in to BigQuery: +_ARROW_TO_BIGFRAMES_LOSSLESS = { + pa.int8(): INT_DTYPE, + pa.int16(): INT_DTYPE, + pa.int32(): INT_DTYPE, + pa.uint8(): INT_DTYPE, + pa.uint16(): INT_DTYPE, + pa.uint32(): INT_DTYPE, + # uint64 is omitted because uint64 -> BigQuery INT64 is a lossy conversion. + pa.float16(): FLOAT_DTYPE, + pa.float32(): FLOAT_DTYPE, + # TODO(tswast): Can we support datetime/timestamp/time with units larger + # than microseconds? +} + + +def arrow_dtype_to_bigframes_dtype( + arrow_dtype: pa.DataType, allow_lossless_cast: bool = False +) -> Dtype: + """ + Convert an arrow type into the pandas-y type used to represent it in BigFrames. + + Args: + arrow_dtype: Arrow data type. + allow_lossless_cast: Allow lossless conversions, such as int32 to int64. + """ + if allow_lossless_cast and arrow_dtype in _ARROW_TO_BIGFRAMES_LOSSLESS: + return _ARROW_TO_BIGFRAMES_LOSSLESS[arrow_dtype] -def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: if arrow_dtype in _ARROW_TO_BIGFRAMES: return _ARROW_TO_BIGFRAMES[arrow_dtype] diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index e1fd7218bd..608eaf5a82 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -14,6 +14,7 @@ from __future__ import annotations +import functools import inspect import threading import typing @@ -51,6 +52,7 @@ import bigframes.enums import bigframes.series import bigframes.session +from bigframes.session import dry_runs import bigframes.session._io.bigquery import bigframes.session.clients @@ -216,6 +218,27 @@ def read_gbq( read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) +def _try_read_gbq_colab_sessionless_dry_run( + create_query: Callable[[], str], +) -> Optional[pandas.Series]: + """Run a dry_run without a session, only if the session hasn't yet started.""" + + global _default_location_lock + + # Avoid creating a session just for dry run. 
We don't want to bind to a + # location too early. This is especially important if the query only refers + # to local data and not any BigQuery tables. + with _default_location_lock: + if not config.options.bigquery._session_started: + bqclient = _get_bqclient() + query = create_query() + job = _dry_run(query, bqclient) + return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ()) + + # Explicitly return None to indicate that we didn't run the dry run query. + return None + + @overload def _read_gbq_colab( # type: ignore[overload-overlap] query_or_table: str, @@ -263,11 +286,30 @@ def _read_gbq_colab( if pyformat_args is None: pyformat_args = {} - query = bigframes.core.pyformat.pyformat( + # Delay formatting the query with the special "session-less" logic. This + # avoids doing unnecessary work if the session already has a location or has + # already started. + create_query = functools.partial( + bigframes.core.pyformat.pyformat, query_or_table, pyformat_args=pyformat_args, + dry_run=True, ) - _set_default_session_location_if_possible(query) + + # Only try to set the global location if it's not a dry run. We don't want + # to bind to a location too early. This is especially important if the query + # only refers to local data and not any BigQuery tables. + if dry_run: + result = _try_read_gbq_colab_sessionless_dry_run(create_query) + + if result is not None: + return result + + # If we made it this far, we must have a session that has already + # started. That means we can safely call the "real" _read_gbq_colab, + # which generates slightly nicer SQL. + else: + _set_default_session_location_if_possible_deferred_query(create_query) return global_session.with_default_session( bigframes.session.Session._read_gbq_colab, @@ -530,7 +572,30 @@ def from_glob_path( _default_location_lock = threading.Lock() +def _get_bqclient() -> bigquery.Client: + clients_provider = bigframes.session.clients.ClientsProvider( + project=config.options.bigquery.project, + location=config.options.bigquery.location, + use_regional_endpoints=config.options.bigquery.use_regional_endpoints, + credentials=config.options.bigquery.credentials, + application_name=config.options.bigquery.application_name, + bq_kms_key_name=config.options.bigquery.kms_key_name, + client_endpoints_override=config.options.bigquery.client_endpoints_override, + requests_transport_adapters=config.options.bigquery.requests_transport_adapters, + ) + return clients_provider.bqclient + + +def _dry_run(query, bqclient) -> bigquery.QueryJob: + job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) + return job + + def _set_default_session_location_if_possible(query): + _set_default_session_location_if_possible_deferred_query(lambda: query) + + +def _set_default_session_location_if_possible_deferred_query(create_query): # Set the location as per the query if this is the first query the user is # running and: # (1) Default session has not started yet, and @@ -549,24 +614,14 @@ def _set_default_session_location_if_possible(query): ): return - clients_provider = bigframes.session.clients.ClientsProvider( - project=config.options.bigquery.project, - location=config.options.bigquery.location, - use_regional_endpoints=config.options.bigquery.use_regional_endpoints, - credentials=config.options.bigquery.credentials, - application_name=config.options.bigquery.application_name, - bq_kms_key_name=config.options.bigquery.kms_key_name, - client_endpoints_override=config.options.bigquery.client_endpoints_override, - 
requests_transport_adapters=config.options.bigquery.requests_transport_adapters, - ) - - bqclient = clients_provider.bqclient + query = create_query() + bqclient = _get_bqclient() if bigframes.session._io.bigquery.is_query(query): # Intentionally run outside of the session so that we can detect the # location before creating the session. Since it's a dry_run, labels # aren't necessary. - job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) + job = _dry_run(query, bqclient) config.options.bigquery.location = job.location else: table = bqclient.get_table(query) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7597f8eeed..c06233bad3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -518,6 +518,8 @@ def _read_gbq_colab( query = bigframes.core.pyformat.pyformat( query, pyformat_args=pyformat_args, + session=self, + dry_run=dry_run, ) return self._loader.read_gbq_query( @@ -1965,9 +1967,17 @@ def _create_object_table(self, path: str, connection: str) -> str: return table def _create_temp_view(self, sql: str) -> bigquery.TableReference: - """Create a random id Object Table from the input path and connection.""" + """Create a random id view from the sql string.""" return self._anon_dataset_manager.create_temp_view(sql) + def _create_temp_table( + self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] + ) -> bigquery.TableReference: + """Allocate a random temporary table with the desired schema.""" + return self._temp_storage_manager.create_temp_table( + schema=schema, cluster_cols=cluster_cols + ) + def from_glob_path( self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None ) -> dataframe.DataFrame: diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 0992a10055..af78117262 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -14,8 +14,10 @@ """System tests for read_gbq_colab helper functions.""" +import numpy import pandas import pandas.testing +import pytest def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): @@ -140,33 +142,163 @@ def test_read_gbq_colab_includes_formatted_scalars(session): ) -def test_read_gbq_colab_includes_formatted_bigframes_dataframe( +@pytest.mark.skipif( + pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x" +) +def test_read_gbq_colab_includes_formatted_dataframes( session, scalars_df_index, scalars_pandas_df_index ): + pd_df = pandas.DataFrame( + { + "rowindex": [0, 1, 2, 3, 4, 5], + "value": [0, 100, 200, 300, 400, 500], + } + ) + + # Make sure we test with some data that is too large to inline as SQL. + pd_df_large = pandas.DataFrame( + { + "rowindex": numpy.arange(100_000), + "large_value": numpy.arange(100_000), + } + ) + pyformat_args = { # Apply some operations to make sure the columns aren't renamed. - "some_dataframe": scalars_df_index[scalars_df_index["int64_col"] > 0].assign( + "bf_df": scalars_df_index[scalars_df_index["int64_col"] > 0].assign( int64_col=scalars_df_index["int64_too"] ), + "pd_df": pd_df, + "pd_df_large": pd_df_large, # This is not a supported type, but ignored if not referenced. 
"some_object": object(), } + sql = """ + SELECT bf_df.int64_col + pd_df.value + pd_df_large.large_value AS int64_col, + COALESCE(bf_df.rowindex, pd_df.rowindex, pd_df_large.rowindex) AS rowindex + FROM {bf_df} AS bf_df + FULL OUTER JOIN {pd_df} AS pd_df + ON bf_df.rowindex = pd_df.rowindex + LEFT JOIN {pd_df_large} AS pd_df_large + ON bf_df.rowindex = pd_df_large.rowindex + ORDER BY rowindex ASC + """ + + # Do the dry run first so that we don't re-use the uploaded data from the + # real query. + dry_run_output = session._read_gbq_colab( + sql, + pyformat_args=pyformat_args, + dry_run=True, + ) + df = session._read_gbq_colab( - """ - SELECT int64_col, rowindex - FROM {some_dataframe} - ORDER BY rowindex ASC - """, + sql, pyformat_args=pyformat_args, ) + + # Confirm that dry_run was accurate. + pandas.testing.assert_series_equal( + pandas.Series(dry_run_output["columnDtypes"]), + df.dtypes, + ) + result = df.to_pandas() expected = ( scalars_pandas_df_index[scalars_pandas_df_index["int64_col"] > 0] .assign(int64_col=scalars_pandas_df_index["int64_too"]) .reset_index(drop=False)[["int64_col", "rowindex"]] + .merge( + pd_df, + on="rowindex", + how="outer", + ) + .merge( + pd_df_large, + on="rowindex", + how="left", + ) + .assign( + int64_col=lambda df: ( + df["int64_col"] + df["value"] + df["large_value"] + ).astype("Int64") + ) + .drop(columns=["value", "large_value"]) + .sort_values(by="rowindex") + .reset_index(drop=True) ) pandas.testing.assert_frame_equal( result, expected, check_index_type=False, # int64 vs Int64 ) + + +@pytest.mark.parametrize( + ("pd_df",), + ( + pytest.param( + pandas.DataFrame( + { + "rowindex": [0, 1, 2, 3, 4, 5], + "value": [0, 100, 200, 300, 400, 500], + "value2": [-1, -2, -3, -4, -5, -6], + } + ), + id="inline-df", + ), + pytest.param( + pandas.DataFrame( + { + # Make sure we test with some data that is too large to + # inline as SQL. + "rowindex": numpy.arange(100_000), + "value": numpy.arange(100_000), + "value2": numpy.arange(100_000), + } + ), + id="large-df", + ), + ), +) +def test_read_gbq_colab_with_formatted_dataframe_deduplicates_column_names_just_like_to_gbq( + session, + pd_df, +): + # Create duplicate column names. + pd_df.columns = ["rowindex", "value", "value"] + + pyformat_args = { + "pd_df": pd_df, + } + sql = """ + SELECT rowindex, value, value_1 + FROM {pd_df} + """ + + # Do the dry run first so that we don't re-use the uploaded data from the + # real query. + dry_run_output = session._read_gbq_colab( + sql, + pyformat_args=pyformat_args, + dry_run=True, + ) + + df = session._read_gbq_colab( + sql, + pyformat_args=pyformat_args, + ) + + # Confirm that dry_run was accurate. + pandas.testing.assert_series_equal( + pandas.Series(dry_run_output["columnDtypes"]), + df.dtypes, + ) + + # Make sure the query doesn't fail. 
+ df.to_pandas_batches() + + # Make sure the + table_id = session.read_pandas(pd_df).to_gbq() + table = session.bqclient.get_table(table_id) + assert [field.name for field in table.schema] == ["rowindex", "value", "value_1"] diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 466f3d6116..05110d8485 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -19,13 +19,21 @@ from __future__ import annotations +import decimal from typing import Any, Dict, List import google.cloud.bigquery import google.cloud.bigquery.table +import pandas import pytest -import bigframes.core.pyformat as pyformat +from bigframes.core import pyformat +from bigframes.testing import mocks + + +@pytest.fixture +def session(): + return mocks.create_bigquery_session() @pytest.mark.parametrize( @@ -48,31 +56,116 @@ def test_parse_fields(sql_template: str, expected: List[str]): assert fields == expected -def test_pyformat_with_unsupported_type_raises_typeerror(): +def test_pyformat_with_unsupported_type_raises_typeerror(session): pyformat_args = {"my_object": object()} sql = "SELECT {my_object}" with pytest.raises(TypeError, match="my_object has unsupported type: "): - pyformat.pyformat(sql, pyformat_args=pyformat_args) + pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) -def test_pyformat_with_missing_variable_raises_keyerror(): +def test_pyformat_with_missing_variable_raises_keyerror(session): pyformat_args: Dict[str, Any] = {} sql = "SELECT {my_object}" with pytest.raises(KeyError, match="my_object"): - pyformat.pyformat(sql, pyformat_args=pyformat_args) + pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) -def test_pyformat_with_no_variables(): +def test_pyformat_with_no_variables(session): pyformat_args: Dict[str, Any] = {} sql = "SELECT '{{escaped curly brackets}}'" expected_sql = "SELECT '{escaped curly brackets}'" - got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args) + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) assert got_sql == expected_sql -def test_pyformat_with_query_string_replaces_variables(): +@pytest.mark.parametrize( + ("df_pd", "expected_struct"), + ( + pytest.param( + pandas.DataFrame(), + "STRUCT<>", + id="empty", + ), + pytest.param( + # Empty columns default to floating point, just like pandas. 
+            pandas.DataFrame({"empty column": []}),
+            "STRUCT<`empty column` FLOAT>",
+            id="empty column",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {
+                    "col1": [1, 2, 3],
+                    "col2": ["a", "b", "c"],
+                    "col3": [
+                        decimal.Decimal(1),
+                        decimal.Decimal(2),
+                        decimal.Decimal(3),
+                    ],
+                }
+            ),
+            "STRUCT<`col1` INTEGER, `col2` STRING, `col3` NUMERIC>",
+            id="scalars",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {"array col": [[1, 2, 3]], "another array": [["a", "b", "c"]]}
+            ),
+            "STRUCT<`array col` ARRAY<INTEGER>, `another array` ARRAY<STRING>>",
+            id="arrays",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {
+                    "struct col": [
+                        {"subfield": {"subsubfield": 1}, "subfield2": 2},
+                    ],
+                }
+            ),
+            "STRUCT<`struct col` STRUCT<`subfield` STRUCT<`subsubfield` INTEGER>, `subfield2` INTEGER>>",
+            id="structs",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {
+                    "array of struct col": [
+                        [{"subfield": {"subsubfield": 1}, "subfield2": 2}],
+                    ],
+                }
+            ),
+            "STRUCT<`array of struct col` ARRAY<STRUCT<`subfield` STRUCT<`subsubfield` INTEGER>, `subfield2` INTEGER>>>",
+            id="array_of_structs",
+        ),
+        pytest.param(
+            pandas.DataFrame({"c1": [1, 2, 3], "c2": ["a", "b", "c"]}).rename(
+                columns={"c1": "c", "c2": "c"}
+            ),
+            "STRUCT<`c` INTEGER, `c_1` STRING>",
+            id="duplicate_column_names",
+        ),
+    ),
+)
+def test_pyformat_with_pandas_dataframe_dry_run_no_session(df_pd, expected_struct):
+    pyformat_args: Dict[str, Any] = {"my_pandas_df": df_pd}
+    sql = "SELECT * FROM {my_pandas_df}"
+    expected_sql = f"SELECT * FROM UNNEST(ARRAY<{expected_struct}>[])"
+    got_sql = pyformat.pyformat(
+        sql, pyformat_args=pyformat_args, dry_run=True, session=None
+    )
+    assert got_sql == expected_sql
+
+
+def test_pyformat_with_pandas_dataframe_not_dry_run_no_session_raises_valueerror():
+    pyformat_args: Dict[str, Any] = {"my_pandas_df": pandas.DataFrame()}
+    sql = "SELECT * FROM {my_pandas_df}"
+
+    with pytest.raises(ValueError, match="my_pandas_df"):
+        pyformat.pyformat(sql, pyformat_args=pyformat_args)
+
+
+def test_pyformat_with_query_string_replaces_variables(session):
     pyformat_args = {
         "my_string": "some string value",
         "max_value": 2.25,
@@ -102,7 +195,7 @@ def test_pyformat_with_query_string_replaces_variables():
         WHERE height < 2.25
     """.strip()
 
-    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args)
+    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
 
     assert got_sql == expected_sql
 
@@ -134,12 +227,12 @@
        ),
    ),
)
-def test_pyformat_with_table_replaces_variables(table, expected_sql):
+def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session):
     pyformat_args = {
         "table": table,
         # Unreferenced values of unsupported type shouldn't cause issues.
         "my_object": object(),
     }
     sql = "SELECT * FROM {table}"
 
-    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args)
+    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
 
     assert got_sql == expected_sql
diff --git a/tests/unit/core/tools/test_bigquery_schema.py b/tests/unit/core/tools/test_bigquery_schema.py
new file mode 100644
index 0000000000..a5b0087801
--- /dev/null
+++ b/tests/unit/core/tools/test_bigquery_schema.py
@@ -0,0 +1,187 @@
+from google.cloud import bigquery
+import pytest
+
+from bigframes.core.tools import bigquery_schema
+
+
+# --- Tests for _type_to_sql ---
+@pytest.mark.parametrize(
+    "field, expected_sql",
+    [
+        # Simple types
+        (bigquery.SchemaField("test_field", "INTEGER"), "INTEGER"),
+        (bigquery.SchemaField("test_field", "STRING"), "STRING"),
+        (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOLEAN"),
+        # RECORD/STRUCT types with nested fields directly
+        (
+            bigquery.SchemaField(
+                "test_field",
+                "RECORD",
+                fields=(bigquery.SchemaField("sub_field", "STRING"),),
+            ),
+            "STRUCT<`sub_field` STRING>",
+        ),
+        (
+            bigquery.SchemaField(
+                "test_field",
+                "STRUCT",
+                fields=(
+                    bigquery.SchemaField("sub_field", "INTEGER"),
+                    bigquery.SchemaField("another", "BOOLEAN"),
+                ),
+            ),
+            "STRUCT<`sub_field` INTEGER, `another` BOOLEAN>",
+        ),
+        # Array is handled by _field_to_sql, instead.
+        (bigquery.SchemaField("test_field", "NUMERIC", mode="REPEATED"), "NUMERIC"),
+        (
+            bigquery.SchemaField(
+                "test_field",
+                "RECORD",
+                mode="REPEATED",
+                fields=(bigquery.SchemaField("sub_field", "STRING"),),
+            ),
+            "STRUCT<`sub_field` STRING>",
+        ),
+    ],
+)
+def test_type_to_sql(field, expected_sql):
+    assert bigquery_schema._type_to_sql(field) == expected_sql
+
+
+# --- Tests for _field_to_sql ---
+@pytest.mark.parametrize(
+    "field, expected_sql",
+    [
+        # Simple field
+        (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INTEGER"),
+        (bigquery.SchemaField("name", "STRING", "NULLABLE"), "`name` STRING"),
+        # Repeated field
+        (bigquery.SchemaField("tags", "STRING", "REPEATED"), "`tags` ARRAY<STRING>"),
+        # Repeated RECORD
+        (
+            bigquery.SchemaField(
+                "addresses",
+                "RECORD",
+                "REPEATED",
+                fields=(
+                    bigquery.SchemaField("street", "STRING"),
+                    bigquery.SchemaField("zip", "INTEGER"),
+                ),
+            ),
+            "`addresses` ARRAY<STRUCT<`street` STRING, `zip` INTEGER>>",
+        ),
+        # Simple STRUCT
+        (
+            bigquery.SchemaField(
+                "person",
+                "STRUCT",
+                "NULLABLE",
+                fields=(
+                    bigquery.SchemaField("age", "INTEGER"),
+                    bigquery.SchemaField("city", "STRING"),
+                ),
+            ),
+            "`person` STRUCT<`age` INTEGER, `city` STRING>",
+        ),
+    ],
+)
+def test_field_to_sql(field, expected_sql):
+    assert bigquery_schema._field_to_sql(field) == expected_sql
+
+
+# --- Tests for _to_struct ---
+@pytest.mark.parametrize(
+    "bqschema, expected_sql",
+    [
+        # Empty schema
+        ((), "STRUCT<>"),
+        # Simple fields
+        (
+            (
+                bigquery.SchemaField("id", "INTEGER"),
+                bigquery.SchemaField("name", "STRING"),
+            ),
+            "STRUCT<`id` INTEGER, `name` STRING>",
+        ),
+        # Nested RECORD/STRUCT
+        (
+            (
+                bigquery.SchemaField("item_id", "INTEGER"),
+                bigquery.SchemaField(
+                    "details",
+                    "RECORD",
+                    "NULLABLE",
+                    fields=(
+                        bigquery.SchemaField("price", "NUMERIC"),
+                        bigquery.SchemaField("currency", "STRING"),
+                    ),
+                ),
+            ),
+            "STRUCT<`item_id` INTEGER, `details` STRUCT<`price` NUMERIC, `currency` STRING>>",
+        ),
+        # Repeated field
+        (
+            (
+                bigquery.SchemaField("user_id", "STRING"),
+                bigquery.SchemaField("emails", "STRING", "REPEATED"),
+            ),
+            "STRUCT<`user_id` STRING, `emails` ARRAY<STRING>>",
+        ),
+        # Mixed types including complex nested repeated
+        (
+            (
+                bigquery.SchemaField("event_name", "STRING"),
+                bigquery.SchemaField(
+                    "participants",
+                    "RECORD",
+                    "REPEATED",
+                    fields=(
+                        bigquery.SchemaField("p_id", "INTEGER"),
+                        bigquery.SchemaField("roles", "STRING", "REPEATED"),
+                    ),
+                ),
+                bigquery.SchemaField("timestamp", "TIMESTAMP"),
+            ),
+            "STRUCT<`event_name` STRING, `participants` ARRAY<STRUCT<`p_id` INTEGER, `roles` ARRAY<STRING>>>, `timestamp` TIMESTAMP>",
+        ),
+    ],
+)
+def test_to_struct(bqschema, expected_sql):
+    assert bigquery_schema._to_struct(bqschema) == expected_sql
+
+
+# --- Tests for to_sql_dry_run ---
+@pytest.mark.parametrize(
+    "bqschema, expected_sql",
+    [
+        # Empty schema
+        ((), "UNNEST(ARRAY<STRUCT<>>[])"),
+        # Simple schema
+        (
+            (
+                bigquery.SchemaField("id", "INTEGER"),
+                bigquery.SchemaField("name", "STRING"),
+            ),
+            "UNNEST(ARRAY<STRUCT<`id` INTEGER, `name` STRING>>[])",
+        ),
+        # Complex schema with nested and repeated fields
+        (
+            (
+                bigquery.SchemaField("order_id", "STRING"),
+                bigquery.SchemaField(
+                    "items",
+                    "RECORD",
+                    "REPEATED",
+                    fields=(
+                        bigquery.SchemaField("item_name", "STRING"),
+                        bigquery.SchemaField("quantity", "INTEGER"),
+                    ),
+                ),
+            ),
+            "UNNEST(ARRAY<STRUCT<`order_id` STRING, `items` ARRAY<STRUCT<`item_name` STRING, `quantity` INTEGER>>>>[])",
+        ),
+    ],
+)
+def test_to_sql_dry_run(bqschema, expected_sql):
+    assert bigquery_schema.to_sql_dry_run(bqschema) == expected_sql
diff --git a/tests/unit/pandas/io/test_api.py b/tests/unit/pandas/io/test_api.py
index fbc9027552..24ef51ad47 100644
--- a/tests/unit/pandas/io/test_api.py
+++ b/tests/unit/pandas/io/test_api.py
@@ -19,7 +19,32 @@
 import bigframes.session
 
 
-@mock.patch("bigframes.pandas.io.api._set_default_session_location_if_possible")
+@mock.patch(
+    "bigframes.pandas.io.api._set_default_session_location_if_possible_deferred_query"
+)
+@mock.patch("bigframes.core.global_session.with_default_session")
+def test_read_gbq_colab_dry_run_doesnt_call_set_location(
+    mock_with_default_session, mock_set_location
+):
+    """
+    Ensure that we don't bind to a location too early. If it's a dry run, the
+    user might not be done typing.
+    """
+    mock_df = mock.create_autospec(bigframes.dataframe.DataFrame)
+    mock_with_default_session.return_value = mock_df
+
+    query_or_table = "SELECT {param1} AS param1"
+    sample_pyformat_args = {"param1": "value1"}
+    bf_io_api._read_gbq_colab(
+        query_or_table, pyformat_args=sample_pyformat_args, dry_run=True
+    )
+
+    mock_set_location.assert_not_called()
+
+
+@mock.patch(
+    "bigframes.pandas.io.api._set_default_session_location_if_possible_deferred_query"
+)
 @mock.patch("bigframes.core.global_session.with_default_session")
 def test_read_gbq_colab_calls_set_location(
     mock_with_default_session, mock_set_location
@@ -36,7 +61,9 @@ def test_read_gbq_colab_calls_set_location(
     # Make sure that we format the SQL first to prevent syntax errors.
formatted_query = "SELECT 'value1' AS param1" - mock_set_location.assert_called_once_with(formatted_query) + mock_set_location.assert_called_once() + args, _ = mock_set_location.call_args + assert formatted_query == args[0]() mock_with_default_session.assert_called_once() # Check the actual arguments passed to with_default_session diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index c4635f85a9..52b091c045 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -15,8 +15,12 @@ """Unit tests for read_gbq_colab helper functions.""" import textwrap +from unittest import mock from google.cloud import bigquery +import numpy +import pandas +import pytest from bigframes.testing import mocks @@ -36,15 +40,29 @@ def test_read_gbq_colab_includes_label(): assert "session-read_gbq_colab" in label_values -def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): - session = mocks.create_bigquery_session() +@pytest.mark.parametrize("dry_run", [True, False]) +def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch, dry_run): + bqclient = mock.create_autospec(bigquery.Client, instance=True) + bqclient.project = "proj" + session = mocks.create_bigquery_session(bqclient=bqclient) bf_df = mocks.create_dataframe(monkeypatch, session=session) - bf_df._to_view = lambda: bigquery.TableReference.from_string("my-project.my_dataset.some_view") # type: ignore + session._create_temp_table = mock.Mock( # type: ignore + return_value=bigquery.TableReference.from_string("proj.dset.temp_table") + ) + session._create_temp_view = mock.Mock( # type: ignore + return_value=bigquery.TableReference.from_string("proj.dset.temp_view") + ) + + # To avoid trouble with get_table() calls getting out of sync with mock + # "uploaded" data, make sure this is small enough to inline in the SQL as a + # view. + pd_df = pandas.DataFrame({"rowindex": numpy.arange(3), "value": numpy.arange(3)}) pyformat_args = { "some_integer": 123, "some_string": "This could be dangerous, but we escape it", "bf_df": bf_df, + "pd_df": pd_df, # This is not a supported type, but ignored if not referenced. "some_object": object(), } @@ -55,30 +73,35 @@ def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): SELECT {some_integer} as some_integer, {some_string} as some_string, '{{escaped}}' as escaped - FROM {bf_df} + FROM {bf_df} AS bf_df + FULL OUTER JOIN {pd_df} AS pd_df + ON bf_df.rowindex = pd_df.rowindex """ ), pyformat_args=pyformat_args, - dry_run=True, + dry_run=dry_run, ) expected = textwrap.dedent( - """ + f""" SELECT 123 as some_integer, 'This could be dangerous, but we escape it' as some_string, - '{escaped}' as escaped - FROM `my-project`.`my_dataset`.`some_view` + '{{escaped}}' as escaped + FROM `proj`.`dset`.`temp_{"table" if dry_run else "view"}` AS bf_df + FULL OUTER JOIN `proj`.`dset`.`temp_{"table" if dry_run else "view"}` AS pd_df + ON bf_df.rowindex = pd_df.rowindex """ ) - queries = session._queries # type: ignore - configs = session._job_configs # type: ignore - for query, config in zip(queries, configs): - if config is None: - continue - if config.dry_run: - break + # This should be the most recent query. + query = session._queries[-1] # type: ignore + config = session._job_configs[-1] # type: ignore + + if dry_run: + assert config.dry_run + else: + # Allow for any "False-y" value. 
+ assert not config.dry_run - assert config.dry_run assert query.strip() == expected.strip() diff --git a/tests/unit/test_local_data.py b/tests/unit/test_local_data.py index 71479e89d4..dfd1cd622f 100644 --- a/tests/unit/test_local_data.py +++ b/tests/unit/test_local_data.py @@ -46,6 +46,72 @@ def test_local_data_well_formed_round_trip(): pandas.testing.assert_frame_equal(pd_data_normalized, result, check_dtype=False) +def test_local_data_small_sizes_round_trip(): + pyarrow_version = int(pa.__version__.split(".")[0]) + + int8s = [126, 127, -127, -128, 0, 1, -1] + uint8s = [254, 255, 1, 0, 128, 129, 127] + int16s = [32766, 32767, -32766, -32767, 0, 1, -1] + uint16s = [65534, 65535, 1, 0, 32768, 32769, 32767] + int32s = [2**31 - 2, 2**31 - 1, -(2**31) + 1, -(2**31), 0, 1, -1] + uint32s = [2**32 - 2, 2**32 - 1, 1, 0, 2**31, 2**31 + 1, 2**31 - 1] + float16s = [ + # Test some edge cases from: + # https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations + float.fromhex("0x1.0p-24"), # (2 ** -24).hex() + float.fromhex("-0x1.0p-24"), + float.fromhex("0x1.ffcp-13"), # ((2 ** -12) - (2 ** -23)).hex() + float.fromhex("-0x1.ffcp-13"), + 0, + float.fromhex("0x1.ffcp+14"), # (32768.0 - 16).hex() + float.fromhex("-0x1.ffcp+14"), + ] + float32s = [ + # Test some edge cases from: + # https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Notable_single-precision_cases + # and + # https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Precision_limitations_on_decimal_values_(between_1_and_16777216) + float.fromhex("0x1.0p-149"), # (2 ** -149).hex() + float.fromhex("-0x1.0p-149"), # (2 ** -149).hex() + float.fromhex("0x1.fffffep-1"), # (1.0 - (2 ** -24)).hex() + float.fromhex("-0x1.fffffep-1"), + 0, + float.fromhex("0x1.fffffcp-127"), # ((2 ** -126) * (1 - 2 ** -23)).hex() + float.fromhex("-0x1.fffffcp-127"), # ((2 ** -126) * (1 - 2 ** -23)).hex() + ] + small_data = { + "int8": pd.Series(int8s, dtype=pd.Int8Dtype()), + "int16": pd.Series(int16s, dtype=pd.Int16Dtype()), + "int32": pd.Series(int32s, dtype=pd.Int32Dtype()), + "uint8": pd.Series(uint8s, dtype=pd.UInt8Dtype()), + "uint16": pd.Series(uint16s, dtype=pd.UInt16Dtype()), + "uint32": pd.Series(uint32s, dtype=pd.UInt32Dtype()), + "float32": pd.Series(float32s, dtype="float32"), + } + expected_data = { + "int8": pd.Series(int8s, dtype=pd.Int64Dtype()), + "int16": pd.Series(int16s, dtype=pd.Int64Dtype()), + "int32": pd.Series(int32s, dtype=pd.Int64Dtype()), + "uint8": pd.Series(uint8s, dtype=pd.Int64Dtype()), + "uint16": pd.Series(uint16s, dtype=pd.Int64Dtype()), + "uint32": pd.Series(uint32s, dtype=pd.Int64Dtype()), + "float32": pd.Series(float32s, dtype=pd.Float64Dtype()), + } + + # Casting from float16 added in version 16. 
+ # https://arrow.apache.org/blog/2024/04/20/16.0.0-release/#:~:text=Enhancements,New%20Features + if pyarrow_version >= 16: + small_data["float16"] = pd.Series(float16s, dtype="float16") + expected_data["float16"] = pd.Series(float16s, dtype=pd.Float64Dtype()) + + small_pd = pd.DataFrame(small_data) + local_entry = local_data.ManagedArrowTable.from_pandas(small_pd) + result = pd.DataFrame(local_entry.itertuples(), columns=small_pd.columns) + + expected = pd.DataFrame(expected_data) + pandas.testing.assert_frame_equal(expected, result, check_dtype=False) + + def test_local_data_well_formed_round_trip_chunked(): pa_table = pa.Table.from_pandas(pd_data, preserve_index=False) as_rechunked_pyarrow = pa.Table.from_batches(pa_table.to_batches(max_chunksize=2)) From 019051e453d81769891aa398475ebd04d1826e81 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 16 Jun 2025 11:53:08 -0700 Subject: [PATCH 18/23] feat: Add bbq.json_value_array and deprecate bbq.json_extract_string_array (#1818) This commit introduces the `bbq.json_value_array` method, which provides similar functionality to `JSON_VALUE_ARRAY` in BigQuery Standard SQL. The `bbq.json_extract_string_array` method has been marked as deprecated and will be removed in a future version. You should migrate to `bbq.json_value_array` for equivalent functionality. Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/bigquery/__init__.py | 2 + bigframes/bigquery/_operations/json.py | 66 +++++++++++++++++++- bigframes/core/compile/scalar_op_compiler.py | 12 ++++ bigframes/operations/__init__.py | 2 + bigframes/operations/json_ops.py | 17 +++++ tests/system/small/bigquery/test_json.py | 52 ++++++++++++++- 6 files changed, 149 insertions(+), 2 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index cdc3718893..7ca7fb693b 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -43,6 +43,7 @@ json_query_array, json_set, json_value, + json_value_array, parse_json, ) from bigframes.bigquery._operations.search import create_vector_index, vector_search @@ -71,6 +72,7 @@ "json_query_array", "json_set", "json_value", + "json_value_array", "parse_json", # search ops "create_vector_index", diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 00d230d684..7ad7855dba 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -196,6 +196,10 @@ def json_extract_string_array( values in the array. This function uses single quotes and brackets to escape invalid JSONPath characters in JSON keys. + .. deprecated:: 2.6.0 + The ``json_extract_string_array`` is deprecated and will be removed in a future version. + Use ``json_value_array`` instead. + **Examples:** >>> import bigframes.pandas as bpd @@ -233,6 +237,11 @@ def json_extract_string_array( Returns: bigframes.series.Series: A new Series with the parsed arrays from the input. """ + msg = ( + "The `json_extract_string_array` is deprecated and will be removed in a future version. " + "Use `json_value_array` instead." + ) + warnings.warn(bfe.format_message(msg), category=UserWarning) array_series = input._apply_unary_op( ops.JSONExtractStringArray(json_path=json_path) ) @@ -334,7 +343,7 @@ def json_query_array( def json_value( input: series.Series, - json_path: str, + json_path: str = "$", ) -> series.Series: """Extracts a JSON scalar value and converts it to a SQL ``STRING`` value. 
In addtion, this function: @@ -366,6 +375,61 @@ def json_value( return input._apply_unary_op(ops.JSONValue(json_path=json_path)) +def json_value_array( + input: series.Series, + json_path: str = "$", +) -> series.Series: + """ + Extracts a JSON array of scalar values and converts it to a SQL ``ARRAY`` + value. In addition, this function: + + - Removes the outermost quotes and unescapes the values. + - Returns a SQL ``NULL`` if the selected value isn't an array or not an array + containing only scalar values. + - Uses double quotes to escape invalid ``JSON_PATH`` characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_value_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": ["apples", "oranges", "grapes"]', + ... '{"fruits": ["guava", "grapes"]}' + ... ]) + >>> bbq.json_value_array(s, "$.fruits") + 0 ['apples' 'oranges' 'grapes'] + 1 ['guava' 'grapes'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}', + ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}' + ... ]) + >>> bbq.json_value_array(s, "$.fruits.names") + 0 ['apple' 'cherry'] + 1 ['guava' 'grapes'] + dtype: list[pyarrow] + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the parsed arrays from the input. + """ + return input._apply_unary_op(ops.JSONValueArray(json_path=json_path)) + + @utils.preview(name="The JSON-related API `parse_json`") def parse_json( input: series.Series, diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index b819b1c4e2..075089bb7a 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1448,6 +1448,11 @@ def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue): return json_value(json_obj=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONValueArray, pass_op=True) +def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray): + return json_value_array(json_obj=x, json_path=op.json_path) + + # Blob Ops @scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op) def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value): @@ -2157,6 +2162,13 @@ def json_value( # type: ignore[empty-body] """Retrieve value of a JSON field as plain STRING.""" +@ibis_udf.scalar.builtin(name="json_value_array") +def json_value_array( # type: ignore[empty-body] + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String +) -> ibis_dtypes.Array[ibis_dtypes.String]: + """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.""" + + @ibis_udf.scalar.builtin(name="INT64") def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] """Converts a JSON number to a SQL INT64 value.""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 291bf17fa5..86098d47cf 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -112,6 +112,7 @@ JSONQueryArray, JSONSet, JSONValue, + JSONValueArray, ParseJSON, ToJSONString, ) @@ -363,6 +364,7 @@ 
"JSONQueryArray", "JSONSet", "JSONValue", + "JSONValueArray", "ParseJSON", "ToJSONString", # Bool ops diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 95a47dcadb..81f00c39ce 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -153,6 +153,23 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE +@dataclasses.dataclass(frozen=True) +class JSONValueArray(base_ops.UnaryOp): + name: typing.ClassVar[str] = "json_value_array" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." + + f" Received type: {input_type}" + ) + return pd.ArrowDtype( + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)) + ) + + @dataclasses.dataclass(frozen=True) class JSONQuery(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_query" diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 4ad16d6cc8..4ecbd01318 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -186,7 +186,10 @@ def test_json_extract_array_w_invalid_series_type(): def test_json_extract_string_array_from_json_strings(): s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) - actual = bbq.json_extract_string_array(s, "$.a") + with pytest.warns( + UserWarning, match="The `json_extract_string_array` is deprecated" + ): + actual = bbq.json_extract_string_array(s, "$.a") expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]]) pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) @@ -214,6 +217,53 @@ def test_json_extract_string_array_w_invalid_series_type(): bbq.json_extract_string_array(s) +def test_json_value_array_from_json_strings(): + s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) + actual = bbq.json_value_array(s, "$.a") + expected_data = [["ab", "2", "3 xy"], [], ["4", "5"]] + # Expected dtype after JSON_VALUE_ARRAY is ARRAY + expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string()))) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_value_array_from_array_strings(): + s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) + actual = bbq.json_value_array(s) + expected_data = [["1", "2", "3"], [], ["4", "5"]] + expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string()))) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_value_array_w_invalid_series_type(): + s = bpd.Series([1, 2], dtype=dtypes.INT_DTYPE) # Not a JSON-like string + with pytest.raises(TypeError): + bbq.json_value_array(s) + + +def test_json_value_array_from_json_native(): + json_data = [ + '{"key": ["hello", "world"]}', + '{"key": ["123", "45.6"]}', + '{"key": []}', + "{}", # case with missing key + ] + s = bpd.Series(json_data, dtype=dtypes.JSON_DTYPE) + actual = bbq.json_value_array(s, json_path="$.key") + + expected_data_pandas = [["hello", "world"], ["123", "45.6"], [], None] + expected = bpd.Series( + expected_data_pandas, dtype=pd.ArrowDtype(pa.list_(pa.string())) + ).fillna(pd.NA) + result_pd = actual.to_pandas().fillna(pd.NA) + pd.testing.assert_series_equal(result_pd, expected.to_pandas()) + + def test_json_query_from_json(): s = bpd.Series( ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": 
{"b": 0}}'], From aa323694e161f558bc5e60490c2f21008961e2ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 16 Jun 2025 14:29:55 -0500 Subject: [PATCH 19/23] docs: use pandas API instead of pandas-like or pandas-compatible (#1825) --- README.rst | 3 ++- notebooks/getting_started/bq_dataframes_template.ipynb | 3 ++- third_party/bigframes_vendored/pandas/io/gbq.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9288f2e6a5..36d3c2ca20 100644 --- a/README.rst +++ b/README.rst @@ -6,7 +6,8 @@ BigQuery DataFrames (BigFrames) BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine. -* ``bigframes.pandas`` provides a pandas-compatible API for analytics. +* `bigframes.pandas` provides a pandas API for analytics. Many workloads can be + migrated from pandas to bigframes by just changing a few imports. * ``bigframes.ml`` provides a scikit-learn-like API for ML. BigQuery DataFrames is an open-source package. diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 68c5e9f74d..ae772d035e 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -81,7 +81,8 @@ "\n", "BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine.\n", "\n", - "* `bigframes.pandas` provides a pandas-like API for analytics.\n", + "* `bigframes.pandas` provides a pandas API for analytics. Many workloads can be\n", + " migrated from pandas to bigframes by just changing a few imports.\n", "* `bigframes.ml` provides a scikit-learn-like API for ML.\n", "* `bigframes.ml.llm` provides API for large language models including Gemini.\n", "\n", diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index a0d4092571..3dae2b6bbe 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -45,7 +45,7 @@ def read_gbq( * (Recommended) Set the ``index_col`` argument to one or more columns. Unique values for the row labels are recommended. Duplicate labels are possible, but note that joins on a non-unique index can duplicate - rows via pandas-like outer join behavior. + rows via pandas-compatible outer join behavior. .. 
note:: By default, even SQL query inputs with an ORDER BY clause create a From 72076c76a6ebc3efe59834d39861fdd37dbbdcab Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 16 Jun 2025 13:23:51 -0700 Subject: [PATCH 20/23] chore: compile concat nodes by sqlglot (#1824) * chore: compile concat node * chore: compile concat nodes by sqlglot --- bigframes/core/compile/sqlglot/compiler.py | 11 ++ bigframes/core/compile/sqlglot/sqlglot_ir.py | 53 ++++++++- .../test_compile_concat/out.sql | 107 ++++++++++++++++++ .../test_compile_projection/out.sql | 1 + .../compile/sqlglot/test_compile_concat.py | 32 ++++++ 5 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_concat.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index ebe2a64699..d2b796b0aa 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -190,6 +190,17 @@ def compile_projection( ) return child.project(projected_cols) + @_compile_node.register + def compile_concat( + self, node: nodes.ConcatNode, *children: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + output_ids = [id.sql for id in node.output_ids] + return ir.SQLGlotIR.from_union( + [child.expr for child in children], + output_ids=output_ids, + uid_gen=self.uid_gen, + ) + def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 95e4f90118..43bdc6b06b 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -149,6 +149,57 @@ def from_query_string( select_expr.set("with", sge.With(expressions=[cte])) return cls(expr=select_expr, uid_gen=uid_gen) + @classmethod + def from_union( + cls, + selects: typing.Sequence[sge.Select], + output_ids: typing.Sequence[str], + uid_gen: guid.SequentialUIDGenerator, + ) -> SQLGlotIR: + """Builds SQLGlot expression by union of multiple select expressions.""" + assert ( + len(list(selects)) >= 2 + ), f"At least two select expressions must be provided, but got {selects}." 
+ + existing_ctes: list[sge.CTE] = [] + union_selects: list[sge.Select] = [] + for select in selects: + assert isinstance( + select, sge.Select + ), f"All provided expressions must be of type sge.Select, but got {type(select)}" + + select_expr = select.copy() + existing_ctes = [*existing_ctes, *select_expr.args.pop("with", [])] + + new_cte_name = sge.to_identifier( + next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted + ) + new_cte = sge.CTE( + this=select_expr, + alias=new_cte_name, + ) + existing_ctes = [*existing_ctes, new_cte] + + selections = [ + sge.Alias( + this=expr.alias_or_name, + alias=sge.to_identifier(output_id, quoted=cls.quoted), + ) + for expr, output_id in zip(select_expr.expressions, output_ids) + ] + union_selects.append( + sge.Select().select(*selections).from_(sge.Table(this=new_cte_name)) + ) + + union_expr = sg.union( + *union_selects, + distinct=False, + copy=False, + ) + final_select_expr = sge.Select().select(sge.Star()).from_(union_expr.subquery()) + final_select_expr.set("with", sge.With(expressions=existing_ctes)) + return cls(expr=final_select_expr, uid_gen=uid_gen) + def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], @@ -181,7 +232,7 @@ def project( ) for id, expr in projected_cols ] - new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=False) + new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=True) return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def insert( diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql new file mode 100644 index 0000000000..4b6b2617ac --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -0,0 +1,107 @@ +WITH `bfcte_1` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)]) +), `bfcte_3` AS ( + SELECT + `bfcol_0` AS `bfcol_5`, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_3` AS `bfcol_8`, + `bfcol_4` AS `bfcol_9` + FROM `bfcte_1` +), `bfcte_5` AS ( + SELECT + *, + `bfcol_9` AS `bfcol_10` + FROM `bfcte_3` +), `bfcte_7` AS ( + SELECT + `bfcol_5` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14`, + `bfcol_10` AS `bfcol_15` + FROM `bfcte_5` +), `bfcte_9` AS ( + SELECT + *, + 0 AS `bfcol_16` + FROM `bfcte_7` +), `bfcte_10` AS ( + SELECT + `bfcol_11` AS `bfcol_17`, + `bfcol_12` AS `bfcol_18`, + `bfcol_13` AS `bfcol_19`, + `bfcol_14` AS `bfcol_20`, + `bfcol_16` AS `bfcol_21`, + `bfcol_15` AS `bfcol_22` + FROM `bfcte_9` +), `bfcte_0` AS ( + SELECT + * + FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! 
', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)]) +), `bfcte_2` AS ( + SELECT + `bfcol_23` AS `bfcol_28`, + `bfcol_25` AS `bfcol_29`, + `bfcol_24` AS `bfcol_30`, + `bfcol_26` AS `bfcol_31`, + `bfcol_27` AS `bfcol_32` + FROM `bfcte_0` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_32` AS `bfcol_33` + FROM `bfcte_2` +), `bfcte_6` AS ( + SELECT + `bfcol_28` AS `bfcol_34`, + `bfcol_29` AS `bfcol_35`, + `bfcol_30` AS `bfcol_36`, + `bfcol_31` AS `bfcol_37`, + `bfcol_33` AS `bfcol_38` + FROM `bfcte_4` +), `bfcte_8` AS ( + SELECT + *, + 1 AS `bfcol_39` + FROM `bfcte_6` +), `bfcte_11` AS ( + SELECT + `bfcol_34` AS `bfcol_40`, + `bfcol_35` AS `bfcol_41`, + `bfcol_36` AS `bfcol_42`, + `bfcol_37` AS `bfcol_43`, + `bfcol_39` AS `bfcol_44`, + `bfcol_38` AS `bfcol_45` + FROM `bfcte_8` +), `bfcte_12` AS ( + SELECT + * + FROM ( + SELECT + bfcol_17 AS `bfcol_46`, + bfcol_18 AS `bfcol_47`, + bfcol_19 AS `bfcol_48`, + bfcol_20 AS `bfcol_49`, + bfcol_21 AS `bfcol_50`, + bfcol_22 AS `bfcol_51` + FROM `bfcte_10` + UNION ALL + SELECT + bfcol_40 AS `bfcol_46`, + bfcol_41 AS `bfcol_47`, + bfcol_42 AS `bfcol_48`, + bfcol_43 AS `bfcol_49`, + bfcol_44 AS `bfcol_50`, + bfcol_45 AS `bfcol_51` + FROM `bfcte_11` + ) +) +SELECT + `bfcol_46` AS `rowindex`, + `bfcol_47` AS `rowindex_1`, + `bfcol_48` AS `int64_col`, + `bfcol_49` AS `string_col` +FROM `bfcte_12` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index 8a24b01a25..db470e3ba3 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -8,6 +8,7 @@ WITH `bfcte_0` AS ( FROM `test-project`.`test_dataset`.`test_table` ), `bfcte_1` AS ( SELECT + *, `bfcol_0` AS `bfcol_5`, `bfcol_2` AS `bfcol_6`, `bfcol_3` AS `bfcol_7`, diff --git a/tests/unit/core/compile/sqlglot/test_compile_concat.py b/tests/unit/core/compile/sqlglot/test_compile_concat.py new file mode 100644 index 0000000000..ec7e83a4b0 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_concat.py @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +import bigframes +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +def test_compile_concat( + scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot +): + # TODO: concat two same dataframes, which SQL does not get reused. + # TODO: concat dataframes from a gbq table but trigger a windows compiler. 
+ df1 = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + df1 = df1[["rowindex", "int64_col", "string_col"]] + concat_df = bpd.concat([df1, df1]) + snapshot.assert_match(concat_df.sql, "out.sql") From 3abc02e893e3f1a7a5f463c84594f31312680772 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 16 Jun 2025 13:52:04 -0700 Subject: [PATCH 21/23] chore: enable order_by and limit for new compiler (#1815) * chore: enable order_by and limit for new compiler * fix tests after merge main --- bigframes/core/compile/sqlglot/compiler.py | 20 ++++++- bigframes/core/compile/sqlglot/sqlglot_ir.py | 46 ++++++++++++---- .../test_compile_concat/out.sql | 5 +- .../test_compile_projection/out.sql | 20 ++++--- .../test_compile_readlocal/out.sql | 53 +++++++++++++------ .../test_compile_readlocal_w_json_df/out.sql | 11 +++- .../test_compile_readlocal_w_lists_df/out.sql | 32 +++++++---- .../out.sql | 14 +++-- .../test_compile_readtable_w_limit/out.sql | 24 +++++++++ .../test_compile_readtable_w_ordering/out.sql | 40 ++++++++++++++ .../compile/sqlglot/test_compile_readtable.py | 12 +++++ 11 files changed, 228 insertions(+), 49 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index d2b796b0aa..68b572f911 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -125,9 +125,25 @@ def _compile_result_node(self, root: nodes.ResultNode) -> str: (name, scalar_compiler.compile_scalar_expression(ref)) for ref, name in root.output_cols ) - sqlglot_ir = sqlglot_ir.select(selected_cols) + # Skip squashing selections to ensure the right ordering and limit keys + sqlglot_ir = sqlglot_ir.select(selected_cols, squash_selections=False) + + if root.order_by is not None: + ordering_cols = tuple( + sge.Ordered( + this=scalar_compiler.compile_scalar_expression( + ordering.scalar_expression + ), + desc=ordering.direction.is_ascending is False, + nulls_first=ordering.na_last is False, + ) + for ordering in root.order_by.all_ordering_columns + ) + sqlglot_ir = sqlglot_ir.order_by(ordering_cols) + + if root.limit is not None: + sqlglot_ir = sqlglot_ir.limit(root.limit) - # TODO: add order_by, limit to sqlglot_expr return sqlglot_ir.sql @functools.lru_cache(maxsize=5000) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 43bdc6b06b..77ee0ccb78 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -28,7 +28,7 @@ from bigframes.core import guid import bigframes.core.compile.sqlglot.sqlglot_types as sgt import bigframes.core.local_data as local_data -import bigframes.core.schema as schemata +import bigframes.core.schema as bf_schema # shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. 
try: @@ -67,7 +67,7 @@ def sql(self) -> str: def from_pyarrow( cls, pa_table: pa.Table, - schema: schemata.ArraySchema, + schema: bf_schema.ArraySchema, uid_gen: guid.SequentialUIDGenerator, ) -> SQLGlotIR: """Builds SQLGlot expression from pyarrow table.""" @@ -203,6 +203,7 @@ def from_union( def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], + squash_selections: bool = True, ) -> SQLGlotIR: selections = [ sge.Alias( @@ -211,15 +212,39 @@ def select( ) for id, expr in selected_cols ] - # Attempts to simplify selected columns when the original and new column - # names are simply aliases of each other. - squashed_selections = _squash_selections(self.expr.expressions, selections) - if squashed_selections != []: - new_expr = self.expr.select(*squashed_selections, append=False) - return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + # If squashing is enabled, we try to simplify the selections + # by checking if the new selections are simply aliases of the + # original columns. + if squash_selections: + new_selections = _squash_selections(self.expr.expressions, selections) + if new_selections != []: + new_expr = self.expr.select(*new_selections, append=False) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + new_expr = self._encapsulate_as_cte().select(*selections, append=False) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + def order_by( + self, + ordering: tuple[sge.Ordered, ...], + ) -> SQLGlotIR: + """Adds ORDER BY clause to the query.""" + if len(ordering) == 0: + return SQLGlotIR(expr=self.expr.copy(), uid_gen=self.uid_gen) + new_expr = self.expr.order_by(*ordering) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + def limit( + self, + limit: int | None, + ) -> SQLGlotIR: + """Adds LIMIT clause to the query.""" + if limit is not None: + new_expr = self.expr.limit(limit) else: - new_expr = self._encapsulate_as_cte().select(*selections, append=False) - return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + new_expr = self.expr.copy() + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) def project( self, @@ -342,6 +367,7 @@ def _squash_selections( old_expr: list[sge.Expression], new_expr: list[sge.Alias] ) -> list[sge.Alias]: """ + TODO: Reanble this function to optimize the SQL. Simplifies the select column expressions if existing (old_expr) and new (new_expr) selected columns are both simple aliases of column definitions. 
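For readers following the new SQLGlot compiler: below is a minimal sketch of the CTE-plus-UNION ALL shape that `from_union` assembles, using only sqlglot calls that appear in the hunk above (`sg.union`, `.subquery()`, `sge.Star()`). The table names `t1`/`t2` and column `a` are placeholders for illustration, not identifiers from this change.

    import sqlglot as sg
    import sqlglot.expressions as sge

    # Two stand-in child selects; the compiler derives these from the concatenated children.
    left = sg.select("a").from_("t1")
    right = sg.select("a").from_("t2")

    # distinct=False yields UNION ALL, matching concat semantics.
    union_expr = sg.union(left, right, distinct=False)
    final = sge.Select().select(sge.Star()).from_(union_expr.subquery())
    print(final.sql(dialect="bigquery"))
    # Roughly: SELECT * FROM (SELECT a FROM t1 UNION ALL SELECT a FROM t2)

In the actual implementation each child select is additionally hoisted into its own named CTE and its output columns are re-aliased to shared `output_ids`, which is what produces the `bfcte_*`/`bfcol_*` names seen in the snapshot SQL that follows.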
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql index 4b6b2617ac..855e5874c2 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -104,4 +104,7 @@ SELECT `bfcol_47` AS `rowindex_1`, `bfcol_48` AS `int64_col`, `bfcol_49` AS `string_col` -FROM `bfcte_12` \ No newline at end of file +FROM `bfcte_12` +ORDER BY + `bfcol_50` ASC NULLS LAST, + `bfcol_51` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index db470e3ba3..2804925b2d 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -15,11 +15,19 @@ WITH `bfcte_0` AS ( `bfcol_4` AS `bfcol_8`, `bfcol_1` + 1 AS `bfcol_9` FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + `bfcol_5` AS `bfcol_10`, + `bfcol_9` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14` + FROM `bfcte_1` ) SELECT - `bfcol_5` AS `rowindex`, - `bfcol_9` AS `int64_col`, - `bfcol_6` AS `string_col`, - `bfcol_7` AS `float64_col`, - `bfcol_8` AS `bool_col` -FROM `bfcte_1` \ No newline at end of file + `bfcol_10` AS `rowindex`, + `bfcol_11` AS `int64_col`, + `bfcol_12` AS `string_col`, + `bfcol_13` AS `float64_col`, + `bfcol_14` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index a34f3526d6..89c51b346d 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -155,21 +155,42 @@ WITH `bfcte_0` AS ( CAST(NULL AS TIMESTAMP), 8 )]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_16`, + `bfcol_1` AS `bfcol_17`, + `bfcol_2` AS `bfcol_18`, + `bfcol_3` AS `bfcol_19`, + `bfcol_4` AS `bfcol_20`, + `bfcol_5` AS `bfcol_21`, + `bfcol_6` AS `bfcol_22`, + `bfcol_7` AS `bfcol_23`, + `bfcol_8` AS `bfcol_24`, + `bfcol_9` AS `bfcol_25`, + `bfcol_10` AS `bfcol_26`, + `bfcol_11` AS `bfcol_27`, + `bfcol_12` AS `bfcol_28`, + `bfcol_13` AS `bfcol_29`, + `bfcol_14` AS `bfcol_30`, + `bfcol_15` AS `bfcol_31` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_1` AS `bool_col`, - `bfcol_2` AS `bytes_col`, - `bfcol_3` AS `date_col`, - `bfcol_4` AS `datetime_col`, - `bfcol_5` AS `geography_col`, - `bfcol_6` AS `int64_col`, - `bfcol_7` AS `int64_too`, - `bfcol_8` AS `numeric_col`, - `bfcol_9` AS `float64_col`, - `bfcol_10` AS `rowindex_1`, - `bfcol_11` AS `rowindex_2`, - `bfcol_12` AS `string_col`, - `bfcol_13` AS `time_col`, - `bfcol_14` AS `timestamp_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_16` AS `rowindex`, + `bfcol_17` AS `bool_col`, + `bfcol_18` AS `bytes_col`, + `bfcol_19` AS `date_col`, + `bfcol_20` AS `datetime_col`, + `bfcol_21` AS `geography_col`, + `bfcol_22` AS `int64_col`, + `bfcol_23` AS `int64_too`, + `bfcol_24` 
AS `numeric_col`, + `bfcol_25` AS `float64_col`, + `bfcol_26` AS `rowindex_1`, + `bfcol_27` AS `rowindex_2`, + `bfcol_28` AS `string_col`, + `bfcol_29` AS `time_col`, + `bfcol_30` AS `timestamp_col` +FROM `bfcte_1` +ORDER BY + `bfcol_31` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql index 31b46e6c70..76cbde7c64 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -2,7 +2,14 @@ WITH `bfcte_0` AS ( SELECT * FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_2`, + `bfcol_1` AS `bfcol_3` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `json_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_2` AS `json_col` +FROM `bfcte_1` +ORDER BY + `bfcol_3` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql index 1ba602f205..6363739d9d 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql @@ -32,14 +32,28 @@ WITH `bfcte_0` AS ( ['', 'a'], 2 )]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_9`, + `bfcol_1` AS `bfcol_10`, + `bfcol_2` AS `bfcol_11`, + `bfcol_3` AS `bfcol_12`, + `bfcol_4` AS `bfcol_13`, + `bfcol_5` AS `bfcol_14`, + `bfcol_6` AS `bfcol_15`, + `bfcol_7` AS `bfcol_16`, + `bfcol_8` AS `bfcol_17` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_1` AS `int_list_col`, - `bfcol_2` AS `bool_list_col`, - `bfcol_3` AS `float_list_col`, - `bfcol_4` AS `date_list_col`, - `bfcol_5` AS `date_time_list_col`, - `bfcol_6` AS `numeric_list_col`, - `bfcol_7` AS `string_list_col` -FROM `bfcte_0` \ No newline at end of file + `bfcol_9` AS `rowindex`, + `bfcol_10` AS `int_list_col`, + `bfcol_11` AS `bool_list_col`, + `bfcol_12` AS `float_list_col`, + `bfcol_13` AS `date_list_col`, + `bfcol_14` AS `date_time_list_col`, + `bfcol_15` AS `numeric_list_col`, + `bfcol_16` AS `string_list_col` +FROM `bfcte_1` +ORDER BY + `bfcol_17` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql index 54d1a1bb2b..af7206b759 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql +++ 
b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -18,8 +18,16 @@ WITH `bfcte_0` AS ( ), 1 )]) +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_3`, + `bfcol_1` AS `bfcol_4`, + `bfcol_2` AS `bfcol_5` + FROM `bfcte_0` ) SELECT - `bfcol_0` AS `id`, - `bfcol_1` AS `person` -FROM `bfcte_0` \ No newline at end of file + `bfcol_3` AS `id`, + `bfcol_4` AS `person` +FROM `bfcte_1` +ORDER BY + `bfcol_5` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql new file mode 100644 index 0000000000..837b805ca4 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql @@ -0,0 +1,24 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int64_col`, + `bfcol_2` AS `string_col`, + `bfcol_3` AS `float64_col`, + `bfcol_4` AS `bool_col` +FROM `bfcte_1` +ORDER BY + `bfcol_5` ASC NULLS LAST +LIMIT 10 \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql new file mode 100644 index 0000000000..9376691572 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql @@ -0,0 +1,40 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + `bfcol_0` AS `bfcol_5`, + `bfcol_1` AS `bfcol_6`, + `bfcol_2` AS `bfcol_7`, + `bfcol_3` AS `bfcol_8`, + `bfcol_4` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_5` AS `bfcol_10` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + `bfcol_5` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14`, + `bfcol_9` AS `bfcol_15`, + `bfcol_10` AS `bfcol_16` + FROM `bfcte_2` +) +SELECT + `bfcol_11` AS `rowindex`, + `bfcol_12` AS `int64_col`, + `bfcol_13` AS `string_col`, + `bfcol_14` AS `float64_col`, + `bfcol_15` AS `bool_col` +FROM `bfcte_3` +ORDER BY + `bfcol_16` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py index 848ace58f3..41e01e9b25 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readtable.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -22,3 +22,15 @@ def test_compile_readtable(compiler_session: bigframes.Session, snapshot): bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readtable_w_ordering(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df = bf_df.set_index("rowindex").sort_index() + 
snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_readtable_w_limit(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df = bf_df.sort_values("int64_col").head(10) + snapshot.assert_match(bf_df.sql, "out.sql") From 33ab2b85dca8d358a022137ca49b2ee0d6f4192e Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 16 Jun 2025 14:40:14 -0700 Subject: [PATCH 22/23] chore: inject dtypes to SQLGlot scalar expr compiler (#1821) * chore: inject dtypes to SQLGlot scalar expr compiler * fix format --- bigframes/core/compile/sqlglot/compiler.py | 9 ++--- .../core/compile/sqlglot/scalar_compiler.py | 29 +++++++++++---- bigframes/core/rewrite/schema_binding.py | 8 ++++- .../test_compile_readtable_w_limit/out.sql | 9 ++--- .../test_compile_readtable_w_ordering/out.sql | 36 ++++--------------- .../test_compile_numerical_add/out.sql | 33 +++++++++++++++++ .../test_compile_string_add/out.sql | 33 +++++++++++++++++ .../sqlglot/test_compile_scalar_expr.py | 31 ++++++++++++++++ 8 files changed, 140 insertions(+), 48 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 68b572f911..84fd7124ba 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -119,14 +119,16 @@ def _remap_variables(self, node: nodes.ResultNode) -> nodes.ResultNode: return typing.cast(nodes.ResultNode, result_node) def _compile_result_node(self, root: nodes.ResultNode) -> str: - sqlglot_ir = self.compile_node(root.child) - + # Have to bind schema as the final step before compilation. + root = typing.cast(nodes.ResultNode, schema_binding.bind_schema_to_tree(root)) selected_cols: tuple[tuple[str, sge.Expression], ...] = tuple( (name, scalar_compiler.compile_scalar_expression(ref)) for ref, name in root.output_cols ) # Skip squashing selections to ensure the right ordering and limit keys - sqlglot_ir = sqlglot_ir.select(selected_cols, squash_selections=False) + sqlglot_ir = self.compile_node(root.child).select( + selected_cols, squash_selections=False + ) if root.order_by is not None: ordering_cols = tuple( @@ -220,6 +222,5 @@ def compile_concat( def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) - node = nodes.bottom_up(node, schema_binding.bind_schema_to_expressions) node = nodes.bottom_up(node, rewrite.rewrite_range_rolling) return node diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 18d709732a..00ec892620 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -13,15 +13,25 @@ # limitations under the License. 
from __future__ import annotations +import dataclasses import functools import sqlglot.expressions as sge +from bigframes import dtypes from bigframes.core import expression import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.operations as ops +@dataclasses.dataclass(frozen=True) +class TypedExpr: + """SQLGlot expression with type.""" + + expr: sge.Expression + dtype: dtypes.ExpressionType + + @functools.singledispatch def compile_scalar_expression( expression: expression.Expression, @@ -50,9 +60,12 @@ def compile_constant_expression( @compile_scalar_expression.register -def compile_op_expression(expr: expression.OpExpression): +def compile_op_expression(expr: expression.OpExpression) -> sge.Expression: # Non-recursively compiles the children scalar expressions. - args = tuple(map(compile_scalar_expression, expr.inputs)) + args = tuple( + TypedExpr(compile_scalar_expression(input), input.output_type) + for input in expr.inputs + ) op = expr.op op_name = expr.op.__class__.__name__ @@ -79,8 +92,10 @@ def compile_op_expression(expr: expression.OpExpression): # TODO: add parenthesize for operators -def compile_addop( - op: ops.AddOp, left: sge.Expression, right: sge.Expression -) -> sge.Expression: - # TODO: support addop for string dtype. - return sge.Add(this=left, expression=right) +def compile_addop(op: ops.AddOp, left: TypedExpr, right: TypedExpr) -> sge.Expression: + if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: + # String addition + return sge.Concat(expressions=[left.expr, right.expr]) + + # Numerical addition + return sge.Add(this=left.expr, expression=right.expr) diff --git a/bigframes/core/rewrite/schema_binding.py b/bigframes/core/rewrite/schema_binding.py index f3c313233b..aa5cb986b9 100644 --- a/bigframes/core/rewrite/schema_binding.py +++ b/bigframes/core/rewrite/schema_binding.py @@ -19,7 +19,13 @@ from bigframes.core import nodes -def bind_schema_to_expressions( +def bind_schema_to_tree( + node: bigframe_node.BigFrameNode, +) -> bigframe_node.BigFrameNode: + return nodes.bottom_up(node, bind_schema_to_node) + + +def bind_schema_to_node( node: bigframe_node.BigFrameNode, ) -> bigframe_node.BigFrameNode: if isinstance(node, nodes.ProjectionNode): diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql index 837b805ca4..c5724c8442 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql @@ -6,11 +6,6 @@ WITH `bfcte_0` AS ( `float64_col` AS `bfcol_3`, `bool_col` AS `bfcol_4` FROM `test-project`.`test_dataset`.`test_table` -), `bfcte_1` AS ( - SELECT - *, - `bfcol_1` AS `bfcol_5` - FROM `bfcte_0` ) SELECT `bfcol_0` AS `rowindex`, @@ -18,7 +13,7 @@ SELECT `bfcol_2` AS `string_col`, `bfcol_3` AS `float64_col`, `bfcol_4` AS `bool_col` -FROM `bfcte_1` +FROM `bfcte_0` ORDER BY - `bfcol_5` ASC NULLS LAST + `bfcol_1` ASC NULLS LAST LIMIT 10 \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql index 9376691572..238659cc01 100644 --- 
a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql @@ -6,35 +6,13 @@ WITH `bfcte_0` AS ( `float64_col` AS `bfcol_3`, `bool_col` AS `bfcol_4` FROM `test-project`.`test_dataset`.`test_table` -), `bfcte_1` AS ( - SELECT - `bfcol_0` AS `bfcol_5`, - `bfcol_1` AS `bfcol_6`, - `bfcol_2` AS `bfcol_7`, - `bfcol_3` AS `bfcol_8`, - `bfcol_4` AS `bfcol_9` - FROM `bfcte_0` -), `bfcte_2` AS ( - SELECT - *, - `bfcol_5` AS `bfcol_10` - FROM `bfcte_1` -), `bfcte_3` AS ( - SELECT - `bfcol_5` AS `bfcol_11`, - `bfcol_6` AS `bfcol_12`, - `bfcol_7` AS `bfcol_13`, - `bfcol_8` AS `bfcol_14`, - `bfcol_9` AS `bfcol_15`, - `bfcol_10` AS `bfcol_16` - FROM `bfcte_2` ) SELECT - `bfcol_11` AS `rowindex`, - `bfcol_12` AS `int64_col`, - `bfcol_13` AS `string_col`, - `bfcol_14` AS `float64_col`, - `bfcol_15` AS `bool_col` -FROM `bfcte_3` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int64_col`, + `bfcol_2` AS `string_col`, + `bfcol_3` AS `float64_col`, + `bfcol_4` AS `bool_col` +FROM `bfcte_0` ORDER BY - `bfcol_16` ASC NULLS LAST \ No newline at end of file + `bfcol_0` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql new file mode 100644 index 0000000000..405b02d897 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql @@ -0,0 +1,33 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_5`, + `bfcol_2` AS `bfcol_6`, + `bfcol_3` AS `bfcol_7`, + `bfcol_4` AS `bfcol_8`, + `bfcol_1` + `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + `bfcol_5` AS `bfcol_10`, + `bfcol_9` AS `bfcol_11`, + `bfcol_6` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14` + FROM `bfcte_1` +) +SELECT + `bfcol_10` AS `rowindex`, + `bfcol_11` AS `int64_col`, + `bfcol_12` AS `string_col`, + `bfcol_13` AS `float64_col`, + `bfcol_14` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql new file mode 100644 index 0000000000..49ec5435f9 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql @@ -0,0 +1,33 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `string_col` AS `bfcol_2`, + `float64_col` AS `bfcol_3`, + `bool_col` AS `bfcol_4` + FROM `test-project`.`test_dataset`.`test_table` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_5`, + `bfcol_1` AS `bfcol_6`, + `bfcol_3` AS `bfcol_7`, + `bfcol_4` AS `bfcol_8`, + CONCAT(`bfcol_2`, 'a') AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + `bfcol_5` AS `bfcol_10`, + `bfcol_6` AS `bfcol_11`, + `bfcol_9` AS `bfcol_12`, + `bfcol_7` AS `bfcol_13`, + `bfcol_8` AS `bfcol_14` + FROM `bfcte_1` +) +SELECT + `bfcol_10` AS `rowindex`, + `bfcol_11` AS `int64_col`, + `bfcol_12` AS `string_col`, + 
`bfcol_13` AS `float64_col`, + `bfcol_14` AS `bool_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py b/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py new file mode 100644 index 0000000000..ebdb82477f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes + +pytest.importorskip("pytest_snapshot") + + +def test_compile_numerical_add(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df["int64_col"] = bf_df["int64_col"] + bf_df["int64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_string_add(compiler_session: bigframes.Session, snapshot): + bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") + bf_df["string_col"] = bf_df["string_col"] + "a" + snapshot.assert_match(bf_df.sql, "out.sql") From eef158b7143868131154e2643eac5cd542aac0ff Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 16 Jun 2025 15:28:48 -0700 Subject: [PATCH 23/23] chore(main): release 2.7.0 (#1805) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> Co-authored-by: Shuowei Li --- CHANGELOG.md | 23 +++++++++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0393ad944c..46b97c2210 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,29 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.7.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.6.0...v2.7.0) (2025-06-16) + + +### Features + +* Add bbq.json_query_array and warn bbq.json_extract_array deprecated ([#1811](https://github.com/googleapis/python-bigquery-dataframes/issues/1811)) ([dc9eb27](https://github.com/googleapis/python-bigquery-dataframes/commit/dc9eb27fa75e90c2c95a0619551bf67aea6ef63b)) +* Add bbq.json_value_array and deprecate bbq.json_extract_string_array ([#1818](https://github.com/googleapis/python-bigquery-dataframes/issues/1818)) ([019051e](https://github.com/googleapis/python-bigquery-dataframes/commit/019051e453d81769891aa398475ebd04d1826e81)) +* Add groupby cumcount ([#1798](https://github.com/googleapis/python-bigquery-dataframes/issues/1798)) ([18f43e8](https://github.com/googleapis/python-bigquery-dataframes/commit/18f43e8b58e03a27b021bce07566a3d006ac3679)) +* Support custom build service account in `remote_function` ([#1796](https://github.com/googleapis/python-bigquery-dataframes/issues/1796)) ([e586151](https://github.com/googleapis/python-bigquery-dataframes/commit/e586151df81917b49f702ae496aaacbd02931636)) + + +### Bug Fixes + +* Correct read_csv behaviours with use_cols, names, index_col 
([#1804](https://github.com/googleapis/python-bigquery-dataframes/issues/1804)) ([855031a](https://github.com/googleapis/python-bigquery-dataframes/commit/855031a316a6957731a5d1c5e59dedb9757d9f7a)) +* Fix single row broadcast with null index ([#1803](https://github.com/googleapis/python-bigquery-dataframes/issues/1803)) ([080eb7b](https://github.com/googleapis/python-bigquery-dataframes/commit/080eb7be3cde591e08cad0d5c52c68cc0b25ade8)) + + +### Documentation + +* Document how to use ai.map() for information extraction ([#1808](https://github.com/googleapis/python-bigquery-dataframes/issues/1808)) ([b586746](https://github.com/googleapis/python-bigquery-dataframes/commit/b5867464a5bf30300dcfc069eda546b11f03146c)) +* Rearrange README.rst to include a short code sample ([#1812](https://github.com/googleapis/python-bigquery-dataframes/issues/1812)) ([f6265db](https://github.com/googleapis/python-bigquery-dataframes/commit/f6265dbb8e22de81bb59c7def175cd325e85c041)) +* Use pandas API instead of pandas-like or pandas-compatible ([#1825](https://github.com/googleapis/python-bigquery-dataframes/issues/1825)) ([aa32369](https://github.com/googleapis/python-bigquery-dataframes/commit/aa323694e161f558bc5e60490c2f21008961e2ca)) + ## [2.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.5.0...v2.6.0) (2025-06-09) diff --git a/bigframes/version.py b/bigframes/version.py index e41364d4d1..138c380d0c 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.6.0" +__version__ = "2.7.0" # {x-release-please-start-date} -__release_date__ = "2025-06-09" +__release_date__ = "2025-06-16" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index e41364d4d1..138c380d0c 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.6.0" +__version__ = "2.7.0" # {x-release-please-start-date} -__release_date__ = "2025-06-09" +__release_date__ = "2025-06-16" # {x-release-please-end}