From 24b2eea67f1dee904e7857fe81ac5fdc5ecb8a2d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 17:16:31 -0500 Subject: [PATCH 01/11] fix: avoid `403 response too large to return` error with `read_gbq` and large query results --- bigframes/core/io.py | 18 +++++++- bigframes/session.py | 101 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 99 insertions(+), 20 deletions(-) diff --git a/bigframes/core/io.py b/bigframes/core/io.py index 3c2e5a25f5..7c6fdb1258 100644 --- a/bigframes/core/io.py +++ b/bigframes/core/io.py @@ -16,7 +16,7 @@ import datetime import textwrap -from typing import Dict, Union +from typing import Dict, Iterable, Union import google.cloud.bigquery as bigquery @@ -89,6 +89,22 @@ def create_snapshot_sql( ) +BQ_STANDARD_TYPES = { + "INT": "INT64", + "FLOAT": "FLOAT64", +} + + +def bq_schema_to_sql(schema: Iterable[bigquery.SchemaField]): + field_strings = [] + for field in schema: + name = field.name + type_ = field.field_type + type_ = BQ_STANDARD_TYPES.get(type_, type_) + field_strings.append(f"`{name}` {type_}") + return ", ".join(field_strings) + + def format_option(key: str, value: Union[bool, str]) -> str: if isinstance(value, bool): return f"{key}=true" if value else f"{key}=false" diff --git a/bigframes/session.py b/bigframes/session.py index 7b827c7dcf..ab60cd8937 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -449,13 +449,6 @@ def _query_to_destination( index_cols: List[str], api_name: str, ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]: - # If there are no index columns, then there's no reason to cache to a - # (clustered) session table, as we'll just have to query it again to - # create a default index & ordering. - if not index_cols: - _, query_job = self._start_query(query) - return query_job.destination, query_job - # If a dry_run indicates this is not a query type job, then don't # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. dry_run_config = bigquery.QueryJobConfig() @@ -465,18 +458,17 @@ def _query_to_destination( _, query_job = self._start_query(query) return query_job.destination, query_job - # Make sure we cluster by the index column(s) so that subsequent - # operations are as speedy as they can be. - try: - ibis_expr = self.ibis_client.sql(query) - return self._ibis_to_session_table(ibis_expr, index_cols, api_name), None - except google.api_core.exceptions.BadRequest: - # Some SELECT statements still aren't compatible with CREATE TEMP - # TABLE ... AS SELECT ... statements. For example, if the query has - # a top-level ORDER BY, this conflicts with our ability to cluster - # the table by the index column(s). - _, query_job = self._start_query(query) - return query_job.destination, query_job + # Create a table to workaround BigQuery 10 GB query results limit. See: + # internal issue 303057336. 
+ temp_table = self._create_session_table_empty( + api_name, dry_run_job.schema, index_cols + ) + + job_config = bigquery.QueryJobConfig() + job_config.destination = temp_table + + _, query_job = self._start_query(query, job_config=job_config) + return query_job.destination, query_job def read_gbq_query( self, @@ -1231,6 +1223,58 @@ def _create_session_table(self) -> bigquery.TableReference: ) return dataset.table(table_name) + def _create_session_table_empty( + self, + api_name: str, + schema: Iterable[bigquery.SchemaField], + cluster_cols: List[str], + ) -> bigquery.TableReference: + clusterable_cols = [ + col.name + for col in schema + if col.name in cluster_cols and _can_cluster_bq(col) + ][:_MAX_CLUSTER_COLUMNS] + + # Can't set a table in _SESSION as destination via query job API, so we + # run DDL, instead. + table = self._create_session_table() + cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols) + + # TODO(swast): Handle STRUCT (RECORD) / ARRAY (REPEATED) columns. + schema_sql = bigframes_io.bq_schema_to_sql(schema) + + if clusterable_cols: + cluster_cols_sql = ", ".join( + f"`{cluster_col}`" for cluster_col in cluster_cols + ) + cluster_sql = f"CLUSTER BY {cluster_cols_sql}" + else: + cluster_sql = "" + + # TODO(swast): This might not support multi-statement SQL queries (scripts). + ddl_text = f""" + CREATE TEMP TABLE + `_SESSION`.`{table.table_id}` + ({schema_sql}) + {cluster_sql} + """ + + job_config = bigquery.QueryJobConfig() + + # Include a label so that Dataplex Lineage can identify temporary + # tables that BigQuery DataFrames creates. Googlers: See internal issue + # 296779699. We're labeling the job instead of the table because + # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not + # supported`. + job_config.labels = {"source": "bigquery-dataframes-temp"} + job_config.labels["bigframes-api"] = api_name + + _, query_job = self._start_query(ddl_text, job_config=job_config) + + # Use fully-qualified name instead of `_SESSION` name so that the + # created table can be used as the destination table. 
+ return query_job.destination + def _create_sequential_ordering( self, table: ibis_types.Table, @@ -1505,3 +1549,22 @@ def _can_cluster(ibis_type: ibis_dtypes.DataType): or ibis_type.is_timestamp() or ibis_type.is_boolean() ) + + +def _can_cluster_bq(field: bigquery.SchemaField): + # https://cloud.google.com/bigquery/docs/clustered-tables + # Notably, float is excluded + type_ = field.field_type + return type_ in ( + "INTEGER", + "INT64", + "STRING", + "NUMERIC", + "DECIMAL", + "BIGNUMERIC", + "BIGDECIMAL" "DATE", + "DATETIME", + "TIMESTAMP", + "BOOL", + "BOOLEAN", + ) From fb0a476ee519f9245251d270512b659598743b26 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 11:01:00 -0500 Subject: [PATCH 02/11] support struct / array in read_gbq queries --- bigframes/core/io.py | 40 ++++++++++++++++++++++++------ bigframes/session.py | 32 +++++++++++++++--------- tests/system/small/test_session.py | 1 + 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/bigframes/core/io.py b/bigframes/core/io.py index 7c6fdb1258..920d6beb8e 100644 --- a/bigframes/core/io.py +++ b/bigframes/core/io.py @@ -89,20 +89,44 @@ def create_snapshot_sql( ) +# BigQuery REST API returns types in Legacy SQL format +# https://cloud.google.com/bigquery/docs/data-types but we use Standard SQL +# names +# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types BQ_STANDARD_TYPES = { - "INT": "INT64", + "BOOLEAN": "BOOL", + "INTEGER": "INT64", "FLOAT": "FLOAT64", } +def bq_field_to_type_sql(field: bigquery.SchemaField): + if field.mode == "REPEATED": + nested_type = bq_field_to_type_sql( + bigquery.SchemaField( + field.name, field.field_type, mode="NULLABLE", fields=field.fields + ) + ) + return f"ARRAY<{nested_type}>" + + if field.field_type == "RECORD": + nested_fields_sql = ", ".join( + bq_field_to_sql(child_field) for child_field in field.fields + ) + return f"STRUCT<{nested_fields_sql}>" + + type_ = field.field_type + return BQ_STANDARD_TYPES.get(type_, type_) + + +def bq_field_to_sql(field: bigquery.SchemaField): + name = field.name + type_ = bq_field_to_type_sql(field) + return f"`{name}` {type_}" + + def bq_schema_to_sql(schema: Iterable[bigquery.SchemaField]): - field_strings = [] - for field in schema: - name = field.name - type_ = field.field_type - type_ = BQ_STANDARD_TYPES.get(type_, type_) - field_strings.append(f"`{name}` {type_}") - return ", ".join(field_strings) + return ", ".join(bq_field_to_sql(field) for field in schema) def format_option(key: str, value: Union[bool, str]) -> str: diff --git a/bigframes/session.py b/bigframes/session.py index ab60cd8937..de27c7f089 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -467,8 +467,18 @@ def _query_to_destination( job_config = bigquery.QueryJobConfig() job_config.destination = temp_table - _, query_job = self._start_query(query, job_config=job_config) - return query_job.destination, query_job + try: + # Write to temp table to workaround BigQuery 10 GB query results + # limit. See: internal issue 303057336. + _, query_job = self._start_query(query, job_config=job_config) + return query_job.destination, query_job + except google.api_core.exceptions.BadRequest: + # Some SELECT statements still aren't compatible with cluster + # tables as the destination. For example, if the query has a + # top-level ORDER BY, this conflicts with our ability to cluster + # the table by the index column(s). 
+ _, query_job = self._start_query(query) + return query_job.destination, query_job def read_gbq_query( self, @@ -1229,23 +1239,20 @@ def _create_session_table_empty( schema: Iterable[bigquery.SchemaField], cluster_cols: List[str], ) -> bigquery.TableReference: + # Can't set a table in _SESSION as destination via query job API, so we + # run DDL, instead. + table = self._create_session_table() + schema_sql = bigframes_io.bq_schema_to_sql(schema) + clusterable_cols = [ col.name for col in schema if col.name in cluster_cols and _can_cluster_bq(col) ][:_MAX_CLUSTER_COLUMNS] - # Can't set a table in _SESSION as destination via query job API, so we - # run DDL, instead. - table = self._create_session_table() - cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols) - - # TODO(swast): Handle STRUCT (RECORD) / ARRAY (REPEATED) columns. - schema_sql = bigframes_io.bq_schema_to_sql(schema) - if clusterable_cols: cluster_cols_sql = ", ".join( - f"`{cluster_col}`" for cluster_col in cluster_cols + f"`{cluster_col}`" for cluster_col in clusterable_cols ) cluster_sql = f"CLUSTER BY {cluster_cols_sql}" else: @@ -1562,7 +1569,8 @@ def _can_cluster_bq(field: bigquery.SchemaField): "NUMERIC", "DECIMAL", "BIGNUMERIC", - "BIGDECIMAL" "DATE", + "BIGDECIMAL", + "DATE", "DATETIME", "TIMESTAMP", "BOOL", diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 614c953764..ef718bc131 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -57,6 +57,7 @@ def test_read_gbq_tokyo( ), pytest.param( """SELECT + t.int64_col + 1 as my_ints, t.float64_col * 2 AS my_floats, CONCAT(t.string_col, "_2") AS my_strings, t.int64_col > 0 AS my_bools, From 26d8177899b12e1f9d3dfffacad2ae9837837636 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 11:12:57 -0500 Subject: [PATCH 03/11] add unit tests for bq_schema_to_sql --- tests/unit/core/test_io.py | 55 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/unit/core/test_io.py b/tests/unit/core/test_io.py index c5074f80c2..afb38a5f75 100644 --- a/tests/unit/core/test_io.py +++ b/tests/unit/core/test_io.py @@ -13,8 +13,10 @@ # limitations under the License. import datetime +from typing import Iterable import google.cloud.bigquery as bigquery +import pytest import bigframes.core.io @@ -47,3 +49,56 @@ def test_create_snapshot_sql_doesnt_timetravel_session_datasets(): # Don't need the project ID for _SESSION tables. 
assert "my-test-project" not in sql + + +@pytest.mark.parametrize( + ("schema", "expected"), + ( + ( + [bigquery.SchemaField("My Column", "INTEGER")], + "`My Column` INT64", + ), + ( + [ + bigquery.SchemaField("My Column", "INTEGER"), + bigquery.SchemaField("Float Column", "FLOAT"), + bigquery.SchemaField("Bool Column", "BOOLEAN"), + ], + "`My Column` INT64, `Float Column` FLOAT64, `Bool Column` BOOL", + ), + ( + [ + bigquery.SchemaField("My Column", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"), + ], + "`My Column` ARRAY, `Float Column` ARRAY, `Bool Column` ARRAY", + ), + ( + [ + bigquery.SchemaField( + "My Column", + "RECORD", + mode="REPEATED", + fields=( + bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"), + bigquery.SchemaField( + "Nested Column", + "RECORD", + fields=(bigquery.SchemaField("Int Column", "INTEGER"),), + ), + ), + ), + ], + ( + "`My Column` ARRAY," + + " `Bool Column` ARRAY," + + " `Nested Column` STRUCT<`Int Column` INT64>>>" + ), + ), + ), +) +def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str): + pass From 43b65fee27479ec4f481ed28bd47d371f0a12a3a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 11:22:55 -0500 Subject: [PATCH 04/11] fix mypy issue --- bigframes/session.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/session.py b/bigframes/session.py index de27c7f089..e53072479a 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -460,9 +460,9 @@ def _query_to_destination( # Create a table to workaround BigQuery 10 GB query results limit. See: # internal issue 303057336. - temp_table = self._create_session_table_empty( - api_name, dry_run_job.schema, index_cols - ) + # Since we have a `statement_type == 'SELECT'`, schema should be populated. + schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema) + temp_table = self._create_session_table_empty(api_name, schema, index_cols) job_config = bigquery.QueryJobConfig() job_config.destination = temp_table From a538ae69cc90ac1f4612f52f61fe4a1eb2dc1a2e Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 11:32:57 -0500 Subject: [PATCH 05/11] remove redundant code --- bigframes/session.py | 50 ++++---------------------------------------- 1 file changed, 4 insertions(+), 46 deletions(-) diff --git a/bigframes/session.py b/bigframes/session.py index e53072479a..566299c359 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -1315,55 +1315,13 @@ def _ibis_to_session_table( cluster_cols: Iterable[str], api_name: str, ) -> bigquery.TableReference: - clusterable_cols = [ - col for col in cluster_cols if _can_cluster(table[col].type()) - ][:_MAX_CLUSTER_COLUMNS] - return self._query_to_session_table( + desination, _ = self._query_to_destination( self.ibis_client.compile(table), - cluster_cols=clusterable_cols, + index_cols=list(cluster_cols), api_name=api_name, ) - - def _query_to_session_table( - self, - query_text: str, - cluster_cols: Iterable[str], - api_name: str, - ) -> bigquery.TableReference: - if len(list(cluster_cols)) > _MAX_CLUSTER_COLUMNS: - raise ValueError( - f"Too many cluster columns: {list(cluster_cols)}, max {_MAX_CLUSTER_COLUMNS} allowed." - ) - # Can't set a table in _SESSION as destination via query job API, so we - # run DDL, instead. 
-        table = self._create_session_table()
-        cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols)
-
-        # TODO(swast): This might not support multi-statement SQL queries (scripts).
-        ddl_text = f"""
-        CREATE TEMP TABLE `_SESSION`.`{table.table_id}`
-        CLUSTER BY {cluster_cols_sql}
-        AS {query_text}
-        """
-
-        job_config = bigquery.QueryJobConfig()
-
-        # Include a label so that Dataplex Lineage can identify temporary
-        # tables that BigQuery DataFrames creates. Googlers: See internal issue
-        # 296779699. We're labeling the job instead of the table because
-        # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
-        # supported`.
-        job_config.labels = {"source": "bigquery-dataframes-temp"}
-        job_config.labels["bigframes-api"] = api_name
-
-        try:
-            self._start_query(
-                ddl_text, job_config=job_config
-            )  # Wait for the job to complete
-        except google.api_core.exceptions.Conflict:
-            # Allow query retry to succeed.
-            pass
-        return table
+        # There should always be a destination table for this query type.
+        return typing.cast(bigquery.TableReference, destination)

From a01f03d73563cd89dc671145737054749b6be578 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 3 Oct 2023 11:35:56 -0500
Subject: [PATCH 06/11] use fully qualified table ID

---
 bigframes/session.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bigframes/session.py b/bigframes/session.py
index 566299c359..e2fdc22301 100644
--- a/bigframes/session.py
+++ b/bigframes/session.py
@@ -1300,7 +1300,9 @@ def _create_sequential_ordering(
             cluster_cols=list(index_cols) + [default_ordering_name],
             api_name=api_name,
         )
-        table = self.ibis_client.sql(f"SELECT * FROM `{table_ref.table_id}`")
+        table = self.ibis_client.table(
+            f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}"
+        )
         ordering_reference = core.OrderingColumnReference(default_ordering_name)
         ordering = core.ExpressionOrdering(
             ordering_value_columns=[ordering_reference],

From e7f9c61740f73b18a0578932afedae6ce4c63ce8 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 3 Oct 2023 11:39:02 -0500
Subject: [PATCH 07/11] delete more dead code

---
 bigframes/session.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/bigframes/session.py b/bigframes/session.py
index e2fdc22301..ac48c977cb 100644
--- a/bigframes/session.py
+++ b/bigframes/session.py
@@ -1258,7 +1258,6 @@ def _create_session_table_empty(
         else:
            cluster_sql = ""
 
-        # TODO(swast): This might not support multi-statement SQL queries (scripts).
ddl_text = f""" CREATE TEMP TABLE `_SESSION`.`{table.table_id}` @@ -1505,19 +1504,6 @@ def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Sessi return Session(context) -def _can_cluster(ibis_type: ibis_dtypes.DataType): - # https://cloud.google.com/bigquery/docs/clustered-tables - # Notably, float is excluded - return ( - ibis_type.is_integer() - or ibis_type.is_string() - or ibis_type.is_decimal() - or ibis_type.is_date() - or ibis_type.is_timestamp() - or ibis_type.is_boolean() - ) - - def _can_cluster_bq(field: bigquery.SchemaField): # https://cloud.google.com/bigquery/docs/clustered-tables # Notably, float is excluded From 275b8cd26ebb7f07695b782746ba89d549314ff6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 11:45:16 -0500 Subject: [PATCH 08/11] remove mutation from system test --- tests/system/small/test_session.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index ef718bc131..53ddfa3c49 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -322,11 +322,10 @@ def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index): def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default_index): - scalars_pandas_df_default_index["rowid"] = np.arange( - scalars_pandas_df_default_index.shape[0] - ) + pandas_df = scalars_pandas_df_default_index.copy() + pandas_df["rowid"] = np.arange(pandas_df.shape[0]) - df = session.read_pandas(scalars_pandas_df_default_index) + df = session.read_pandas(pandas_df) total_order_col = df._block._expr._ordering.total_order_col assert total_order_col and total_order_col.column_id == "rowid_2" From 92ccd2806051417822362d2381f061d20de5d5b4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 13:41:58 -0500 Subject: [PATCH 09/11] fix cached --- bigframes/core/__init__.py | 4 ++-- tests/system/small/ml/test_decomposition.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 5e0675fd13..8008c1189a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -1198,8 +1198,8 @@ def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: destination = self._session._ibis_to_session_table( ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) - table_expression = self._session.ibis_client.sql( - f"SELECT * FROM `_SESSION`.`{destination.table_id}`" + table_expression = self._session.ibis_client.table( + f"{destination.project}.{destination.dataset_id}.{destination.table_id}" ) new_columns = [table_expression[column] for column in self.column_names] new_hidden_columns = [ diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index c71bbbe3b0..e31681f4a0 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -15,6 +15,7 @@ import pandas as pd from bigframes.ml import decomposition +import tests.system.utils def test_pca_predict(penguins_pca_model, new_penguins_df): @@ -129,7 +130,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 1.125354], }, ) - pd.testing.assert_frame_equal( + tests.system.utils.assert_pandas_df_equal_ignore_ordering( result, expected, check_exact=False, @@ -148,7 +149,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): 
"explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - pd.testing.assert_frame_equal( + tests.system.utils.assert_pandas_df_equal_ignore_ordering( result, expected, check_exact=False, From 7b9487a708a47c971a8a24957507a96c1d3806a8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 13:47:01 -0500 Subject: [PATCH 10/11] make BQ_STANDARD_TYPES immutable --- bigframes/core/io.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bigframes/core/io.py b/bigframes/core/io.py index 920d6beb8e..d47efbdddc 100644 --- a/bigframes/core/io.py +++ b/bigframes/core/io.py @@ -16,6 +16,7 @@ import datetime import textwrap +import types from typing import Dict, Iterable, Union import google.cloud.bigquery as bigquery @@ -93,11 +94,13 @@ def create_snapshot_sql( # https://cloud.google.com/bigquery/docs/data-types but we use Standard SQL # names # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types -BQ_STANDARD_TYPES = { - "BOOLEAN": "BOOL", - "INTEGER": "INT64", - "FLOAT": "FLOAT64", -} +BQ_STANDARD_TYPES = types.MappingProxyType( + { + "BOOLEAN": "BOOL", + "INTEGER": "INT64", + "FLOAT": "FLOAT64", + } +) def bq_field_to_type_sql(field: bigquery.SchemaField): From 575fa14b35c34eed5b3e4a61b8453cf20be91b84 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 14:37:22 -0500 Subject: [PATCH 11/11] ignore order in tests/system/small/ml/test_core.py::test_pca_model_principal_component_info --- tests/system/small/ml/test_core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index ace943956f..f911dd7eeb 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -23,6 +23,7 @@ import bigframes from bigframes.ml import core +import tests.system.utils def test_model_eval( @@ -224,7 +225,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], }, ) - pd.testing.assert_frame_equal( + tests.system.utils.assert_pandas_df_equal_ignore_ordering( result, expected, check_exact=False,