From fc0a642412e05356ded562926c3899fe13643645 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 15 May 2024 01:33:30 +0000 Subject: [PATCH 1/2] test: stop checking ml large tests exact numbers --- tests/system/large/ml/test_cluster.py | 26 +- tests/system/large/ml/test_compose.py | 83 ++---- tests/system/large/ml/test_core.py | 117 +++----- tests/system/large/ml/test_decomposition.py | 72 ++--- tests/system/large/ml/test_ensemble.py | 173 +++++++---- tests/system/large/ml/test_forecasting.py | 51 ++-- tests/system/large/ml/test_linear_model.py | 107 +++---- tests/system/large/ml/test_pipeline.py | 308 ++++++++------------ tests/system/utils.py | 39 ++- 9 files changed, 473 insertions(+), 503 deletions(-) diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index b633ca4ea2..fd1d30c711 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -13,13 +13,11 @@ # limitations under the License. import pandas as pd -import pytest from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal +from tests.system import utils -@pytest.mark.flaky(retries=2) def test_cluster_configure_fit_score_predict( session, penguins_df_default_index, dataset_id ): @@ -88,26 +86,18 @@ def test_cluster_configure_fit_score_predict( # Check score to ensure the model was fitted score_result = model.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - {"davies_bouldin_index": [1.502182], "mean_squared_distance": [1.953408]}, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 - ) + eval_metrics = ["davies_bouldin_index", "mean_squared_distance"] + utils.check_pandas_df_schema_and_index(score_result, columns=eval_metrics, index=1) predictions = model.predict(new_penguins).to_pandas() assert predictions.shape == (4, 9) - result = predictions[["CENTROID_ID"]] - expected = pd.DataFrame( - {"CENTROID_ID": [2, 3, 1, 2]}, - dtype="Int64", - index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), + utils.check_pandas_df_schema_and_index( + predictions, + columns=["CENTROID_ID"], + index=["test1", "test2", "test3", "test4"], + col_exact=False, ) - expected.index.name = "observation" - assert_pandas_df_equal(result, expected, ignore_order=True) # save, load, check n_clusters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 7513b78b29..45322e78dd 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
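The test_cluster.py hunk above sets the pattern for the whole series: hard-coded expected values from BQML output are dropped in favor of schema-and-shape assertions. A minimal sketch of what a `utils.check_pandas_df_schema_and_index(score_result, columns=eval_metrics, index=1)` call verifies, assuming an illustrative score() frame (the helper itself is added to tests/system/utils.py at the end of this patch):

import pandas as pd

# Illustrative score() output; the concrete values drift across BQML
# releases, which is exactly why the patch stops pinning them.
score_result = pd.DataFrame(
    {"davies_bouldin_index": [1.502182], "mean_squared_distance": [1.953408]},
    dtype="Float64",
)

# Exact column-list match (the helper's default, col_exact=True) ...
assert list(score_result.columns) == ["davies_bouldin_index", "mean_squared_distance"]
# ... and index=1 means "exactly one evaluation row", values unchecked.
assert len(score_result) == 1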
-import pandas - from bigframes.ml import compose, preprocessing +from tests.system import utils def test_columntransformer_standalone_fit_and_transform( @@ -45,26 +44,18 @@ def test_columntransformer_standalone_fit_and_transform( ) result = transformer.transform(new_penguins_df).to_pandas() - expected = pandas.DataFrame( - { - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - "standard_scaled_culmen_length_mm": [ - -0.811119671289163, - -0.9945520581113803, - -1.104611490204711, - ], - "min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210], - "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], - }, - index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "onehotencoded_species", + "standard_scaled_culmen_length_mm", + "min_max_scaled_culmen_length_mm", + "standard_scaled_flipper_length_mm", + ], + index=[1633, 1672, 1690], + col_exact=False, ) - pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) - def test_columntransformer_standalone_fit_transform(new_penguins_df): transformer = compose.ColumnTransformer( @@ -86,25 +77,17 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] ).to_pandas() - expected = pandas.DataFrame( - { - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - "standard_scaled_culmen_length_mm": [ - 1.313249, - -0.20198, - -1.111118, - ], - "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], - }, - index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "onehotencoded_species", + "standard_scaled_culmen_length_mm", + "standard_scaled_flipper_length_mm", + ], + index=[1633, 1672, 1690], + col_exact=False, ) - pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) - def test_columntransformer_save_load(new_penguins_df, dataset_id): transformer = compose.ColumnTransformer( @@ -147,21 +130,13 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] ).to_pandas() - expected = pandas.DataFrame( - { - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - "standard_scaled_culmen_length_mm": [ - 1.313249, - -0.20198, - -1.111118, - ], - "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], - }, - index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "onehotencoded_species", + "standard_scaled_culmen_length_mm", + "standard_scaled_flipper_length_mm", + ], + index=[1633, 1672, 1690], + col_exact=False, ) - - pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index aec1065e41..be5eea925f 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
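test_core.py below still spells out the six regression metrics inline as `eval_metrics`; the same list is added once as `utils.ML_REGRESSION_METRICS` further down in this patch, and the second commit moves the remaining callers onto that constant. A hedged sketch of the check those names drive (metric names are verbatim from the patch, the frame is illustrative):

import pandas as pd

ML_REGRESSION_METRICS = [
    "mean_absolute_error",
    "mean_squared_error",
    "mean_squared_log_error",
    "median_absolute_error",
    "r2_score",
    "explained_variance",
]

# evaluate() returns a single-row frame with one column per metric; the
# relaxed assertion cares about columns and row count, never the numbers.
evaluate_result = pd.DataFrame({metric: [0.0] for metric in ML_REGRESSION_METRICS})
assert list(evaluate_result.columns) == ML_REGRESSION_METRICS
assert len(evaluate_result) == 1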
-import pandas -import pytest - from bigframes.ml import globals +from tests.system import utils -# TODO(garrettwu): Re-enable or not check exact numbers. -@pytest.mark.skip(reason="bqml regression") def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df): df = penguins_df_default_index.dropna() X_train = df[ @@ -38,41 +34,33 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d X_train, y_train, options={"model_type": "linear_reg"} ) + eval_metrics = [ + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + "explained_variance", + ] # no data - report evaluation from the automatic data split evaluate_result = model.evaluate().to_pandas() - evaluate_expected = pandas.DataFrame( - { - "mean_absolute_error": [225.817334], - "mean_squared_error": [80540.705944], - "mean_squared_log_error": [0.004972], - "median_absolute_error": [173.080816], - "r2_score": [0.87529], - "explained_variance": [0.87529], - }, - dtype="Float64", - ) - evaluate_expected = evaluate_expected.reindex( - index=evaluate_expected.index.astype("Int64") - ) - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # evaluate on all training data evaluate_result = model.evaluate(df).to_pandas() - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # predict new labels predictions = model.predict(new_penguins_df).to_pandas() - expected = pandas.DataFrame( - {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, - dtype="Float64", - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pandas.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) new_name = f"{dataset_id}.my_model" @@ -108,42 +96,34 @@ def test_bqml_manual_preprocessing_e2e( X_train, y_train, transforms=transforms, options=options ) + eval_metrics = [ + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + "explained_variance", + ] + # no data - report evaluation from the automatic data split evaluate_result = model.evaluate().to_pandas() - evaluate_expected = pandas.DataFrame( - { - "mean_absolute_error": [309.477334], - "mean_squared_error": [152184.227218], - "mean_squared_log_error": [0.009524], - "median_absolute_error": [257.727777], - "r2_score": [0.764356], - "explained_variance": [0.764356], - }, - dtype="Float64", - ) - evaluate_expected = evaluate_expected.reindex( - index=evaluate_expected.index.astype("Int64") - ) - - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # evaluate on all training data evaluate_result = model.evaluate(df).to_pandas() - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # predict new labels predictions = model.predict(new_penguins_df).to_pandas() - expected = 
pandas.DataFrame( - {"predicted_body_mass_g": [3968.8, 3176.3, 3545.2]}, - dtype="Float64", - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pandas.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) new_name = f"{dataset_id}.my_model" @@ -168,24 +148,9 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): ) transformed = model.transform(new_penguins_df).to_pandas() - expected = pandas.DataFrame( - { - "scaled_culmen_length_mm": [-0.8099, -0.9931, -1.103], - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - }, - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - expected["scaled_culmen_length_mm"] = expected["scaled_culmen_length_mm"].astype( - "Float64" - ) - pandas.testing.assert_frame_equal( - transformed[["scaled_culmen_length_mm", "onehotencoded_species"]], - expected, - check_exact=False, - rtol=0.1, - check_dtype=False, + utils.check_pandas_df_schema_and_index( + transformed, + columns=["scaled_culmen_length_mm", "onehotencoded_species"], + index=[1633, 1672, 1690], + col_exact=False, ) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 264b95a92e..87af7255fb 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -15,7 +15,7 @@ import pandas as pd from bigframes.ml import decomposition -import tests.system.utils +from tests.system import utils def test_decomposition_configure_fit_score_predict( @@ -45,34 +45,19 @@ def test_decomposition_configure_fit_score_predict( # Check score to ensure the model was fitted score_result = model.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - { - "total_explained_variance_ratio": [0.812383], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=["total_explained_variance_ratio"], index=1 ) result = model.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "principal_component_1": [-1.459, 2.258, -1.685], - "principal_component_2": [-1.120, -1.351, -0.874], - "principal_component_3": [-0.646, 0.443, -0.704], - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - - tests.system.utils.assert_pandas_df_equal_pca( + utils.check_pandas_df_schema_and_index( result, - expected, - check_exact=False, - rtol=0.1, + columns=[ + "principal_component_1", + "principal_component_2", + "principal_component_3", + ], + index=[1633, 1672, 1690], ) # save, load, check n_components to ensure configuration was kept @@ -113,36 +98,21 @@ def test_decomposition_configure_fit_score_predict_params( # Check score to ensure the model was fitted score_result = model.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - { - "total_explained_variance_ratio": [0.932897], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + 
utils.check_pandas_df_schema_and_index( + score_result, columns=["total_explained_variance_ratio"], index=1 ) result = model.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "principal_component_1": [-1.459, 2.258, -1.685], - "principal_component_2": [-1.120, -1.351, -0.874], - "principal_component_3": [-0.646, 0.443, -0.704], - "principal_component_4": [-0.539, 0.234, -0.571], - "principal_component_5": [-0.876, 0.122, 0.609], - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - - tests.system.utils.assert_pandas_df_equal_pca( + utils.check_pandas_df_schema_and_index( result, - expected, - check_exact=False, - rtol=0.1, + columns=[ + "principal_component_1", + "principal_component_2", + "principal_component_3", + "principal_component_4", + "principal_component_5", + ], + index=[1633, 1672, 1690], ) # save, load, check n_components to ensure configuration was kept diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 3d1fcaf41c..b8adfb36b2 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -12,17 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from unittest import TestCase - -import pandas import pytest import bigframes.ml.ensemble +from tests.system import utils -# TODO(garrettwu): Re-enable or not check exact numbers. -@pytest.mark.skip(reason="bqml regression") -@pytest.mark.flaky(retries=2) def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBRegressor() @@ -42,19 +37,28 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pandas.DataFrame( - { - "mean_absolute_error": [97.368139], - "mean_squared_error": [16284.877027], - "mean_squared_log_error": [0.0010189], - "median_absolute_error": [72.158691], - "r2_score": [0.974784], - "explained_variance": [0.974845], - }, - dtype="Float64", - ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pandas.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) + # expected = pandas.DataFrame( + # { + # "mean_absolute_error": [97.368139], + # "mean_squared_error": [16284.877027], + # "mean_squared_log_error": [0.0010189], + # "median_absolute_error": [72.158691], + # "r2_score": [0.974784], + # "explained_variance": [0.974845], + # }, + # dtype="Float64", + # ) + # expected = expected.reindex(index=expected.index.astype("Int64")) + # pandas.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) + eval_metrics = [ + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + "explained_variance", + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -66,7 +70,7 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): ) -@pytest.mark.flaky(retries=2) +# @pytest.mark.flaky(retries=2) def test_xgbregressor_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -103,16 +107,25 @@ def test_xgbregressor_dart_booster_multiple_params( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - 
TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ + # TestCase().assertSequenceEqual(result.shape, (1, 6)) + # for col_name in [ + # "mean_absolute_error", + # "mean_squared_error", + # "mean_squared_log_error", + # "median_absolute_error", + # "r2_score", + # "explained_variance", + # ]: + # assert col_name in result.columns + eval_metrics = [ "mean_absolute_error", "mean_squared_error", "mean_squared_log_error", "median_absolute_error", "r2_score", "explained_variance", - ]: - assert col_name in result.columns + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -140,7 +153,7 @@ def test_xgbregressor_dart_booster_multiple_params( assert reloaded_model.n_estimators == 2 -@pytest.mark.flaky(retries=2) +# @pytest.mark.flaky(retries=2) def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBClassifier() @@ -159,16 +172,25 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ + # TestCase().assertSequenceEqual(result.shape, (1, 6)) + # for col_name in [ + # "precision", + # "recall", + # "accuracy", + # "f1_score", + # "log_loss", + # "roc_auc", + # ]: + # assert col_name in result.columns + eval_metrics = [ "precision", "recall", "accuracy", "f1_score", "log_loss", "roc_auc", - ]: - assert col_name in result.columns + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -180,7 +202,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): ) -@pytest.mark.flaky(retries=2) +# @pytest.mark.flaky(retries=2) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -216,16 +238,25 @@ def test_xgbclassifier_dart_booster_multiple_params( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ + # TestCase().assertSequenceEqual(result.shape, (1, 6)) + # for col_name in [ + # "precision", + # "recall", + # "accuracy", + # "f1_score", + # "log_loss", + # "roc_auc", + # ]: + # assert col_name in result.columns + eval_metrics = [ "precision", "recall", "accuracy", "f1_score", "log_loss", "roc_auc", - ]: - assert col_name in result.columns + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -253,7 +284,7 @@ def test_xgbclassifier_dart_booster_multiple_params( assert reloaded_model.n_estimators == 2 -@pytest.mark.flaky(retries=2) +# @pytest.mark.flaky(retries=2) def test_randomforestregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor() @@ -273,16 +304,25 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ + # TestCase().assertSequenceEqual(result.shape, (1, 6)) + # for col_name in [ + # "mean_absolute_error", + # 
"mean_squared_error", + # "mean_squared_log_error", + # "median_absolute_error", + # "r2_score", + # "explained_variance", + # ]: + # assert col_name in result.columns + eval_metrics = [ "mean_absolute_error", "mean_squared_error", "mean_squared_log_error", "median_absolute_error", "r2_score", "explained_variance", - ]: - assert col_name in result.columns + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -294,7 +334,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset ) -@pytest.mark.flaky(retries=2) +# @pytest.mark.flaky(retries=2) def test_randomforestregressor_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor( tree_method="auto", @@ -326,16 +366,25 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ + # TestCase().assertSequenceEqual(result.shape, (1, 6)) + # for col_name in [ + # "mean_absolute_error", + # "mean_squared_error", + # "mean_squared_log_error", + # "median_absolute_error", + # "r2_score", + # "explained_variance", + # ]: + # assert col_name in result.columns + eval_metrics = [ "mean_absolute_error", "mean_squared_error", "mean_squared_log_error", "median_absolute_error", "r2_score", "explained_variance", - ]: - assert col_name in result.columns + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -360,7 +409,7 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase assert reloaded_model.enable_global_explain is False -@pytest.mark.flaky(retries=2) +# @pytest.mark.flaky(retries=2) def test_randomforestclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestClassifier() @@ -379,16 +428,25 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ + # TestCase().assertSequenceEqual(result.shape, (1, 6)) + # for col_name in [ + # "precision", + # "recall", + # "accuracy", + # "f1_score", + # "log_loss", + # "roc_auc", + # ]: + # assert col_name in result.columns + eval_metrics = [ "precision", "recall", "accuracy", "f1_score", "log_loss", "roc_auc", - ]: - assert col_name in result.columns + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -431,16 +489,25 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ + # TestCase().assertSequenceEqual(result.shape, (1, 6)) + # for col_name in [ + # "precision", + # "recall", + # "accuracy", + # "f1_score", + # "log_loss", + # "roc_auc", + # ]: + # assert col_name in result.columns + eval_metrics = [ "precision", "recall", "accuracy", "f1_score", "log_loss", "roc_auc", - ]: - assert col_name in 
result.columns + ] + utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index ef74398c2e..74ba12c6c6 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pandas as pd import pytest from bigframes.ml import forecasting +from tests.system import utils ARIMA_EVALUATE_OUTPUT_COL = [ "non_seasonal_p", "non_seasonal_d", "non_seasonal_q", + "has_drift", "log_likelihood", "AIC", "variance", @@ -50,18 +51,28 @@ def test_arima_plus_model_fit_score( result = arima_model.score( new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] ).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [154.742547], - "mean_squared_error": [26844.868855], - "root_mean_squared_error": [163.844038], - "mean_absolute_percentage_error": [6.189702], - "symmetric_mean_absolute_percentage_error": [6.097155], - }, - dtype="Float64", + # expected = pd.DataFrame( + # { + # "mean_absolute_error": [154.742547], + # "mean_squared_error": [26844.868855], + # "root_mean_squared_error": [163.844038], + # "mean_absolute_percentage_error": [6.189702], + # "symmetric_mean_absolute_percentage_error": [6.097155], + # }, + # dtype="Float64", + # ) + # expected = expected.reindex(index=expected.index.astype("Int64")) + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "mean_absolute_error", + "mean_squared_error", + "root_mean_squared_error", + "mean_absolute_percentage_error", + "symmetric_mean_absolute_percentage_error", + ], + index=1, ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load to ensure configuration was kept reloaded_model = arima_model.to_gbq( @@ -73,10 +84,10 @@ def test_arima_plus_model_fit_score( def test_arima_plus_model_fit_summary(dataset_id, arima_model): - - result = arima_model.summary() - assert result.shape == (1, 12) - assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) + result = arima_model.summary().to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=ARIMA_EVALUATE_OUTPUT_COL, index=1 + ) # save, load to ensure configuration was kept reloaded_model = arima_model.to_gbq( @@ -88,13 +99,13 @@ def test_arima_plus_model_fit_summary(dataset_id, arima_model): def test_arima_coefficients(arima_model): - got = arima_model.coef_ - expected_columns = { + result = arima_model.coef_.to_pandas() + expected_columns = [ "ar_coefficients", "ma_coefficients", "intercept_or_drift", - } - assert set(got.columns) == expected_columns + ] + utils.check_pandas_df_schema_and_index(result, columns=expected_columns, index=1) def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id): diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 99121e4a31..f508edfa9e 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
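Most predict() checks in this series pass `col_exact=False`, because BQML prediction output carries the input pass-through columns alongside the predicted one. A small sketch of the two matching modes, mirroring the helper added in tests/system/utils.py (frame contents are illustrative):

import pandas as pd

predictions = pd.DataFrame(
    {
        "predicted_body_mass_g": [4030.1, 3280.8, 3177.9],
        "culmen_length_mm": [39.5, 38.5, 37.9],  # pass-through feature column
    },
    index=pd.Index([1633, 1672, 1690], name="tag_number"),
)

# col_exact=True would demand an exact column list and fail on the extra
# pass-through column; col_exact=False only requires a subset:
assert set(["predicted_body_mass_g"]) <= set(predictions.columns)
# index given as a list pins the index values (and their order):
assert list(predictions.index) == [1633, 1672, 1690]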
-import pandas as pd - import bigframes.ml.linear_model +from tests.system import utils def test_linear_regression_configure_fit_score(penguins_df_default_index, dataset_id): @@ -36,19 +35,21 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [225.735767], - "mean_squared_error": [80417.461828], - "mean_squared_log_error": [0.004967], - "median_absolute_error": [172.543702], - "r2_score": [0.87548], - "explained_variance": [0.87548], - }, - dtype="Float64", + # expected = pd.DataFrame( + # { + # "mean_absolute_error": [225.735767], + # "mean_squared_error": [80417.461828], + # "mean_squared_log_error": [0.004967], + # "median_absolute_error": [172.543702], + # "r2_score": [0.87548], + # "explained_variance": [0.87548], + # }, + # dtype="Float64", + # ) + # expected = expected.reindex(index=expected.index.astype("Int64")) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) @@ -98,19 +99,21 @@ def test_linear_regression_customized_params_fit_score( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [240], - "mean_squared_error": [91197], - "mean_squared_log_error": [0.00573], - "median_absolute_error": [197], - "r2_score": [0.858], - "explained_variance": [0.8588], - }, - dtype="Float64", + # expected = pd.DataFrame( + # { + # "mean_absolute_error": [240], + # "mean_squared_error": [91197], + # "mean_squared_log_error": [0.00573], + # "median_absolute_error": [197], + # "r2_score": [0.858], + # "explained_variance": [0.8588], + # }, + # dtype="Float64", + # ) + # expected = expected.reindex(index=expected.index.astype("Int64")) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) @@ -152,19 +155,21 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "precision": [0.616753], - "recall": [0.618615], - "accuracy": [0.92515], - "f1_score": [0.617681], - "log_loss": [1.498832], - "roc_auc": [0.975807], - }, - dtype="Float64", + # expected = pd.DataFrame( + # { + # "precision": [0.616753], + # "recall": [0.618615], + # "accuracy": [0.92515], + # "f1_score": [0.617681], + # "log_loss": [1.498832], + # "roc_auc": [0.975807], + # }, + # dtype="Float64", + # ) + # expected = expected.reindex(index=expected.index.astype("Int64")) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check 
parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -207,19 +212,21 @@ def test_logistic_regression_customized_params_fit_score( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "precision": [0.487], - "recall": [0.602], - "accuracy": [0.464], - "f1_score": [0.379], - "log_loss": [0.972], - "roc_auc": [0.700], - }, - dtype="Float64", + # expected = pd.DataFrame( + # { + # "precision": [0.487], + # "recall": [0.602], + # "accuracy": [0.464], + # "f1_score": [0.379], + # "log_loss": [0.972], + # "roc_auc": [0.700], + # }, + # dtype="Float64", + # ) + # expected = expected.reindex(index=expected.index.astype("Int64")) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 1a92d0f7d4..1a241315bc 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -24,7 +24,7 @@ pipeline, preprocessing, ) -from tests.system.utils import assert_pandas_df_equal, assert_pandas_df_equal_pca +from tests.system import utils def test_pipeline_linear_regression_fit_score_predict( @@ -51,21 +51,21 @@ def test_pipeline_linear_regression_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "mean_absolute_error": [309.477331], - "mean_squared_error": [152184.227219], - "mean_squared_log_error": [0.009524], - "median_absolute_error": [257.728263], - "r2_score": [0.764356], - "explained_variance": [0.764356], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + # score_expected = pd.DataFrame( + # { + # "mean_absolute_error": [309.477331], + # "mean_squared_error": [152184.227219], + # "mean_squared_log_error": [0.009524], + # "median_absolute_error": [257.728263], + # "r2_score": [0.764356], + # "explained_variance": [0.764356], + # }, + # dtype="Float64", + # ) + # score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_REGRESSION_METRICS, index=1 ) # predict new labels @@ -87,13 +87,19 @@ def test_pipeline_linear_regression_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - {"predicted_body_mass_g": [3968.8, 3176.3, 3545.2]}, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + # expected = pd.DataFrame( + # {"predicted_body_mass_g": [3968.8, 3176.3, 3545.2]}, + # dtype="Float64", + # index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + # ) + # pd.testing.assert_frame_equal( + # predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + # ) + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + 
col_exact=False, ) @@ -115,21 +121,24 @@ def test_pipeline_linear_regression_series_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "mean_absolute_error": [528.495599], - "mean_squared_error": [421722.261808], - "mean_squared_log_error": [0.022963], - "median_absolute_error": [468.895249], - "r2_score": [0.346999], - "explained_variance": [0.346999], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + # score_expected = pd.DataFrame( + # { + # "mean_absolute_error": [528.495599], + # "mean_squared_error": [421722.261808], + # "mean_squared_log_error": [0.022963], + # "median_absolute_error": [468.895249], + # "r2_score": [0.346999], + # "explained_variance": [0.346999], + # }, + # dtype="Float64", + # ) + # score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + # pd.testing.assert_frame_equal( + # score_result, score_expected, check_exact=False, rtol=0.1 + # ) + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_REGRESSION_METRICS, index=1 ) # predict new labels @@ -142,13 +151,19 @@ def test_pipeline_linear_regression_series_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins["culmen_length_mm"]).to_pandas() - expected = pd.DataFrame( - {"predicted_body_mass_g": [3818.845703, 3732.022253, 3679.928123]}, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + # expected = pd.DataFrame( + # {"predicted_body_mass_g": [3818.845703, 3732.022253, 3679.928123]}, + # dtype="Float64", + # index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + # ) + # pd.testing.assert_frame_equal( + # predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + # ) + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) @@ -176,21 +191,24 @@ def test_pipeline_logistic_regression_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "precision": [0.537091], - "recall": [0.538636], - "accuracy": [0.805389], - "f1_score": [0.537716], - "log_loss": [1.445433], - "roc_auc": [0.917818], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + # score_expected = pd.DataFrame( + # { + # "precision": [0.537091], + # "recall": [0.538636], + # "accuracy": [0.805389], + # "f1_score": [0.537716], + # "log_loss": [1.445433], + # "roc_auc": [0.917818], + # }, + # dtype="Float64", + # ) + # score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + # pd.testing.assert_frame_equal( + # score_result, score_expected, check_exact=False, rtol=0.1 + # ) + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) # predict new labels @@ -211,19 +229,23 @@ def test_pipeline_logistic_regression_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() 
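The `index` argument in these calls is overloaded: an int is the expected row count, while an iterable is matched against the index values themselves. A sketch of that dispatch, consistent with the helper definition at the end of this patch:

from typing import Iterable, Union

import pandas as pd

def _check_index(actual: pd.DataFrame, index: Union[int, Iterable]) -> None:
    # int -> row-count check (score/evaluate results); iterable -> exact
    # index-value check (predict/transform results).
    if isinstance(index, int):
        assert len(actual) == index
    elif isinstance(index, Iterable):
        assert list(actual.index) == list(index)
    else:
        raise ValueError("Unsupported index type.")

_check_index(pd.DataFrame({"m": [0.5]}), 1)                          # one metrics row
_check_index(pd.DataFrame({"p": [1, 2]}, index=[10, 20]), [10, 20])  # exact index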
- expected = pd.DataFrame( - {"predicted_sex": ["MALE", "FEMALE", "FEMALE"]}, - dtype=pd.StringDtype(storage="pyarrow"), - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_sex"]], - expected, + # expected = pd.DataFrame( + # {"predicted_sex": ["MALE", "FEMALE", "FEMALE"]}, + # dtype=pd.StringDtype(storage="pyarrow"), + # index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + # ) + # pd.testing.assert_frame_equal( + # predictions[["predicted_sex"]], + # expected, + # ) + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_sex"], + index=[1633, 1672, 1690], + col_exact=False, ) -# TODO(garrettwu): Re-enable or not check exact numbers. -@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" @@ -247,21 +269,8 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "mean_absolute_error": [202.298434], - "mean_squared_error": [74515.108971], - "mean_squared_log_error": [0.004365], - "median_absolute_error": [142.949219], - "r2_score": [0.88462], - "explained_variance": [0.886454], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_REGRESSION_METRICS, index=1 ) # predict new labels @@ -283,24 +292,14 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "predicted_body_mass_g": [ - 4287.34521484375, - 3198.351806640625, - 3385.34130859375, - ] - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) -# TODO(garrettwu): Re-enable or not check exact numbers. 
-@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_pipeline_random_forest_classifier_fit_score_predict( session, penguins_df_default_index @@ -326,21 +325,8 @@ def test_pipeline_random_forest_classifier_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "precision": [0.585505], - "recall": [0.58676], - "accuracy": [0.877246], - "f1_score": [0.585657], - "log_loss": [0.880643], - "roc_auc": [0.970697], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) # predict new labels @@ -361,14 +347,11 @@ def test_pipeline_random_forest_classifier_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - {"predicted_sex": ["MALE", "FEMALE", "FEMALE"]}, - dtype=pd.StringDtype(storage="pyarrow"), - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_sex"]], - expected, + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_sex"], + index=[1633, 1672, 1690], + col_exact=False, ) @@ -412,40 +395,20 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): # Check score to ensure the model was fitted score_result = pl.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - { - "total_explained_variance_ratio": [1.0], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=["total_explained_variance_ratio"], index=1 ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "principal_component_1": [-1.115259, -1.506141, -1.471173], - "principal_component_2": [-0.074825, 0.69664, 0.406103], - "principal_component_3": [0.500013, -0.544479, 0.075849], - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - - assert_pandas_df_equal_pca( - predictions[ - [ - "principal_component_1", - "principal_component_2", - "principal_component_3", - ] + utils.check_pandas_df_schema_and_index( + predictions, + columns=[ + "principal_component_1", + "principal_component_2", + "principal_component_3", ], - expected, - check_exact=False, - rtol=0.1, + index=[1633, 1672, 1690], + col_exact=False, ) @@ -538,29 +501,16 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - {"davies_bouldin_index": [7.542981], "mean_squared_distance": [94.692409]}, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 - ) + eval_metrics = ["davies_bouldin_index", "mean_squared_distance"] + utils.check_pandas_df_schema_and_index(score_result, columns=eval_metrics, index=1) predictions = pl.predict(new_penguins).to_pandas().sort_index() - assert predictions.shape == (6, 9) - 
result = predictions[["CENTROID_ID"]] - expected = pd.DataFrame( - {"CENTROID_ID": [1, 2, 1, 2, 1, 2]}, - dtype="Int64", - index=pd.Index( - ["test1", "test2", "test3", "test4", "test5", "test6"], - dtype="string[pyarrow]", - ), + utils.check_pandas_df_schema_and_index( + predictions, + columns=["CENTROID_ID"], + index=["test1", "test2", "test3", "test4", "test5", "test6"], + col_exact=False, ) - expected.index.name = "observation" - assert_pandas_df_equal(result, expected, ignore_order=True) def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index): @@ -632,13 +582,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - {"predicted_body_mass_g": [3909.2, 3436.0, 2860.0]}, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) diff --git a/tests/system/utils.py b/tests/system/utils.py index e40502e6f2..75897b7942 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -15,7 +15,7 @@ import base64 import decimal import functools -from typing import Iterable, Optional, Set +from typing import Iterable, Optional, Set, Union import geopandas as gpd # type: ignore import google.api_core.operation @@ -28,6 +28,23 @@ from bigframes.functions import remote_function +ML_REGRESSION_METRICS = [ + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + "explained_variance", +] +ML_CLASSFICATION_METRICS = [ + "precision", + "recall", + "accuracy", + "f1_score", + "log_loss", + "roc_auc", +] + def skip_legacy_pandas(test): @functools.wraps(test) @@ -249,6 +266,26 @@ def assert_pandas_df_equal_pca(actual, expected, **kwargs): pd.testing.assert_series_equal(-actual[column], expected[column], **kwargs) +def check_pandas_df_schema_and_index( + actual: pd.DataFrame, + columns: Iterable, + index: Union[int, Iterable], + col_exact: bool = True, +): + """Check pandas df schema and index. 
But not the values.""" + if col_exact: + assert list(actual.columns) == list(columns) + else: + assert set(columns) <= set(actual.columns) + + if isinstance(index, int): + assert len(actual) == index + elif isinstance(index, Iterable): + assert list(actual.index) == list(index) + else: + raise ValueError("Unsupported index type.") + + def get_remote_function_endpoints( bigquery_client: bigquery.Client, dataset_id: str ) -> Set[str]: From c8846f117f6c8a01cddc69c8514b432c0827be91 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 15 May 2024 17:41:23 +0000 Subject: [PATCH 2/2] clean up --- tests/system/large/ml/test_ensemble.py | 190 ++++----------------- tests/system/large/ml/test_forecasting.py | 11 -- tests/system/large/ml/test_linear_model.py | 48 ------ tests/system/large/ml/test_pipeline.py | 70 -------- tests/system/utils.py | 19 ++- 5 files changed, 43 insertions(+), 295 deletions(-) diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index b8adfb36b2..36c0e6cb17 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -18,6 +18,7 @@ from tests.system import utils +@pytest.mark.flaky(retries=2) def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBRegressor() @@ -37,28 +38,9 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # expected = pandas.DataFrame( - # { - # "mean_absolute_error": [97.368139], - # "mean_squared_error": [16284.877027], - # "mean_squared_log_error": [0.0010189], - # "median_absolute_error": [72.158691], - # "r2_score": [0.974784], - # "explained_variance": [0.974845], - # }, - # dtype="Float64", - # ) - # expected = expected.reindex(index=expected.index.astype("Int64")) - # pandas.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) - eval_metrics = [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -70,7 +52,7 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): ) -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_xgbregressor_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -107,25 +89,9 @@ def test_xgbregressor_dart_booster_multiple_params( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # TestCase().assertSequenceEqual(result.shape, (1, 6)) - # for col_name in [ - # "mean_absolute_error", - # "mean_squared_error", - # "mean_squared_log_error", - # "median_absolute_error", - # "r2_score", - # "explained_variance", - # ]: - # assert col_name in result.columns - eval_metrics = [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = 
model.to_gbq( @@ -153,7 +119,7 @@ def test_xgbregressor_dart_booster_multiple_params( assert reloaded_model.n_estimators == 2 -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBClassifier() @@ -172,25 +138,9 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # TestCase().assertSequenceEqual(result.shape, (1, 6)) - # for col_name in [ - # "precision", - # "recall", - # "accuracy", - # "f1_score", - # "log_loss", - # "roc_auc", - # ]: - # assert col_name in result.columns - eval_metrics = [ - "precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -238,25 +188,9 @@ def test_xgbclassifier_dart_booster_multiple_params( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # TestCase().assertSequenceEqual(result.shape, (1, 6)) - # for col_name in [ - # "precision", - # "recall", - # "accuracy", - # "f1_score", - # "log_loss", - # "roc_auc", - # ]: - # assert col_name in result.columns - eval_metrics = [ - "precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -284,7 +218,7 @@ def test_xgbclassifier_dart_booster_multiple_params( assert reloaded_model.n_estimators == 2 -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_randomforestregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor() @@ -304,25 +238,9 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # TestCase().assertSequenceEqual(result.shape, (1, 6)) - # for col_name in [ - # "mean_absolute_error", - # "mean_squared_error", - # "mean_squared_log_error", - # "median_absolute_error", - # "r2_score", - # "explained_variance", - # ]: - # assert col_name in result.columns - eval_metrics = [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -334,7 +252,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset ) -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_randomforestregressor_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor( tree_method="auto", @@ -366,25 +284,9 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase # Check 
score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # TestCase().assertSequenceEqual(result.shape, (1, 6)) - # for col_name in [ - # "mean_absolute_error", - # "mean_squared_error", - # "mean_squared_log_error", - # "median_absolute_error", - # "r2_score", - # "explained_variance", - # ]: - # assert col_name in result.columns - eval_metrics = [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -409,7 +311,7 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase assert reloaded_model.enable_global_explain is False -# @pytest.mark.flaky(retries=2) +@pytest.mark.flaky(retries=2) def test_randomforestclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestClassifier() @@ -428,25 +330,9 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # TestCase().assertSequenceEqual(result.shape, (1, 6)) - # for col_name in [ - # "precision", - # "recall", - # "accuracy", - # "f1_score", - # "log_loss", - # "roc_auc", - # ]: - # assert col_name in result.columns - eval_metrics = [ - "precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( @@ -489,25 +375,9 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - # TestCase().assertSequenceEqual(result.shape, (1, 6)) - # for col_name in [ - # "precision", - # "recall", - # "accuracy", - # "f1_score", - # "log_loss", - # "roc_auc", - # ]: - # assert col_name in result.columns - eval_metrics = [ - "precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ] - utils.check_pandas_df_schema_and_index(result, columns=eval_metrics, index=1) + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 74ba12c6c6..57137ea64f 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -51,17 +51,6 @@ def test_arima_plus_model_fit_score( result = arima_model.score( new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] ).to_pandas() - # expected = pd.DataFrame( - # { - # "mean_absolute_error": [154.742547], - # "mean_squared_error": [26844.868855], - # "root_mean_squared_error": [163.844038], - # "mean_absolute_percentage_error": [6.189702], - # "symmetric_mean_absolute_percentage_error": [6.097155], - # }, - # dtype="Float64", - # ) - # expected = expected.reindex(index=expected.index.astype("Int64")) 
     utils.check_pandas_df_schema_and_index(
         result,
         columns=[
diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py
index f508edfa9e..69193adfa7 100644
--- a/tests/system/large/ml/test_linear_model.py
+++ b/tests/system/large/ml/test_linear_model.py
@@ -35,18 +35,6 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase

     # Check score to ensure the model was fitted
     result = model.score(X_train, y_train).to_pandas()
-    # expected = pd.DataFrame(
-    #     {
-    #         "mean_absolute_error": [225.735767],
-    #         "mean_squared_error": [80417.461828],
-    #         "mean_squared_log_error": [0.004967],
-    #         "median_absolute_error": [172.543702],
-    #         "r2_score": [0.87548],
-    #         "explained_variance": [0.87548],
-    #     },
-    #     dtype="Float64",
-    # )
-    # expected = expected.reindex(index=expected.index.astype("Int64"))
     utils.check_pandas_df_schema_and_index(
         result, columns=utils.ML_REGRESSION_METRICS, index=1
     )
@@ -99,18 +87,6 @@ def test_linear_regression_customized_params_fit_score(

     # Check score to ensure the model was fitted
     result = model.score(X_train, y_train).to_pandas()
-    # expected = pd.DataFrame(
-    #     {
-    #         "mean_absolute_error": [240],
-    #         "mean_squared_error": [91197],
-    #         "mean_squared_log_error": [0.00573],
-    #         "median_absolute_error": [197],
-    #         "r2_score": [0.858],
-    #         "explained_variance": [0.8588],
-    #     },
-    #     dtype="Float64",
-    # )
-    # expected = expected.reindex(index=expected.index.astype("Int64"))
     utils.check_pandas_df_schema_and_index(
         result, columns=utils.ML_REGRESSION_METRICS, index=1
     )
@@ -155,18 +131,6 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data

     # Check score to ensure the model was fitted
     result = model.score(X_train, y_train).to_pandas()
-    # expected = pd.DataFrame(
-    #     {
-    #         "precision": [0.616753],
-    #         "recall": [0.618615],
-    #         "accuracy": [0.92515],
-    #         "f1_score": [0.617681],
-    #         "log_loss": [1.498832],
-    #         "roc_auc": [0.975807],
-    #     },
-    #     dtype="Float64",
-    # )
-    # expected = expected.reindex(index=expected.index.astype("Int64"))
     utils.check_pandas_df_schema_and_index(
         result, columns=utils.ML_CLASSFICATION_METRICS, index=1
     )
@@ -212,18 +176,6 @@ def test_logistic_regression_customized_params_fit_score(

     # Check score to ensure the model was fitted
     result = model.score(X_train, y_train).to_pandas()
-    # expected = pd.DataFrame(
-    #     {
-    #         "precision": [0.487],
-    #         "recall": [0.602],
-    #         "accuracy": [0.464],
-    #         "f1_score": [0.379],
-    #         "log_loss": [0.972],
-    #         "roc_auc": [0.700],
-    #     },
-    #     dtype="Float64",
-    # )
-    # expected = expected.reindex(index=expected.index.astype("Int64"))
     utils.check_pandas_df_schema_and_index(
         result, columns=utils.ML_CLASSFICATION_METRICS, index=1
     )
diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py
index 1a241315bc..3d7eb2e426 100644
--- a/tests/system/large/ml/test_pipeline.py
+++ b/tests/system/large/ml/test_pipeline.py
@@ -51,19 +51,6 @@ def test_pipeline_linear_regression_fit_score_predict(

     # Check score to ensure the model was fitted
     score_result = pl.score(X_train, y_train).to_pandas()
-    # score_expected = pd.DataFrame(
-    #     {
-    #         "mean_absolute_error": [309.477331],
-    #         "mean_squared_error": [152184.227219],
-    #         "mean_squared_log_error": [0.009524],
-    #         "median_absolute_error": [257.728263],
-    #         "r2_score": [0.764356],
-    #         "explained_variance": [0.764356],
-    #     },
-    #     dtype="Float64",
-    # )
-    # score_expected = score_expected.reindex(index=score_expected.index.astype("Int64"))
-
     utils.check_pandas_df_schema_and_index(
         score_result, columns=utils.ML_REGRESSION_METRICS, index=1
     )
@@ -87,14 +74,6 @@ def test_pipeline_linear_regression_fit_score_predict(
         ).set_index("tag_number")
     )
     predictions = pl.predict(new_penguins).to_pandas()
-    # expected = pd.DataFrame(
-    #     {"predicted_body_mass_g": [3968.8, 3176.3, 3545.2]},
-    #     dtype="Float64",
-    #     index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
-    # )
-    # pd.testing.assert_frame_equal(
-    #     predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1
-    # )
     utils.check_pandas_df_schema_and_index(
         predictions,
         columns=["predicted_body_mass_g"],
@@ -121,22 +100,6 @@ def test_pipeline_linear_regression_series_fit_score_predict(

     # Check score to ensure the model was fitted
     score_result = pl.score(X_train, y_train).to_pandas()
-    # score_expected = pd.DataFrame(
-    #     {
-    #         "mean_absolute_error": [528.495599],
-    #         "mean_squared_error": [421722.261808],
-    #         "mean_squared_log_error": [0.022963],
-    #         "median_absolute_error": [468.895249],
-    #         "r2_score": [0.346999],
-    #         "explained_variance": [0.346999],
-    #     },
-    #     dtype="Float64",
-    # )
-    # score_expected = score_expected.reindex(index=score_expected.index.astype("Int64"))
-
-    # pd.testing.assert_frame_equal(
-    #     score_result, score_expected, check_exact=False, rtol=0.1
-    # )
     utils.check_pandas_df_schema_and_index(
         score_result, columns=utils.ML_REGRESSION_METRICS, index=1
     )
@@ -151,14 +114,6 @@ def test_pipeline_linear_regression_series_fit_score_predict(
         ).set_index("tag_number")
     )
     predictions = pl.predict(new_penguins["culmen_length_mm"]).to_pandas()
-    # expected = pd.DataFrame(
-    #     {"predicted_body_mass_g": [3818.845703, 3732.022253, 3679.928123]},
-    #     dtype="Float64",
-    #     index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
-    # )
-    # pd.testing.assert_frame_equal(
-    #     predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1
-    # )
     utils.check_pandas_df_schema_and_index(
         predictions,
         columns=["predicted_body_mass_g"],
@@ -191,22 +146,6 @@ def test_pipeline_logistic_regression_fit_score_predict(

     # Check score to ensure the model was fitted
     score_result = pl.score(X_train, y_train).to_pandas()
-    # score_expected = pd.DataFrame(
-    #     {
-    #         "precision": [0.537091],
-    #         "recall": [0.538636],
-    #         "accuracy": [0.805389],
-    #         "f1_score": [0.537716],
-    #         "log_loss": [1.445433],
-    #         "roc_auc": [0.917818],
-    #     },
-    #     dtype="Float64",
-    # )
-    # score_expected = score_expected.reindex(index=score_expected.index.astype("Int64"))
-
-    # pd.testing.assert_frame_equal(
-    #     score_result, score_expected, check_exact=False, rtol=0.1
-    # )
     utils.check_pandas_df_schema_and_index(
         score_result, columns=utils.ML_CLASSFICATION_METRICS, index=1
     )
@@ -229,15 +168,6 @@ def test_pipeline_logistic_regression_fit_score_predict(
         ).set_index("tag_number")
     )
     predictions = pl.predict(new_penguins).to_pandas()
-    # expected = pd.DataFrame(
-    #     {"predicted_sex": ["MALE", "FEMALE", "FEMALE"]},
-    #     dtype=pd.StringDtype(storage="pyarrow"),
-    #     index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
-    # )
-    # pd.testing.assert_frame_equal(
-    #     predictions[["predicted_sex"]],
-    #     expected,
-    # )
     utils.check_pandas_df_schema_and_index(
         predictions,
         columns=["predicted_sex"],
diff --git a/tests/system/utils.py b/tests/system/utils.py
index 75897b7942..ab4c2c119f 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -267,21 +267,28 @@ def assert_pandas_df_equal_pca(actual, expected, **kwargs):


 def check_pandas_df_schema_and_index(
-    actual: pd.DataFrame,
+    pd_df: pd.DataFrame,
     columns: Iterable,
     index: Union[int, Iterable],
     col_exact: bool = True,
 ):
-    """Check pandas df schema and index. But not the values."""
+    """Check a pandas DataFrame's schema and index, but not its values.
+
+    Args:
+        pd_df: the pandas DataFrame to check
+        columns: the expected columns
+        index: int or Iterable. If int, check only the length (index size) of the DataFrame. If Iterable, check that the index values match.
+        col_exact: If True, require the DataFrame's columns to exactly match the columns param. Otherwise, only check that the DataFrame contains all of those columns.
+    """
     if col_exact:
-        assert list(actual.columns) == list(columns)
+        assert list(pd_df.columns) == list(columns)
     else:
-        assert set(columns) <= set(actual.columns)
+        assert set(columns) <= set(pd_df.columns)

     if isinstance(index, int):
-        assert len(actual) == index
+        assert len(pd_df) == index
     elif isinstance(index, Iterable):
-        assert list(actual.index) == list(index)
+        assert list(pd_df.index) == list(index)
     else:
         raise ValueError("Unsupported index type.")
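
A minimal usage sketch of the relaxed helper, using the same call shapes that
appear in the tests above; `model`, `X_train`, `y_train`, `new_penguins`, and
the penguin tag numbers are illustrative stand-ins for each test's fixtures:

    # Metrics: assert the exact metric columns and a single result row,
    # without pinning the model-dependent metric values.
    score_result = model.score(X_train, y_train).to_pandas()
    utils.check_pandas_df_schema_and_index(
        score_result, columns=utils.ML_REGRESSION_METRICS, index=1
    )

    # Predictions: assert a required subset of columns plus exact index values.
    predictions = model.predict(new_penguins).to_pandas()
    utils.check_pandas_df_schema_and_index(
        predictions,
        columns=["predicted_body_mass_g"],
        index=[1633, 1672, 1690],
        col_exact=False,
    )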