From 3baa0c5594efae066e0af331928b78d051419162 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Wed, 27 Nov 2024 15:51:55 -0600
Subject: [PATCH 01/10] feat: add LinearRegression.predict_explain to generate
 predict explain columns

---
 bigframes/ml/core.py         | 6 ++++++
 bigframes/ml/linear_model.py | 8 ++++++++
 bigframes/ml/sql.py          | 5 +++++
 3 files changed, 19 insertions(+)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index be67396fba..2d3f567816 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -123,6 +123,12 @@ def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
             self._model_manipulation_sql_generator.ml_predict,
         )
 
+    def explain_predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
+        return self._apply_ml_tvf(
+            input_data,
+            self._model_manipulation_sql_generator.ml_explain_predict,
+        )
+
     def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
         return self._apply_ml_tvf(
             input_data,
diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index ae4e1944cc..59908010ca 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -160,6 +160,14 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
 
         return self._bqml_model.predict(X)
 
+    def predict_explain(self, X: utils.ArrayType) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before predict")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        return self._bqml_model.explain_predict(X)
+
     def score(
         self,
         X: utils.ArrayType,
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index b7d550ac63..0a0e79e7ce 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -304,6 +304,11 @@ def ml_predict(self, source_sql: str) -> str:
         return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()},
   ({source_sql}))"""
 
+    def ml_explain_predict(self, source_sql: str) -> str:
+        """Encode ML.EXPLAIN_PREDICT for BQML"""
+        return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()},
+  ({source_sql}))"""
+
     def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str:
         """Encode ML.FORECAST for BQML"""
         struct_options_sql = self.struct_options(**struct_options)

From 00e89ac1c218d02575198b42452c5ef6b4ad0d93 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Mon, 2 Dec 2024 17:32:01 -0600
Subject: [PATCH 02/10] add test cases

---
 tests/system/small/ml/test_linear_model.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index 218c1074ab..10eb09de59 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -106,6 +106,23 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df):
     )
 
 
+def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df):
+    predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas()
+    assert predictions.shape == (3, 12)
+    result = predictions[["predicted_body_mass_g"]]
+    expected = pandas.DataFrame(
+        {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]},
+        dtype="Float64",
+        index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+    pandas.testing.assert_frame_equal(
+        result.sort_index(),
+        expected,
+        check_exact=False,
+        rtol=0.1,
+    )
+
+
 def test_to_gbq_saved_linear_reg_model_scores(
     penguins_linear_model, dataset_id, penguins_df_default_index
 ):

From e7372217ae6a4396a9a7cb98c513fc559475204f Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Thu, 5 Dec 2024 11:40:33 -0600
Subject: [PATCH 03/10] add test case

---
 tests/system/small/ml/test_linear_model.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index 10eb09de59..78c515c6aa 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -109,9 +109,12 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df):
 def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df):
     predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas()
     assert predictions.shape == (3, 12)
-    result = predictions[["predicted_body_mass_g"]]
+    result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]]
     expected = pandas.DataFrame(
-        {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]},
+        {
+            "predicted_body_mass_g": [4030.1, 3280.8, 3177.9],
+            "baseline_prediction_value": [9362.7, 9362.7, 9362.7],
+        },
         dtype="Float64",
         index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )

From e58a0e724179acec7c257f993fade03b0b05b4b4 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Thu, 5 Dec 2024 14:02:51 -0600
Subject: [PATCH 04/10] update predict_explain

---
 bigframes/ml/linear_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 59908010ca..513abd60c5 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -164,7 +164,7 @@ def predict_explain(self, X: utils.ArrayType) -> bpd.DataFrame:
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
-        (X,) = utils.convert_to_dataframe(X)
+        (X,) = utils.batch_convert_to_dataframe(X)
 
         return self._bqml_model.explain_predict(X)
 

From cea6526c4ac4ab442af1ec62582471e43627e40b Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Tue, 10 Dec 2024 13:39:07 -0600
Subject: [PATCH 05/10] update the test

---
 tests/system/small/ml/test_linear_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index 78c515c6aa..ed0a79ac59 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -109,11 +109,11 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df):
 def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df):
     predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas()
     assert predictions.shape == (3, 12)
-    result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]]
+    result = predictions[["predicted_body_mass_g", "approximation_error"]]
     expected = pandas.DataFrame(
         {
             "predicted_body_mass_g": [4030.1, 3280.8, 3177.9],
-            "baseline_prediction_value": [9362.7, 9362.7, 9362.7],
+            "approximation_error": [0.0, 0.0, 0.0],
         },
         dtype="Float64",
         index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),

From 362dd7b2b901002ffe75590026b121d741351c65 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Mon, 16 Dec 2024 10:16:09 -0600
Subject: [PATCH 06/10] Add sql and core tests

---
 bigframes/ml/linear_model.py               | 23 ++++++++-
 tests/system/small/ml/test_core.py         | 59 ++++++++++++++++++++++
 tests/system/small/ml/test_linear_model.py | 48 +++++++++++++++++-
 tests/unit/ml/test_sql.py                  | 12 +++++
 4 files changed, 138 insertions(+), 4 deletions(-)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 513abd60c5..230b0ef61e 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -160,11 +160,30 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
 
         return self._bqml_model.predict(X)
 
-    def predict_explain(self, X: utils.ArrayType) -> bpd.DataFrame:
+    def predict_explain(
+        self,
+        X: utils.ArrayType,
+    ) -> bpd.DataFrame:
+        """
+        Explain predictions for a linear regression model.
+
+        .. note::
+
+        Output matches that of the BigQuery ML.EXPLAIN_PREDICT function.
+        See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
+                Series or a DataFrame to explain its predictions.
+
+        Returns:
+            bigframes.pandas.DataFrame:
+                The predicted DataFrames with explanation columns.
+        """
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
-        (X,) = utils.batch_convert_to_dataframe(X)
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
         return self._bqml_model.explain_predict(X)
 
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index 30b75f502d..62ab4403df 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -261,6 +261,30 @@ def test_model_predict(penguins_bqml_linear_model: core.BqmlModel, new_penguins_
     )
 
 
+def test_model_predict_explain(
+    penguins_bqml_linear_model: core.BqmlModel, new_penguins_df
+):
+    predictions = penguins_bqml_linear_model.explain_predict(
+        new_penguins_df
+    ).to_pandas()
+    expected = pd.DataFrame(
+        {
+            "predicted_body_mass_g": [4030.1, 3280.8, 3177.9],
+            "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        predictions[
+            ["predicted_body_mass_g", "baseline_prediction_value"]
+        ].sort_index(),
+        expected,
+        check_exact=False,
+        rtol=0.1,
+    )
+
+
 def test_model_predict_with_unnamed_index(
     penguins_bqml_linear_model: core.BqmlModel, new_penguins_df
 ):
@@ -289,6 +313,41 @@ def test_model_predict_with_unnamed_index(
     )
 
 
+def test_model_predict_explain_with_unnamed_index(
+    penguins_bqml_linear_model: core.BqmlModel, new_penguins_df
+):
+    # This will result in an index that lacks a name, which the ML library will
+    # need to persist through the call to ML.EXPLAIN_PREDICT
+    new_penguins_df = new_penguins_df.reset_index()
+
+    # remove the middle tag number to ensure we're really keeping the unnamed index
+    new_penguins_df = typing.cast(
+        bigframes.dataframe.DataFrame,
+        new_penguins_df[new_penguins_df.tag_number != 1672],
+    )
+
+    predictions = penguins_bqml_linear_model.explain_predict(
+        new_penguins_df
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "predicted_body_mass_g": [4030.1, 3177.9],
+            "baseline_prediction_value": [9362.692906, 9362.692906],
+        },
+        dtype="Float64",
+        index=pd.Index([0, 2], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        predictions[
+            ["predicted_body_mass_g", "baseline_prediction_value"]
+        ].sort_index(),
+        expected,
+        check_exact=False,
+        rtol=0.1,
+    )
+
+
 def test_model_detect_anomalies(
     penguins_bqml_pca_model: core.BqmlModel, new_penguins_df
 ):
diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index d7e5c5ac86..84e4a7682d 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -16,6 +16,8 @@
 import pandas
 import pytest
 
+from bigframes.ml import linear_model
+
 
 def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index):
     df = penguins_df_default_index.dropna()
@@ -109,11 +111,11 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df):
 def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df):
     predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas()
     assert predictions.shape == (3, 12)
-    result = predictions[["predicted_body_mass_g", "approximation_error"]]
+    result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]]
     expected = pandas.DataFrame(
         {
             "predicted_body_mass_g": [4030.1, 3280.8, 3177.9],
-            "approximation_error": [0.0, 0.0, 0.0],
+            "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906],
         },
         dtype="Float64",
         index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
@@ -126,6 +128,48 @@ def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df
     )
 
 
+def test_linear_reg_model_predict_params(
+    penguins_linear_model: linear_model.LinearRegression, new_penguins_df
+):
+    predictions = penguins_linear_model.predict(new_penguins_df).to_pandas()
+    assert predictions.shape[0] >= 1
+    prediction_columns = set(predictions.columns)
+    expected_columns = {
+        "predicted_body_mass_g",
+        "species",
+        "island",
+        "culmen_length_mm",
+        "culmen_depth_mm",
+        "flipper_length_mm",
+        "body_mass_g",
+        "sex",
+    }
+    assert expected_columns <= prediction_columns
+
+
+def test_linear_reg_model_predict_explain_params(
+    penguins_linear_model: linear_model.LinearRegression, new_penguins_df
+):
+    predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas()
+    assert predictions.shape[0] >= 1
+    prediction_columns = set(predictions.columns)
+    expected_columns = {
+        "predicted_body_mass_g",
+        "top_feature_attributions",
+        "baseline_prediction_value",
+        "prediction_value",
+        "approximation_error",
+        "species",
+        "island",
+        "culmen_length_mm",
+        "culmen_depth_mm",
+        "flipper_length_mm",
+        "body_mass_g",
+        "sex",
+    }
+    assert expected_columns <= prediction_columns
+
+
 def test_to_gbq_saved_linear_reg_model_scores(
     penguins_linear_model, table_id_unique, penguins_df_default_index
 ):
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index ee0821dfe9..9d18649efe 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -342,6 +342,18 @@ def test_ml_predict_correct(
     )
 
 
+def test_ml_explain_predict_correct(
+    model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
+    mock_df: bpd.DataFrame,
+):
+    sql = model_manipulation_sql_generator.ml_explain_predict(source_sql=mock_df.sql)
+    assert (
+        sql
+        == """SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`,
+  (input_X_y_sql))"""
+    )
+
+
 def test_ml_llm_evaluate_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,

From fb2d02f7b6e0f925f2bb069753d763ba622c3f1e Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Mon, 16 Dec 2024 10:33:38 -0600
Subject: [PATCH 07/10] fix docs error

---
 bigframes/ml/linear_model.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 230b0ef61e..5b2e745af1 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -168,12 +168,12 @@ def predict_explain(
         Explain predictions for a linear regression model.
 
         .. note::
-
-        Output matches that of the BigQuery ML.EXPLAIN_PREDICT function.
-        See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict
+            Output matches that of the BigQuery ML.EXPLAIN_PREDICT function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict
 
         Args:
-            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series or
+            pandas.core.frame.DataFrame or pandas.core.series.Series):
                 Series or a DataFrame to explain its predictions.
 
         Returns:

From 3a784bd8b101441ce2ea68b1eafa03896e64782c Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Mon, 16 Dec 2024 11:46:54 -0600
Subject: [PATCH 08/10] add TODO comment to support method parameters

---
 bigframes/ml/linear_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 5b2e745af1..23feca706c 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -180,6 +180,7 @@ def predict_explain(
             bigframes.pandas.DataFrame:
                 The predicted DataFrames with explanation columns.
         """
+        # TODO(b/377366612): Add support for `to_k_features` parameter
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 

From 81c3cdbf1be098c630abf39eb3dc854b22d9af8c Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Mon, 16 Dec 2024 12:21:10 -0600
Subject: [PATCH 09/10] update the test parameter of linear model

---
 tests/system/small/ml/test_linear_model.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index 84e4a7682d..0832c559c1 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -111,11 +111,15 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df):
 def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df):
     predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas()
     assert predictions.shape == (3, 12)
-    result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]]
+    result = predictions[["predicted_body_mass_g", "approximation_error"]]
     expected = pandas.DataFrame(
         {
             "predicted_body_mass_g": [4030.1, 3280.8, 3177.9],
-            "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906],
+            "approximation_error": [
+                0.0,
+                0.0,
+                0.0,
+            ],
         },
         dtype="Float64",
         index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),

From c4a1beb3d5a9dc5b8bb85398f0da156fa8ab8c2d Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Mon, 16 Dec 2024 15:04:06 -0600
Subject: [PATCH 10/10] update test to fix failing checks

---
 bigframes/ml/linear_model.py       |  2 +-
 tests/system/small/ml/test_core.py | 12 ++++--------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 23feca706c..1a1a5e0ca0 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -180,7 +180,7 @@ def predict_explain(
             bigframes.pandas.DataFrame:
                 The predicted DataFrames with explanation columns.
         """
-        # TODO(b/377366612): Add support for `to_k_features` parameter
+        # TODO(b/377366612): Add support for `top_k_features` parameter
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index 62ab4403df..3ea31353b1 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -270,15 +270,13 @@ def test_model_predict_explain(
     expected = pd.DataFrame(
         {
             "predicted_body_mass_g": [4030.1, 3280.8, 3177.9],
-            "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906],
+            "approximation_error": [0.0, 0.0, 0.0],
         },
         dtype="Float64",
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
     pd.testing.assert_frame_equal(
-        predictions[
-            ["predicted_body_mass_g", "baseline_prediction_value"]
-        ].sort_index(),
+        predictions[["predicted_body_mass_g", "approximation_error"]].sort_index(),
         expected,
         check_exact=False,
         rtol=0.1,
@@ -333,15 +331,13 @@ def test_model_predict_explain_with_unnamed_index(
     expected = pd.DataFrame(
         {
             "predicted_body_mass_g": [4030.1, 3177.9],
-            "baseline_prediction_value": [9362.692906, 9362.692906],
+            "approximation_error": [0.0, 0.0],
         },
         dtype="Float64",
         index=pd.Index([0, 2], dtype="Int64"),
     )
     pd.testing.assert_frame_equal(
-        predictions[
-            ["predicted_body_mass_g", "baseline_prediction_value"]
-        ].sort_index(),
+        predictions[["predicted_body_mass_g", "approximation_error"]].sort_index(),
         expected,
         check_exact=False,
         rtol=0.1,