From d333fdd36cd3e836e693191a97ad9817d7c552b6 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Mon, 16 Dec 2024 14:55:21 -0600
Subject: [PATCH 1/7] feat: add LogisticRegression.predict_explain() to
 generate ML.EXPLAIN_PREDICT columns

---
 bigframes/ml/linear_model.py               | 28 ++++++++
 tests/system/small/ml/test_linear_model.py | 77 ++++++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 1a1a5e0ca0..eac0fd1fca 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -353,6 +353,34 @@ def predict(
 
         return self._bqml_model.predict(X)
 
+    def predict_explain(
+        self,
+        X: utils.ArrayType,
+    ) -> bpd.DataFrame:
+        """
+        Explain predictions for a logistic regression model.
+
+        .. note::
+            Output matches that of the BigQuery ML.EXPLAIN_PREDICT function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series or
+            pandas.core.frame.DataFrame or pandas.core.series.Series):
+                Series or a DataFrame to explain its predictions.
+
+        Returns:
+            bigframes.pandas.DataFrame:
+                The predicted DataFrames with explanation columns.
+        """
+        # TODO(b/377366612): Add support for `top_k_features` parameter
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before predict")
+
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
+
+        return self._bqml_model.explain_predict(X)
+
     def score(
         self,
         X: utils.ArrayType,
diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index 0832c559c1..cc1228dd6d 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -307,6 +307,83 @@ def test_logistic_model_predict(penguins_logistic_model, new_penguins_df):
     )
 
 
+def test_logistic_model_predict_explain(
+    penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df
+):
+    predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas()
+    assert predictions.shape == (3, 13)
+    result = predictions[["predicted_sex", "probability"]]
+    expected = pandas.DataFrame(
+        {
+            "predicted_sex": ["MALE", "MALE", "FEMALE"],
+            "probability": [
+                0.70692675811801065,
+                0.923363163640252,
+                0.99555295967825908,
+            ],
+        },
+        index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+    expected["predicted_sex"] = expected["predicted_sex"].astype(
+        pandas.StringDtype(storage="pyarrow")
+    )
+    expected["probability"] = expected["probability"].astype(pandas.Float64Dtype())
+    pandas.testing.assert_frame_equal(
+        result.sort_index(),
+        expected,
+        check_exact=False,
+        rtol=0.1,
+    )
+
+
+def test_logistic_model_predict_params(
+    penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df
+):
+    predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas()
+    assert predictions.shape[0] >= 1
+    prediction_columns = set(predictions.columns)
+    expected_columns = {
+        "predicted_sex",
+        "probability",
+        "top_feature_attributions",
+        "baseline_prediction_value",
+        "prediction_value",
+        "approximation_error",
+        "species",
+        "island",
+        "culmen_length_mm",
+        "culmen_depth_mm",
+        "flipper_length_mm",
+        "body_mass_g",
+        "sex",
+    }
+    assert expected_columns <= prediction_columns
+
+
+def test_logistic_model_predict_explain_params(
+    penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df
+):
+    predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas()
+    assert predictions.shape[0] >= 1
+    prediction_columns = set(predictions.columns)
+    expected_columns = {
+        "predicted_sex",
+        "probability",
+        "top_feature_attributions",
+        "baseline_prediction_value",
+        "prediction_value",
+        "approximation_error",
+        "species",
+        "island",
+        "culmen_length_mm",
+        "culmen_depth_mm",
+        "flipper_length_mm",
+        "body_mass_g",
+        "sex",
+    }
+    assert expected_columns <= prediction_columns
+
+
 def test_logistic_model_to_gbq_saved_score(
     penguins_logistic_model, table_id_unique, penguins_df_default_index
 ):

From 0f8800504549a0e342d916869270b61409861dab Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Tue, 17 Dec 2024 10:41:14 -0600
Subject: [PATCH 2/7] update tests

---
 tests/system/small/ml/test_linear_model.py | 37 ++--------------------
 1 file changed, 2 insertions(+), 35 deletions(-)

diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index cc1228dd6d..3be1147c1e 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -307,48 +307,15 @@ def test_logistic_model_predict(penguins_logistic_model, new_penguins_df):
     )
 
 
-def test_logistic_model_predict_explain(
-    penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df
-):
-    predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas()
-    assert predictions.shape == (3, 13)
-    result = predictions[["predicted_sex", "probability"]]
-    expected = pandas.DataFrame(
-        {
-            "predicted_sex": ["MALE", "MALE", "FEMALE"],
-            "probability": [
-                0.70692675811801065,
-                0.923363163640252,
-                0.99555295967825908,
-            ],
-        },
-        index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
-    )
-    expected["predicted_sex"] = expected["predicted_sex"].astype(
-        pandas.StringDtype(storage="pyarrow")
-    )
-    expected["probability"] = expected["probability"].astype(pandas.Float64Dtype())
-    pandas.testing.assert_frame_equal(
-        result.sort_index(),
-        expected,
-        check_exact=False,
-        rtol=0.1,
-    )
-
-
 def test_logistic_model_predict_params(
     penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df
 ):
-    predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas()
+    predictions = penguins_logistic_model.predict(new_penguins_df).to_pandas()
     assert predictions.shape[0] >= 1
     prediction_columns = set(predictions.columns)
     expected_columns = {
         "predicted_sex",
-        "probability",
-        "top_feature_attributions",
-        "baseline_prediction_value",
-        "prediction_value",
-        "approximation_error",
+        "predicted_sex_probs",
         "species",
         "island",
         "culmen_length_mm",

From af1f29bdae011535899223c1f8ef9e054da3a6d4 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Tue, 17 Dec 2024 14:23:25 -0600
Subject: [PATCH 3/7] chore: add support for predict_explain parameter,
 top_k_features

---
 bigframes/ml/core.py         |  9 +++++++--
 bigframes/ml/linear_model.py | 22 +++++++++++++++++-----
 bigframes/ml/sql.py          |  7 +++++--
 tests/unit/ml/test_sql.py    | 29 +++++++++++++++++------------
 4 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 2f3b532a74..9617b5d7a5 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -123,10 +123,15 @@ def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
             self._model_manipulation_sql_generator.ml_predict,
         )
 
-    def explain_predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
+    def explain_predict(
+        self, input_data: bpd.DataFrame, options: Mapping[str, int | float]
+    ) -> bpd.DataFrame:
         return self._apply_ml_tvf(
             input_data,
-            self._model_manipulation_sql_generator.ml_explain_predict,
+            lambda source_sql: self._model_manipulation_sql_generator.ml_explain_predict(
+                source_sql=source_sql,
+                struct_options=options,
+            ),
         )
 
     def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 1a1a5e0ca0..5ca4d2cf2e 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -155,14 +155,16 @@ def _fit(
     def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
-
-        (X,) = utils.batch_convert_to_dataframe(X)
+        # add session
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
         return self._bqml_model.predict(X)
 
     def predict_explain(
         self,
         X: utils.ArrayType,
+        *,
+        top_k_features: int = 5,
     ) -> bpd.DataFrame:
         """
         Explain predictions for a linear regression model.
@@ -175,18 +177,28 @@ def predict_explain(
             X (bigframes.dataframe.DataFrame or bigframes.series.Series or
             pandas.core.frame.DataFrame or pandas.core.series.Series):
                 Series or a DataFrame to explain its predictions.
+            top_k_features (Int, default 5):
+                an INT64 value that specifies how many top feature attribution
+                pairs are generated for each row of input data. The features are
+                ranked by the absolute values of their attributions.
+
+                By default, top_k_features is set to 5. If its value is greater
+                than the number of features in the training data, the
+                attributions of all features are returned.
 
         Returns:
             bigframes.pandas.DataFrame:
                 The predicted DataFrames with explanation columns.
         """
-        # TODO(b/377366612): Add support for `top_k_features` parameter
+        # TODO(b/377366612): Add validation for `top_k_features` raising ValueError
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
         (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
-        return self._bqml_model.explain_predict(X)
+        return self._bqml_model.explain_predict(
+            X, options={"top_k_features": top_k_features}
+        )
 
     def score(
         self,
@@ -349,7 +361,7 @@ def predict(
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
-        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
+        (X,) = utils.batch_convert_to_dataframe(X)
 
         return self._bqml_model.predict(X)
 
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 93b8a3a051..b662d4c22c 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -304,10 +304,13 @@ def ml_predict(self, source_sql: str) -> str:
         return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()},
   ({source_sql}))"""
 
-    def ml_explain_predict(self, source_sql: str) -> str:
+    def ml_explain_predict(
+        self, source_sql: str, struct_options: Mapping[str, Union[int, float]]
+    ) -> str:
         """Encode ML.EXPLAIN_PREDICT for BQML"""
+        struct_options_sql = self.struct_options(**struct_options)
         return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()},
-  ({source_sql}))"""
+  ({source_sql}), {struct_options_sql})"""
 
     def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str:
         """Encode ML.FORECAST for BQML"""
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 9d18649efe..5a7220fc38 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -342,18 +342,6 @@ def test_ml_predict_correct(
     )
 
 
-def test_ml_explain_predict_correct(
-    model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
-    mock_df: bpd.DataFrame,
-):
-    sql = model_manipulation_sql_generator.ml_explain_predict(source_sql=mock_df.sql)
-    assert (
-        sql
-        == """SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`,
-  (input_X_y_sql))"""
-    )
-
-
 def test_ml_llm_evaluate_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
@@ -462,6 +450,23 @@ def test_ml_generate_embedding_correct(
     )
 
 
+def test_ml_explain_predict_correct(
+    model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
+    mock_df: bpd.DataFrame,
+):
+    sql = model_manipulation_sql_generator.ml_explain_predict(
+        source_sql=mock_df.sql,
+        struct_options={"option_key1": 1, "option_key2": 2.25},
+    )
+    assert (
+        sql
+        == """SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`,
+  (input_X_y_sql), STRUCT(
+  1 AS `option_key1`,
+  2.25 AS `option_key2`))"""
+    )
+
+
 def test_ml_detect_anomalies_correct_sql(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,

From c722f10eeb4d4d3e14f32366921baa6c8b7ecc56 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Tue, 17 Dec 2024 14:52:17 -0600
Subject: [PATCH 4/7] update test

---
 tests/system/small/ml/test_core.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index b9748f24d3..2a2e68b230 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -263,8 +263,9 @@ def test_model_predict(penguins_bqml_linear_model: core.BqmlModel, new_penguins_
 def test_model_predict_explain(
     penguins_bqml_linear_model: core.BqmlModel, new_penguins_df
 ):
+    options = {"top_k_features": 3}
     predictions = penguins_bqml_linear_model.explain_predict(
-        new_penguins_df
+        new_penguins_df, options
     ).to_pandas()
     expected = pd.DataFrame(
         {
@@ -317,6 +318,7 @@ def test_model_predict_explain_with_unnamed_index(
     # need to persist through the call to ML.PREDICT
     new_penguins_df = new_penguins_df.reset_index()
 
+    options = {"top_k_features": 3}
     # remove the middle tag number to ensure we're really keeping the unnamed index
     new_penguins_df = typing.cast(
         bigframes.dataframe.DataFrame,
@@ -324,7 +326,7 @@ def test_model_predict_explain_with_unnamed_index(
     )
 
     predictions = penguins_bqml_linear_model.explain_predict(
-        new_penguins_df
+        new_penguins_df, options
     ).to_pandas()
 
     expected = pd.DataFrame(

From 70108f98b64dc93ab8333fc8230a09e0f65c27e8 Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Tue, 17 Dec 2024 16:38:06 -0600
Subject: [PATCH 5/7] update logistic reg method with the new param

---
 bigframes/ml/linear_model.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index d610c075a1..6041558357 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -361,13 +361,15 @@ def predict(
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
-        (X,) = utils.batch_convert_to_dataframe(X)
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
         return self._bqml_model.predict(X)
 
     def predict_explain(
         self,
         X: utils.ArrayType,
+        *,
+        top_k_features: int = 5,
     ) -> bpd.DataFrame:
         """
         Explain predictions for a logistic regression model.
@@ -380,18 +382,28 @@ def predict_explain(
             X (bigframes.dataframe.DataFrame or bigframes.series.Series or
             pandas.core.frame.DataFrame or pandas.core.series.Series):
                 Series or a DataFrame to explain its predictions.
+            top_k_features (Int, default 5):
+                an INT64 value that specifies how many top feature attribution
+                pairs are generated for each row of input data. The features are
+                ranked by the absolute values of their attributions.
+
+                By default, top_k_features is set to 5. If its value is greater
+                than the number of features in the training data, the
+                attributions of all features are returned.
 
         Returns:
             bigframes.pandas.DataFrame:
                 The predicted DataFrames with explanation columns.
         """
-        # TODO(b/377366612): Add support for `top_k_features` parameter
+        # TODO(b/377366612): Add validation for `top_k_features` raising ValueError
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
         (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
-        return self._bqml_model.explain_predict(X)
+        return self._bqml_model.explain_predict(
+            X, options={"top_k_features": top_k_features}
+        )
 
     def score(
         self,

From 26f16b3f8bb825e77ed3bba176ada9e86be62f8e Mon Sep 17 00:00:00 2001
From: Arwa <arwas@google.com>
Date: Wed, 18 Dec 2024 13:20:37 -0600
Subject: [PATCH 6/7] add and test new param's validation

---
 bigframes/ml/linear_model.py               | 16 +++++++++---
 tests/system/small/ml/test_linear_model.py | 30 ++++++++++++++++++++++
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 6041558357..8ee7269361 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -177,7 +177,7 @@ def predict_explain(
             X (bigframes.dataframe.DataFrame or bigframes.series.Series or
             pandas.core.frame.DataFrame or pandas.core.series.Series):
                 Series or a DataFrame to explain its predictions.
-            top_k_features (Int, default 5):
+            top_k_features (int, default 5):
                 an INT64 value that specifies how many top feature attribution
                 pairs are generated for each row of input data. The features are
                 ranked by the absolute values of their attributions.
@@ -190,7 +190,11 @@ def predict_explain(
             bigframes.pandas.DataFrame:
                 The predicted DataFrames with explanation columns.
         """
-        # TODO(b/377366612): Add validation for `top_k_features` raising ValueError
+        if top_k_features < 1:
+            raise ValueError(
+                f"top_k_features must be at least 1, but is {top_k_features}."
+            )
+
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
@@ -382,7 +386,7 @@ def predict_explain(
             X (bigframes.dataframe.DataFrame or bigframes.series.Series or
             pandas.core.frame.DataFrame or pandas.core.series.Series):
                 Series or a DataFrame to explain its predictions.
-            top_k_features (Int, default 5):
+            top_k_features (int, default 5):
                 an INT64 value that specifies how many top feature attribution
                 pairs are generated for each row of input data. The features are
                 ranked by the absolute values of their attributions.
@@ -395,7 +399,11 @@ def predict_explain(
             bigframes.pandas.DataFrame:
                 The predicted DataFrames with explanation columns.
         """
-        # TODO(b/377366612): Add validation for `top_k_features` raising ValueError
+        if top_k_features < 1:
+            raise ValueError(
+                f"top_k_features must be at least 1, but is {top_k_features}."
+            )
+
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
 
diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index 3be1147c1e..da9fc8e14f 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
+
 import google.api_core.exceptions
 import pandas
 import pytest
@@ -132,6 +134,20 @@ def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df
     )
 
 
+def test_linear_model_predict_explain_top_k_features(
+    penguins_linear_model: linear_model.LinearRegression, new_penguins_df
+):
+    top_k_features = 0  # below the minimum of 1 enforced by predict_explain
+
+    with pytest.raises(
+        ValueError,
+        match=re.escape(f"top_k_features must be at least 1, but is {top_k_features}."),
+    ):
+        penguins_linear_model.predict_explain(
+            new_penguins_df, top_k_features=top_k_features
+        ).to_pandas()
+
+
 def test_linear_reg_model_predict_params(
     penguins_linear_model: linear_model.LinearRegression, new_penguins_df
 ):
@@ -307,6 +323,20 @@ def test_logistic_model_predict(penguins_logistic_model, new_penguins_df):
     )
 
 
+def test_logistic_model_predict_explain_top_k_features(
+    penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df
+):
+    top_k_features = 0
+
+    with pytest.raises(
+        ValueError,
+        match=re.escape(f"top_k_features must be at least 1, but is {top_k_features}."),
+    ):
+        penguins_logistic_model.predict_explain(
+            new_penguins_df, top_k_features=top_k_features
+        ).to_pandas()
+
+
 def test_logistic_model_predict_params(
     penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df
 ):

From 1d8a1fd3ff9d828c9d981ec547c1746314a1dfa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= <swast@google.com>
Date: Fri, 20 Dec 2024 11:41:06 -0600
Subject: [PATCH 7/7] Update bigframes/ml/linear_model.py

---
 bigframes/ml/linear_model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
index 8ee7269361..722b72f806 100644
--- a/bigframes/ml/linear_model.py
+++ b/bigframes/ml/linear_model.py
@@ -155,7 +155,6 @@ def _fit(
     def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before predict")
-        # add session
         (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
 
         return self._bqml_model.predict(X)