From d333fdd36cd3e836e693191a97ad9817d7c552b6 Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 16 Dec 2024 14:55:21 -0600 Subject: [PATCH 1/7] feat: add LogisticRegression.predict_explain() to generate ML.EXPLAIN_PREDICT columns --- bigframes/ml/linear_model.py | 28 ++++++++ tests/system/small/ml/test_linear_model.py | 77 ++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 1a1a5e0ca0..eac0fd1fca 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -353,6 +353,34 @@ def predict( return self._bqml_model.predict(X) + def predict_explain( + self, + X: utils.ArrayType, + ) -> bpd.DataFrame: + """ + Explain predictions for a logistic regression model. + + .. note:: + Output matches that of the BigQuery ML.EXPLAIN_PREDICT function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or + pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or a DataFrame to explain its predictions. + + Returns: + bigframes.pandas.DataFrame: + The predicted DataFrames with explanation columns. 
+ """ + # TODO(b/377366612): Add support for `top_k_features` parameter + if not self._bqml_model: + raise RuntimeError("A model must be fitted before predict") + + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + + return self._bqml_model.explain_predict(X) + def score( self, X: utils.ArrayType, diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 0832c559c1..cc1228dd6d 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -307,6 +307,83 @@ def test_logistic_model_predict(penguins_logistic_model, new_penguins_df): ) +def test_logistic_model_predict_explain( + penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df +): + predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas() + assert predictions.shape == (3, 13) + result = predictions[["predicted_sex", "probability"]] + expected = pandas.DataFrame( + { + "predicted_sex": ["MALE", "MALE", "FEMALE"], + "probability": [ + 0.70692675811801065, + 0.923363163640252, + 0.99555295967825908, + ], + }, + index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + expected["predicted_sex"] = expected["predicted_sex"].astype( + pandas.StringDtype(storage="pyarrow") + ) + expected["probability"] = expected["probability"].astype(pandas.Float64Dtype()) + pandas.testing.assert_frame_equal( + result.sort_index(), + expected, + check_exact=False, + rtol=0.1, + ) + + +def test_logistic_model_predict_params( + penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df +): + predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas() + assert predictions.shape[0] >= 1 + prediction_columns = set(predictions.columns) + expected_columns = { + "predicted_sex", + "probability", + "top_feature_attributions", + "baseline_prediction_value", + "prediction_value", + "approximation_error", + "species", + 
"island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + } + assert expected_columns <= prediction_columns + + +def test_logistic_model_predict_explain_params( + penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df +): + predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas() + assert predictions.shape[0] >= 1 + prediction_columns = set(predictions.columns) + expected_columns = { + "predicted_sex", + "probability", + "top_feature_attributions", + "baseline_prediction_value", + "prediction_value", + "approximation_error", + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + } + assert expected_columns <= prediction_columns + + def test_logistic_model_to_gbq_saved_score( penguins_logistic_model, table_id_unique, penguins_df_default_index ): From 0f8800504549a0e342d916869270b61409861dab Mon Sep 17 00:00:00 2001 From: Arwa Date: Tue, 17 Dec 2024 10:41:14 -0600 Subject: [PATCH 2/7] update tests --- tests/system/small/ml/test_linear_model.py | 37 ++-------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index cc1228dd6d..3be1147c1e 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -307,48 +307,15 @@ def test_logistic_model_predict(penguins_logistic_model, new_penguins_df): ) -def test_logistic_model_predict_explain( - penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df -): - predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas() - assert predictions.shape == (3, 13) - result = predictions[["predicted_sex", "probability"]] - expected = pandas.DataFrame( - { - "predicted_sex": ["MALE", "MALE", "FEMALE"], - "probability": [ - 0.70692675811801065, - 0.923363163640252, - 0.99555295967825908, - ], - 
}, - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - expected["predicted_sex"] = expected["predicted_sex"].astype( - pandas.StringDtype(storage="pyarrow") - ) - expected["probability"] = expected["probability"].astype(pandas.Float64Dtype()) - pandas.testing.assert_frame_equal( - result.sort_index(), - expected, - check_exact=False, - rtol=0.1, - ) - - def test_logistic_model_predict_params( penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df ): - predictions = penguins_logistic_model.predict_explain(new_penguins_df).to_pandas() + predictions = penguins_logistic_model.predict(new_penguins_df).to_pandas() assert predictions.shape[0] >= 1 prediction_columns = set(predictions.columns) expected_columns = { "predicted_sex", - "probability", - "top_feature_attributions", - "baseline_prediction_value", - "prediction_value", - "approximation_error", + "predicted_sex_probs", "species", "island", "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g", "sex", From af1f29bdae011535899223c1f8ef9e054da3a6d4 Mon Sep 17 00:00:00 2001 From: Arwa Date: Tue, 17 Dec 2024 14:23:25 -0600 Subject: [PATCH 3/7] chore: add support for predict_explain parameter, top_k_features --- bigframes/ml/core.py | 9 +++++++-- bigframes/ml/linear_model.py | 22 +++++++++++++++++----- bigframes/ml/sql.py | 7 +++++-- tests/unit/ml/test_sql.py | 29 +++++++++++++++++------------ 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 2f3b532a74..9617b5d7a5 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -123,10 +123,15 @@ def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: self._model_manipulation_sql_generator.ml_predict, ) - def explain_predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: + def explain_predict( + self, input_data: bpd.DataFrame, options: Mapping[str, int | float] + ) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, - self._model_manipulation_sql_generator.ml_explain_predict, + 
lambda source_sql: self._model_manipulation_sql_generator.ml_explain_predict( + source_sql=source_sql, + struct_options=options, + ), ) def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 1a1a5e0ca0..5ca4d2cf2e 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -155,14 +155,16 @@ def _fit( def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - - (X,) = utils.batch_convert_to_dataframe(X) + # add session + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def predict_explain( self, X: utils.ArrayType, + *, + top_k_features: int = 5, ) -> bpd.DataFrame: """ Explain predictions for a linear regression model. @@ -175,18 +177,28 @@ def predict_explain( X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to explain its predictions. + top_k_features (Int, default 5): + an INT64 value that specifies how many top feature attribution + pairs are generated for each row of input data. The features are + ranked by the absolute values of their attributions. + + By default, top_k_features is set to 5. If its value is greater + than the number of features in the training data, the + attributions of all features are returned. Returns: bigframes.pandas.DataFrame: The predicted DataFrames with explanation columns. 
""" - # TODO(b/377366612): Add support for `top_k_features` parameter + # TODO(b/377366612): Add validation for `top_k_features` raising ValueError if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - return self._bqml_model.explain_predict(X) + return self._bqml_model.explain_predict( + X, options={"top_k_features": top_k_features} + ) def score( self, @@ -349,7 +361,7 @@ def predict( if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + (X,) = utils.batch_convert_to_dataframe(X) return self._bqml_model.predict(X) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 93b8a3a051..b662d4c22c 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -304,10 +304,13 @@ def ml_predict(self, source_sql: str) -> str: return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()}, ({source_sql}))""" - def ml_explain_predict(self, source_sql: str) -> str: + def ml_explain_predict( + self, source_sql: str, struct_options: Mapping[str, Union[int, float]] + ) -> str: """Encode ML.EXPLAIN_PREDICT for BQML""" + struct_options_sql = self.struct_options(**struct_options) return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()}, - ({source_sql}))""" + ({source_sql}), {struct_options_sql})""" def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 9d18649efe..5a7220fc38 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -342,18 +342,6 @@ def test_ml_predict_correct( ) -def test_ml_explain_predict_correct( - model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, - mock_df: bpd.DataFrame, -): - sql = 
model_manipulation_sql_generator.ml_explain_predict(source_sql=mock_df.sql) - assert ( - sql - == """SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, - (input_X_y_sql))""" - ) - - def test_ml_llm_evaluate_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, @@ -462,6 +450,23 @@ def test_ml_generate_embedding_correct( ) +def test_ml_explain_predict_correct( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_manipulation_sql_generator.ml_explain_predict( + source_sql=mock_df.sql, + struct_options={"option_key1": 1, "option_key2": 2.25}, + ) + assert ( + sql + == """SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, + (input_X_y_sql), STRUCT( + 1 AS `option_key1`, + 2.25 AS `option_key2`))""" + ) + + def test_ml_detect_anomalies_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, From c722f10eeb4d4d3e14f32366921baa6c8b7ecc56 Mon Sep 17 00:00:00 2001 From: Arwa Date: Tue, 17 Dec 2024 14:52:17 -0600 Subject: [PATCH 4/7] update test --- tests/system/small/ml/test_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index b9748f24d3..2a2e68b230 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -263,8 +263,9 @@ def test_model_predict(penguins_bqml_linear_model: core.BqmlModel, new_penguins_ def test_model_predict_explain( penguins_bqml_linear_model: core.BqmlModel, new_penguins_df ): + options = {"top_k_features": 3} predictions = penguins_bqml_linear_model.explain_predict( - new_penguins_df + new_penguins_df, options ).to_pandas() expected = pd.DataFrame( { @@ -317,6 +318,7 @@ def test_model_predict_explain_with_unnamed_index( # need to persist through the call to ML.PREDICT new_penguins_df 
= new_penguins_df.reset_index() + options = {"top_k_features": 3} # remove the middle tag number to ensure we're really keeping the unnamed index new_penguins_df = typing.cast( bigframes.dataframe.DataFrame, @@ -324,7 +326,7 @@ def test_model_predict_explain_with_unnamed_index( ) predictions = penguins_bqml_linear_model.explain_predict( - new_penguins_df + new_penguins_df, options ).to_pandas() expected = pd.DataFrame( From 70108f98b64dc93ab8333fc8230a09e0f65c27e8 Mon Sep 17 00:00:00 2001 From: Arwa Date: Tue, 17 Dec 2024 16:38:06 -0600 Subject: [PATCH 5/7] update logistic reg method with the new param --- bigframes/ml/linear_model.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index d610c075a1..6041558357 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -361,13 +361,15 @@ def predict( if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.batch_convert_to_dataframe(X) + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def predict_explain( self, X: utils.ArrayType, + *, + top_k_features: int = 5, ) -> bpd.DataFrame: """ Explain predictions for a logistic regression model. @@ -380,18 +382,28 @@ def predict_explain( X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to explain its predictions. + top_k_features (Int, default 5): + an INT64 value that specifies how many top feature attribution + pairs are generated for each row of input data. The features are + ranked by the absolute values of their attributions. + + By default, top_k_features is set to 5. If its value is greater + than the number of features in the training data, the + attributions of all features are returned. 
Returns: bigframes.pandas.DataFrame: The predicted DataFrames with explanation columns. """ - # TODO(b/377366612): Add support for `top_k_features` parameter + # TODO(b/377366612): Add validation for `top_k_features` raising ValueError if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - return self._bqml_model.explain_predict(X) + return self._bqml_model.explain_predict( + X, options={"top_k_features": top_k_features} + ) def score( self, From 26f16b3f8bb825e77ed3bba176ada9e86be62f8e Mon Sep 17 00:00:00 2001 From: Arwa Date: Wed, 18 Dec 2024 13:20:37 -0600 Subject: [PATCH 6/7] add and test new param's validation --- bigframes/ml/linear_model.py | 16 +++++++++--- tests/system/small/ml/test_linear_model.py | 30 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 6041558357..8ee7269361 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -177,7 +177,7 @@ def predict_explain( X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to explain its predictions. - top_k_features (Int, default 5): + top_k_features (int, default 5): an INT64 value that specifies how many top feature attribution pairs are generated for each row of input data. The features are ranked by the absolute values of their attributions. @@ -190,7 +190,11 @@ def predict_explain( bigframes.pandas.DataFrame: The predicted DataFrames with explanation columns. """ - # TODO(b/377366612): Add validation for `top_k_features` raising ValueError + if top_k_features < 1: + raise ValueError( + f"top_k_features must be at least 1, but is {top_k_features}." 
+ ) + if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") @@ -382,7 +386,7 @@ def predict_explain( X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to explain its predictions. - top_k_features (Int, default 5): + top_k_features (int, default 5): an INT64 value that specifies how many top feature attribution pairs are generated for each row of input data. The features are ranked by the absolute values of their attributions. @@ -395,7 +399,11 @@ def predict_explain( bigframes.pandas.DataFrame: The predicted DataFrames with explanation columns. """ - # TODO(b/377366612): Add validation for `top_k_features` raising ValueError + if top_k_features < 1: + raise ValueError( + f"top_k_features must be at least 1, but is {top_k_features}." + ) + if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 3be1147c1e..da9fc8e14f 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import re + import google.api_core.exceptions import pandas import pytest @@ -132,6 +134,20 @@ def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df ) +def test_linear_model_predict_explain_top_k_features( + penguins_logistic_model: linear_model.LinearRegression, new_penguins_df +): + top_k_features = 0 + + with pytest.raises( + ValueError, + match=re.escape(f"top_k_features must be at least 1, but is {top_k_features}."), + ): + penguins_logistic_model.predict_explain( + new_penguins_df, top_k_features=top_k_features + ).to_pandas() + + def test_linear_reg_model_predict_params( penguins_linear_model: linear_model.LinearRegression, new_penguins_df ): @@ -307,6 +323,20 @@ def test_logistic_model_predict(penguins_logistic_model, new_penguins_df): ) +def test_logistic_model_predict_explain_top_k_features( + penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df +): + top_k_features = 0 + + with pytest.raises( + ValueError, + match=re.escape(f"top_k_features must be at least 1, but is {top_k_features}."), + ): + penguins_logistic_model.predict_explain( + new_penguins_df, top_k_features=top_k_features + ).to_pandas() + + def test_logistic_model_predict_params( penguins_logistic_model: linear_model.LogisticRegression, new_penguins_df ): From 1d8a1fd3ff9d828c9d981ec547c1746314a1dfa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 20 Dec 2024 11:41:06 -0600 Subject: [PATCH 7/7] Update bigframes/ml/linear_model.py --- bigframes/ml/linear_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 8ee7269361..722b72f806 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -155,7 +155,6 @@ def _fit( def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - # add session (X,) = utils.batch_convert_to_dataframe(X, 
session=self._bqml_model.session) return self._bqml_model.predict(X)