From 3baa0c5594efae066e0af331928b78d051419162 Mon Sep 17 00:00:00 2001 From: Arwa Date: Wed, 27 Nov 2024 15:51:55 -0600 Subject: [PATCH 01/10] feat: add LinearRegression.predict_explain to generate predict explain columns --- bigframes/ml/core.py | 6 ++++++ bigframes/ml/linear_model.py | 8 ++++++++ bigframes/ml/sql.py | 5 +++++ 3 files changed, 19 insertions(+) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index be67396fba..2d3f567816 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -123,6 +123,12 @@ def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: self._model_manipulation_sql_generator.ml_predict, ) + def explain_predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: + return self._apply_ml_tvf( + input_data, + self._model_manipulation_sql_generator.ml_explain_predict, + ) + def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index ae4e1944cc..59908010ca 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -160,6 +160,14 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: return self._bqml_model.predict(X) + def predict_explain(self, X: utils.ArrayType) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("A model must be fitted before predict") + + (X,) = utils.convert_to_dataframe(X) + + return self._bqml_model.explain_predict(X) + def score( self, X: utils.ArrayType, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index b7d550ac63..0a0e79e7ce 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -304,6 +304,11 @@ def ml_predict(self, source_sql: str) -> str: return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()}, ({source_sql}))""" + def ml_explain_predict(self, source_sql: str) -> str: + """Encode ML.EXPLAIN_PREDICT for BQML""" + return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()}, + ({source_sql}))""" + def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" struct_options_sql = self.struct_options(**struct_options) From 00e89ac1c218d02575198b42452c5ef6b4ad0d93 Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 2 Dec 2024 17:32:01 -0600 Subject: [PATCH 02/10] add test cases --- tests/system/small/ml/test_linear_model.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 218c1074ab..10eb09de59 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -106,6 +106,23 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): ) +def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df): + predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas() + assert predictions.shape == (3, 12) + result = predictions[["predicted_body_mass_g"]] + expected = pandas.DataFrame( + {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, + dtype="Float64", + index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + pandas.testing.assert_frame_equal( + result.sort_index(), + expected, + check_exact=False, + rtol=0.1, + ) + + def test_to_gbq_saved_linear_reg_model_scores( penguins_linear_model, dataset_id, penguins_df_default_index ): From e7372217ae6a4396a9a7cb98c513fc559475204f Mon Sep 17 00:00:00 2001 From: Arwa Date: Thu, 5 Dec 2024 11:40:33 -0600 Subject: [PATCH 03/10] add test case --- tests/system/small/ml/test_linear_model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 10eb09de59..78c515c6aa 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -109,9 +109,12 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df): predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas() assert predictions.shape == (3, 12) - result = predictions[["predicted_body_mass_g"]] + result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]] expected = pandas.DataFrame( - {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, + { + "predicted_body_mass_g": [4030.1, 3280.8, 3177.9], + "baseline_prediction_value": [9362.7, 9362.7, 9362.7], + }, dtype="Float64", index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) From e58a0e724179acec7c257f993fade03b0b05b4b4 Mon Sep 17 00:00:00 2001 From: Arwa Date: Thu, 5 Dec 2024 14:02:51 -0600 Subject: [PATCH 04/10] update predict_explain --- bigframes/ml/linear_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 59908010ca..513abd60c5 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -164,7 +164,7 @@ def predict_explain(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.batch_convert_to_dataframe(X) return self._bqml_model.explain_predict(X) From cea6526c4ac4ab442af1ec62582471e43627e40b Mon Sep 17 00:00:00 2001 From: Arwa Date: Tue, 10 Dec 2024 13:39:07 -0600 Subject: [PATCH 05/10] update the test --- tests/system/small/ml/test_linear_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 78c515c6aa..ed0a79ac59 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -109,11 +109,11 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df): predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas() assert predictions.shape == (3, 12) - result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]] + result = predictions[["predicted_body_mass_g", "approximation_error"]] expected = pandas.DataFrame( { "predicted_body_mass_g": [4030.1, 3280.8, 3177.9], - "baseline_prediction_value": [9362.7, 9362.7, 9362.7], + "approximation_error": [0.0, 0.0, 0.0], }, dtype="Float64", index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), From 362dd7b2b901002ffe75590026b121d741351c65 Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 16 Dec 2024 10:16:09 -0600 Subject: [PATCH 06/10] Add sql and core tests --- bigframes/ml/linear_model.py | 23 ++++++++- tests/system/small/ml/test_core.py | 59 ++++++++++++++++++++++ tests/system/small/ml/test_linear_model.py | 48 +++++++++++++++++- tests/unit/ml/test_sql.py | 12 +++++ 4 files changed, 138 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 513abd60c5..230b0ef61e 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -160,11 +160,30 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: return self._bqml_model.predict(X) - def predict_explain(self, X: utils.ArrayType) -> bpd.DataFrame: + def predict_explain( + self, + X: utils.ArrayType, + ) -> bpd.DataFrame: + """ + Explain predictions for a linear regression model. + + .. note:: + + Output matches that of the BigQuery ML.EXPLAIN_PREDICT function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or a DataFrame to explain its predictions. + + Returns: + bigframes.pandas.DataFrame: + The predicted DataFrames with explanation columns. + """ if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.batch_convert_to_dataframe(X) + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.explain_predict(X) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 30b75f502d..62ab4403df 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -261,6 +261,30 @@ def test_model_predict(penguins_bqml_linear_model: core.BqmlModel, new_penguins_ ) +def test_model_predict_explain( + penguins_bqml_linear_model: core.BqmlModel, new_penguins_df +): + predictions = penguins_bqml_linear_model.explain_predict( + new_penguins_df + ).to_pandas() + expected = pd.DataFrame( + { + "predicted_body_mass_g": [4030.1, 3280.8, 3177.9], + "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + pd.testing.assert_frame_equal( + predictions[ + ["predicted_body_mass_g", "baseline_prediction_value"] + ].sort_index(), + expected, + check_exact=False, + rtol=0.1, + ) + + def test_model_predict_with_unnamed_index( penguins_bqml_linear_model: core.BqmlModel, new_penguins_df ): @@ -289,6 +313,41 @@ def test_model_predict_with_unnamed_index( ) +def test_model_predict_explain_with_unnamed_index( + penguins_bqml_linear_model: core.BqmlModel, new_penguins_df +): + # This will result in an index that lacks a name, which the ML library will + # need to persist through the call to ML.PREDICT + new_penguins_df = new_penguins_df.reset_index() + + # remove the middle tag number to ensure we're really keeping the unnamed index + new_penguins_df = typing.cast( + bigframes.dataframe.DataFrame, + new_penguins_df[new_penguins_df.tag_number != 1672], + ) + + predictions = penguins_bqml_linear_model.explain_predict( + new_penguins_df + ).to_pandas() + + expected = pd.DataFrame( + { + "predicted_body_mass_g": [4030.1, 3177.9], + "baseline_prediction_value": [9362.692906, 9362.692906], + }, + dtype="Float64", + index=pd.Index([0, 2], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + predictions[ + ["predicted_body_mass_g", "baseline_prediction_value"] + ].sort_index(), + expected, + check_exact=False, + rtol=0.1, + ) + + def test_model_detect_anomalies( penguins_bqml_pca_model: core.BqmlModel, new_penguins_df ): diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index d7e5c5ac86..84e4a7682d 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -16,6 +16,8 @@ import pandas import pytest +from bigframes.ml import linear_model + def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index): df = penguins_df_default_index.dropna() @@ -109,11 +111,11 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df): predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas() assert predictions.shape == (3, 12) - result = predictions[["predicted_body_mass_g", "approximation_error"]] + result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]] expected = pandas.DataFrame( { "predicted_body_mass_g": [4030.1, 3280.8, 3177.9], - "approximation_error": [0.0, 0.0, 0.0], + "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906], }, dtype="Float64", index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), @@ -126,6 +128,48 @@ def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df ) +def test_linear_reg_model_predict_params( + penguins_linear_model: linear_model.LinearRegression, new_penguins_df +): + predictions = penguins_linear_model.predict(new_penguins_df).to_pandas() + assert predictions.shape[0] >= 1 + prediction_columns = set(predictions.columns) + expected_columns = { + "predicted_body_mass_g", + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + } + assert expected_columns <= prediction_columns + + +def test_linear_reg_model_predict_explain_params( + penguins_linear_model: linear_model.LinearRegression, new_penguins_df +): + predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas() + assert predictions.shape[0] >= 1 + prediction_columns = set(predictions.columns) + expected_columns = { + "predicted_body_mass_g", + "top_feature_attributions", + "baseline_prediction_value", + "prediction_value", + "approximation_error", + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + } + assert expected_columns <= prediction_columns + + def test_to_gbq_saved_linear_reg_model_scores( penguins_linear_model, table_id_unique, penguins_df_default_index ): diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index ee0821dfe9..9d18649efe 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -342,6 +342,18 @@ def test_ml_predict_correct( ) +def test_ml_explain_predict_correct( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_manipulation_sql_generator.ml_explain_predict(source_sql=mock_df.sql) + assert ( + sql + == """SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, + (input_X_y_sql))""" + ) + + def test_ml_llm_evaluate_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, From fb2d02f7b6e0f925f2bb069753d763ba622c3f1e Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 16 Dec 2024 10:33:38 -0600 Subject: [PATCH 07/10] fix docs error --- bigframes/ml/linear_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 230b0ef61e..5b2e745af1 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -168,12 +168,12 @@ def predict_explain( Explain predictions for a linear regression model. .. note:: - - Output matches that of the BigQuery ML.EXPLAIN_PREDICT function. - See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict + Output matches that of the BigQuery ML.EXPLAIN_PREDICT function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or + pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to explain its predictions. Returns: From 3a784bd8b101441ce2ea68b1eafa03896e64782c Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 16 Dec 2024 11:46:54 -0600 Subject: [PATCH 08/10] add TODO comment to support method paramaters --- bigframes/ml/linear_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 5b2e745af1..23feca706c 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -180,6 +180,7 @@ def predict_explain( bigframes.pandas.DataFrame: The predicted DataFrames with explanation columns. """ + # TODO(b/377366612): Add support for `to_k_features` parameter if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") From 81c3cdbf1be098c630abf39eb3dc854b22d9af8c Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 16 Dec 2024 12:21:10 -0600 Subject: [PATCH 09/10] update the test parmametr of linear model --- tests/system/small/ml/test_linear_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 84e4a7682d..0832c559c1 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -111,11 +111,15 @@ def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): def test_linear_reg_model_predict_explain(penguins_linear_model, new_penguins_df): predictions = penguins_linear_model.predict_explain(new_penguins_df).to_pandas() assert predictions.shape == (3, 12) - result = predictions[["predicted_body_mass_g", "baseline_prediction_value"]] + result = predictions[["predicted_body_mass_g", "approximation_error"]] expected = pandas.DataFrame( { "predicted_body_mass_g": [4030.1, 3280.8, 3177.9], - "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906], + "approximation_error": [ + 0.0, + 0.0, + 0.0, + ], }, dtype="Float64", index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), From c4a1beb3d5a9dc5b8bb85398f0da156fa8ab8c2d Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 16 Dec 2024 15:04:06 -0600 Subject: [PATCH 10/10] update test to fix failing checks --- bigframes/ml/linear_model.py | 2 +- tests/system/small/ml/test_core.py | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 23feca706c..1a1a5e0ca0 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -180,7 +180,7 @@ def predict_explain( bigframes.pandas.DataFrame: The predicted DataFrames with explanation columns. """ - # TODO(b/377366612): Add support for `to_k_features` parameter + # TODO(b/377366612): Add support for `top_k_features` parameter if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 62ab4403df..3ea31353b1 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -270,15 +270,13 @@ def test_model_predict_explain( expected = pd.DataFrame( { "predicted_body_mass_g": [4030.1, 3280.8, 3177.9], - "baseline_prediction_value": [9362.692906, 9362.692906, 9362.692906], + "approximation_error": [0.0, 0.0, 0.0], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pd.testing.assert_frame_equal( - predictions[ - ["predicted_body_mass_g", "baseline_prediction_value"] - ].sort_index(), + predictions[["predicted_body_mass_g", "approximation_error"]].sort_index(), expected, check_exact=False, rtol=0.1, @@ -333,15 +331,13 @@ def test_model_predict_explain_with_unnamed_index( expected = pd.DataFrame( { "predicted_body_mass_g": [4030.1, 3177.9], - "baseline_prediction_value": [9362.692906, 9362.692906], + "approximation_error": [0.0, 0.0], }, dtype="Float64", index=pd.Index([0, 2], dtype="Int64"), ) pd.testing.assert_frame_equal( - predictions[ - ["predicted_body_mass_g", "baseline_prediction_value"] - ].sort_index(), + predictions[["predicted_body_mass_g", "approximation_error"]].sort_index(), expected, check_exact=False, rtol=0.1,