From f593716a55214abb8f452d2ef11987b7c86e1b82 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 21 Oct 2024 08:14:58 +0000 Subject: [PATCH 1/8] feat: allow `fit` to take additional eval data The additional eval data would be used to measure the fitted model and attach the measurement to the underlying BQML model, which can be used as benchmark for the model consumers in BigQuery Studio and otherwise. --- bigframes/ml/base.py | 35 +++++ bigframes/ml/linear_model.py | 29 +++- bigframes/ml/utils.py | 38 ++++- tests/system/large/ml/test_linear_model.py | 154 +++++++++++++++++++++ 4 files changed, 249 insertions(+), 7 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 81181b58cf..e68aa73c66 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -132,6 +132,7 @@ def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: @abc.abstractmethod def to_gbq(self, model_name, replace): pass + pass class TrainablePredictor(Predictor): @@ -163,6 +164,40 @@ def fit( return self._fit(X, y) +class TrainableWithEvaluationPredictor(Predictor): + """A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs. + + Additional evaluation data can be provided to measure the model in the fit phase.""" + + @abc.abstractmethod + def _fit(self, X, y, transforms=None, X_eval=None, y_eval=None): + pass + + @abc.abstractmethod + def score(self, X, y): + pass + + +class SupervisedTrainableWithEvaluationPredictor(TrainableWithEvaluationPredictor): + """A BigQuery DataFrames ML Supervised Model base class that can be used to fit and predict outputs. + + Need to provide both X and y in supervised tasks. + + Additional X_eval and y_eval can be provided to measure the model in the fit phase. 
+ """ + + _T = TypeVar("_T", bound="SupervisedTrainableWithEvaluationPredictor") + + def fit( + self: _T, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + ) -> _T: + return self._fit(X, y, X_eval=X_eval, y_eval=y_eval) + + class UnsupervisedTrainablePredictor(TrainablePredictor): """A BigQuery DataFrames ML Unsupervised Model base class that can be used to fit and predict outputs. diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 8fe1d6ec27..e7692379c7 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -47,7 +47,7 @@ @log_adapter.class_logger class LinearRegression( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.linear_model._base.LinearRegression, ): __doc__ = bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__ @@ -131,14 +131,24 @@ def _fit( X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, + X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> LinearRegression: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self @@ -183,7 +193,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: @log_adapter.class_logger class LogisticRegression( - base.SupervisedTrainablePredictor, + 
base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, ): __doc__ = ( @@ -283,15 +293,24 @@ def _fit( X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, + X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> LogisticRegression: - """Fit model with transforms.""" X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 96f0bc31e9..8d6ea2a1b3 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,12 +13,12 @@ # limitations under the License. 
import typing -from typing import Any, Generator, Iterable, Literal, Mapping, Optional, Union +from typing import Any, Generator, Iterable, Literal, Mapping, Optional, Tuple, Union import bigframes_vendored.constants as constants from google.cloud import bigquery -from bigframes.core import blocks +from bigframes.core import blocks, guid import bigframes.pandas as bpd # Internal type alias @@ -139,3 +139,37 @@ def retrieve_params_from_bq_model( kwargs[bf_param] = bf_param_type(last_fitting[bqml_param]) return kwargs + + +def combine_training_and_evaluation_data( + X_train: bpd.DataFrame, + y_train: bpd.DataFrame, + X_eval: bpd.DataFrame, + y_eval: bpd.DataFrame, + bqml_options: dict, +) -> Tuple[bpd.DataFrame, bpd.DataFrame, dict]: + """ + Combine training data and labels with evaluation data and labels, and keep + them differentiated through a split column in the combined data and labels. + """ + + assert X_train.columns.equals(X_eval.columns) + assert y_train.columns.equals(y_eval.columns) + + # create a custom split column for BQML and supply the evaluation + # data along with the training data in a combined single table + # https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-dnn-models#data_split_col. 
+ split_col = guid.generate_guid() + assert split_col not in X_train.columns + + X_train[split_col] = False + X_eval[split_col] = True + X = bpd.concat([X_train, X_eval]) + y = bpd.concat([y_train, y_eval]) + + # create options copy to not mutate the incoming one + bqml_options = bqml_options.copy() + bqml_options["data_split_method"] = "CUSTOM" + bqml_options["data_split_col"] = split_col + + return X, y, bqml_options diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 273da97bc5..aca2210b24 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd + +from bigframes.ml import model_selection import bigframes.ml.linear_model from tests.system import utils @@ -58,6 +61,85 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase assert reloaded_model.tol == 0.01 +def test_linear_regression_configure_fit_with_eval_score( + penguins_df_default_index, dataset_id +): + model = bigframes.ml.linear_model.LinearRegression() + + df = penguins_df_default_index.dropna() + X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y = df[["body_mass_g"]] + + X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y) + + model.fit(X_train, y_train, X_eval=X_eval, y_eval=y_eval) + + # Check score to ensure the model was fitted + result = model.score(X_eval, y_eval).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + bq_model_name = f"{dataset_id}.temp_configured_model" + reloaded_model = model.to_gbq(bq_model_name, replace=True) + assert reloaded_model._bqml_model is not None + assert ( + 
f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.fit_intercept is True + assert reloaded_model.calculate_p_values is False + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None + assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learning_rate is None + assert reloaded_model.learning_rate_strategy == "line_search" + assert reloaded_model.ls_init_learning_rate is None + assert reloaded_model.max_iterations == 20 + assert reloaded_model.tol == 0.01 + + # make sure the bqml model was internally created with custom split + bq_model = penguins_df_default_index._session.bqclient.get_model(bq_model_name) + last_fitting = bq_model.training_runs[-1]["trainingOptions"] + assert last_fitting["dataSplitMethod"] == "CUSTOM" + assert "dataSplitColumn" in last_fitting + + # make sure the bqml model has the same evaluation metrics attached as + # returned by model.score() + bq_model_expected_eval_metrics = result[utils.ML_REGRESSION_METRICS[:5]] + bq_model_eval_metrics = bq_model.training_runs[-1]["evaluationMetrics"][ + "regressionMetrics" + ] + bq_model_eval_metrics = pd.DataFrame( + [ + [ + bq_model_eval_metrics["meanAbsoluteError"], + bq_model_eval_metrics["meanSquaredError"], + bq_model_eval_metrics["meanSquaredLogError"], + bq_model_eval_metrics["medianAbsoluteError"], + bq_model_eval_metrics["rSquared"], + ] + ], + columns=utils.ML_REGRESSION_METRICS[:5], + ) + pd.testing.assert_frame_equal( + bq_model_expected_eval_metrics, + bq_model_eval_metrics, + check_dtype=False, + check_index_type=False, + ) + + def test_linear_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): @@ -216,6 +298,78 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data assert reloaded_model.class_weight is None +def test_logistic_regression_configure_fit_with_eval_score( + 
penguins_df_default_index, dataset_id +): + model = bigframes.ml.linear_model.LogisticRegression() + + df = penguins_df_default_index.dropna() + X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + ] + ] + y = df[["sex"]] + + X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y) + + model.fit(X_train, y_train, X_eval=X_eval, y_eval=y_eval) + + # Check score to ensure the model was fitted + result = model.score(X_eval, y_eval).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + bq_model_name = f"{dataset_id}.temp_configured_logistic_reg_model" + reloaded_model = model.to_gbq(bq_model_name, replace=True) + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_logistic_reg_model" + in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.fit_intercept is True + assert reloaded_model.class_weight is None + + # make sure the bqml model was internally created with custom split + bq_model = penguins_df_default_index._session.bqclient.get_model(bq_model_name) + last_fitting = bq_model.training_runs[-1]["trainingOptions"] + assert last_fitting["dataSplitMethod"] == "CUSTOM" + assert "dataSplitColumn" in last_fitting + + # make sure the bqml model has the same evaluation metrics attached as + # returned by model.score() + bq_model_expected_eval_metrics = result + bq_model_eval_metrics = bq_model.training_runs[-1]["evaluationMetrics"][ + "binaryClassificationMetrics" + ]["aggregateClassificationMetrics"] + bq_model_eval_metrics = pd.DataFrame( + [ + [ + bq_model_eval_metrics["precision"], + bq_model_eval_metrics["recall"], + bq_model_eval_metrics["accuracy"], + bq_model_eval_metrics["f1Score"], + bq_model_eval_metrics["logLoss"], + bq_model_eval_metrics["rocAuc"], + ] + ], + 
columns=utils.ML_CLASSFICATION_METRICS, + ) + pd.testing.assert_frame_equal( + bq_model_expected_eval_metrics, + bq_model_eval_metrics, + check_dtype=False, + check_index_type=False, + ) + + def test_logistic_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): From 096233d8db40b1ec67038368e9cfa3e09f311ec4 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 22 Oct 2024 08:38:29 +0000 Subject: [PATCH 2/8] subclass from TrainablePredictor --- bigframes/ml/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index e68aa73c66..ecf665747b 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -132,7 +132,6 @@ def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: @abc.abstractmethod def to_gbq(self, model_name, replace): pass - pass class TrainablePredictor(Predictor): @@ -164,7 +163,7 @@ def fit( return self._fit(X, y) -class TrainableWithEvaluationPredictor(Predictor): +class TrainableWithEvaluationPredictor(TrainablePredictor): """A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs. 
Additional evaluation data can be provided to measure the model in the fit phase.""" From 4278acfac73bff90a4a4bbfe444d9115172c4cce Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 22 Oct 2024 22:03:11 +0000 Subject: [PATCH 3/8] add support for fit-time evaluation in ensemble models --- bigframes/ml/ensemble.py | 56 ++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 0194d768b8..dbe4e926ee 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -52,7 +52,7 @@ @log_adapter.class_logger class XGBRegressor( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.xgboost.sklearn.XGBRegressor, ): __doc__ = bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__ @@ -145,14 +145,24 @@ def _fit( X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, + X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> XGBRegressor: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self @@ -200,7 +210,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: @log_adapter.class_logger class XGBClassifier( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.xgboost.sklearn.XGBClassifier, ): @@ -294,14 +304,24 @@ def _fit( X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], transforms: 
Optional[List[str]] = None, + X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> XGBClassifier: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self @@ -347,7 +367,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: @log_adapter.class_logger class RandomForestRegressor( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, ): @@ -430,14 +450,24 @@ def _fit( X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, + X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> RandomForestRegressor: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self @@ -503,7 +533,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso @log_adapter.class_logger class RandomForestClassifier( - base.SupervisedTrainablePredictor, + base.SupervisedTrainableWithEvaluationPredictor, bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, ): @@ 
-586,14 +616,24 @@ def _fit( X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, + X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> RandomForestClassifier: X, y = utils.convert_to_dataframe(X, y) + bqml_options = self._bqml_options + + if X_eval is not None and y_eval is not None: + X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval) + X, y, bqml_options = utils.combine_training_and_evaluation_data( + X, y, X_eval, y_eval, bqml_options + ) + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, - options=self._bqml_options, + options=bqml_options, ) return self From d724087a7aef157246803da57c99eefbb99fa9c6 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 22 Oct 2024 23:48:42 +0000 Subject: [PATCH 4/8] fetch logistic regression eval numbers from multiClassClassificationMetrics --- tests/system/large/ml/test_linear_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index aca2210b24..03fdd17dd6 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -347,7 +347,7 @@ def test_logistic_regression_configure_fit_with_eval_score( # returned by model.score() bq_model_expected_eval_metrics = result bq_model_eval_metrics = bq_model.training_runs[-1]["evaluationMetrics"][ - "binaryClassificationMetrics" + "multiClassClassificationMetrics" ]["aggregateClassificationMetrics"] bq_model_eval_metrics = pd.DataFrame( [ From 6b69a7f26000520e16732541d257d2b44d549e24 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 25 Oct 2024 20:10:34 +0000 Subject: [PATCH 5/8] use the generic type template --- bigframes/ml/base.py | 10 +++++----- bigframes/ml/ensemble.py | 18 +++++++++--------- bigframes/ml/linear_model.py | 8 ++++---- 3 files 
changed, 18 insertions(+), 18 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 84f8b3c5ae..5662e54d6d 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -22,7 +22,7 @@ """ import abc -from typing import cast, Optional, TypeVar, Union +from typing import cast, Optional, TypeVar import bigframes_vendored.sklearn.base @@ -190,10 +190,10 @@ class SupervisedTrainableWithEvaluationPredictor(TrainableWithEvaluationPredicto def fit( self: _T, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], - X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, - y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.ArrayType, + y: utils.ArrayType, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> _T: return self._fit(X, y, X_eval=X_eval, y_eval=y_eval) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 295791cd3f..253ef7c5c1 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional import bigframes_vendored.sklearn.ensemble._forest import bigframes_vendored.xgboost.sklearn @@ -145,8 +145,8 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, - X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, - y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> XGBRegressor: X, y = utils.convert_to_dataframe(X, y) @@ -304,8 +304,8 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, - X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, - y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> 
XGBClassifier: X, y = utils.convert_to_dataframe(X, y) @@ -450,8 +450,8 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, - X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, - y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> RandomForestRegressor: X, y = utils.convert_to_dataframe(X, y) @@ -616,8 +616,8 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, - X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, - y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> RandomForestClassifier: X, y = utils.convert_to_dataframe(X, y) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 7de5a5a28d..85be54e596 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -131,8 +131,8 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, - X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, - y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> LinearRegression: X, y = utils.convert_to_dataframe(X, y) @@ -293,8 +293,8 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, - X_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, - y_eval: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X_eval: Optional[utils.ArrayType] = None, + y_eval: Optional[utils.ArrayType] = None, ) -> LogisticRegression: X, y = utils.convert_to_dataframe(X, y) From 1abfa6d8f3938873f9b3c6ac1e193ae55c67169c Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 28 Oct 2024 18:22:23 +0000 Subject: [PATCH 6/8] update vendored docstrings for fit taking X_eval, y_eval --- 
.../bigframes_vendored/sklearn/ensemble/_forest.py | 7 +++++++ .../bigframes_vendored/sklearn/linear_model/_base.py | 7 +++++++ .../bigframes_vendored/sklearn/linear_model/_logistic.py | 8 ++++++++ third_party/bigframes_vendored/xgboost/sklearn.py | 7 +++++++ 4 files changed, 29 insertions(+) diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 1f6284c146..fb81bd6684 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -54,6 +54,13 @@ def fit(self, X, y): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation data. + + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). + Evaluation target values. Will be cast to X_eval's dtype if necessary. + Returns: ForestModel: Fitted estimator. diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index fa8f28a656..d6b8a473bd 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -108,6 +108,13 @@ def fit( Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation data. 
+ + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). + Evaluation target values. Will be cast to X_eval's dtype if necessary. + Returns: LinearRegression: Fitted estimator. """ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index f3419ba8a9..479be19596 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -79,6 +79,14 @@ def fit( y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples,). Target vector relative to X. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation vector, + where `n_samples` is the number of samples and `n_features` is + the number of features. + + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + DataFrame of shape (n_samples,). Target vector relative to X_eval. + Returns: LogisticRegression: Fitted estimator. diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index da1396af02..60a22e83d0 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -37,6 +37,13 @@ def fit(self, X, y): DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + X_eval (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Evaluation data. 
+ + y_eval (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples,) or (n_samples, n_targets). + Evaluation target values. Will be cast to X_eval's dtype if necessary. + Returns: XGBModel: Fitted estimator. """ From 54450c24f4bd73ac6c53b2c415a58ad4f59f82bb Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 28 Oct 2024 20:14:48 +0000 Subject: [PATCH 7/8] update key to fetch model eval metrics --- tests/system/large/ml/test_linear_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 03fdd17dd6..aca2210b24 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -347,7 +347,7 @@ def test_logistic_regression_configure_fit_with_eval_score( # returned by model.score() bq_model_expected_eval_metrics = result bq_model_eval_metrics = bq_model.training_runs[-1]["evaluationMetrics"][ - "multiClassClassificationMetrics" + "binaryClassificationMetrics" ]["aggregateClassificationMetrics"] bq_model_eval_metrics = pd.DataFrame( [ From 344ce423c0d9bcc700c69e8c116cdd63bee37b54 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 28 Oct 2024 22:38:42 +0000 Subject: [PATCH 8/8] enforce binary classification in the logistic regression test --- tests/system/large/ml/test_linear_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index aca2210b24..f6ca26e7e4 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -304,6 +304,8 @@ def test_logistic_regression_configure_fit_with_eval_score( model = bigframes.ml.linear_model.LogisticRegression() df = penguins_df_default_index.dropna() + df = df[df["sex"].isin(["MALE", "FEMALE"])] + X = df[ [ "species",