Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: allow fit to take additional eval data in linear and ensemble models #1096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions 34 bigframes/ml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,40 @@ def fit(
return self._fit(X, y)


class TrainableWithEvaluationPredictor(TrainablePredictor):
    """Base class for BigQuery DataFrames ML models that can be fit and used to predict.

    Compared to its parent, the abstract ``_fit`` hook additionally accepts
    optional evaluation data (``X_eval`` / ``y_eval``) so the model can be
    measured during the fit phase.
    """

    @abc.abstractmethod
    def _fit(self, X, y, transforms=None, X_eval=None, y_eval=None):
        """Fit the model; concrete subclasses must implement this.

        Args:
            X: Training features.
            y: Training labels.
            transforms: Optional transforms forwarded by the caller.
            X_eval: Optional evaluation features.
            y_eval: Optional evaluation labels.
        """

    @abc.abstractmethod
    def score(self, X, y):
        """Score the model on ``X`` and ``y``; concrete subclasses must implement this."""


class SupervisedTrainableWithEvaluationPredictor(TrainableWithEvaluationPredictor):
    """Base class for supervised BigQuery DataFrames ML models that can be fit and used to predict.

    Supervised tasks require both ``X`` and ``y``.

    ``X_eval`` and ``y_eval`` may additionally be supplied so the model can be
    measured during the fit phase.
    """

    # Self-type so that ``fit`` is annotated as returning the concrete subclass.
    _T = TypeVar("_T", bound="SupervisedTrainableWithEvaluationPredictor")

    def fit(
        self: _T,
        X: utils.ArrayType,
        y: utils.ArrayType,
        X_eval: Optional[utils.ArrayType] = None,
        y_eval: Optional[utils.ArrayType] = None,
    ) -> _T:
        """Fit the model, optionally passing evaluation data along.

        Args:
            X: Training features.
            y: Training labels.
            X_eval: Optional evaluation features, forwarded unchanged to ``_fit``.
            y_eval: Optional evaluation labels, forwarded unchanged to ``_fit``.

        Returns:
            Whatever the subclass's ``_fit`` returns.
        """
        # No validation happens here: how a lone X_eval or y_eval is treated
        # is decided entirely by the subclass's ``_fit`` implementation.
        fitted = self._fit(X, y, X_eval=X_eval, y_eval=y_eval)
        return fitted


class UnsupervisedTrainablePredictor(TrainablePredictor):
"""A BigQuery DataFrames ML Unsupervised Model base class that can be used to fit and predict outputs.

Expand Down
56 changes: 48 additions & 8 deletions 56 bigframes/ml/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

@log_adapter.class_logger
class XGBRegressor(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.xgboost.sklearn.XGBRegressor,
):
__doc__ = bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__
Expand Down Expand Up @@ -145,14 +145,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[utils.ArrayType] = None,
y_eval: Optional[utils.ArrayType] = None,
) -> XGBRegressor:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -200,7 +210,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor:

@log_adapter.class_logger
class XGBClassifier(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.xgboost.sklearn.XGBClassifier,
):

Expand Down Expand Up @@ -294,14 +304,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[utils.ArrayType] = None,
y_eval: Optional[utils.ArrayType] = None,
) -> XGBClassifier:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -347,7 +367,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier:

@log_adapter.class_logger
class RandomForestRegressor(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor,
):

Expand Down Expand Up @@ -430,14 +450,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[utils.ArrayType] = None,
y_eval: Optional[utils.ArrayType] = None,
) -> RandomForestRegressor:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -503,7 +533,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso

@log_adapter.class_logger
class RandomForestClassifier(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier,
):

Expand Down Expand Up @@ -586,14 +616,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[utils.ArrayType] = None,
y_eval: Optional[utils.ArrayType] = None,
) -> RandomForestClassifier:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down
29 changes: 24 additions & 5 deletions 29 bigframes/ml/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

@log_adapter.class_logger
class LinearRegression(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.linear_model._base.LinearRegression,
):
__doc__ = bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__
Expand Down Expand Up @@ -131,14 +131,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[utils.ArrayType] = None,
y_eval: Optional[utils.ArrayType] = None,
) -> LinearRegression:
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down Expand Up @@ -183,7 +193,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression:

@log_adapter.class_logger
class LogisticRegression(
base.SupervisedTrainablePredictor,
base.SupervisedTrainableWithEvaluationPredictor,
bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression,
):
__doc__ = (
Expand Down Expand Up @@ -283,15 +293,24 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
X_eval: Optional[utils.ArrayType] = None,
y_eval: Optional[utils.ArrayType] = None,
) -> LogisticRegression:
"""Fit model with transforms."""
X, y = utils.convert_to_dataframe(X, y)

bqml_options = self._bqml_options

if X_eval is not None and y_eval is not None:
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
X, y, bqml_options = utils.combine_training_and_evaluation_data(
X, y, X_eval, y_eval, bqml_options
)

self._bqml_model = self._bqml_model_factory.create_model(
X,
y,
transforms=transforms,
options=self._bqml_options,
options=bqml_options,
)
return self

Expand Down
38 changes: 36 additions & 2 deletions 38 bigframes/ml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
# limitations under the License.

import typing
from typing import Any, Generator, Literal, Mapping, Optional, Union
from typing import Any, Generator, Literal, Mapping, Optional, Tuple, Union

import bigframes_vendored.constants as constants
from google.cloud import bigquery
import pandas as pd

from bigframes.core import blocks
from bigframes.core import blocks, guid
import bigframes.pandas as bpd
from bigframes.session import Session

Expand Down Expand Up @@ -155,3 +155,37 @@ def retrieve_params_from_bq_model(
kwargs[bf_param] = bf_param_type(last_fitting[bqml_param])

return kwargs


def combine_training_and_evaluation_data(
    X_train: bpd.DataFrame,
    y_train: bpd.DataFrame,
    X_eval: bpd.DataFrame,
    y_eval: bpd.DataFrame,
    bqml_options: dict,
) -> Tuple[bpd.DataFrame, bpd.DataFrame, dict]:
    """Combine training and evaluation data into single feature/label tables.

    The rows stay differentiated through a generated boolean split column that
    is added to the combined features: ``False`` for training rows and ``True``
    for evaluation rows. None of the inputs is modified.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_eval: Evaluation features; must have the same columns as ``X_train``.
        y_eval: Evaluation labels; must have the same columns as ``y_train``.
        bqml_options: BQML options to extend; a copy is returned.

    Returns:
        A tuple ``(X, y, options)``: the concatenated features (including the
        split column), the concatenated labels, and a copy of ``bqml_options``
        with ``data_split_method`` set to ``"CUSTOM"`` and ``data_split_col``
        set to the generated split column name.

    Raises:
        AssertionError: If the train/eval column sets differ, or the generated
            split column name collides with an existing feature column.
    """
    # NOTE(review): asserts are stripped under ``python -O``; kept as asserts
    # so the exception type seen by existing callers does not change.
    assert X_train.columns.equals(
        X_eval.columns
    ), "X_train and X_eval must have the same columns"
    assert y_train.columns.equals(
        y_eval.columns
    ), "y_train and y_eval must have the same columns"

    # create a custom split column for BQML and supply the evaluation
    # data along with the training data in a combined single table
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-dnn-models#data_split_col.
    split_col = guid.generate_guid()
    assert split_col not in X_train.columns

    # Use ``assign`` (returns a new DataFrame) rather than item assignment so
    # the split column never leaks into the DataFrames owned by the caller.
    X = bpd.concat(
        [
            X_train.assign(**{split_col: False}),
            X_eval.assign(**{split_col: True}),
        ]
    )
    y = bpd.concat([y_train, y_eval])

    # create options copy to not mutate the incoming one
    bqml_options = bqml_options.copy()
    bqml_options["data_split_method"] = "CUSTOM"
    bqml_options["data_split_col"] = split_col

    return X, y, bqml_options
Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.