Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: support data split for evaluation in linear and ensemble models #1081

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
105 changes: 92 additions & 13 deletions — bigframes/ml/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from __future__ import annotations

from typing import Dict, List, Literal, Optional, Union
from typing import List, Literal, Optional, Union

import bigframes_vendored.sklearn.ensemble._forest
import bigframes_vendored.xgboost.sklearn
Expand Down Expand Up @@ -47,6 +47,9 @@
"max_iterations": "maxIterations",
"enable_global_explain": "enableGlobalExplain",
"xgboost_version": "xgboostVersion",
"data_split_method": "dataSplitMethod",
"data_split_eval_fraction": "dataSplitEvalFraction",
"data_split_col": "dataSplitColumn",
}


Expand Down Expand Up @@ -78,6 +81,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.booster = booster
Expand All @@ -97,6 +109,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -115,11 +130,11 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "BOOSTED_TREE_REGRESSOR",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"booster_type": self.booster,
Expand All @@ -140,6 +155,13 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down Expand Up @@ -227,6 +249,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.booster = booster
Expand All @@ -246,6 +277,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -264,11 +298,11 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "BOOSTED_TREE_CLASSIFIER",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"booster_type": self.booster,
Expand All @@ -289,6 +323,13 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down Expand Up @@ -370,6 +411,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.tree_method = tree_method
Expand All @@ -385,6 +435,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -403,9 +456,9 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "RANDOM_FOREST_REGRESSOR",
"early_stop": True,
"num_parallel_tree": self.n_estimators,
Expand All @@ -420,11 +473,18 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"min_rel_progress": self.tol,
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down Expand Up @@ -526,6 +586,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.tree_method = tree_method
Expand All @@ -541,6 +610,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -559,9 +631,9 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "RANDOM_FOREST_CLASSIFIER",
"early_stop": True,
"num_parallel_tree": self.n_estimators,
Expand All @@ -576,11 +648,18 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"min_rel_progress": self.tol,
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down
39 changes: 37 additions & 2 deletions — bigframes/ml/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
"warm_start": "warmStart",
"calculate_p_values": "calculatePValues",
"enable_global_explain": "enableGlobalExplain",
"data_split_method": "dataSplitMethod",
"data_split_eval_fraction": "dataSplitEvalFraction",
"data_split_col": "dataSplitColumn",
}


Expand Down Expand Up @@ -69,6 +72,15 @@ def __init__(
ls_init_learning_rate: Optional[float] = None,
calculate_p_values: bool = False,
enable_global_explain: bool = False,
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.optimize_strategy = optimize_strategy
self.fit_intercept = fit_intercept
Expand All @@ -82,6 +94,9 @@ def __init__(
self.ls_init_learning_rate = ls_init_learning_rate
self.calculate_p_values = calculate_p_values
self.enable_global_explain = enable_global_explain
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -104,7 +119,7 @@ def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
options = {
"model_type": "LINEAR_REG",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"optimize_strategy": self.optimize_strategy,
"fit_intercept": self.fit_intercept,
"l2_reg": self.l2_reg,
Expand All @@ -123,6 +138,10 @@ def _bqml_options(self) -> dict:
# Even presenting warm_start returns error for NORMAL_EQUATION optimizer
if self.warm_start:
options["warm_start"] = self.warm_start
if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

Expand Down Expand Up @@ -209,6 +228,15 @@ def __init__(
calculate_p_values: bool = False,
enable_global_explain: bool = False,
class_weight: Optional[Union[Literal["balanced"], Dict[str, float]]] = None,
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.optimize_strategy = optimize_strategy
self.fit_intercept = fit_intercept
Expand All @@ -223,6 +251,9 @@ def __init__(
self.calculate_p_values = calculate_p_values
self.enable_global_explain = enable_global_explain
self.class_weight = class_weight
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._auto_class_weight = class_weight == "balanced"
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()
Expand Down Expand Up @@ -253,7 +284,7 @@ def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
options = {
"model_type": "LOGISTIC_REG",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"fit_intercept": self.fit_intercept,
"auto_class_weights": self._auto_class_weight,
"optimize_strategy": self.optimize_strategy,
Expand All @@ -275,6 +306,10 @@ def _bqml_options(self) -> dict:
# Even presenting warm_start returns error for NORMAL_EQUATION optimizer
if self.warm_start:
options["warm_start"] = self.warm_start
if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

Expand Down
Loading
Loading
Morty Proxy: This is a proxified and sanitized view of the page; visit the original site.