From f24dcebdc0f22f726dc88dfa6bc75c9ea8f3d684 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Mon, 17 Jun 2024 20:48:14 +0000 Subject: [PATCH 1/4] feat: add ml.preprocessing.PolynomialFeatures class --- bigframes/ml/preprocessing.py | 104 ++++++++++++++++++ bigframes/ml/sql.py | 12 ++ tests/system/small/ml/test_preprocessing.py | 67 +++++++++++ tests/unit/ml/test_sql.py | 7 ++ .../sklearn/preprocessing/_polynomial.py | 38 +++++++ 5 files changed, 228 insertions(+) create mode 100644 third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 4b1a3fb7b7..f3621d3a33 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -24,6 +24,7 @@ import bigframes_vendored.sklearn.preprocessing._discretization import bigframes_vendored.sklearn.preprocessing._encoder import bigframes_vendored.sklearn.preprocessing._label +import bigframes_vendored.sklearn.preprocessing._polynomial from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils @@ -661,6 +662,109 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger +class PolynomialFeatures( + base.Transformer, + bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures, +): + __doc__ = ( + bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures.__doc__ + ) + + def __init__(self, degree: int = 2): + self.degree = degree + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return ( + type(other) is PolynomialFeatures and self._bqml_model == other._bqml_model + ) + + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + """Compile this transformer to a list of SQL expressions that can be 
included in + a BQML TRANSFORM clause + + Args: + columns: + a list of column names to transform. + X (default None): + Ignored. + + Returns: a list of tuples of (sql_expression, output_name)""" + output_name = "poly_feat" + return [ + ( + self._base_sql_generator.ml_polynomial_expand( + columns, self.degree, output_name + ), + output_name, + ) + ] + + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, str]: + """Parse SQL to tuple(PolynomialFeatures, column_label). + + Args: + sql: SQL string of format "ML.POLYNOMIAL_EXPAND(STRUCT(col_label0, col_label1, ...), degree)" + + Returns: + tuple(MaxAbsScaler, column_label)""" + col_label = sql[sql.find("STRUCT(") + 7 : sql.find(")")] + degree = int(sql[sql.rfind(",") + 1 : sql.rfind(")")]) + return cls(degree), col_label + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> PolynomialFeatures: + (X,) = utils.convert_to_dataframe(X) + + compiled_transforms = self._compile_to_sql(X.columns.tolist()) + transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] + + self._bqml_model = self._bqml_model_factory.create_model( + X, + options={"model_type": "transform_only"}, + transforms=transform_sqls, + ) + + # TODO(garrettwu): generalize the approach to other transformers + output_names = [] + for transform_col in self._bqml_model._model._properties["transformColumns"]: + transform_col_dict = cast(dict, transform_col) + # pass the columns that are not transformed + if "transformSql" not in transform_col_dict: + continue + transform_sql: str = transform_col_dict["transformSql"] + if not transform_sql.startswith("ML."): + continue + + output_names.append(transform_col_dict["name"]) + + self._output_names = output_names + + return self + + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("Must be fitted before transform") + + (X,) = utils.convert_to_dataframe(X) + + df = 
self._bqml_model.transform(X) + return typing.cast( + bpd.DataFrame, + df[self._output_names], + ) + + # TODO(garrettwu): to_gbq() + + PreprocessingType = Union[ OneHotEncoder, StandardScaler, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index f060584a11..0399db3a10 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -73,6 +73,11 @@ def struct_options(self, **kwargs: Union[int, float]) -> str: """Encode a BQ STRUCT as options.""" return f"STRUCT({self.build_structs(**kwargs)})" + def struct_columns(self, columns: Iterable[str]) -> str: + """Encode a BQ Table columns to a STRUCT.""" + columns_str = ", ".join(columns) + return f"STRUCT({columns_str})" + def input(self, **kwargs: str) -> str: """Encode a BQML INPUT clause.""" return f"INPUT({self.build_schema(**kwargs)})" @@ -153,6 +158,13 @@ def ml_label_encoder( https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params.""" return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}""" + def ml_polynomial_expand( + self, columns: Iterable[str], degree: int, name: str + ) -> str: + """Encode ML.POLYNOMIAL_EXPAND. 
+ https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand""" + return f"""ML.POLYNOMIAL_EXPAND({self.struct_columns(columns)}, {degree}) AS {name}""" + def ml_distance( self, col_x: str, diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 5b457cc9c0..73b1855e09 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -19,6 +19,7 @@ import bigframes.features from bigframes.ml import preprocessing +from tests.system import utils ONE_HOT_ENCODED_DTYPE = ( pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))) @@ -840,3 +841,69 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id): # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn. + + +def test_poly_features_default_params(new_penguins_df): + transformer = preprocessing.PolynomialFeatures() + df = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]] + transformer.fit(df) + + result = transformer.transform(df).to_pandas() + + expected = pd.DataFrame( + { + "poly_feat_culmen_length_mm": [ + 39.5, + 38.5, + 37.9, + ], + "poly_feat_culmen_length_mm_culmen_length_mm": [ + 1560.25, + 1482.25, + 1436.41, + ], + "poly_feat_culmen_length_mm_culmen_depth_mm": [ + 742.6, + 662.2, + 685.99, + ], + "poly_feat_culmen_depth_mm": [ + 18.8, + 17.2, + 18.1, + ], + "poly_feat_culmen_depth_mm_culmen_depth_mm": [ + 353.44, + 295.84, + 327.61, + ], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) + + +def test_poly_features_params(new_penguins_df): + transformer = preprocessing.PolynomialFeatures(degree=3) + df = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]] + transformer.fit(df) + + result = transformer.transform(df).to_pandas() + + utils.check_pandas_df_schema_and_index( + result, + [ + 
"poly_feat_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm_culmen_depth_mm", + "poly_feat_culmen_length_mm_culmen_depth_mm", + "poly_feat_culmen_length_mm_culmen_depth_mm_culmen_depth_mm", + "poly_feat_culmen_depth_mm", + "poly_feat_culmen_depth_mm_culmen_depth_mm", + "poly_feat_culmen_depth_mm_culmen_depth_mm_culmen_depth_mm", + ], + [1633, 1672, 1690], + ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 01f173812c..e90146565d 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -145,6 +145,13 @@ def test_label_encoder_correct( assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a" +def test_polynomial_expand( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_polynomial_expand(["col_a", "col_b"], 2, "poly_exp") + assert sql == "ML.POLYNOMIAL_EXPAND(STRUCT(col_a, col_b), 2) AS poly_exp" + + def test_distance_correct( base_sql_generator: ml_sql.BaseSqlGenerator, mock_df: bpd.DataFrame, diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py new file mode 100644 index 0000000000..4e4624ba84 --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py @@ -0,0 +1,38 @@ +""" +This file contains preprocessing tools based on polynomials. +""" + +from bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin + +from bigframes import constants + + +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features.""" + + def fit(self, X, y=None): + """Compute number of output features. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The Dataframe or Series with training data. + + y (default None): + Ignored. 
+ + Returns: + PolynomialFeatures: Fitted transformer. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transform(self, X): + """Transform data to polynomial features. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. + + Returns: + bigframes.dataframe.DataFrame: Transformed result. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 533b2a158b60f5c6c02e78e113216713b95a7b1e Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Mon, 24 Jun 2024 23:42:44 +0000 Subject: [PATCH 2/4] feat: add PolynomialFeatures to_gbq and pipeline support --- bigframes/ml/base.py | 10 +++ bigframes/ml/compose.py | 60 +++++++++----- bigframes/ml/core.py | 9 ++ bigframes/ml/impute.py | 13 +-- bigframes/ml/pipeline.py | 1 + bigframes/ml/preprocessing.py | 83 +++++++------------ tests/system/large/ml/test_pipeline.py | 52 +++++++++++- tests/system/small/ml/test_preprocessing.py | 32 +++++++ .../sklearn/preprocessing/_polynomial.py | 7 +- 9 files changed, 181 insertions(+), 86 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 6c81b66e55..70854a36e9 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -184,6 +184,16 @@ class BaseTransformer(BaseEstimator): def __init__(self): self._bqml_model: Optional[core.BqmlModel] = None + @abc.abstractmethod + def _keys(self): + pass + + def __eq__(self, other) -> bool: + return type(self) is type(other) and self._keys() == other._keys() + + def __hash__(self) -> int: + return hash(self._keys()) + _T = TypeVar("_T", bound="BaseTransformer") def to_gbq(self: _T, model_name: str, replace: bool = False) -> _T: diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index abf1a662b9..e285572c1c 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -21,7 +21,7 @@ import re import types import typing -from typing import cast, List, Optional, Tuple, Union +from typing import
cast, Iterable, List, Optional, Set, Tuple, Union import bigframes_vendored.sklearn.compose._column_transformer from google.cloud import bigquery @@ -40,6 +40,7 @@ "ML.BUCKETIZE": preprocessing.KBinsDiscretizer, "ML.QUANTILE_BUCKETIZE": preprocessing.KBinsDiscretizer, "ML.LABEL_ENCODER": preprocessing.LabelEncoder, + "ML.POLYNOMIAL_EXPAND": preprocessing.PolynomialFeatures, "ML.IMPUTER": impute.SimpleImputer, } ) @@ -56,21 +57,24 @@ class ColumnTransformer( def __init__( self, - transformers: List[ + transformers: Iterable[ Tuple[ str, Union[preprocessing.PreprocessingType, impute.SimpleImputer], - Union[str, List[str]], + Union[str, Iterable[str]], ] ], ): # TODO: if any(transformers) has fitted raise warning - self.transformers = transformers + self.transformers = list(transformers) self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() # call self.transformers_ to check chained transformers self.transformers_ + def _keys(self): + return (self.transformers, self._bqml_model) + @property def transformers_( self, @@ -107,13 +111,13 @@ def _extract_from_bq_model( """Extract transformers as ColumnTransformer obj from a BQ Model. Keep the _bqml_model field as None.""" assert "transformColumns" in bq_model._properties - transformers: List[ + transformers_set: Set[ Tuple[ str, Union[preprocessing.PreprocessingType, impute.SimpleImputer], Union[str, List[str]], ] - ] = [] + ] = set() def camel_to_snake(name): name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) @@ -134,7 +138,7 @@ def camel_to_snake(name): for prefix in _BQML_TRANSFROM_TYPE_MAPPING: if transform_sql.startswith(prefix): transformer_cls = _BQML_TRANSFROM_TYPE_MAPPING[prefix] - transformers.append( + transformers_set.add( ( camel_to_snake(transformer_cls.__name__), *transformer_cls._parse_from_sql(transform_sql), # type: ignore @@ -148,7 +152,7 @@ def camel_to_snake(name): f"Unsupported transformer type. 
{constants.FEEDBACK_LINK}" ) - transformer = cls(transformers=transformers) + transformer = cls(transformers=list(transformers_set)) transformer._output_names = output_names return transformer @@ -159,23 +163,37 @@ def _merge( ColumnTransformer, Union[preprocessing.PreprocessingType, impute.SimpleImputer] ]: """Try to merge the column transformer to a simple transformer. Depends on all the columns in bq_model are transformed with the same transformer.""" - transformers = self.transformers_ + transformers = self.transformers assert len(transformers) > 0 _, transformer_0, column_0 = transformers[0] + feature_columns_sorted = sorted( + [ + cast(str, feature_column.name) + for feature_column in bq_model.feature_columns + ] + ) + + if ( + len(transformers) == 1 + and isinstance(transformer_0, preprocessing.PolynomialFeatures) + and sorted(column_0) == feature_columns_sorted + ): + transformer_0._output_names = self._output_names + return transformer_0 + + if isinstance(column_0, Iterable): + return self columns = [column_0] for _, transformer, column in transformers[1:]: + if isinstance(column, Iterable): + return self # all transformers are the same if transformer != transformer_0: return self columns.append(column) # all feature columns are transformed - if sorted( - [ - cast(str, feature_column.name) - for feature_column in bq_model.feature_columns - ] - ) == sorted(columns): + if sorted(columns) == feature_columns_sorted: transformer_0._output_names = self._output_names return transformer_0 @@ -197,12 +215,12 @@ def _compile_to_sql( Returns: a list of tuples of (sql_expression, output_name)""" - return [ - transformer._compile_to_sql([column], X=X)[0] - for column in columns - for _, transformer, target_column in self.transformers_ - if column == target_column - ] + result = [] + for _, transformer, target_columns in self.transformers: + if isinstance(target_columns, str): + target_columns = [target_columns] + result += transformer._compile_to_sql(target_columns, 
X=X) + return result def fit( self, diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 168bc584f7..ee4d8a8c27 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -112,6 +112,15 @@ def __init__(self, session: bigframes.Session, model: bigquery.Model): self.model_name ) + def _keys(self): + return (self._session, self._model) + + def __eq__(self, other): + return isinstance(other, self.__class__) and self._keys() == other._keys() + + def __hash__(self): + return hash(self._keys()) + @property def session(self) -> bigframes.Session: """Get the BigQuery DataFrames session that this BQML model wrapper is tied to""" diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index d21fcbb1ad..ae71637aa5 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import Any, List, Literal, Optional, Tuple, Union +from typing import Iterable, List, Literal, Optional, Tuple, Union import bigframes_vendored.sklearn.impute._base @@ -44,17 +44,12 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return ( - type(other) is SimpleImputer - and self.strategy == other.strategy - and self._bqml_model == other._bqml_model - ) + def _keys(self): + return (self._bqml_model, self.strategy) def _compile_to_sql( self, - columns: List[str], + columns: Iterable[str], X=None, ) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 03e5688453..04b8d73cf5 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -64,6 +64,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): preprocessing.MinMaxScaler, preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, + 
preprocessing.PolynomialFeatures, impute.SimpleImputer, ), ): diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index f3621d3a33..07fdc171cf 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import Any, cast, List, Literal, Optional, Tuple, Union +from typing import cast, Iterable, List, Literal, Optional, Tuple, Union import bigframes_vendored.sklearn.preprocessing._data import bigframes_vendored.sklearn.preprocessing._discretization @@ -43,11 +43,10 @@ def __init__(self): self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return type(other) is StandardScaler and self._bqml_model == other._bqml_model + def _keys(self): + return (self._bqml_model,) - def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -125,11 +124,10 @@ def __init__(self): self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return type(other) is MaxAbsScaler and self._bqml_model == other._bqml_model + def _keys(self): + return (self._bqml_model,) - def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -207,11 +205,10 @@ def __init__(self): self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # 
TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return type(other) is MinMaxScaler and self._bqml_model == other._bqml_model + def _keys(self): + return (self._bqml_model,) - def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -301,18 +298,12 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return ( - type(other) is KBinsDiscretizer - and self.n_bins == other.n_bins - and self.strategy == other.strategy - and self._bqml_model == other._bqml_model - ) + def _keys(self): + return (self._bqml_model, self.n_bins, self.strategy) def _compile_to_sql( self, - columns: List[str], + columns: Iterable[str], X: bpd.DataFrame, ) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in @@ -446,17 +437,10 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return ( - type(other) is OneHotEncoder - and self._bqml_model == other._bqml_model - and self.drop == other.drop - and self.min_frequency == other.min_frequency - and self.max_categories == other.max_categories - ) + def _keys(self): + return (self._bqml_model, self.drop, self.min_frequency, self.max_categories) - def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -572,16 +556,10 @@ def __init__( 
self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return ( - type(other) is LabelEncoder - and self._bqml_model == other._bqml_model - and self.min_frequency == other.min_frequency - and self.max_categories == other.max_categories - ) + def _keys(self): + return (self._bqml_model, self.min_frequency, self.max_categories) - def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -672,18 +650,17 @@ class PolynomialFeatures( ) def __init__(self, degree: int = 2): + if degree not in range(1, 5): + raise ValueError(f"degree has to be [1, 4], input is {degree}.") self.degree = degree self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() - # TODO(garrettwu): implement __hash__ - def __eq__(self, other: Any) -> bool: - return ( - type(other) is PolynomialFeatures and self._bqml_model == other._bqml_model - ) + def _keys(self): + return (self._bqml_model, self.degree) - def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -705,17 +682,18 @@ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: ] @classmethod - def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, str]: - """Parse SQL to tuple(PolynomialFeatures, column_label). 
+ def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, tuple[str, ...]]: + """Parse SQL to tuple(PolynomialFeatures, column_labels). Args: sql: SQL string of format "ML.POLYNOMIAL_EXPAND(STRUCT(col_label0, col_label1, ...), degree)" Returns: tuple(MaxAbsScaler, column_label)""" - col_label = sql[sql.find("STRUCT(") + 7 : sql.find(")")] + col_labels = sql[sql.find("STRUCT(") + 7 : sql.find(")")].split(",") + col_labels = [label.strip() for label in col_labels] degree = int(sql[sql.rfind(",") + 1 : sql.rfind(")")]) - return cls(degree), col_label + return cls(degree), tuple(col_labels) def fit( self, @@ -762,8 +740,6 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: df[self._output_names], ) - # TODO(garrettwu): to_gbq() - PreprocessingType = Union[ OneHotEncoder, @@ -772,4 +748,5 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: MinMaxScaler, KBinsDiscretizer, LabelEncoder, + PolynomialFeatures, ] diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 6e18248e0f..84a6b11ff2 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -487,6 +487,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind preprocessing.LabelEncoder(), "species", ), + ( + "poly_feats", + preprocessing.PolynomialFeatures(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ), ), @@ -567,6 +572,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id impute.SimpleImputer(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "polynomial_features", + preprocessing.PolynomialFeatures(), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -589,7 +599,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id ) assert isinstance(pl_loaded._transform, compose.ColumnTransformer) - transformers = 
pl_loaded._transform.transformers_ + transformers = pl_loaded._transform.transformers expected = [ ( "one_hot_encoder", @@ -629,9 +639,14 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id impute.SimpleImputer(), "flipper_length_mm", ), + ( + "polynomial_features", + preprocessing.PolynomialFeatures(), + ("culmen_length_mm", "flipper_length_mm"), + ), ] - assert transformers == expected + assert set(transformers) == set(expected) assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) assert pl_loaded._estimator.fit_intercept is False @@ -849,3 +864,36 @@ def test_pipeline_simple_imputer_to_gbq(penguins_df_default_index, dataset_id): assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) assert pl_loaded._estimator.fit_intercept is False + + +def test_pipeline_poly_features_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ( + "transform", + preprocessing.PolynomialFeatures(degree=3), + ), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "culmen_length_mm", + "flipper_length_mm", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_poly_features", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.PolynomialFeatures) + + poly_features = pl_loaded._transform + assert poly_features.degree == 3 + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 73b1855e09..16b153ab45 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -907,3 +907,35 @@ def test_poly_features_params(new_penguins_df): ], [1633, 1672, 1690], ) + + +def 
test_poly_features_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.PolynomialFeatures(degree=3) + transformer.fit(new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.PolynomialFeatures) + assert reloaded_transformer.degree == 3 + assert reloaded_transformer._bqml_model is not None + + result = reloaded_transformer.transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]] + ).to_pandas() + + utils.check_pandas_df_schema_and_index( + result, + [ + "poly_feat_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm_culmen_length_mm", + "poly_feat_culmen_length_mm_culmen_length_mm_culmen_depth_mm", + "poly_feat_culmen_length_mm_culmen_depth_mm", + "poly_feat_culmen_length_mm_culmen_depth_mm_culmen_depth_mm", + "poly_feat_culmen_depth_mm", + "poly_feat_culmen_depth_mm_culmen_depth_mm", + "poly_feat_culmen_depth_mm_culmen_depth_mm_culmen_depth_mm", + ], + [1633, 1672, 1690], + ) diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py index 4e4624ba84..9ad43b7956 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py @@ -8,7 +8,12 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): - """Generate polynomial and interaction features.""" + """Generate polynomial and interaction features. + + Args: + degree (int): + Specifies the maximal degree of the polynomial features. Valid values [1, 4]. Default to 2. + """ def fit(self, X, y=None): """Compute number of output features. 
From 33892a33ec90d8c40fbc81b9582660a4fdc5444d Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 25 Jun 2024 00:14:53 +0000 Subject: [PATCH 3/4] fix tests --- bigframes/ml/compose.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index e285572c1c..7f1bfe8d55 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -182,11 +182,11 @@ def _merge( transformer_0._output_names = self._output_names return transformer_0 - if isinstance(column_0, Iterable): + if not isinstance(column_0, str): return self columns = [column_0] for _, transformer, column in transformers[1:]: - if isinstance(column, Iterable): + if not isinstance(column, str): return self # all transformers are the same if transformer != transformer_0: From 7e5ab52b14a0f0c59c8b6f34ff295cf2673bdc40 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 25 Jun 2024 23:36:05 +0000 Subject: [PATCH 4/4] fix tests --- tests/system/large/ml/test_compose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 45322e78dd..59c5a1538f 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -123,7 +123,7 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), ] - assert reloaded_transformer.transformers_ == expected + assert set(reloaded_transformer.transformers) == set(expected) assert reloaded_transformer._bqml_model is not None result = transformer.fit_transform(