From e803f7a398270f35b16084519a75450dcd4bb96f Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 25 Sep 2023 21:54:01 +0000 Subject: [PATCH 1/4] fix: LabelEncoder params consistent with Sklearn --- bigframes/ml/preprocessing.py | 17 +++++------ tests/system/small/ml/test_preprocessing.py | 30 ++++--------------- .../sklearn/preprocessing/_label.py | 12 ++++---- 3 files changed, 19 insertions(+), 40 deletions(-) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index f4f5446651..24138ae660 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -399,18 +399,17 @@ def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]: return cls(min_frequency, max_categories), col_label - def fit( + def fit( # type: ignore[override] self, - X: Union[bpd.DataFrame, bpd.Series], - y=None, # ignored + y: Union[bpd.DataFrame, bpd.Series], ) -> LabelEncoder: - (X,) = utils.convert_to_dataframe(X) + (y,) = utils.convert_to_dataframe(y) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) + compiled_transforms = self._compile_to_sql(y.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] self._bqml_model = self._bqml_model_factory.create_model( - X, + y, options={"model_type": "transform_only"}, transforms=transform_sqls, ) @@ -419,13 +418,13 @@ def fit( self._output_names = [name for _, name in compiled_transforms] return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (y,) = utils.convert_to_dataframe(y) - df = self._bqml_model.transform(X) + df = self._bqml_model.transform(y) return typing.cast( bpd.DataFrame, df[self._output_names], diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 7779eb8f6e..61bddb144d 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -357,9 +357,9 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ def test_label_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - encoder.fit(new_penguins_df[["species", "sex"]]) + encoder.fit(new_penguins_df["species"]) - result = encoder.transform(new_penguins_df).to_pandas() + result = encoder.transform(new_penguins_df["species"]).to_pandas() # TODO: bug? feature columns seem to be in nondeterministic random order # workaround: sort columns by name. Can't repro it in pantheon, so could @@ -368,11 +368,6 @@ def test_label_encoder_default_params(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 2, - 1, - 1, - ], "labelencoded_species": [ 1, 1, @@ -389,7 +384,7 @@ def test_label_encoder_default_params(new_penguins_df): def test_label_encoder_default_params_fit_transform(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() + result = encoder.fit_transform(new_penguins_df[["species"]]).to_pandas() # TODO: bug? feature columns seem to be in nondeterministic random order # workaround: sort columns by name. Can't repro it in pantheon, so could @@ -398,11 +393,6 @@ def test_label_encoder_default_params_fit_transform(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 2, - 1, - 1, - ], "labelencoded_species": [ 1, 1, @@ -444,7 +434,7 @@ def test_label_encoder_series_default_params(new_penguins_df): def test_label_encoder_params(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2) - encoder.fit(new_penguins_df[["species", "sex"]]) + encoder.fit(new_penguins_df[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -455,11 +445,6 @@ def test_label_encoder_params(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 0, - 0, - 0, - ], "labelencoded_species": [ 0, 0, @@ -475,7 +460,7 @@ def test_label_encoder_params(new_penguins_df): def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - encoder.fit(penguins_df_default_index[["species", "sex"]]) + encoder.fit(penguins_df_default_index[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -486,11 +471,6 @@ def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df expected = pd.DataFrame( { - "labelencoded_sex": [ - 3, - 2, - 2, - ], "labelencoded_species": [ 1, 1, diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index 7e60c846d4..83f8eb0f9c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -28,11 +28,11 @@ class LabelEncoder(BaseEstimator): Default None, set limit to 1,000,000. """ - def fit(self, X): - """Fit LabelEncoder to X. + def fit(self, y): + """Fit label encoder. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series): The DataFrame or Series with training data. Returns: @@ -40,11 +40,11 @@ def fit(self, X): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def transform(self, X): - """Transform X using label encoding. + def transform(self, y): + """Transform y using label encoding. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series): The DataFrame or Series to be transformed. Returns: From ab00cacb87fed135ddba2437bf735756620a1f1d Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 25 Sep 2023 22:16:37 +0000 Subject: [PATCH 2/4] add LabelTransformer --- bigframes/ml/base.py | 20 ++++++++++++++++++++ bigframes/ml/preprocessing.py | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index f07274f8fc..c4031efac5 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -195,3 +195,23 @@ def fit_transform( y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> bpd.DataFrame: return self.fit(X, y).transform(X) + + +class LabelTransformer(BaseEstimator): + """A BigQuery DataFrames Transformer base class that transforms data. + + Also the transformers can be attached to a pipeline with a predictor.""" + + @abc.abstractmethod + def fit(self, y): + pass + + @abc.abstractmethod + def transform(self, y): + pass + + def fit_transform( + self, + y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + ) -> bpd.DataFrame: + return self.fit(y).transform(y) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 24138ae660..ed0b36deef 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -315,7 +315,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: class LabelEncoder( - base.Transformer, + base.LabelTransformer, third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder, ): # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax @@ -399,7 +399,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]: return cls(min_frequency, max_categories), col_label - def fit( # type: ignore[override] + def fit( self, y: Union[bpd.DataFrame, bpd.Series], ) -> LabelEncoder: From a77c978f8bc8366bfa00d0a63149912be28a9fef Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 25 Sep 2023 23:48:53 +0000 Subject: [PATCH 3/4] address comments for base LabelTransformer --- bigframes/ml/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index c4031efac5..7570f249a3 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -198,7 +198,7 @@ def fit_transform( class LabelTransformer(BaseEstimator): - """A BigQuery DataFrames Transformer base class that transforms data. + """A BigQuery DataFrames Label Transformer base class that transforms data. Also the transformers can be attached to a pipeline with a predictor.""" @@ -212,6 +212,6 @@ def transform(self, y): def fit_transform( self, - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + y: Optional[Union[bpd.DataFrame, bpd.Series]], ) -> bpd.DataFrame: return self.fit(y).transform(y) From db598a678de80fb5457c324d6f72a2ca30ac487e Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 25 Sep 2023 23:57:25 +0000 Subject: [PATCH 4/4] fix params --- bigframes/ml/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 7570f249a3..f899ac7119 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -212,6 +212,6 @@ def transform(self, y): def fit_transform( self, - y: Optional[Union[bpd.DataFrame, bpd.Series]], + y: Union[bpd.DataFrame, bpd.Series], ) -> bpd.DataFrame: return self.fit(y).transform(y)