diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 81181b58cf..550b4a8178 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -22,11 +22,12 @@ """ import abc -from typing import cast, Optional, TypeVar, Union +from typing import cast, Optional, TypeVar import bigframes_vendored.sklearn.base from bigframes.ml import core +import bigframes.ml.utils as utils import bigframes.pandas as bpd @@ -157,8 +158,8 @@ class SupervisedTrainablePredictor(TrainablePredictor): def fit( self: _T, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> _T: return self._fit(X, y) @@ -172,8 +173,8 @@ class UnsupervisedTrainablePredictor(TrainablePredictor): def fit( self: _T, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.ArrayType, + y: Optional[utils.ArrayType] = None, ) -> _T: return self._fit(X, y) @@ -243,8 +244,8 @@ def transform(self, X): def fit_transform( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.ArrayType, + y: Optional[utils.ArrayType] = None, ) -> bpd.DataFrame: return self.fit(X, y).transform(X) @@ -264,6 +265,6 @@ def transform(self, y): def fit_transform( self, - y: Union[bpd.DataFrame, bpd.Series], + y: utils.ArrayType, ) -> bpd.DataFrame: return self.fit(y).transform(y) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 43cfbdd424..a221ea8e89 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -21,6 +21,7 @@ import bigframes_vendored.sklearn.cluster._kmeans from google.cloud import bigquery +import pandas as pd import bigframes from bigframes.core import log_adapter @@ -101,7 +102,7 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored transforms: Optional[List[str]] = None, ) -> KMeans: @@ -125,17 +126,20 @@ def cluster_centers_(self) -> bpd.DataFrame: def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def detect_anomalies( - self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1 + self, + X: Union[bpd.DataFrame, bpd.Series, pd.DataFrame, pd.Series], + *, + contamination: float = 0.1, ) -> bpd.DataFrame: """Detect the anomaly data points of the input. 
@@ -156,7 +160,7 @@ def detect_anomalies( if not self._bqml_model: raise RuntimeError("A model must be fitted before detect_anomalies") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.detect_anomalies( X, options={"contamination": contamination} @@ -181,12 +185,12 @@ def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: def score( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.evaluate(X) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 14cf12014f..27d9bfb4f4 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -332,7 +332,7 @@ def _compile_to_sql( def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> ColumnTransformer: (X,) = utils.convert_to_dataframe(X) @@ -347,11 +347,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 41dea7617f..aaf06ef5c9 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -84,7 +84,7 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, transforms: Optional[List[str]] = None, ) -> PCA: @@ -129,16 +129,19 @@ def explained_variance_ratio_(self) -> bpd.DataFrame: ["principal_component_id", "explained_variance_ratio"] ] - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def detect_anomalies( - self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1 + self, + X: utils.ArrayType, + *, + contamination: float = 0.1, ) -> bpd.DataFrame: """Detect the anomaly data points of the input. 
@@ -159,7 +162,7 @@ def detect_anomalies( if not self._bqml_model: raise RuntimeError("A model must be fitted before detect_anomalies") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.detect_anomalies( X, options={"contamination": contamination} diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 0194d768b8..91c14e4336 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional import bigframes_vendored.sklearn.ensemble._forest import bigframes_vendored.xgboost.sklearn @@ -142,8 +142,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> XGBRegressor: X, y = utils.convert_to_dataframe(X, y) @@ -158,24 +158,24 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): - X, y = utils.convert_to_dataframe(X, y) - if not self._bqml_model: raise RuntimeError("A model must be fitted before score") + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) + input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None ) @@ -291,8 +291,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> XGBClassifier: X, y = utils.convert_to_dataframe(X, y) @@ -305,22 +305,22 @@ def _fit( ) return self - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None @@ -427,8 +427,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> RandomForestRegressor: X, y = utils.convert_to_dataframe(X, y) @@ -443,18 +443,18 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not 
self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): """Calculate evaluation metrics of the model. @@ -476,7 +476,7 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None @@ -583,8 +583,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> RandomForestClassifier: X, y = utils.convert_to_dataframe(X, y) @@ -599,18 +599,18 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): """Calculate evaluation metrics of the model. @@ -632,7 +632,7 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index a1ae8435d5..29ace99c61 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import List, Optional, Union +from typing import List, Optional from google.cloud import bigquery @@ -180,8 +180,8 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ): """Fit the model to training data. @@ -276,14 +276,14 @@ def coef_( def detect_anomalies( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, anomaly_prob_threshold: float = 0.95, ) -> bpd.DataFrame: """Detect the anomaly data points of the input. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to detect anomalies. anomaly_prob_threshold (float, default 0.95): Identifies the custom threshold to use for anomaly detection. The value must be in the range [0, 1), with a default value of 0.95. 
@@ -298,7 +298,7 @@ def detect_anomalies( if not self._bqml_model: raise RuntimeError("A model must be fitted before detect_anomalies") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.detect_anomalies( X, options={"anomaly_prob_threshold": anomaly_prob_threshold} @@ -306,8 +306,8 @@ def detect_anomalies( def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. @@ -318,11 +318,11 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame only contains 1 column as evaluation timestamp. The timestamp must be within the horizon of the model, which by default is 1000 data points. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame only contains 1 column as evaluation numeric values. @@ -331,7 +331,7 @@ def score( """ if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index cb8fe7a96e..dfee12f523 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import cast, Mapping, Optional, Union +from typing import cast, Mapping, Optional from google.cloud import bigquery @@ -64,11 +64,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame. Schema is defined by the model. Returns: @@ -143,11 +143,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series. Schema is defined by the model. 
Returns: @@ -159,7 +159,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: self._bqml_model = self._create_bqml_model() self._bqml_model = cast(core.BqmlModel, self._bqml_model) - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) @@ -259,11 +259,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series. Schema is defined by the model. Returns: @@ -275,7 +275,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: self._bqml_model = self._create_bqml_model() self._bqml_model = cast(core.BqmlModel, self._bqml_model) - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index dddade8cc5..37b9849b4f 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import Iterable, List, Literal, Optional, Union +from typing import Iterable, List, Literal, Optional import bigframes_vendored.sklearn.impute._base @@ -84,7 +84,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[SimpleImputer, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> SimpleImputer: (X,) = utils.convert_to_dataframe(X) @@ -99,11 +99,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 8fe1d6ec27..5665507286 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -128,8 +128,8 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> LinearRegression: X, y = utils.convert_to_dataframe(X, y) @@ -142,7 +142,7 @@ def _fit( ) return self - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") @@ -152,13 +152,13 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = X.join(y, how="outer") return 
self._bqml_model.evaluate(input_data) @@ -280,8 +280,8 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> LogisticRegression: """Fit model with transforms.""" @@ -297,24 +297,24 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index b757a57502..cf1a78b8f7 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import cast, Literal, Optional, Union +from typing import cast, Literal, Optional import warnings import bigframes_vendored.constants as constants @@ -218,8 +218,8 @@ def _bqml_options(self) -> dict: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> PaLM2TextGenerator: """Fine tune PaLM2TextGenerator model. @@ -231,9 +231,9 @@ def fit( (https://cloud.google.com/products#product-launch-stages). Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series: + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Training labels. Returns: @@ -255,7 +255,7 @@ def fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, temperature: float = 0.0, max_output_tokens: int = 128, @@ -265,7 +265,7 @@ def predict( """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. 
@@ -327,7 +327,7 @@ def predict( if top_p < 0.0 or top_p > 1.0: raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -354,8 +354,8 @@ def predict( def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, task_type: Literal[ "text_generation", "classification", "summarization", "question_answering" ] = "text_generation", @@ -376,10 +376,10 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation data, which contains only one column of input_text that contains the prompt text to use when evaluating the model. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation labels, which contains only one column of output_text that you would expect to be returned by the model. task_type (str): @@ -392,7 +392,7 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) if len(X.columns) != 1 or len(y.columns) != 1: raise ValueError( @@ -542,11 +542,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. Returns: @@ -554,7 +554,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -698,11 +698,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. 
Returns: @@ -710,7 +710,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -867,8 +867,8 @@ def _bqml_options(self) -> dict: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> GeminiTextGenerator: """Fine tune GeminiTextGenerator model. Only support "gemini-pro" model for now. @@ -907,7 +907,7 @@ def fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, temperature: float = 0.9, max_output_tokens: int = 8192, @@ -917,7 +917,7 @@ def predict( """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. @@ -961,7 +961,7 @@ def predict( if top_p < 0.0 or top_p > 1.0: raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -988,8 +988,8 @@ def predict( def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, task_type: Literal[ "text_generation", "classification", "summarization", "question_answering" ] = "text_generation", @@ -1010,10 +1010,10 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation data, which contains only one column of input_text that contains the prompt text to use when evaluating the model. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation labels, which contains only one column of output_text that you would expect to be returned by the model. task_type (str): @@ -1030,7 +1030,7 @@ def score( if self._bqml_model.model_name.startswith("gemini-1.5"): raise NotImplementedError("Score is not supported for gemini-1.5 model.") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) if len(X.columns) != 1 or len(y.columns) != 1: raise ValueError( @@ -1195,7 +1195,7 @@ def _bqml_options(self) -> dict: def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, max_output_tokens: int = 128, top_k: int = 40, @@ -1204,7 +1204,7 @@ def predict( """Predict the result from input DataFrame. 
Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. @@ -1244,7 +1244,7 @@ def predict( if top_p < 0.0 or top_p > 1.0: raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index f9d7e6cf73..8fc0095931 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -23,6 +23,7 @@ import bigframes_vendored.sklearn.model_selection._split as vendored_model_selection_split import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation +import pandas as pd from bigframes.core import log_adapter from bigframes.ml import utils @@ -30,7 +31,7 @@ def train_test_split( - *arrays: Union[bpd.DataFrame, bpd.Series], + *arrays: utils.ArrayType, test_size: Union[float, None] = None, train_size: Union[float, None] = None, random_state: Union[int, None] = None, @@ -125,9 +126,9 @@ def get_n_splits(self) -> int: def split( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series, None] = None, - ) -> Generator[tuple[Union[bpd.DataFrame, bpd.Series, None]], None, None]: + X: utils.ArrayType, + y: Union[utils.ArrayType, None] = None, + ) -> Generator[tuple[Union[bpd.DataFrame, bpd.Series, None], ...], None, None]: X_df = next(utils.convert_to_dataframe(X)) y_df_or = next(utils.convert_to_dataframe(y)) if y is not None else None joined_df = X_df.join(y_df_or, how="outer") if y_df_or is not None else X_df @@ -146,15 +147,35 @@ def split( X_test = test_df[X_df.columns] y_test = test_df[y_df_or.columns] if y_df_or is not None else None - yield utils.convert_to_types( - [X_train, X_test, y_train, y_test], [X, X, y, y] + yield ( + KFold._convert_to_bf_type(X_train, X), + KFold._convert_to_bf_type(X_test, X), + KFold._convert_to_bf_type(y_train, y), + KFold._convert_to_bf_type(y_test, y), ) + @staticmethod + def _convert_to_bf_type( + input, + type_instance: Union[bpd.DataFrame, bpd.Series, pd.DataFrame, pd.Series, None], + ) -> Union[bpd.DataFrame, bpd.Series, None]: + if isinstance(type_instance, pd.Series) or isinstance( + type_instance, bpd.Series + ): + return next(utils.convert_to_series(input)) + + if isinstance(type_instance, pd.DataFrame) or isinstance( + type_instance, bpd.DataFrame + ): + return next(utils.convert_to_dataframe(input)) + + return None + def cross_validate( estimator, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series, None] = None, + X: utils.ArrayType, + y: Union[utils.ArrayType, None] = None, *, cv: Optional[Union[int, KFold]] = None, ) -> dict[str, list]: diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index dc3bd1f3f4..4313a05acf 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -18,7 +18,7 @@ from __future__ import annotations -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import bigframes_vendored.constants as constants import bigframes_vendored.sklearn.pipeline @@ -101,8 
+101,8 @@ def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> Pipel def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.BigFramesArrayType, + y: Optional[utils.BigFramesArrayType] = None, ) -> Pipeline: (X,) = utils.convert_to_dataframe(X) @@ -115,13 +115,13 @@ def fit( self._estimator._fit(X=X, y=y, transforms=transform_sqls) return self - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: return self._estimator.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.BigFramesArrayType, + y: Optional[utils.BigFramesArrayType] = None, ) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) if y is not None: diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index eb53904a78..94b3a601d4 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -80,7 +80,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[StandardScaler, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> StandardScaler: (X,) = utils.convert_to_dataframe(X) @@ -95,11 +95,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -158,7 +158,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[MaxAbsScaler, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> MaxAbsScaler: (X,) = utils.convert_to_dataframe(X) @@ -173,11 +173,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -236,7 +236,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[MinMaxScaler, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> MinMaxScaler: (X,) = utils.convert_to_dataframe(X) @@ -251,11 +251,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -359,7 +359,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> KBinsDiscretizer: (X,) = utils.convert_to_dataframe(X) @@ -374,11 +374,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> 
bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -475,7 +475,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[OneHotEncoder, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> OneHotEncoder: (X,) = utils.convert_to_dataframe(X) @@ -490,11 +490,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -584,7 +584,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]: def fit( self, - y: Union[bpd.DataFrame, bpd.Series], + y: utils.ArrayType, ) -> LabelEncoder: (y,) = utils.convert_to_dataframe(y) @@ -598,11 +598,11 @@ def fit( self._extract_output_names() return self - def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, y: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (y,) = utils.convert_to_dataframe(y) + (y,) = utils.convert_to_dataframe(y, session=self._bqml_model.session) df = self._bqml_model.transform(y) return typing.cast( @@ -667,7 +667,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, tuple[str, ...]] def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> PolynomialFeatures: (X,) = utils.convert_to_dataframe(X) @@ -683,11 +683,11 @@ def fit( return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index 8fb6d9db4c..05e6354f9f 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import Mapping, Optional, Union +from typing import Mapping, Optional import warnings import bigframes @@ -121,19 +121,19 @@ def standardize_type(v: str): def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: """Predict the result from the input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, which needs to comply with the input parameter of the model. Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. 
""" - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.predict(X) diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 96f0bc31e9..bdca45e457 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,37 +13,75 @@ # limitations under the License. import typing -from typing import Any, Generator, Iterable, Literal, Mapping, Optional, Union +from typing import Any, Generator, Literal, Mapping, Optional, Union import bigframes_vendored.constants as constants from google.cloud import bigquery +import pandas as pd from bigframes.core import blocks import bigframes.pandas as bpd +from bigframes.session import Session # Internal type alias -ArrayType = Union[bpd.DataFrame, bpd.Series] +ArrayType = Union[bpd.DataFrame, bpd.Series, pd.DataFrame, pd.Series] +BigFramesArrayType = Union[bpd.DataFrame, bpd.Series] -def convert_to_dataframe(*input: ArrayType) -> Generator[bpd.DataFrame, None, None]: - return (_convert_to_dataframe(frame) for frame in input) +def convert_to_dataframe( + *input: ArrayType, + session: Optional[Session] = None, +) -> Generator[bpd.DataFrame, None, None]: + """Converts the input to BigFrames DataFrame. + Args: + session: + The session to convert local pandas instances to BigFrames counter-parts. + It is not used if the input itself is already a BigFrame data frame or series. -def _convert_to_dataframe(frame: ArrayType) -> bpd.DataFrame: + """ + return (_convert_to_dataframe(frame, session) for frame in input) + + +def _convert_to_dataframe( + frame: ArrayType, session: Optional[Session] = None +) -> bpd.DataFrame: if isinstance(frame, bpd.DataFrame): return frame if isinstance(frame, bpd.Series): return frame.to_frame() + if isinstance(frame, pd.DataFrame): + if session is None: + return bpd.read_pandas(frame) + else: + return session.read_pandas(frame) + if isinstance(frame, pd.Series): + if session is None: + return bpd.read_pandas(frame).to_frame() + else: + return session.read_pandas(frame).to_frame() raise ValueError( f"Unsupported type {type(frame)} to convert to DataFrame. {constants.FEEDBACK_LINK}" ) -def convert_to_series(*input: ArrayType) -> Generator[bpd.Series, None, None]: - return (_convert_to_series(frame) for frame in input) +def convert_to_series( + *input: ArrayType, session: Optional[Session] = None +) -> Generator[bpd.Series, None, None]: + """Converts the input to BigFrames Series. + + Args: + session: + The session to convert local pandas instances to BigFrames counter-parts. + It is not used if the input itself is already a BigFrame data frame or series. 
+ """ + return (_convert_to_series(frame, session) for frame in input) -def _convert_to_series(frame: ArrayType) -> bpd.Series: + +def _convert_to_series( + frame: ArrayType, session: Optional[Session] = None +) -> bpd.Series: if isinstance(frame, bpd.DataFrame): if len(frame.columns) != 1: raise ValueError( @@ -55,44 +93,22 @@ def _convert_to_series(frame: ArrayType) -> bpd.Series: return typing.cast(bpd.Series, frame[label]) if isinstance(frame, bpd.Series): return frame + if isinstance(frame, pd.DataFrame): + # Recursively call this method to re-use the length-checking logic + if session is None: + return _convert_to_series(bpd.read_pandas(frame)) + else: + return _convert_to_series(session.read_pandas(frame), session) + if isinstance(frame, pd.Series): + if session is None: + return bpd.read_pandas(frame) + else: + return session.read_pandas(frame) raise ValueError( f"Unsupported type {type(frame)} to convert to Series. {constants.FEEDBACK_LINK}" ) -def convert_to_types( - inputs: Iterable[Union[ArrayType, None]], - type_instances: Iterable[Union[ArrayType, None]], -) -> tuple[Union[ArrayType, None]]: - """Convert the DF, Series and None types of the input to corresponding type_instances types.""" - results = [] - for input, type_instance in zip(inputs, type_instances): - results.append(_convert_to_type(input, type_instance)) - return tuple(results) - - -def _convert_to_type( - input: Union[ArrayType, None], type_instance: Union[ArrayType, None] -): - if type_instance is None: - if input is not None: - raise ValueError( - f"Trying to convert not None type to None. {constants.FEEDBACK_LINK}" - ) - return None - if input is None: - raise ValueError( - f"Trying to convert None type to not None. {constants.FEEDBACK_LINK}" - ) - if isinstance(type_instance, bpd.DataFrame): - return _convert_to_dataframe(input) - if isinstance(type_instance, bpd.Series): - return _convert_to_series(input) - raise ValueError( - f"Unsupport converting to {type(type_instance)}. {constants.FEEDBACK_LINK}" - ) - - def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: """Parse model endpoint string to model_name and version.""" model_name = model_endpoint diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index e6b5f8cdc2..c1a1e073b9 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -434,10 +434,10 @@ def test_KFold_seeded_correct_rows(session, penguins_pandas_df_default_index): y = df["body_mass_g"] X_train, X_test, y_train, y_test = next(kf.split(X, y)) # type: ignore - X_train_sorted = X_train.to_pandas().sort_index() - X_test_sorted = X_test.to_pandas().sort_index() - y_train_sorted = y_train.to_pandas().sort_index() - y_test_sorted = y_test.to_pandas().sort_index() + X_train_sorted = X_train.to_pandas().sort_index() # type: ignore + X_test_sorted = X_test.to_pandas().sort_index() # type: ignore + y_train_sorted = y_train.to_pandas().sort_index() # type: ignore + y_test_sorted = y_test.to_pandas().sort_index() # type: ignore train_index: pd.Index = pd.Index( [ diff --git a/tests/system/small/ml/test_utils.py b/tests/system/small/ml/test_utils.py new file mode 100644 index 0000000000..0543f36852 --- /dev/null +++ b/tests/system/small/ml/test_utils.py @@ -0,0 +1,80 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pandas.testing +import pytest + +import bigframes.ml.utils as utils + +_DATA_FRAME = pd.DataFrame({"column": [1, 2, 3]}) +_SERIES = pd.Series([1, 2, 3], name="column") + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_to_dataframe(session, data): + bf_data = session.read_pandas(data) + + (actual_result,) = utils.convert_to_dataframe(bf_data) + + pandas.testing.assert_frame_equal( + actual_result.to_pandas(), + _DATA_FRAME, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_pandas_to_dataframe(data, session): + (actual_result,) = utils.convert_to_dataframe(data, session=session) + + pandas.testing.assert_frame_equal( + actual_result.to_pandas(), + _DATA_FRAME, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_to_series(session, data): + bf_data = session.read_pandas(data) + + (actual_result,) = utils.convert_to_series(bf_data) + + pandas.testing.assert_series_equal( + actual_result.to_pandas(), _SERIES, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_pandas_to_series(data, session): + (actual_result,) = utils.convert_to_series(data, session=session) + + pandas.testing.assert_series_equal( + actual_result.to_pandas(), _SERIES, check_index_type=False, check_dtype=False + ) diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index aaf43dbcfe..b3c83c8d96 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -69,7 +69,7 @@ def fit( """Compute k-means clustering. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). Training data. y (default None): Not used, present here for API consistency by convention. @@ -86,7 +86,7 @@ def predict( """Predict the closest cluster each sample in X belongs to. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). New data to predict. Returns: @@ -108,7 +108,7 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). New Data. 
y (default None) Not used, present here for API consistency by convention. diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index 4b0bd42706..e4e71c1ff9 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -37,7 +37,7 @@ def fit( """Fit all transformers using X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. @@ -54,7 +54,7 @@ def transform( """Transform X separately by each transformer, concatenate results. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Series or DataFrame to be transformed by subset. Returns: diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index ae6f0b0561..a0cccdcb4e 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -34,7 +34,7 @@ def fit(self, X, y=None): """Fit the model according to the given training data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. @@ -71,7 +71,7 @@ def predict(self, X): """Predict the closest cluster for each sample in X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to predict. Returns: diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 92794bb68e..1f6284c146 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -47,10 +47,10 @@ def fit(self, X, y): """Build a forest of trees from the training set (X, y). Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. @@ -73,7 +73,7 @@ def predict(self, X): mean predicted regression targets of the trees in the forest. 
Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py index 3064e8a118..4e33d976a9 100644 --- a/third_party/bigframes_vendored/sklearn/impute/_base.py +++ b/third_party/bigframes_vendored/sklearn/impute/_base.py @@ -30,7 +30,7 @@ def fit(self, X, y=None): """Fit the imputer on X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -45,7 +45,7 @@ def transform(self, X): """Impute all missing values in X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 69f98697af..fa8f28a656 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -31,7 +31,7 @@ def predict(self, X): """Predict using the linear model. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Samples. Returns: @@ -45,7 +45,7 @@ def predict(self, X): """Predict class labels for samples in X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. @@ -101,10 +101,10 @@ def fit( """Fit linear model. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index c52a37018c..f3419ba8a9 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -71,12 +71,12 @@ def fit( """Fit the model according to the given training data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). 
Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples,). Target vector relative to X. diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index 1ff83aa640..b051cb24b4 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -48,7 +48,7 @@ def fit(self, X, y=None): """Compute the mean and std to be used for later scaling. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -63,7 +63,7 @@ def transform(self, X): """Perform standardization by centering and scaling. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: @@ -85,7 +85,7 @@ def fit(self, X, y=None): """Compute the maximum absolute value to be used for later scaling. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -100,7 +100,7 @@ def transform(self, X): """Scale the data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: @@ -121,7 +121,7 @@ def fit(self, X, y=None): """Compute the minimum and maximum to be used for later scaling. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -136,7 +136,7 @@ def transform(self, X): """Scale the data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 54c81af71d..5fa84d2d15 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -25,7 +25,7 @@ def fit(self, X, y=None): """Fit the estimator. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -40,7 +40,7 @@ def transform(self, X): """Discretize the data. 
Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 7cdca9229a..5476a9fb3c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -65,7 +65,7 @@ def fit(self, X, y=None): """Fit OneHotEncoder to X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series with training data. y (default None): @@ -80,7 +80,7 @@ def transform(self, X): """Transform X using one-hot encoding. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index 61a44db92f..74b3ca347a 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -33,7 +33,7 @@ def fit(self, y): """Fit label encoder. Args: - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series with training data. Returns: @@ -45,7 +45,7 @@ def transform(self, y): """Transform y using label encoding. Args: - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py index 9ad43b7956..aeed4dce92 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py @@ -19,7 +19,7 @@ def fit(self, X, y=None): """Compute number of output features. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -34,7 +34,7 @@ def transform(self, X): """Transform data to polynomial features. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns:
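
Usage note (illustrative only, not part of the patch): the sketch below shows the behavior this change enables, namely ML estimators and the utils.convert_to_* helpers accepting local pandas objects alongside BigQuery DataFrames ones. The column names, data, and choice of LinearRegression are hypothetical, and a configured BigQuery DataFrames session is assumed.

    import pandas as pd
    import bigframes.pandas as bpd
    from bigframes.ml import linear_model, utils

    # Hypothetical local data; any pandas DataFrame/Series works.
    X_local = pd.DataFrame({"feature": [1.0, 2.0, 3.0]})
    y_local = pd.Series([2.0, 4.0, 6.0], name="label")

    # Pandas inputs to fit() are converted with bpd.read_pandas() (global session).
    model = linear_model.LinearRegression()
    model.fit(X_local, y_local)

    # Pandas inputs to predict()/score() are converted using the fitted model's session.
    predictions = model.predict(X_local)

    # The conversion helpers can also be called directly, optionally pinning a session.
    session = bpd.get_global_session()
    (X_bf,) = utils.convert_to_dataframe(X_local, session=session)
    (y_bf,) = utils.convert_to_series(y_local, session=session)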