diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 42c13fdb40..48eb5a93a7 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -35,7 +35,7 @@ def train_test_split( Args: *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series): A sequence of BigQuery DataFrames or Series that can be joined on - their indexes + their indexes. test_size (default None): The proportion of the dataset to include in the test split. If None, this will default to the complement of train_size. If both diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 3f0175359a..f34612cb11 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -37,7 +37,7 @@ def dayofweek(self): """The day of the week with Monday=0, Sunday=6. Return the day of the week. It is assumed the week starts on - Monday, which is denoted by 0 and ends on Sunday which is denoted + Monday, which is denoted by 0 and ends on Sunday, which is denoted by 6. **Examples:** diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index fd8db7a227..1a151a1119 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -153,7 +153,7 @@ def fit_transform(self, X, y=None): Target values (None for unsupervised transformations). Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new) + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new). Transformed DataFrame. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index 00bbf8cd60..8e8b2c1952 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -122,7 +122,7 @@ def recall_score( ): """Compute the recall. - The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + The recall is the ratio ``tp / (tp + fn)``, where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. @@ -170,7 +170,7 @@ def precision_score( ): """Compute the precision. - The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + The precision is the ratio ``tp / (tp + fp)``, where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. @@ -244,9 +244,9 @@ def f1_score( dtype: float64 Args: - y_true: Series or DataFrame of shape (n_samples,) + y_true: Series or DataFrame of shape (n_samples,). Ground truth (correct) target values. - y_pred: Series or DataFrame of shape (n_samples,) + y_pred: Series or DataFrame of shape (n_samples,). Estimated targets as returned by a classifier. average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ default='binary' diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py index aed1565960..8a98ee4141 100644 --- a/third_party/bigframes_vendored/sklearn/pipeline.py +++ b/third_party/bigframes_vendored/sklearn/pipeline.py @@ -20,13 +20,14 @@ class Pipeline(BaseEstimator, metaclass=ABCMeta): """Pipeline of transforms with a final estimator. 
Sequentially apply a list of transforms and a final estimator. - Intermediate steps of the pipeline must be `transforms`, that is, they + Intermediate steps of the pipeline must be `transforms`. That is, they must implement `fit` and `transform` methods. The final estimator only needs to implement `fit`. The purpose of the pipeline is to assemble several steps that can be - cross-validated together while setting different parameters. This simplifies code, and allows deploying an estimator - and peprocessing together, e.g. with `Pipeline.to_gbq(...).` + cross-validated together while setting different parameters. This + simplifies code and allows for deploying an estimator and preprocessing + together, e.g. with `Pipeline.to_gbq(...)`. """ def fit( diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 5e5e8ac042..b883e82249 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -23,15 +23,21 @@ class OneHotEncoder(BaseEstimator): Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. - .. 
code-block:: - - from bigframes.ml.preprocessing import OneHotEncoder - import bigframes.pandas as bpd - - enc = OneHotEncoder() - X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) - enc.fit(X) - print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + >>> from bigframes.ml.preprocessing import OneHotEncoder + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> enc = OneHotEncoder() + >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) + >>> enc.fit(X) + OneHotEncoder() + + >>> print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + onehotencoded_a onehotencoded_b + 0 [{'index': 1, 'value': 1.0}] [{'index': 1, 'value': 1.0}] + 1 [{'index': 2, 'value': 1.0}] [{'index': 0, 'value': 1.0}] + + [2 rows x 2 columns] Args: drop (Optional[Literal["most_frequent"]], default None): @@ -52,7 +58,7 @@ class OneHotEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. - Default None, set limit to 1,000,000. + Default None, which sets the limit to 1,000,000. """ def fit(self, X, y=None): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index cc6b995c8c..61a44db92f 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -26,7 +26,7 @@ class LabelEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. 
- Default None, set limit to 1,000,000. + Default None, which sets the limit to 1,000,000. """ def fit(self, y):