googleapis · ashleyxuu · Apr 17, 2024 · Apr 12, 2024 · Apr 17, 2024
@@ -35,7 +35,7 @@ def train_test_split(
    Args:
        *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series):
            A sequence of BigQuery DataFrames or Series that can be joined on
-            their indexes
+            their indexes.
        test_size (default None):
            The proportion of the dataset to include in the test split. If
            None, this will default to the complement of train_size. If both

@@ -37,7 +37,7 @@ def dayofweek(self):
        """The day of the week with Monday=0, Sunday=6.

        Return the day of the week. It is assumed the week starts on
-        Monday, which is denoted by 0 and ends on Sunday which is denoted
+        Monday, which is denoted by 0, and ends on Sunday, which is denoted
        by 6.

        **Examples:**

@@ -153,7 +153,7 @@ def fit_transform(self, X, y=None):
                Target values (None for unsupervised transformations).

        Returns:
-            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new)
+            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new).
                Transformed DataFrame.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@@ -122,7 +122,7 @@ def recall_score(
 ):
    """Compute the recall.

-    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
+    The recall is the ratio ``tp / (tp + fn)``, where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

@@ -170,7 +170,7 @@ def precision_score(
 ):
    """Compute the precision.

-    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
+    The precision is the ratio ``tp / (tp + fp)``, where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.
@@ -244,9 +244,9 @@ def f1_score(
        dtype: float64

    Args:
-        y_true: Series or DataFrame of shape (n_samples,)
+        y_true: Series or DataFrame of shape (n_samples,).
            Ground truth (correct) target values.
-        y_pred: Series or DataFrame of shape (n_samples,)
+        y_pred: Series or DataFrame of shape (n_samples,).
            Estimated targets as returned by a classifier.
        average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \
                default='binary'

@@ -20,13 +20,14 @@ class Pipeline(BaseEstimator, metaclass=ABCMeta):
    """Pipeline of transforms with a final estimator.

    Sequentially apply a list of transforms and a final estimator.
-    Intermediate steps of the pipeline must be `transforms`, that is, they
+    Intermediate steps of the pipeline must be `transforms`. That is, they
    must implement `fit` and `transform` methods.
    The final estimator only needs to implement `fit`.

    The purpose of the pipeline is to assemble several steps that can be
-    cross-validated together while setting different parameters. This simplifies code, and allows deploying an estimator
-    and peprocessing together, e.g. with `Pipeline.to_gbq(...).`
+    cross-validated together while setting different parameters. This
+    simplifies code and allows for deploying an estimator and preprocessing
+    together, e.g. with `Pipeline.to_gbq(...)`.
    """

    def fit(

@@ -23,15 +23,21 @@ class OneHotEncoder(BaseEstimator):
        Given a dataset with two features, we let the encoder find the unique
        values per feature and transform the data to a binary one-hot encoding.

-        .. code-block::
-
-            from bigframes.ml.preprocessing import OneHotEncoder
-            import bigframes.pandas as bpd
-
-            enc = OneHotEncoder()
-            X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]})
-            enc.fit(X)
-            print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]})))
+        >>> from bigframes.ml.preprocessing import OneHotEncoder
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> enc = OneHotEncoder()
+        >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]})
+        >>> enc.fit(X)
+        OneHotEncoder()
+
+        >>> print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]})))
+                        onehotencoded_a               onehotencoded_b
+        0  [{'index': 1, 'value': 1.0}]  [{'index': 1, 'value': 1.0}]
+        1  [{'index': 2, 'value': 1.0}]  [{'index': 0, 'value': 1.0}]
+        <BLANKLINE>
+        [2 rows x 2 columns]

    Args:
        drop (Optional[Literal["most_frequent"]], default None):
@@ -52,7 +58,7 @@ class OneHotEncoder(BaseEstimator):
            Specifies an upper limit to the number of output features for each input feature
            when considering infrequent categories. If there are infrequent categories,
            max_categories includes the category representing the infrequent categories along with the frequent categories.
-            Default None, set limit to 1,000,000.
+            Default None, which sets the limit to 1,000,000.
    """

    def fit(self, X, y=None):

@@ -26,7 +26,7 @@ class LabelEncoder(BaseEstimator):
            Specifies an upper limit to the number of output features for each input feature
            when considering infrequent categories. If there are infrequent categories,
            max_categories includes the category representing the infrequent categories along with the frequent categories.
-            Default None, set limit to 1,000,000.
+            Default None, which sets the limit to 1,000,000.
    """

    def fit(self, y):