diff --git a/bigframes/series.py b/bigframes/series.py
index 1952acbf6d..4fab1fe943 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -1447,6 +1447,42 @@ def map(
         result_df = self_df.join(map_df, on="series")
         return result_df[self.name]
 
+    def sample(
+        self,
+        n: Optional[int] = None,
+        frac: Optional[float] = None,
+        *,
+        random_state: Optional[int] = None,
+    ) -> Series:
+        """Draw a sample from this series.
+
+        At most one of ``n`` and ``frac`` may be given. When neither is
+        given, no explicit size is forwarded and the default is left to
+        the block's split helper.
+
+        Args:
+            n: Number of rows requested.
+            frac: Fraction of rows requested.
+            random_state: Forwarded unchanged to the block's split
+                helper; presumably a seed for reproducible sampling.
+
+        Returns:
+            A new Series wrapping the first block produced by the split.
+
+        Raises:
+            ValueError: If both ``n`` and ``frac`` are specified.
+        """
+        if not (n is None or frac is None):
+            raise ValueError("Only one of 'n' or 'frac' parameter can be specified.")
+
+        # Each size argument travels as a tuple with zero or one entry.
+        counts = () if n is None else (n,)
+        fractions = () if frac is None else (frac,)
+        parts = self._block._split(
+            ns=counts, fracs=fractions, random_state=random_state
+        )
+        return Series(parts[0])
+
     def __array_ufunc__(
         self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs
     ) -> Series:
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index f59d64fe06..d9fc23fad0 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2922,3 +2922,36 @@ def test_map_series_input_duplicates_error(scalars_dfs):
         scalars_pandas_df.int64_too.map(pd_map_series)
     with pytest.raises(pd.errors.InvalidIndexError):
         scalars_df.int64_too.map(bf_map_series, verify_integrity=True)
+
+
+@pytest.mark.parametrize(
+    ("frac", "n", "random_state"),
+    [
+        (None, 4, None),
+        (0.5, None, None),
+        (None, 4, 10),
+        (0.5, None, 10),
+        (None, None, None),
+    ],
+    ids=[
+        "n_wo_random_state",
+        "frac_wo_random_state",
+        "n_w_random_state",
+        "frac_w_random_state",
+        "n_default",
+    ],
+)
+def test_sample(scalars_dfs, frac, n, random_state):
+    """Check that Series.sample returns the expected number of rows."""
+    bf_df, _ = scalars_dfs
+    sampled = bf_df.int64_col.sample(frac=frac, n=n, random_state=random_state)
+    pd_sampled = sampled.to_pandas()
+
+    # With neither size argument given, a single row is expected.
+    if frac is not None:
+        expected_rows = round(frac * bf_df.shape[0])
+    elif n is not None:
+        expected_rows = n
+    else:
+        expected_rows = 1
+    assert pd_sampled.shape[0] == expected_rows