From b6e04d0291e4c0254f82575e09f8bb64cb257a7e Mon Sep 17 00:00:00 2001
From: Henry J Solberg
Date: Wed, 8 Nov 2023 17:18:04 +0000
Subject: [PATCH] feat: add series.sample (identical to existing dataframe.sample)

---
 bigframes/series.py               | 21 +++++++++++++++++++++
 tests/system/small/test_series.py | 26 ++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/bigframes/series.py b/bigframes/series.py
index 032bdf6c42..34e1ee611e 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -1438,6 +1438,27 @@ def map(
         result_df = self_df.join(map_df, on="series")
         return result_df[self.name]
 
+    def sample(
+        self,
+        n: Optional[int] = None,
+        frac: Optional[float] = None,
+        *,
+        random_state: Optional[int] = None,
+    ) -> Series:
+        # Reject calls that supply both sizing arguments at once.
+        if not (n is None or frac is None):
+            raise ValueError("Only one of 'n' or 'frac' parameter can be specified.")
+
+        # Each sizing argument becomes a one-element tuple, or an empty tuple
+        # when it was not supplied.
+        counts = () if n is None else (n,)
+        fractions = () if frac is None else (frac,)
+        parts = self._block._split(
+            ns=counts, fracs=fractions, random_state=random_state
+        )
+        # The sample is the first element of the value returned by _split.
+        return Series(parts[0])
+
     def __array_ufunc__(
         self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs
     ) -> Series:
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index c8bd0f7afd..9d54fbb8e2 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2921,3 +2921,29 @@ def test_map_series_input_duplicates_error(scalars_dfs):
         scalars_pandas_df.int64_too.map(pd_map_series)
     with pytest.raises(pd.errors.InvalidIndexError):
         scalars_df.int64_too.map(bf_map_series, verify_integrity=True)
+
+
+@pytest.mark.parametrize(
+    ("frac", "n", "random_state"),
+    [
+        pytest.param(None, 4, None, id="n_wo_random_state"),
+        pytest.param(0.5, None, None, id="frac_wo_random_state"),
+        pytest.param(None, 4, 10, id="n_w_random_state"),
+        pytest.param(0.5, None, 10, id="frac_w_random_state"),
+        pytest.param(None, None, None, id="n_default"),
+    ],
+)
+def test_sample(scalars_dfs, frac, n, random_state):
+    # Only the number of sampled rows is checked, not which rows were drawn.
+    scalars_df, _ = scalars_dfs
+    sampled = scalars_df.int64_col.sample(frac=frac, n=n, random_state=random_state)
+    bf_result = sampled.to_pandas()
+
+    if frac is not None:
+        expected_sample_size = round(frac * scalars_df.shape[0])
+    elif n is not None:
+        expected_sample_size = n
+    else:
+        # Neither argument supplied: a single row is expected.
+        expected_sample_size = 1
+    assert bf_result.shape[0] == expected_sample_size