From c57d2380dd1372d84980dc914d997fc96d68401b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 20 Mar 2024 04:15:06 +0000 Subject: [PATCH 1/3] fix: sampling plot cannot preserve ordering if index is not ordered --- bigframes/core/blocks.py | 17 ++++++++++++++--- bigframes/dataframe.py | 5 ++++- bigframes/operations/_matplotlib/core.py | 8 +++----- bigframes/series.py | 5 ++++- tests/system/small/operations/test_plotting.py | 15 ++++++++++++++- tests/system/small/test_dataframe.py | 18 ++++++++++++++++++ .../bigframes_vendored/pandas/core/generic.py | 7 +++++++ 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0ebbe48cc4..27538c712a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -563,7 +563,7 @@ def _downsample( block = self._split( fracs=(fraction,), random_state=random_state, - preserve_order=True, + sort=None, )[0] return block else: @@ -579,7 +579,7 @@ def _split( fracs: Iterable[float] = (), *, random_state: Optional[int] = None, - preserve_order: Optional[bool] = False, + sort: Optional[bool] = False, ) -> List[Block]: """Internal function to support splitting Block to multiple parts along index axis. @@ -631,7 +631,18 @@ def _split( typing.cast(Block, block.slice(start=lower, stop=upper)) for lower, upper in intervals ] - if preserve_order: + + if sort is True: + sliced_blocks = [ + sliced_block.order_by( + [ + ordering.OrderingColumnReference(idx_col) + for idx_col in sliced_block.index_columns + ] + ) + for sliced_block in sliced_blocks + ] + elif sort is None: sliced_blocks = [ sliced_block.order_by([ordering.OrderingColumnReference(ordering_col)]) for sliced_block in sliced_blocks diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0f99a3e4db..1264e5fa73 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2497,6 +2497,7 @@ def sample( frac: Optional[float] = None, *, random_state: Optional[int] = None, + sort: Optional[bool] = False, ) -> DataFrame: if n is not None and frac is not None: raise ValueError("Only one of 'n' or 'frac' parameter can be specified.") @@ -2504,7 +2505,9 @@ def sample( ns = (n,) if n is not None else () fracs = (frac,) if frac is not None else () return DataFrame( - self._block._split(ns=ns, fracs=fracs, random_state=random_state)[0] + self._block._split( + ns=ns, fracs=fracs, random_state=random_state, sort=sort + )[0] ) def _split( diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 5c9d771f61..8c5c0e1f2e 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -47,11 +47,9 @@ def _compute_plot_data(self, data): # TODO: Cache the sampling data in the PlotAccessor. sampling_n = self.kwargs.pop("sampling_n", 100) sampling_random_state = self.kwargs.pop("sampling_random_state", 0) - return ( - data.sample(n=sampling_n, random_state=sampling_random_state) - .to_pandas() - .sort_index() - ) + return data.sample( + n=sampling_n, random_state=sampling_random_state, sort=None + ).to_pandas() class LinePlot(SamplingPlot): diff --git a/bigframes/series.py b/bigframes/series.py index 6128238057..7d8147bb11 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1529,6 +1529,7 @@ def sample( frac: Optional[float] = None, *, random_state: Optional[int] = None, + sort: Optional[bool] = False, ) -> Series: if n is not None and frac is not None: raise ValueError("Only one of 'n' or 'frac' parameter can be specified.") @@ -1536,7 +1537,9 @@ def sample( ns = (n,) if n is not None else () fracs = (frac,) if frac is not None else () return Series( - self._block._split(ns=ns, fracs=fracs, random_state=random_state)[0] + self._block._split( + ns=ns, fracs=fracs, random_state=random_state, sort=sort + )[0] ) def __array_ufunc__( diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 876c8f7d04..47491cdada 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +import pandas as pd import pandas._testing as tm import pytest @@ -235,6 +236,18 @@ def test_sampling_plot_args_random_state(): tm.assert_almost_equal(ax_0.lines[0].get_data()[1], ax_2.lines[0].get_data()[1]) +def test_sampling_preserve_ordering(): + df = bpd.DataFrame([0.0, 1.0, 2.0, 3.0, 4.0], index=[1, 3, 4, 2, 0]) + pd_df = pd.DataFrame([0.0, 1.0, 2.0, 3.0, 4.0], index=[1, 3, 4, 2, 0]) + ax = df.plot.line() + pd_ax = pd_df.plot.line() + tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) + tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) + for line, pd_line in zip(ax.lines, pd_ax.lines): + # Compare y coordinates between the lines + tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) + + @pytest.mark.parametrize( ("kind", "col_names", "kwargs"), [ @@ -251,7 +264,7 @@ def test_sampling_plot_args_random_state(): marks=pytest.mark.xfail(raises=ValueError), ), pytest.param( - "uknown", + "bar", ["int64_col", "int64_too"], {}, marks=pytest.mark.xfail(raises=NotImplementedError), diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ee32fb25ac..73b3dbdf02 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3011,6 +3011,24 @@ def test_sample_raises_value_error(scalars_dfs): scalars_df.sample(frac=0.5, n=4) +def test_sample_args_sort(scalars_dfs): + scalars_df, _ = scalars_dfs + index = [4, 3, 2, 5, 1, 0] + scalars_df = scalars_df.iloc[index] + + kwargs = {"frac": 1.0, "random_state": 333} + + df = scalars_df.sample(**kwargs).to_pandas() + assert df.index.values != index + assert df.index.values != sorted(index) + + df = scalars_df.sample(sort=True, **kwargs).to_pandas() + assert df.index.values == sorted(index) + + df = scalars_df.sample(sort=None, **kwargs).to_pandas() + assert df.index.values == index + + @pytest.mark.parametrize( ("axis",), [ diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 9358dca17b..fc8a06d412 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -472,6 +472,7 @@ def sample( frac: Optional[float] = None, *, random_state: Optional[int] = None, + sort: Optional[bool] = None, ): """Return a random sample of items from an axis of object. @@ -530,6 +531,12 @@ def sample( Fraction of axis items to return. Cannot be used with `n`. random_state (Optional[int], default None): Seed for random number generator. + sort (Optional[bool], default False): + + - 'False' (default): No specific ordering will be applied after + sampling. + - 'True' : Index columns will determine the sample's order. + - None: The sample will retain the original object's order. Returns: A new object of same type as caller containing `n` items randomly From 24940a1e26b4bd7948e162047c5be5b0e9b9f62c Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 20 Mar 2024 04:20:35 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/small/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 73b3dbdf02..7847729620 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3015,7 +3015,7 @@ def test_sample_args_sort(scalars_dfs): scalars_df, _ = scalars_dfs index = [4, 3, 2, 5, 1, 0] scalars_df = scalars_df.iloc[index] - + kwargs = {"frac": 1.0, "random_state": 333} df = scalars_df.sample(**kwargs).to_pandas() From db507c329bc022d7ea4418395ac79598d51ebc13 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 20 Mar 2024 22:21:14 +0000 Subject: [PATCH 3/3] change sort type --- bigframes/core/blocks.py | 8 ++++---- bigframes/dataframe.py | 2 +- bigframes/operations/_matplotlib/core.py | 4 +++- bigframes/series.py | 4 ++-- tests/system/small/test_dataframe.py | 6 +++++- third_party/bigframes_vendored/pandas/core/generic.py | 8 ++++---- 6 files changed, 19 insertions(+), 13 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 27538c712a..64c305ba0f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -26,7 +26,7 @@ import itertools import random import typing -from typing import Iterable, List, Mapping, Optional, Sequence, Tuple +from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple import warnings import bigframes_vendored.pandas.io.common as vendored_pandas_io_common @@ -563,7 +563,7 @@ def _downsample( block = self._split( fracs=(fraction,), random_state=random_state, - sort=None, + sort=False, )[0] return block else: @@ -579,7 +579,7 @@ def _split( fracs: Iterable[float] = (), *, random_state: Optional[int] = None, - sort: Optional[bool] = False, + sort: Optional[bool | Literal["random"]] = "random", ) -> List[Block]: """Internal function to support splitting Block to multiple parts along index axis. @@ -642,7 +642,7 @@ def _split( ) for sliced_block in sliced_blocks ] - elif sort is None: + elif sort is False: sliced_blocks = [ sliced_block.order_by([ordering.OrderingColumnReference(ordering_col)]) for sliced_block in sliced_blocks diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1264e5fa73..9f970befa4 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2497,7 +2497,7 @@ def sample( frac: Optional[float] = None, *, random_state: Optional[int] = None, - sort: Optional[bool] = False, + sort: Optional[bool | Literal["random"]] = "random", ) -> DataFrame: if n is not None and frac is not None: raise ValueError("Only one of 'n' or 'frac' parameter can be specified.") diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 8c5c0e1f2e..7cbeb3df4f 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -48,7 +48,9 @@ def _compute_plot_data(self, data): sampling_n = self.kwargs.pop("sampling_n", 100) sampling_random_state = self.kwargs.pop("sampling_random_state", 0) return data.sample( - n=sampling_n, random_state=sampling_random_state, sort=None + n=sampling_n, + random_state=sampling_random_state, + sort=False, ).to_pandas() diff --git a/bigframes/series.py b/bigframes/series.py index 7d8147bb11..311861bd15 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -21,7 +21,7 @@ import numbers import textwrap import typing -from typing import Any, Mapping, Optional, Tuple, Union +from typing import Any, Literal, Mapping, Optional, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -1529,7 +1529,7 @@ def sample( frac: Optional[float] = None, *, random_state: Optional[int] = None, - sort: Optional[bool] = False, + sort: Optional[bool | Literal["random"]] = "random", ) -> Series: if n is not None and frac is not None: raise ValueError("Only one of 'n' or 'frac' parameter can be specified.") diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 7847729620..9bf4683739 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3022,10 +3022,14 @@ def test_sample_args_sort(scalars_dfs): assert df.index.values != index assert df.index.values != sorted(index) + df = scalars_df.sample(sort="random", **kwargs).to_pandas() + assert df.index.values != index + assert df.index.values != sorted(index) + df = scalars_df.sample(sort=True, **kwargs).to_pandas() assert df.index.values == sorted(index) - df = scalars_df.sample(sort=None, **kwargs).to_pandas() + df = scalars_df.sample(sort=False, **kwargs).to_pandas() assert df.index.values == index diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index fc8a06d412..1391f64251 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -472,7 +472,7 @@ def sample( frac: Optional[float] = None, *, random_state: Optional[int] = None, - sort: Optional[bool] = None, + sort: Optional[bool | Literal["random"]] = "random", ): """Return a random sample of items from an axis of object. @@ -531,12 +531,12 @@ def sample( Fraction of axis items to return. Cannot be used with `n`. random_state (Optional[int], default None): Seed for random number generator. - sort (Optional[bool], default False): + sort (Optional[bool|Literal["random"]], default "random"): - - 'False' (default): No specific ordering will be applied after + - 'random' (default): No specific ordering will be applied after sampling. - 'True' : Index columns will determine the sample's order. - - None: The sample will retain the original object's order. + - 'False': The sample will retain the original object's order. Returns: A new object of same type as caller containing `n` items randomly