diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index e3ed8edd21..6bcc25319b 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -14,7 +14,7 @@ from __future__ import annotations import typing -from typing import Iterable, Literal, Optional, Tuple, Union +from typing import Iterable, Literal, Optional, Union import pandas as pd @@ -113,7 +113,7 @@ def cut( bins: Union[ int, pd.IntervalIndex, - Iterable[Tuple[Union[int, float], Union[int, float]]], + Iterable, ], *, labels: Optional[bool] = None, @@ -125,9 +125,29 @@ def cut( if isinstance(bins, pd.IntervalIndex): as_index: pd.IntervalIndex = bins bins = tuple((bin.left.item(), bin.right.item()) for bin in bins) - else: + elif len(list(bins)) == 0: + raise ValueError("`bins` iterable should have at least one item") + elif isinstance(list(bins)[0], tuple): as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) + elif pd.api.types.is_number(list(bins)[0]): + bins_list = list(bins) + if len(bins_list) < 2: + raise ValueError( + "`bins` iterable of numeric breaks should have" + " at least two items" + ) + as_index = pd.IntervalIndex.from_breaks(bins_list) + single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list]) + numeric_type = type(bins_list[0]) if single_type else float + bins = tuple( + [ + (numeric_type(bins_list[i]), numeric_type(bins_list[i + 1])) + for i in range(len(bins_list) - 1) + ] + ) + else: + raise ValueError("`bins` iterable should contain tuples or numerics") if as_index.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 36fa787644..f33dc16e30 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -17,7 +17,7 @@ import abc import dataclasses import typing -from typing import ClassVar, Hashable, Optional, Tuple +from typing import ClassVar, Iterable, Optional import pandas as pd import pyarrow as pa @@ -213,7 +213,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? - bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]] + bins: typing.Union[int, Iterable] labels: Optional[bool] @property @@ -232,7 +232,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT interval_dtype = ( pa.float64() if isinstance(self.bins, int) - else dtypes.infer_literal_arrow_type(self.bins[0][0]) + else dtypes.infer_literal_arrow_type(list(self.bins)[0][0]) ) pa_type = pa.struct( [ diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 95b34a56c5..d543f92655 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -424,6 +424,58 @@ def test_cut_default_labels(scalars_dfs): ) +@pytest.mark.parametrize( + ("breaks",), + [ + ([0, 5, 10, 15, 20, 100, 1000],), # ints + ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],), # floats + ([0, 5, 10.5, 15.5, 20, 100, 1000.5],), # mixed + ], +) +def test_cut_numeric_breaks(scalars_dfs, breaks): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks) + bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas() + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("bins",), + [ + (-1,), # negative integer bins argument + ([],), # empty iterable of bins + (["notabreak"],), # iterable of wrong type + ([1],), # numeric breaks with only one numeric + # this is supported by pandas but not by + # the bigquery operation and a bigframes workaround + # is not yet available. Should return column + # of structs with all NaN values. + ], +) +def test_cut_errors(scalars_dfs, bins): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + bpd.cut(scalars_df["float64_col"], bins) + + @pytest.mark.parametrize( ("bins",), [ diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index fbd1d2d052..6ba3950a76 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -76,10 +76,20 @@ def cut( 3 {'left_exclusive': 5, 'right_inclusive': 20} dtype: struct[pyarrow] + Cut with an iterable of ints: + + >>> bins_ints = [0, 1, 5, 20] + >>> bpd.cut(s, bins=bins_ints) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 {'left_exclusive': 1, 'right_inclusive': 5} + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + Args: x (Series): The input Series to be binned. Must be 1-dimensional. - bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]): + bins (int, pd.IntervalIndex, Iterable): The criteria to bin by. int: Defines the number of equal-width bins in the range of `x`. The @@ -88,6 +98,10 @@ def cut( pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used. It's important to ensure that these bins are non-overlapping. + + Iterable of numerics: Defines the exact bins by using the interval + between each item and its following item. The items must be monotonically + increasing. labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the