From 7381e846549791b67dac92feace17d577c5aeaac Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 4 Apr 2024 23:33:12 +0000 Subject: [PATCH 1/8] feat: support list of numerics in pandas.cut --- bigframes/core/reshape/__init__.py | 17 +++++++++--- bigframes/operations/aggregations.py | 6 ++--- tests/system/small/test_pandas.py | 26 +++++++++++++++++++ .../pandas/core/reshape/tile.py | 16 +++++++++++- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index e3ed8edd21..a306ace53c 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -14,7 +14,7 @@ from __future__ import annotations import typing -from typing import Iterable, Literal, Optional, Tuple, Union +from typing import Iterable, Literal, Optional, Union import pandas as pd @@ -113,7 +113,7 @@ def cut( bins: Union[ int, pd.IntervalIndex, - Iterable[Tuple[Union[int, float], Union[int, float]]], + Iterable, ], *, labels: Optional[bool] = None, @@ -125,9 +125,20 @@ def cut( if isinstance(bins, pd.IntervalIndex): as_index: pd.IntervalIndex = bins bins = tuple((bin.left.item(), bin.right.item()) for bin in bins) - else: + elif len(list(bins)) == 0: + raise ValueError(("`bins` iterable should have at least one item")) + elif isinstance(list(bins)[0], tuple): as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) + elif pd.api.types.is_number(list(bins)[0]): + if len(list(bins)) < 2: + raise ValueError( + "`bins` iterable of numeric breaks should have" + " at least two items" + ) + as_index = pd.IntervalIndex.from_breaks(list(bins)) + bins = list(bins) + bins = tuple([(bins[i], bins[i + 1]) for i in range(len(bins) - 1)]) if as_index.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 76aa2a6112..d9eec504e9 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -17,7 +17,7 @@ import abc import dataclasses import typing -from typing import ClassVar, Hashable, Optional, Tuple +from typing import ClassVar, Iterable, Optional import pandas as pd import pyarrow as pa @@ -191,7 +191,7 @@ def output_type(self, *input_types: dtypes.ExpressionType): @dataclasses.dataclass(frozen=True) class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? - bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]] + bins: typing.Union[int, Iterable] labels: Optional[bool] @property @@ -210,7 +210,7 @@ def output_type(self, *input_types: dtypes.ExpressionType): interval_dtype = ( pa.float64() if isinstance(self.bins, int) - else dtypes.infer_literal_arrow_type(self.bins[0][0]) + else dtypes.infer_literal_arrow_type(list(self.bins)[0][0]) ) pa_type = pa.struct( [ diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a080a969c8..e9a2e40545 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -424,6 +424,32 @@ def test_cut_default_labels(scalars_dfs): ) +def test_cut_numeric_breaks(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000]) + bf_result = bpd.cut( + scalars_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000] + ).to_pandas() + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + + @pytest.mark.parametrize( ("bins",), [ diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index fbd1d2d052..5e81c35ca3 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -76,10 +76,20 @@ def cut( 3 {'left_exclusive': 5, 'right_inclusive': 20} dtype: struct[pyarrow] + Cut with an iterable of ints: + + >>> bins_ints = [0, 1, 5, 20] + >>> bpd.cut(s, bins=bins_tuples) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 {'left_exclusive': 1, 'right_inclusive': 5} + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + Args: x (Series): The input Series to be binned. Must be 1-dimensional. - bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]): + bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]], Iterable[Union[int, float]]): The criteria to bin by. int: Defines the number of equal-width bins in the range of `x`. The @@ -88,6 +98,10 @@ def cut( pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used. It's important to ensure that these bins are non-overlapping. + + Iterable of floats or ints: Defines the exact bins by using the interval + between each item and its following item. The items must be monotonically + increasing. labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the From 1c62a3dd0479402fdc031aa333884d5f4bc953b4 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 4 Apr 2024 23:37:55 +0000 Subject: [PATCH 2/8] tweak test --- tests/system/small/test_pandas.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index e9a2e40545..f66dbe1543 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -427,10 +427,9 @@ def test_cut_default_labels(scalars_dfs): def test_cut_numeric_breaks(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000]) - bf_result = bpd.cut( - scalars_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000] - ).to_pandas() + breaks = [0, 5, 10, 15, 20, 100, 1000] + pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks) + bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas() # Convert to match data format pd_result_converted = pd.Series( From ce9fd2c187207d03426c730ebac4f55311fa61dc Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 4 Apr 2024 23:39:48 +0000 Subject: [PATCH 3/8] fix doctest --- third_party/bigframes_vendored/pandas/core/reshape/tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 5e81c35ca3..15ed6971c5 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -79,7 +79,7 @@ def cut( Cut with an iterable of ints: >>> bins_ints = [0, 1, 5, 20] - >>> bpd.cut(s, bins=bins_tuples) + >>> bpd.cut(s, bins=bins_ints) 0 1 {'left_exclusive': 0, 'right_inclusive': 1} 2 {'left_exclusive': 1, 'right_inclusive': 5} From c5d74cee67599055d82fb85fb6c36683992822fd Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 4 Apr 2024 23:41:33 +0000 Subject: [PATCH 4/8] fix docstring --- third_party/bigframes_vendored/pandas/core/reshape/tile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 15ed6971c5..6ba3950a76 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -89,7 +89,7 @@ def cut( Args: x (Series): The input Series to be binned. Must be 1-dimensional. - bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]], Iterable[Union[int, float]]): + bins (int, pd.IntervalIndex, Iterable): The criteria to bin by. int: Defines the number of equal-width bins in the range of `x`. The @@ -99,7 +99,7 @@ def cut( pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used. It's important to ensure that these bins are non-overlapping. - Iterable of floats or ints: Defines the exact bins by using the interval + Iterable of numerics: Defines the exact bins by using the interval between each item and its following item. The items must be monotonically increasing. labels (None): From ee25deb7936154aa2db6d96b8013173b23fbfc15 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 4 Apr 2024 23:45:47 +0000 Subject: [PATCH 5/8] improve error case --- bigframes/core/reshape/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index a306ace53c..630c5febc3 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -126,7 +126,7 @@ def cut( as_index: pd.IntervalIndex = bins bins = tuple((bin.left.item(), bin.right.item()) for bin in bins) elif len(list(bins)) == 0: - raise ValueError(("`bins` iterable should have at least one item")) + raise ValueError("`bins` iterable should have at least one item") elif isinstance(list(bins)[0], tuple): as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) @@ -139,6 +139,8 @@ def cut( as_index = pd.IntervalIndex.from_breaks(list(bins)) bins = list(bins) bins = tuple([(bins[i], bins[i + 1]) for i in range(len(bins) - 1)]) + else: + raise ValueError("`bins` iterable should contain tuples or numerics") if as_index.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") From 497262ea0b144a5399d09c0f453a5efa345c21da Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Wed, 10 Apr 2024 21:02:27 +0000 Subject: [PATCH 6/8] add floats and mixed test cases --- bigframes/core/reshape/__init__.py | 15 +++++++++++---- tests/system/small/test_pandas.py | 11 +++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 630c5febc3..6bcc25319b 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -131,14 +131,21 @@ def cut( as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) elif pd.api.types.is_number(list(bins)[0]): - if len(list(bins)) < 2: + bins_list = list(bins) + if len(bins_list) < 2: raise ValueError( "`bins` iterable of numeric breaks should have" " at least two items" ) - as_index = pd.IntervalIndex.from_breaks(list(bins)) - bins = list(bins) - bins = tuple([(bins[i], bins[i + 1]) for i in range(len(bins) - 1)]) + as_index = pd.IntervalIndex.from_breaks(bins_list) + single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list]) + numeric_type = type(bins_list[0]) if single_type else float + bins = tuple( + [ + (numeric_type(bins_list[i]), numeric_type(bins_list[i + 1])) + for i in range(len(bins_list) - 1) + ] + ) else: raise ValueError("`bins` iterable should contain tuples or numerics") diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index f66dbe1543..3add3baeb5 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -424,10 +424,17 @@ def test_cut_default_labels(scalars_dfs): ) -def test_cut_numeric_breaks(scalars_dfs): +@pytest.mark.parametrize( + ("breaks",), + [ + ([0, 5, 10, 15, 20, 100, 1000],), # ints + ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],), # floats + ([0, 5, 10.5, 15.5, 20, 100, 1000.5],), # mixed + ], +) +def test_cut_numeric_breaks(scalars_dfs, breaks): scalars_df, scalars_pandas_df = scalars_dfs - breaks = [0, 5, 10, 15, 20, 100, 1000] pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks) bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas() From 5e9f4ab1e3c1178dcae382106e4e21db50399480 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 11 Apr 2024 20:25:39 +0000 Subject: [PATCH 7/8] add errors test --- tests/system/small/test_pandas.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index d3480e557b..ef80b6e4da 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -456,6 +456,30 @@ def test_cut_numeric_breaks(scalars_dfs, breaks): ) +def test_cut_errors(scalars_dfs): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + # negative integer bins argument + bpd.cut(scalars_df["float64_col"], -1) + + with pytest.raises(ValueError): + # empty iterable of bins + bpd.cut(scalars_df["float64_col"], []) + + with pytest.raises(ValueError): + # iterable of wrong type + bpd.cut(scalars_df["float64_col"], ["notabreak"]) + + with pytest.raises(ValueError): + # numeric breaks with only one numeric + # this is supported by pandas but not by + # the bigquery operation and a bigframes workaround + # is not yet available. Should return column + # of structs with all NaN values. + bpd.cut(scalars_df["float64_col"], [1]) + + @pytest.mark.parametrize( ("bins",), [ From 64f82c93ff5c130382d3a0f1c6895904f81b6109 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Fri, 12 Apr 2024 21:32:28 +0000 Subject: [PATCH 8/8] parameterize error test --- tests/system/small/test_pandas.py | 32 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index ef80b6e4da..d543f92655 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -456,28 +456,24 @@ def test_cut_numeric_breaks(scalars_dfs, breaks): ) -def test_cut_errors(scalars_dfs): - scalars_df, _ = scalars_dfs - - with pytest.raises(ValueError): - # negative integer bins argument - bpd.cut(scalars_df["float64_col"], -1) - - with pytest.raises(ValueError): - # empty iterable of bins - bpd.cut(scalars_df["float64_col"], []) - - with pytest.raises(ValueError): - # iterable of wrong type - bpd.cut(scalars_df["float64_col"], ["notabreak"]) - - with pytest.raises(ValueError): - # numeric breaks with only one numeric +@pytest.mark.parametrize( + ("bins",), + [ + (-1,), # negative integer bins argument + ([],), # empty iterable of bins + (["notabreak"],), # iterable of wrong type + ([1],), # numeric breaks with only one numeric # this is supported by pandas but not by # the bigquery operation and a bigframes workaround # is not yet available. Should return column # of structs with all NaN values. - bpd.cut(scalars_df["float64_col"], [1]) + ], +) +def test_cut_errors(scalars_dfs, bins): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + bpd.cut(scalars_df["float64_col"], bins) @pytest.mark.parametrize(