From 7381e846549791b67dac92feace17d577c5aeaac Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Thu, 4 Apr 2024 23:33:12 +0000
Subject: [PATCH 1/8] feat: support list of numerics in pandas.cut

---
 bigframes/core/reshape/__init__.py            | 17 +++++++++---
 bigframes/operations/aggregations.py          |  6 ++---
 tests/system/small/test_pandas.py             | 26 +++++++++++++++++++
 .../pandas/core/reshape/tile.py               | 16 +++++++++++-
 4 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py
index e3ed8edd21..a306ace53c 100644
--- a/bigframes/core/reshape/__init__.py
+++ b/bigframes/core/reshape/__init__.py
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 import typing
-from typing import Iterable, Literal, Optional, Tuple, Union
+from typing import Iterable, Literal, Optional, Union
 
 import pandas as pd
 
@@ -113,7 +113,7 @@ def cut(
     bins: Union[
         int,
         pd.IntervalIndex,
-        Iterable[Tuple[Union[int, float], Union[int, float]]],
+        Iterable,
     ],
     *,
     labels: Optional[bool] = None,
@@ -125,9 +125,20 @@ def cut(
         if isinstance(bins, pd.IntervalIndex):
             as_index: pd.IntervalIndex = bins
             bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
-        else:
+        elif len(list(bins)) == 0:
+            raise ValueError(("`bins` iterable should have at least one item"))
+        elif isinstance(list(bins)[0], tuple):
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple(bins)
+        elif pd.api.types.is_number(list(bins)[0]):
+            if len(list(bins)) < 2:
+                raise ValueError(
+                    "`bins` iterable of numeric breaks should have"
+                    " at least two items"
+                )
+            as_index = pd.IntervalIndex.from_breaks(list(bins))
+            bins = list(bins)
+            bins = tuple([(bins[i], bins[i + 1]) for i in range(len(bins) - 1)])
 
         if as_index.is_overlapping:
             raise ValueError("Overlapping IntervalIndex is not accepted.")
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
index 76aa2a6112..d9eec504e9 100644
--- a/bigframes/operations/aggregations.py
+++ b/bigframes/operations/aggregations.py
@@ -17,7 +17,7 @@
 import abc
 import dataclasses
 import typing
-from typing import ClassVar, Hashable, Optional, Tuple
+from typing import ClassVar, Iterable, Optional
 
 import pandas as pd
 import pyarrow as pa
@@ -191,7 +191,7 @@ def output_type(self, *input_types: dtypes.ExpressionType):
 @dataclasses.dataclass(frozen=True)
 class CutOp(UnaryWindowOp):
     # TODO: Unintuitive, refactor into multiple ops?
-    bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]]
+    bins: typing.Union[int, Iterable]
     labels: Optional[bool]
 
     @property
@@ -210,7 +210,7 @@ def output_type(self, *input_types: dtypes.ExpressionType):
             interval_dtype = (
                 pa.float64()
                 if isinstance(self.bins, int)
-                else dtypes.infer_literal_arrow_type(self.bins[0][0])
+                else dtypes.infer_literal_arrow_type(list(self.bins)[0][0])
             )
             pa_type = pa.struct(
                 [
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index a080a969c8..e9a2e40545 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -424,6 +424,32 @@ def test_cut_default_labels(scalars_dfs):
     )
 
 
+def test_cut_numeric_breaks(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000])
+    bf_result = bpd.cut(
+        scalars_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000]
+    ).to_pandas()
+
+    # Convert to match data format
+    pd_result_converted = pd.Series(
+        [
+            {"left_exclusive": interval.left, "right_inclusive": interval.right}
+            if pd.notna(val)
+            else pd.NA
+            for val, interval in zip(
+                pd_result, pd_result.cat.categories[pd_result.cat.codes]
+            )
+        ],
+        name=pd_result.name,
+    )
+
+    pd.testing.assert_series_equal(
+        bf_result, pd_result_converted, check_index=False, check_dtype=False
+    )
+
+
 @pytest.mark.parametrize(
     ("bins",),
     [
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
index fbd1d2d052..5e81c35ca3 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
@@ -76,10 +76,20 @@ def cut(
         3    {'left_exclusive': 5, 'right_inclusive': 20}
         dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]
 
+    Cut with an iterable of ints:
+
+        >>> bins_ints = [0, 1, 5, 20]
+        >>> bpd.cut(s, bins=bins_tuples)
+        0                                            <NA>
+        1     {'left_exclusive': 0, 'right_inclusive': 1}
+        2     {'left_exclusive': 1, 'right_inclusive': 5}
+        3    {'left_exclusive': 5, 'right_inclusive': 20}
+        dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]
+
     Args:
         x (Series):
             The input Series to be binned. Must be 1-dimensional.
-        bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]):
+        bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]], Iterable[Union[int, float]]):
             The criteria to bin by.
 
             int: Defines the number of equal-width bins in the range of `x`. The
@@ -88,6 +98,10 @@ def cut(
 
             pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used.
             It's important to ensure that these bins are non-overlapping.
+
+            Iterable of floats or ints: Defines the exact bins by using the interval
+            between each item and its following item. The items must be monotonically
+            increasing.
         labels (None):
             Specifies the labels for the returned bins. Must be the same length as
             the resulting bins. If False, returns only integer indicators of the

From 1c62a3dd0479402fdc031aa333884d5f4bc953b4 Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Thu, 4 Apr 2024 23:37:55 +0000
Subject: [PATCH 2/8] test: reuse one breaks list for pandas and bigframes cut calls

---
 tests/system/small/test_pandas.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index e9a2e40545..f66dbe1543 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -427,10 +427,9 @@ def test_cut_default_labels(scalars_dfs):
 def test_cut_numeric_breaks(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    pd_result = pd.cut(scalars_pandas_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000])
-    bf_result = bpd.cut(
-        scalars_df["float64_col"], [0, 5, 10, 15, 20, 100, 1000]
-    ).to_pandas()
+    breaks = [0, 5, 10, 15, 20, 100, 1000]
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks)
+    bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas()
 
     # Convert to match data format
     pd_result_converted = pd.Series(

From ce9fd2c187207d03426c730ebac4f55311fa61dc Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Thu, 4 Apr 2024 23:39:48 +0000
Subject: [PATCH 3/8] docs: pass bins_ints (not bins_tuples) in the cut doctest

---
 third_party/bigframes_vendored/pandas/core/reshape/tile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
index 5e81c35ca3..15ed6971c5 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
@@ -79,7 +79,7 @@ def cut(
     Cut with an iterable of ints:
 
         >>> bins_ints = [0, 1, 5, 20]
-        >>> bpd.cut(s, bins=bins_tuples)
+        >>> bpd.cut(s, bins=bins_ints)
         0                                            <NA>
         1     {'left_exclusive': 0, 'right_inclusive': 1}
         2     {'left_exclusive': 1, 'right_inclusive': 5}

From c5d74cee67599055d82fb85fb6c36683992822fd Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Thu, 4 Apr 2024 23:41:33 +0000
Subject: [PATCH 4/8] docs: simplify bins type and wording in cut docstring

---
 third_party/bigframes_vendored/pandas/core/reshape/tile.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
index 15ed6971c5..6ba3950a76 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
@@ -89,7 +89,7 @@ def cut(
     Args:
         x (Series):
             The input Series to be binned. Must be 1-dimensional.
-        bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]], Iterable[Union[int, float]]):
+        bins (int, pd.IntervalIndex, Iterable):
             The criteria to bin by.
 
             int: Defines the number of equal-width bins in the range of `x`. The
@@ -99,7 +99,7 @@ def cut(
             pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used.
             It's important to ensure that these bins are non-overlapping.
 
-            Iterable of floats or ints: Defines the exact bins by using the interval
+            Iterable of numerics: Defines the exact bins by using the interval
             between each item and its following item. The items must be monotonically
             increasing.
         labels (None):

From ee25deb7936154aa2db6d96b8013173b23fbfc15 Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Thu, 4 Apr 2024 23:45:47 +0000
Subject: [PATCH 5/8] fix: raise ValueError when bins holds neither tuples nor numerics

---
 bigframes/core/reshape/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py
index a306ace53c..630c5febc3 100644
--- a/bigframes/core/reshape/__init__.py
+++ b/bigframes/core/reshape/__init__.py
@@ -126,7 +126,7 @@ def cut(
             as_index: pd.IntervalIndex = bins
             bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
         elif len(list(bins)) == 0:
-            raise ValueError(("`bins` iterable should have at least one item"))
+            raise ValueError("`bins` iterable should have at least one item")
         elif isinstance(list(bins)[0], tuple):
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple(bins)
@@ -139,6 +139,8 @@ def cut(
             as_index = pd.IntervalIndex.from_breaks(list(bins))
             bins = list(bins)
             bins = tuple([(bins[i], bins[i + 1]) for i in range(len(bins) - 1)])
+        else:
+            raise ValueError("`bins` iterable should contain tuples or numerics")
 
         if as_index.is_overlapping:
             raise ValueError("Overlapping IntervalIndex is not accepted.")

From 497262ea0b144a5399d09c0f453a5efa345c21da Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Wed, 10 Apr 2024 21:02:27 +0000
Subject: [PATCH 6/8] fix: coerce mixed-type breaks to float; add float and mixed tests

---
 bigframes/core/reshape/__init__.py | 15 +++++++++++----
 tests/system/small/test_pandas.py  | 11 +++++++++--
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py
index 630c5febc3..6bcc25319b 100644
--- a/bigframes/core/reshape/__init__.py
+++ b/bigframes/core/reshape/__init__.py
@@ -131,14 +131,21 @@ def cut(
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple(bins)
         elif pd.api.types.is_number(list(bins)[0]):
-            if len(list(bins)) < 2:
+            bins_list = list(bins)
+            if len(bins_list) < 2:
                 raise ValueError(
                     "`bins` iterable of numeric breaks should have"
                     " at least two items"
                 )
-            as_index = pd.IntervalIndex.from_breaks(list(bins))
-            bins = list(bins)
-            bins = tuple([(bins[i], bins[i + 1]) for i in range(len(bins) - 1)])
+            as_index = pd.IntervalIndex.from_breaks(bins_list)
+            single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list])
+            numeric_type = type(bins_list[0]) if single_type else float
+            bins = tuple(
+                [
+                    (numeric_type(bins_list[i]), numeric_type(bins_list[i + 1]))
+                    for i in range(len(bins_list) - 1)
+                ]
+            )
         else:
             raise ValueError("`bins` iterable should contain tuples or numerics")
 
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index f66dbe1543..3add3baeb5 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -424,10 +424,17 @@ def test_cut_default_labels(scalars_dfs):
     )
 
 
-def test_cut_numeric_breaks(scalars_dfs):
+@pytest.mark.parametrize(
+    ("breaks",),
+    [
+        ([0, 5, 10, 15, 20, 100, 1000],),  # ints
+        ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],),  # floats
+        ([0, 5, 10.5, 15.5, 20, 100, 1000.5],),  # mixed
+    ],
+)
+def test_cut_numeric_breaks(scalars_dfs, breaks):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    breaks = [0, 5, 10, 15, 20, 100, 1000]
     pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks)
     bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas()
 

From 5e9f4ab1e3c1178dcae382106e4e21db50399480 Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Thu, 11 Apr 2024 20:25:39 +0000
Subject: [PATCH 7/8] add errors test

---
 tests/system/small/test_pandas.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index d3480e557b..ef80b6e4da 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -456,6 +456,30 @@ def test_cut_numeric_breaks(scalars_dfs, breaks):
     )
 
 
+def test_cut_errors(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+
+    with pytest.raises(ValueError):
+        # negative integer bins argument
+        bpd.cut(scalars_df["float64_col"], -1)
+
+    with pytest.raises(ValueError):
+        # empty iterable of bins
+        bpd.cut(scalars_df["float64_col"], [])
+
+    with pytest.raises(ValueError):
+        # iterable of wrong type
+        bpd.cut(scalars_df["float64_col"], ["notabreak"])
+
+    with pytest.raises(ValueError):
+        # numeric breaks with only one numeric
+        # this is supported by pandas but not by
+        # the bigquery operation and a bigframes workaround
+        # is not yet available. Should return column
+        # of structs with all NaN values.
+        bpd.cut(scalars_df["float64_col"], [1])
+
+
 @pytest.mark.parametrize(
     ("bins",),
     [

From 64f82c93ff5c130382d3a0f1c6895904f81b6109 Mon Sep 17 00:00:00 2001
From: milkshakeiii <milkshakeiii@gmail.com>
Date: Fri, 12 Apr 2024 21:32:28 +0000
Subject: [PATCH 8/8] parameterize error test

---
 tests/system/small/test_pandas.py | 32 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index ef80b6e4da..d543f92655 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -456,28 +456,24 @@ def test_cut_numeric_breaks(scalars_dfs, breaks):
     )
 
 
-def test_cut_errors(scalars_dfs):
-    scalars_df, _ = scalars_dfs
-
-    with pytest.raises(ValueError):
-        # negative integer bins argument
-        bpd.cut(scalars_df["float64_col"], -1)
-
-    with pytest.raises(ValueError):
-        # empty iterable of bins
-        bpd.cut(scalars_df["float64_col"], [])
-
-    with pytest.raises(ValueError):
-        # iterable of wrong type
-        bpd.cut(scalars_df["float64_col"], ["notabreak"])
-
-    with pytest.raises(ValueError):
-        # numeric breaks with only one numeric
+@pytest.mark.parametrize(
+    ("bins",),
+    [
+        (-1,),  # negative integer bins argument
+        ([],),  # empty iterable of bins
+        (["notabreak"],),  # iterable of wrong type
+        ([1],),  # numeric breaks with only one numeric
         # this is supported by pandas but not by
         # the bigquery operation and a bigframes workaround
         # is not yet available. Should return column
         # of structs with all NaN values.
-        bpd.cut(scalars_df["float64_col"], [1])
+    ],
+)
+def test_cut_errors(scalars_dfs, bins):
+    scalars_df, _ = scalars_dfs
+
+    with pytest.raises(ValueError):
+        bpd.cut(scalars_df["float64_col"], bins)
 
 
 @pytest.mark.parametrize(