Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: support list of numerics in pandas.cut #580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions 26 bigframes/core/reshape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from __future__ import annotations

import typing
from typing import Iterable, Literal, Optional, Tuple, Union
from typing import Iterable, Literal, Optional, Union

import pandas as pd

Expand Down Expand Up @@ -113,7 +113,7 @@ def cut(
bins: Union[
int,
pd.IntervalIndex,
Iterable[Tuple[Union[int, float], Union[int, float]]],
Iterable,
],
*,
labels: Optional[bool] = None,
Expand All @@ -125,9 +125,29 @@ def cut(
if isinstance(bins, pd.IntervalIndex):
as_index: pd.IntervalIndex = bins
bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
else:
elif len(list(bins)) == 0:
raise ValueError("`bins` iterable should have at least one item")
elif isinstance(list(bins)[0], tuple):
as_index = pd.IntervalIndex.from_tuples(list(bins))
bins = tuple(bins)
elif pd.api.types.is_number(list(bins)[0]):
bins_list = list(bins)
if len(bins_list) < 2:
raise ValueError(
milkshakeiii marked this conversation as resolved.
Show resolved Hide resolved
"`bins` iterable of numeric breaks should have"
" at least two items"
)
as_index = pd.IntervalIndex.from_breaks(bins_list)
single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list])
numeric_type = type(bins_list[0]) if single_type else float
bins = tuple(
[
(numeric_type(bins_list[i]), numeric_type(bins_list[i + 1]))
for i in range(len(bins_list) - 1)
]
)
else:
raise ValueError("`bins` iterable should contain tuples or numerics")

if as_index.is_overlapping:
raise ValueError("Overlapping IntervalIndex is not accepted.")
Expand Down
6 changes: 3 additions & 3 deletions 6 bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import abc
import dataclasses
import typing
from typing import ClassVar, Hashable, Optional, Tuple
from typing import ClassVar, Iterable, Optional

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -213,7 +213,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
@dataclasses.dataclass(frozen=True)
class CutOp(UnaryWindowOp):
# TODO: Unintuitive, refactor into multiple ops?
bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]]
bins: typing.Union[int, Iterable]
labels: Optional[bool]

@property
Expand All @@ -232,7 +232,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
interval_dtype = (
pa.float64()
if isinstance(self.bins, int)
else dtypes.infer_literal_arrow_type(self.bins[0][0])
else dtypes.infer_literal_arrow_type(list(self.bins)[0][0])
)
pa_type = pa.struct(
[
Expand Down
52 changes: 52 additions & 0 deletions 52 tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,58 @@ def test_cut_default_labels(scalars_dfs):
)


@pytest.mark.parametrize(
    ("breaks",),
    [
        ([0, 5, 10, 15, 20, 100, 1000],),  # ints
        ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],),  # floats
        ([0, 5, 10.5, 15.5, 20, 100, 1000.5],),  # mixed
    ],
)
def test_cut_numeric_breaks(scalars_dfs, breaks):
    """Check that ``bpd.cut`` with a list of numeric breaks matches ``pd.cut``.

    Args:
        scalars_dfs: Fixture that unpacks into the BigQuery DataFrames frame
            and the pandas frame under test, in that order.
        breaks: List of numeric bin edges (all ints, all floats, or a mix)
            passed unchanged to both ``pd.cut`` and ``bpd.cut``.
    """
    scalars_df, scalars_pandas_df = scalars_dfs

    pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks)
    bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas()

    # Convert to match data format: the pandas result is categorical, while
    # the expected value built here is one dict per row keyed by
    # "left_exclusive" / "right_inclusive".
    # Rows that pandas left unbinned fail the `pd.notna` check and become
    # pd.NA, so whichever category their code happens to select is never read.
    pd_result_converted = pd.Series(
        [
            {"left_exclusive": interval.left, "right_inclusive": interval.right}
            if pd.notna(val)
            else pd.NA
            for val, interval in zip(
                pd_result, pd_result.cat.categories[pd_result.cat.codes]
            )
        ],
        name=pd_result.name,
    )

    # Index and dtype are deliberately excluded from the comparison; only the
    # values and the series name are checked.
    pd.testing.assert_series_equal(
        bf_result, pd_result_converted, check_index=False, check_dtype=False
    )


@pytest.mark.parametrize(
    "bins",
    [
        # negative integer bins argument
        -1,
        # empty iterable of bins
        [],
        # iterable of wrong type
        ["notabreak"],
        # Numeric breaks with only one numeric. pandas supports this, but the
        # BigQuery operation does not, and no bigframes workaround is
        # available yet. Once supported, it should return a column of structs
        # with all NaN values.
        [1],
    ],
)
def test_cut_errors(scalars_dfs, bins):
    """Check that ``bpd.cut`` raises ``ValueError`` for each invalid ``bins``.

    Args:
        scalars_dfs: Fixture that unpacks into two frames; only the first
            (the BigQuery DataFrames frame) is used here.
        bins: The invalid ``bins`` argument under test.
    """
    bf_frame, _unused_pandas_frame = scalars_dfs
    target_column = "float64_col"

    with pytest.raises(ValueError):
        bpd.cut(bf_frame[target_column], bins)


@pytest.mark.parametrize(
("bins",),
[
Expand Down
16 changes: 15 additions & 1 deletion 16 third_party/bigframes_vendored/pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,20 @@ def cut(
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]

Cut with an iterable of ints:

>>> bins_ints = [0, 1, 5, 20]
>>> bpd.cut(s, bins=bins_ints)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 {'left_exclusive': 1, 'right_inclusive': 5}
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]

Args:
x (Series):
The input Series to be binned. Must be 1-dimensional.
bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]):
bins (int, pd.IntervalIndex, Iterable):
The criteria to bin by.

int: Defines the number of equal-width bins in the range of `x`. The
Expand All @@ -88,6 +98,10 @@ def cut(

pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used.
It's important to ensure that these bins are non-overlapping.

Iterable of numerics: Defines the bin edges. Each pair of consecutive
items forms one bin, so at least two items are required. The items must
be monotonically increasing.
labels (None):
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.