From c07a2417263ae49a49573b8ccd5356e655800c19 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 19 Feb 2025 21:44:02 +0000 Subject: [PATCH 1/3] fix: fix a bug where to_timdelta() calls over timedeltas changes their values --- bigframes/core/compile/scalar_op_compiler.py | 5 ++++ bigframes/core/rewrite/timedeltas.py | 19 +++++++++++--- bigframes/operations/__init__.py | 2 ++ bigframes/operations/timedelta_ops.py | 27 +++++++++++++++++--- tests/system/small/test_pandas.py | 15 +++++++++++ 5 files changed, 60 insertions(+), 8 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index d5ce6e9e09..e0d6460092 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1174,6 +1174,11 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): ).floor() +@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) +def timedelta_floor_op_impl(x: ibis_types.NumericValue): + return x.floor() + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ibis_node = getattr(op.func, "ibis_node", None) diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index dad474e5a1..bde1a4431c 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -125,6 +125,9 @@ def _rewrite_op_expr( # but for timedeltas: int(timedelta) // float => int(timedelta) return _rewrite_floordiv_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.ToTimedeltaOp): + return _rewrite_to_timedelta_op(expr.op, inputs[0]) + return _TypedExpr.create_op_expr(expr.op, *inputs) @@ -154,9 +157,9 @@ def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.mul_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -165,7 +168,7 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.div_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -174,11 +177,19 @@ def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result +def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): + if arg.dtype is dtypes.TIMEDELTA_DTYPE: + # Do nothing for values that are already timedeltas + return arg + + return _TypedExpr.create_op_expr(op, arg) + + @functools.cache def _rewrite_aggregation( aggregation: ex.Aggregation, schema: schema.ArraySchema diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index f2bc1ecf85..30e4694113 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -182,6 +182,7 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + timedelta_floor_op, timestamp_add_op, timestamp_sub_op, ToTimedeltaOp, @@ -257,6 +258,7 @@ "second_op", "normalize_op", # Timedelta ops + "timedelta_floor_op", "timestamp_add_op", "timestamp_sub_op", "ToTimedeltaOp", diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 689966e21b..364154f728 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -36,7 +36,26 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) -class TimestampAdd(base_ops.BinaryOp): +class TimedeltaFloorOp(base_ops.UnaryOp): + """Floors the numeric value to the nearest integer and use it to represent a timedelta. + + This operator is only meant to be used during expression tree rewrites. Do not use it anywhere else! + """ + + name: typing.ClassVar[str] = "timedelta_floor" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + input_type = input_types[0] + if dtypes.is_numeric(input_type) or input_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"unsupported type: {input_type}") + + +timedelta_floor_op = TimedeltaFloorOp() + + +@dataclasses.dataclass(frozen=True) +class TimestampAddOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_add" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -57,10 +76,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_add_op = TimestampAdd() +timestamp_add_op = TimestampAddOp() -class TimestampSub(base_ops.BinaryOp): +class TimestampSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -76,4 +95,4 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_sub_op = TimestampSub() +timestamp_sub_op = TimestampSubOp() diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 4b4264e33c..da78432cdb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -829,3 +829,18 @@ def test_to_timedelta_with_bf_series_invalid_unit(session, unit): @pytest.mark.parametrize("input", [1, 1.2, "1s"]) def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) + + +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): + bf_df, pd_df = scalars_dfs + bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") + pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") + + actual_result = ( + bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") + ) + + expected_result = pd.to_timedelta(pd_series, unit="s") + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) From 29f592d2fd8011160892d7dc8baf231eb9fb2e03 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 20 Feb 2025 17:59:17 +0000 Subject: [PATCH 2/3] add tests for floats too --- tests/system/small/test_pandas.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index da78432cdb..2f0175f852 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -831,10 +831,11 @@ def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) -def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): +@pytest.mark.parametrize("column", ["int64_too", "float64_col"]) +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs, column): bf_df, pd_df = scalars_dfs - bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") - pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") + bf_series = bpd.to_timedelta(bf_df[column], unit="us") + pd_series = pd.to_timedelta(pd_df[column], unit="us").dt.floor("us") actual_result = ( bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") From d36a65e2b4d4a642140c7bd283edf7ba1fe382f7 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 20 Feb 2025 19:05:21 +0000 Subject: [PATCH 3/3] remove float test case because py 3.9 env does not support NA very well --- tests/system/small/test_pandas.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 2f0175f852..da78432cdb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -831,11 +831,10 @@ def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) -@pytest.mark.parametrize("column", ["int64_too", "float64_col"]) -def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs, column): +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): bf_df, pd_df = scalars_dfs - bf_series = bpd.to_timedelta(bf_df[column], unit="us") - pd_series = pd.to_timedelta(pd_df[column], unit="us").dt.floor("us") + bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") + pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") actual_result = ( bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]")