diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 8a44844fba..e8e5a1f3ac 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -17,6 +17,7 @@ import functools import typing +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.common.exceptions import ibis.expr.datatypes as ibis_dtypes @@ -737,7 +738,7 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): return struct_value[name].name(name) -def numeric_to_datatime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue: +def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue: if not isinstance(x, ibis_types.IntegerValue) and not isinstance( x, ibis_types.FloatingValue ): @@ -779,7 +780,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): # with pandas converting int64[pyarrow] to timestamp[us][pyarrow], # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow]. unit = "us" - x_converted = numeric_to_datatime(x, unit) + x_converted = numeric_to_datetime(x, unit) if to_type == ibis_dtypes.timestamp: return x_converted.cast(ibis_dtypes.Timestamp()) elif to_type == ibis_dtypes.Timestamp(timezone="UTC"): @@ -818,23 +819,39 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() == ibis_dtypes.str: - x = x.to_timestamp(op.format) if op.format else timestamp(x) - elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): + return vendored_ibis_ops.SafeCastToDatetime(x).to_expr() + else: + # Numerical inputs. if op.format: - raise NotImplementedError( - f"Format parameter is not supported for Timestamp input types. {constants.FEEDBACK_LINK}" - ) - return x - elif x.type() != ibis_dtypes.timestamp: + x = x.cast(ibis_dtypes.str).to_timestamp(op.format) + else: + # The default unit is set to "ns" (nanoseconds) for consistency + # with pandas, where "ns" is the default unit for datetime operations. + unit = op.unit or "ns" + x = numeric_to_datetime(x, unit) + + return x.cast(ibis_dtypes.Timestamp(None)) + + +@scalar_op_compiler.register_unary_op(ops.ToTimestampOp, pass_op=True) +def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp): + if x.type() == ibis_dtypes.str: + x = ( + typing.cast(ibis_types.StringValue, x).to_timestamp(op.format) + if op.format + else timestamp(x) + ) + else: + # Numerical inputs. if op.format: x = x.cast(ibis_dtypes.str).to_timestamp(op.format) else: # The default unit is set to "ns" (nanoseconds) for consistency # with pandas, where "ns" is the default unit for datetime operations. unit = op.unit or "ns" - x = numeric_to_datatime(x, unit) + x = numeric_to_datetime(x, unit) - return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) + return x.cast(ibis_dtypes.Timestamp(timezone="UTC")) @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index a2851bc256..5eac4cceb9 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -21,6 +21,7 @@ import bigframes.constants as constants import bigframes.dataframe +import bigframes.dtypes import bigframes.operations as ops import bigframes.series @@ -51,25 +52,68 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - arg = bigframes.series.Series(arg) + arg = bigframes.series.Series(arg)._cached() - if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore - raise NotImplementedError( - f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}" - ) - - if format and unit and arg.dtype in ("Int64", "Float64"): # type: ignore + if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise ValueError("cannot specify both format and unit") - if unit and arg.dtype not in ("Int64", "Float64"): # type: ignore + if unit and arg.dtype not in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise NotImplementedError( f"Unit parameter is not supported for non-numerical input types. {constants.FEEDBACK_LINK}" ) - return arg._apply_unary_op( # type: ignore - ops.ToDatetimeOp( - utc=utc, - format=format, - unit=unit, + if arg.dtype in (bigframes.dtypes.TIMESTAMP_DTYPE, bigframes.dtypes.DATETIME_DTYPE): + to_type = ( + bigframes.dtypes.TIMESTAMP_DTYPE if utc else bigframes.dtypes.DATETIME_DTYPE + ) + return arg._apply_unary_op(ops.AsTypeOp(to_type=to_type)) # type: ignore + if (not utc) and arg.dtype == bigframes.dtypes.STRING_DTYPE: + if format: + raise NotImplementedError( + f"Customized formats are not supported for string inputs when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}" + ) + + assert unit is None + as_datetime = arg._apply_unary_op( # type: ignore + ops.ToDatetimeOp( + format=format, + unit=unit, + ) + ) + failed_datetime_cast = arg.notnull() & as_datetime.isnull() + is_utc = arg._apply_unary_op( + ops.EndsWithOp( + pat=("Z", "-00:00", "+00:00", "-0000", "+0000", "-00", "+00") + ) + ) + + # Cast to DATETIME shall succeed if all inputs are tz-naive. + if not failed_datetime_cast.any(): + return as_datetime + + if is_utc.all(): + return arg._apply_unary_op( # type: ignore + ops.ToTimestampOp( + format=format, + unit=unit, + ) + ) + + raise NotImplementedError( + f"Non-UTC string inputs are not supported when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}" + ) + # If utc: + elif utc: + return arg._apply_unary_op( # type: ignore + ops.ToTimestampOp( + format=format, + unit=unit, + ) + ) + else: + return arg._apply_unary_op( # type: ignore + ops.ToDatetimeOp( + format=format, + unit=unit, + ) ) - ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 929ccaecc5..2f39b096ce 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -23,6 +23,7 @@ import pandas as pd import pyarrow as pa +import bigframes.dtypes import bigframes.dtypes as dtypes import bigframes.operations.type as op_typing @@ -527,13 +528,34 @@ def output_type(self, *input_types): @dataclasses.dataclass(frozen=True) class ToDatetimeOp(UnaryOp): name: typing.ClassVar[str] = "to_datetime" - utc: bool = False format: typing.Optional[str] = None unit: typing.Optional[str] = None def output_type(self, *input_types): - timezone = "UTC" if self.utc else None - return pd.ArrowDtype(pa.timestamp("us", tz=timezone)) + if input_types[0] not in ( + bigframes.dtypes.FLOAT_DTYPE, + bigframes.dtypes.INT_DTYPE, + bigframes.dtypes.STRING_DTYPE, + ): + raise TypeError("expected string or numeric input") + return pd.ArrowDtype(pa.timestamp("us", tz=None)) + + +@dataclasses.dataclass(frozen=True) +class ToTimestampOp(UnaryOp): + name: typing.ClassVar[str] = "to_timestamp" + format: typing.Optional[str] = None + unit: typing.Optional[str] = None + + def output_type(self, *input_types): + # Must be numeric or string + if input_types[0] not in ( + bigframes.dtypes.FLOAT_DTYPE, + bigframes.dtypes.INT_DTYPE, + bigframes.dtypes.STRING_DTYPE, + ): + raise TypeError("expected string or numeric input") + return pd.ArrowDtype(pa.timestamp("us", tz="UTC")) @dataclasses.dataclass(frozen=True) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index d543f92655..6eee01dd31 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -634,3 +634,99 @@ def test_to_datetime_format_param(arg, utc, format): pd.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) + + +@pytest.mark.parametrize( + ("arg", "utc", "output_in_utc", "format"), + [ + ( + ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"], + False, + False, + None, + ), + ( + [ + "2008-12-25 05:30:00Z", + "2008-12-25 05:30:00-00:00", + "2008-12-25 05:30:00+00:00", + "2008-12-25 05:30:00-0000", + "2008-12-25 05:30:00+0000", + "2008-12-25 05:30:00-00", + "2008-12-25 05:30:00+00", + ], + False, + True, + None, + ), + ( + ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"], + True, + True, + "%Y-%m-%d %H:%M:%S", + ), + ( + [ + "2014-08-15 08:15:12+05:00", + "2011-08-15 08:15:12+05:00", + "2015-08-15 08:15:12+05:00", + ], + True, + True, + None, + ), + ], +) +def test_to_datetime_string_inputs(arg, utc, output_in_utc, format): + bf_result = ( + bpd.to_datetime(arg, utc=utc, format=format) + .to_pandas() + .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") + ) + pd_result = pd.Series(pd.to_datetime(arg, utc=utc, format=format)).dt.floor("us") + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) + + +@pytest.mark.parametrize( + ("arg", "utc", "output_in_utc"), + [ + ( + [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)], + False, + False, + ), + ( + [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)], + True, + True, + ), + ( + [ + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")), + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")), + ], + True, + True, + ), + ( + [ + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("America/New_York")), + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")), + ], + True, + True, + ), + ], +) +def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): + bf_result = ( + bpd.to_datetime(arg, utc=utc) + .to_pandas() + .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") + ) + pd_result = pd.Series(pd.to_datetime(arg, utc=utc)).dt.floor("us") + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index fddeab19a2..64ef05366d 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -32,6 +32,11 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): return f"GENERATE_ARRAY(0, {arg})" +def _safe_cast_to_datetime(translator, op: vendored_ibis_ops.SafeCastToDatetime): + arg = translator.translate(op.arg) + return f"SAFE_CAST({arg} AS DATETIME)" + + def _quantile(translator, op: ibis_reductions.Quantile): arg = translator.translate(op.arg) quantile = translator.translate(op.quantile) @@ -44,6 +49,7 @@ def _quantile(translator, op: ibis_reductions.Quantile): vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore + vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore ibis_reductions.Quantile: _quantile, # type:ignore } diff --git a/third_party/bigframes_vendored/ibis/expr/operations/generic.py b/third_party/bigframes_vendored/ibis/expr/operations/generic.py index 82d0a13371..98acaacfbd 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/generic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/generic.py @@ -7,3 +7,7 @@ class GenerateArray(Unary): dtype = dt.Array(dt.int64) + + +class SafeCastToDatetime(Unary): + dtype = dt.Timestamp(timezone=None)