Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: to_datetime supports utc=False for string inputs #579

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 14, 2024
39 changes: 28 additions & 11 deletions 39 bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import functools
import typing

import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
import ibis.common.exceptions
import ibis.expr.datatypes as ibis_dtypes
Expand Down Expand Up @@ -737,7 +738,7 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
return struct_value[name].name(name)


def numeric_to_datatime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
if not isinstance(x, ibis_types.IntegerValue) and not isinstance(
x, ibis_types.FloatingValue
):
Expand Down Expand Up @@ -779,7 +780,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
# with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
# timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
unit = "us"
x_converted = numeric_to_datatime(x, unit)
x_converted = numeric_to_datetime(x, unit)
if to_type == ibis_dtypes.timestamp:
return x_converted.cast(ibis_dtypes.Timestamp())
elif to_type == ibis_dtypes.Timestamp(timezone="UTC"):
Expand Down Expand Up @@ -818,23 +819,39 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp):
@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True)
def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do worry this operation is getting a bit too complicated. Do the type rules reflect the fact that a DateTime will be returned for utc==False?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will work on the refactor in a separate PR :-)

if x.type() == ibis_dtypes.str:
x = x.to_timestamp(op.format) if op.format else timestamp(x)
elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"):
return vendored_ibis_ops.SafeCastToDatetime(x).to_expr()
else:
# Numerical inputs.
if op.format:
raise NotImplementedError(
f"Format parameter is not supported for Timestamp input types. {constants.FEEDBACK_LINK}"
)
return x
elif x.type() != ibis_dtypes.timestamp:
x = x.cast(ibis_dtypes.str).to_timestamp(op.format)
else:
# The default unit is set to "ns" (nanoseconds) for consistency
# with pandas, where "ns" is the default unit for datetime operations.
unit = op.unit or "ns"
x = numeric_to_datetime(x, unit)

return x.cast(ibis_dtypes.Timestamp(None))


@scalar_op_compiler.register_unary_op(ops.ToTimestampOp, pass_op=True)
def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp):
    """Compile ``ToTimestampOp`` into an expression cast to a UTC timestamp.

    Args:
        x: Input expression. String-typed input takes the string branch;
            every other input type is handled as numeric.
        op: The operation being compiled; its optional ``format`` and
            ``unit`` fields select the conversion path.

    Returns:
        The converted expression cast to ``Timestamp(timezone="UTC")``.
    """
    if x.type() == ibis_dtypes.str:
        x = (
            typing.cast(ibis_types.StringValue, x).to_timestamp(op.format)
            if op.format
            else timestamp(x)
        )
    elif op.format:
        # Numerical input with an explicit format: render the number as a
        # string first so the format can be applied to it.
        x = x.cast(ibis_dtypes.str).to_timestamp(op.format)
    else:
        # Numerical input without a format.
        # The default unit is set to "ns" (nanoseconds) for consistency
        # with pandas, where "ns" is the default unit for datetime operations.
        unit = op.unit or "ns"
        x = numeric_to_datetime(x, unit)

    # ToTimestampOp declares no ``utc`` field, so the result is always UTC.
    return x.cast(ibis_dtypes.Timestamp(timezone="UTC"))


@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True)
Expand Down
72 changes: 58 additions & 14 deletions 72 bigframes/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import bigframes.constants as constants
import bigframes.dataframe
import bigframes.dtypes
import bigframes.operations as ops
import bigframes.series

Expand Down Expand Up @@ -51,25 +52,68 @@ def to_datetime(
f"to datetime is not implemented. {constants.FEEDBACK_LINK}"
)

arg = bigframes.series.Series(arg)
arg = bigframes.series.Series(arg)._cached()

if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore
raise NotImplementedError(
f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}"
)

if format and unit and arg.dtype in ("Int64", "Float64"): # type: ignore
if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore
raise ValueError("cannot specify both format and unit")

if unit and arg.dtype not in ("Int64", "Float64"): # type: ignore
if unit and arg.dtype not in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore
raise NotImplementedError(
f"Unit parameter is not supported for non-numerical input types. {constants.FEEDBACK_LINK}"
)

return arg._apply_unary_op( # type: ignore
ops.ToDatetimeOp(
utc=utc,
format=format,
unit=unit,
if arg.dtype in (bigframes.dtypes.TIMESTAMP_DTYPE, bigframes.dtypes.DATETIME_DTYPE):
to_type = (
bigframes.dtypes.TIMESTAMP_DTYPE if utc else bigframes.dtypes.DATETIME_DTYPE
)
return arg._apply_unary_op(ops.AsTypeOp(to_type=to_type)) # type: ignore
if (not utc) and arg.dtype == bigframes.dtypes.STRING_DTYPE:
if format:
raise NotImplementedError(
f"Customized formats are not supported for string inputs when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}"
)

assert unit is None
as_datetime = arg._apply_unary_op( # type: ignore
ops.ToDatetimeOp(
format=format,
unit=unit,
)
)
failed_datetime_cast = arg.notnull() & as_datetime.isnull()
is_utc = arg._apply_unary_op(
ops.EndsWithOp(
pat=("Z", "-00:00", "+00:00", "-0000", "+0000", "-00", "+00")
)
)

# Cast to DATETIME shall succeed if all inputs are tz-naive.
if not failed_datetime_cast.any():
return as_datetime

if is_utc.all():
return arg._apply_unary_op( # type: ignore
ops.ToTimestampOp(
format=format,
unit=unit,
)
)

raise NotImplementedError(
f"Non-UTC string inputs are not supported when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}"
)
# If utc:
elif utc:
return arg._apply_unary_op( # type: ignore
ops.ToTimestampOp(
format=format,
unit=unit,
)
)
else:
return arg._apply_unary_op( # type: ignore
ops.ToDatetimeOp(
format=format,
unit=unit,
)
)
)
28 changes: 25 additions & 3 deletions 28 bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import pandas as pd
import pyarrow as pa

import bigframes.dtypes
import bigframes.dtypes as dtypes
import bigframes.operations.type as op_typing

Expand Down Expand Up @@ -527,13 +528,34 @@ def output_type(self, *input_types):
@dataclasses.dataclass(frozen=True)
class ToDatetimeOp(UnaryOp):
    """Unary op describing conversion of string or numeric input to a timestamp dtype."""

    name: typing.ClassVar[str] = "to_datetime"
    # Kept with its original default so existing keyword callers keep working;
    # the default (False) yields a tz-naive output dtype.
    utc: bool = False
    format: typing.Optional[str] = None
    unit: typing.Optional[str] = None

    def output_type(self, *input_types):
        """Return the microsecond timestamp dtype produced by this op.

        Args:
            *input_types: Dtypes of the operands; only the first is inspected.

        Returns:
            ``pd.ArrowDtype(pa.timestamp("us", tz=...))`` where the timezone
            is ``"UTC"`` when ``utc`` is true and ``None`` otherwise.

        Raises:
            TypeError: If the first input dtype is not the float, int or
                string dtype.
        """
        # Validate before computing the result: an unconditional return used
        # to precede this check, which left it unreachable.
        if input_types[0] not in (
            bigframes.dtypes.FLOAT_DTYPE,
            bigframes.dtypes.INT_DTYPE,
            bigframes.dtypes.STRING_DTYPE,
        ):
            raise TypeError("expected string or numeric input")
        timezone = "UTC" if self.utc else None
        return pd.ArrowDtype(pa.timestamp("us", tz=timezone))


@dataclasses.dataclass(frozen=True)
class ToTimestampOp(UnaryOp):
    """Unary op describing conversion of string or numeric input to a UTC timestamp dtype."""

    name: typing.ClassVar[str] = "to_timestamp"
    format: typing.Optional[str] = None
    unit: typing.Optional[str] = None

    def output_type(self, *input_types):
        """Return the microsecond UTC timestamp dtype for a supported input.

        Only the first entry of ``input_types`` is inspected.

        Raises:
            TypeError: If that dtype is not the float, int or string dtype.
        """
        supported_inputs = (
            bigframes.dtypes.FLOAT_DTYPE,
            bigframes.dtypes.INT_DTYPE,
            bigframes.dtypes.STRING_DTYPE,
        )
        if input_types[0] in supported_inputs:
            return pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
        raise TypeError("expected string or numeric input")


@dataclasses.dataclass(frozen=True)
Expand Down
96 changes: 96 additions & 0 deletions 96 tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,3 +634,99 @@ def test_to_datetime_format_param(arg, utc, format):
pd.testing.assert_series_equal(
bf_result, pd_result, check_index_type=False, check_names=False
)


@pytest.mark.parametrize(
    ("arg", "utc", "output_in_utc", "format"),
    [
        # tz-naive strings, utc=False -> compared as tz-naive.
        (
            ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"],
            False,
            False,
            None,
        ),
        # Every spelling of a zero UTC offset, utc=False -> compared as UTC.
        (
            [
                "2008-12-25 05:30:00Z",
                "2008-12-25 05:30:00-00:00",
                "2008-12-25 05:30:00+00:00",
                "2008-12-25 05:30:00-0000",
                "2008-12-25 05:30:00+0000",
                "2008-12-25 05:30:00-00",
                "2008-12-25 05:30:00+00",
            ],
            False,
            True,
            None,
        ),
        # tz-naive strings with an explicit format, utc=True.
        (
            ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"],
            True,
            True,
            "%Y-%m-%d %H:%M:%S",
        ),
        # Strings carrying a non-zero offset, utc=True.
        (
            [
                "2014-08-15 08:15:12+05:00",
                "2011-08-15 08:15:12+05:00",
                "2015-08-15 08:15:12+05:00",
            ],
            True,
            True,
            None,
        ),
    ],
)
def test_to_datetime_string_inputs(arg, utc, output_in_utc, format):
    """Check ``bpd.to_datetime`` against ``pd.to_datetime`` for string inputs."""
    # Bring the result to the nanosecond dtype pandas produces so the two
    # series can be compared directly.
    comparison_dtype = "datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]"
    converted = bpd.to_datetime(arg, utc=utc, format=format)
    actual = converted.to_pandas().astype(comparison_dtype)

    reference = pd.to_datetime(arg, utc=utc, format=format)
    expected = pd.Series(reference).dt.floor("us")

    pd.testing.assert_series_equal(
        actual, expected, check_index_type=False, check_names=False
    )


@pytest.mark.parametrize(
    ("arg", "utc", "output_in_utc"),
    [
        # tz-naive datetimes, utc=False -> compared as tz-naive.
        (
            [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)],
            False,
            False,
        ),
        # tz-naive datetimes, utc=True.
        (
            [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)],
            True,
            True,
        ),
        # UTC-aware datetimes, utc=True.
        (
            [
                datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
                datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
            ],
            True,
            True,
        ),
        # Mixed time zones, utc=True.
        (
            [
                # pytz zones must be attached with localize(): passing one via
                # ``tzinfo=`` picks the zone's LMT offset instead of EST.
                pytz.timezone("America/New_York").localize(
                    datetime(2023, 1, 1, 12, 0)
                ),
                datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
            ],
            True,
            True,
        ),
    ],
)
def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc):
    """Check ``bpd.to_datetime`` against ``pd.to_datetime`` for datetime inputs."""
    bf_result = (
        bpd.to_datetime(arg, utc=utc)
        .to_pandas()
        .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]")
    )
    pd_result = pd.Series(pd.to_datetime(arg, utc=utc)).dt.floor("us")
    pd.testing.assert_series_equal(
        bf_result, pd_result, check_index_type=False, check_names=False
    )
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray):
return f"GENERATE_ARRAY(0, {arg})"


def _safe_cast_to_datetime(translator, op: vendored_ibis_ops.SafeCastToDatetime):
    """Render a ``SafeCastToDatetime`` op as a ``SAFE_CAST(... AS DATETIME)`` SQL fragment."""
    operand_sql = translator.translate(op.arg)
    return "SAFE_CAST({} AS DATETIME)".format(operand_sql)


def _quantile(translator, op: ibis_reductions.Quantile):
arg = translator.translate(op.arg)
quantile = translator.translate(op.quantile)
Expand All @@ -44,6 +49,7 @@ def _quantile(translator, op: ibis_reductions.Quantile):
vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore
vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore
vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore
vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore
ibis_reductions.Quantile: _quantile, # type:ignore
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@

class GenerateArray(Unary):
    """Unary operation whose result dtype is declared as an array of int64."""

    dtype = dt.Array(dt.int64)


class SafeCastToDatetime(Unary):
    """Unary operation whose result dtype is declared as a timestamp without a timezone."""

    dtype = dt.Timestamp(timezone=None)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.