From 9b133fcf91dd0e04afef1935e3280170cffe2d2c Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 2 Apr 2024 19:52:51 +0000 Subject: [PATCH 1/2] fix: Restore string to date/time type coercion --- bigframes/dtypes.py | 23 ++++++++-- bigframes/operations/__init__.py | 24 ++++------ bigframes/operations/type.py | 22 +++++++--- .../system/small/operations/test_datetimes.py | 44 +++++++++++++++++++ 4 files changed, 87 insertions(+), 26 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 79e1456f31..3d8c06d188 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -648,6 +648,7 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]: def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype: + """Get the supertype of the two types.""" if dtype1 == dtype2: return dtype1 # Implicit conversion currently only supported for numeric types @@ -664,12 +665,26 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype: return hierarchy[lcd_index] -def lcd_etype(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType: - if etype1 is None: +def coerce_to_common(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType: + """Coerce types to a common type or throw a TypeError""" + if etype1 is not None and etype2 is not None: + common_supertype = lcd_type(etype1, etype2) + if common_supertype is not None: + return common_supertype + if can_coerce(etype1, etype2): return etype2 - if etype2 is None: + if can_coerce(etype2, etype1): return etype1 - return lcd_type_or_throw(etype1, etype2) + raise TypeError(f"Cannot coerce {etype1} and {etype2} to a common type.") + + +def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool: + if source_type is None: + return True # None can be coerced to any supported type + else: + return (source_type == STRING_DTYPE) and ( + target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE) + ) def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index dcd5494626..0dcc643238 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -548,16 +548,10 @@ def output_type(self, *input_types): # Binary Ops -fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COMMON_SUPERTYPE) -cliplower_op = create_binary_op( - name="clip_lower", type_signature=op_typing.COMMON_SUPERTYPE -) -clipupper_op = create_binary_op( - name="clip_upper", type_signature=op_typing.COMMON_SUPERTYPE -) -coalesce_op = create_binary_op( - name="coalesce", type_signature=op_typing.COMMON_SUPERTYPE -) +fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) +cliplower_op = create_binary_op(name="clip_lower", type_signature=op_typing.COERCE) +clipupper_op = create_binary_op(name="clip_upper", type_signature=op_typing.COERCE) +coalesce_op = create_binary_op(name="coalesce", type_signature=op_typing.COERCE) ## Math Ops @@ -575,7 +569,7 @@ def output_type(self, *input_types): right_type is None or dtypes.is_numeric(right_type) ): # Numeric addition - return dtypes.lcd_etype(left_type, right_type) + return dtypes.coerce_to_common(left_type, right_type) # TODO: Add temporal addition once delta types supported raise TypeError(f"Cannot add dtypes {left_type} and {right_type}") @@ -592,7 +586,7 @@ def output_type(self, *input_types): right_type is None or dtypes.is_numeric(right_type) ): # Numeric subtraction - return dtypes.lcd_etype(left_type, right_type) + return dtypes.coerce_to_common(left_type, right_type) # TODO: Add temporal addition once delta types supported raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") @@ -652,7 +646,7 @@ class WhereOp(TernaryOp): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if input_types[1] != dtypes.BOOL_DTYPE: raise TypeError("where condition must be a boolean") - return dtypes.lcd_etype(input_types[0], input_types[2]) + return dtypes.coerce_to_common(input_types[0], input_types[2]) where_op = WhereOp() @@ -663,8 +657,8 @@ class ClipOp(TernaryOp): name: typing.ClassVar[str] = "clip" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return dtypes.lcd_etype( - input_types[0], dtypes.lcd_etype(input_types[1], input_types[2]) + return dtypes.coerce_to_common( + input_types[0], dtypes.coerce_to_common(input_types[1], input_types[2]) ) diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index a1dc8edffc..f469070805 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -118,7 +118,7 @@ def output_type( raise TypeError(f"Type {left_type} is not numeric") if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): raise TypeError(f"Type {right_type} is not numeric") - return bigframes.dtypes.lcd_etype(left_type, right_type) + return bigframes.dtypes.coerce_to_common(left_type, right_type) @dataclasses.dataclass @@ -132,7 +132,7 @@ def output_type( raise TypeError(f"Type {left_type} is not numeric") if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): raise TypeError(f"Type {right_type} is not numeric") - lcd_type = bigframes.dtypes.lcd_etype(left_type, right_type) + lcd_type = bigframes.dtypes.coerce_to_common(left_type, right_type) if lcd_type == bigframes.dtypes.INT_DTYPE: # Real numeric ops produce floats on int input return bigframes.dtypes.FLOAT_DTYPE @@ -140,13 +140,21 @@ def output_type( @dataclasses.dataclass -class Supertype(BinaryTypeSignature): - """Type signature for functions that return a the supertype of its inputs. Currently BigFrames just supports upcasting numerics.""" +class CoerceCommon(BinaryTypeSignature): + """Attempt to coerce inputs to a compatible type.""" def output_type( self, left_type: ExpressionType, right_type: ExpressionType ) -> ExpressionType: - return bigframes.dtypes.lcd_etype(left_type, right_type) + try: + return bigframes.dtypes.coerce_to_common(left_type, right_type) + except TypeError: + pass + if bigframes.dtypes.can_coerce(left_type, right_type): + return right_type + if bigframes.dtypes.can_coerce(right_type, left_type): + return left_type + raise TypeError(f"Cannot coerce {left_type} and {right_type} to a common type.") @dataclasses.dataclass @@ -156,7 +164,7 @@ class Comparison(BinaryTypeSignature): def output_type( self, left_type: ExpressionType, right_type: ExpressionType ) -> ExpressionType: - common_type = bigframes.dtypes.lcd_etype(left_type, right_type) + common_type = CoerceCommon().output_type(left_type, right_type) if not bigframes.dtypes.is_comparable(common_type): raise TypeError(f"Types {left_type} and {right_type} are not comparable") return bigframes.dtypes.BOOL_DTYPE @@ -188,7 +196,7 @@ def output_type( BINARY_NUMERIC = BinaryNumeric() BINARY_REAL_NUMERIC = BinaryRealNumeric() COMPARISON = Comparison() -COMMON_SUPERTYPE = Supertype() +COERCE = CoerceCommon() LOGICAL = Logical() STRING_TRANSFORM = TypePreserving( bigframes.dtypes.is_string_like, description="numeric" diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index b952289a72..bf9c974884 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -303,3 +303,47 @@ def test_dt_floor(scalars_dfs, col_name, freq): pd_result.astype(scalars_df[col_name].dtype), # floor preserves type bf_result, ) + + +def test_dt_compare_coerce_str_datetime(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["datetime_col"] + bf_result = (bf_series >= "2024-01-01").to_pandas() + + pd_result = scalars_pandas_df["datetime_col"] >= pd.to_datetime("2024-01-01") + + # pandas produces pyarrow bool dtype + assert_series_equal(pd_result, bf_result, check_dtype=False) + + +def test_dt_clip_coerce_str_date(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["date_col"] + bf_result = bf_series.clip("2020-01-01", "2024-01-01").to_pandas() + + pd_result = scalars_pandas_df["date_col"].clip( + pd.to_datetime("2020-01-01"), pd.to_datetime("2024-01-01") + ) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_dt_clip_coerce_str_timestamp(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["timestamp_col"] + bf_result = bf_series.clip( + "2020-01-01T20:03:50Z", "2024-01-01T20:03:50Z" + ).to_pandas() + + pd_result = scalars_pandas_df["timestamp_col"].clip( + pd.to_datetime("2020-01-01T20:03:50Z", utc=True), + pd.to_datetime("2024-01-01T20:03:50Z", utc=True), + ) + + assert_series_equal( + pd_result, + bf_result, + ) From 52003e79e7922a0b873cdfbd294b1346c312423a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 2 Apr 2024 23:39:17 +0000 Subject: [PATCH 2/2] fix tests --- .../system/small/operations/test_datetimes.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index bf9c974884..2824e86979 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime + import pandas as pd import pytest @@ -316,13 +318,31 @@ def test_dt_compare_coerce_str_datetime(scalars_dfs): assert_series_equal(pd_result, bf_result, check_dtype=False) +def test_dt_clip_datetime_literals(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["date_col"] + bf_result = bf_series.clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ).to_pandas() + + pd_result = scalars_pandas_df["date_col"].clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ) + + assert_series_equal( + pd_result, + bf_result, + ) + + def test_dt_clip_coerce_str_date(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df["date_col"] bf_result = bf_series.clip("2020-01-01", "2024-01-01").to_pandas() + # Pandas can't coerce with pyarrow types so convert first pd_result = scalars_pandas_df["date_col"].clip( - pd.to_datetime("2020-01-01"), pd.to_datetime("2024-01-01") + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) ) assert_series_equal(