diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 1251e64fb0..1d8169960b 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1562,6 +1562,23 @@ def interpolate(self, method: str = "linear") -> DataFrame:
     def fillna(self, value=None) -> DataFrame:
         return self._apply_binop(value, ops.fillna_op, how="left")
 
+    def replace(
+        self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
+    ):
+        # A dict-like `value` is keyed by column label: columns named in it
+        # get their own replacement value, all other columns pass through.
+        if utils.is_dict_like(value):
+            return self.apply(
+                lambda x: x.replace(
+                    to_replace=to_replace, value=value[x.name], regex=regex
+                )
+                if (x.name in value)
+                else x
+            )
+        return self.apply(
+            lambda x: x.replace(to_replace=to_replace, value=value, regex=regex)
+        )
+
     def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame:
         window = bigframes.core.WindowSpec(preceding=limit, following=0)
         return self._apply_window_op(agg_ops.LastNonNullOp(), window)
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 774eb74d06..6dfcc17f37 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -14,6 +14,7 @@
 
 """Mappings for Pandas dtypes supported by BigQuery DataFrames package"""
 
+import datetime
 import textwrap
 import typing
 from typing import Any, Dict, Iterable, Literal, Tuple, Union
@@ -437,3 +438,55 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict:
                 gcb3p_pandas_helpers.bq_to_arrow_data_type(field)
             )
     return dtypes
+
+
+def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
+    """Captures whether a scalar can be losslessly represented by a dtype."""
+    # None is accepted for every dtype.
+    if scalar is None:
+        return True
+    if pd.api.types.is_bool_dtype(dtype):
+        return pd.api.types.is_bool(scalar)
+    if pd.api.types.is_float_dtype(dtype):
+        return pd.api.types.is_float(scalar)
+    if pd.api.types.is_integer_dtype(dtype):
+        return pd.api.types.is_integer(scalar)
+    if isinstance(dtype, pd.StringDtype):
+        return isinstance(scalar, str)
+    if isinstance(dtype, pd.ArrowDtype):
+        pa_type = dtype.pyarrow_dtype
+        return is_patype(scalar, pa_type)
+    return False
+
+
+def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
+    """Determine whether a scalar's type matches a given pyarrow type."""
+    if pa_type == pa.time64("us"):
+        return isinstance(scalar, datetime.time)
+    if pa_type == pa.timestamp("us"):
+        if isinstance(scalar, datetime.datetime):
+            return not scalar.tzinfo
+        if isinstance(scalar, pd.Timestamp):
+            return not scalar.tzinfo
+    if pa_type == pa.timestamp("us", tz="UTC"):
+        if isinstance(scalar, datetime.datetime):
+            return scalar.tzinfo == datetime.timezone.utc
+        if isinstance(scalar, pd.Timestamp):
+            return scalar.tzinfo == datetime.timezone.utc
+    if pa_type == pa.date32():
+        # datetime.datetime subclasses datetime.date, but its time-of-day
+        # cannot be stored in a date32 value, so it is not a match.
+        return isinstance(scalar, datetime.date) and not isinstance(
+            scalar, datetime.datetime
+        )
+    return False
+
+
+def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
+    """Whether scalar can be compared to items of dtype (though maybe requiring coercion)"""
+    if is_dtype(scalar, dtype):
+        return True
+    elif pd.api.types.is_numeric_dtype(dtype):
+        return pd.api.types.is_number(scalar)
+    else:
+        return False
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index 0655aafdb3..753870a42d 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -523,6 +523,27 @@ def _as_ibis(self, x: ibis_types.Value):
         return bigframes.dtypes.cast_ibis_value(x, self.to_type)
 
 
+class MapOp(UnaryOp):
+    """Replace values equal to a mapping key with the paired value."""
+
+    def __init__(
+        self,
+        mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...],
+    ):
+        self._mappings = mappings
+
+    def _as_ibis(self, x: ibis_types.Value):
+        # A CASE expression needs at least one WHEN branch, so an empty
+        # mapping passes the input through unchanged.
+        if not self._mappings:
+            return x
+        case = ibis.case()
+        for key, replacement in self._mappings:
+            case = case.when(x == key, replacement)
+        # Values that match no key keep their original value.
+        return case.else_(x).end()
+
+
 class FindOp(UnaryOp):
     def __init__(self, sub, start, end):
         self._sub = sub
diff --git a/bigframes/series.py b/bigframes/series.py
index 8d8c711c92..1b9982877a 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -442,42 +442,77 @@ def replace(
         self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
     ):
         if regex:
-            if not (isinstance(to_replace, str) and isinstance(value, str)):
-                raise NotImplementedError(
-                    f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}"
-                )
-            block, result_col = self._block.apply_unary_op(
-                self._value_column,
-                ops.ReplaceRegexOp(to_replace, value),
-                result_label=self.name,
-            )
-            return Series(block.select_column(result_col))
+            # No-op unless to_replace and series dtype are both string type
+            if not isinstance(to_replace, str) or not isinstance(
+                self.dtype, pandas.StringDtype
+            ):
+                return self
+            return self._regex_replace(to_replace, value)
         elif utils.is_dict_like(to_replace):
-            raise NotImplementedError(
-                f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}"
-            )
+            return self._mapping_replace(to_replace)  # type: ignore
         elif utils.is_list_like(to_replace):
-            block, cond = self._block.apply_unary_op(
-                self._value_column, ops.IsInOp(to_replace)
-            )
-            block, result_col = block.apply_binary_op(
-                cond,
-                self._value_column,
-                ops.partial_arg1(ops.where_op, value),
-                result_label=self.name,
-            )
-            return Series(block.select_column(result_col))
+            replace_list = to_replace
         else:  # Scalar
-            block, cond = self._block.apply_unary_op(
-                self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace)
+            replace_list = [to_replace]
+        replace_list = [
+            i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype)
+        ]
+        return self._simple_replace(replace_list, value) if replace_list else self
+
+    def _regex_replace(self, to_replace: str, value: str):
+        # The replacement must fit this Series' dtype; mixed types are unsupported.
+        if not bigframes.dtypes.is_dtype(value, self.dtype):
+            raise NotImplementedError(
+                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
-            block, result_col = block.apply_binary_op(
-                cond,
-                self._value_column,
-                ops.partial_arg1(ops.where_op, value),
-                result_label=self.name,
+        block, result_col = self._block.apply_unary_op(
+            self._value_column,
+            ops.ReplaceRegexOp(to_replace, value),
+            result_label=self.name,
+        )
+        return Series(block.select_column(result_col))
+
+    def _simple_replace(self, to_replace_list: typing.Sequence, value):
+        # The replacement must fit this Series' dtype; mixed types are unsupported.
+        if not bigframes.dtypes.is_dtype(value, self.dtype):
+            raise NotImplementedError(
+                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
-            return Series(block.select_column(result_col))
+
+        block, cond = self._block.apply_unary_op(
+            self._value_column, ops.IsInOp(to_replace_list)
+        )
+        block, result_col = block.apply_binary_op(
+            cond,
+            self._value_column,
+            ops.partial_arg1(ops.where_op, value),
+            result_label=self.name,
+        )
+        return Series(block.select_column(result_col))
+
+    def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]):
+        # Keys that cannot be compared with this dtype can never match, so
+        # they are skipped; replacement values must fit the dtype.
+        tuples = []
+        for key, value in mapping.items():
+            if not bigframes.dtypes.is_comparable(key, self.dtype):
+                continue
+            if not bigframes.dtypes.is_dtype(value, self.dtype):
+                raise NotImplementedError(
+                    f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
+                )
+            tuples.append((key, value))
+
+        # Nothing left to replace: no-op, like the scalar and list paths.
+        if not tuples:
+            return self
+
+        block, result = self._block.apply_unary_op(
+            self._value_column,
+            ops.MapOp(tuple(tuples)),
+            result_label=self.name,
+        )
+        return Series(block.select_column(result))
 
     def interpolate(self, method: str = "linear") -> Series:
         if method == "pad":
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index ab68543d91..ed78e73e5d 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_df_replace_scalar_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas()
+    pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!")
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_regex_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas()
+    pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True)
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_list_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas()
+    pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!")
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_value_dict(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas()
+    pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200})
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
 def test_df_ffill(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index c082b87336..00be9e5e9e 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4356,6 +4356,94 @@ def fillna(self, value):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def replace(
+        self,
+        to_replace,
+        value=None,
+        *,
+        regex=False,
+    ):
+        """
+        Replace values given in `to_replace` with `value`.
+
+        Values of the Series/DataFrame are replaced with other values dynamically.
+        This differs from updating with ``.loc`` or ``.iloc``, which require
+        you to specify a location to update with some value.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'int_col': [1, 1, 2, 3],
+            ...     'string_col': ["a", "b", "c", "b"],
+            ... })
+
+        Using scalar `to_replace` and `value`:
+
+            >>> df.replace("b", "e")
+               int_col string_col
+            0        1          a
+            1        1          e
+            2        2          c
+            3        3          e
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+        Using dictionary:
+
+            >>> df.replace({"a": "e", 2: 5})
+               int_col string_col
+            0        1          e
+            1        1          b
+            2        5          c
+            3        3          b
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+        Using regex:
+
+            >>> df.replace("[ab]", "e", regex=True)
+               int_col string_col
+            0        1          e
+            1        1          e
+            2        2          c
+            3        3          e
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+
+        Args:
+            to_replace (str, regex, list, dict, int, float or None):
+                How to find the values that will be replaced.
+                numeric: numeric values equal to `to_replace` will be replaced with `value`
+                str: string exactly matching `to_replace` will be replaced with `value`
+                regex: regexs matching `to_replace` will be replaced with `value`
+                list of str, regex, or numeric:
+                    First, if `to_replace` and `value` are both lists, they **must** be the same length.
+                    Second, if ``regex=True`` then all of the strings in **both**
+                    lists will be interpreted as regexs otherwise they will match
+                    directly. This doesn't matter much for `value` since there
+                    are only a few possible substitution regexes you can use.
+                    str, regex and numeric rules apply as above.
+
+            value (scalar, default None):
+                Value to replace any values matching `to_replace` with.
+                For a DataFrame a dict of values can be used to specify which
+                value to use for each column (columns not in the dict will not be
+                filled). Regular expressions, strings and lists or dicts of such
+                objects are also allowed.
+            regex (bool, default False):
+                Whether to interpret `to_replace` and/or `value` as regular
+                expressions. If this is ``True`` then `to_replace` *must* be a
+                string.
+
+        Returns:
+            Series/DataFrame: Object after replacement.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     @property
     def iloc(self):
         """Purely integer-location based indexing for selection by position."""