diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index 30c7902981..4c30d7631d 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -25,6 +25,48 @@
 import bigframes.operations.aggregations as agg_ops
 
 
+def equals(block1: blocks.Block, block2: blocks.Block) -> bool:
+    """Check whether two blocks hold the same labels, dtypes and values.
+
+    Nulls at the same position count as equal (see ``ops.eq_nulls_match_op``).
+    Label and dtype mismatches return early; otherwise the data is compared.
+    """
+    if not block1.column_labels.equals(block2.column_labels):
+        return False
+    if block1.dtypes != block2.dtypes:
+        return False
+    # TODO: More advanced expression tree traversals to short circuit actually querying data
+
+    # Presumably turns the index into regular columns so that it is compared
+    # along with the values and both sides join on a fresh default index.
+    block1 = block1.reset_index(drop=False)
+    block2 = block2.reset_index(drop=False)
+
+    joined, (lmap, rmap) = block1.index.join(block2.index, how="outer")
+    joined_block = joined._block
+
+    equality_ids = []
+    for lcol, rcol in zip(block1.value_columns, block2.value_columns):
+        lcolmapped = lmap(lcol)
+        rcolmapped = rmap(rcol)
+        joined_block, result_id = joined_block.apply_binary_op(
+            lcolmapped, rcolmapped, ops.eq_nulls_match_op
+        )
+        joined_block, result_id = joined_block.apply_unary_op(
+            result_id, ops.partial_right(ops.fillna_op, False)
+        )
+        equality_ids.append(result_id)
+
+    joined_block = joined_block.select_columns(equality_ids).with_column_labels(
+        list(range(len(equality_ids)))
+    )
+    # Fold the per-column flags into a single column so that one "all"
+    # aggregation yields the final answer.
+    stacked_block = joined_block.stack(dropna=False, sort=False)
+    result = stacked_block.get_stat(stacked_block.value_columns[0], agg_ops.all_op)
+    return typing.cast(bool, result)
+
+
 def indicate_duplicates(
     block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
 ) -> typing.Tuple[blocks.Block, str]:
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 0492e62c15..87d0f21b62 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1066,6 +1066,12 @@ def rename_axis(
             labels = [mapper]
         return DataFrame(self._block.with_index_labels(labels))
 
+    def equals(self, other: typing.Union[bigframes.series.Series, DataFrame]) -> bool:
+        # Must be same object type, same column dtypes, and same label values
+        if not isinstance(other, DataFrame):
+            return False
+        return block_ops.equals(self._block, other._block)
+
     def assign(self, **kwargs) -> DataFrame:
         # TODO(garrettwu) Support list-like values. Requires ordering.
         # TODO(garrettwu) Support callable values.
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index bc08298eb7..f330a703b2 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -705,6 +705,19 @@ def eq_op(
     return x == y
 
 
+def eq_nulls_match_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    """Variant of eq_op where nulls match each other. Only use where dtypes are known to be same."""
+    # Compare nullness explicitly so a real string value that happens to equal
+    # the sentinel text is never treated as matching NULL.
+    sentinel = ibis_types.literal("$NULL_SENTINEL$")
+    left = x.cast(ibis_dtypes.str).fillna(sentinel)
+    right = y.cast(ibis_dtypes.str).fillna(sentinel)
+    return (x.isnull() == y.isnull()) & (left == right)
+
+
 def ne_op(
     x: ibis_types.Value,
     y: ibis_types.Value,
diff --git a/bigframes/series.py b/bigframes/series.py
index 717a85a93e..e5afe91e44 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -209,6 +209,14 @@ def rename_axis(
             labels = [mapper]
         return Series(self._block.with_index_labels(labels))
 
+    def equals(
+        self, other: typing.Union[Series, bigframes.dataframe.DataFrame]
+    ) -> bool:
+        # Must be same object type, same column dtypes, and same label values
+        if not isinstance(other, Series):
+            return False
+        return block_ops.equals(self._block, other._block)
+
     def reset_index(
         self,
         *,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 19ea9b8ae5..711da10c55 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -2551,6 +2551,74 @@ def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index):
     )
 
 
+def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index):
+    unsupported = [
+        "geography_col",
+    ]
+    scalars_df_index = scalars_df_index.drop(columns=unsupported)
+    scalars_pandas_df_index = scalars_pandas_df_index.drop(columns=unsupported)
+
+    bf_result = scalars_df_index.equals(scalars_df_index)
+    pd_result = scalars_pandas_df_index.equals(scalars_pandas_df_index)
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_series(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index[["int64_col"]].equals(scalars_df_index["int64_col"])
+    pd_result = scalars_pandas_df_index[["int64_col"]].equals(
+        scalars_pandas_df_index["int64_col"]
+    )
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
+    columns = ["int64_col", "int64_too"]
+    scalars_df_index = scalars_df_index[columns]
+    scalars_pandas_df_index = scalars_pandas_df_index[columns]
+
+    bf_modified = scalars_df_index.copy()
+    bf_modified = bf_modified.astype("Float64")
+
+    pd_modified = scalars_pandas_df_index.copy()
+    pd_modified = pd_modified.astype("Float64")
+
+    bf_result = scalars_df_index.equals(bf_modified)
+    pd_result = scalars_pandas_df_index.equals(pd_modified)
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index):
+    columns = ["int64_col", "int64_too"]
+    scalars_df_index = scalars_df_index[columns]
+    scalars_pandas_df_index = scalars_pandas_df_index[columns]
+
+    bf_modified = scalars_df_index.copy()
+    bf_modified["int64_col"] = bf_modified.int64_col + 1
+
+    pd_modified = scalars_pandas_df_index.copy()
+    pd_modified["int64_col"] = pd_modified.int64_col + 1
+
+    bf_result = scalars_df_index.equals(bf_modified)
+    pd_result = scalars_pandas_df_index.equals(pd_modified)
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index):
+    columns = ["int64_col", "int64_too"]
+    more_columns = ["int64_col", "int64_too", "float64_col"]
+
+    bf_result = scalars_df_index[columns].equals(scalars_df_index[more_columns])
+    pd_result = scalars_pandas_df_index[columns].equals(
+        scalars_pandas_df_index[more_columns]
+    )
+
+    assert pd_result == bf_result
+
+
 def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index):
     reindex_target_bf = scalars_df_index.reindex(
         columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 8c1c36720b..993df18c95 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -112,6 +112,44 @@ def test_series_get_column_default(scalars_dfs):
     assert result == "default_val"
 
 
+def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.int64_col.equals(scalars_df_index.int64_col)
+    pd_result = scalars_pandas_df_index.int64_col.equals(
+        scalars_pandas_df_index.int64_col
+    )
+
+    assert pd_result == bf_result
+
+
+def test_series_equals_df(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index["int64_col"].equals(scalars_df_index[["int64_col"]])
+    pd_result = scalars_pandas_df_index["int64_col"].equals(
+        scalars_pandas_df_index[["int64_col"]]
+    )
+
+    assert pd_result == bf_result
+
+
+def test_series_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
+    bf_series = scalars_df_index["int64_col"]
+    pd_series = scalars_pandas_df_index["int64_col"]
+
+    bf_result = bf_series.equals(bf_series.astype("Float64"))
+    pd_result = pd_series.equals(pd_series.astype("Float64"))
+
+    assert pd_result == bf_result
+
+
+def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_index):
+    bf_series = scalars_df_index["int64_col"]
+    pd_series = scalars_pandas_df_index["int64_col"]
+
+    bf_result = bf_series.equals(bf_series + 1)
+    pd_result = pd_series.equals(pd_series + 1)
+
+    assert pd_result == bf_result
+
+
 def test_series_get_with_default_index(scalars_dfs):
     col_name = "float64_col"
     key = 2
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 17d941fbdd..644e043e83 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -399,6 +399,28 @@ def to_orc(self, path=None, **kwargs) -> bytes | None:
     # ----------------------------------------------------------------------
     # Unsorted
 
+    def equals(self, other) -> bool:
+        """
+        Test whether two objects contain the same elements.
+
+        This function allows two Series or DataFrames to be compared against
+        each other to see if they have the same shape and elements. NaNs in
+        the same location are considered equal.
+
+        The row/column index do not need to have the same type, as long
+        as the values are considered equal. Corresponding columns must be of
+        the same dtype.
+
+        Args:
+            other (Series or DataFrame):
+                The other Series or DataFrame to be compared with the first.
+
+        Returns:
+            bool: True if all elements are the same in both objects, False
+                otherwise.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def assign(self, **kwargs) -> DataFrame:
         r"""
         Assign new columns to a DataFrame.