diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 1990647e0a..398c6ab26a 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,7 +15,7 @@ import functools import typing -from typing import Sequence +from typing import Optional, Sequence import bigframes_vendored.constants as constants import pandas as pd @@ -488,11 +488,19 @@ def dropna( block: blocks.Block, column_ids: typing.Sequence[str], how: typing.Literal["all", "any"] = "any", + subset: Optional[typing.Sequence[str]] = None, ): """ Drop na entries from block """ - predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids] + if subset is None: + subset = column_ids + + predicates = [ + ops.notnull_op.as_expr(column_id) + for column_id in column_ids + if column_id in subset + ] if len(predicates) == 0: return block if how == "any": diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c4597ab843..c2fb9336f3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2027,8 +2027,9 @@ def dropna( self, *, axis: int | str = 0, - inplace: bool = False, how: str = "any", + subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: if inplace: @@ -2040,8 +2041,25 @@ def dropna( axis_n = utils.get_axis_number(axis) + if subset is not None and axis_n != 0: + raise NotImplementedError( + f"subset only supported when axis=0. {constants.FEEDBACK_LINK}" + ) + if axis_n == 0: - result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore + # subset needs to be converted into column IDs, not column labels. 
+ if subset is None: + subset_ids = None + elif not utils.is_list_like(subset): + subset_ids = [id_ for id_ in self._block.label_to_col_id[subset]] + else: + subset_ids = [ + id_ + for label in subset + for id_ in self._block.label_to_col_id[label] + ] + + result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index fe63a1ed28..b4c81bfbef 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs): @skip_legacy_pandas @pytest.mark.parametrize( - ("axis", "how", "ignore_index"), + ("axis", "how", "ignore_index", "subset"), [ - (0, "any", False), - (0, "any", True), - (1, "any", False), - (1, "all", False), + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index): +def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index) + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pd.Int64Dtype()) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 6370d1b987..560c0cf0f4 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -20,6 +20,15 @@ from . import resources +def test_dataframe_dropna_axis_1_subset_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = resources.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="subset"): + dataframe.dropna(axis=1, subset=["col1", "col2"]) + + def test_dataframe_repr_with_uninitialized_object(): """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing. diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e7f555c729..970883257c 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1614,6 +1614,8 @@ def dropna( *, axis: int | str = 0, how: str = "any", + subset=None, + inplace: bool = False, ignore_index=False, ) -> DataFrame: """Remove missing values. @@ -1662,6 +1664,15 @@ def dropna( <BLANKLINE> [3 rows x 3 columns] + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'toy']) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip <NA> + <BLANKLINE> + [2 rows x 3 columns] + Args: axis ({0 or 'index', 1 or 'columns'}, default 'columns'): Determine if rows or columns which contain missing values are @@ -1675,6 +1686,12 @@ def dropna( * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. + subset (column label or sequence of labels, optional): + Labels along other axis to consider, e.g. if you are dropping + rows these would be a list of columns to include. + Only supports axis=0. + inplace (bool, default ``False``): + Not supported. 
ignore_index (bool, default ``False``): If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.