From a280ff18083232fb02f42a4582c7428297e2dd26 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 9 Apr 2024 23:30:17 +0000 Subject: [PATCH 1/6] feat: Add hasnans, combine_first, update to Series --- bigframes/core/convert.py | 4 +- bigframes/series.py | 21 ++- tests/system/small/test_series.py | 50 +++++++ .../bigframes_vendored/pandas/core/series.py | 138 ++++++++++++++++++ 4 files changed, 211 insertions(+), 2 deletions(-) diff --git a/bigframes/core/convert.py b/bigframes/core/convert.py index 98f854ad72..268460298a 100644 --- a/bigframes/core/convert.py +++ b/bigframes/core/convert.py @@ -13,13 +13,15 @@ # limitations under the License. from __future__ import annotations +from typing import Optional + import pandas as pd import bigframes.core.indexes as index import bigframes.series as series -def to_bf_series(obj, default_index: index.Index) -> series.Series: +def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series: if isinstance(obj, series.Series): return obj if isinstance(obj, pd.Series): diff --git a/bigframes/series.py b/bigframes/series.py index 185891bc01..2b5abb82cd 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -22,7 +22,7 @@ import os import textwrap import typing -from typing import Any, Literal, Mapping, Optional, Tuple, Union +from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -130,6 +130,11 @@ def ndim(self) -> int: def empty(self) -> bool: return self.shape[0] == 0 + @property + def hasnans(self) -> bool: + # Note, hasnans is actually a null check, and NaNs don't count for nullable float + return self.isnull().any() + @property def values(self) -> numpy.ndarray: return self.to_numpy() @@ -753,6 +758,20 @@ def __matmul__(self, other): dot = __matmul__ + def combine_first(self, other: Series) -> Series: + result = self._apply_binary_op(other, ops.coalesce_op) + 
result.name = self.name + return result + + def update(self, other: Union[Series | Sequence | Mapping]) -> None: + import bigframes.core.convert + + other = bigframes.core.convert.to_bf_series(other, default_index=None) + result = self._apply_binary_op( + other, ops.coalesce_op, reverse=True, alignment="left" + ) + self._set_block(result._get_block()) + def abs(self) -> Series: return self._apply_unary_op(ops.abs_op) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c882677508..8436ce625e 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1261,6 +1261,38 @@ def test_binop_right_filtered(scalars_dfs): ) +def test_series_combine_first(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7) + bf_result = int64_col.combine_first(float64_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7) + pd_result = pd_int64_col.combine_first(pd_float64_col) + + assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_update(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + int64_col = scalars_df["int64_col"].head(7) + float64_col = scalars_df["float64_col"].tail(7) + float64_col.update(int64_col) + + pd_int64_col = scalars_pandas_df["int64_col"].head(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7) + pd_float64_col.update(pd_int64_col) + + assert_series_equal( + float64_col.to_pandas(), + pd_float64_col, + ) + + def test_mean(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -1649,6 +1681,24 @@ def test_size(scalars_dfs): assert pd_result == bf_result +def test_series_hasnans_true(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].hasnans + pd_result = scalars_pandas_df["string_col"].hasnans + + assert 
pd_result == bf_result + + +def test_series_hasnans_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["string_col"].dropna().hasnans + pd_result = scalars_pandas_df["string_col"].dropna().hasnans + + assert pd_result == bf_result + + def test_empty_false(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a75d6c2167..c6a806f8e0 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -175,6 +175,31 @@ def name(self) -> Hashable: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def hasnans(self) -> bool: + """ + Return True if there are any NaNs. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3, None]) + >>> s + 0 1.0 + 1 2.0 + 2 3.0 + 3 <NA> + dtype: Float64 + >>> s.hasnans + True + + Returns: + bool + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def T(self) -> Series: """Return the transpose, which is by definition self. @@ -2343,6 +2368,119 @@ def rdivmod(self, other) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine_first(self, other) -> Series: + """ + Update null elements with value in the same location in 'other'. + + Combine two Series objects by filling null values in one Series with + non-null values from the other Series. Result index will be the union + of the two indexes.
+ + **Examples:** + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series([1, np.nan]) + >>> s2 = bpd.Series([3, 4, 5]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + 2 5.0 + dtype: Float64 + + Null values still persist if the location of that null value + does not exist in `other` + + >>> s1 = bpd.Series({'falcon': np.nan, 'eagle': 160.0}) + >>> s2 = bpd.Series({'eagle': 200.0, 'duck': 30.0}) + >>> s1.combine_first(s2) + falcon <NA> + eagle 160.0 + duck 30.0 + dtype: Float64 + + Args: + other (Series): + The value(s) to be used for filling null values. + + Returns: + Series: The result of combining the provided Series with the other object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def update(self, other: Series | Sequence | Mapping) -> None: + """ + Modify Series in place using values from passed Series. + + Uses non-NA values from passed Series to make updates. Aligns + on index. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: Int64 + + >>> s = bpd.Series(['a', 'b', 'c']) + >>> s.update(bpd.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: string + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: Int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series.
+ + >>> s = bpd.Series([1, 2, 3]) + >>> s.update(bpd.Series([4, np.nan, 6], dtype=pd.Int64Dtype())) + >>> s + 0 4 + 1 2 + 2 6 + dtype: Int64 + + ``other`` can also be a non-Series object type + that is coercible into a Series + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update([4, np.nan, 6]) + >>> s + 0 4.0 + 1 2.0 + 2 6.0 + dtype: Float64 + + >>> s = bpd.Series([1, 2, 3]) + >>> s.update({1: 9}) + >>> s + 0 1 + 1 9 + 2 3 + dtype: Int64 + + Args: + other (Series, or object coercible into Series) + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def all( self, ): From 9f8b033aa6e256d44e110144b4cbd580238dcd2a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 10 Apr 2024 00:12:50 +0000 Subject: [PATCH 2/6] fix type annotation for series.update --- bigframes/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index 2b5abb82cd..b975979eaf 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -763,7 +763,7 @@ def combine_first(self, other: Series) -> Series: result.name = self.name return result - def update(self, other: Union[Series | Sequence | Mapping]) -> None: + def update(self, other: Union[Series, Sequence, Mapping]) -> None: import bigframes.core.convert other = bigframes.core.convert.to_bf_series(other, default_index=None) From 4bac86232873f48f03043345a980e373e2ce5760 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 10 Apr 2024 00:33:20 +0000 Subject: [PATCH 3/6] fix type annoation --- third_party/bigframes_vendored/pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c6a806f8e0..572f29ff17 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2409,7 +2409,7 @@ def combine_first(self, other) -> Series: """ raise 
NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def update(self, other: Series | Sequence | Mapping) -> None: + def update(self, other) -> None: """ Modify Series in place using values from passed Series. From c6115ad22568e05d4d3b893cb891b1272ef283c4 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 10 Apr 2024 16:51:57 +0000 Subject: [PATCH 4/6] skip combine_first test for legacy pandas --- tests/system/small/test_series.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 8436ce625e..d995ceed67 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1261,6 +1261,7 @@ def test_binop_right_filtered(scalars_dfs): ) +@skip_legacy_pandas def test_series_combine_first(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs int64_col = scalars_df["int64_col"].head(7) From b99f0d660386c66bf222f7b2252a6d00d63fca29 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 10 Apr 2024 20:36:18 +0000 Subject: [PATCH 5/6] add docstrings to conversion utils --- bigframes/core/convert.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/bigframes/core/convert.py b/bigframes/core/convert.py index 268460298a..1ef329b0c7 100644 --- a/bigframes/core/convert.py +++ b/bigframes/core/convert.py @@ -22,6 +22,18 @@ def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series: + """ + Convert an object to a bigframes series + + Args: + obj (list-like or Series): + Object to convert to bigframes Series + default_index (list-like or Index or None): + Index to use if obj has no index + + Returns: + bigframes.pandas.Series + """ if isinstance(obj, series.Series): return obj if isinstance(obj, pd.Series): @@ -37,6 +49,18 @@ def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series: def to_pd_series(obj, default_index: pd.Index) -> pd.Series: + """ + Convert an object to a pandas series + + Args:
+ obj (list-like or Series): + Object to convert to pandas Series + default_index (list-like or Index or None): + Index to use if obj has no index + + Returns: + pandas.Series + """ if isinstance(obj, series.Series): return obj.to_pandas() if isinstance(obj, pd.Series): From e74df0b33a922f76509b79eb811feffe4c03fc68 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 10 Apr 2024 22:38:51 +0000 Subject: [PATCH 6/6] fix update test side effect --- tests/system/small/test_series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d995ceed67..c93af1bf2f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1281,11 +1281,11 @@ def test_series_combine_first(scalars_dfs): def test_series_update(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs int64_col = scalars_df["int64_col"].head(7) - float64_col = scalars_df["float64_col"].tail(7) + float64_col = scalars_df["float64_col"].tail(7).copy() float64_col.update(int64_col) pd_int64_col = scalars_pandas_df["int64_col"].head(7) - pd_float64_col = scalars_pandas_df["float64_col"].tail(7) + pd_float64_col = scalars_pandas_df["float64_col"].tail(7).copy() pd_float64_col.update(pd_int64_col) assert_series_equal(