From 377b8887285b91e7d512719f831c4d48fa4473e8 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 7 Mar 2024 20:28:52 +0000 Subject: [PATCH 1/6] feat: add DataFrame.pipe() method --- tests/system/small/test_dataframe.py | 25 +++++++++++ .../bigframes_vendored/pandas/core/common.py | 44 +++++++++++++++++++ .../bigframes_vendored/pandas/core/generic.py | 32 +++++++++++++- 3 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 third_party/bigframes_vendored/pandas/core/common.py diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9f4e138b73..4a75040513 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -983,6 +983,31 @@ def test_apply_series_scalar_callable( pandas.testing.assert_series_equal(bf_result, pd_result) +def test_df_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x.cumsum()) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x.cumsum()) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + def test_df_keys( scalars_df_index, scalars_pandas_df_index, diff --git a/third_party/bigframes_vendored/pandas/core/common.py b/third_party/bigframes_vendored/pandas/core/common.py new file mode 100644 index 0000000000..f7706853b1 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/common.py @@ -0,0 +1,44 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/common.py +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from third_party.bigframes_vendored.pandas.pandas._typing import T + + +def pipe( + obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs +) -> T: + """ + 
Apply a function ``func`` to object ``obj`` either by passing obj as the + first argument to the function or, in the case that the func is a tuple, + interpret the first element of the tuple as a function and pass the obj to + that function as a keyword argument whose key is the value of the second + element of the tuple. + + Parameters + ---------- + func : callable or tuple of (callable, str) + Function to apply to this object or, alternatively, a + ``(callable, data_keyword)`` tuple where ``data_keyword`` is a + string indicating the keyword of ``callable`` that expects the + object. + *args : iterable, optional + Positional arguments passed into ``func``. + **kwargs : dict, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + """ + if isinstance(func, tuple): + func, target = func + if target in kwargs: + msg = f"{target} is both the pipe target and a keyword argument" + raise ValueError(msg) + kwargs[target] = obj + return func(*args, **kwargs) + else: + return func(obj, *args, **kwargs) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index b55c7e23d8..36fdee7fcb 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -1,10 +1,14 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/generic.py from __future__ import annotations -from typing import Iterator, Literal, Optional +from typing import Callable, Iterator, Literal, Optional, TYPE_CHECKING from bigframes import constants from third_party.bigframes_vendored.pandas.core import indexing +import third_party.bigframes_vendored.pandas.core.common as common + +if TYPE_CHECKING: + from third_party.bigframes_vendored.pandas.pandas._typing import T class NDFrame(indexing.IndexingMixin): @@ -962,6 +966,32 @@ def expanding(self, min_periods=1): """ raise 
NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pipe( + self, + func: Callable[..., T] | tuple[Callable[..., T], str], + *args, + **kwargs, + ) -> T: + """ + Apply chainable functions that expect Series or DataFrames. + + Args: + func (function): + Function to apply to the {klass}. + ``args``, and ``kwargs`` are passed into ``func``. + Alternatively a ``(callable, data_keyword)`` tuple where + ``data_keyword`` is a string indicating the keyword of + ``callable`` that expects the {klass}. + *args (iterable, optional): + Positional arguments passed into ``func``. + **kwargs (mapping, optional): + A dictionary of keyword arguments passed into ``func``. + + Returns: + same type as caller + """ + return common.pipe(self, func, *args, **kwargs) + def __nonzero__(self): raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " From d93de42994843d5135909bae675413a1129c118b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 7 Mar 2024 21:08:35 +0000 Subject: [PATCH 2/6] make pipe test compatible with legacy sql --- tests/system/small/test_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 4a75040513..3e8ee82f31 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -995,14 +995,14 @@ def foo(x: int, y: int, df): bf_result = ( scalars_df_index[columns] .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x.cumsum()) + .pipe(lambda x: x**2) .to_pandas() ) pd_result = ( scalars_pandas_df_index[columns] .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x.cumsum()) + .pipe(lambda x: x**2) ) pandas.testing.assert_frame_equal(bf_result, pd_result) From 43a406ec123ab16f5cebbbb9a4ff93dd692f8498 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 8 Mar 2024 18:38:14 +0000 Subject: [PATCH 3/6] add examples to pipe docstring --- .../bigframes_vendored/pandas/core/generic.py | 77 
++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 36fdee7fcb..80fc402b6e 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -977,11 +977,11 @@ def pipe( Args: func (function): - Function to apply to the {klass}. + Function to apply to this object. ``args``, and ``kwargs`` are passed into ``func``. Alternatively a ``(callable, data_keyword)`` tuple where ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects the {klass}. + ``callable`` that expects this object. *args (iterable, optional): Positional arguments passed into ``func``. **kwargs (mapping, optional): @@ -989,6 +989,79 @@ def pipe( Returns: same type as caller + + **Examples:** + + Constructing a income DataFrame from a dictionary. + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] + >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) + >>> df + Salary Others + 0 8000 1000.0 + 1 9500 + 2 5000 2000.0 + + [3 rows x 2 columns] + + Functions that perform tax reductions on an income DataFrame. + + >>> def subtract_federal_tax(df): + ... return df * 0.9 + >>> def subtract_state_tax(df, rate): + ... return df * (1 - rate) + >>> def subtract_national_insurance(df, rate, rate_increase): + ... new_rate = rate + rate_increase + ... return df * (1 - new_rate) + + Instead of writing + + >>> subtract_national_insurance( + ... subtract_state_tax(subtract_federal_tax(df), rate=0.12), + ... rate=0.05, + ... rate_increase=0.02) # doctest: +SKIP + + You can write + + >>> ( + ... df.pipe(subtract_federal_tax) + ... .pipe(subtract_state_tax, rate=0.12) + ... .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02) + ... 
) + Salary Others + 0 5892.48 736.56 + 1 6997.32 + 2 3682.8 1473.12 + + [3 rows x 2 columns] + + If you have a function that takes the data as (say) the second + argument, pass a tuple indicating which keyword expects the + data. For example, suppose ``subtract_national_insurance`` takes its data as ``df`` + in the second argument: + + >>> def subtract_national_insurance(rate, df, rate_increase): + ... new_rate = rate + rate_increase + ... return df * (1 - new_rate) + >>> ( + ... df.pipe(subtract_federal_tax) + ... .pipe(subtract_state_tax, rate=0.12) + ... .pipe( + ... (subtract_national_insurance, 'df'), + ... rate=0.05, + ... rate_increase=0.02 + ... ) + ... ) + Salary Others + 0 5892.48 736.56 + 1 6997.32 + 2 3682.8 1473.12 + + [3 rows x 2 columns] """ return common.pipe(self, func, *args, **kwargs) From 68c317076c1c016c37785be7099b7f417b85b409 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 8 Mar 2024 18:41:34 +0000 Subject: [PATCH 4/6] add series.pipe test --- tests/system/small/test_series.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 37b4f8c1de..f5c5b1c216 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3203,3 +3203,28 @@ def test_apply_not_supported(scalars_dfs, col, lambda_, exception): bf_col = scalars_df[col] with pytest.raises(exception): bf_col.apply(lambda_, by_row=False) + + +def test_series_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + column = "int64_too" + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[column] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[column] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + ) + + assert_series_equal(bf_result, pd_result) From e76f60c56c33b85322ed98e786a103ed432d52ec Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 8 
Mar 2024 23:14:10 +0000 Subject: [PATCH 5/6] fix docstring render issues --- .../bigframes_vendored/pandas/core/generic.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 80fc402b6e..29d6004c31 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -975,21 +975,6 @@ def pipe( """ Apply chainable functions that expect Series or DataFrames. - Args: - func (function): - Function to apply to this object. - ``args``, and ``kwargs`` are passed into ``func``. - Alternatively a ``(callable, data_keyword)`` tuple where - ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects this object. - *args (iterable, optional): - Positional arguments passed into ``func``. - **kwargs (mapping, optional): - A dictionary of keyword arguments passed into ``func``. - - Returns: - same type as caller - **Examples:** Constructing a income DataFrame from a dictionary. @@ -1062,6 +1047,21 @@ def pipe( 2 3682.8 1473.12 [3 rows x 2 columns] + + Args: + func (function): + Function to apply to this object. + ``args``, and ``kwargs`` are passed into ``func``. + Alternatively a ``(callable, data_keyword)`` tuple where + ``data_keyword`` is a string indicating the keyword of + ``callable`` that expects this object. + args (iterable, optional): + Positional arguments passed into ``func``. + kwargs (mapping, optional): + A dictionary of keyword arguments passed into ``func``. 
+ + Returns: + same type as caller """ return common.pipe(self, func, *args, **kwargs) From 88151670adbe7feb52f4cba4a1c6897b5a783f28 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 14 Mar 2024 19:26:15 +0000 Subject: [PATCH 6/6] fix vendored imports --- .../bigframes_vendored/pandas/core/common.py | 28 +++++++++---------- .../bigframes_vendored/pandas/core/generic.py | 4 +-- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/common.py b/third_party/bigframes_vendored/pandas/core/common.py index f7706853b1..ded5a22b8f 100644 --- a/third_party/bigframes_vendored/pandas/core/common.py +++ b/third_party/bigframes_vendored/pandas/core/common.py @@ -4,7 +4,7 @@ from typing import Callable, TYPE_CHECKING if TYPE_CHECKING: - from third_party.bigframes_vendored.pandas.pandas._typing import T + from bigframes_vendored.pandas.pandas._typing import T def pipe( @@ -17,21 +17,19 @@ def pipe( that function as a keyword argument whose key is the value of the second element of the tuple. - Parameters - ---------- - func : callable or tuple of (callable, str) - Function to apply to this object or, alternatively, a - ``(callable, data_keyword)`` tuple where ``data_keyword`` is a - string indicating the keyword of ``callable`` that expects the - object. - *args : iterable, optional - Positional arguments passed into ``func``. - **kwargs : dict, optional - A dictionary of keyword arguments passed into ``func``. + Args: + func (callable or tuple of (callable, str)): + Function to apply to this object or, alternatively, a + ``(callable, data_keyword)`` tuple where ``data_keyword`` is a + string indicating the keyword of ``callable`` that expects the + object. + args (iterable, optional): + Positional arguments passed into ``func``. + kwargs (dict, optional): + A dictionary of keyword arguments passed into ``func``. - Returns - ------- - object : the return type of ``func``. + Returns: + object: the return type of ``func``. 
""" if isinstance(func, tuple): func, target = func diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 4ca3bd77df..7f8e1f7b53 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -9,7 +9,7 @@ from bigframes import constants if TYPE_CHECKING: - from third_party.bigframes_vendored.pandas.pandas._typing import T + from bigframes_vendored.pandas.pandas._typing import T class NDFrame(indexing.IndexingMixin): @@ -987,7 +987,7 @@ def pipe( >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] >>> df = bpd.DataFrame(data, columns=['Salary', 'Others']) >>> df - Salary Others + Salary Others 0 8000 1000.0 1 9500 2 5000 2000.0