From a91823f449dba3c9e04ab47fe99e66e8b2f021bd Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 8 Nov 2023 01:05:21 +0000 Subject: [PATCH 1/3] docs: Add docstring code samples for `Series.apply` and `DataFrame.map` --- .../bigframes_vendored/pandas/core/frame.py | 57 ++++++++++++++++- .../bigframes_vendored/pandas/core/series.py | 64 +++++++++++++++++-- 2 files changed, 114 insertions(+), 7 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6f4f6be35d..022fb329ab 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2159,8 +2159,63 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: In pandas 2.1.0, DataFrame.applymap is deprecated and renamed to DataFrame.map. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> @bpd.remote_function([int], float) + ... def minutes_to_hours(x): + ... return x/60 + + >>> df_minutes = bpd.DataFrame( + ... {"system_minutes" : [0, 30, 60, 90, 120], + ... "user_minutes" : [0, 15, 75, 90, 6]}) + >>> df_minutes + system_minutes user_minutes + 0 0 0 + 1 30 15 + 2 60 75 + 3 90 90 + 4 120 6 + + [5 rows x 2 columns] + + >>> df_hours = df_minutes.map(minutes_to_hours) + >>> df_hours + system_minutes user_minutes + 0 0.0 0.0 + 1 0.5 0.25 + 2 1.0 1.25 + 3 1.5 1.5 + 4 2.0 0.1 + + [5 rows x 2 columns] + + If there are ``NA``/``None`` values in the data, you can ignore + applying the remote function on such values by specifying + ``na_action='ignore'``. + + >>> df_minutes = bpd.DataFrame( + ... { + ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA], + ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA] + ... 
}, dtype="Int64") + >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') + >>> df_hours + system_minutes user_minutes + 0 0.0 0.0 + 1 0.5 0.25 + 2 1.0 1.25 + 3 <NA> 1.5 + 4 1.5 0.1 + 5 2.0 <NA> + 6 <NA> <NA> + + [7 rows x 2 columns] + Args: - func: + func (function): Python function wrapped by ``remote_function`` decorator, returns a single value from a single value. na_action (Optional[str], default None): diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b569e5699c..c9762b9def 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -728,18 +728,70 @@ def apply( func, ) -> DataFrame | Series: """ - Invoke function on values of Series. + Invoke function on values of a Series. - Can be ufunc (a NumPy function that applies to the entire Series) - or a Python function that only works on single values. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> @bpd.remote_function([int], float) + ... def minutes_to_hours(x): + ... return x/60 + + >>> minutes = bpd.Series([0, 30, 60, 90, 120]) + >>> minutes + 0 0 + 1 30 + 2 60 + 3 90 + 4 120 + dtype: Int64 + + >>> hours = minutes.apply(minutes_to_hours) + >>> hours + 0 0.0 + 1 0.5 + 2 1.0 + 3 1.5 + 4 2.0 + dtype: Float64 + + You could turn a user defined function with external package + dependencies into a BigQuery DataFrames remote function. You would + provide the names of the packages via ``packages`` param. + + >>> @bpd.remote_function( + ... [str], + ... str, + ... bigquery_connection="bigframes-rf-conn", + ... reuse=False, + ... packages=["cryptography"], + ... ) + ... def get_hash(input): + ... from cryptography.fernet import Fernet + ... + ... # handle missing value + ... if input is None: + ... input = "" + ... + ... key = Fernet.generate_key() + ... f = Fernet(key) + ...
return f.encrypt(input.encode()).decode() + + >>> names = bpd.Series(["Alice", "Bob"]) + >>> hashes = names.apply(get_hash) Args: func (function): - Python function or NumPy ufunc to apply. + BigQuery DataFrames ``remote_function`` to apply. The function + should take a scalar and return a scalar. It will be applied to + every element in the ``Series``. Returns: - bigframes.series.Series: If func returns a Series object the result - will be a DataFrame. + bigframes.series.Series: A new Series with values representing the + return value of the ``func`` applied to each element of the original + Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 7e14da38d2ecf01b7d0a19939e623b9bc5244661 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 8 Nov 2023 06:50:36 +0000 Subject: [PATCH 2/3] improved docstring with concurrency-safe code samples --- third_party/bigframes_vendored/pandas/core/frame.py | 7 ++++++- third_party/bigframes_vendored/pandas/core/series.py | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 022fb329ab..783f2d286e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2164,7 +2164,12 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> @bpd.remote_function([int], float) + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + is created every time we run the following code, but you can skip it + to potentially reuse a previously deployed ``remote_function`` from + the same user defined function. + + >>> @bpd.remote_function([int], float, reuse=False) ... def minutes_to_hours(x): ...
return x/60 diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c9762b9def..0cab57cfc6 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -735,7 +735,12 @@ def apply( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> @bpd.remote_function([int], float) + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + is created every time we run the following code, but you can skip it + to potentially reuse a previously deployed ``remote_function`` from + the same user defined function. + + >>> @bpd.remote_function([int], float, reuse=False) ... def minutes_to_hours(x): ... return x/60 @@ -764,7 +769,6 @@ def apply( >>> @bpd.remote_function( ... [str], ... str, - ... bigquery_connection="bigframes-rf-conn", ... reuse=False, ... packages=["cryptography"], ... ) From d89b3e5b3b8d58a51445c88a53ba879463b8fc23 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 9 Nov 2023 00:15:21 +0000 Subject: [PATCH 3/3] Correct indentation of text in code samples --- .../bigframes_vendored/pandas/core/frame.py | 14 +++++++------- .../bigframes_vendored/pandas/core/series.py | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 783f2d286e..088e226c20 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2164,10 +2164,10 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Let's use ``reuse=False`` flag to make sure a new ``remote_function`` - is created every time we run the following code, but you can skip it - to potentially reuse a previously deployed ``remote_function`` from - 
the same user defined function. + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + is created every time we run the following code, but you can skip it + to potentially reuse a previously deployed ``remote_function`` from + the same user defined function. >>> @bpd.remote_function([int], float, reuse=False) ... def minutes_to_hours(x): @@ -2197,9 +2197,9 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: [5 rows x 2 columns] - If there are ``NA``/``None`` values in the data, you can ignore - applying the remote function on such values by specifying - ``na_action='ignore'``. + If there are ``NA``/``None`` values in the data, you can ignore + applying the remote function on such values by specifying + ``na_action='ignore'``. >>> df_minutes = bpd.DataFrame( ... { diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 0cab57cfc6..55c53fd1eb 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -735,10 +735,10 @@ def apply( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Let's use ``reuse=False`` flag to make sure a new ``remote_function`` - is created every time we run the following code, but you can skip it - to potentially reuse a previously deployed ``remote_function`` from - the same user defined function. + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + is created every time we run the following code, but you can skip it + to potentially reuse a previously deployed ``remote_function`` from + the same user defined function. >>> @bpd.remote_function([int], float, reuse=False) ... def minutes_to_hours(x): @@ -762,9 +762,9 @@ def apply( 4 2.0 dtype: Float64 - You could turn a user defined function with external package - dependencies into a BigQuery DataFrames remote function. 
You would - provide the names of the packages via ``packages`` param. + You could turn a user defined function with external package + dependencies into a BigQuery DataFrames remote function. You would + provide the names of the packages via ``packages`` param. >>> @bpd.remote_function( ... [str],