From a747fb90567ba07e4ef133a858819c294a0e43d2 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 4 Sep 2024 02:31:16 +0000 Subject: [PATCH 1/5] fix: support `read_gbq_function` for axis=1 application --- .../functions/_remote_function_session.py | 2 +- bigframes/functions/remote_function.py | 10 ++++++ bigframes/pandas/__init__.py | 3 +- bigframes/session/__init__.py | 35 +++++++++++++++++++ tests/system/large/test_remote_function.py | 14 ++++++++ tests/system/small/test_remote_function.py | 9 ++++- 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_remote_function_session.py index 0ab19ca353..c69e430836 100644 --- a/bigframes/functions/_remote_function_session.py +++ b/bigframes/functions/_remote_function_session.py @@ -176,7 +176,7 @@ def remote_function( getting and setting IAM roles on cloud resources. If this param is not provided then resource manager client from the session would be used. - dataset (str, Optional.): + dataset (str, Optional): Dataset in which to create a BigQuery remote function. It should be in `.` or `` format. If this parameter is not provided then session dataset id is used. diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 7e9df74e76..39e3bfd8f0 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -14,6 +14,7 @@ from __future__ import annotations +import inspect import logging from typing import cast, Optional, TYPE_CHECKING import warnings @@ -107,6 +108,7 @@ def read_gbq_function( function_name: str, *, session: Session, + is_row_processor: bool = False, ): """ Read an existing BigQuery function and prepare it for use in future queries. @@ -149,6 +151,13 @@ def func(*ignored_args, **ignored_kwargs): expr = node(*ignored_args, **ignored_kwargs) # type: ignore return ibis_client.execute(expr) + func.__signature__ = inspect.signature(func).replace( # type: ignore + parameters=[ + inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) + for name in ibis_signature.parameter_names + ] + ) + # TODO: Move ibis logic to compiler step func.__name__ = routine_ref.routine_id @@ -186,5 +195,6 @@ def func(*ignored_args, **ignored_kwargs): func.output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( # type: ignore ibis_signature.output_type ) + func.is_row_processor = is_row_processor # type: ignore func.ibis_node = node # type: ignore return func diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 08d808572d..9f33a8a1ea 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -692,10 +692,11 @@ def remote_function( remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function) -def read_gbq_function(function_name: str): +def read_gbq_function(function_name: str, is_row_processor: bool = False): return global_session.with_default_session( bigframes.session.Session.read_gbq_function, function_name=function_name, + is_row_processor=is_row_processor, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c91266b875..a780c1aa92 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1223,6 +1223,7 @@ def remote_function( def read_gbq_function( self, function_name: str, + is_row_processor: bool = False, ): """Loads a BigQuery function from BigQuery. @@ -1264,6 +1265,35 @@ def read_gbq_function( [3 rows x 3 columns] + You can even use a function with multiple inputs. For example, let's use + ``cw_regexp_replace_5`` from Community UDFs + (https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_regexp_replace_5haystack-string-regexp-string-replacement-string-offset-int64-occurrence-int64). + + >>> func = bpd.read_gbq_function("bqutil.fn.cw_regexp_replace_5") + >>> func('TestStr123456', 'Str', 'Cad$', 1, 1) + 'TestCad$123456' + + >>> df = bpd.DataFrame({ + ... "haystack" : ["TestStr123456", "TestStr123456Str", "TestStr123456Str"], + ... "regexp" : ["Str", "Str", "Str"], + ... "replacement" : ["Cad$", "Cad$", "Cad$"], + ... "offset" : [1, 1, 1], + ... "occurrence" : [1, 2, 1] + ... }) + >>> df + haystack regexp replacement offset occurrence + 0 TestStr123456 Str Cad$ 1 1 + 1 TestStr123456Str Str Cad$ 1 2 + 2 TestStr123456Str Str Cad$ 1 1 + + [3 rows x 5 columns] + >>> df.apply(func, axis=1) + 0 TestCad$123456 + 1 TestStr123456Cad$ + 2 TestCad$123456Str + dtype: string + + Args: function_name (str): the function's name in BigQuery in the format @@ -1271,6 +1301,10 @@ def read_gbq_function( `dataset_id.function_name` to load from the default project, or `function_name` to load from the default project and the dataset associated with the current session. + is_row_processor (bool, default False): + Whether the function is a row processor. This is set to True + for a function which receives an entire row of a DataFrame as + a pandas Series. Returns: callable: A function object pointing to the BigQuery function read @@ -1284,6 +1318,7 @@ def read_gbq_function( return bigframes_rf.read_gbq_function( function_name=function_name, session=self, + is_row_processor=is_row_processor, ) def _prepare_copy_job_config(self) -> bigquery.CopyJobConfig: diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index d6eefc1e31..77ea4627ec 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1603,6 +1603,13 @@ def serialize_row(row): # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' # , ignore this mismatch by using check_dtype=False. pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + # Let's make sure the read_gbq_function path works for this function + serialize_row_reuse = session.read_gbq_function( + serialize_row_remote.bigframes_remote_function, is_row_processor=True + ) + bf_result = scalars_df[columns].apply(serialize_row_reuse, axis=1).to_pandas() + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -2085,6 +2092,13 @@ def foo(x, y, z): pandas.testing.assert_series_equal( expected_result, bf_result, check_dtype=False, check_index_type=False ) + + # Let's make sure the read_gbq_function path works for this function + foo_reuse = session.read_gbq_function(foo.bigframes_remote_function) + bf_result = bf_df.apply(foo_reuse, axis=1).to_pandas() + pandas.testing.assert_series_equal( + expected_result, bf_result, check_dtype=False, check_index_type=False + ) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index db573efa40..b000354ed4 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -671,12 +671,19 @@ def square1(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_runs_existing_udf(session, bigquery_client, dataset_id): +def test_read_gbq_function_runs_existing_udf(session): func = session.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") got = func("AURÉLIE") assert got == "aurÉlie" +@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_runs_existing_udf_4_params(session): + func = session.read_gbq_function("bqutil.fn.cw_instr4") + got = func("TestStr123456Str", "Str", 1, 2) + assert got == 14 + + @pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) From 9099e8d23fcee3f0d46fa58cd6b5acd68f5ecac3 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 4 Sep 2024 18:24:52 +0000 Subject: [PATCH 2/5] remove stray newline --- bigframes/session/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 45168abba1..aa94766224 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1293,7 +1293,6 @@ def read_gbq_function( 2 TestCad$123456Str dtype: string - Args: function_name (str): the function's name in BigQuery in the format From 34855b32d9c0d6bbebe1a5e35edec76a8541f2a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 5 Sep 2024 16:35:45 -0500 Subject: [PATCH 3/5] Update bigframes/session/__init__.py --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index aa94766224..2182359eda 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1265,7 +1265,7 @@ def read_gbq_function( [3 rows x 3 columns] - You can even use a function with multiple inputs. For example, let's use + You can even use a function with multiple inputs. For example, use ``cw_regexp_replace_5`` from Community UDFs (https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_regexp_replace_5haystack-string-regexp-string-replacement-string-offset-int64-occurrence-int64). From c2154b72cc5411ea40171b606747e8cac7d84388 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 6 Sep 2024 07:30:00 +0000 Subject: [PATCH 4/5] remove first person reference in the doc --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b92101d39a..a2f6478c80 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1276,7 +1276,7 @@ def read_gbq_function( [3 rows x 3 columns] - You can even use a function with multiple inputs. For example, let's use + You can even use a function with multiple inputs. For example, [cw_regexp_replace_5](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_regexp_replace_5haystack-string-regexp-string-replacement-string-offset-int64-occurrence-int64) from Community UDFs. From 36a4cfd58dc127dbce2ca5e855f68dd37d4418f7 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 6 Sep 2024 07:36:18 +0000 Subject: [PATCH 5/5] use correct product name --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index a2f6478c80..045483bd53 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1256,7 +1256,7 @@ def read_gbq_function( >>> func('AURÉLIE') 'aurÉlie' - You can apply it to a BigQuery DataFrame Series. + You can apply it to a BigQuery DataFrames Series. >>> df = bpd.DataFrame({'id': [1, 2, 3], 'name': ['AURÉLIE', 'CÉLESTINE', 'DAPHNÉ']}) >>> df