From e92a19613b24d3f6ff33efada27325d654689664 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 5 Mar 2025 14:02:33 -0800 Subject: [PATCH 01/19] support cryptograph>=3.1 (#1454) --- bigframes/blob/_functions.py | 4 ++-- bigframes/operations/blob.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 480e04f02c..830bc8de06 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -393,7 +393,7 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str: return all_text -pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests"]) +pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests", "pypdf[crypto]"]) # Extracts text from a PDF url and chunks it simultaneously @@ -438,4 +438,4 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s return all_text_json_string -pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests"]) +pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests", "pypdf[crypto]"]) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 88b34bf758..6541a14655 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -553,8 +553,7 @@ def pdf_extract( container_cpu: Union[float, int] = 0.33, container_memory: str = "512Mi", ) -> bigframes.series.Series: - """Extracts and chunks text from PDF URLs and saves the text as - arrays of string. + """Extracts text from PDF URLs and saves the text as string. .. note:: BigFrames Blob is still under experiments. It may not work and From 67162834b5c7e39e430c797c055aee53e6aa6d74 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 5 Mar 2025 15:52:25 -0800 Subject: [PATCH 02/19] chore: add experimental blob properties tests (#1449) * chore: add experimental blob properties tests * include files * fix * fix mypy * debug * fix --- tests/system/small/blob/conftest.py | 42 +++++++ tests/system/small/blob/test_io.py | 35 ++---- tests/system/small/blob/test_properties.py | 135 +++++++++++++++++++++ 3 files changed, 190 insertions(+), 22 deletions(-) create mode 100644 tests/system/small/blob/conftest.py create mode 100644 tests/system/small/blob/test_properties.py diff --git a/tests/system/small/blob/conftest.py b/tests/system/small/blob/conftest.py new file mode 100644 index 0000000000..5305acc193 --- /dev/null +++ b/tests/system/small/blob/conftest.py @@ -0,0 +1,42 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes +import bigframes.pandas as bpd + + +@pytest.fixture(scope="session") +def images_gcs_path() -> str: + return "gs://bigframes_blob_test/images/*" + + +@pytest.fixture(scope="session") +def images_uris() -> list[str]: + return [ + "gs://bigframes_blob_test/images/img0.jpg", + "gs://bigframes_blob_test/images/img1.jpg", + ] + + +@pytest.fixture(scope="session") +def images_mm_df( + images_gcs_path, session: bigframes.Session, bq_connection: str +) -> bpd.DataFrame: + bigframes.options.experiments.blob = True + + return session.from_glob_path( + images_gcs_path, name="blob_col", connection=bq_connection + ) diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index ca068afe46..c30f7674af 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -18,21 +18,18 @@ import bigframes.pandas as bpd -def test_blob_create_from_uri_str(bq_connection: str, session: bigframes.Session): +def test_blob_create_from_uri_str( + bq_connection: str, session: bigframes.Session, images_uris +): bigframes.options.experiments.blob = True - uris = [ - "gs://bigframes_blob_test/images/img0.jpg", - "gs://bigframes_blob_test/images/img1.jpg", - ] - - uri_series = bpd.Series(uris, session=session) + uri_series = bpd.Series(images_uris, session=session) blob_series = uri_series.str.to_blob(connection=bq_connection) pd_blob_df = blob_series.struct.explode().to_pandas() expected_pd_df = pd.DataFrame( { - "uri": uris, + "uri": images_uris, "version": [None, None], "authorizer": [bq_connection.casefold(), bq_connection.casefold()], "details": [None, None], @@ -44,19 +41,18 @@ def test_blob_create_from_uri_str(bq_connection: str, session: bigframes.Session ) -def test_blob_create_from_glob_path(bq_connection: str, session: bigframes.Session): +def test_blob_create_from_glob_path( + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris +): bigframes.options.experiments.blob = True blob_df = session.from_glob_path( - "gs://bigframes_blob_test/images/*", connection=bq_connection, name="blob_col" + images_gcs_path, connection=bq_connection, name="blob_col" ) pd_blob_df = blob_df["blob_col"].struct.explode().to_pandas() expected_df = pd.DataFrame( { - "uri": [ - "gs://bigframes_blob_test/images/img0.jpg", - "gs://bigframes_blob_test/images/img1.jpg", - ], + "uri": images_uris, "version": [None, None], "authorizer": [bq_connection.casefold(), bq_connection.casefold()], "details": [None, None], @@ -69,22 +65,17 @@ def test_blob_create_from_glob_path(bq_connection: str, session: bigframes.Sessi def test_blob_create_read_gbq_object_table( - bq_connection: str, session: bigframes.Session + bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris ): bigframes.options.experiments.blob = True - obj_table = session._create_object_table( - "gs://bigframes_blob_test/images/*", bq_connection - ) + obj_table = session._create_object_table(images_gcs_path, bq_connection) blob_df = session.read_gbq_object_table(obj_table, name="blob_col") pd_blob_df = blob_df["blob_col"].struct.explode().to_pandas() expected_df = pd.DataFrame( { - "uri": [ - "gs://bigframes_blob_test/images/img0.jpg", - "gs://bigframes_blob_test/images/img1.jpg", - ], + "uri": images_uris, "version": [None, None], "authorizer": [bq_connection.casefold(), bq_connection.casefold()], "details": [None, None], diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py new file mode 100644 index 
0000000000..dedd1f916a --- /dev/null +++ b/tests/system/small/blob/test_properties.py @@ -0,0 +1,135 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import db_dtypes # type: ignore +import pandas as pd + +import bigframes +import bigframes.pandas as bpd + + +def test_blob_uri(images_uris: list[str], images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.uri().to_pandas() + expected = pd.Series(images_uris, name="uri") + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) + + +def test_blob_authorizer(images_mm_df: bpd.DataFrame, bq_connection: str): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.authorizer().to_pandas() + expected = pd.Series( + [bq_connection.casefold(), bq_connection.casefold()], name="authorizer" + ) + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) + + +def test_blob_version(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.version().to_pandas() + expected = pd.Series(["1739574332294150", "1739574332271343"], name="version") + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) + + +def test_blob_metadata(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.metadata().to_pandas() + expected = pd.Series( + [ + { + "content_type": "image/jpeg", + "md5_hash": "e130ad042261a1883cd2cc06831cf748", + "size": 338390, + "updated": 1739574332000000, + }, + { + "content_type": "image/jpeg", + "md5_hash": "e2ae3191ff2b809fd0935f01a537c650", + "size": 43333, + "updated": 1739574332000000, + }, + ], + name="metadata", + dtype=db_dtypes.JSONDtype(), + ) + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) + + +def test_blob_content_type(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.content_type().to_pandas() + expected = pd.Series(["image/jpeg", "image/jpeg"], name="content_type") + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) + + +def test_blob_md5_hash(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.md5_hash().to_pandas() + expected = pd.Series( + ["e130ad042261a1883cd2cc06831cf748", "e2ae3191ff2b809fd0935f01a537c650"], + name="md5_hash", + ) + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) + + +def test_blob_size(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.size().to_pandas() + expected = pd.Series([338390, 43333], name="size") + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, 
check_index_type=False + ) + + +def test_blob_updated(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.updated().to_pandas() + expected = pd.Series( + [ + pd.Timestamp("2025-02-14 23:05:32", tz="UTC"), + pd.Timestamp("2025-02-14 23:05:32", tz="UTC"), + ], + name="updated", + ) + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) From 7b0cab5c504ec2b24ea35b29ee32901da65681b6 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 5 Mar 2025 19:42:20 -0800 Subject: [PATCH 03/19] chore: format warning message with newlines and ansi color (#1447) * use ibis fill_null instead of fillna * minor typo in JSON warning messages * chore: format warning message with newlines and ansi color --- bigframes/_config/bigquery_options.py | 8 +-- bigframes/_config/experiment_options.py | 6 +-- bigframes/core/array_value.py | 14 +++-- bigframes/core/blocks.py | 9 ++-- bigframes/core/compile/aggregate_compiler.py | 16 ++---- bigframes/core/global_session.py | 2 +- bigframes/core/indexers.py | 2 +- bigframes/core/utils.py | 2 +- bigframes/dataframe.py | 7 ++- bigframes/exceptions.py | 26 +++++++++ bigframes/functions/_function_session.py | 6 +-- bigframes/functions/function.py | 2 +- bigframes/ml/base.py | 5 +- bigframes/ml/llm.py | 54 +++++++++++-------- bigframes/ml/remote.py | 3 +- bigframes/operations/_matplotlib/core.py | 9 ++-- bigframes/operations/semantics.py | 20 +++---- bigframes/session/__init__.py | 16 +++--- .../session/_io/bigquery/read_gbq_table.py | 8 +-- bigframes/session/clients.py | 6 ++- bigframes/session/executor.py | 9 ++-- bigframes/streaming/dataframe.py | 8 ++- tests/system/large/test_dataframe_io.py | 4 +- tests/system/large/test_location.py | 6 +-- .../small/functions/test_remote_function.py | 2 +- tests/unit/_config/test_bigquery_options.py | 17 +++--- 26 files changed, 160 insertions(+), 107 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 3968e98a69..84bc4f6d01 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -59,7 +59,9 @@ def _get_validated_location(value: Optional[str]) -> Optional[str]: # -> bpd.options.bigquery.location = "us-central-1" # -> location.setter # -> _get_validated_location - msg = UNKNOWN_LOCATION_MESSAGE.format(location=location, possibility=possibility) + msg = bfe.format_message( + UNKNOWN_LOCATION_MESSAGE.format(location=location, possibility=possibility) + ) warnings.warn(msg, stacklevel=3, category=bfe.UnknownLocationWarning) return value @@ -294,7 +296,7 @@ def use_regional_endpoints(self, value: bool): ) if value: - msg = ( + msg = bfe.format_message( "Use of regional endpoints is a feature in preview and " "available only in selected regions and projects. " ) @@ -354,7 +356,7 @@ def client_endpoints_override(self) -> dict: @client_endpoints_override.setter def client_endpoints_override(self, value: dict): - msg = ( + msg = bfe.format_message( "This is an advanced configuration option for directly setting endpoints. " "Incorrect use may lead to unexpected behavior or system instability. " "Proceed only if you fully understand its implications." 
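Every warning message touched in this commit now goes through a new bigframes.exceptions.format_message helper (its definition appears further down in this patch, in the bigframes/exceptions.py hunk). The helper re-wraps the text with textwrap.fill so long single-line messages break cleanly, and surrounds the result with ANSI escape codes so warnings stand out in a terminal; callers pass fill=False when the message already carries its own newlines, as the clients.py hunk does for the LEP deprecation message. A minimal, self-contained sketch of the pattern:

    import textwrap
    import warnings

    class ColorFormatter:
        WARNING = "\033[93m"  # bright yellow
        ENDC = "\033[0m"      # reset to the default terminal color

    def format_message(message: str, fill: bool = True) -> str:
        # Re-wrap long single-line messages, then tint the whole message yellow.
        if fill:
            message = textwrap.fill(message)
        return ColorFormatter.WARNING + message + ColorFormatter.ENDC

    warnings.warn(
        format_message(
            "Use of regional endpoints is a feature in preview and "
            "available only in selected regions and projects."
        ),
        category=UserWarning,
    )
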
diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index b958667628..3d52976004 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -34,7 +34,7 @@ def semantic_operators(self) -> bool: @semantic_operators.setter def semantic_operators(self, value: bool): if value is True: - msg = ( + msg = bfe.format_message( "Semantic operators are still under experiments, and are subject " "to change in the future." ) @@ -48,7 +48,7 @@ def blob(self) -> bool: @blob.setter def blob(self, value: bool): if value is True: - msg = ( + msg = bfe.format_message( "BigFrames Blob is still under experiments. It may not work and " "subject to change in the future." ) @@ -62,7 +62,7 @@ def udf(self) -> bool: @udf.setter def udf(self, value: bool): if value is True: - msg = ( + msg = bfe.format_message( "BigFrames managed function (udf) is still under experiments. " "It may not work and subject to change in the future." ) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 9325e3e5a8..9c44255941 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -107,8 +107,8 @@ def from_table( if offsets_col and primary_key: raise ValueError("must set at most one of 'offests', 'primary_key'") if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): - msg = ( - "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is" + msg = bfe.format_message( + "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is " "in preview; this behavior may change in future versions." ) warnings.warn(msg, bfe.PreviewWarning) @@ -232,7 +232,9 @@ def slice( self, start: Optional[int], stop: Optional[int], step: Optional[int] ) -> ArrayValue: if self.node.order_ambiguous and not (self.session._strictly_ordered): - msg = "Window ordering may be ambiguous, this can cause unstable results." + msg = bfe.format_message( + "Window ordering may be ambiguous, this can cause unstable results." + ) warnings.warn(msg, bfe.AmbiguousWindowWarning) return ArrayValue( nodes.SliceNode( @@ -254,7 +256,7 @@ def promote_offsets(self) -> Tuple[ArrayValue, str]: "Generating offsets not supported in partial ordering mode" ) else: - msg = ( + msg = bfe.format_message( "Window ordering may be ambiguous, this can cause unstable results." ) warnings.warn(msg, category=bfe.AmbiguousWindowWarning) @@ -417,7 +419,9 @@ def project_window_op( "Generating offsets not supported in partial ordering mode" ) else: - msg = "Window ordering may be ambiguous, this can cause unstable results." + msg = bfe.format_message( + "Window ordering may be ambiguous, this can cause unstable results." + ) warnings.warn(msg, category=bfe.AmbiguousWindowWarning) output_name = self._gen_namespaced_uid() diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 7ac2b03f28..b4e3ea0f86 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -64,6 +64,7 @@ import bigframes.core.utils as utils import bigframes.core.window_spec as windows import bigframes.dtypes +import bigframes.exceptions as bfe import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -630,12 +631,12 @@ def _materialize_local( # Since we cannot acquire the table size without a query_job, # we skip the sampling. 
if sample_config.enable_downsampling: - warnings.warn( + msg = bfe.format_message( "Sampling is disabled and there is no download size limit when 'allow_large_results' is set to " "False. To prevent downloading excessive data, it is recommended to use the peek() method, or " - "limit the data with methods like .head() or .sample() before proceeding with downloads.", - UserWarning, + "limit the data with methods like .head() or .sample() before proceeding with downloads." ) + warnings.warn(msg, category=UserWarning) fraction = 2 # TODO: Maybe materialize before downsampling @@ -652,7 +653,7 @@ def _materialize_local( " # Setting it to None will download all the data\n" f"{constants.FEEDBACK_LINK}" ) - msg = ( + msg = bfe.format_message( f"The data size ({table_mb:.2f} MB) exceeds the maximum download limit of" f"({max_download_size} MB). It will be downsampled to {max_download_size} " "MB for download.\nPlease refer to the documentation for configuring " diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index edf1e14b3a..93fddf196e 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -165,7 +165,7 @@ def _( ) -> ibis_types.NumericValue: # Will be null if all inputs are null. Pandas defaults to zero sum though. bq_sum = _apply_window_if_present(column.sum(), window) - return bq_sum.fillna(ibis_types.literal(0)) + return bq_sum.fill_null(ibis_types.literal(0)) @compile_unary_agg.register @@ -610,12 +610,7 @@ def _( result = _apply_window_if_present(_is_true(column).all(), window) literal = ibis_types.literal(True) - return cast( - ibis_types.BooleanScalar, - result.fill_null(literal) - if hasattr(result, "fill_null") - else result.fillna(literal), - ) + return cast(ibis_types.BooleanScalar, result.fill_null(literal)) @compile_unary_agg.register @@ -628,12 +623,7 @@ def _( result = _apply_window_if_present(_is_true(column).any(), window) literal = ibis_types.literal(False) - return cast( - ibis_types.BooleanScalar, - result.fill_null(literal) - if hasattr(result, "fill_null") - else result.fillna(literal), - ) + return cast(ibis_types.BooleanScalar, result.fill_null(literal)) @compile_ordered_unary_agg.register diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 8b32fee5b4..d4d70f5a06 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -39,7 +39,7 @@ def _try_close_session(session: bigframes.session.Session): session_id = session.session_id location = session._location project_id = session._project - msg = ( + msg = bfe.format_message( f"Session cleanup failed for session with id: {session_id}, " f"location: {location}, project: {project_id}" ) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 97115a3ed0..c0c4d9ec11 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -407,7 +407,7 @@ def _struct_accessor_check_and_warn( return if not bigframes.dtypes.is_string_like(series.index.dtype): - msg = ( + msg = bfe.format_message( "Are you trying to access struct fields? If so, please use Series.struct.field(...) " "method instead." 
) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 18061dca18..e38c43e73e 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -196,7 +196,7 @@ def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): - warnings.warn(msg, category=bfe.PreviewWarning) + warnings.warn(bfe.format_message(msg), category=bfe.PreviewWarning) return func(*args, **kwargs) return wrapper diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b5174dbd3e..a48e06d86c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1581,7 +1581,10 @@ def to_arrow( Returns: pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame. """ - msg = "to_arrow is in preview. Types and unnamed / duplicate name columns may change in future." + msg = bfe.format_message( + "to_arrow is in preview. Types and unnamed or duplicate name columns may " + "change in future." + ) warnings.warn(msg, category=bfe.PreviewWarning) pa_table, query_job = self._block.to_arrow( @@ -4104,7 +4107,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # to the applied function should be a Series, not a scalar. if utils.get_axis_number(axis) == 1: - msg = "axis=1 scenario is in preview." + msg = bfe.format_message("axis=1 scenario is in preview.") warnings.warn(msg, category=bfe.PreviewWarning) # TODO(jialuo): Deprecate the "bigframes_remote_function" attribute. diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 97e2da40a1..8b35d9122b 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -14,6 +14,8 @@ """Public exceptions and warnings used across BigQuery DataFrames.""" +import textwrap + # NOTE: This module should not depend on any others in the package. @@ -87,3 +89,27 @@ class ApiDeprecationWarning(FutureWarning): class BadIndexerKeyWarning(Warning): """The indexer key is not used correctly.""" + + +class ColorFormatter: + WARNING = "\033[93m" + ENDC = "\033[0m" + + +def format_message(message: str, fill: bool = True): + """Formats a warning message with ANSI color codes for the warning color. + + Args: + message: The warning message string. + fill: Whether to wrap the message text using `textwrap.fill`. + Defaults to True. Set to False to prevent wrapping, + especially if the message already contains newlines. + + Returns: + The formatted message string, with ANSI color codes for warning color + if color is supported, otherwise the original message. If `fill` is + True, the message will be wrapped to fit the terminal width. + """ + if fill: + message = textwrap.fill(message) + return ColorFormatter.WARNING + message + ColorFormatter.ENDC diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 20dcf45103..ce0ade26ff 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -489,7 +489,7 @@ def remote_function( if cloud_function_ingress_settings is None: cloud_function_ingress_settings = "all" - msg = ( + msg = bfe.format_message( "The `cloud_function_ingress_settings` are set to 'all' by default, " "which will change to 'internal-only' for enhanced security in future version 2.0 onwards. " "However, you will be able to explicitly pass cloud_function_ingress_settings='all' if you need. " @@ -549,7 +549,7 @@ def wrapper(func): (input_type := input_types[0]) == bf_series.Series or input_type == pandas.Series ): - msg = "input_types=Series is in preview." 
+ msg = bfe.format_message("input_types=Series is in preview.") warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) # we will model the row as a json serialized string containing the data @@ -836,7 +836,7 @@ def wrapper(func): (input_type := input_types[0]) == bf_series.Series or input_type == pandas.Series ): - msg = "input_types=Series is in preview." + msg = bfe.format_message("input_types=Series is in preview.") warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) # we will model the row as a json serialized string containing diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 392a209714..16416eb864 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -231,7 +231,7 @@ def func(*bigframes_args, **bigframes_kwargs): ) function_input_dtypes.append(input_dtype) if has_unknown_dtypes: - msg = ( + msg = bfe.format_message( "The function has one or more missing input data types. BigQuery DataFrames " f"will assume default data type {bigframes.dtypes.DEFAULT_DTYPE} for them." ) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index c353e47f3a..a0800c19e6 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -27,6 +27,7 @@ import bigframes_vendored.sklearn.base +import bigframes.exceptions as bfe from bigframes.ml import core import bigframes.ml.utils as utils import bigframes.pandas as bpd @@ -269,7 +270,7 @@ def _predict_and_retry( if df_succ.empty: if max_retries > 0: - msg = "Can't make any progress, stop retrying." + msg = bfe.format_message("Can't make any progress, stop retrying.") warnings.warn(msg, category=RuntimeWarning) break @@ -281,7 +282,7 @@ def _predict_and_retry( break if not df_fail.empty: - msg = ( + msg = bfe.format_message( f"Some predictions failed. Check column {self._status_col} for detailed " "status. You may want to filter the failed rows and retry." ) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 72c49e124b..0117444f16 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -189,9 +189,11 @@ def _create_bqml_model(self): ) if self.model_name not in _TEXT_GENERATOR_ENDPOINTS: - msg = _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_TEXT_GENERATOR_ENDPOINTS), + msg = exceptions.format_message( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_TEXT_GENERATOR_ENDPOINTS), + ) ) warnings.warn(msg) @@ -368,7 +370,7 @@ def predict( df = self._bqml_model.generate_text(X, options) if (df[_ML_GENERATE_TEXT_STATUS] != "").any(): - msg = ( + msg = exceptions.format_message( f"Some predictions failed. Check column {_ML_GENERATE_TEXT_STATUS} for " "detailed status. You may want to filter the failed rows and retry." ) @@ -522,9 +524,11 @@ def _create_bqml_model(self): ) if self.model_name not in _PALM2_EMBEDDING_GENERATOR_ENDPOINTS: - msg = _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_PALM2_EMBEDDING_GENERATOR_ENDPOINTS), + msg = exceptions.format_message( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_PALM2_EMBEDDING_GENERATOR_ENDPOINTS), + ) ) warnings.warn(msg) @@ -598,7 +602,7 @@ def predict(self, X: utils.ArrayType) -> bigframes.dataframe.DataFrame: ) if (df[_ML_EMBED_TEXT_STATUS] != "").any(): - msg = ( + msg = exceptions.format_message( f"Some predictions failed. Check column {_ML_EMBED_TEXT_STATUS} for " "detailed status. You may want to filter the failed rows and retry." 
) @@ -666,9 +670,11 @@ def _create_bqml_model(self): ) if self.model_name not in _TEXT_EMBEDDING_ENDPOINTS: - msg = _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_TEXT_EMBEDDING_ENDPOINTS), + msg = exceptions.format_message( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_TEXT_EMBEDDING_ENDPOINTS), + ) ) warnings.warn(msg) @@ -805,9 +811,11 @@ def _create_bqml_model(self): ) if self.model_name != _MULTIMODAL_EMBEDDING_001_ENDPOINT: - msg = _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=_MULTIMODAL_EMBEDDING_001_ENDPOINT, + msg = exceptions.format_message( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=_MULTIMODAL_EMBEDDING_001_ENDPOINT, + ) ) warnings.warn(msg) @@ -952,7 +960,7 @@ def __init__( max_iterations: int = 300, ): if model_name in _GEMINI_PREVIEW_ENDPOINTS: - msg = ( + msg = exceptions.format_message( f'Model {model_name} is subject to the "Pre-GA Offerings Terms" in ' "the General Service Terms section of the Service Specific Terms" "(https://cloud.google.com/terms/service-terms#1). Pre-GA products and " @@ -976,9 +984,11 @@ def _create_bqml_model(self): ) if self.model_name not in _GEMINI_ENDPOINTS: - msg = _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_GEMINI_ENDPOINTS), + msg = exceptions.format_message( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_GEMINI_ENDPOINTS), + ) ) warnings.warn(msg) @@ -1343,9 +1353,11 @@ def _create_bqml_model(self): ) if self.model_name not in _CLAUDE_3_ENDPOINTS: - msg = _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_CLAUDE_3_ENDPOINTS), + msg = exceptions.format_message( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_CLAUDE_3_ENDPOINTS), + ) ) warnings.warn(msg) options = { diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index 6ee6840656..cc711cbe3b 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -21,6 +21,7 @@ from bigframes.core import global_session, log_adapter import bigframes.dataframe +import bigframes.exceptions as bfe from bigframes.ml import base, core, globals, utils import bigframes.session @@ -119,7 +120,7 @@ def predict( # unlike LLM models, the general remote model status is null for successful runs. if (df[_REMOTE_MODEL_STATUS].notna()).any(): - msg = ( + msg = bfe.format_message( f"Some predictions failed. Check column {_REMOTE_MODEL_STATUS} for " "detailed status. You may want to filter the failed rows and retry." 
) diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 9c68a2c5ca..a5f53b9f64 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -20,6 +20,7 @@ import pandas as pd import bigframes.dtypes as dtypes +import bigframes.exceptions as bfe DEFAULT_SAMPLING_N = 1000 DEFAULT_SAMPLING_STATE = 0 @@ -70,10 +71,12 @@ def _compute_sample_data(self, data): if self._sampling_warning_msg is not None: total_n = data.shape[0] if sampling_n < total_n: - msg = self._sampling_warning_msg.format( - sampling_n=sampling_n, total_n=total_n + msg = bfe.format_message( + self._sampling_warning_msg.format( + sampling_n=sampling_n, total_n=total_n + ) ) - warnings.warn(msg) + warnings.warn(msg, category=UserWarning) sampling_random_state = self.kwargs.pop( "sampling_random_state", DEFAULT_SAMPLING_STATE diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 3b7a77e5b7..686db50a43 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -141,11 +141,11 @@ def agg( column = columns[0] if ground_with_google_search: - msg = ( + msg = exceptions.format_message( "Enables Grounding with Google Search may impact billing cost. See pricing " "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - warnings.warn(msg) + warnings.warn(msg, category=UserWarning) user_instruction = self._format_instruction(instruction, columns) @@ -372,11 +372,11 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals raise ValueError(f"Column {column} not found.") if ground_with_google_search: - msg = ( + msg = exceptions.format_message( "Enables Grounding with Google Search may impact billing cost. See pricing " "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - warnings.warn(msg) + warnings.warn(msg, category=UserWarning) self._confirm_operation(len(self._df)) @@ -471,11 +471,11 @@ def map( raise ValueError(f"Column {column} not found.") if ground_with_google_search: - msg = ( + msg = exceptions.format_message( "Enables Grounding with Google Search may impact billing cost. See pricing " "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - warnings.warn(msg) + warnings.warn(msg, category=UserWarning) self._confirm_operation(len(self._df)) @@ -573,11 +573,11 @@ def join( columns = self._parse_columns(instruction) if ground_with_google_search: - msg = ( + msg = exceptions.format_message( "Enables Grounding with Google Search may impact billing cost. See pricing " "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - warnings.warn(msg) + warnings.warn(msg, category=UserWarning) work_estimate = len(self._df) * len(other) self._confirm_operation(work_estimate) @@ -816,11 +816,11 @@ def top_k( ) if ground_with_google_search: - msg = ( + msg = exceptions.format_message( "Enables Grounding with Google Search may impact billing cost. 
See pricing " "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - warnings.warn(msg) + warnings.warn(msg, category=UserWarning) work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) self._confirm_operation(work_estimate) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 13e49fca42..3f081e2177 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -152,7 +152,9 @@ def __init__( if context.location is None: self._location = "US" - msg = f"No explicit location is set, so using location {self._location} for the session." + msg = bfe.format_message( + f"No explicit location is set, so using location {self._location} for the session." + ) # User's code # -> get_global_session() # -> connect() @@ -344,25 +346,25 @@ def _project(self): @property def bytes_processed_sum(self): """The sum of all bytes processed by bigquery jobs using this session.""" - warnings.warn( + msg = bfe.format_message( "Queries executed with `allow_large_results=False` within the session will not " "have their bytes processed counted in this sum. If you need precise " "bytes processed information, query the `INFORMATION_SCHEMA` tables " "to get relevant metrics.", - UserWarning, ) + warnings.warn(msg, UserWarning) return self._metrics.bytes_processed @property def slot_millis_sum(self): """The sum of all slot time used by bigquery jobs in this session.""" - warnings.warn( + msg = bfe.format_message( "Queries executed with `allow_large_results=False` within the session will not " "have their slot milliseconds counted in this sum. If you need precise slot " "milliseconds information, query the `INFORMATION_SCHEMA` tables " "to get relevant metrics.", - UserWarning, ) + warnings.warn(msg, UserWarning) return self._metrics.slot_millis @property @@ -612,7 +614,9 @@ def read_gbq_table_streaming( bigframes.streaming.dataframe.StreamingDataFrame: A StreamingDataFrame representing results of the table. """ - msg = "The bigframes.streaming module is a preview feature, and subject to change." + msg = bfe.format_message( + "The bigframes.streaming module is a preview feature, and subject to change." + ) warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) import bigframes.streaming.dataframe as streaming_dataframe diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index ed68762ee8..9fa97cb6e1 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -59,7 +59,7 @@ def get_table_metadata( # Cache hit could be unexpected. See internal issue 329545805. # Raise a warning with more information about how to avoid the # problems with the cache. - msg = ( + msg = bfe.format_message( f"Reading cached table from {snapshot_timestamp} to avoid " "incompatibilies with previous reads of this table. To read " "the latest version, set `use_cache=False` or close the " @@ -104,7 +104,7 @@ def validate_table( # Only true tables support time travel elif table.table_type != "TABLE": if table.table_type == "MATERIALIZED_VIEW": - msg = ( + msg = bfe.format_message( "Materialized views do not support FOR SYSTEM_TIME AS OF queries. " "Attempting query without time travel. 
Be aware that as materialized views " "are updated periodically, modifications to the underlying data in the view may " @@ -142,7 +142,7 @@ def validate_table( snapshot_sql, job_config=bigquery.QueryJobConfig(dry_run=True) ) if time_travel_not_found: - msg = ( + msg = bfe.format_message( "NotFound error when reading table with time travel." " Attempting query without time travel. Warning: Without" " time travel, modifications to the underlying table may" @@ -269,7 +269,7 @@ def get_index_cols( # resource utilization because of the default sequential index. See # internal issue 335727141. if _is_table_clustered_or_partitioned(table) and not primary_keys: - msg = ( + msg = bfe.format_message( f"Table '{str(table.reference)}' is clustered and/or " "partitioned, but BigQuery DataFrames was not able to find a " "suitable index. To avoid this warning, set at least one of: " diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index fd8f387c3d..5b707ad478 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -32,6 +32,7 @@ import pydata_google_auth import bigframes.constants +import bigframes.exceptions as bfe import bigframes.version _ENV_DEFAULT_PROJECT = "GOOGLE_CLOUD_PROJECT" @@ -102,12 +103,13 @@ def __init__( and location.lower() not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS ): - warnings.warn( + msg = bfe.format_message( bigframes.constants.LEP_DEPRECATION_WARNING_MESSAGE.format( location=location ), - category=FutureWarning, + fill=False, ) + warnings.warn(msg, category=FutureWarning) self._location = location self._use_regional_endpoints = use_regional_endpoints diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 22d1c1dcea..0644b0e6d9 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -48,6 +48,7 @@ import bigframes.core.schema import bigframes.core.tree_properties as tree_properties import bigframes.dtypes +import bigframes.exceptions as bfe import bigframes.features import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics @@ -271,13 +272,13 @@ def iterator_supplier(): size_bytes = None if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES: - warnings.warn( + msg = bfe.format_message( "The query result size has exceeded 10 GB. In BigFrames 2.0 and " "later, you might need to manually set `allow_large_results=True` in " "the IO method or adjust the BigFrames option: " - "`bigframes.options.bigquery.allow_large_results=True`.", - FutureWarning, + "`bigframes.options.bigquery.allow_large_results=True`." ) + warnings.warn(msg, FutureWarning) # Runs strict validations to ensure internal type predictions and ibis are completely in sync # Do not execute these validations outside of testing suite. if "PYTEST_CURRENT_TEST" in os.environ: @@ -383,7 +384,7 @@ def peek( """ plan = self.replace_cached_subtrees(array_value.node) if not tree_properties.can_fast_peek(plan): - msg = "Peeking this value cannot be done efficiently." + msg = bfe.format_message("Peeking this value cannot be done efficiently.") warnings.warn(msg) if use_explicit_destination is None: use_explicit_destination = bigframes.options.bigquery.allow_large_results diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py index 2180a66207..4acefd6283 100644 --- a/bigframes/streaming/dataframe.py +++ b/bigframes/streaming/dataframe.py @@ -372,7 +372,9 @@ def _to_bigtable( For example, the job can be cancelled or its error status can be examined. 
""" - msg = "The bigframes.streaming module is a preview feature, and subject to change." + msg = bfe.format_message( + "The bigframes.streaming module is a preview feature, and subject to change." + ) warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) # get default client if not passed @@ -484,7 +486,9 @@ def _to_pubsub( For example, the job can be cancelled or its error status can be examined. """ - msg = "The bigframes.streaming module is a preview feature, and subject to change." + msg = bfe.format_message( + "The bigframes.streaming module is a preview feature, and subject to change." + ) warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) # get default client if not passed diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py index c055babce6..76a7001fe3 100644 --- a/tests/system/large/test_dataframe_io.py +++ b/tests/system/large/test_dataframe_io.py @@ -46,9 +46,7 @@ def test_to_pandas_batches_override_global_option( ) assert len(w) == 2 assert issubclass(w[0].category, FutureWarning) - assert str(w[0].message).startswith( - "The query result size has exceeded 10 GB." - ) + assert "The query result size has exceeded 10 GB." in str(w[0].message) def test_to_pandas_raise_when_large_result_not_allowed(session): diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index 0b4a7afe2b..7801f5dada 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -163,11 +163,7 @@ def test_bq_lep_endpoints(bigquery_location): location=bigquery_location, use_regional_endpoints=True ) assert len(record) == 1 - assert typing.cast(Warning, record[0].message).args[ - 0 - ] == bigframes.constants.LEP_DEPRECATION_WARNING_MESSAGE.format( - location=bigquery_location - ) + assert bigquery_location in typing.cast(Warning, record[0].message).args[0] # Verify that location and endpoints are correctly set for the BigQuery API # client diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index c12d0e03f5..075a57f23d 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -929,7 +929,7 @@ def test_read_gbq_function_requires_explicit_types( ) with pytest.warns( bigframes.exceptions.UnknownDataTypeWarning, - match="missing input data types.*assume default data type", + match=r"missing input data types[\s\S]*assume default data type", ): bff.read_gbq_function( str(only_return_type_specified.reference), diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index 31f43ffee5..98a74d4e4c 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -164,14 +164,19 @@ def set_location_property(): options.location = invalid_location for op in [set_location_in_constructor, set_location_property]: - with pytest.warns( - bigframes.exceptions.UnknownLocationWarning, - match=re.escape( - f"The location '{invalid_location}' is set to an unknown value. Did you mean '{possibility}'?" - ), - ): + with warnings.catch_warnings(record=True) as w: op() + assert issubclass( + w[0].category, bigframes.exceptions.UnknownLocationWarning + ) + assert ( + f"The location '{invalid_location}' is set to an unknown value. " + in str(w[0].message) + ) + # The message might contain newlines added by textwrap.fill. 
+ assert possibility in str(w[0].message).replace("\n", "") + def test_client_endpoints_override_set_shows_warning(): options = bigquery_options.BigQueryOptions() From 024113942aed1e0dcfa3877378fe729b29044155 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 6 Mar 2025 10:10:31 -0800 Subject: [PATCH 04/19] chore: add experimental blob url tests (#1463) * chore: add experimental blob url tests * fix --- tests/system/small/blob/test_urls.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/system/small/blob/test_urls.py diff --git a/tests/system/small/blob/test_urls.py b/tests/system/small/blob/test_urls.py new file mode 100644 index 0000000000..da972348f2 --- /dev/null +++ b/tests/system/small/blob/test_urls.py @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes +import bigframes.pandas as bpd + + +def test_blob_read_url(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + urls = images_mm_df["blob_col"].blob.read_url() + + assert urls.str.startswith("https://storage.googleapis.com/").all() + + +def test_blob_write_url(images_mm_df: bpd.DataFrame): + bigframes.options.experiments.blob = True + + urls = images_mm_df["blob_col"].blob.write_url() + + assert urls.str.startswith("https://storage.googleapis.com/").all() From fe72ada9cebb32947560c97567d7937c8b618f0d Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 6 Mar 2025 10:46:00 -0800 Subject: [PATCH 05/19] fix: Fix list-like indexers in partial ordering mode (#1456) --- bigframes/core/blocks.py | 20 ++++++++++++++++---- bigframes/core/indexers.py | 12 ++++++++++-- bigframes/dataframe.py | 10 ++++++++-- tests/system/conftest.py | 10 ++++++++++ tests/system/small/test_dataframe.py | 14 ++++++++++++++ 5 files changed, 58 insertions(+), 8 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b4e3ea0f86..66d9d6772f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2325,6 +2325,7 @@ def _apply_binop( return self.project_exprs(exprs, labels=labels, drop=True) + # TODO: Re-implement join in terms of merge (requires also adding remaining merge args) def join( self, other: Block, @@ -2332,6 +2333,7 @@ def join( how="left", sort: bool = False, block_identity_join: bool = False, + always_order: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: """ Join two blocks objects together, and provide mappings between source columns and output columns. @@ -2345,6 +2347,8 @@ def join( if true will sort result by index block_identity_join (bool): If true, will not convert join to a projection (implicitly assuming unique indices) + always_order (bool): + If true, will always preserve input ordering, even if ordering mode is partial Returns: Block, (left_mapping, right_mapping): Result block and mappers from input column ids to result column ids. 
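The new always_order flag is what makes list-like indexers reliable under partial ordering: _perform_loc_list_join and the iloc path in indexers.py (changed later in this patch) request always_order=True so the right join preserves the caller's input order instead of depending on a total ordering that a partially ordered session cannot provide. A usage sketch of the behavior pinned down by the new test_iloc_list_partial_ordering system test; the table path below is a placeholder:

    import bigframes.pandas as bpd

    # Run the session in partial ordering mode.
    bpd.options.bigquery.ordering_mode = "partial"

    # Placeholder table; any table with a usable index column works.
    df = bpd.read_gbq(
        "my-project.my_dataset.scalars", index_col="rowindex"
    ).sort_index()

    # Duplicate and out-of-order positions come back in the requested order,
    # because the underlying right join now propagates input ordering.
    print(df.iloc[[0, 0, 0, 5, 4, 7]].to_pandas())
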
@@ -2390,10 +2394,14 @@ def join( self._throw_if_null_index("join") other._throw_if_null_index("join") if self.index.nlevels == other.index.nlevels == 1: - return join_mono_indexed(self, other, how=how, sort=sort) + return join_mono_indexed( + self, other, how=how, sort=sort, propogate_order=always_order + ) else: # Handles cases where one or both sides are multi-indexed # Always sort mult-index join - return join_multi_indexed(self, other, how=how, sort=sort) + return join_multi_indexed( + self, other, how=how, sort=sort, propogate_order=always_order + ) def is_monotonic_increasing( self, column_id: typing.Union[str, Sequence[str]] @@ -2850,7 +2858,8 @@ def join_mono_indexed( right: Block, *, how="left", - sort=False, + sort: bool = False, + propogate_order: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: left_expr = left.expr right_expr = right.expr @@ -2861,6 +2870,7 @@ def join_mono_indexed( conditions=( join_defs.JoinCondition(left.index_columns[0], right.index_columns[0]), ), + propogate_order=propogate_order, ) left_index = get_column_left[left.index_columns[0]] @@ -2895,7 +2905,8 @@ def join_multi_indexed( right: Block, *, how="left", - sort=False, + sort: bool = False, + propogate_order: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: if not (left.index.is_uniquely_named() and right.index.is_uniquely_named()): raise ValueError("Joins not supported on indices with non-unique level names") @@ -2924,6 +2935,7 @@ def join_multi_indexed( join_defs.JoinCondition(left, right) for left, right in zip(left_join_ids, right_join_ids) ), + propogate_order=propogate_order, ) left_ids_post_join = [get_column_left[id] for id in left_join_ids] diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index c0c4d9ec11..d1a0c42e97 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -379,12 +379,14 @@ def _perform_loc_list_join( result = typing.cast( bigframes.series.Series, series_or_dataframe.to_frame()._perform_join_by_index( - keys_index, how="right" + keys_index, how="right", always_order=True )[name], ) result = result.rename(original_name) else: - result = series_or_dataframe._perform_join_by_index(keys_index, how="right") + result = series_or_dataframe._perform_join_by_index( + keys_index, how="right", always_order=True + ) if drop_levels and series_or_dataframe.index.nlevels > keys_index.nlevels: # drop common levels @@ -492,6 +494,12 @@ def _iloc_getitem_series_or_dataframe( # set to offset index and use regular loc, then restore index df = df.reset_index(drop=False) + block = df._block + # explicitly set index to offsets, reset_index may not generate offsets in some modes + block, offsets_id = block.promote_offsets("temp_iloc_offsets_") + block = block.set_index([offsets_id]) + df = bigframes.dataframe.DataFrame(block) + result = df.loc[key] result = result.set_index(temporary_index_names) result = result.rename_axis(original_index_names) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a48e06d86c..151da51792 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3238,9 +3238,15 @@ def join( return left._perform_join_by_index(right, how=how) def _perform_join_by_index( - self, other: Union[DataFrame, indexes.Index], *, how: str = "left" + self, + other: Union[DataFrame, indexes.Index], + *, + how: str = "left", + always_order: bool = False, ): - block, _ = self._block.join(other._block, how=how, block_identity_join=True) + block, _ = self._block.join( + 
other._block, how=how, block_identity_join=True, always_order=always_order + ) return DataFrame(block) @validations.requires_ordering() diff --git a/tests/system/conftest.py b/tests/system/conftest.py index d40d0e0eef..5b3add053c 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -544,6 +544,16 @@ def scalars_df_index( return session.read_gbq(scalars_table_id, index_col="rowindex") +@pytest.fixture(scope="session") +def scalars_df_partial_ordering( + scalars_table_id: str, unordered_session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return unordered_session.read_gbq( + scalars_table_id, index_col="rowindex" + ).sort_index() + + @pytest.fixture(scope="session") def scalars_df_null_index( scalars_table_id: str, session: bigframes.Session diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index db777137b0..f80b811217 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4418,6 +4418,20 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index): ) +def test_iloc_list_partial_ordering( + scalars_df_partial_ordering, scalars_pandas_df_index +): + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df_partial_ordering.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + def test_iloc_list_multiindex(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() From 7b6e3615f8d4531beb4b59ca1223927112e713da Mon Sep 17 00:00:00 2001 From: jialuoo Date: Thu, 6 Mar 2025 13:01:37 -0800 Subject: [PATCH 06/19] fix: fix the merge issue between 1424 and 1373 (#1461) * fix: fix the merge issue between 1424 and 1373 * Update _function_session.py --- bigframes/functions/_function_session.py | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index ce0ade26ff..15c8cb979e 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -47,6 +47,7 @@ ) from bigframes import clients +from bigframes import version as bigframes_version import bigframes.core.compile.ibis_types import bigframes.exceptions as bfe import bigframes.series as bf_series @@ -265,6 +266,13 @@ def remote_function( .. deprecated:: 0.0.1 This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. + .. warning:: + To use remote functions with Bigframes 2.0 and onwards, please (preferred) + set an explicit user-managed ``cloud_function_service_account`` or (discouraged) + set ``cloud_function_service_account`` to use the Compute Engine service account + by setting it to `"default"`. + See, https://cloud.google.com/functions/docs/securing/function-identity. + .. note:: Please make sure following is setup before using this API: @@ -445,6 +453,26 @@ def remote_function( # Some defaults may be used from the session if not provided otherwise. session = self._resolve_session(session) + # raise a UserWarning if user does not explicitly set cloud_function_service_account to a + # user-managed cloud_function_service_account of to default + msg = bfe.format_message( + "You have not explicitly set a user-managed `cloud_function_service_account`. " + "Using the default Compute Engine service account. 
" + "To use Bigframes 2.0, please explicitly set `cloud_function_service_account` " + 'either to a user-managed service account (preferred) or to `"default"` ' + "to use the Compute Engine service account (discouraged). " + "See, https://cloud.google.com/functions/docs/securing/function-identity." + ) + + if ( + bigframes_version.__version__.startswith("1.") + and cloud_function_service_account is None + ): + warnings.warn(msg, stacklevel=2, category=FutureWarning) + + if cloud_function_service_account == "default": + cloud_function_service_account = None + # A BigQuery client is required to perform BQ operations. bigquery_client = self._resolve_bigquery_client(session, bigquery_client) From f3fadd780d7a786b6924e887bcb4b1e8f973c11b Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 6 Mar 2025 15:11:01 -0800 Subject: [PATCH 07/19] chore: add experimental blob functions retry and timeout (#1469) * chore: add experimental blob functions retry and timeout * fix --- bigframes/blob/_functions.py | 57 +++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 830bc8de06..a3e7ae153c 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -112,6 +112,10 @@ def image_blur_func( import cv2 as cv # type: ignore import numpy as np import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) ext = ext or ".jpeg" @@ -121,7 +125,7 @@ def image_blur_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] - response = requests.get(src_url) + response = session.get(src_url, timeout=30) bts = response.content nparr = np.frombuffer(bts, np.uint8) @@ -135,12 +139,13 @@ def image_blur_func( ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - requests.put( + session.put( url=dst_url, data=bts, headers={ "Content-Type": content_type, }, + timeout=30, ) return dst_obj_ref_rt @@ -157,13 +162,17 @@ def image_blur_to_bytes_func( import cv2 as cv # type: ignore import numpy as np import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) ext = ext or ".jpeg" src_obj_ref_rt_json = json.loads(src_obj_ref_rt) src_url = src_obj_ref_rt_json["access_urls"]["read_url"] - response = requests.get(src_url) + response = session.get(src_url, timeout=30) bts = response.content nparr = np.frombuffer(bts, np.uint8) @@ -193,6 +202,10 @@ def image_resize_func( import cv2 as cv # type: ignore import numpy as np import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) ext = ext or ".jpeg" @@ -202,7 +215,7 @@ def image_resize_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] - response = requests.get(src_url) + response = session.get(src_url, timeout=30) bts = response.content nparr = np.frombuffer(bts, np.uint8) @@ -216,12 +229,13 @@ def image_resize_func( ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - requests.put( + session.put( url=dst_url, data=bts, headers={ "Content-Type": content_type, }, + timeout=30, ) return dst_obj_ref_rt @@ -245,13 +259,17 @@ def image_resize_to_bytes_func( import cv2 as cv # type: ignore import numpy as np import 
requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) ext = ext or ".jpeg" src_obj_ref_rt_json = json.loads(src_obj_ref_rt) src_url = src_obj_ref_rt_json["access_urls"]["read_url"] - response = requests.get(src_url) + response = session.get(src_url, timeout=30) bts = response.content nparr = np.frombuffer(bts, np.uint8) @@ -280,6 +298,10 @@ def image_normalize_func( import cv2 as cv # type: ignore import numpy as np import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) ext = ext or ".jpeg" @@ -296,7 +318,7 @@ def image_normalize_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] - response = requests.get(src_url) + response = session.get(src_url, timeout=30) bts = response.content nparr = np.frombuffer(bts, np.uint8) @@ -312,12 +334,13 @@ def image_normalize_func( ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - requests.put( + session.put( url=dst_url, data=bts, headers={ "Content-Type": content_type, }, + timeout=30, ) return dst_obj_ref_rt @@ -336,6 +359,10 @@ def image_normalize_to_bytes_func( import cv2 as cv # type: ignore import numpy as np import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) ext = ext or ".jpeg" @@ -349,7 +376,7 @@ def image_normalize_to_bytes_func( src_obj_ref_rt_json = json.loads(src_obj_ref_rt) src_url = src_obj_ref_rt_json["access_urls"]["read_url"] - response = requests.get(src_url) + response = session.get(src_url, timeout=30) bts = response.content nparr = np.frombuffer(bts, np.uint8) @@ -374,11 +401,15 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str: from pypdf import PdfReader # type: ignore import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) src_obj_ref_rt_json = json.loads(src_obj_ref_rt) src_url = src_obj_ref_rt_json["access_urls"]["read_url"] - response = requests.get(src_url, stream=True) + response = session.get(src_url, timeout=30, stream=True) response.raise_for_status() pdf_bytes = response.content @@ -403,11 +434,15 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s from pypdf import PdfReader # type: ignore import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) src_obj_ref_rt_json = json.loads(src_obj_ref_rt) src_url = src_obj_ref_rt_json["access_urls"]["read_url"] - response = requests.get(src_url, stream=True) + response = session.get(src_url, timeout=30, stream=True) response.raise_for_status() pdf_bytes = response.content From be5098202ff773638c2bf0b2afb4d73f52dc7f31 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Thu, 6 Mar 2025 18:42:12 -0800 Subject: [PATCH 08/19] test: add unit tests for udf experiment options (#1468) --- tests/unit/_config/test_experiment_options.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/_config/test_experiment_options.py b/tests/unit/_config/test_experiment_options.py index 8e612be06c..9735e494be 100644 --- a/tests/unit/_config/test_experiment_options.py +++ b/tests/unit/_config/test_experiment_options.py @@ -46,3 +46,18 @@ def test_blob_set_true_shows_warning(): options.blob = True assert options.blob is True + + 
+def test_udf_default_false(): + options = experiment_options.ExperimentOptions() + + assert options.udf is False + + +def test_udf_set_true_shows_warning(): + options = experiment_options.ExperimentOptions() + + with pytest.warns(bfe.PreviewWarning): + options.udf = True + + assert options.udf is True From 9a65e836394a52632dedd9489310c678537d0e37 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 6 Mar 2025 19:39:31 -0800 Subject: [PATCH 09/19] chore: fix Multimodal Gemini modifies input DF (#1467) * chore: add experimental blob url tests * fix * chore: fix Multimodal Gemini modifies input DF * fix --- bigframes/core/convert.py | 4 ++-- tests/system/small/core/test_convert.py | 5 ++++- tests/unit/ml/test_golden_sql.py | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bigframes/core/convert.py b/bigframes/core/convert.py index 94a0564556..1546c2f87e 100644 --- a/bigframes/core/convert.py +++ b/bigframes/core/convert.py @@ -54,7 +54,7 @@ def to_bf_series( bigframes.pandas.Series """ if isinstance(obj, series.Series): - return obj + return obj.copy() if session is None: session = global_session.get_global_session() @@ -118,7 +118,7 @@ def to_bf_dataframe( session: Optional[session.Session] = None, ) -> dataframe.DataFrame: if isinstance(obj, dataframe.DataFrame): - return obj + return obj.copy() if isinstance(obj, pd.DataFrame): if session is None: diff --git a/tests/system/small/core/test_convert.py b/tests/system/small/core/test_convert.py index 3f74d17091..7ce0dd47ba 100644 --- a/tests/system/small/core/test_convert.py +++ b/tests/system/small/core/test_convert.py @@ -56,4 +56,7 @@ def test_to_bf_dataframe(input, session): def test_to_bf_dataframe_with_bf_dataframe(session): bf = dataframe.DataFrame({"test": [1, 2, 3]}, session=session) - assert convert.to_bf_dataframe(bf, None, session) is bf + testing.assert_frame_equal( + convert.to_bf_dataframe(bf, None, session).to_pandas(), + bf.to_pandas(), + ) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 97d1d2d7d1..c9d147e18f 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -66,6 +66,7 @@ def mock_y(mock_session): mock_y._session = mock_session mock_y.columns = pd.Index(["input_column_label"]) mock_y.cache.return_value = mock_y + mock_y.copy.return_value = mock_y return mock_y @@ -98,6 +99,7 @@ def mock_X(mock_y, mock_session): ) mock_X.cache.return_value = mock_X + mock_X.copy.return_value = mock_X return mock_X From 27ab028cdc45296923b12446c77b344af4208a3a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 7 Mar 2025 13:58:18 -0800 Subject: [PATCH 10/19] perf: Compilation no longer bounded by recursion (#1464) --- bigframes/core/bigframe_node.py | 13 + bigframes/core/compile/api.py | 13 +- bigframes/core/compile/compiler.py | 468 ++++++++++++++--------------- bigframes/session/executor.py | 2 +- 4 files changed, 252 insertions(+), 244 deletions(-) diff --git a/bigframes/core/bigframe_node.py b/bigframes/core/bigframe_node.py index 32c7f92912..369e8f6329 100644 --- a/bigframes/core/bigframe_node.py +++ b/bigframes/core/bigframe_node.py @@ -32,6 +32,8 @@ COLUMN_SET = frozenset[identifiers.ColumnId] +T = typing.TypeVar("T") + @dataclasses.dataclass(frozen=True) class Field: @@ -382,3 +384,14 @@ def bottom_up( results[node] = result return results[self] + + def reduce_up(self, reduction: Callable[[BigFrameNode, Tuple[T, ...]], T]) -> T: + """Apply a bottom-up reduction to the tree.""" + results: 
dict[BigFrameNode, T] = {} + for node in list(self.iter_nodes_topo()): + # child nodes have already been transformed + child_results = tuple(results[child] for child in node.child_nodes) + result = reduction(node, child_results) + results[node] = result + + return results[self] diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index cf441a2053..32257c0f98 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -25,13 +25,8 @@ import bigframes.core.ordering import bigframes.core.schema -_STRICT_COMPILER = compiler.Compiler(strict=True) - class SQLCompiler: - def __init__(self, strict: bool = True): - self._compiler = compiler.Compiler(strict=strict) - def compile( self, node: bigframes.core.nodes.BigFrameNode, @@ -41,7 +36,7 @@ def compile( ) -> str: """Compile node into sql where rows are sorted with ORDER BY.""" # If we are ordering the query anyways, compiling the slice as a limit is probably a good idea. - return self._compiler.compile_sql(node, ordered=ordered, limit=limit) + return compiler.compile_sql(node, ordered=ordered, limit=limit) def compile_raw( self, @@ -50,16 +45,16 @@ def compile_raw( str, Sequence[bigquery.SchemaField], bigframes.core.ordering.RowOrdering ]: """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" - return self._compiler.compile_raw(node) + return compiler.compile_raw(node) def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema.""" import bigframes.core.schema - node = _STRICT_COMPILER._replace_unsupported_ops(node) + node = compiler._replace_unsupported_ops(node) node, _ = rewrite.pull_up_order(node, order_root=False) - ir = _STRICT_COMPILER.compile_node(node) + ir = compiler.compile_node(node) items = tuple( bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id)) for name, ibis_id in zip(node.schema.names, ir.column_ids) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index f5be71830c..3d9bf19f76 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -13,7 +13,6 @@ # limitations under the License. from __future__ import annotations -import dataclasses import functools import io import typing @@ -42,249 +41,250 @@ import bigframes.session -@dataclasses.dataclass(frozen=True) -class Compiler: - # In strict mode, ordering will always be deterministic - # In unstrict mode, ordering from ReadTable or after joins may be ambiguous to improve query performance. - strict: bool = True - scalar_op_compiler = compile_scalar.ScalarOpCompiler() - - def compile_sql( - self, - node: nodes.BigFrameNode, - ordered: bool, - limit: typing.Optional[int] = None, - ) -> str: - # later steps might add ids, so snapshot before those steps. 
- output_ids = node.schema.names - if ordered: - # Need to do this before replacing unsupported ops, as that will rewrite slice ops - node, pulled_up_limit = rewrites.pullup_limit_from_slice(node) - if (pulled_up_limit is not None) and ( - (limit is None) or limit > pulled_up_limit - ): - limit = pulled_up_limit - - node = self._replace_unsupported_ops(node) - # prune before pulling up order to avoid unnnecessary row_number() ops - node = rewrites.column_pruning(node) - node, ordering = rewrites.pull_up_order(node, order_root=ordered) - # final pruning to cleanup up any leftovers unused values - node = rewrites.column_pruning(node) - return self.compile_node(node).to_sql( - order_by=ordering.all_ordering_columns if ordered else (), - limit=limit, - selections=output_ids, +def compile_sql( + node: nodes.BigFrameNode, + ordered: bool, + limit: typing.Optional[int] = None, +) -> str: + # later steps might add ids, so snapshot before those steps. + output_ids = node.schema.names + if ordered: + # Need to do this before replacing unsupported ops, as that will rewrite slice ops + node, pulled_up_limit = rewrites.pullup_limit_from_slice(node) + if (pulled_up_limit is not None) and ( + (limit is None) or limit > pulled_up_limit + ): + limit = pulled_up_limit + + node = _replace_unsupported_ops(node) + # prune before pulling up order to avoid unnnecessary row_number() ops + node = rewrites.column_pruning(node) + node, ordering = rewrites.pull_up_order(node, order_root=ordered) + # final pruning to cleanup up any leftovers unused values + node = rewrites.column_pruning(node) + return compile_node(node).to_sql( + order_by=ordering.all_ordering_columns if ordered else (), + limit=limit, + selections=output_ids, + ) + + +def compile_raw( + node: nodes.BigFrameNode, +) -> typing.Tuple[ + str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering +]: + node = _replace_unsupported_ops(node) + node = rewrites.column_pruning(node) + node, ordering = rewrites.pull_up_order(node, order_root=True) + node = rewrites.column_pruning(node) + sql = compile_node(node).to_sql() + return sql, node.schema.to_bigquery(), ordering + + +def _replace_unsupported_ops(node: nodes.BigFrameNode): + # TODO: Run all replacement rules as single bottom-up pass + node = nodes.bottom_up(node, rewrites.rewrite_slice) + node = nodes.bottom_up(node, rewrites.rewrite_timedelta_expressions) + return node + + +# TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) +@functools.lru_cache(maxsize=5000) +def compile_node(node: nodes.BigFrameNode) -> compiled.UnorderedIR: + """Compile node into CompileArrayValue. 
Caches result.""" + return node.reduce_up(lambda node, children: _compile_node(node, *children)) + + +@functools.singledispatch +def _compile_node( + node: nodes.BigFrameNode, *compiled_children: compiled.UnorderedIR +) -> compiled.UnorderedIR: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + +@_compile_node.register +def compile_join( + node: nodes.JoinNode, left: compiled.UnorderedIR, right: compiled.UnorderedIR +): + condition_pairs = tuple( + (left.id.sql, right.id.sql) for left, right in node.conditions + ) + return left.join( + right=right, + type=node.type, + conditions=condition_pairs, + join_nulls=node.joins_nulls, + ) + + +@_compile_node.register +def compile_isin( + node: nodes.InNode, left: compiled.UnorderedIR, right: compiled.UnorderedIR +): + return left.isin_join( + right=right, + indicator_col=node.indicator_col.sql, + conditions=(node.left_col.id.sql, node.right_col.id.sql), + join_nulls=node.joins_nulls, + ) + + +@_compile_node.register +def compile_fromrange( + node: nodes.FromRangeNode, start: compiled.UnorderedIR, end: compiled.UnorderedIR +): + # Both start and end are single elements and do not inherently have an order) + start_table = start._to_ibis_expr() + end_table = end._to_ibis_expr() + + start_column = start_table.schema().names[0] + end_column = end_table.schema().names[0] + + # Perform a cross join to avoid errors + joined_table = start_table.cross_join(end_table) + + labels_array_table = ibis_api.range( + joined_table[start_column], joined_table[end_column] + node.step, node.step + ).name(node.output_id.sql) + labels = ( + typing.cast(ibis_types.ArrayValue, labels_array_table) + .as_table() + .unnest([node.output_id.sql]) + ) + return compiled.UnorderedIR( + labels, + columns=[labels[labels.columns[0]]], + ) + + +@_compile_node.register +def compile_readlocal(node: nodes.ReadLocalNode, *args): + array_as_pd = pd.read_feather( + io.BytesIO(node.feather_bytes), + columns=[item.source_id for item in node.scan_list.items], + ) + + # Convert timedeltas to microseconds for compatibility with BigQuery + _ = utils.replace_timedeltas_with_micros(array_as_pd) + + offsets = node.offsets_col.sql if node.offsets_col else None + return compiled.UnorderedIR.from_pandas( + array_as_pd, node.scan_list, offsets=offsets + ) + + +@_compile_node.register +def compile_readtable(node: nodes.ReadTableNode, *args): + ibis_table = _table_to_ibis( + node.source, scan_cols=[col.source_id for col in node.scan_list.items] + ) + + # TODO(b/395912450): Remove workaround solution once b/374784249 got resolved. 
+ for scan_item in node.scan_list.items: + if ( + scan_item.dtype == dtypes.JSON_DTYPE + and ibis_table[scan_item.source_id].type() == ibis_dtypes.string + ): + json_column = compile_scalar.parse_json( + ibis_table[scan_item.source_id] + ).name(scan_item.source_id) + ibis_table = ibis_table.mutate(json_column) + + return compiled.UnorderedIR( + ibis_table, + tuple( + ibis_table[scan_item.source_id].name(scan_item.id.sql) + for scan_item in node.scan_list.items + ), + ) + + +def _table_to_ibis( + source: nodes.BigqueryDataSource, + scan_cols: typing.Sequence[str], +) -> ibis_types.Table: + full_table_name = ( + f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}" + ) + # Physical schema might include unused columns, unsupported datatypes like JSON + physical_schema = ibis_bigquery.BigQuerySchema.to_ibis( + list(source.table.physical_schema) + ) + if source.at_time is not None or source.sql_predicate is not None: + import bigframes.session._io.bigquery + + sql = bigframes.session._io.bigquery.to_query( + full_table_name, + columns=scan_cols, + sql_predicate=source.sql_predicate, + time_travel_timestamp=source.at_time, ) + return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) + else: + return ibis_api.table(physical_schema, full_table_name).select(scan_cols) - def compile_raw( - self, - node: nodes.BigFrameNode, - ) -> typing.Tuple[ - str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering - ]: - node = self._replace_unsupported_ops(node) - node = rewrites.column_pruning(node) - node, ordering = rewrites.pull_up_order(node, order_root=True) - node = rewrites.column_pruning(node) - sql = self.compile_node(node).to_sql() - return sql, node.schema.to_bigquery(), ordering - - def _replace_unsupported_ops(self, node: nodes.BigFrameNode): - # TODO: Run all replacement rules as single bottom-up pass - node = nodes.bottom_up(node, rewrites.rewrite_slice) - node = nodes.bottom_up(node, rewrites.rewrite_timedelta_expressions) - return node - - # TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) - @functools.lru_cache(maxsize=5000) - def compile_node(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: - """Compile node into CompileArrayValue. 
Caches result.""" - return self._compile_node(node) - - @functools.singledispatchmethod - def _compile_node(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: - """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unrecognized node: {node}") - - @_compile_node.register - def compile_join(self, node: nodes.JoinNode): - condition_pairs = tuple( - (left.id.sql, right.id.sql) for left, right in node.conditions - ) - left_unordered = self.compile_node(node.left_child) - right_unordered = self.compile_node(node.right_child) - return left_unordered.join( - right=right_unordered, - type=node.type, - conditions=condition_pairs, - join_nulls=node.joins_nulls, - ) +@_compile_node.register +def compile_filter(node: nodes.FilterNode, child: compiled.UnorderedIR): + return child.filter(node.predicate) - @_compile_node.register - def compile_isin(self, node: nodes.InNode): - left_unordered = self.compile_node(node.left_child) - right_unordered = self.compile_node(node.right_child) - return left_unordered.isin_join( - right=right_unordered, - indicator_col=node.indicator_col.sql, - conditions=(node.left_col.id.sql, node.right_col.id.sql), - join_nulls=node.joins_nulls, - ) - @_compile_node.register - def compile_fromrange(self, node: nodes.FromRangeNode): - # Both start and end are single elements and do not inherently have an order - start = self.compile_node(node.start) - end = self.compile_node(node.end) - start_table = start._to_ibis_expr() - end_table = end._to_ibis_expr() - - start_column = start_table.schema().names[0] - end_column = end_table.schema().names[0] - - # Perform a cross join to avoid errors - joined_table = start_table.cross_join(end_table) - - labels_array_table = ibis_api.range( - joined_table[start_column], joined_table[end_column] + node.step, node.step - ).name(node.output_id.sql) - labels = ( - typing.cast(ibis_types.ArrayValue, labels_array_table) - .as_table() - .unnest([node.output_id.sql]) - ) - return compiled.UnorderedIR( - labels, - columns=[labels[labels.columns[0]]], - ) +@_compile_node.register +def compile_selection(node: nodes.SelectionNode, child: compiled.UnorderedIR): + selection = tuple((ref, id.sql) for ref, id in node.input_output_pairs) + return child.selection(selection) - @_compile_node.register - def compile_readlocal(self, node: nodes.ReadLocalNode): - array_as_pd = pd.read_feather( - io.BytesIO(node.feather_bytes), - columns=[item.source_id for item in node.scan_list.items], - ) - # Convert timedeltas to microseconds for compatibility with BigQuery - _ = utils.replace_timedeltas_with_micros(array_as_pd) +@_compile_node.register +def compile_projection(node: nodes.ProjectionNode, child: compiled.UnorderedIR): + projections = ((expr, id.sql) for expr, id in node.assignments) + return child.projection(tuple(projections)) - offsets = node.offsets_col.sql if node.offsets_col else None - return compiled.UnorderedIR.from_pandas( - array_as_pd, node.scan_list, offsets=offsets - ) - @_compile_node.register - def compile_readtable(self, node: nodes.ReadTableNode): - return self.compile_read_table_unordered(node.source, node.scan_list) - - def read_table_as_unordered_ibis( - self, - source: nodes.BigqueryDataSource, - scan_cols: typing.Sequence[str], - ) -> ibis_types.Table: - full_table_name = f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}" - # Physical schema might include unused columns, unsupported datatypes like JSON - physical_schema = 
ibis_bigquery.BigQuerySchema.to_ibis( - list(source.table.physical_schema) - ) - if source.at_time is not None or source.sql_predicate is not None: - import bigframes.session._io.bigquery - - sql = bigframes.session._io.bigquery.to_query( - full_table_name, - columns=scan_cols, - sql_predicate=source.sql_predicate, - time_travel_timestamp=source.at_time, - ) - return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) - else: - return ibis_api.table(physical_schema, full_table_name).select(scan_cols) - - def compile_read_table_unordered( - self, source: nodes.BigqueryDataSource, scan: nodes.ScanList - ): - ibis_table = self.read_table_as_unordered_ibis( - source, scan_cols=[col.source_id for col in scan.items] - ) +@_compile_node.register +def compile_concat(node: nodes.ConcatNode, *children: compiled.UnorderedIR): + output_ids = [id.sql for id in node.output_ids] + return concat_impl.concat_unordered(children, output_ids) - # TODO(b/395912450): Remove workaround solution once b/374784249 got resolved. - for scan_item in scan.items: - if ( - scan_item.dtype == dtypes.JSON_DTYPE - and ibis_table[scan_item.source_id].type() == ibis_dtypes.string - ): - json_column = compile_scalar.parse_json( - ibis_table[scan_item.source_id] - ).name(scan_item.source_id) - ibis_table = ibis_table.mutate(json_column) - - return compiled.UnorderedIR( - ibis_table, - tuple( - ibis_table[scan_item.source_id].name(scan_item.id.sql) - for scan_item in scan.items - ), - ) - @_compile_node.register - def compile_filter(self, node: nodes.FilterNode): - return self.compile_node(node.child).filter(node.predicate) - - @_compile_node.register - def compile_selection(self, node: nodes.SelectionNode): - result = self.compile_node(node.child) - selection = tuple((ref, id.sql) for ref, id in node.input_output_pairs) - return result.selection(selection) - - @_compile_node.register - def compile_projection(self, node: nodes.ProjectionNode): - result = self.compile_node(node.child) - projections = ((expr, id.sql) for expr, id in node.assignments) - return result.projection(tuple(projections)) - - @_compile_node.register - def compile_concat(self, node: nodes.ConcatNode): - output_ids = [id.sql for id in node.output_ids] - compiled_unordered = [self.compile_node(node) for node in node.children] - return concat_impl.concat_unordered(compiled_unordered, output_ids) - - @_compile_node.register - def compile_rowcount(self, node: nodes.RowCountNode): - result = self.compile_node(node.child).row_count(name=node.col_id.sql) - return result - - @_compile_node.register - def compile_aggregate(self, node: nodes.AggregateNode): - aggs = tuple((agg, id.sql) for agg, id in node.aggregations) - result = self.compile_node(node.child).aggregate( - aggs, node.by_column_ids, order_by=node.order_by - ) - # TODO: Remove dropna field and use filter node instead - if node.dropna: - for key in node.by_column_ids: - if node.child.field_by_id[key.id].nullable: - result = result.filter(operations.notnull_op.as_expr(key)) - return result - - @_compile_node.register - def compile_window(self, node: nodes.WindowOpNode): - result = self.compile_node(node.child).project_window_op( - node.expression, - node.window_spec, - node.output_name.sql, - never_skip_nulls=node.never_skip_nulls, - ) - return result +@_compile_node.register +def compile_rowcount(node: nodes.RowCountNode, child: compiled.UnorderedIR): + return child.row_count(name=node.col_id.sql) + + +@_compile_node.register +def compile_aggregate(node: nodes.AggregateNode, child: 
compiled.UnorderedIR): + aggs = tuple((agg, id.sql) for agg, id in node.aggregations) + result = child.aggregate(aggs, node.by_column_ids, order_by=node.order_by) + # TODO: Remove dropna field and use filter node instead + if node.dropna: + for key in node.by_column_ids: + if node.child.field_by_id[key.id].nullable: + result = result.filter(operations.notnull_op.as_expr(key)) + return result + + +@_compile_node.register +def compile_window(node: nodes.WindowOpNode, child: compiled.UnorderedIR): + result = child.project_window_op( + node.expression, + node.window_spec, + node.output_name.sql, + never_skip_nulls=node.never_skip_nulls, + ) + return result + + +@_compile_node.register +def compile_explode(node: nodes.ExplodeNode, child: compiled.UnorderedIR): + offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None + return bigframes.core.compile.explode.explode_unordered( + child, node.column_ids, offsets_col + ) - @_compile_node.register - def compile_explode(self, node: nodes.ExplodeNode): - offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None - return bigframes.core.compile.explode.explode_unordered( - self.compile_node(node.child), node.column_ids, offsets_col - ) - @_compile_node.register - def compile_random_sample(self, node: nodes.RandomSampleNode): - return self.compile_node(node.child)._uniform_sampling(node.fraction) +@_compile_node.register +def compile_random_sample(node: nodes.RandomSampleNode, child: compiled.UnorderedIR): + return child._uniform_sampling(node.fraction) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 0644b0e6d9..e539525d80 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -204,7 +204,7 @@ def __init__( self.bqclient = bqclient self.storage_manager = storage_manager self.compiler: bigframes.core.compile.SQLCompiler = ( - bigframes.core.compile.SQLCompiler(strict=strictly_ordered) + bigframes.core.compile.SQLCompiler() ) self.strictly_ordered: bool = strictly_ordered self._cached_executions: weakref.WeakKeyDictionary[ From ff46f5a16891638484a5bca64442bb02bf8e11f8 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Fri, 7 Mar 2025 15:01:20 -0800 Subject: [PATCH 11/19] chore: fix experimental blob docs (#1472) --- bigframes/operations/blob.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 6541a14655..183003780b 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -44,7 +44,7 @@ def uri(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: URIs as string.""" + bigframes.series.Series: URIs as string.""" s = bigframes.series.Series(self._block) return s.struct.field("uri") @@ -56,7 +56,7 @@ def authorizer(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: Autorithers(connection) as string.""" + bigframes.series.Series: Autorithers(connection) as string.""" s = bigframes.series.Series(self._block) return s.struct.field("authorizer") @@ -68,7 +68,7 @@ def version(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. 
Returns: - BigFrames Series: Version as string.""" + bigframes.series.Series: Version as string.""" # version must be retrieved after fetching metadata return self._apply_unary_op(ops.obj_fetch_metadata_op).struct.field("version") @@ -79,7 +79,7 @@ def metadata(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: JSON metadata of the Blob. Contains fields: content_type, md5_hash, size and updated(time).""" + bigframes.series.Series: JSON metadata of the Blob. Contains fields: content_type, md5_hash, size and updated(time).""" details_json = self._apply_unary_op(ops.obj_fetch_metadata_op).struct.field( "details" ) @@ -94,7 +94,7 @@ def content_type(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: string of the content type.""" + bigframes.series.Series: string of the content type.""" return ( self.metadata() ._apply_unary_op(ops.JSONValue(json_path="$.content_type")) @@ -108,7 +108,7 @@ def md5_hash(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: string of the md5 hash.""" + bigframes.series.Series: string of the md5 hash.""" return ( self.metadata() ._apply_unary_op(ops.JSONValue(json_path="$.md5_hash")) @@ -122,7 +122,7 @@ def size(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: file size in bytes.""" + bigframes.series.Series: file size in bytes.""" return ( self.metadata() ._apply_unary_op(ops.JSONValue(json_path="$.size")) @@ -137,7 +137,7 @@ def updated(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: updated time as UTC datetime.""" + bigframes.series.Series: updated time as UTC datetime.""" import bigframes.pandas as bpd updated = ( @@ -159,7 +159,7 @@ def _get_runtime( metadata (bool, default False): whether to fetch the metadata in the ObjectRefRuntime. Returns: - bigframes Series: ObjectRefRuntime JSON. + bigframes.series.Series: ObjectRefRuntime JSON. """ s = self._apply_unary_op(ops.obj_fetch_metadata_op) if with_metadata else self @@ -172,7 +172,7 @@ def read_url(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: Read only URLs.""" + bigframes.series.Series: Read only URLs.""" return self._get_runtime(mode="R")._apply_unary_op( ops.JSONValue(json_path="$.access_urls.read_url") ) @@ -184,7 +184,7 @@ def write_url(self) -> bigframes.series.Series: BigFrames Blob is still under experiments. It may not work and subject to change in the future. Returns: - BigFrames Series: Writable URLs.""" + bigframes.series.Series: Writable URLs.""" return self._get_runtime(mode="RW")._apply_unary_op( ops.JSONValue(json_path="$.access_urls.write_url") ) @@ -303,7 +303,7 @@ def image_blur( container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. Returns: - BigFrames Blob Series + bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. 
""" import bigframes.blob._functions as blob_func @@ -390,7 +390,7 @@ def image_resize( container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. Returns: - BigFrames Blob Series + bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ dsize_set = dsize[0] > 0 and dsize[1] > 0 fsize_set = fx > 0.0 and fy > 0.0 @@ -486,7 +486,7 @@ def image_normalize( container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. Returns: - BigFrames Blob Series + bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ import bigframes.blob._functions as blob_func @@ -603,8 +603,7 @@ def pdf_chunk( arrays of strings. .. note:: - BigFrames Blob is still under experiments. It may not work and - subject to change in the future. + BigFrames Blob is still under experiments. It may not work and subject to change in the future. Args: connection (str or None, default None): BQ connection used for @@ -621,7 +620,7 @@ def pdf_chunk( container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. Returns: - bigframe.series.Series of array[str], where each string is a + bigframe.series.Series: Series of array[str], where each string is a chunk of text extracted from PDF. """ From 461e9e017d513376fc623a5ee47f8b9dd002b452 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Fri, 7 Mar 2025 17:41:10 -0800 Subject: [PATCH 12/19] feat: support list output for managed function (#1457) * feat: support list output for managed function * add test decorator * resolve comments --- bigframes/dataframe.py | 6 +- bigframes/functions/_function_session.py | 5 + bigframes/operations/remote_function_ops.py | 21 ++- .../large/functions/test_managed_function.py | 160 ++++++++++++++++++ .../large/functions/test_remote_function.py | 18 +- .../small/functions/test_managed_function.py | 155 +++++++++++++++++ 6 files changed, 345 insertions(+), 20 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 151da51792..2349e469ab 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4199,11 +4199,13 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): udf_input_dtypes = getattr(func, "input_dtypes") if len(udf_input_dtypes) != len(self.columns): raise ValueError( - f"Remote function takes {len(udf_input_dtypes)} arguments but DataFrame has {len(self.columns)} columns." + f"BigFrames BigQuery function takes {len(udf_input_dtypes)}" + f" arguments but DataFrame has {len(self.columns)} columns." ) if udf_input_dtypes != tuple(self.dtypes.to_list()): raise ValueError( - f"Remote function takes arguments of types {udf_input_dtypes} but DataFrame dtypes are {tuple(self.dtypes)}." + f"BigFrames BigQuery function takes arguments of types " + f"{udf_input_dtypes} but DataFrame dtypes are {tuple(self.dtypes)}." 
) series_list = [self[col] for col in self.columns] diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 15c8cb979e..0ae674b97d 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -892,6 +892,7 @@ def wrapper(func): func = cloudpickle.loads(cloudpickle.dumps(func)) self._try_delattr(func, "bigframes_bigquery_function") + self._try_delattr(func, "bigframes_bigquery_function_output_dtype") self._try_delattr(func, "input_dtypes") self._try_delattr(func, "output_dtype") self._try_delattr(func, "is_row_processor") @@ -951,6 +952,10 @@ def wrapper(func): ibis_signature.output_type ) ) + # Managed function directly supports certain output types which are + # not supported in remote function (e.g. list output). Thus no more + # processing for 'bigframes_bigquery_function_output_dtype'. + func.bigframes_bigquery_function_output_dtype = func.output_dtype func.is_row_processor = is_row_processor func.ibis_node = node diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 8505fd1607..51cfccbc41 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -29,11 +29,12 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method + # The output dtype should be set to a valid Dtype by @udf decorator, + # @remote_function decorator, or read_gbq_function method. if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): return self.func.bigframes_bigquery_function_output_dtype - else: - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -46,11 +47,12 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method + # The output dtype should be set to a valid Dtype by @udf decorator, + # @remote_function decorator, or read_gbq_function method. if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): return self.func.bigframes_bigquery_function_output_dtype - else: - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -63,8 +65,9 @@ def expensive(self) -> bool: return True def output_type(self, *input_types): - # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method + # The output dtype should be set to a valid Dtype by @udf decorator, + # @remote_function decorator, or read_gbq_function method. 
if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): return self.func.bigframes_bigquery_function_output_dtype - else: - raise AttributeError("bigframes_bigquery_function_output_dtype not defined") + + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 4db7a1c47c..503720edcc 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -13,8 +13,10 @@ # limitations under the License. import pandas +import pyarrow import pytest +import bigframes from bigframes.functions import _function_session as bff_session from bigframes.functions._utils import get_python_version import bigframes.pandas as bpd @@ -164,3 +166,161 @@ def func(x, y): cleanup_function_assets( session.bqclient, session.cloudfunctionsclient, managed_func ) + + +@pytest.mark.parametrize( + "array_dtype", + [ + bool, + int, + float, + str, + ], +) +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_array_output(session, scalars_dfs, dataset_id, array_dtype): + try: + + @session.udf(dataset=dataset_id) + def featurize(x: int) -> list[array_dtype]: # type: ignore + return [array_dtype(i) for i in [x, x + 1, x + 2]] + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_too"] + bf_result = bf_int64_col.apply(featurize).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_too"] + pd_result = pd_int64_col.apply(featurize) + + # Ignore any dtype disparity. + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets( + featurize, session.bqclient, session.cloudfunctionsclient + ) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_binop_array_output(session, scalars_dfs, dataset_id): + try: + + def func(x, y): + return [len(x), abs(y % 4)] + + managed_func = session.udf( + input_types=[str, int], + output_type=list[int], + dataset=dataset_id, + )(func) + + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_df = scalars_df.dropna() + scalars_pandas_df = scalars_pandas_df.dropna() + bf_result = ( + scalars_df["string_col"] + .combine(scalars_df["int64_col"], managed_func) + .to_pandas() + ) + pd_result = scalars_pandas_df["string_col"].combine( + scalars_pandas_df["int64_col"], func + ) + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets( + managed_func, session.bqclient, session.cloudfunctionsclient + ) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_manage_function_df_apply_axis_1_array_output(session): + bf_df = bigframes.dataframe.DataFrame( + { + "Id": [1, 2, 3], + "Age": [22.5, 23, 23.5], + "Name": ["alpha", "beta", "gamma"], + } + ) + + expected_dtypes = ( + bigframes.dtypes.INT_DTYPE, + bigframes.dtypes.FLOAT_DTYPE, + bigframes.dtypes.STRING_DTYPE, + ) + + # Assert the dataframe dtypes. 
+ assert tuple(bf_df.dtypes) == expected_dtypes + + try: + + @session.udf(input_types=[int, float, str], output_type=list[str]) + def foo(x, y, z): + return [str(x), str(y), z] + + assert getattr(foo, "is_row_processor") is False + assert getattr(foo, "input_dtypes") == expected_dtypes + assert getattr(foo, "output_dtype") == pandas.ArrowDtype( + pyarrow.list_( + bigframes.dtypes.bigframes_dtype_to_arrow_dtype( + bigframes.dtypes.STRING_DTYPE + ) + ) + ) + assert getattr(foo, "output_dtype") == getattr( + foo, "bigframes_bigquery_function_output_dtype" + ) + + # Fails to apply on dataframe with incompatible number of columns. + with pytest.raises( + ValueError, + match="^BigFrames BigQuery function takes 3 arguments but DataFrame has 2 columns\\.$", + ): + bf_df[["Id", "Age"]].apply(foo, axis=1) + + with pytest.raises( + ValueError, + match="^BigFrames BigQuery function takes 3 arguments but DataFrame has 4 columns\\.$", + ): + bf_df.assign(Country="lalaland").apply(foo, axis=1) + + # Fails to apply on dataframe with incompatible column datatypes. + with pytest.raises( + ValueError, + match="^BigFrames BigQuery function takes arguments of types .* but DataFrame dtypes are .*", + ): + bf_df.assign(Age=bf_df["Age"].astype("Int64")).apply(foo, axis=1) + + # Successfully applies to dataframe with matching number of columns. + # and their datatypes. + bf_result = bf_df.apply(foo, axis=1).to_pandas() + + # Since this scenario is not pandas-like, let's handcraft the + # expected result. + expected_result = pandas.Series( + [ + ["1", "22.5", "alpha"], + ["2", "23.0", "beta"], + ["3", "23.5", "gamma"], + ] + ) + + pandas.testing.assert_series_equal( + expected_result, bf_result, check_dtype=False, check_index_type=False + ) + + finally: + # Clean up the gcp assets created for the managed function. 
+ cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 350eae3783..65bf20b966 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2085,19 +2085,19 @@ def foo(x, y, z): # Fails to apply on dataframe with incompatible number of columns with pytest.raises( ValueError, - match="^Remote function takes 3 arguments but DataFrame has 2 columns\\.$", + match="^BigFrames BigQuery function takes 3 arguments but DataFrame has 2 columns\\.$", ): bf_df[["Id", "Age"]].apply(foo, axis=1) with pytest.raises( ValueError, - match="^Remote function takes 3 arguments but DataFrame has 4 columns\\.$", + match="^BigFrames BigQuery function takes 3 arguments but DataFrame has 4 columns\\.$", ): bf_df.assign(Country="lalaland").apply(foo, axis=1) # Fails to apply on dataframe with incompatible column datatypes with pytest.raises( ValueError, - match="^Remote function takes arguments of types .* but DataFrame dtypes are .*", + match="^BigFrames BigQuery function takes arguments of types .* but DataFrame dtypes are .*", ): bf_df.assign(Age=bf_df["Age"].astype("Int64")).apply(foo, axis=1) @@ -2171,19 +2171,19 @@ def foo(x, y, z): # Fails to apply on dataframe with incompatible number of columns with pytest.raises( ValueError, - match="^Remote function takes 3 arguments but DataFrame has 2 columns\\.$", + match="^BigFrames BigQuery function takes 3 arguments but DataFrame has 2 columns\\.$", ): bf_df[["Id", "Age"]].apply(foo, axis=1) with pytest.raises( ValueError, - match="^Remote function takes 3 arguments but DataFrame has 4 columns\\.$", + match="^BigFrames BigQuery function takes 3 arguments but DataFrame has 4 columns\\.$", ): bf_df.assign(Country="lalaland").apply(foo, axis=1) # Fails to apply on dataframe with incompatible column datatypes with pytest.raises( ValueError, - match="^Remote function takes arguments of types .* but DataFrame dtypes are .*", + match="^BigFrames BigQuery function takes arguments of types .* but DataFrame dtypes are .*", ): bf_df.assign(Age=bf_df["Age"].astype("Int64")).apply(foo, axis=1) @@ -2240,19 +2240,19 @@ def foo(x): # Fails to apply on dataframe with incompatible number of columns with pytest.raises( ValueError, - match="^Remote function takes 1 arguments but DataFrame has 0 columns\\.$", + match="^BigFrames BigQuery function takes 1 arguments but DataFrame has 0 columns\\.$", ): bf_df[[]].apply(foo, axis=1) with pytest.raises( ValueError, - match="^Remote function takes 1 arguments but DataFrame has 2 columns\\.$", + match="^BigFrames BigQuery function takes 1 arguments but DataFrame has 2 columns\\.$", ): bf_df.assign(Country="lalaland").apply(foo, axis=1) # Fails to apply on dataframe with incompatible column datatypes with pytest.raises( ValueError, - match="^Remote function takes arguments of types .* but DataFrame dtypes are .*", + match="^BigFrames BigQuery function takes arguments of types .* but DataFrame dtypes are .*", ): bf_df.assign(Id=bf_df["Id"].astype("Float64")).apply(foo, axis=1) diff --git a/tests/system/small/functions/test_managed_function.py b/tests/system/small/functions/test_managed_function.py index 41a5785d01..e1af68512a 100644 --- a/tests/system/small/functions/test_managed_function.py +++ b/tests/system/small/functions/test_managed_function.py @@ -62,6 +62,9 @@ def foo(x): assert hasattr(foo, "bigframes_bigquery_function") 
assert hasattr(foo, "ibis_node") + assert hasattr(foo, "input_dtypes") + assert hasattr(foo, "output_dtype") + assert hasattr(foo, "bigframes_bigquery_function_output_dtype") scalars_df, scalars_pandas_df = scalars_dfs @@ -124,6 +127,88 @@ def add(x: int, y: int) -> int: pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +@pytest.mark.parametrize( + ("typ",), + [ + pytest.param(int), + pytest.param(float), + pytest.param(bool), + pytest.param(str), + ], +) +def test_managed_function_series_apply_list_output( + typ, + scalars_dfs, + dataset_id_permanent, +): + def foo_list(x): + # The bytes() constructor expects a non-negative interger as its arg. + return [typ(abs(x)), typ(abs(x) + 1)] + + foo_list = udf( + input_types=int, + output_type=list[typ], # type: ignore + dataset=dataset_id_permanent, + name=get_function_name(foo_list), + )(foo_list) + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result_col = scalars_df["int64_too"].apply(foo_list) + bf_result = ( + scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_result_col = scalars_pandas_df["int64_too"].apply(foo_list) + pd_result = scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) + + # Ignore any dtype difference. + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_series_combine_list_output(dataset_id_permanent, scalars_dfs): + def add_list(x: int, y: int) -> list[int]: + return [x, y] + + scalars_df, scalars_pandas_df = scalars_dfs + int_col_name_with_nulls = "int64_col" + int_col_name_no_nulls = "int64_too" + bf_df = scalars_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + pd_df = scalars_pandas_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + + # Make sure there are NA values in the test column. + assert any([pd.isna(val) for val in bf_df[int_col_name_with_nulls]]) + + add_list_managed_func = udf( + dataset=dataset_id_permanent, + name=get_function_name(add_list), + )(add_list) + + # After filtering out nulls the managed function application should work + # similar to pandas. + pd_filter = pd_df[int_col_name_with_nulls].notnull() + pd_result = pd_df[pd_filter][int_col_name_with_nulls].combine( + pd_df[pd_filter][int_col_name_no_nulls], add_list + ) + bf_filter = bf_df[int_col_name_with_nulls].notnull() + bf_result = ( + bf_df[bf_filter][int_col_name_with_nulls] + .combine(bf_df[bf_filter][int_col_name_no_nulls], add_list_managed_func) + .to_pandas() + ) + + # Ignore any dtype difference. 
+ pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + @pytest.mark.skipif( get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", @@ -197,3 +282,73 @@ def add_ints(x, y): pd.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_exact=True ) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_dataframe_map_list_output(scalars_dfs, dataset_id_permanent): + def add_one_list(x): + return [x + 1] * 3 + + mf_add_one_list = udf( + input_types=[int], + output_type=list[int], + dataset=dataset_id_permanent, + name=get_function_name(add_one_list), + )(add_one_list) + + scalars_df, scalars_pandas_df = scalars_dfs + int64_cols = ["int64_col", "int64_too"] + + bf_int64_df = scalars_df[int64_cols] + bf_int64_df_filtered = bf_int64_df.dropna() + bf_result = bf_int64_df_filtered.map(mf_add_one_list).to_pandas() + + pd_int64_df = scalars_pandas_df[int64_cols] + pd_int64_df_filtered = pd_int64_df.dropna() + pd_result = pd_int64_df_filtered.map(add_one_list) + + # Ignore any dtype difference. + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_dataframe_apply_axis_1_list_output( + session, scalars_dfs, dataset_id_permanent +): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df["int64_too"] + series_pandas = scalars_pandas_df["int64_too"] + + def add_ints_list(x, y): + return [x + y] * 2 + + add_ints_list_mf = session.udf( + input_types=[int, int], + output_type=list[int], + dataset=dataset_id_permanent, + name=get_function_name(add_ints_list, is_row_processor=True), + )(add_ints_list) + assert add_ints_list_mf.bigframes_bigquery_function # type: ignore + + with pytest.warns( + bigframes.exceptions.PreviewWarning, + match="axis=1 scenario is in preview.", + ): + bf_result = ( + bpd.DataFrame({"x": series, "y": series}) + .apply(add_ints_list_mf, axis=1) + .to_pandas() + ) + + pd_result = pd.DataFrame({"x": series_pandas, "y": series_pandas}).apply( + lambda row: add_ints_list(row["x"], row["y"]), axis=1 + ) + + # Ignore any dtype difference. 
+ pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) From 8bba8df66b45ae5e46924dd2aaa04f7e5539a4e5 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 10 Mar 2025 14:22:46 -0700 Subject: [PATCH 13/19] chore: Cleanup kokoro artifacts at end of build (#1462) --- .kokoro/build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.kokoro/build.sh b/.kokoro/build.sh index 58eaa7fedf..6cc03455da 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -50,3 +50,6 @@ if [[ -n "${NOX_SESSION:-}" ]]; then else python3 -m nox --stop-on-first-error fi + +# Prevent kokoro from trying to collect many mb of artifacts, wasting several minutes +sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/* From 0ddee998ca7425047a12f21d2f544d9a034e19fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 10 Mar 2025 17:49:14 -0500 Subject: [PATCH 14/19] test: pin to older pandas-stubs (#1477) --- noxfile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index ca147e171d..b95e58f4ef 100644 --- a/noxfile.py +++ b/noxfile.py @@ -256,7 +256,8 @@ def mypy(session): set( [ "mypy", - "pandas-stubs", + # TODO: update to latest pandas-stubs once we resolve bigframes issues. + "pandas-stubs<=2.2.3.241126", "types-protobuf", "types-python-dateutil", "types-requests", From 9e471fbd1a3661300d988d8307013476029f4ee8 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 11 Mar 2025 10:17:46 -0700 Subject: [PATCH 15/19] test: target e2e tests to python 3.11 for max coverage (#1474) * test: target e2e tests to python 3.12 for max coverage * ensure large udf tests are run without skip * remove pytest.mark.skip from one more test * adjust the expect warnings in the ingress settings and service account * fix mypy * remove version 1.x check to surface 2.0 future warning --- bigframes/functions/_function_session.py | 10 +-- noxfile.py | 8 +- .../large/functions/test_managed_function.py | 12 --- .../large/functions/test_remote_function.py | 86 +++++++++++++------ 4 files changed, 68 insertions(+), 48 deletions(-) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 0ae674b97d..a66f619cf9 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -47,7 +47,6 @@ ) from bigframes import clients -from bigframes import version as bigframes_version import bigframes.core.compile.ibis_types import bigframes.exceptions as bfe import bigframes.series as bf_series @@ -458,16 +457,13 @@ def remote_function( msg = bfe.format_message( "You have not explicitly set a user-managed `cloud_function_service_account`. " "Using the default Compute Engine service account. " - "To use Bigframes 2.0, please explicitly set `cloud_function_service_account` " + "In BigFrames 2.0 onwards, you would have to explicitly set `cloud_function_service_account` " 'either to a user-managed service account (preferred) or to `"default"` ' - "to use the Compute Engine service account (discouraged). " + "to use the default Compute Engine service account (discouraged). " "See, https://cloud.google.com/functions/docs/securing/function-identity." 
) - if ( - bigframes_version.__version__.startswith("1.") - and cloud_function_service_account is None - ): + if cloud_function_service_account is None: warnings.warn(msg, stacklevel=2, category=FutureWarning) if cloud_function_service_account == "default": diff --git a/noxfile.py b/noxfile.py index b95e58f4ef..a08ef27781 100644 --- a/noxfile.py +++ b/noxfile.py @@ -59,6 +59,12 @@ DEFAULT_PYTHON_VERSION = "3.10" +# Cloud Run Functions supports Python versions up to 3.12 +# https://cloud.google.com/run/docs/runtimes/python +# Managed Python UDF is supported only in Python 3.11 +# Let's set the E2E tests version to 3.11 to cover most code paths. +E2E_TEST_PYTHON_VERSION = "3.11" + UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"] UNIT_TEST_STANDARD_DEPENDENCIES = [ "mock", @@ -418,7 +424,7 @@ def doctest(session: nox.sessions.Session): ) -@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) +@nox.session(python=E2E_TEST_PYTHON_VERSION) def e2e(session: nox.sessions.Session): """Run the large tests in system test suite.""" run_system( diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 503720edcc..efab338861 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -25,10 +25,6 @@ bpd.options.experiments.udf = True -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) def test_managed_function_multiply_with_ibis( session, scalars_table_id, @@ -80,10 +76,6 @@ def multiply(x, y): cleanup_function_assets(multiply, bigquery_client) -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) def test_managed_function_stringify_with_ibis( session, scalars_table_id, @@ -132,10 +124,6 @@ def stringify(x): ) -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) def test_managed_function_binop(session, scalars_dfs, dataset_id): try: diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 65bf20b966..0d7f888306 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -17,10 +17,11 @@ import inspect import math # must keep this at top level to test udf referring global import import os.path +import re import shutil -import sys import tempfile import textwrap +import typing import warnings import google.api_core.exceptions @@ -50,12 +51,6 @@ _team_euler = "Team Euler" -pytestmark = pytest.mark.skipif( - sys.version_info >= (3, 13), - reason="Runtime 'python313' is not supported yet. Skip for now.", -) - - def make_uniq_udf(udf): """Transform a udf to another with same behavior but a unique name. 
Use this to test remote functions with reuse=True, in which case parallel @@ -1323,14 +1318,38 @@ def square_num(x): ) -def test_remote_function_warns_default_cloud_function_service_account(): - project = "bigframes-dev-perf" - rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) - - with pytest.warns(FutureWarning, match="You have not explicitly set a"): - rf_session.remote_function( - cloud_function_service_account=None, # Explicitly omit service account. - ) +@pytest.mark.parametrize( + ("remote_function_args"), + [ + pytest.param( + {}, + id="no-set", + ), + pytest.param( + {"cloud_function_service_account": None}, + id="set-none", + ), + ], +) +def test_remote_function_warns_default_cloud_function_service_account( + session, remote_function_args +): + with pytest.warns(FutureWarning) as record: + session.remote_function(**remote_function_args) + + len( + [ + warn + for warn in record + if re.search( + ( + "You have not explicitly set a user-managed.*Using the default Compute Engine.*service account" + ), + typing.cast(FutureWarning, warn.message).args[0], + re.DOTALL, + ) + ] + ) == 1 @pytest.mark.flaky(retries=2, delay=120) @@ -2319,36 +2338,40 @@ def generate_stats(row: pandas.Series) -> list[int]: @pytest.mark.parametrize( - ("ingress_settings_args", "effective_ingress_settings", "expected_warning"), + ( + "ingress_settings_args", + "effective_ingress_settings", + "expect_default_ingress_setting_warning", + ), [ pytest.param( {}, functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, - FutureWarning, + True, id="no-set", ), pytest.param( {"cloud_function_ingress_settings": None}, functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, - FutureWarning, + True, id="set-none", ), pytest.param( {"cloud_function_ingress_settings": "all"}, functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, - None, + False, id="set-all", ), pytest.param( {"cloud_function_ingress_settings": "internal-only"}, functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_ONLY, - None, + False, id="set-internal-only", ), pytest.param( {"cloud_function_ingress_settings": "internal-and-gclb"}, functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_AND_GCLB, - None, + False, id="set-internal-and-gclb", ), ], @@ -2359,11 +2382,11 @@ def test_remote_function_ingress_settings( scalars_dfs, ingress_settings_args, effective_ingress_settings, - expected_warning, + expect_default_ingress_setting_warning, ): try: # Verify the function raises the expected security warning message. - with warnings.catch_warnings(record=True) as w: + with warnings.catch_warnings(record=True) as record: def square(x: int) -> int: return x * x @@ -2372,11 +2395,18 @@ def square(x: int) -> int: reuse=False, **ingress_settings_args )(square) - if expected_warning is not None: - assert issubclass(w[0].category, FutureWarning) - assert "Consider using 'internal-only' for enhanced security." 
in str( - w[0].message - ) + default_ingress_setting_warnings = [ + warn + for warn in record + if isinstance(warn.message, FutureWarning) + and "`cloud_function_ingress_settings` are set to 'all' by default" + in warn.message.args[0] + and "will change to 'internal-only' for enhanced security in future" + in warn.message.args[0] + ] + assert len(default_ingress_setting_warnings) == ( + 1 if expect_default_ingress_setting_warning else 0 + ) # Assert that the GCF is created with the intended maximum timeout gcf = session.cloudfunctionsclient.get_function( From 01dfe837740ba7119298cced6d9638af7326049b Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 11 Mar 2025 10:57:20 -0700 Subject: [PATCH 16/19] chore: fix experimental blob errors in preview non-exist files (#1479) --- bigframes/dataframe.py | 18 +++++++++--------- bigframes/operations/blob.py | 12 ++++++++++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2349e469ab..262b23abd2 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -778,15 +778,15 @@ def _repr_html_(self) -> str: def obj_ref_rt_to_html(obj_ref_rt) -> str: obj_ref_rt_json = json.loads(obj_ref_rt) - gcs_metadata = obj_ref_rt_json["objectref"]["details"][ - "gcs_metadata" - ] - content_type = typing.cast( - str, gcs_metadata.get("content_type", "") - ) - if content_type.startswith("image"): - url = obj_ref_rt_json["access_urls"]["read_url"] - return f'' + obj_ref_details = obj_ref_rt_json["objectref"]["details"] + if "gcs_metadata" in obj_ref_details: + gcs_metadata = obj_ref_details["gcs_metadata"] + content_type = typing.cast( + str, gcs_metadata.get("content_type", "") + ) + if content_type.startswith("image"): + url = obj_ref_rt_json["access_urls"]["read_url"] + return f'' return f'uri: {obj_ref_rt_json["objectref"]["uri"]}, authorizer: {obj_ref_rt_json["objectref"]["authorizer"]}' diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 183003780b..88a58acbfa 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -18,6 +18,7 @@ from typing import cast, Optional, Union import IPython.display as ipy_display +import pandas as pd import requests from bigframes import clients @@ -209,8 +210,15 @@ def display(self, n: int = 3, *, content_type: str = ""): else: df["content_type"] = df["blob_col"].blob.content_type() - def display_single_url(read_url: str, content_type: str): - content_type = content_type.casefold() + def display_single_url( + read_url: str, content_type: Union[str, pd._libs.missing.NAType] + ): + if content_type is pd.NA: # display as raw data or error + response = requests.get(read_url) + ipy_display.display(response.content) + return + + content_type = cast(str, content_type).casefold() if content_type.startswith("image"): ipy_display.display(ipy_display.Image(url=read_url)) From e720f41ef643ac14ae94fa98de5ef4a3fd6dde93 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 11 Mar 2025 12:13:24 -0700 Subject: [PATCH 17/19] feat!: reading JSON data as a custom arrow extension type (#1458) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: use JSONArrowType for JSON data * fix related system tests * fixes lint and doctest * switch db_dtypes into 1.4.2 * fix tests * fix test_df_drop_duplicates_w_json * commit suggestion --------- Co-authored-by: Tim Sweña (Swast) Release-As: 1.40.0 --- bigframes/bigquery/_operations/json.py | 4 +- 
bigframes/core/array_value.py | 4 +- bigframes/core/compile/ibis_types.py | 2 +- bigframes/dtypes.py | 7 +- bigframes/session/_io/pandas.py | 3 - setup.py | 2 +- testing/constraints-3.9.txt | 2 +- tests/system/small/bigquery/test_json.py | 59 ++++++++-------- tests/system/small/test_dataframe.py | 13 +++- tests/system/small/test_dataframe_io.py | 85 +++++++++++------------- tests/system/small/test_series.py | 26 ++++---- tests/system/small/test_session.py | 25 ++++--- 12 files changed, 119 insertions(+), 113 deletions(-) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 0223811ebc..07efc5fa51 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -53,7 +53,7 @@ def json_set( >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) 0 {"a":100,"b":"hi"} - Name: data, dtype: dbjson + Name: data, dtype: extension>[pyarrow] Args: input (bigframes.series.Series): @@ -253,7 +253,7 @@ def parse_json( dtype: string >>> bbq.parse_json(s) 0 {"class":{"students":[{"id":5},{"id":12}]}} - dtype: dbjson + dtype: extension>[pyarrow] Args: input (bigframes.series.Series): diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 9c44255941..7ede7b7e65 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -108,8 +108,8 @@ def from_table( raise ValueError("must set at most one of 'offests', 'primary_key'") if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): msg = bfe.format_message( - "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is " - "in preview; this behavior may change in future versions." + "JSON column interpretation as a custom PyArrow extention in `db_dtypes` " + "is a preview feature and subject to change." ) warnings.warn(msg, bfe.PreviewWarning) # define data source only for needed columns, this makes row-hashing cheaper diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 54a5a37736..54b0a1408a 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -75,7 +75,7 @@ IBIS_GEO_TYPE, gpd.array.GeometryDtype(), ), - (ibis_dtypes.json, db_dtypes.JSONDtype()), + (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())), ) BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5e9f1f108b..22cc521e8e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -62,7 +62,9 @@ # No arrow equivalent GEO_DTYPE = gpd.array.GeometryDtype() # JSON -JSON_DTYPE = db_dtypes.JSONDtype() +# TODO: switch to pyarrow.json_(pyarrow.string()) when available. 
+JSON_ARROW_TYPE = db_dtypes.JSONArrowType() +JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE) OBJ_REF_DTYPE = pd.ArrowDtype( pa.struct( ( @@ -80,7 +82,7 @@ ), pa.field( "details", - db_dtypes.JSONArrowType(), + JSON_ARROW_TYPE, ), ) ) @@ -301,7 +303,6 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool: return type_ in ("object", "O") or ( getattr(type_, "kind", None) == "O" and getattr(type_, "storage", None) != "pyarrow" - and getattr(type_, "name", None) != "dbjson" ) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index a1549238b3..ca70ee774c 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -18,7 +18,6 @@ from typing import Collection, Union import bigframes_vendored.constants as constants -import db_dtypes # type: ignore import geopandas # type: ignore import numpy as np import pandas @@ -125,8 +124,6 @@ def arrow_to_pandas( ) elif isinstance(dtype, pandas.ArrowDtype): series = _arrow_to_pandas_arrowdtype(column, dtype) - elif isinstance(dtype, db_dtypes.JSONDtype): - series = db_dtypes.JSONArray(column) else: series = column.to_pandas(types_mapper=lambda _: dtype) diff --git a/setup.py b/setup.py index 9ea563b3cb..34e013c9a3 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ "ipywidgets >=7.7.1", "humanize >=4.6.0", "matplotlib >=3.7.1", - "db-dtypes >=1.4.0", + "db-dtypes >=1.4.2", # For vendored ibis-framework. "atpublic>=2.3,<6", "parsy>=2,<3", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index b355e0915b..8c7c69efa7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -25,7 +25,7 @@ tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 matplotlib==3.7.1 -db-dtypes==1.4.0 +db-dtypes==1.4.2 # For vendored ibis-framework. atpublic==2.3 parsy==2.0 diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 492c0cf9b6..bade725733 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -12,30 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import db_dtypes # type: ignore import geopandas as gpd # type: ignore import pandas as pd import pyarrow as pa import pytest import bigframes.bigquery as bbq -import bigframes.dtypes +import bigframes.dtypes as dtypes import bigframes.pandas as bpd @pytest.mark.parametrize( ("json_path", "expected_json"), [ - pytest.param("$.a", [{"a": 10}], id="simple"), - pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"), + pytest.param("$.a", ['{"a": 10}'], id="simple"), + pytest.param("$.a.b.c", ['{"a": {"b": {"c": 10, "d": []}}}'], id="nested"), ], ) def test_json_set_at_json_path(json_path, expected_json): - original_json = [{"a": {"b": {"c": "tester", "d": []}}}] - s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) + original_json = ['{"a": {"b": {"c": "tester", "d": []}}}'] + s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) - expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) + expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -45,18 +44,20 @@ def test_json_set_at_json_path(json_path, expected_json): @pytest.mark.parametrize( ("json_value", "expected_json"), [ - pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"), - pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"), - pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"), - pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"), + pytest.param(10, ['{"a": {"b": 10}}', '{"a": {"b": 10}}'], id="int"), + pytest.param(0.333, ['{"a": {"b": 0.333}}', '{"a": {"b": 0.333}}'], id="float"), + pytest.param( + "eng", ['{"a": {"b": "eng"}}', '{"a": {"b": "eng"}}'], id="string" + ), + pytest.param([1, 2], ['{"a": {"b": 1}}', '{"a": {"b": 2}}'], id="series"), ], ) def test_json_set_at_json_value_type(json_value, expected_json): - original_json = [{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}] - s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) + original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}'] + s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) - expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) + expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -64,14 +65,14 @@ def test_json_set_at_json_value_type(json_value, expected_json): def test_json_set_w_more_pairs(): - original_json = [{"a": 2}, {"b": 5}, {"c": 1}] - s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) + original_json = ['{"a": 2}', '{"b": 5}', '{"c": 1}'] + s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set( s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])] ) - expected_json = [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}] - expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) + expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}'] + expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -79,13 +80,13 @@ def test_json_set_w_more_pairs(): def test_json_set_w_invalid_json_path_value_pairs(): - s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype()) + s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) with 
pytest.raises(ValueError): bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore def test_json_set_w_invalid_value_type(): - s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype()) + s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) with pytest.raises(TypeError): bbq.json_set( s, @@ -101,17 +102,18 @@ def test_json_set_w_invalid_value_type(): def test_json_set_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)]) + bbq.json_set(s, json_path_value_pairs=[("$.a", 1)]) def test_json_extract_from_json(): s = bpd.Series( - [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}], - dtype=db_dtypes.JSONDtype(), + ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], + dtype=dtypes.JSON_DTYPE, ) actual = bbq.json_extract(s, "$.a.b").to_pandas() - expected = bpd.Series([[1, 2], None, 0], dtype=db_dtypes.JSONDtype()).to_pandas() + expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas() pd.testing.assert_series_equal( actual, expected, @@ -132,14 +134,15 @@ def test_json_extract_from_string(): def test_json_extract_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_extract(bpd.Series([1, 2]), "$.a") + bbq.json_extract(s, "$.a") def test_json_extract_array_from_json(): s = bpd.Series( - [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}], - dtype=db_dtypes.JSONDtype(), + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], + dtype=dtypes.JSON_DTYPE, ) actual = bbq.json_extract_array(s, "$.a") @@ -225,7 +228,7 @@ def test_json_extract_string_array_from_array_strings(): def test_json_extract_string_array_as_float_array_from_array_strings(): s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"]) - actual = bbq.json_extract_string_array(s, value_dtype=bigframes.dtypes.FLOAT_DTYPE) + actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE) expected = bpd.Series([[1, 2.5, 3], [], [4, 5]]) pd.testing.assert_series_equal( actual.to_pandas(), diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index f80b811217..9415f9657e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -30,6 +30,7 @@ import bigframes._config.display_options as display_options import bigframes.core.indexes as bf_indexes import bigframes.dataframe as dataframe +import bigframes.dtypes as dtypes import bigframes.pandas as bpd import bigframes.series as series from tests.system.utils import ( @@ -4584,7 +4585,17 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub ) def test_df_drop_duplicates_w_json(json_df, keep): bf_df = json_df.drop_duplicates(keep=keep).to_pandas() - pd_df = json_df.to_pandas().drop_duplicates(keep=keep) + + # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible + # with Arrow string extension types. Temporary conversion to standard Pandas + # strings is required. 
+ json_pandas_df = json_df.to_pandas() + json_pandas_df["json_col"] = json_pandas_df["json_col"].astype( + pd.StringDtype(storage="pyarrow") + ) + + pd_df = json_pandas_df.drop_duplicates(keep=keep) + pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE) pd.testing.assert_frame_equal( pd_df, bf_df, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 4758c2d5b4..e80668939a 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math from typing import Tuple import db_dtypes # type:ignore @@ -22,6 +21,7 @@ import pyarrow as pa import pytest +import bigframes.dtypes as dtypes from tests.system import utils try: @@ -35,7 +35,6 @@ from google.cloud import bigquery import bigframes -from bigframes import dtypes import bigframes.dataframe import bigframes.features import bigframes.pandas as bpd @@ -278,7 +277,7 @@ def test_to_arrow_override_global_option(scalars_df_index): assert scalars_df_index._query_job.destination.table_id == table_id -def test_load_json_w_unboxed_py_value(session): +def test_load_json_w_json_string_items(session): sql = """ SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, UNION ALL @@ -292,42 +291,43 @@ def test_load_json_w_unboxed_py_value(session): UNION ALL SELECT 5, JSON_OBJECT('null', null), UNION ALL + SELECT 6, JSON_OBJECT('b', 2, 'a', 1), + UNION ALL SELECT - 6, + 7, JSON_OBJECT( 'dict', JSON_OBJECT( 'int', 1, - 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] ) ), """ df = session.read_gbq(sql, index_col="id") - assert df.dtypes["json_col"] == db_dtypes.JSONDtype() - assert isinstance(df["json_col"][0], dict) + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + assert df["json_col"][0] == '{"boolean":true}' + assert df["json_col"][1] == '{"int":100}' + assert df["json_col"][2] == '{"float":0.98}' + assert df["json_col"][3] == '{"string":"hello world"}' + assert df["json_col"][4] == '{"array":[8,9,10]}' + assert df["json_col"][5] == '{"null":null}' - assert df["json_col"][0]["boolean"] - assert df["json_col"][1]["int"] == 100 - assert math.isclose(df["json_col"][2]["float"], 0.98) - assert df["json_col"][3]["string"] == "hello world" - assert df["json_col"][4]["array"] == [8, 9, 10] - assert df["json_col"][5]["null"] is None - assert df["json_col"][6]["dict"] == { - "int": 1, - "array": [{"bar": "hello"}, {"foo": 1}], - } + # Verifies JSON strings preserve array order, regardless of dictionary key order. + assert df["json_col"][6] == '{"a":1,"b":2}' + assert df["json_col"][7] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' def test_load_json_to_pandas_has_correct_result(session): df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") - assert df.dtypes["json_col"] == db_dtypes.JSONDtype() + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) result = df.to_pandas() - # The order of keys within the JSON object shouldn't matter for equality checks. 
+ # These JSON strings are compatible with BigQuery's JSON storage, pd_df = pd.DataFrame( - {"json_col": [{"bar": True, "foo": 10}]}, - dtype=db_dtypes.JSONDtype(), + {"json_col": ['{"bar":true,"foo":10}']}, + dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), ) pd_df.index = pd_df.index.astype("Int64") pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) @@ -355,7 +355,7 @@ def test_load_json_in_struct(session): 'dict', JSON_OBJECT( 'int', 1, - 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] ) ), 7), """ @@ -365,18 +365,15 @@ def test_load_json_in_struct(session): assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) data = df["struct_col"].struct.field("data") - assert data.dtype == db_dtypes.JSONDtype() + assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - assert data[0]["boolean"] - assert data[1]["int"] == 100 - assert math.isclose(data[2]["float"], 0.98) - assert data[3]["string"] == "hello world" - assert data[4]["array"] == [8, 9, 10] - assert data[5]["null"] is None - assert data[6]["dict"] == { - "int": 1, - "array": [{"bar": "hello"}, {"foo": 1}], - } + assert data[0] == '{"boolean":true}' + assert data[1] == '{"int":100}' + assert data[2] == '{"float":0.98}' + assert data[3] == '{"string":"hello world"}' + assert data[4] == '{"array":[8,9,10]}' + assert data[5] == '{"null":null}' + assert data[6] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' def test_load_json_in_array(session): @@ -406,18 +403,15 @@ def test_load_json_in_array(session): data = df["array_col"].list assert data.len()[0] == 7 - assert data[0].dtype == db_dtypes.JSONDtype() + assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - assert data[0][0]["boolean"] - assert data[1][0]["int"] == 100 - assert math.isclose(data[2][0]["float"], 0.98) - assert data[3][0]["string"] == "hello world" - assert data[4][0]["array"] == [8, 9, 10] - assert data[5][0]["null"] is None - assert data[6][0]["dict"] == { - "int": 1, - "array": [{"bar": "hello"}, {"foo": 1}], - } + assert data[0][0] == '{"boolean":true}' + assert data[1][0] == '{"int":100}' + assert data[2][0] == '{"float":0.98}' + assert data[3][0] == '{"string":"hello world"}' + assert data[4][0] == '{"array":[8,9,10]}' + assert data[5][0] == '{"null":null}' + assert data[6][0] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): @@ -691,7 +685,8 @@ def test_to_gbq_w_json(bigquery_client): """Test the `to_gbq` API can get a JSON column.""" s1 = bpd.Series([1, 2, 3, 4]) s2 = bpd.Series( - ["a", 1, False, ["a", {"b": 1}], {"c": [1, 2, 3]}], dtype=db_dtypes.JSONDtype() + ['"a"', "1", "false", '["a", {"b": 1}]', '{"c": [1, 2, 3]}'], + dtype=dtypes.JSON_DTYPE, ) df = bpd.DataFrame({"id": s1, "json_col": s2}) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 980f2226b7..d62af962fc 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -26,6 +26,7 @@ import pytest import shapely # type: ignore +import bigframes.dtypes as dtypes import bigframes.features import bigframes.pandas import bigframes.series as series @@ -304,22 +305,21 @@ def test_series_construct_w_dtype_for_array_struct(): def test_series_construct_w_dtype_for_json(): data = [ - 1, - "str", - False, - ["a", {"b": 1}, None], + "1", + '"str"', + "false", + '["a", {"b": 1}, null]', None, - {"a": {"b": [1, 2, 3], "c": True}}, + '{"a": 
{"b": [1, 2, 3], "c": true}}', ] - s = bigframes.pandas.Series(data, dtype=db_dtypes.JSONDtype()) + s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) - assert s[0] == 1 - assert s[1] == "str" - assert s[2] is False - assert s[3][0] == "a" - assert s[3][1]["b"] == 1 + assert s[0] == "1" + assert s[1] == '"str"' + assert s[2] == "false" + assert s[3] == '["a",{"b":1},null]' assert pd.isna(s[4]) - assert s[5]["a"] == {"b": [1, 2, 3], "c": True} + assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' def test_series_keys(scalars_dfs): @@ -383,7 +383,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] series_pandas = series.to_pandas() - assert series.dtype == db_dtypes.JSONDtype() + assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert series_pandas.shape[0] == json_pandas_df.shape[0] diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 0c8da52774..4b7495694b 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -22,7 +22,6 @@ import warnings import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq -import db_dtypes # type: ignore import google import google.cloud.bigquery as bigquery import numpy as np @@ -759,13 +758,13 @@ def test_read_pandas_timedelta_index(session, write_engine): ) def test_read_pandas_json_dataframes(session, write_engine): json_data = [ - 1, + "1", None, - ["1", "3", "5"], - {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] expected_df = pd.DataFrame( - {"my_col": pd.Series(json_data, dtype=db_dtypes.JSONDtype())} + {"my_col": pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE)} ) actual_result = session.read_pandas( @@ -783,12 +782,12 @@ def test_read_pandas_json_dataframes(session, write_engine): ) def test_read_pandas_json_series(session, write_engine): json_data = [ - 1, + "1", None, - ["1", "3", "5"], - {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] - expected_series = pd.Series(json_data, dtype=db_dtypes.JSONDtype()) + expected_series = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) actual_result = session.read_pandas( expected_series, write_engine=write_engine @@ -807,12 +806,12 @@ def test_read_pandas_json_series(session, write_engine): ) def test_read_pandas_json_index(session, write_engine): json_data = [ - 1, + "1", None, - ["1", "3", "5"], - {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] - expected_index = pd.Index(json_data, dtype=db_dtypes.JSONDtype()) + expected_index: pd.Index = pd.Index(json_data, dtype=bigframes.dtypes.JSON_DTYPE) actual_result = session.read_pandas( expected_index, write_engine=write_engine ).to_pandas() From 0db248b5597a3966ac3dee1cca849509e48f4648 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 11 Mar 2025 14:24:34 -0700 Subject: [PATCH 18/19] fix: use `==` instead of `is` for timedelta type equality checks (#1480) * fix: use instead of for timedelta type equality checks * use int column for casting --- bigframes/core/rewrite/timedeltas.py | 30 +++++++++---------- bigframes/operations/aggregations.py | 8 ++--- bigframes/operations/numeric_ops.py | 22 +++++++------- bigframes/operations/timedelta_ops.py | 8 ++--- .../small/operations/test_timedeltas.py | 29 ++++++++++++------ 5 files changed, 
54 insertions(+), 43 deletions(-) diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index bf3c0ee639..ea8e608a84 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -111,7 +111,7 @@ def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _Ty def _rewrite_scalar_constant_expr(expr: ex.ScalarConstantExpression) -> _TypedExpr: - if expr.dtype is dtypes.TIMEDELTA_DTYPE: + if expr.dtype == dtypes.TIMEDELTA_DTYPE: int_repr = utils.timedelta_to_micros(expr.value) # type: ignore return _TypedExpr(ex.const(int_repr, expr.dtype), expr.dtype) @@ -148,31 +148,31 @@ def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: if dtypes.is_datetime_like(left.dtype) and dtypes.is_datetime_like(right.dtype): return _TypedExpr.create_op_expr(ops.timestamp_diff_op, left, right) - if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + if dtypes.is_datetime_like(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE: return _TypedExpr.create_op_expr(ops.timestamp_sub_op, left, right) if left.dtype == dtypes.DATE_DTYPE and right.dtype == dtypes.DATE_DTYPE: return _TypedExpr.create_op_expr(ops.date_diff_op, left, right) - if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE: + if left.dtype == dtypes.DATE_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE: return _TypedExpr.create_op_expr(ops.date_sub_op, left, right) return _TypedExpr.create_op_expr(ops.sub_op, left, right) def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: - if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + if dtypes.is_datetime_like(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE: return _TypedExpr.create_op_expr(ops.timestamp_add_op, left, right) - if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right.dtype): + if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right.dtype): # Re-arrange operands such that timestamp is always on the left and timedelta is # always on the right. return _TypedExpr.create_op_expr(ops.timestamp_add_op, right, left) - if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE: + if left.dtype == dtypes.DATE_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE: return _TypedExpr.create_op_expr(ops.date_add_op, left, right) - if left.dtype is dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.DATE_DTYPE: + if left.dtype == dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.DATE_DTYPE: # Re-arrange operands such that date is always on the left and timedelta is # always on the right. 
return _TypedExpr.create_op_expr(ops.date_add_op, right, left) @@ -183,9 +183,9 @@ def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.mul_op, left, right) - if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): + if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) - if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + if dtypes.is_numeric(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE: return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -194,7 +194,7 @@ def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.div_op, left, right) - if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): + if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -203,14 +203,14 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) - if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): + if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): - if arg.dtype is dtypes.TIMEDELTA_DTYPE: + if arg.dtype == dtypes.TIMEDELTA_DTYPE: # Do nothing for values that are already timedeltas return arg @@ -239,19 +239,19 @@ def _rewrite_aggregation( aggs.DateSeriesDiffOp(aggregation.op.periods), aggregation.arg ) - if isinstance(aggregation.op, aggs.StdOp) and input_type is dtypes.TIMEDELTA_DTYPE: + if isinstance(aggregation.op, aggs.StdOp) and input_type == dtypes.TIMEDELTA_DTYPE: return ex.UnaryAggregation( aggs.StdOp(should_floor_result=True), aggregation.arg ) - if isinstance(aggregation.op, aggs.MeanOp) and input_type is dtypes.TIMEDELTA_DTYPE: + if isinstance(aggregation.op, aggs.MeanOp) and input_type == dtypes.TIMEDELTA_DTYPE: return ex.UnaryAggregation( aggs.MeanOp(should_floor_result=True), aggregation.arg ) if ( isinstance(aggregation.op, aggs.QuantileOp) - and input_type is dtypes.TIMEDELTA_DTYPE + and input_type == dtypes.TIMEDELTA_DTYPE ): return ex.UnaryAggregation( aggs.QuantileOp(q=aggregation.op.q, should_floor_result=True), diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index a714f5804c..0ae4516dfd 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -142,7 +142,7 @@ class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if input_types[0] is dtypes.TIMEDELTA_DTYPE: + if input_types[0] == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE if dtypes.is_numeric(input_types[0]): @@ -185,7 +185,7 @@ def order_independent(self) -> bool: return True def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if input_types[0] is dtypes.TIMEDELTA_DTYPE: + if input_types[0] == dtypes.TIMEDELTA_DTYPE: return 
dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -233,7 +233,7 @@ class MeanOp(UnaryAggregateOp): should_floor_result: bool = False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if input_types[0] is dtypes.TIMEDELTA_DTYPE: + if input_types[0] == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -275,7 +275,7 @@ class StdOp(UnaryAggregateOp): should_floor_result: bool = False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if input_types[0] is dtypes.TIMEDELTA_DTYPE: + if input_types[0] == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE return signatures.FixedOutputType( diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index ae23aff707..d06d6eb336 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -124,9 +124,9 @@ def output_type(self, *input_types): return input_types[0] # Temporal addition. - if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + if dtypes.is_datetime_like(left_type) and right_type == dtypes.TIMEDELTA_DTYPE: return left_type - if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): + if left_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): return right_type if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: @@ -135,7 +135,7 @@ def output_type(self, *input_types): if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.DATE_DTYPE: return dtypes.DATETIME_DTYPE - if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE if (left_type is None or dtypes.is_numeric(left_type)) and ( @@ -164,13 +164,13 @@ def output_type(self, *input_types): if left_type == dtypes.DATE_DTYPE and right_type == dtypes.DATE_DTYPE: return dtypes.TIMEDELTA_DTYPE - if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + if dtypes.is_datetime_like(left_type) and right_type == dtypes.TIMEDELTA_DTYPE: return left_type if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.DATETIME_DTYPE - if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE if (left_type is None or dtypes.is_numeric(left_type)) and ( @@ -193,9 +193,9 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT left_type = input_types[0] right_type = input_types[1] - if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + if left_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): return dtypes.TIMEDELTA_DTYPE - if dtypes.is_numeric(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + if dtypes.is_numeric(left_type) and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE if (left_type is None or dtypes.is_numeric(left_type)) and ( @@ -217,10 +217,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT left_type = input_types[0] right_type = input_types[1] - if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + if left_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): return 
dtypes.TIMEDELTA_DTYPE - if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.FLOAT_DTYPE if (left_type is None or dtypes.is_numeric(left_type)) and ( @@ -244,10 +244,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT left_type = input_types[0] right_type = input_types[1] - if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + if left_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): return dtypes.TIMEDELTA_DTYPE - if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.INT_DTYPE if (left_type is None or dtypes.is_numeric(left_type)) and ( diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index b831e3f864..5e9a1189e4 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -46,7 +46,7 @@ class TimedeltaFloorOp(base_ops.UnaryOp): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: input_type = input_types[0] - if dtypes.is_numeric(input_type) or input_type is dtypes.TIMEDELTA_DTYPE: + if dtypes.is_numeric(input_type) or input_type == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE raise TypeError(f"unsupported type: {input_type}") @@ -62,11 +62,11 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT # timestamp + timedelta => timestamp if ( dtypes.is_datetime_like(input_types[0]) - and input_types[1] is dtypes.TIMEDELTA_DTYPE + and input_types[1] == dtypes.TIMEDELTA_DTYPE ): return input_types[0] # timedelta + timestamp => timestamp - if input_types[0] is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like( + if input_types[0] == dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like( input_types[1] ): return input_types[1] @@ -87,7 +87,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT # timestamp - timedelta => timestamp if ( dtypes.is_datetime_like(input_types[0]) - and input_types[1] is dtypes.TIMEDELTA_DTYPE + and input_types[1] == dtypes.TIMEDELTA_DTYPE ): return input_types[0] diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 53cb5f7419..0cf394e454 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -58,7 +58,8 @@ def temporal_dfs(session): pd.Timedelta(-4, "m"), pd.Timedelta(6, "h"), ], - "numeric_col": [1.5, 2, -3], + "float_col": [1.5, 2, -3], + "int_col": [1, 2, -3], } ) @@ -92,10 +93,10 @@ def _assert_series_equal(actual: pd.Series, expected: pd.Series): (operator.sub, "timedelta_col_1", "timedelta_col_2"), (operator.truediv, "timedelta_col_1", "timedelta_col_2"), (operator.floordiv, "timedelta_col_1", "timedelta_col_2"), - (operator.truediv, "timedelta_col_1", "numeric_col"), - (operator.floordiv, "timedelta_col_1", "numeric_col"), - (operator.mul, "timedelta_col_1", "numeric_col"), - (operator.mul, "numeric_col", "timedelta_col_1"), + (operator.truediv, "timedelta_col_1", "float_col"), + (operator.floordiv, "timedelta_col_1", "float_col"), + (operator.mul, "timedelta_col_1", "float_col"), + (operator.mul, "float_col", "timedelta_col_1"), ], ) def test_timedelta_binary_ops_between_series(temporal_dfs, op, col_1, col_2): @@ -117,7 +118,7 @@ def 
test_timedelta_binary_ops_between_series(temporal_dfs, op, col_1, col_2): (operator.truediv, "timedelta_col_1", 3), (operator.floordiv, "timedelta_col_1", 3), (operator.mul, "timedelta_col_1", 3), - (operator.mul, "numeric_col", pd.Timedelta(1, "s")), + (operator.mul, "float_col", pd.Timedelta(1, "s")), ], ) def test_timedelta_binary_ops_series_and_literal(temporal_dfs, op, col, literal): @@ -136,10 +137,10 @@ def test_timedelta_binary_ops_series_and_literal(temporal_dfs, op, col, literal) (operator.sub, "timedelta_col_1", pd.Timedelta(2, "s")), (operator.truediv, "timedelta_col_1", pd.Timedelta(2, "s")), (operator.floordiv, "timedelta_col_1", pd.Timedelta(2, "s")), - (operator.truediv, "numeric_col", pd.Timedelta(2, "s")), - (operator.floordiv, "numeric_col", pd.Timedelta(2, "s")), + (operator.truediv, "float_col", pd.Timedelta(2, "s")), + (operator.floordiv, "float_col", pd.Timedelta(2, "s")), (operator.mul, "timedelta_col_1", 3), - (operator.mul, "numeric_col", pd.Timedelta(1, "s")), + (operator.mul, "float_col", pd.Timedelta(1, "s")), ], ) def test_timedelta_binary_ops_literal_and_series(temporal_dfs, op, col, literal): @@ -181,6 +182,16 @@ def test_timestamp_add__ts_series_plus_td_series(temporal_dfs, column, pd_dtype) ) +@pytest.mark.parametrize("column", ["datetime_col", "timestamp_col"]) +def test_timestamp_add__ts_series_plus_td_series__explicit_cast(temporal_dfs, column): + bf_df, _ = temporal_dfs + dtype = pd.ArrowDtype(pa.duration("us")) + + actual_result = bf_df[column] + bf_df["int_col"].astype(dtype) + + assert len(actual_result) > 0 + + @pytest.mark.parametrize( "literal", [ From 5273d36343ccab30ec7fbefdd88121ef8c986df7 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 16:14:44 -0700 Subject: [PATCH 19/19] chore(main): release 1.40.0 (#1466) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 24 +++++++++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9617d97c58..78ecfa53d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,30 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.40.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.39.0...v1.40.0) (2025-03-11) + + +### ⚠ BREAKING CHANGES + +* reading JSON data as a custom arrow extension type ([#1458](https://github.com/googleapis/python-bigquery-dataframes/issues/1458)) + +### Features + +* Reading JSON data as a custom arrow extension type ([#1458](https://github.com/googleapis/python-bigquery-dataframes/issues/1458)) ([e720f41](https://github.com/googleapis/python-bigquery-dataframes/commit/e720f41ef643ac14ae94fa98de5ef4a3fd6dde93)) +* Support list output for managed function ([#1457](https://github.com/googleapis/python-bigquery-dataframes/issues/1457)) ([461e9e0](https://github.com/googleapis/python-bigquery-dataframes/commit/461e9e017d513376fc623a5ee47f8b9dd002b452)) + + +### Bug Fixes + +* Fix list-like indexers in partial ordering mode ([#1456](https://github.com/googleapis/python-bigquery-dataframes/issues/1456)) ([fe72ada](https://github.com/googleapis/python-bigquery-dataframes/commit/fe72ada9cebb32947560c97567d7937c8b618f0d)) +* Fix the merge issue between 1424 and 1373 ([#1461](https://github.com/googleapis/python-bigquery-dataframes/issues/1461)) 
([7b6e361](https://github.com/googleapis/python-bigquery-dataframes/commit/7b6e3615f8d4531beb4b59ca1223927112e713da)) +* Use `==` instead of `is` for timedelta type equality checks ([#1480](https://github.com/googleapis/python-bigquery-dataframes/issues/1480)) ([0db248b](https://github.com/googleapis/python-bigquery-dataframes/commit/0db248b5597a3966ac3dee1cca849509e48f4648)) + + +### Performance Improvements + +* Compilation no longer bounded by recursion ([#1464](https://github.com/googleapis/python-bigquery-dataframes/issues/1464)) ([27ab028](https://github.com/googleapis/python-bigquery-dataframes/commit/27ab028cdc45296923b12446c77b344af4208a3a)) + ## [1.39.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.38.0...v1.39.0) (2025-03-05) diff --git a/bigframes/version.py b/bigframes/version.py index f743c7e94d..e4062aa0c6 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.39.0" +__version__ = "1.40.0" # {x-release-please-start-date} -__release_date__ = "2025-03-05" +__release_date__ = "2025-03-11" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index f743c7e94d..e4062aa0c6 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.39.0" +__version__ = "1.40.0" # {x-release-please-start-date} -__release_date__ = "2025-03-05" +__release_date__ = "2025-03-11" # {x-release-please-end}
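
A minimal usage sketch of the JSON dtype change introduced in the "reading JSON data as a custom arrow extension type" patch and released in 1.40.0: JSON columns are now typed as pandas ArrowDtype(db_dtypes.JSONArrowType()) (exposed as bigframes.dtypes.JSON_DTYPE), and values round-trip as JSON strings rather than unboxed Python objects. This is an illustrative sketch, not part of the patches above; it assumes a configured BigQuery session, bigframes >= 1.40.0, and db-dtypes >= 1.4.2, and the query and column name are examples only.

    import db_dtypes  # type: ignore
    import pandas as pd

    import bigframes.dtypes as dtypes
    import bigframes.pandas as bpd

    # JSON columns now surface as the Arrow extension dtype instead of db_dtypes.JSONDtype().
    df = bpd.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
    assert df.dtypes["json_col"] == dtypes.JSON_DTYPE  # same dtype, exposed as a constant

    # Values come back as JSON strings rather than dicts/lists/scalars.
    assert df.to_pandas()["json_col"][0] == '{"bar":true,"foo":10}'

    # Local construction also takes JSON strings.
    s = bpd.Series(['{"a": 10}', None], dtype=dtypes.JSON_DTYPE)

Code that previously matched on db_dtypes.JSONDtype() or indexed into decoded dict/list values needs the same kind of update shown in the test changes above (string comparison against the serialized JSON).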