diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 7da0881bbe..124e4b8b48 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -57,8 +57,7 @@ git config --global --add safe.directory "${PROJECT_ROOT}" # Workaround for older pip not able to resolve dependencies. See internal # issue 316909553. -python3.10 -m pip install pip==23.3.2 -python3.10 -m pip install --require-hashes -r .kokoro/requirements.txt +python3.10 -m pip install pip==25.0.1 # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 diff --git a/CHANGELOG.md b/CHANGELOG.md index bebe139c72..bd0eb732c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,51 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.0.0.dev0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.42.0...v2.0.0.dev0) (2025-03-31) + + +### ⚠ BREAKING CHANGES + +* Locational endpoints support is not available in BigFrames 2.0. +* change default LLM model to gemini-2.0-flash-001, drop PaLM2TextGenerator and PaLM2TextEmbeddingGenerator ([#1558](https://github.com/googleapis/python-bigquery-dataframes/issues/1558)) +* change default ingress setting for `remote_function` to internal-only ([#1544](https://github.com/googleapis/python-bigquery-dataframes/issues/1544)) +* make `remote_function` params keyword only ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) +* make `remote_function` default service account explicit ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) +* set `allow_large_results=False` by default ([#1541](https://github.com/googleapis/python-bigquery-dataframes/issues/1541)) + +### Features + +* Add component to manage temporary tables ([#1559](https://github.com/googleapis/python-bigquery-dataframes/issues/1559)) ([0a4e245](https://github.com/googleapis/python-bigquery-dataframes/commit/0a4e245670e678f4ead0aec8f8b534e7fe97d112)) +* Allow `input_types`, `output_type`, and `dataset` to be used positionally in `remote_function` ([#1560](https://github.com/googleapis/python-bigquery-dataframes/issues/1560)) ([bcac8c6](https://github.com/googleapis/python-bigquery-dataframes/commit/bcac8c6ed0b40902d0ccaef3f907e6acbe6a52ed)) +* Allow pandas.cut 'labels' parameter to accept a list of string ([#1549](https://github.com/googleapis/python-bigquery-dataframes/issues/1549)) ([af842b1](https://github.com/googleapis/python-bigquery-dataframes/commit/af842b174de7eef4908b397d6a745caf8eda7b3d)) +* Change default ingress setting for `remote_function` to internal-only ([#1544](https://github.com/googleapis/python-bigquery-dataframes/issues/1544)) ([c848a80](https://github.com/googleapis/python-bigquery-dataframes/commit/c848a80766ff68ea92c05a5dc5c26508e6755381)) +* Drop support for locational endpoints ([#1542](https://github.com/googleapis/python-bigquery-dataframes/issues/1542)) ([4bf2e43](https://github.com/googleapis/python-bigquery-dataframes/commit/4bf2e43ef4498b11f32086231fc4cc749fde966a)) +* Make `remote_function` default service account explicit ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) ([9eb9089](https://github.com/googleapis/python-bigquery-dataframes/commit/9eb9089ce3f1dad39761ba8ebc2d6f76261bd243)) +* Set `allow_large_results=False` by default ([#1541](https://github.com/googleapis/python-bigquery-dataframes/issues/1541)) ([e9fb712](https://github.com/googleapis/python-bigquery-dataframes/commit/e9fb7129a05e8ac7c938ffe30e86902950316f20)) +* Support bigquery 
connection in managed function ([#1554](https://github.com/googleapis/python-bigquery-dataframes/issues/1554)) ([f6f697a](https://github.com/googleapis/python-bigquery-dataframes/commit/f6f697afc167e0fa7ea923c0aed85a9ef257d61f))
+* Support bq connection path format ([#1550](https://github.com/googleapis/python-bigquery-dataframes/issues/1550)) ([e7eb918](https://github.com/googleapis/python-bigquery-dataframes/commit/e7eb918dd9df3569febe695f57c1a5909844fd3c))
+* Support gemini-2.0-X models ([#1558](https://github.com/googleapis/python-bigquery-dataframes/issues/1558)) ([3104fab](https://github.com/googleapis/python-bigquery-dataframes/commit/3104fab019d20b0cbc06cd81d43b3f34fd1dd987))
+
+
+### Bug Fixes
+
+* Include role and service account in IAM exception ([#1564](https://github.com/googleapis/python-bigquery-dataframes/issues/1564)) ([8c50755](https://github.com/googleapis/python-bigquery-dataframes/commit/8c507556c5f61fab95c6389a8ad04d731df1df7b))
+* Pandas.cut returns labels index for numeric breaks when labels=False ([#1548](https://github.com/googleapis/python-bigquery-dataframes/issues/1548)) ([b2375de](https://github.com/googleapis/python-bigquery-dataframes/commit/b2375decedbf1a793eedbbc9dc2efc2296f8cc6e))
+* Prevent `KeyError` in `bpd.concat` with empty DF and struct/array types DF ([#1568](https://github.com/googleapis/python-bigquery-dataframes/issues/1568)) ([b4da1cf](https://github.com/googleapis/python-bigquery-dataframes/commit/b4da1cf3c0fb94a2bb21e6039896accab85742d4))
+
+
+### Documentation
+
+* Add message to remove default model for version 3.0 ([#1563](https://github.com/googleapis/python-bigquery-dataframes/issues/1563)) ([910be2b](https://github.com/googleapis/python-bigquery-dataframes/commit/910be2b5b2bfaf0e21cdc4fd775c1605a864c1aa))
+* Add warning for bigframes 2.0 ([#1557](https://github.com/googleapis/python-bigquery-dataframes/issues/1557)) ([3f0eaa1](https://github.com/googleapis/python-bigquery-dataframes/commit/3f0eaa1c6b02d086270421f91dbb6aa2f117317d))
+* Remove gemini-1.5 deprecation warning for `GeminiTextGenerator` ([#1562](https://github.com/googleapis/python-bigquery-dataframes/issues/1562)) ([0cc6784](https://github.com/googleapis/python-bigquery-dataframes/commit/0cc678448fdec1eaa3acfbb563a018325a8c85bc))
+* Use restructured text to allow publishing to PyPI ([#1565](https://github.com/googleapis/python-bigquery-dataframes/issues/1565)) ([d1e9ec2](https://github.com/googleapis/python-bigquery-dataframes/commit/d1e9ec2936d270ec4035014ea3ddd335a5747ade))
+
+
+### Miscellaneous Chores
+
+* Make `remote_function` params keyword only ([#1537](https://github.com/googleapis/python-bigquery-dataframes/issues/1537)) ([9eb9089](https://github.com/googleapis/python-bigquery-dataframes/commit/9eb9089ce3f1dad39761ba8ebc2d6f76261bd243))
+
 ## [1.42.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.41.0...v1.42.0) (2025-03-27)
diff --git a/README.rst b/README.rst
index 185c50c14a..2419029c29 100644
--- a/README.rst
+++ b/README.rst
@@ -12,6 +12,30 @@ powered by the BigQuery engine.
 
 BigQuery DataFrames is an open-source package. You can run
 ``pip install --upgrade bigframes`` to install the latest version.
 
+⚠️ Warning: Breaking Changes in BigQuery DataFrames v2.0
+--------------------------------------------------------
+
+Version 2.0 introduces breaking changes for improved security and performance. Key default behaviors have changed, including:
+
+* **Large Results (>10GB):** The default value for ``allow_large_results`` has changed to ``False``.
+  Methods like ``to_pandas()`` will now fail if the query result's compressed data size exceeds 10GB,
+  unless large results are explicitly permitted.
+* **Remote Function Security:** The library no longer uses the Compute Engine default service account
+  as the identity of the Cloud Run functions it creates. To keep that behavior, opt in explicitly by
+  passing ``cloud_function_service_account="default"``. In addition, network ingress now defaults to
+  ``"internal-only"``.
+* **@remote_function Argument Passing:** Arguments other than ``input_types``, ``output_type``, and ``dataset``
+  to ``remote_function`` must now be passed using keyword syntax, as positional arguments are no longer supported.
+* **Endpoint Connections:** Automatic fallback to locational endpoints in certain regions is removed.
+* **LLM Updates (Gemini Integration):** Integrations now default to the ``gemini-2.0-flash-001`` model.
+  PaLM2 support has been removed; please migrate any existing PaLM2 usage to Gemini. **Note:** The current default
+  model will be removed in Version 3.0.
+
+**Important:** If you are not ready to adapt to these changes, pin your dependency to a version below 2.0
+(e.g., ``bigframes==1.42.0``) to avoid disruption.
+
+To learn about these changes and how to migrate to version 2.0, see the
+`updated introduction guide `_.
+
 .. |GA| image:: https://img.shields.io/badge/support-GA-gold.svg
    :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#general-availability
 .. |pypi| image:: https://img.shields.io/pypi/v/bigframes.svg
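For readers migrating, here is a minimal sketch of opting back into the 1.x large-results behavior under 2.0, either session-wide or per call (the table name is a public dataset used for illustration; the per-call ``allow_large_results`` override is assumed to be available on materializing methods such as ``to_pandas()``)::

    import bigframes.pandas as bpd

    # Session-wide: restore the 1.x default and allow >10GB results.
    bpd.options.bigquery.allow_large_results = True

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    # Per-call: permit large results for just this materialization.
    df.to_pandas(allow_large_results=True)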
diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py
index 84bc4f6d01..41f662c6c2 100644
--- a/bigframes/_config/bigquery_options.py
+++ b/bigframes/_config/bigquery_options.py
@@ -89,7 +89,7 @@ def __init__(
         kms_key_name: Optional[str] = None,
         skip_bq_connection_check: bool = False,
         *,
-        allow_large_results: bool = True,
+        allow_large_results: bool = False,
         ordering_mode: Literal["strict", "partial"] = "strict",
         client_endpoints_override: Optional[dict] = None,
     ):
@@ -258,7 +258,8 @@ def allow_large_results(self, value: bool):
 
     @property
     def use_regional_endpoints(self) -> bool:
-        """Flag to connect to regional API endpoints.
+        """Flag to connect to regional API endpoints for BigQuery API and
+        BigQuery Storage API.
 
         .. note::
             Use of regional endpoints is a feature in Preview and available only
@@ -267,18 +268,16 @@ def use_regional_endpoints(self) -> bool:
             "us-east5", "us-east7", "us-south1", "us-west1", "us-west2",
             "us-west3" and "us-west4".
 
-        .. deprecated:: 0.13.0
-            Use of locational endpoints is available only in selected projects.
-
-        Requires that ``location`` is set. For supported regions, for example
-        ``europe-west3``, you need to specify ``location='europe-west3'`` and
-        ``use_regional_endpoints=True``, and then BigQuery DataFrames would
-        connect to the BigQuery endpoint ``bigquery.europe-west3.rep.googleapis.com``.
-        For not supported regions, for example ``asia-northeast1``, when you
-        specify ``location='asia-northeast1'`` and ``use_regional_endpoints=True``,
-        a different endpoint (called locational endpoint, now deprecated, used
-        to provide weaker promise on the request remaining within the location
-        during transit) ``europe-west3-bigquery.googleapis.com`` would be used.
+        Requires that ``location`` is set. For [supported regions](https://cloud.google.com/bigquery/docs/regional-endpoints),
+        for example ``europe-west3``, you need to specify
+        ``location='europe-west3'`` and ``use_regional_endpoints=True``, and
+        then BigQuery DataFrames would connect to the BigQuery endpoint
+        ``bigquery.europe-west3.rep.googleapis.com``. For unsupported regions,
+        for example ``asia-northeast1``, when you specify
+        ``location='asia-northeast1'`` and ``use_regional_endpoints=True``,
+        the global endpoint ``bigquery.googleapis.com`` would be used, which
+        provides no guarantee that the request remains within the location
+        during transit.
 
         Returns:
             bool:
diff --git a/bigframes/clients.py b/bigframes/clients.py
index 1b8212377d..484169191b 100644
--- a/bigframes/clients.py
+++ b/bigframes/clients.py
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import logging
+import textwrap
 import time
 from typing import cast, Optional
 
@@ -28,21 +29,46 @@
 logger = logging.getLogger(__name__)
 
 
-def resolve_full_bq_connection_name(
-    connection_name: str, default_project: str, default_location: str
+def get_canonical_bq_connection_id(
+    connection_id: str, default_project: str, default_location: str
 ) -> str:
-    """Retrieve the full connection name of the form <project>.<location>.<connection_id>.
-    Use default project, location or connection_id when any of them are missing."""
-    if connection_name.count(".") == 2:
-        return connection_name
-
-    if connection_name.count(".") == 1:
-        return f"{default_project}.{connection_name}"
-
-    if connection_name.count(".") == 0:
-        return f"{default_project}.{default_location}.{connection_name}"
-
-    raise ValueError(f"Invalid connection name format: {connection_name}.")
+    """
+    Retrieve the full connection id of the form
+    <project-id>.<location>.<connection-id>.
+    Use default project, location or connection_id when any of them are missing.
+    """
+
+    if "/" in connection_id:
+        fields = connection_id.split("/")
+        if (
+            len(fields) == 6
+            and fields[0] == "projects"
+            and fields[2] == "locations"
+            and fields[4] == "connections"
+        ):
+            return ".".join((fields[1], fields[3], fields[5]))
+    else:
+        if connection_id.count(".") == 2:
+            return connection_id
+
+        if connection_id.count(".") == 1:
+            return f"{default_project}.{connection_id}"
+
+        if connection_id.count(".") == 0:
+            return f"{default_project}.{default_location}.{connection_id}"
+
+    raise ValueError(
+        textwrap.dedent(
+            f"""
+            Invalid connection id format: {connection_id}.
+            Only the following formats are supported:
+                <project-id>.<location>.<connection-id>,
+                <location>.<connection-id>,
+                <connection-id>,
+                projects/<project-id>/locations/<location>/connections/<connection-id>
+            """
+        ).strip()
+    )
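For reference, a short sketch of how each accepted format canonicalizes (``my-proj`` and ``us`` stand in for the session's default project and location; the connection names are illustrative)::

    from bigframes.clients import get_canonical_bq_connection_id

    # All four accepted formats resolve to "my-proj.us.my-conn":
    get_canonical_bq_connection_id("my-conn", "my-proj", "us")
    get_canonical_bq_connection_id("us.my-conn", "my-proj", "us")
    get_canonical_bq_connection_id("my-proj.us.my-conn", "my-proj", "us")
    get_canonical_bq_connection_id(
        "projects/my-proj/locations/us/connections/my-conn", "my-proj", "us"
    )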
{ex.message}" + raise # Introduce retries to accommodate transient errors like: # (1) Etag mismatch, diff --git a/bigframes/constants.py b/bigframes/constants.py index 8f5ed95e1a..89f27afd78 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -96,22 +96,27 @@ } ) -# https://cloud.google.com/storage/docs/locational-endpoints -LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset( +REP_NOT_ENABLED_BIGQUERY_LOCATIONS = frozenset( ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS ) -LEP_DEPRECATION_WARNING_MESSAGE = textwrap.dedent( +LOCATION_NEEDED_FOR_REP_MESSAGE = textwrap.dedent( """ - Support for regional endpoints is not yet available in the location - {location} for BigQuery and BigQuery Storage APIs. For the supported - locations and APIs see https://cloud.google.com/bigquery/docs/regional-endpoints. - For other locations and APIs, currently an older, now deprecated locational - endpoints are being used, which requires your project to be allowlisted. In - future version 2.0 onwards the locational endpoints will no longer be - supported automatically when you enable regional endpoints. However, if you - still need them, you will be able to override the endpoints directly by - doing the following: + Must set location to use regional endpoints. + You can do it via bigframaes.pandas.options.bigquery.location. + The supported locations can be found at + https://cloud.google.com/bigquery/docs/regional-endpoints#supported-locations. + """ +).strip() + +REP_NOT_SUPPORTED_MESSAGE = textwrap.dedent( + """ + Support for regional endpoints for BigQuery and BigQuery Storage APIs may + not be available in the location {location}. For the supported APIs and + locations see https://cloud.google.com/bigquery/docs/regional-endpoints. + If you have the (deprecated) locational endpoints enabled in your project + (which requires your project to be allowlisted), you can override the + endpoints directly by doing the following: bigframes.pandas.options.bigquery.client_endpoints_override = {{ "bqclient": "https://{location}-bigquery.googleapis.com", "bqconnectionclient": "{location}-bigqueryconnection.googleapis.com", diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index f96471e200..0d31798f25 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -360,69 +360,73 @@ def _( if isinstance(op.bins, int): col_min = _apply_window_if_present(x.min(), window) col_max = _apply_window_if_present(x.max(), window) + adj = (col_max - col_min) * 0.001 bin_width = (col_max - col_min) / op.bins - if op.labels is False: - for this_bin in range(op.bins - 1): - if op.right: - case_expr = x <= (col_min + (this_bin + 1) * bin_width) - else: - case_expr = x < (col_min + (this_bin + 1) * bin_width) - out = out.when( - case_expr, - compile_ibis_types.literal_to_ibis_scalar( - this_bin, force_dtype=pd.Int64Dtype() - ), + for this_bin in range(op.bins): + if op.labels is False: + value = compile_ibis_types.literal_to_ibis_scalar( + this_bin, + force_dtype=pd.Int64Dtype(), ) - out = out.when(x.notnull(), op.bins - 1) - else: - interval_struct = None - adj = (col_max - col_min) * 0.001 - for this_bin in range(op.bins): - left_edge_adj = adj if this_bin == 0 and op.right else 0 - right_edge_adj = adj if this_bin == op.bins - 1 and not op.right else 0 + elif isinstance(op.labels, typing.Iterable): + value = compile_ibis_types.literal_to_ibis_scalar( + list(op.labels)[this_bin], + 
force_dtype=pd.StringDtype(storage="pyarrow"), + ) + else: + left_adj = adj if this_bin == 0 and op.right else 0 + right_adj = adj if this_bin == op.bins - 1 and not op.right else 0 - left_edge = col_min + this_bin * bin_width - left_edge_adj - right_edge = col_min + (this_bin + 1) * bin_width + right_edge_adj + left = col_min + this_bin * bin_width - left_adj + right = col_min + (this_bin + 1) * bin_width + right_adj if op.right: - interval_struct = ibis_types.struct( - { - "left_exclusive": left_edge, - "right_inclusive": right_edge, - } + value = ibis_types.struct( + {"left_exclusive": left, "right_inclusive": right} ) else: - interval_struct = ibis_types.struct( - { - "left_inclusive": left_edge, - "right_exclusive": right_edge, - } + value = ibis_types.struct( + {"left_inclusive": left, "right_exclusive": right} ) - - if this_bin < op.bins - 1: - if op.right: - case_expr = x <= (col_min + (this_bin + 1) * bin_width) - else: - case_expr = x < (col_min + (this_bin + 1) * bin_width) - out = out.when(case_expr, interval_struct) + if this_bin == op.bins - 1: + case_expr = x.notnull() + else: + if op.right: + case_expr = x <= (col_min + (this_bin + 1) * bin_width) else: - out = out.when(x.notnull(), interval_struct) + case_expr = x < (col_min + (this_bin + 1) * bin_width) + out = out.when(case_expr, value) else: # Interpret as intervals - for interval in op.bins: + for this_bin, interval in enumerate(op.bins): left = compile_ibis_types.literal_to_ibis_scalar(interval[0]) right = compile_ibis_types.literal_to_ibis_scalar(interval[1]) if op.right: condition = (x > left) & (x <= right) - interval_struct = ibis_types.struct( - {"left_exclusive": left, "right_inclusive": right} - ) else: condition = (x >= left) & (x < right) - interval_struct = ibis_types.struct( - {"left_inclusive": left, "right_exclusive": right} + + if op.labels is False: + value = compile_ibis_types.literal_to_ibis_scalar( + this_bin, + force_dtype=pd.Int64Dtype(), ) - out = out.when(condition, interval_struct) + elif isinstance(op.labels, typing.Iterable): + value = compile_ibis_types.literal_to_ibis_scalar( + list(op.labels)[this_bin], + force_dtype=pd.StringDtype(storage="pyarrow"), + ) + else: + if op.right: + value = ibis_types.struct( + {"left_exclusive": left, "right_inclusive": right} + ) + else: + value = ibis_types.struct( + {"left_inclusive": left, "right_exclusive": right} + ) + + out = out.when(condition, value) return out.end() diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 54b0a1408a..d5f9b5c5f9 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -388,7 +388,8 @@ def literal_to_ibis_scalar( # Ibis has bug for casting nulltype to geospatial, so we perform intermediate cast first geotype = ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True) return bigframes_vendored.ibis.literal(None, geotype) - ibis_dtype = BIGFRAMES_TO_IBIS[force_dtype] if force_dtype else None + + ibis_dtype = bigframes_dtype_to_ibis_dtype(force_dtype) if force_dtype else None if pd.api.types.is_list_like(literal): if validate: diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index d9a5a87145..86ccf52408 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -20,6 +20,7 @@ import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile import pandas as pd +import bigframes.constants import bigframes.core.expression as ex import bigframes.core.ordering as order import 
bigframes.core.utils as utils
@@ -41,15 +42,37 @@ def cut(
     right: typing.Optional[bool] = True,
     labels: typing.Union[typing.Iterable[str], bool, None] = None,
 ) -> bigframes.series.Series:
-    if labels is not None and labels is not False:
+    if (
+        labels is not None
+        and labels is not False
+        and not isinstance(labels, typing.Iterable)
+    ):
+        raise ValueError(
+            "Bin labels must either be False, None or passed in as a list-like argument"
+        )
+    if (
+        isinstance(labels, typing.Iterable)
+        and len(list(labels)) > 0
+        and not isinstance(list(labels)[0], str)
+    ):
         raise NotImplementedError(
-            "The 'labels' parameter must be either False or None. "
-            "Please provide a valid value for 'labels'."
+            "When using an iterable for labels, only iterables of strings are supported "
+            f"but found {type(list(labels)[0])}. {constants.FEEDBACK_LINK}"
         )

+    if x.size == 0:
+        raise ValueError("Cannot cut empty array.")
+
     if isinstance(bins, int):
         if bins <= 0:
             raise ValueError("`bins` should be a positive integer.")
+        if isinstance(labels, typing.Iterable):
+            labels = tuple(labels)
+            if len(labels) != bins:
+                raise ValueError(
+                    f"Bin labels({len(labels)}) must be same as the value of bins({bins})"
+                )
+
         op = agg_ops.CutOp(bins, right=right, labels=labels)
         return x._apply_window_op(op, window_spec=window_specs.unbound())
     elif isinstance(bins, typing.Iterable):
@@ -58,6 +81,7 @@
             bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
             # To maintain consistency with pandas' behavior
             right = True
+            labels = None
         elif len(list(bins)) == 0:
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple()
@@ -66,6 +90,7 @@
             bins = tuple(bins)
             # To maintain consistency with pandas' behavior
             right = True
+            labels = None
         elif pd.api.types.is_number(list(bins)[0]):
             bins_list = list(bins)
             as_index = pd.IntervalIndex.from_breaks(bins_list)
@@ -81,11 +106,24 @@
             raise ValueError("`bins` iterable should contain tuples or numerics.")

         if as_index.is_overlapping:
-            raise ValueError("Overlapping IntervalIndex is not accepted.")
-        elif len(as_index) == 0:
-            op = agg_ops.CutOp(bins, right=right, labels=labels)
+            raise ValueError("Overlapping IntervalIndex is not accepted.")  # TODO: test
+
+        if isinstance(labels, typing.Iterable):
+            labels = tuple(labels)
+            if len(labels) != len(as_index):
+                raise ValueError(
+                    f"Bin labels({len(labels)}) must be same as the number of bin edges"
+                    f"({len(as_index)})"
+                )
+
+        if len(as_index) == 0:
+            dtype = agg_ops.CutOp(bins, right=right, labels=labels).output_type()
             return bigframes.series.Series(
-                [pd.NA] * len(x), dtype=op.output_type(), name=x.name
+                [pd.NA] * len(x),
+                dtype=dtype,
+                name=x.name,
+                index=x.index,
+                session=x._session,
             )
         else:
             op = agg_ops.CutOp(bins, right=right, labels=labels)
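At the API surface, these checks let ``bigframes.pandas.cut`` accept string labels. A minimal sketch of the new behavior (the series values and label names are illustrative)::

    import bigframes.pandas as bpd

    s = bpd.Series([1, 5, 9])

    # One string label per bin is required; the result is a string column.
    bpd.cut(s, bins=3, labels=["low", "mid", "high"])

    # labels=False still returns the zero-based bin index for each value.
    bpd.cut(s, bins=3, labels=False)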
diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py
index 44aea57898..8a591f6916 100644
--- a/bigframes/functions/_function_client.py
+++ b/bigframes/functions/_function_client.py
@@ -196,6 +196,7 @@ def provision_bq_managed_function(
         name,
         packages,
         is_row_processor,
+        bq_connection_id,
         *,
         capture_references=False,
     ):
@@ -273,12 +274,21 @@ def provision_bq_managed_function(
         udf_code = textwrap.dedent(inspect.getsource(func))
         udf_code = udf_code[udf_code.index("def") :]

+        with_connection_clause = (
+            (
+                f"WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}`"
+            )
+            if bq_connection_id
+            else ""
+        )
+
         create_function_ddl = (
             textwrap.dedent(
                 f"""
                 CREATE OR REPLACE FUNCTION {persistent_func_id}({','.join(bq_function_args)})
                 RETURNS {bq_function_return_type}
                 LANGUAGE python
+                {with_connection_clause}
                 OPTIONS ({managed_function_options_str})
                 AS r'''
                 __UDF_PLACE_HOLDER__
@@ -365,7 +375,7 @@ def create_cloud_function(
         is_row_processor=False,
         vpc_connector=None,
         memory_mib=1024,
-        ingress_settings="all",
+        ingress_settings="internal-only",
     ):
         """Create a cloud function from the given user defined function."""
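Under these new defaults, a 2.0-style declaration that restores the 1.x behavior might look like the following sketch (the function body and types are illustrative; the parameters themselves are introduced in the next hunks)::

    import bigframes.pandas as bpd

    @bpd.remote_function(
        # Now required: a user-managed service account, or "default" to opt
        # back into the Compute Engine default identity.
        cloud_function_service_account="default",
        # Override the new "internal-only" default if public ingress is needed.
        cloud_function_ingress_settings="all",
    )
    def add_one(x: int) -> int:
        return x + 1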
""" # Some defaults may be used from the session if not provided otherwise. session = self._resolve_session(session) - # raise a UserWarning if user does not explicitly set cloud_function_service_account to a - # user-managed cloud_function_service_account of to default - msg = bfe.format_message( - "You have not explicitly set a user-managed `cloud_function_service_account`. " - "Using the default Compute Engine service account. " - "In BigFrames 2.0 onwards, you would have to explicitly set `cloud_function_service_account` " - 'either to a user-managed service account (preferred) or to `"default"` ' - "to use the default Compute Engine service account (discouraged). " - "See, https://cloud.google.com/functions/docs/securing/function-identity." - ) - + # If the user forces the cloud function service argument to None, throw + # an exception if cloud_function_service_account is None: - warnings.warn(msg, stacklevel=2, category=FutureWarning) - - if cloud_function_service_account == "default": - cloud_function_service_account = None + raise ValueError( + 'You must provide a user managed cloud_function_service_account, or "default" if you would like to let the default service account be used.' + ) # A BigQuery client is required to perform BQ operations. bigquery_client = self._resolve_bigquery_client(session, bigquery_client) @@ -516,24 +508,11 @@ def remote_function( ) if cloud_function_ingress_settings is None: - cloud_function_ingress_settings = "all" - msg = bfe.format_message( - "The `cloud_function_ingress_settings` are set to 'all' by default, " - "which will change to 'internal-only' for enhanced security in future version 2.0 onwards. " - "However, you will be able to explicitly pass cloud_function_ingress_settings='all' if you need. " - "See https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings for details." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - if cloud_function_ingress_settings is None: - cloud_function_ingress_settings = "all" + cloud_function_ingress_settings = "internal-only" msg = bfe.format_message( - "The `cloud_function_ingress_settings` are set to 'all' by default, " - "which will change to 'internal-only' for enhanced security in future version 2.0 onwards. " - "However, you will be able to explicitly pass cloud_function_ingress_settings='all' if you need. " - "See https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings for details." + "The `cloud_function_ingress_settings` is being set to 'internal-only' by default." ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) + warnings.warn(msg, category=UserWarning, stacklevel=2) bq_connection_manager = session.bqconnectionmanager @@ -615,7 +594,9 @@ def wrapper(func): bq_connection_manager, cloud_function_region, cloud_functions_client, - cloud_function_service_account, + None + if cloud_function_service_account == "default" + else cloud_function_service_account, cloud_function_kms_key_name, cloud_function_docker_repository, session=session, # type: ignore @@ -826,9 +807,13 @@ def udf( bq_location, _ = _utils.get_remote_function_locations(bigquery_client.location) - # A connection is required for BQ managed function. - bq_connection_id = self._resolve_bigquery_connection_id( - session, dataset_ref, bq_location, bigquery_connection + # A connection is optional for BQ managed function. 
+ bq_connection_id = ( + self._resolve_bigquery_connection_id( + session, dataset_ref, bq_location, bigquery_connection + ) + if bigquery_connection + else None ) bq_connection_manager = session.bqconnectionmanager @@ -926,6 +911,7 @@ def wrapper(func): name=name, packages=packages, is_row_processor=is_row_processor, + bq_connection_id=bq_connection_id, ) # TODO(shobs): Find a better way to support udfs with param named diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 1fd9fbc4a7..03562aa869 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -21,9 +21,8 @@ import bigframes_vendored.constants as constants from google.cloud import bigquery -import typing_extensions -from bigframes import clients, dtypes, exceptions +from bigframes import dtypes, exceptions import bigframes.bigquery as bbq from bigframes.core import blocks, global_session, log_adapter import bigframes.dataframe @@ -41,13 +40,6 @@ _TEXT_GENERATOR_BISON_32K_ENDPOINT, ) -_EMBEDDING_GENERATOR_GECKO_ENDPOINT = "textembedding-gecko" -_EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT = "textembedding-gecko-multilingual" -_PALM2_EMBEDDING_GENERATOR_ENDPOINTS = ( - _EMBEDDING_GENERATOR_GECKO_ENDPOINT, - _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT, -) - _TEXT_EMBEDDING_005_ENDPOINT = "text-embedding-005" _TEXT_EMBEDDING_004_ENDPOINT = "text-embedding-004" _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT = "text-multilingual-embedding-002" @@ -59,7 +51,6 @@ _MULTIMODAL_EMBEDDING_001_ENDPOINT = "multimodalembedding@001" -_GEMINI_PRO_ENDPOINT = "gemini-pro" _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" _GEMINI_1P5_PRO_001_ENDPOINT = "gemini-1.5-pro-001" @@ -67,8 +58,9 @@ _GEMINI_1P5_FLASH_001_ENDPOINT = "gemini-1.5-flash-001" _GEMINI_1P5_FLASH_002_ENDPOINT = "gemini-1.5-flash-002" _GEMINI_2_FLASH_EXP_ENDPOINT = "gemini-2.0-flash-exp" +_GEMINI_2_FLASH_001_ENDPOINT = "gemini-2.0-flash-001" +_GEMINI_2_FLASH_LITE_001_ENDPOINT = "gemini-2.0-flash-lite-001" _GEMINI_ENDPOINTS = ( - _GEMINI_PRO_ENDPOINT, _GEMINI_1P5_PRO_PREVIEW_ENDPOINT, _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT, _GEMINI_1P5_PRO_001_ENDPOINT, @@ -76,6 +68,8 @@ _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, _GEMINI_2_FLASH_EXP_ENDPOINT, + _GEMINI_2_FLASH_001_ENDPOINT, + _GEMINI_2_FLASH_LITE_001_ENDPOINT, ) _GEMINI_PREVIEW_ENDPOINTS = ( _GEMINI_1P5_PRO_PREVIEW_ENDPOINT, @@ -83,7 +77,6 @@ _GEMINI_2_FLASH_EXP_ENDPOINT, ) _GEMINI_FINE_TUNE_SCORE_ENDPOINTS = ( - _GEMINI_PRO_ENDPOINT, _GEMINI_1P5_PRO_002_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, ) @@ -119,515 +112,6 @@ ) -@typing_extensions.deprecated( - "PaLM2TextGenerator is going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. ", - category=exceptions.ApiDeprecationWarning, -) -@log_adapter.class_logger -class PaLM2TextGenerator(base.BaseEstimator): - """PaLM2 text generator LLM model. - - .. note:: - PaLM2TextGenerator is going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. - - Args: - model_name (str, Default to "text-bison"): - The model for natural language tasks. “text-bison” returns model fine-tuned to follow natural language instructions - and is suitable for a variety of language tasks. "text-bison-32k" supports up to 32k tokens per request. - Default to "text-bison". 
- session (bigframes.Session or None): - BQ session to create the model. If None, use the global default session. - connection_name (str or None): - Connection to connect with remote service. str of the format ... - If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach - permission if the connection isn't fully set up. - max_iterations (Optional[int], Default to 300): - The number of steps to run when performing supervised tuning. - """ - - def __init__( - self, - *, - model_name: Literal["text-bison", "text-bison-32k"] = "text-bison", - session: Optional[bigframes.Session] = None, - connection_name: Optional[str] = None, - max_iterations: int = 300, - ): - self.model_name = model_name - self.session = session or global_session.get_global_session() - self.max_iterations = max_iterations - self._bq_connection_manager = self.session.bqconnectionmanager - - connection_name = connection_name or self.session._bq_connection - self.connection_name = clients.resolve_full_bq_connection_name( - connection_name, - default_project=self.session._project, - default_location=self.session._location, - ) - - self._bqml_model_factory = globals.bqml_model_factory() - self._bqml_model: core.BqmlModel = self._create_bqml_model() - - def _create_bqml_model(self): - # Parse and create connection if needed. - if not self.connection_name: - raise ValueError( - "Must provide connection_name, either in constructor or through session options." - ) - - if self._bq_connection_manager: - connection_name_parts = self.connection_name.split(".") - if len(connection_name_parts) != 3: - raise ValueError( - f"connection_name must be of the format .., got {self.connection_name}." - ) - self._bq_connection_manager.create_bq_connection( - project_id=connection_name_parts[0], - location=connection_name_parts[1], - connection_id=connection_name_parts[2], - iam_role="aiplatform.user", - ) - - if self.model_name not in _TEXT_GENERATOR_ENDPOINTS: - msg = exceptions.format_message( - _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_TEXT_GENERATOR_ENDPOINTS), - ) - ) - warnings.warn(msg) - - options = { - "endpoint": self.model_name, - } - - return self._bqml_model_factory.create_remote_model( - session=self.session, connection_name=self.connection_name, options=options - ) - - @classmethod - def _from_bq( - cls, session: bigframes.Session, bq_model: bigquery.Model - ) -> PaLM2TextGenerator: - assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in bq_model._properties - assert "endpoint" in bq_model._properties["remoteModelInfo"] - assert "connection" in bq_model._properties["remoteModelInfo"] - - # Parse the remote model endpoint - bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] - model_connection = bq_model._properties["remoteModelInfo"]["connection"] - model_endpoint = bqml_endpoint.split("/")[-1] - - kwargs = utils.retrieve_params_from_bq_model( - cls, bq_model, _BQML_PARAMS_MAPPING - ) - - model = cls( - **kwargs, - session=session, - model_name=model_endpoint, - connection_name=model_connection, - ) - model._bqml_model = core.BqmlModel(session, bq_model) - return model - - @property - def _bqml_options(self) -> dict: - """The model options as they will be set for BQML""" - options = { - "max_iterations": self.max_iterations, - "data_split_method": "NO_SPLIT", - } - return options - - def fit( - self, - X: utils.ArrayType, - y: utils.ArrayType, - ) -> PaLM2TextGenerator: - 
"""Fine tune PaLM2TextGenerator model. - - .. note:: - - This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the - Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" - and might have limited support. For more information, see the launch stage descriptions - (https://cloud.google.com/products#product-launch-stages). - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Training labels. - - Returns: - PaLM2TextGenerator: Fitted estimator. - """ - X, y = utils.batch_convert_to_dataframe(X, y) - - options = self._bqml_options - options["endpoint"] = self.model_name + "@001" - options["prompt_col"] = X.columns.tolist()[0] - - self._bqml_model = self._bqml_model_factory.create_llm_remote_model( - X, - y, - options=options, - connection_name=self.connection_name, - ) - return self - - def predict( - self, - X: utils.ArrayType, - *, - temperature: float = 0.0, - max_output_tokens: int = 128, - top_k: int = 40, - top_p: float = 0.95, - ) -> bigframes.dataframe.DataFrame: - """Predict the result from input DataFrame. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. - Prompts can include preamble, questions, suggestions, instructions, or examples. - - temperature (float, default 0.0): - The temperature is used for sampling during the response generation, which occurs when topP and topK are applied. - Temperature controls the degree of randomness in token selection. Lower temperatures are good for prompts that expect a true or correct response, - while higher temperatures can lead to more diverse or unexpected results. A temperature of 0 is deterministic: - the highest probability token is always selected. For most use cases, try starting with a temperature of 0.2. - Default 0. Possible values [0.0, 1.0]. - - max_output_tokens (int, default 128): - Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses. - A token may be smaller than a word. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. - Default 128. For the 'text-bison' model, possible values are in the range [1, 1024]. For the 'text-bison-32k' model, possible values are in the range [1, 8192]. - Please ensure that the specified value for max_output_tokens is within the appropriate range for the model being used. - - top_k (int, default 40): - Top-k changes how the model selects tokens for output. A top-k of 1 means the selected token is the most probable among all tokens - in the model's vocabulary (also called greedy decoding), while a top-k of 3 means that the next token is selected from among the 3 most probable tokens (using temperature). - For each token selection step, the top K tokens with the highest probabilities are sampled. Then tokens are further filtered based on topP with the final token selected using temperature sampling. 
- Specify a lower value for less random responses and a higher value for more random responses. - Default 40. Possible values [1, 40]. - - top_p (float, default 0.95):: - Top-p changes how the model selects tokens for output. Tokens are selected from most K (see topK parameter) probable to least until the sum of their probabilities equals the top-p value. - For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-p value is 0.5, then the model will select either A or B as the next token (using temperature) - and not consider C at all. - Specify a lower value for less random responses and a higher value for more random responses. - Default 0.95. Possible values [0.0, 1.0]. - - - Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. - """ - - # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - if temperature < 0.0 or temperature > 1.0: - raise ValueError(f"temperature must be [0.0, 1.0], but is {temperature}.") - - if ( - self.model_name == _TEXT_GENERATOR_BISON_ENDPOINT - and max_output_tokens not in range(1, 1025) - ): - raise ValueError( - f"max_output_token must be [1, 1024] for TextBison model, but is {max_output_tokens}." - ) - - if ( - self.model_name == _TEXT_GENERATOR_BISON_32K_ENDPOINT - and max_output_tokens not in range(1, 8193) - ): - raise ValueError( - f"max_output_token must be [1, 8192] for TextBison 32k model, but is {max_output_tokens}." - ) - - if top_k not in range(1, 41): - raise ValueError(f"top_k must be [1, 40], but is {top_k}.") - - if top_p < 0.0 or top_p > 1.0: - raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - - if len(X.columns) == 1: - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "prompt"}) - - options = { - "temperature": temperature, - "max_output_tokens": max_output_tokens, - "top_k": top_k, - "top_p": top_p, - "flatten_json_output": True, - } - - df = self._bqml_model.generate_text(X, options) - - if (df[_ML_GENERATE_TEXT_STATUS] != "").any(): - msg = exceptions.format_message( - f"Some predictions failed. Check column {_ML_GENERATE_TEXT_STATUS} for " - "detailed status. You may want to filter the failed rows and retry." - ) - warnings.warn(msg, category=RuntimeWarning) - - return df - - def score( - self, - X: utils.ArrayType, - y: utils.ArrayType, - task_type: Literal[ - "text_generation", "classification", "summarization", "question_answering" - ] = "text_generation", - ) -> bigframes.dataframe.DataFrame: - """Calculate evaluation metrics of the model. - - .. note:: - - This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the - Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" - and might have limited support. For more information, see the launch stage descriptions - (https://cloud.google.com/products#product-launch-stages). - - .. note:: - - Output matches that of the BigQuery ML.EVALUATE function. - See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm - for the outputs relevant to this model type. 
- - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - A BigQuery DataFrame as evaluation data, which contains only one column of input_text - that contains the prompt text to use when evaluating the model. - y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - A BigQuery DataFrame as evaluation labels, which contains only one column of output_text - that you would expect to be returned by the model. - task_type (str): - The type of the task for LLM model. Default to "text_generation". - Possible values: "text_generation", "classification", "summarization", and "question_answering". - - Returns: - bigframes.dataframe.DataFrame: The DataFrame as evaluation result. - """ - if not self._bqml_model: - raise RuntimeError("A model must be fitted before score") - - X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session) - - if len(X.columns) != 1 or len(y.columns) != 1: - raise ValueError( - f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}" - ) - - # BQML identified the column by name - X_col_label = cast(blocks.Label, X.columns[0]) - y_col_label = cast(blocks.Label, y.columns[0]) - X = X.rename(columns={X_col_label: "input_text"}) - y = y.rename(columns={y_col_label: "output_text"}) - - input_data = X.join(y, how="outer") - - return self._bqml_model.llm_evaluate(input_data, task_type) - - def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: - """Save the model to BigQuery. - - Args: - model_name (str): - The name of the model. - replace (bool, default False): - Determine whether to replace if the model already exists. Default to False. - - Returns: - PaLM2TextGenerator: Saved model.""" - - new_model = self._bqml_model.copy(model_name, replace) - return new_model.session.read_gbq_model(model_name) - - -@typing_extensions.deprecated( - "PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. ", - category=exceptions.ApiDeprecationWarning, -) -@log_adapter.class_logger -class PaLM2TextEmbeddingGenerator(base.BaseEstimator): - """PaLM2 text embedding generator LLM model. - - .. note:: - PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. - - - Args: - model_name (str, Default to "textembedding-gecko"): - The model for text embedding. “textembedding-gecko” returns model embeddings for text inputs. - "textembedding-gecko-multilingual" returns model embeddings for text inputs which support over 100 languages. - Default to "textembedding-gecko". - version (str or None): - Model version. Accepted values are "001", "002", "003", "latest" etc. Will use the default version if unset. - See https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning for details. - session (bigframes.Session or None): - BQ session to create the model. If None, use the global default session. - connection_name (str or None): - Connection to connect with remote service. str of the format ... - If None, use default connection in session context. 
- """ - - def __init__( - self, - *, - model_name: Literal[ - "textembedding-gecko", "textembedding-gecko-multilingual" - ] = "textembedding-gecko", - version: Optional[str] = None, - session: Optional[bigframes.Session] = None, - connection_name: Optional[str] = None, - ): - self.model_name = model_name - self.version = version - self.session = session or global_session.get_global_session() - self._bq_connection_manager = self.session.bqconnectionmanager - - connection_name = connection_name or self.session._bq_connection - self.connection_name = clients.resolve_full_bq_connection_name( - connection_name, - default_project=self.session._project, - default_location=self.session._location, - ) - - self._bqml_model_factory = globals.bqml_model_factory() - self._bqml_model: core.BqmlModel = self._create_bqml_model() - - def _create_bqml_model(self): - # Parse and create connection if needed. - if not self.connection_name: - raise ValueError( - "Must provide connection_name, either in constructor or through session options." - ) - - if self._bq_connection_manager: - connection_name_parts = self.connection_name.split(".") - if len(connection_name_parts) != 3: - raise ValueError( - f"connection_name must be of the format .., got {self.connection_name}." - ) - self._bq_connection_manager.create_bq_connection( - project_id=connection_name_parts[0], - location=connection_name_parts[1], - connection_id=connection_name_parts[2], - iam_role="aiplatform.user", - ) - - if self.model_name not in _PALM2_EMBEDDING_GENERATOR_ENDPOINTS: - msg = exceptions.format_message( - _MODEL_NOT_SUPPORTED_WARNING.format( - model_name=self.model_name, - known_models=", ".join(_PALM2_EMBEDDING_GENERATOR_ENDPOINTS), - ) - ) - warnings.warn(msg) - - endpoint = ( - self.model_name + "@" + self.version if self.version else self.model_name - ) - options = { - "endpoint": endpoint, - } - return self._bqml_model_factory.create_remote_model( - session=self.session, connection_name=self.connection_name, options=options - ) - - @classmethod - def _from_bq( - cls, session: bigframes.Session, bq_model: bigquery.Model - ) -> PaLM2TextEmbeddingGenerator: - assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in bq_model._properties - assert "endpoint" in bq_model._properties["remoteModelInfo"] - assert "connection" in bq_model._properties["remoteModelInfo"] - - # Parse the remote model endpoint - bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] - model_connection = bq_model._properties["remoteModelInfo"]["connection"] - model_endpoint = bqml_endpoint.split("/")[-1] - - model_name, version = utils.parse_model_endpoint(model_endpoint) - - model = cls( - session=session, - # str to literals - model_name=model_name, # type: ignore - version=version, - connection_name=model_connection, - ) - - model._bqml_model = core.BqmlModel(session, bq_model) - return model - - def predict(self, X: utils.ArrayType) -> bigframes.dataframe.DataFrame: - """Predict the result from input DataFrame. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. - - Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. 
- """ - - # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - - if len(X.columns) == 1: - # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) - X = X.rename(columns={col_label: "content"}) - - options = { - "flatten_json_output": True, - } - - df = self._bqml_model.generate_embedding(X, options) - df = df.rename( - columns={ - "ml_generate_embedding_result": "text_embedding", - "ml_generate_embedding_statistics": "statistics", - "ml_generate_embedding_status": _ML_EMBED_TEXT_STATUS, - } - ) - - if (df[_ML_EMBED_TEXT_STATUS] != "").any(): - msg = exceptions.format_message( - f"Some predictions failed. Check column {_ML_EMBED_TEXT_STATUS} for " - "detailed status. You may want to filter the failed rows and retry." - ) - warnings.warn(msg, category=RuntimeWarning) - - return df - - def to_gbq( - self, model_name: str, replace: bool = False - ) -> PaLM2TextEmbeddingGenerator: - """Save the model to BigQuery. - - Args: - model_name (str): - The name of the model. - replace (bool, default False): - Determine whether to replace if the model already exists. Default to False. - - Returns: - PaLM2TextEmbeddingGenerator: Saved model.""" - - new_model = self._bqml_model.copy(model_name, replace) - return new_model.session.read_gbq_model(model_name) - - @log_adapter.class_logger class TextEmbeddingGenerator(base.RetriableRemotePredictor): """Text embedding generator LLM model. @@ -918,23 +402,23 @@ def to_gbq( return new_model.session.read_gbq_model(model_name) -@typing_extensions.deprecated( - "gemini-pro and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. ", - category=exceptions.ApiDeprecationWarning, -) @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. .. note:: - gemini-pro and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. + gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. Args: - model_name (str, Default to "gemini-pro"): - The model for natural language tasks. Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002" and "gemini-2.0-flash-exp". Default to "gemini-pro". + model_name (str, Default to "gemini-2.0-flash-001"): + The model for natural language tasks. Accepted values are + "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", + "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001", + "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-lite-001", and "gemini-2.0-flash-001". + Default to "gemini-2.0-flash-001". .. note:: - "gemini-pro" is going to be deprecated. Bigframes 2 will transition to using gemini-2.0-X. "gemini-2.0-flash-exp", "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" and might have limited support. 
For more information, see the launch stage descriptions
         (https://cloud.google.com/products#product-launch-stages).
@@ -954,7 +438,6 @@ def __init__(
         self,
         *,
         model_name: Literal[
-            "gemini-pro",
             "gemini-1.5-pro-preview-0514",
             "gemini-1.5-flash-preview-0514",
             "gemini-1.5-pro-001",
@@ -962,7 +445,9 @@
             "gemini-1.5-flash-001",
             "gemini-1.5-flash-002",
             "gemini-2.0-flash-exp",
-        ] = "gemini-pro",
+            "gemini-2.0-flash-001",
+            "gemini-2.0-flash-lite-001",
+        ] = "gemini-2.0-flash-001",
         session: Optional[bigframes.Session] = None,
         connection_name: Optional[str] = None,
         max_iterations: int = 300,
@@ -1052,8 +537,8 @@ def fit(
         self,
         X: utils.ArrayType,
         y: utils.ArrayType,
     ) -> GeminiTextGenerator:
-        """Fine tune GeminiTextGenerator model. Only support "gemini-pro", "gemini-1.5-pro-002",
-        "gemini-1.5-flash-002" models for now.
+        """Fine tune GeminiTextGenerator model. Only supports "gemini-1.5-pro-002"
+        and "gemini-1.5-flash-002" models for now.

         .. note::

@@ -1073,16 +558,13 @@
         """
         if self.model_name not in _GEMINI_FINE_TUNE_SCORE_ENDPOINTS:
             raise NotImplementedError(
-                "fit() only supports gemini-pro, \
-                gemini-1.5-pro-002, or gemini-1.5-flash-002 model."
+                "fit() only supports gemini-1.5-pro-002 or gemini-1.5-flash-002 models."
             )

         X, y = utils.batch_convert_to_dataframe(X, y)

         options = self._bqml_options
-        options["endpoint"] = (
-            "gemini-1.0-pro-002" if self.model_name == "gemini-pro" else self.model_name
-        )
+        options["endpoint"] = self.model_name
         options["prompt_col"] = X.columns.tolist()[0]

         self._bqml_model = self._bqml_model_factory.create_llm_remote_model(
@@ -1231,7 +713,8 @@ def score(
             "text_generation", "classification", "summarization", "question_answering"
         ] = "text_generation",
     ) -> bigframes.dataframe.DataFrame:
-        """Calculate evaluation metrics of the model. Only support "gemini-pro" and "gemini-1.5-pro-002", and "gemini-1.5-flash-002".
+        """Calculate evaluation metrics of the model. Only supports
+        "gemini-1.5-pro-002" and "gemini-1.5-flash-002".

         .. note::

@@ -1265,8 +748,7 @@

         if self.model_name not in _GEMINI_FINE_TUNE_SCORE_ENDPOINTS:
             raise NotImplementedError(
-                "score() only supports gemini-pro \
-                , gemini-1.5-pro-002, and gemini-1.5-flash-2 model."
+                "score() only supports gemini-1.5-pro-002 and gemini-1.5-flash-002 models."
             )

         X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session)
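With the PaLM2 classes removed, Gemini is the remaining text-generation path. A minimal sketch of the new default (the prompt data is illustrative; the BigQuery connection is resolved from the session as usual)::

    import bigframes.pandas as bpd
    from bigframes.ml.llm import GeminiTextGenerator

    # Defaults to the "gemini-2.0-flash-001" endpoint in 2.0.
    model = GeminiTextGenerator()
    df = bpd.DataFrame({"prompt": ["What is BigQuery DataFrames?"]})
    predictions = model.predict(df)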
) X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index eef72584bc..7ee558ad39 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -56,11 +56,6 @@ _BQML_ENDPOINT_TYPE_MAPPING = MappingProxyType( { - llm._TEXT_GENERATOR_BISON_ENDPOINT: llm.PaLM2TextGenerator, - llm._TEXT_GENERATOR_BISON_32K_ENDPOINT: llm.PaLM2TextGenerator, - llm._EMBEDDING_GENERATOR_GECKO_ENDPOINT: llm.PaLM2TextEmbeddingGenerator, - llm._EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT: llm.PaLM2TextEmbeddingGenerator, - llm._GEMINI_PRO_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_001_ENDPOINT: llm.GeminiTextGenerator, @@ -68,6 +63,8 @@ llm._GEMINI_1P5_FLASH_001_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_FLASH_002_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_2_FLASH_EXP_ENDPOINT: llm.GeminiTextGenerator, + llm._GEMINI_2_FLASH_001_ENDPOINT: llm.GeminiTextGenerator, + llm._GEMINI_2_FLASH_LITE_001_ENDPOINT: llm.GeminiTextGenerator, llm._CLAUDE_3_HAIKU_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator, @@ -95,8 +92,6 @@ def from_bq( imported.TensorFlowModel, imported.ONNXModel, imported.XGBoostModel, - llm.PaLM2TextGenerator, - llm.PaLM2TextEmbeddingGenerator, llm.Claude3TextGenerator, llm.TextEmbeddingGenerator, llm.MultimodalEmbeddingGenerator, diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index d25791d3e4..e3f15e67a1 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -340,15 +340,17 @@ class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? bins: typing.Union[int, Iterable] right: Optional[bool] - labels: Optional[bool] + labels: typing.Union[bool, Iterable[str], None] @property def skips_nulls(self): return False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if isinstance(self.bins, int) and (self.labels is False): + if self.labels is False: return dtypes.INT_DTYPE + elif isinstance(self.labels, Iterable): + return dtypes.STRING_DTYPE else: # Assumption: buckets use same numeric type if isinstance(self.bins, int): diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index b4fae68a4f..06d07640ab 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -297,7 +297,7 @@ def _resolve_connection(self, connection: Optional[str] = None) -> str: ValueError: If the connection cannot be resolved to a valid string. """ connection = connection or self._block.session._bq_connection - return clients.resolve_full_bq_connection_name( + return clients.get_canonical_bq_connection_id( connection, default_project=self._block.session._project, default_location=self._block.session._location, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 730c287e1f..5df69e3da5 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -65,14 +65,19 @@ def remote_function( + # Make sure that the input/output types, and dataset can be used + # positionally. This avoids the worst of the breaking change from 1.x to + # 2.x while still preventing possible mixups between consecutive str + # parameters. 
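    # An illustrative sketch only (the dataset ID below is hypothetical, not
    # part of this module): with this signature the first three arguments may
    # still be passed positionally, while everything after the bare `*` must
    # be passed by keyword, e.g.
    #
    #     @bpd.remote_function(
    #         [float], str, "my_dataset",  # input_types, output_type, dataset
    #         cloud_function_service_account="default",  # keyword-only in 2.0
    #     )
    #     def get_bucket(num: float) -> str:
    #         return "at_or_above_4000" if num >= 4000 else "below_4000"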
input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, dataset: Optional[str] = None, + *, bigquery_connection: Optional[str] = None, reuse: bool = True, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, + cloud_function_service_account: str, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, max_batching_rows: Optional[int] = 1000, @@ -80,9 +85,9 @@ def remote_function( cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, cloud_function_memory_mib: Optional[int] = 1024, - cloud_function_ingress_settings: Optional[ - Literal["all", "internal-only", "internal-and-gclb"] - ] = None, + cloud_function_ingress_settings: Literal[ + "all", "internal-only", "internal-and-gclb" + ] = "internal-only", ): return global_session.with_default_session( bigframes.session.Session.remote_function, diff --git a/bigframes/session/__init__.py index 3ac9b75039..c0eebc0299 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1202,14 +1202,19 @@ def _check_file_size(self, filepath: str): def remote_function( self, + # Make sure that the input/output types, and dataset can be used + # positionally. This avoids the worst of the breaking change from 1.x to + # 2.x while still preventing possible mixups between consecutive str + # parameters. input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, dataset: Optional[str] = None, + *, bigquery_connection: Optional[str] = None, reuse: bool = True, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, + cloud_function_service_account: str, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, max_batching_rows: Optional[int] = 1000, @@ -1217,9 +1222,9 @@ cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, cloud_function_memory_mib: Optional[int] = 1024, - cloud_function_ingress_settings: Optional[ - Literal["all", "internal-only", "internal-and-gclb"] - ] = None, + cloud_function_ingress_settings: Literal[ + "all", "internal-only", "internal-and-gclb" + ] = "internal-only", ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1327,8 +1332,8 @@ Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be of the form supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. - cloud_function_service_account (str, Optional): - Service account to use for the cloud functions. If not provided + cloud_function_service_account (str): + Service account to use for the cloud functions. If "default" is provided then the default service account would be used. See https://cloud.google.com/functions/docs/securing/function-identity for more details. Please make sure the service account has the @@ -1392,8 +1397,8 @@ cloud_function_ingress_settings (str, Optional): Ingress settings controls dictating what traffic can reach the function. Options are: `all`, `internal-only`, or `internal-and-gclb`.
- If no setting is provided, `all` will be used by default and a warning - will be issued. See for more details + If no setting is provided, `internal-only` will be used by default. + See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. Returns: collections.abc.Callable: @@ -1406,8 +1411,8 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ return self._function_session.remote_function( - input_types, - output_type, + input_types=input_types, + output_type=output_type, session=self, dataset=dataset, bigquery_connection=bigquery_connection, @@ -1499,8 +1504,8 @@ def udf( deployed for the user defined code. """ return self._function_session.udf( - input_types, - output_type, + input_types=input_types, + output_type=output_type, session=self, dataset=dataset, bigquery_connection=bigquery_connection, @@ -1593,7 +1598,7 @@ def read_gbq_function( Another use case is to define your own remote function and use it later. For example, define the remote function: - >>> @bpd.remote_function() + >>> @bpd.remote_function(cloud_function_service_account="default") ... def tenfold(num: int) -> float: ... return num * 10 @@ -1620,7 +1625,7 @@ def read_gbq_function( note, row processor implies that the function has only one input parameter. - >>> @bpd.remote_function() + >>> @bpd.remote_function(cloud_function_service_account="default") ... def row_sum(s: bpd.Series) -> float: ... return s['a'] + s['b'] + s['c'] @@ -1774,8 +1779,8 @@ def _create_bq_connection( """Create the connection with the session settings and try to attach iam role to the connection SA. If any of project, location or connection isn't specified, use the session defaults. Returns fully-qualified connection name.""" connection = self._bq_connection if not connection else connection - connection = bigframes.clients.resolve_full_bq_connection_name( - connection_name=connection, + connection = bigframes.clients.get_canonical_bq_connection_id( + connection_id=connection, default_project=self._project, default_location=self._location, ) diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py new file mode 100644 index 0000000000..28dfee7840 --- /dev/null +++ b/bigframes/session/bigquery_session.py @@ -0,0 +1,171 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import logging +import threading +from typing import Callable, Optional, Sequence +import uuid + +# TODO: Non-ibis implementation +import bigframes_vendored.ibis.backends.bigquery.datatypes as ibis_bq +import google.cloud.bigquery as bigquery + +from bigframes.core.compile import googlesql + +KEEPALIVE_QUERY_TIMEOUT_SECONDS = 5.0 + +KEEPALIVE_FREQUENCY = datetime.timedelta(hours=6) + + +logger = logging.getLogger(__name__) + + +class SessionResourceManager: + """ + Responsible for allocating and cleaning up temporary gbq tables used by a BigFrames session. 
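    A rough usage sketch (illustrative only; assumes an authenticated
    ``bigquery.Client`` and a session located in "US"):

        manager = SessionResourceManager(bqclient, "US")
        table_ref = manager.create_temp_table(
            [bigquery.SchemaField("col_a", "INT64")], cluster_cols=["col_a"]
        )
        # ... run queries against the session-scoped temp table via table_ref ...
        manager.close()  # stops the keep-alive daemon and aborts the session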
+ """ + + def __init__( + self, bqclient: bigquery.Client, location: str, *, kms_key: Optional[str] = None + ): + self.bqclient = bqclient + self.location = location + self._kms_key = kms_key + self._session_id: Optional[str] = None + self._sessiondaemon: Optional[RecurringTaskDaemon] = None + self._session_lock = threading.RLock() + + def create_temp_table( + self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] + ) -> bigquery.TableReference: + """Create a temporary session table. Session is an exclusive resource, so throughput is limited""" + # Can't set a table in _SESSION as destination via query job API, so we + # run DDL, instead. + with self._session_lock: + table_ref = bigquery.TableReference( + bigquery.DatasetReference(self.bqclient.project, "_SESSION"), + uuid.uuid4().hex, + ) + job_config = bigquery.QueryJobConfig( + connection_properties=[ + bigquery.ConnectionProperty("session_id", self._get_session_id()) + ] + ) + if self._kms_key: + job_config.destination_encryption_configuration = ( + bigquery.EncryptionConfiguration(kms_key_name=self._kms_key) + ) + + ibis_schema = ibis_bq.BigQuerySchema.to_ibis(list(schema)) + + fields = [ + f"{googlesql.identifier(name)} {ibis_bq.BigQueryType.from_ibis(ibis_type)}" + for name, ibis_type in ibis_schema.fields.items() + ] + fields_string = ",".join(fields) + + cluster_string = "" + if cluster_cols: + cluster_cols_sql = ", ".join( + f"{googlesql.identifier(cluster_col)}" + for cluster_col in cluster_cols + ) + cluster_string = f"\nCLUSTER BY {cluster_cols_sql}" + + ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" + + job = self.bqclient.query(ddl, job_config=job_config) + + job.result() + # return the fully qualified table, so it can be used outside of the session + return job.destination + + def close(self): + if self._sessiondaemon is not None: + self._sessiondaemon.stop() + + if self._session_id is not None and self.bqclient is not None: + self.bqclient.query_and_wait(f"CALL BQ.ABORT_SESSION('{self._session_id}')") + + def _get_session_id(self) -> str: + if self._session_id: + return self._session_id + with self._session_lock: + if self._session_id is None: + job_config = bigquery.QueryJobConfig(create_session=True) + # Make sure the session is a new one, not one associated with another query. 
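                # (With the cache enabled, "SELECT 1" could be served from a
                # cached result attached to an earlier job; disabling it forces
                # a fresh job, so the session id obtained below belongs to this
                # query.)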
+ job_config.use_query_cache = False + query_job = self.bqclient.query( + "SELECT 1", job_config=job_config, location=self.location + ) + query_job.result() # blocks until finished + assert query_job.session_info is not None + assert query_job.session_info.session_id is not None + self._session_id = query_job.session_info.session_id + self._sessiondaemon = RecurringTaskDaemon( + task=self._keep_session_alive, frequency=KEEPALIVE_FREQUENCY + ) + self._sessiondaemon.start() + return query_job.session_info.session_id + else: + return self._session_id + + def _keep_session_alive(self): + # BigQuery sessions expire after 24 hours of disuse by default, but querying renews this, up to a maximum of 7 days. + with self._session_lock: + job_config = bigquery.QueryJobConfig( + connection_properties=[ + bigquery.ConnectionProperty("session_id", self._get_session_id()) + ] + ) + try: + self.bqclient.query_and_wait( + "SELECT 1", + location=self.location, + job_config=job_config, + wait_timeout=KEEPALIVE_QUERY_TIMEOUT_SECONDS, + ) + except Exception as e: + logger.warning("BigQuery session keep-alive query errored: %s", e) + + +class RecurringTaskDaemon: + def __init__(self, task: Callable[[], None], frequency: datetime.timedelta): + self._stop_event = threading.Event() + self._frequency = frequency + self._thread = threading.Thread(target=self._run_loop, daemon=True) + self._task = task + + def start(self): + """Start the daemon. Cannot be restarted once stopped.""" + if self._stop_event.is_set(): + raise RuntimeError("Cannot restart daemon thread.") + self._thread.start() + + def _run_loop(self): + while True: + self._stop_event.wait(self._frequency.total_seconds()) + if self._stop_event.is_set(): + return + try: + self._task() + except Exception as e: + logger.warning("RecurringTaskDaemon task errored: %s", e) + + def stop(self, timeout_seconds: Optional[float] = None): + """Stop and clean up the daemon.""" + if self._thread.is_alive(): + self._stop_event.set() + self._thread.join(timeout=timeout_seconds) diff --git a/bigframes/session/clients.py index 2b24b6cb8b..86be8bd897 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -17,7 +17,6 @@ import os import typing from typing import Optional -import warnings import google.api_core.client_info import google.api_core.client_options @@ -32,7 +31,6 @@ import pydata_google_auth import bigframes.constants -import bigframes.exceptions as bfe import bigframes.version from . import environment @@ -43,16 +41,11 @@ # BigQuery is a REST API, which requires the protocol as part of the URL. -_BIGQUERY_LOCATIONAL_ENDPOINT = "https://{location}-bigquery.googleapis.com" _BIGQUERY_REGIONAL_ENDPOINT = "https://bigquery.{location}.rep.googleapis.com" # BigQuery Connection and Storage are gRPC APIs, which don't support the # https:// protocol in the API endpoint URL.
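# For example (illustrative only; "me-central2" stands in for any
# REP-enabled location), the regional endpoint templates here resolve to:
#   https://bigquery.me-central2.rep.googleapis.com    (REST, scheme required)
#   bigquerystorage.me-central2.rep.googleapis.com     (gRPC, no scheme)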
-_BIGQUERYCONNECTION_LOCATIONAL_ENDPOINT = "{location}-bigqueryconnection.googleapis.com" -_BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT = "{location}-bigquerystorage.googleapis.com" -_BIGQUERYSTORAGE_REGIONAL_ENDPOINT = ( - "https://bigquerystorage.{location}.rep.googleapis.com" -) +_BIGQUERYSTORAGE_REGIONAL_ENDPOINT = "bigquerystorage.{location}.rep.googleapis.com" def _get_default_credentials_with_project(): @@ -114,19 +107,18 @@ def __init__( ) self._project = project - if ( - use_regional_endpoints - and location is not None - and location.lower() - not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - ): - msg = bfe.format_message( - bigframes.constants.LEP_DEPRECATION_WARNING_MESSAGE.format( - location=location - ), - fill=False, - ) - warnings.warn(msg, category=FutureWarning) + if use_regional_endpoints: + if location is None: + raise ValueError(bigframes.constants.LOCATION_NEEDED_FOR_REP_MESSAGE) + elif ( + location.lower() + not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS + ): + raise ValueError( + bigframes.constants.REP_NOT_SUPPORTED_MESSAGE.format( + location=location + ) + ) self._location = location self._use_regional_endpoints = use_regional_endpoints @@ -156,16 +148,8 @@ def _create_bigquery_client(self): api_endpoint=self._client_endpoints_override["bqclient"] ) elif self._use_regional_endpoints: - endpoint_template = _BIGQUERY_REGIONAL_ENDPOINT - if ( - self._location is not None - and self._location.lower() - not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - ): - endpoint_template = _BIGQUERY_LOCATIONAL_ENDPOINT - bq_options = google.api_core.client_options.ClientOptions( - api_endpoint=endpoint_template.format(location=self._location) + api_endpoint=_BIGQUERY_REGIONAL_ENDPOINT.format(location=self._location) ) bq_info = google.api_core.client_info.ClientInfo( @@ -212,12 +196,6 @@ def bqconnectionclient(self): bqconnection_options = google.api_core.client_options.ClientOptions( api_endpoint=self._client_endpoints_override["bqconnectionclient"] ) - elif self._use_regional_endpoints: - bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYCONNECTION_LOCATIONAL_ENDPOINT.format( - location=self._location - ) - ) bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name @@ -241,16 +219,10 @@ def bqstoragereadclient(self): api_endpoint=self._client_endpoints_override["bqstoragereadclient"] ) elif self._use_regional_endpoints: - endpoint_template = _BIGQUERYSTORAGE_REGIONAL_ENDPOINT - if ( - self._location is not None - and self._location.lower() - not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - ): - endpoint_template = _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=endpoint_template.format(location=self._location) + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) ) bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( diff --git a/bigframes/version.py b/bigframes/version.py index 356e73a71d..a94498722d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.42.0" +__version__ = "2.0.0.dev0" # {x-release-please-start-date} -__release_date__ = "2025-03-27" +__release_date__ = "2025-03-31" # {x-release-please-end} diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index b00044b087..1e712848df 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -86,6 +86,9 @@ uid: bigframes.operations.structs.StructAccessor - name: PlotAccessor uid: bigframes.operations.plotting.PlotAccessor + - name: BlobAccessor + uid: bigframes.operations.blob.BlobAccessor + status: beta name: Series - name: Window uid: bigframes.core.window.Window diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb index c190f219af..f830e35c16 100644 --- a/notebooks/apps/synthetic_data_generation.ipynb +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -248,8 +248,8 @@ }, "outputs": [], "source": [ - "@bpd.remote_function([int], str, packages=['faker', 'pandas'])\n", - "def data_generator(id):\n", + "@bpd.remote_function(packages=['faker', 'pandas'], cloud_function_service_account=\"default\")\n", + "def data_generator(id: int) -> str:\n", " context = {}\n", " exec(code, context)\n", " result_df = context.get(\"result_df\")\n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 88633f8635..788111cfe6 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -914,8 +914,8 @@ }, "outputs": [], "source": [ - "@bf.remote_function([str], str)\n", - "def extract_code(text: str):\n", + "@bf.remote_function(cloud_function_service_account=\"default\")\n", + "def extract_code(text: str) -> str:\n", " try:\n", " res = text[text.find('\\n')+1:text.find('```', 3)]\n", " res = res.replace(\"import pandas as pd\", \"import bigframes.pandas as bf\")\n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb index d458a0f53b..1a9b568897 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb @@ -369,7 +369,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index c5deeef1c5..a8158bcb85 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -1485,8 +1485,8 @@ }, "outputs": [], "source": [ - "@bpd.remote_function([float], str)\n", - "def get_bucket(num):\n", + "@bpd.remote_function(cloud_function_service_account=\"default\")\n", + "def get_bucket(num: float) -> str:\n", " if not num: return \"NA\"\n", " boundary = 4000\n", " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb index 1b138c6a66..066cd18136 100644 --- a/notebooks/location/regionalized.ipynb +++ b/notebooks/location/regionalized.ipynb @@ -1475,8 +1475,8 @@ } ], "source": [ - "@bpd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", - "def get_bucket(num):\n", + "@bpd.remote_function(bigquery_connection='bigframes-rf-conn', 
cloud_function_service_account=\"default\")\n", + "def get_bucket(num: float) -> str:\n", " if not num: return \"NA\"\n", " boundary = 4000\n", " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" diff --git a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb index 4bfdcc24aa..501bfc88d3 100644 --- a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb +++ b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb @@ -27,21 +27,25 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2024-10-01 22:44:50.650768+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n" - ] + "data": { + "text/html": [ + "Query job aa2b9845-0e66-4f42-a360-ffe03215caf6 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { "text/html": [ - "Query job 4c2f2252-687a-47c3-87ad-22db8ad96e2b is DONE. 0 Bytes processed. Open Job" + "Query job fe2bc354-672e-4d08-b969-bb2ede299fca is DONE. 28.9 kB processed. Open Job" ], "text/plain": [ "" @@ -53,7 +57,7 @@ { "data": { "text/html": [ - "Query job a05c7268-8db2-468b-9fb4-0fb5c9534f51 is DONE. 0 Bytes processed. Open Job" + "Query job 8d16fa20-391f-4917-86fc-1a595dba3fc6 is DONE. 33.6 kB processed. Open Job" ], "text/plain": [ "" @@ -97,149 +101,317 @@ " 0\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 50.5\n", - " 15.9\n", - " 225.0\n", - " 5400.0\n", + " 45.2\n", + " 16.4\n", + " 223.0\n", + " 5950.0\n", " MALE\n", " \n", " \n", " 1\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 45.1\n", + " 46.5\n", " 14.5\n", - " 215.0\n", - " 5000.0\n", + " 213.0\n", + " 4400.0\n", " FEMALE\n", " \n", " \n", " 2\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Torgersen\n", - " 41.4\n", - " 18.5\n", - " 202.0\n", - " 3875.0\n", - " MALE\n", + " Biscoe\n", + " 37.7\n", + " 16.0\n", + " 183.0\n", + " 3075.0\n", + " FEMALE\n", " \n", " \n", " 3\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Torgersen\n", - " 38.6\n", - " 17.0\n", - " 188.0\n", - " 2900.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.4\n", + " 15.6\n", + " 221.0\n", + " 5000.0\n", + " MALE\n", " \n", " \n", " 4\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 46.5\n", - " 14.8\n", - " 217.0\n", - " 5200.0\n", + " 46.1\n", + " 13.2\n", + " 211.0\n", + " 4500.0\n", " FEMALE\n", " \n", " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", + " 5\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 43.1\n", + " 19.2\n", + " 197.0\n", + " 3500.0\n", + " MALE\n", " \n", " \n", - " 339\n", + " 6\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.2\n", + " 15.8\n", + " 215.0\n", + " 5300.0\n", + " MALE\n", + " \n", + " \n", + " 7\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 38.1\n", - " 17.6\n", + " 36.2\n", + " 17.3\n", " 187.0\n", - " 3425.0\n", + " 3300.0\n", " FEMALE\n", " \n", " \n", - " 340\n", + " 8\n", + " Chinstrap 
penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 46.0\n", + " 18.9\n", + " 195.0\n", + " 4150.0\n", + " FEMALE\n", + " \n", + " \n", + " 9\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 54.3\n", + " 15.7\n", + " 231.0\n", + " 5650.0\n", + " MALE\n", + " \n", + " \n", + " 11\n", " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 39.5\n", + " 17.4\n", + " 186.0\n", + " 3800.0\n", + " FEMALE\n", + " \n", + " \n", + " 12\n", + " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 36.4\n", - " 17.1\n", - " 184.0\n", - " 2850.0\n", + " 42.7\n", + " 13.7\n", + " 208.0\n", + " 3950.0\n", " FEMALE\n", " \n", " \n", - " 341\n", + " 13\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 41.0\n", + " 20.0\n", + " 203.0\n", + " 4725.0\n", + " MALE\n", + " \n", + " \n", + " 14\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.5\n", + " 15.0\n", + " 219.0\n", + " 4850.0\n", + " FEMALE\n", + " \n", + " \n", + " 15\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 40.9\n", - " 16.6\n", - " 187.0\n", - " 3200.0\n", + " 49.6\n", + " 18.2\n", + " 193.0\n", + " 3775.0\n", + " MALE\n", + " \n", + " \n", + " 16\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.8\n", + " 17.3\n", + " 228.0\n", + " 5600.0\n", + " MALE\n", + " \n", + " \n", + " 17\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.2\n", + " 14.1\n", + " 217.0\n", + " 4375.0\n", " FEMALE\n", " \n", " \n", - " 342\n", + " 18\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 41.3\n", - " 21.1\n", - " 195.0\n", - " 4400.0\n", + " 38.8\n", + " 17.2\n", + " 180.0\n", + " 3800.0\n", " MALE\n", " \n", " \n", - " 343\n", + " 19\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 45.2\n", - " 16.6\n", - " 191.0\n", - " 3250.0\n", + " 51.0\n", + " 18.8\n", + " 203.0\n", + " 4100.0\n", + " MALE\n", + " \n", + " \n", + " 20\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 42.9\n", + " 13.1\n", + " 215.0\n", + " 5000.0\n", + " FEMALE\n", + " \n", + " \n", + " 21\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.4\n", + " 15.3\n", + " 224.0\n", + " 5550.0\n", + " MALE\n", + " \n", + " \n", + " 22\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 49.0\n", + " 16.1\n", + " 216.0\n", + " 5550.0\n", + " MALE\n", + " \n", + " \n", + " 23\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 43.4\n", + " 14.4\n", + " 218.0\n", + " 4600.0\n", + " FEMALE\n", + " \n", + " \n", + " 24\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.0\n", + " 15.4\n", + " 220.0\n", + " 5050.0\n", + " MALE\n", + " \n", + " \n", + " 25\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 47.5\n", + " 14.0\n", + " 212.0\n", + " 4875.0\n", " FEMALE\n", " \n", " \n", "\n", - "

334 rows × 7 columns\n", + "25 rows × 7 columns

\n", "[334 rows x 7 columns in total]" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", - "1 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "2 Adelie Penguin (Pygoscelis adeliae) Torgersen 41.4 \n", - "3 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.6 \n", - "4 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - ".. ... ... ... \n", - "339 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", - "340 Adelie Penguin (Pygoscelis adeliae) Biscoe 36.4 \n", - "341 Chinstrap penguin (Pygoscelis antarctica) Dream 40.9 \n", - "342 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", - "343 Chinstrap penguin (Pygoscelis antarctica) Dream 45.2 \n", + " species island culmen_length_mm \\\n", + "0 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", + "1 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", + "2 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.7 \n", + "3 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 \n", + "4 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 \n", + "5 Adelie Penguin (Pygoscelis adeliae) Torgersen 43.1 \n", + "6 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", + "7 Adelie Penguin (Pygoscelis adeliae) Dream 36.2 \n", + "8 Chinstrap penguin (Pygoscelis antarctica) Dream 46.0 \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe 54.3 \n", + "11 Adelie Penguin (Pygoscelis adeliae) Torgersen 39.5 \n", + "12 Gentoo penguin (Pygoscelis papua) Biscoe 42.7 \n", + "13 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.0 \n", + "14 Gentoo penguin (Pygoscelis papua) Biscoe 48.5 \n", + "15 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", + "16 Gentoo penguin (Pygoscelis papua) Biscoe 50.8 \n", + "17 Gentoo penguin (Pygoscelis papua) Biscoe 46.2 \n", + "18 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.8 \n", + "19 Chinstrap penguin (Pygoscelis antarctica) Dream 51.0 \n", + "20 Gentoo penguin (Pygoscelis papua) Biscoe 42.9 \n", + "21 Gentoo penguin (Pygoscelis papua) Biscoe 50.4 \n", + "22 Gentoo penguin (Pygoscelis papua) Biscoe 49.0 \n", + "23 Gentoo penguin (Pygoscelis papua) Biscoe 43.4 \n", + "24 Gentoo penguin (Pygoscelis papua) Biscoe 45.0 \n", + "25 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 15.9 225.0 5400.0 MALE \n", - "1 14.5 215.0 5000.0 FEMALE \n", - "2 18.5 202.0 3875.0 MALE \n", - "3 17.0 188.0 2900.0 FEMALE \n", - "4 14.8 217.0 5200.0 FEMALE \n", - ".. ... ... ... ... 
\n", - "339 17.6 187.0 3425.0 FEMALE \n", - "340 17.1 184.0 2850.0 FEMALE \n", - "341 16.6 187.0 3200.0 FEMALE \n", - "342 21.1 195.0 4400.0 MALE \n", - "343 16.6 191.0 3250.0 FEMALE \n", + " culmen_depth_mm flipper_length_mm body_mass_g sex \n", + "0 16.4 223.0 5950.0 MALE \n", + "1 14.5 213.0 4400.0 FEMALE \n", + "2 16.0 183.0 3075.0 FEMALE \n", + "3 15.6 221.0 5000.0 MALE \n", + "4 13.2 211.0 4500.0 FEMALE \n", + "5 19.2 197.0 3500.0 MALE \n", + "6 15.8 215.0 5300.0 MALE \n", + "7 17.3 187.0 3300.0 FEMALE \n", + "8 18.9 195.0 4150.0 FEMALE \n", + "9 15.7 231.0 5650.0 MALE \n", + "11 17.4 186.0 3800.0 FEMALE \n", + "12 13.7 208.0 3950.0 FEMALE \n", + "13 20.0 203.0 4725.0 MALE \n", + "14 15.0 219.0 4850.0 FEMALE \n", + "15 18.2 193.0 3775.0 MALE \n", + "16 17.3 228.0 5600.0 MALE \n", + "17 14.1 217.0 4375.0 FEMALE \n", + "18 17.2 180.0 3800.0 MALE \n", + "19 18.8 203.0 4100.0 MALE \n", + "20 13.1 215.0 5000.0 FEMALE \n", + "21 15.3 224.0 5550.0 MALE \n", + "22 16.1 216.0 5550.0 MALE \n", + "23 14.4 218.0 4600.0 FEMALE \n", + "24 15.4 220.0 5050.0 MALE \n", + "25 14.0 212.0 4875.0 FEMALE \n", "...\n", "\n", "[334 rows x 7 columns]" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -253,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -277,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -286,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -297,37 +469,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 582e7c02-bcc6-412a-a513-46ee5dba7ad8 is DONE. 2.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 917ff09b-072b-4c55-b26f-1780e2e97519 is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 2f4e102d-48bc-401f-a781-39830e2c6c9b is DONE. 16.4 kB processed. Open Job" + "Query job 9ce9fb43-306d-46e9-bbe5-d98ee55143bd is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -339,7 +487,7 @@ { "data": { "text/html": [ - "Query job aabe8a28-8dce-4e00-8a8c-18e9e090e6e7 is DONE. 26.3 kB processed. Open Job" + "Query job 8c86156d-ee97-4f66-9dc1-db15ff3d8e8e is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -351,19 +499,7 @@ { "data": { "text/html": [ - "Query job ec9d8798-e28e-44bc-aa8e-44ab28f0214f is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8aa0fa94-e43e-41c6-9de3-f0a67392c47f is DONE. 48 Bytes processed. Open Job" + "Query job b8f2b382-b938-4dff-8bdb-129703ade285 is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -377,10 +513,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 318.358226 151689.571141 0.009814 \n", + "0 297.36838 148892.914876 0.009057 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 255.095561 0.780659 0.783304 \n", + "0 238.424052 0.814613 0.816053 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -388,7 +524,7 @@ { "data": { "text/html": [ - "Query job bf6ef937-9583-4aa8-8313-563638465d5f is DONE. 25.9 kB processed. Open Job" + "Query job ec2968f3-1713-4617-8a26-6fe4267f8061 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -400,7 +536,7 @@ { "data": { "text/html": [ - "Query job 4c8b564c-5bbd-4447-babf-e307524962e5 is DONE. 16.4 kB processed. Open Job" + "Query job c7a1b80f-26f5-41b1-bcdc-b276af141671 is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -412,31 +548,7 @@ { "data": { "text/html": [ - "Query job cd5e337f-6d44-473d-a90b-be8a79bba6bf is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ad80012d-7c6c-4dbf-9271-2ff7f899f174 is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8fc20587-d8ba-4c0f-bed9-3e1cf3c6ae52 is DONE. 48 Bytes processed. Open Job" + "Query job 82054991-c22f-41b3-9802-f16919949e26 is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -450,10 +562,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 306.435423 151573.84019 0.008539 \n", + "0 307.6149 139013.303482 0.007907 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 244.2899 0.737623 0.742859 \n", + "0 266.589811 0.782835 0.794297 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -461,7 +573,7 @@ { "data": { "text/html": [ - "Query job 90286d2b-e805-4b19-8876-c9973579e9ff is DONE. 25.9 kB processed. Open Job" + "Query job 3e5ae019-7c5b-44ea-8392-85145fdb6802 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -473,7 +585,7 @@ { "data": { "text/html": [ - "Query job ceb6c8f2-16cc-4758-bde8-3e4975ba1452 is DONE. 16.4 kB processed. Open Job" + "Query job c35dfd28-504d-4d12-b039-da890b9cb51d is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -485,31 +597,7 @@ { "data": { "text/html": [ - "Query job f49434fa-a7e0-406a-bbe2-5651595e3418 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5dd7a277-10fe-4117-a354-ef8668a8b913 is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4b58b016-9a50-4a66-b86c-8431faad43bf is DONE. 48 Bytes processed. Open Job" + "Query job 29ac1bb3-f864-400e-8cac-0b4c7f78ebcd is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -523,10 +611,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 253.349578 112039.741164 0.007153 \n", + "0 348.412701 180661.063512 0.01125 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 185.916761 0.823381 0.823456 \n", + "0 313.29406 0.744053 0.74537 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -534,7 +622,7 @@ { "data": { "text/html": [ - "Query job ca700ecf-0c08-4286-b979-2bc7a0bee89c is DONE. 25.9 kB processed. Open Job" + "Query job d90f5938-2894-4c93-8691-21162a2fca4c is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -546,7 +634,7 @@ { "data": { "text/html": [ - "Query job f0731e71-7754-47a2-a553-93a61e712533 is DONE. 16.4 kB processed. Open Job" + "Query job 4c6328b3-2d3f-42bb-9f83-4f8c84773c95 is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -558,31 +646,7 @@ { "data": { "text/html": [ - "Query job ae66d34d-5f0a-4297-9d41-57067ae54a9b is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7655a649-ceca-4792-b764-fb371f5872ec is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8b0634c8-73a9-422c-9644-842142dbb059 is DONE. 48 Bytes processed. Open Job" + "Query job 8a885a6a-d3ad-4569-80ce-4f57d9b86105 is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -596,10 +660,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 320.381386 155234.800349 0.008638 \n", + "0 309.991882 151820.705254 0.008898 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 306.281263 0.793405 0.794504 \n", + "0 212.758708 0.694001 0.694287 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -607,19 +671,7 @@ { "data": { "text/html": [ - "Query job bb26cde9-1991-4e0a-8492-b19d15b1b7aa is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7ddd0883-492d-46bc-a588-f3cbab2474bb is DONE. 16.5 kB processed. Open Job" + "Query job d1e60370-11c8-4f49-a8d5-85417662aa51 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -631,7 +683,7 @@ { "data": { "text/html": [ - "Query job 5de571e4-d2f9-43c7-b014-3d65a3731b64 is DONE. 26.3 kB processed. Open Job" + "Query job d8e8712a-6347-4725-a27d-49810d4acc1c is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -643,19 +695,7 @@ { "data": { "text/html": [ - "Query job d20ac7d8-cd21-4a1f-a200-2dfa6373bcdb is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 235e8a80-33ea-4a95-a7d0-34e40a8ca396 is DONE. 48 Bytes processed. Open Job" + "Query job 6a0ebaa6-5572-404f-a41d-b90e2c65d948 is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -669,10 +709,10 @@ "output_type": "stream", "text": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 303.855563 141869.030392 0.008989 \n", + "0 256.569216 103495.042886 0.006605 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 245.102301 0.731737 0.732793 \n", + "0 222.940815 0.818589 0.832344 \n", "\n", "[1 rows x 6 columns]\n" ] @@ -696,145 +736,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 9274ae2e-e9a7-4701-ac64-56632323d02a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 22f9477b-de02-4c07-b480-c3270a69d7e0 is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ebb192b7-4a9e-4238-b4e6-b630e2f94988 is DONE. 16.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 44441e8c-8753-41b0-b1b7-9a6c4eab8c74 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 239fed9a-b488-47da-a0df-a3b7c6ec40f4 is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job f4248b2d-3430-426c-872d-8590f2878366 is DONE. 16.4 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d9f6b034-c300-4dd7-91dd-48fa912f2456 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e2f39f5b-2f4c-402a-a8d5-a7cff918508d is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 54cf3710-b5f4-4aec-b11f-0281126a151a is DONE. 16.4 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 833d13cd-ec59-499b-98f6-95ec18766698 is DONE. 26.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0120e332-0691-44a4-9198-f5c131b8f59c is DONE. 25.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job f4ba7a4c-5fd9-4f97-ab34-a8f139e7472a is DONE. 16.4 kB processed. Open Job" + "Query job 5bdcd65d-7d72-4094-be3a-cf67a1787cf4 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -846,7 +754,7 @@ { "data": { "text/html": [ - "Query job 857aadfc-2ade-429c-bef8-428e44d48c55 is DONE. 26.3 kB processed. Open Job" + "Query job bb0504b2-b656-4a08-9bf8-dcab0d188022 is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -858,7 +766,7 @@ { "data": { "text/html": [ - "Query job 906d6d34-a506-4957-b07f-7e5ed2e0634b is DONE. 25.9 kB processed. 
Open Job" + "Query job 8c5c4b66-9a14-455a-a3f5-99f0f522713f is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -870,7 +778,7 @@ { "data": { "text/html": [ - "Query job 498563db-3e68-4df7-a2d5-83da6adb49ed is DONE. 16.5 kB processed. Open Job" + "Query job 9c9b81de-35b6-4561-8881-57da8b73cc7f is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -882,7 +790,7 @@ { "data": { "text/html": [ - "Query job 01af95ca-6288-4253-b379-7327e1c9de88 is DONE. 26.3 kB processed. Open Job" + "Query job b781f1aa-6572-49e5-ab8d-f1908b497a1c is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -894,7 +802,7 @@ { "data": { "text/html": [ - "Query job 5ce36d32-6db1-42e5-a8cf-84bb8244a57e is DONE. 48 Bytes processed. Open Job" + "Query job 41a2a58e-0289-4d58-8e39-de286f2a91fb is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -906,7 +814,7 @@ { "data": { "text/html": [ - "Query job e05ec77d-6025-4edd-b5e3-9c4e7a124e71 is DONE. 48 Bytes processed. Open Job" + "Query job 7ee839a9-f77c-49b0-844e-8eecc1647b97 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -918,7 +826,7 @@ { "data": { "text/html": [ - "Query job 418a4a5d-2bb3-41e5-9e7c-9852389a491b is DONE. 48 Bytes processed. Open Job" + "Query job a317d488-8589-4faa-940b-e59af91caf4d is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -930,7 +838,7 @@ { "data": { "text/html": [ - "Query job b33e30da-cfed-4d6f-b227-f433d97879cb is DONE. 48 Bytes processed. Open Job" + "Query job 2de96ea8-519a-4976-a641-eb26a4bd38fb is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -942,7 +850,7 @@ { "data": { "text/html": [ - "Query job 7ad7f0c8-ecae-4ef2-bc91-0ebeb5f88e7b is DONE. 48 Bytes processed. Open Job" + "Query job 41a7d5a0-c76b-4ef3-a3da-d4d5a2ebbb0e is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -954,7 +862,7 @@ { "data": { "text/html": [ - "Query job a6e8bd12-1122-4c26-b0e1-58342238016c is DONE. 48 Bytes processed. Open Job" + "Query job 9e82ddc9-8461-4644-ba34-957a7426ff8e is DONE. 16.4 kB processed. Open Job" ], "text/plain": [ "" @@ -966,7 +874,7 @@ { "data": { "text/html": [ - "Query job c553439c-9586-479c-92c5-01a0d333125b is DONE. 48 Bytes processed. Open Job" + "Query job 0fa84d07-fdfa-41c9-b601-9326a94f3a09 is DONE. 37.3 kB processed. Open Job" ], "text/plain": [ "" @@ -978,7 +886,7 @@ { "data": { "text/html": [ - "Query job c598d64c-26b9-49fc-afad-a6544b38cfa2 is DONE. 48 Bytes processed. Open Job" + "Query job d4495568-f1b5-431b-b892-4fc7dcbccfd5 is DONE. 37.0 kB processed. Open Job" ], "text/plain": [ "" @@ -990,7 +898,7 @@ { "data": { "text/html": [ - "Query job ebcb73e8-1294-4f10-b826-c495046fd714 is DONE. 48 Bytes processed. Open Job" + "Query job af1e6460-3078-4a8b-8992-9e7df9dcfbb3 is DONE. 16.5 kB processed. Open Job" ], "text/plain": [ "" @@ -1002,7 +910,7 @@ { "data": { "text/html": [ - "Query job d73f57ba-a25d-4b90-b474-13d81a3e22ab is DONE. 48 Bytes processed. Open Job" + "Query job f14401bf-fd80-401a-a61d-52614fba1ca7 is DONE. 37.3 kB processed. 
Open Job" ], "text/plain": [ "" @@ -1015,53 +923,53 @@ "data": { "text/plain": [ "{'test_score': [ mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 237.154735 97636.17064 0.005571 \n", + " 0 322.341485 157616.627179 0.009137 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 187.883888 0.842018 0.846816 \n", + " 0 269.412639 0.705594 0.724882 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 304.281635 141966.045867 0.008064 \n", + " 0 289.682121 136550.318797 0.00878 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 236.096453 0.762979 0.764008 \n", + " 0 212.874686 0.799363 0.81416 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 316.380322 157332.146085 0.009699 \n", + " 0 325.358522 155218.752974 0.009606 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 222.824496 0.764607 0.765369 \n", + " 0 267.301671 0.777174 0.7782 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 309.609657 152421.826588 0.009772 \n", + " 0 286.874056 120586.575364 0.007484 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 254.163976 0.772954 0.773119 \n", + " 0 247.656578 0.79281 0.796001 \n", " \n", " [1 rows x 6 columns],\n", " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - " 0 339.339345 169760.629993 0.010597 \n", + " 0 287.989397 145947.465344 0.008447 \n", " \n", " median_absolute_error r2_score explained_variance \n", - " 0 312.335706 0.741167 0.74118 \n", + " 0 186.777549 0.791452 0.798825 \n", " \n", " [1 rows x 6 columns]],\n", - " 'fit_time': [18.200648623984307,\n", - " 17.565149880945683,\n", - " 18.202434757025912,\n", - " 18.04062689607963,\n", - " 19.370970834977925],\n", - " 'score_time': [4.76077218609862,\n", - " 4.577479084953666,\n", - " 4.581933492794633,\n", - " 4.741644307971001,\n", - " 5.1031754210125655]}" + " 'fit_time': [18.79181448201416,\n", + " 19.092008439009078,\n", + " 75.7446747609647,\n", + " 17.520530884969048,\n", + " 21.157033596013207],\n", + " 'score_time': [4.247669544012751,\n", + " 6.792615927988663,\n", + " 4.502274781989399,\n", + " 4.484583999030292,\n", + " 4.224339194013737]}" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1097,7 +1005,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/ml/easy_linear_regression.ipynb b/notebooks/ml/easy_linear_regression.ipynb index fdabd82a4b..5a7258a182 100644 --- a/notebooks/ml/easy_linear_regression.ipynb +++ b/notebooks/ml/easy_linear_regression.ipynb @@ -52,20 +52,9 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(DatasetReference('shobs-test', 'bqml_tutorial'))" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dataset = f\"{session.bqclient.project}.bqml_tutorial\"\n", "session.bqclient.create_dataset(dataset, exists_ok=True)" @@ -96,383 +85,9 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 
525fc879-1f59-45e8-96b4-f9c67d244d06 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 91aa1b30-2b0e-41eb-9bfb-4f6232913b31 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
25 rows × 7 columns\n",
[344 rows x 7 columns in total]" - ], - "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.1 \n", - "1 Adelie Penguin (Pygoscelis adeliae) Torgersen 39.1 \n", - "2 Gentoo penguin (Pygoscelis papua) Biscoe 47.4 \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 42.5 \n", - "4 Adelie Penguin (Pygoscelis adeliae) Biscoe 43.2 \n", - "5 Gentoo penguin (Pygoscelis papua) Biscoe 46.7 \n", - "6 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", - "7 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", - "8 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "9 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", - "10 Gentoo penguin (Pygoscelis papua) Biscoe 48.2 \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", - "12 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", - "13 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.8 \n", - "14 Adelie Penguin (Pygoscelis adeliae) Biscoe 35.0 \n", - "15 Gentoo penguin (Pygoscelis papua) Biscoe 48.7 \n", - "16 Adelie Penguin (Pygoscelis adeliae) Torgersen 34.6 \n", - "17 Gentoo penguin (Pygoscelis papua) Biscoe 46.8 \n", - "18 Chinstrap penguin (Pygoscelis antarctica) Dream 50.3 \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.2 \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream 51.0 \n", - "21 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", - "22 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 42.2 \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 51.7 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 18.9 188.0 4300.0 MALE \n", - "1 18.7 181.0 3750.0 MALE \n", - "2 14.6 212.0 4725.0 FEMALE \n", - "3 16.7 187.0 3350.0 FEMALE \n", - "4 19.0 197.0 4775.0 MALE \n", - "5 15.3 219.0 5200.0 MALE \n", - "6 21.1 195.0 4400.0 MALE \n", - "7 13.8 215.0 4750.0 FEMALE \n", - "8 13.5 210.0 4550.0 FEMALE \n", - "9 15.2 216.0 5000.0 FEMALE \n", - "10 15.6 221.0 5100.0 MALE \n", - "11 18.6 190.0 3700.0 FEMALE \n", - "12 15.0 223.0 5550.0 MALE \n", - "13 20.0 190.0 4250.0 MALE \n", - "14 17.9 190.0 3450.0 FEMALE \n", - "15 15.7 208.0 5350.0 MALE \n", - "16 21.1 198.0 4400.0 MALE \n", - "17 15.4 215.0 5150.0 MALE \n", - "18 20.0 197.0 3300.0 MALE \n", - "19 18.1 178.0 3900.0 MALE \n", - "20 18.8 203.0 4100.0 MALE \n", - "21 17.9 187.0 3200.0 FEMALE \n", - "22 13.9 210.0 4200.0 FEMALE \n", - "23 18.5 180.0 3550.0 FEMALE \n", - "24 20.3 194.0 3775.0 MALE \n", - "...\n", - "\n", - "[344 rows x 7 columns]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# read a BigQuery table to a BigQuery DataFrame\n", "df = bigframes.pandas.read_gbq(f\"bigquery-public-data.ml_datasets.penguins\")\n", @@ -491,357 +106,9 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d2bd7c5e-2652-4c0d-8495-8ef65e89031b is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 92f0a5e5-bc61-426f-a9ef-213a1c376851 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
islandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Biscoe40.118.9188.04300.0MALE
1Torgersen39.118.7181.03750.0MALE
4Biscoe43.219.0197.04775.0MALE
6Biscoe41.321.1195.04400.0MALE
11Dream38.118.6190.03700.0FEMALE
13Biscoe37.820.0190.04250.0MALE
14Biscoe35.017.9190.03450.0FEMALE
16Torgersen34.621.1198.04400.0MALE
19Dream37.218.1178.03900.0MALE
21Biscoe40.517.9187.03200.0FEMALE
23Dream42.218.5180.03550.0FEMALE
30Dream39.221.1196.04150.0MALE
32Torgersen42.917.6196.04700.0MALE
38Dream41.117.5190.03900.0MALE
40Torgersen38.621.2191.03800.0MALE
42Biscoe35.516.2195.03350.0FEMALE
44Dream39.218.6190.04250.0MALE
45Torgersen35.215.9186.03050.0FEMALE
46Dream43.218.5192.04100.0MALE
49Biscoe39.617.7186.03500.0FEMALE
53Biscoe45.620.3191.04600.0MALE
58Torgersen40.916.8191.03700.0FEMALE
60Torgersen40.318.0195.03250.0FEMALE
62Dream36.018.5186.03100.0FEMALE
63Torgersen39.320.6190.03650.0MALE
\n", - "

25 rows × 6 columns

\n", - "
[146 rows x 6 columns in total]" - ], - "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", - "0 Biscoe 40.1 18.9 188.0 \n", - "1 Torgersen 39.1 18.7 181.0 \n", - "4 Biscoe 43.2 19.0 197.0 \n", - "6 Biscoe 41.3 21.1 195.0 \n", - "11 Dream 38.1 18.6 190.0 \n", - "13 Biscoe 37.8 20.0 190.0 \n", - "14 Biscoe 35.0 17.9 190.0 \n", - "16 Torgersen 34.6 21.1 198.0 \n", - "19 Dream 37.2 18.1 178.0 \n", - "21 Biscoe 40.5 17.9 187.0 \n", - "23 Dream 42.2 18.5 180.0 \n", - "30 Dream 39.2 21.1 196.0 \n", - "32 Torgersen 42.9 17.6 196.0 \n", - "38 Dream 41.1 17.5 190.0 \n", - "40 Torgersen 38.6 21.2 191.0 \n", - "42 Biscoe 35.5 16.2 195.0 \n", - "44 Dream 39.2 18.6 190.0 \n", - "45 Torgersen 35.2 15.9 186.0 \n", - "46 Dream 43.2 18.5 192.0 \n", - "49 Biscoe 39.6 17.7 186.0 \n", - "53 Biscoe 45.6 20.3 191.0 \n", - "58 Torgersen 40.9 16.8 191.0 \n", - "60 Torgersen 40.3 18.0 195.0 \n", - "62 Dream 36.0 18.5 186.0 \n", - "63 Torgersen 39.3 20.6 190.0 \n", - "\n", - " body_mass_g sex \n", - "0 4300.0 MALE \n", - "1 3750.0 MALE \n", - "4 4775.0 MALE \n", - "6 4400.0 MALE \n", - "11 3700.0 FEMALE \n", - "13 4250.0 MALE \n", - "14 3450.0 FEMALE \n", - "16 4400.0 MALE \n", - "19 3900.0 MALE \n", - "21 3200.0 FEMALE \n", - "23 3550.0 FEMALE \n", - "30 4150.0 MALE \n", - "32 4700.0 MALE \n", - "38 3900.0 MALE \n", - "40 3800.0 MALE \n", - "42 3350.0 FEMALE \n", - "44 4250.0 MALE \n", - "45 3050.0 FEMALE \n", - "46 4100.0 MALE \n", - "49 3500.0 FEMALE \n", - "53 4600.0 MALE \n", - "58 3700.0 FEMALE \n", - "60 3250.0 FEMALE \n", - "62 3100.0 FEMALE \n", - "63 3650.0 MALE \n", - "...\n", - "\n", - "[146 rows x 6 columns]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# filter down to the data we want to analyze\n", "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", @@ -880,56 +147,9 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 43c8fdc2-0bc3-4607-a36d-5bee87c894d8 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 97e0c84d-aa6a-4197-9377-740d973ea44d is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 726b9a5e-48a1-4ced-ac34-fa028dcb2bf4 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from bigframes.ml.linear_model import LinearRegression\n", "\n", @@ -942,104 +162,9 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 28975567-2526-40f7-a7be-9dee6f782b4e is DONE. 9.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5c71d3d9-0e1c-45bd-866f-1f98f056260d is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 890767f7-a83b-469a-9f3e-abd5667f8202 is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0223.87876378553.6016340.005614181.3309110.6239510.623951
\n", - "

1 rows × 6 columns

\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 223.878763 78553.601634 0.005614 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 181.330911 0.623951 0.623951 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# check how the model performed\n", "model.score(feature_columns, label_columns)" @@ -1047,103 +172,9 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d59df3e8-cf87-4340-a4c7-a27c3abfcc50 is DONE. 29.1 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5af493aa-96f9-434f-a101-ec855f4de694 is DONE. 8 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e2076bc3-3966-4c45-8265-c461756a7782 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e9cdfca7-30f6-4e93-95fb-244896e7c2ab is DONE. 16 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_body_mass_g
3345891.735118
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "334 5891.735118\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# use the model to predict the missing labels\n", "model.predict(missing_body_mass)" @@ -1159,32 +190,9 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Copy job cb4ef454-10df-4325-b9cb-6084df3ac9d5 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "LinearRegression(optimize_strategy='NORMAL_EQUATION')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# save the model to a permanent location in BigQuery, so we can use it in future sessions (and elsewhere in BQ)\n", "model.to_gbq(penguins_model, replace=True)" @@ -1199,20 +207,9 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression(optimize_strategy='NORMAL_EQUATION')" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# WARNING - until b/281709360 is fixed & pipeline is updated, pipelines will load as models,\n", "# and details of their transform steps will be lost (the loaded model will behave the same)\n", diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb index 2114311e10..e2bc88ecae 100644 --- a/notebooks/remote_functions/remote_function.ipynb +++ b/notebooks/remote_functions/remote_function.ipynb @@ -174,7 +174,7 @@ "source": [ "# User defined function\n", "# https://www.codespeedy.com/find-nth-prime-number-in-python/\n", - "def nth_prime(n):\n", + "def nth_prime(n: int) -> int:\n", " prime_numbers = [2,3]\n", " i=3\n", " if(0 int:\n", " prime_numbers = [2,3]\n", " i=3\n", " if(0 str:\n", " if duration_minutes < 90:\n", " return \"short\"\n", @@ -466,7 +466,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False)\n", + "@bpd.remote_function(reuse=False, cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " if duration_minutes < 90:\n", " return DURATION_CATEGORY_SHORT\n", @@ -675,7 +675,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False)\n", + "@bpd.remote_function(reuse=False, cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " duration_hours = mymath.ceil(duration_minutes / 60)\n", " return f\"{duration_hours}h\"\n", @@ -886,7 +886,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False)\n", + "@bpd.remote_function(reuse=False, cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " duration_hours = get_hour_ceiling(duration_minutes)\n", " return f\"{duration_hours} hrs\"\n", @@ -1068,7 +1068,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False, packages=[\"cryptography\"])\n", + "@bpd.remote_function(reuse=False, packages=[\"cryptography\"], cloud_function_service_account=\"default\")\n", "def get_hash(input: str) -> str:\n", " from cryptography.fernet import Fernet\n", "\n", @@ -1271,7 +1271,7 @@ } ], "source": [ - "@bpd.remote_function(reuse=False, packages=[\"humanize\"])\n", + 
"@bpd.remote_function(reuse=False, packages=[\"humanize\"], cloud_function_service_account=\"default\")\n", "def duration_category(duration_minutes: int) -> str:\n", " timedelta = dt.timedelta(minutes=duration_minutes)\n", " return humanize.naturaldelta(timedelta)\n", @@ -1442,7 +1442,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb index 78f0d27474..605f879bc7 100644 --- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb +++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb @@ -286,7 +286,9 @@ "source": [ "@bpd.remote_function(packages=[\"anthropic[vertex]\", \"google-auth[requests]\"],\n", " max_batching_rows=1, \n", - " bigquery_connection=\"bigframes-dev.us-east5.bigframes-rf-conn\") # replace with your connection\n", + " bigquery_connection=\"bigframes-dev.us-east5.bigframes-rf-conn\", # replace with your connection\n", + " cloud_function_service_account=\"default\",\n", + ")\n", "def anthropic_transformer(message: str) -> str:\n", " from anthropic import AnthropicVertex\n", " client = AnthropicVertex(region=LOCATION, project_id=PROJECT)\n", diff --git a/noxfile.py b/noxfile.py index bcab34d0c0..bb4ba91a3a 100644 --- a/noxfile.py +++ b/noxfile.py @@ -184,6 +184,14 @@ def lint_setup_py(session): session.install("docutils", "pygments") session.run("python", "setup.py", "check", "--restructuredtext", "--strict") + session.install("twine", "wheel") + shutil.rmtree("build", ignore_errors=True) + shutil.rmtree("dist", ignore_errors=True) + session.run("python", "setup.py", "sdist") + session.run( + "python", "-m", "twine", "check", *pathlib.Path("dist").glob("*.tar.gz") + ) + def install_unittest_dependencies(session, install_test_extra, *constraints): standard_deps = UNIT_TEST_STANDARD_DEPENDENCIES + UNIT_TEST_DEPENDENCIES diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py deleted file mode 100644 index 5cdcd6d3a7..0000000000 --- a/samples/snippets/gen_ai_model_test.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -def test_llm_model() -> None: - # Determine project id, in this case prefer the one set in the environment - # variable GOOGLE_CLOUD_PROJECT (if any) - import os - - PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") - REGION = "us" - CONN_NAME = "bigframes-default-connection" - - # [START bigquery_dataframes_gen_ai_model] - from bigframes.ml.llm import PaLM2TextGenerator - import bigframes.pandas as bpd - - # Create the LLM model - session = bpd.get_global_session() - connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" - model = PaLM2TextGenerator(session=session, connection_name=connection) - - df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") - - # Prepare the prompts and send them to the LLM model for prediction - df_prompt_prefix = "Generate Pandas sample code for DataFrame." - df_prompt = df_prompt_prefix + df_api["API"] - - # Predict using the model - df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024) - # [END bigquery_dataframes_gen_ai_model] - assert df_pred["ml_generate_text_llm_result"] is not None - assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py index c35daf35fc..3a7031ef89 100644 --- a/samples/snippets/remote_function.py +++ b/samples/snippets/remote_function.py @@ -47,9 +47,8 @@ def run_remote_function_and_read_gbq_function(project_id: str) -> None: # of the penguins, which is a real number, into a category, which is a # string. @bpd.remote_function( - float, - str, reuse=False, + cloud_function_service_account="default", ) def get_bucket(num: float) -> str: if not num: @@ -91,10 +90,9 @@ def get_bucket(num: float) -> str: # as a remote function. The custom function in this example has external # package dependency, which can be specified via `packages` parameter. @bpd.remote_function( - str, - str, reuse=False, packages=["cryptography"], + cloud_function_service_account="default", ) def get_hash(input: str) -> str: from cryptography.fernet import Fernet diff --git a/samples/snippets/text_generation_test.py b/samples/snippets/text_generation_test.py deleted file mode 100644 index c4df1dde3b..0000000000 --- a/samples/snippets/text_generation_test.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -def test_llm_text_generation() -> None: - # Determine project id, in this case prefer the one set in the environment - # variable GOOGLE_CLOUD_PROJECT (if any) - import os - - PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") - LOCATION = "US" - - # [START bigquery_dataframes_generate_text_tutorial_create_remote_model] - import bigframes - from bigframes.ml.llm import PaLM2TextGenerator - - bigframes.options.bigquery.project = PROJECT_ID - bigframes.options.bigquery.location = LOCATION - - model = PaLM2TextGenerator() - # [END bigquery_dataframes_generate_text_tutorial_create_remote_model] - assert model is not None - - # [START bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] - import bigframes.pandas as bpd - - df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) - df_prompt_prefix = "Extract the key words from the text below: " - df_prompt = df_prompt_prefix + df["review"] - - # Predict using the model - df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) - df_pred.peek(5) - # [END bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] - # peek() is used to show a preview of the results. If the output - # of this sample changes, also update the screenshot for the associated - # tutorial on cloud.google.com. - assert df_pred["ml_generate_text_llm_result"] is not None - assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None - - # [START bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] - import bigframes.pandas as bpd - - df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) - df_prompt_prefix = "perform sentiment analysis on the following text, return one the following categories: positive, negative: " - df_prompt = df_prompt_prefix + df["review"] - - # Predict using the model - df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) - df_pred.peek(5) - # [END bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] - # peek() is used to show a preview of the results. If the output - # of this sample changes, also update the screenshot for the associated - # tutorial on cloud.google.com. 
- - assert df_pred["ml_generate_text_llm_result"] is not None - assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None diff --git a/setup.py b/setup.py index 34e013c9a3..f0c336a1f5 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ "google-auth >=2.15.0,<3.0dev", "google-cloud-bigtable >=2.24.0", "google-cloud-pubsub >=2.21.4", - "google-cloud-bigquery[bqstorage,pandas] >=3.18.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-iam >=2.12.1", @@ -51,7 +51,7 @@ "jellyfish >=0.8.9,<1.1.2", "numpy >=1.24.0", "pandas >=1.5.3", - "pandas-gbq >=0.26.0", + "pandas-gbq >=0.26.1", "pyarrow >=15.0.2", "pydata-google-auth >=1.8.2", "requests >=2.27.1", @@ -117,6 +117,7 @@ version=version_id, description=description, long_description=readme, + long_description_content_type="text/x-rst", author="Google LLC", author_email="bigframes-feedback@google.com", license="Apache 2.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8c7c69efa7..5fc3d5f96f 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -6,7 +6,7 @@ geopandas==0.12.2 google-auth==2.15.0 google-cloud-bigtable==2.24.0 google-cloud-pubsub==2.21.4 -google-cloud-bigquery==3.18.0 +google-cloud-bigquery==3.31.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 @@ -15,7 +15,7 @@ google-cloud-storage==2.0.0 jellyfish==0.8.9 numpy==1.24.0 pandas==1.5.3 -pandas-gbq==0.26.0 +pandas-gbq==0.26.1 pyarrow==15.0.2 pydata-google-auth==1.8.2 requests==2.27.1 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 398ee8a6b2..a466b558b2 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -185,8 +185,13 @@ def session_tokyo(tokyo_location: str) -> Generator[bigframes.Session, None, Non @pytest.fixture(scope="session") -def bq_connection(bigquery_client: bigquery.Client) -> str: - return f"{bigquery_client.project}.{bigquery_client.location}.bigframes-rf-conn" +def bq_connection_name() -> str: + return "bigframes-rf-conn" + + +@pytest.fixture(scope="session") +def bq_connection(bigquery_client: bigquery.Client, bq_connection_name: str) -> str: + return f"{bigquery_client.project}.{bigquery_client.location}.{bq_connection_name}" @pytest.fixture(scope="session", autouse=True) @@ -460,7 +465,7 @@ def nested_structs_df( @pytest.fixture(scope="session") -def nested_structs_pandas_df() -> pd.DataFrame: +def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame: """pd.DataFrame pointing at test data.""" df = pd.read_json( @@ -468,6 +473,7 @@ def nested_structs_pandas_df() -> pd.DataFrame: lines=True, ) df = df.set_index("id") + df["person"] = df["person"].astype(nested_structs_pandas_type) return df diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index eabafd96fb..831ab71be7 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -166,10 +166,7 @@ def featurize(x: int) -> list[float]: cleanup_function_assets(featurize, session.bqclient, ignore_failures=False) -def test_managed_function_series_apply( - session, - scalars_dfs, -): +def test_managed_function_series_apply(session, scalars_dfs): try: @session.udf() @@ -504,7 +501,10 @@ def test_managed_function_dataframe_apply_axis_1_array_output(session): try: - @session.udf(input_types=[int, float, 
str], output_type=list[str]) + @session.udf( + input_types=[int, float, str], + output_type=list[str], + ) def foo(x, y, z): return [str(x), str(y), z] @@ -587,3 +587,41 @@ def foo(x, y, z): finally: # Clean up the gcp assets created for the managed function. cleanup_function_assets(foo, session.bqclient, ignore_failures=False) + + +@pytest.mark.parametrize( + "connection_fixture", + [ + "bq_connection_name", + "bq_connection", + ], +) +def test_managed_function_with_connection( + session, scalars_dfs, request, connection_fixture +): + try: + bigquery_connection = request.getfixturevalue(connection_fixture) + + @session.udf(bigquery_connection=bigquery_connection) + def foo(x: int) -> int: + return x + 10 + + # Function should still work normally. + assert foo(-2) == 8 + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result_col = scalars_df["int64_too"].apply(foo) + bf_result = ( + scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_result_col = scalars_pandas_df["int64_too"].apply(foo) + pd_result = ( + scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets(foo, session.bqclient, ignore_failures=False) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 1e5e7ede26..426813b0ff 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -17,11 +17,9 @@ import inspect import math # must keep this at top level to test udf referring global import import os.path -import re import shutil import tempfile import textwrap -import typing import warnings import google.api_core.exceptions @@ -111,11 +109,14 @@ def test_remote_function_multiply_with_ibis( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int, int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def multiply(x, y): return x * y @@ -165,11 +166,14 @@ def test_remote_function_stringify_with_ibis( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], str, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def stringify(x): return f"I got {x}" @@ -213,11 +217,14 @@ def func(x, y): return x * abs(y % 4) remote_func = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [str, int], str, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(func) scalars_df, scalars_pandas_df = scalars_dfs @@ -250,11 +257,14 @@ def func(x, y): return [len(x), abs(y % 4)] remote_func = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. 
[str, int], list[int], dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(func) scalars_df, scalars_pandas_df = scalars_dfs @@ -284,11 +294,14 @@ def test_remote_function_decorator_with_bigframes_series( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def square(x): return x * x @@ -330,11 +343,14 @@ def add_one(x): return x + 1 remote_add_one = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -380,7 +396,14 @@ def test_remote_function_input_types(session, scalars_dfs, input_types): def add_one(x): return x + 1 - remote_add_one = session.remote_function(input_types, int, reuse=False)(add_one) + remote_add_one = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. + input_types, + int, + reuse=False, + cloud_function_service_account="default", + )(add_one) assert remote_add_one.input_dtypes == (bigframes.dtypes.INT_DTYPE,) scalars_df, scalars_pandas_df = scalars_dfs @@ -406,11 +429,14 @@ def test_remote_function_explicit_dataset_not_created( try: @session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, - dataset_id_not_created, - bq_cf_connection, + dataset=dataset_id_not_created, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def square(x): return x * x @@ -459,11 +485,14 @@ def sign(num): return NO_SIGN remote_sign = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. [int], int, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(sign) scalars_df, scalars_pandas_df = scalars_dfs @@ -506,11 +535,14 @@ def circumference(radius): return 2 * mymath.pi * radius remote_circumference = session.remote_function( + # Make sure that the input/output types can be used positionally. + # This avoids the worst of the breaking change from 1.x to 2.x. 
[float], float, dataset_id, - bq_cf_connection, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(circumference) scalars_df, scalars_pandas_df = scalars_dfs @@ -555,11 +587,12 @@ def find_team(num): return _team_pi remote_find_team = session.remote_function( - [float], - str, - dataset_id, - bq_cf_connection, + input_types=[float], + output_type=str, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(find_team) scalars_df, scalars_pandas_df = scalars_dfs @@ -627,11 +660,12 @@ def add_one(x): # The first time both the cloud function and the bq remote function don't # exist and would be created remote_add_one = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=True, + cloud_function_service_account="default", )(add_one_uniq) # There should have been excactly one cloud function created at this point @@ -697,11 +731,12 @@ def inner_test(): # exist even though the remote function exists, and goes ahead and recreates # the cloud function remote_add_one = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=True, + cloud_function_service_account="default", )(add_one_uniq) # There should be excactly one cloud function again @@ -743,11 +778,12 @@ def is_odd(num): return flag is_odd_remote = session.remote_function( - [int], - bool, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=bool, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(is_odd) scalars_df, scalars_pandas_df = scalars_dfs @@ -783,11 +819,12 @@ def is_odd(num): return flag is_odd_remote = session.remote_function( - [int], - bool, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=bool, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(is_odd) scalars_df, scalars_pandas_df = scalars_dfs @@ -817,11 +854,12 @@ def test_remote_udf_lambda(session, scalars_dfs, dataset_id, bq_cf_connection): add_one_lambda = lambda x: x + 1 # noqa: E731 add_one_lambda_remote = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(add_one_lambda) scalars_df, scalars_pandas_df = scalars_dfs @@ -872,12 +910,13 @@ def square(x): # Create the remote function with the name provided explicitly square_remote = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, name=rf_name, + cloud_function_service_account="default", )(square) # The remote function should reflect the explicitly provided name @@ -925,12 +964,13 @@ def pd_np_foo(x): # Create the remote function with the name provided explicitly pd_np_foo_remote = session.remote_function( - [int], - float, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=float, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, reuse=False, packages=["numpy", "pandas >= 2.0.0"], + cloud_function_service_account="default", )(pd_np_foo) # 
The behavior of the created remote function should be as expected @@ -1005,11 +1045,12 @@ def test_internal(rf, udf): # Create a new remote function with the name provided explicitly square_remote1 = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, name=rf_name, + cloud_function_service_account="default", )(square_uniq) # The remote function should reflect the explicitly provided name @@ -1030,11 +1071,12 @@ def test_internal(rf, udf): # explicitly. Since reuse is True by default, the previously created # remote function with the same name will be reused. square_remote2 = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, name=rf_name, + cloud_function_service_account="default", )(square_uniq) # The new remote function should still reflect the explicitly provided name @@ -1074,11 +1116,12 @@ def plusone(x): # created remote function with the same name should not be reused since # this time it is a different user code. plusone_remote = session.remote_function( - [int], - int, - dataset_id, - bq_cf_connection, + input_types=[int], + output_type=int, + dataset=dataset_id, + bigquery_connection=bq_cf_connection, name=rf_name, + cloud_function_service_account="default", )(plusone_uniq) # The new remote function should still reflect the explicitly provided name @@ -1139,7 +1182,13 @@ def test_remote_function_via_session_context_connection_setter( # unique dataset_id, even though the cloud function would be reused, the bq # remote function would still be created, making use of the bq connection # set in the BigQueryOptions above. - @session.remote_function([int], int, dataset=dataset_id, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + dataset=dataset_id, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1174,7 +1223,13 @@ def square(x): def test_remote_function_default_connection(session, scalars_dfs, dataset_id): try: - @session.remote_function([int], int, dataset=dataset_id, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + dataset=dataset_id, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1209,7 +1264,13 @@ def square(x): def test_remote_function_runtime_error(session, scalars_dfs, dataset_id): try: - @session.remote_function([int], int, dataset=dataset_id, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + dataset=dataset_id, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1233,7 +1294,12 @@ def test_remote_function_anonymous_dataset(session, scalars_dfs): # function in the bigframes session's anonymous dataset. Use reuse=False # param to make sure parallel instances of the test don't step over each # other due to the common anonymous dataset. 
- @session.remote_function([int], int, reuse=False) + @session.remote_function( + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + ) def square(x): return x * x @@ -1290,14 +1356,27 @@ def test_remote_function_via_session_custom_sa(scalars_dfs): try: + # TODO(shobs): Figure out why the default ingress setting + # (internal-only) does not work here @rf_session.remote_function( - [int], int, reuse=False, cloud_function_service_account=gcf_service_account + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account=gcf_service_account, + cloud_function_ingress_settings="all", ) def square_num(x): if x is None: return x return x * x + # assert that the GCF is created with the intended SA + gcf = rf_session.cloudfunctionsclient.get_function( + name=square_num.bigframes_cloud_function + ) + assert gcf.service_config.service_account_email == gcf_service_account + + # assert that the function works as expected on data scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] @@ -1309,12 +1388,6 @@ def square_num(x): pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - # Assert that the GCF is created with the intended SA - gcf = rf_session.cloudfunctionsclient.get_function( - name=square_num.bigframes_cloud_function - ) - assert gcf.service_config.service_account_email == gcf_service_account finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -1322,38 +1395,12 @@ def square_num(x): ) -@pytest.mark.parametrize( - ("remote_function_args"), - [ - pytest.param( - {}, - id="no-set", - ), - pytest.param( - {"cloud_function_service_account": None}, - id="set-none", - ), - ], -) -def test_remote_function_warns_default_cloud_function_service_account( - session, remote_function_args -): - with pytest.warns(FutureWarning) as record: - session.remote_function(**remote_function_args) - - len( - [ - warn - for warn in record - if re.search( - ( - "You have not explicitly set a user-managed.*Using the default Compute Engine.*service account" - ), - typing.cast(FutureWarning, warn.message).args[0], - re.DOTALL, - ) - ] - ) == 1 +def test_remote_function_throws_none_cloud_function_service_account(session): + with pytest.raises( + ValueError, + match='^You must provide a user managed cloud_function_service_account, or "default" if you would like to let the default service account be used.$', + ): + session.remote_function(cloud_function_service_account=None) @pytest.mark.flaky(retries=2, delay=120) @@ -1378,9 +1425,10 @@ def test_remote_function_with_gcf_cmek(): try: @session.remote_function( - [int], - int, + input_types=[int], + output_type=int, reuse=False, + cloud_function_service_account="default", cloud_function_kms_key_name=cmek, cloud_function_docker_repository=docker_repository, ) @@ -1452,10 +1500,24 @@ def square_num(x): return x return x * x + # TODO(shobs): See if the test vpc can be configured to make this flow + # work with the default ingress setting (internal-only) square_num_remote = rf_session.remote_function( - [int], int, reuse=False, cloud_function_vpc_connector=gcf_vpc_connector + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + cloud_function_vpc_connector=gcf_vpc_connector, + cloud_function_ingress_settings="all", )(square_num) + # assert that the GCF is created with the intended vpc connector + gcf = 
rf_session.cloudfunctionsclient.get_function( + name=square_num_remote.bigframes_cloud_function + ) + assert gcf.service_config.vpc_connector == gcf_vpc_connector + + # assert that the function works as expected on data scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] @@ -1467,12 +1529,6 @@ def square_num(x): pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - # Assert that the GCF is created with the intended vpc connector - gcf = rf_session.cloudfunctionsclient.get_function( - name=square_num_remote.bigframes_cloud_function - ) - assert gcf.service_config.vpc_connector == gcf_vpc_connector finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -1495,7 +1551,11 @@ def square(x): return x * x square_remote = session.remote_function( - [int], int, reuse=False, max_batching_rows=max_batching_rows + input_types=[int], + output_type=int, + reuse=False, + max_batching_rows=max_batching_rows, + cloud_function_service_account="default", )(square) bq_routine = session.bqclient.get_routine( @@ -1534,7 +1594,11 @@ def square(x): return x * x square_remote = session.remote_function( - [int], int, reuse=False, **timeout_args + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + **timeout_args, )(square) # Assert that the GCF is created with the intended maximum timeout @@ -1560,7 +1624,13 @@ def square(x): def test_remote_function_gcf_timeout_max_supported_exceeded(session): with pytest.raises(ValueError): - @session.remote_function([int], int, reuse=False, cloud_function_timeout=1201) + @session.remote_function( + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + cloud_function_timeout=1201, + ) def square(x): return x * x @@ -1583,7 +1653,11 @@ def square(x): return x * x square_remote = session.remote_function( - [int], int, reuse=False, **max_instances_args + input_types=[int], + output_type=int, + reuse=False, + cloud_function_service_account="default", + **max_instances_args, )(square) # Assert that the GCF is created with the intended max instance count @@ -1632,7 +1706,10 @@ def serialize_row(row): ) serialize_row_remote = session.remote_function( - bigframes.series.Series, str, reuse=False + input_types=bigframes.series.Series, + output_type=str, + reuse=False, + cloud_function_service_account="default", )(serialize_row) assert getattr(serialize_row_remote, "is_row_processor") @@ -1678,7 +1755,10 @@ def analyze(row): ) analyze_remote = session.remote_function( - bigframes.series.Series, str, reuse=False + input_types=bigframes.series.Series, + output_type=str, + reuse=False, + cloud_function_service_account="default", )(analyze) assert getattr(analyze_remote, "is_row_processor") @@ -1799,7 +1879,10 @@ def serialize_row(row): ) serialize_row_remote = session.remote_function( - bigframes.series.Series, str, reuse=False + input_types=bigframes.series.Series, + output_type=str, + reuse=False, + cloud_function_service_account="default", )(serialize_row) assert getattr(serialize_row_remote, "is_row_processor") @@ -1856,7 +1939,10 @@ def float_parser(row): return float(row["text"]) float_parser_remote = session.remote_function( - bigframes.series.Series, float, reuse=False + input_types=bigframes.series.Series, + output_type=float, + reuse=False, + cloud_function_service_account="default", )(float_parser) assert getattr(float_parser_remote, 
"is_row_processor") @@ -1901,7 +1987,9 @@ def test_remote_function_gcf_memory( def square(x: int) -> int: return x * x - square_remote = session.remote_function(reuse=False, **memory_mib_args)(square) + square_remote = session.remote_function( + reuse=False, cloud_function_service_account="default", **memory_mib_args + )(square) # Assert that the GCF is created with the intended memory gcf = session.cloudfunctionsclient.get_function( @@ -1936,7 +2024,11 @@ def test_remote_function_gcf_memory_unsupported(session, memory_mib): match="Invalid value specified for container memory", ): - @session.remote_function(reuse=False, cloud_function_memory_mib=memory_mib) + @session.remote_function( + reuse=False, + cloud_function_service_account="default", + cloud_function_memory_mib=memory_mib, + ) def square(x: int) -> int: return x * x @@ -1947,7 +2039,7 @@ def test_remote_function_unnamed_removed_w_session_cleanup(): session = bigframes.connect() # create an unnamed remote function in the session - @session.remote_function(reuse=False) + @session.remote_function(reuse=False, cloud_function_service_account="default") def foo(x: int) -> int: return x + 1 @@ -1989,7 +2081,9 @@ def test_remote_function_named_perists_w_session_cleanup(): name = test_utils.prefixer.Prefixer("bigframes", "").create_prefix() # create an unnamed remote function in the session - @session.remote_function(reuse=False, name=name) + @session.remote_function( + reuse=False, name=name, cloud_function_service_account="default" + ) def foo(x: int) -> int: return x + 1 @@ -2030,14 +2124,16 @@ def test_remote_function_clean_up_by_session_id(): # without it, and later confirm that the former is deleted when the session # is cleaned up by session id, but the latter remains ## unnamed - @session.remote_function(reuse=False) + @session.remote_function(reuse=False, cloud_function_service_account="default") def foo_unnamed(x: int) -> int: return x + 1 ## named rf_name = test_utils.prefixer.Prefixer("bigframes", "").create_prefix() - @session.remote_function(reuse=False, name=rf_name) + @session.remote_function( + reuse=False, name=rf_name, cloud_function_service_account="default" + ) def foo_named(x: int) -> int: return x + 2 @@ -2104,7 +2200,12 @@ def test_df_apply_axis_1_multiple_params(session): try: - @session.remote_function([int, float, str], str, reuse=False) + @session.remote_function( + input_types=[int, float, str], + output_type=str, + reuse=False, + cloud_function_service_account="default", + ) def foo(x, y, z): return f"I got {x}, {y} and {z}" @@ -2179,7 +2280,12 @@ def test_df_apply_axis_1_multiple_params_array_output(session): try: - @session.remote_function([int, float, str], list[str], reuse=False) + @session.remote_function( + input_types=[int, float, str], + output_type=list[str], + reuse=False, + cloud_function_service_account="default", + ) def foo(x, y, z): return [str(x), str(y), z] @@ -2259,7 +2365,12 @@ def test_df_apply_axis_1_single_param_non_series(session): try: - @session.remote_function([int], str, reuse=False) + @session.remote_function( + input_types=[int], + output_type=str, + reuse=False, + cloud_function_service_account="default", + ) def foo(x): return f"I got {x}" @@ -2313,7 +2424,7 @@ def test_df_apply_axis_1_array_output(session, scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs try: - @session.remote_function(reuse=False) + @session.remote_function(reuse=False, cloud_function_service_account="default") def generate_stats(row: pandas.Series) -> list[int]: import pandas as pd @@ -2356,13 
+2467,13 @@ def generate_stats(row: pandas.Series) -> list[int]: [ pytest.param( {}, - functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, - True, + functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_ONLY, + False, id="no-set", ), pytest.param( {"cloud_function_ingress_settings": None}, - functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, + functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_ONLY, True, id="set-none", ), @@ -2402,17 +2513,16 @@ def square(x: int) -> int: return x * x square_remote = session.remote_function( - reuse=False, **ingress_settings_args + reuse=False, + cloud_function_service_account="default", + **ingress_settings_args, )(square) default_ingress_setting_warnings = [ warn for warn in record - if isinstance(warn.message, FutureWarning) - and "`cloud_function_ingress_settings` are set to 'all' by default" - in warn.message.args[0] - and "will change to 'internal-only' for enhanced security in future" - in warn.message.args[0] + if isinstance(warn.message, UserWarning) + and "The `cloud_function_ingress_settings` is being set to 'internal-only' by default." ] assert len(default_ingress_setting_warnings) == ( 1 if expect_default_ingress_setting_warning else 0 @@ -2443,7 +2553,11 @@ def test_remote_function_ingress_settings_unsupported(session): ValueError, match="'unknown' not one of the supported ingress settings values" ): - @session.remote_function(reuse=False, cloud_function_ingress_settings="unknown") + @session.remote_function( + reuse=False, + cloud_function_service_account="default", + cloud_function_ingress_settings="unknown", + ) def square(x: int) -> int: return x * x @@ -2475,6 +2589,7 @@ def add_one(x: int) -> int: dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", )(add_one) temporary_bigquery_remote_function = ( @@ -2552,6 +2667,7 @@ def add_one(x: int) -> int: bigquery_connection=bq_cf_connection, reuse=False, name=name, + cloud_function_service_account="default", )(add_one) persistent_bigquery_remote_function = ( @@ -2619,6 +2735,7 @@ def test_remote_function_array_output( dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def featurize(x: int) -> list[array_dtype]: # type: ignore return [array_dtype(i) for i in [x, x + 1, x + 2]] @@ -2657,6 +2774,7 @@ def test_remote_function_array_output_partial_ordering_mode( dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def featurize(x: float) -> list[float]: # type: ignore return [x, x + 1, x + 2] @@ -2698,6 +2816,7 @@ def test_remote_function_array_output_multiindex( dataset=dataset_id, bigquery_connection=bq_cf_connection, reuse=False, + cloud_function_service_account="default", ) def featurize(x: int) -> list[float]: return [x, x + 0.5, x + 0.33] @@ -2720,3 +2839,33 @@ def featurize(x: int) -> list[float]: cleanup_function_assets( featurize, session.bqclient, session.cloudfunctionsclient ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_connection_path_format( + session, scalars_dfs, dataset_id, bq_cf_connection +): + try: + + @session.remote_function( + dataset=dataset_id, + bigquery_connection=f"projects/{session.bqclient.project}/locations/{session._location}/connections/{bq_cf_connection}", + reuse=False, + cloud_function_service_account="default", + ) + def foo(x: int) -> int: + return x + 1 + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = 
scalars_df["int64_too"] + bf_result = bf_int64_col.apply(foo).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_too"] + pd_result = pd_int64_col.apply(foo) + + # ignore any dtype disparity + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index 7801f5dada..d4428c1f95 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -13,9 +13,11 @@ # limitations under the License. import typing -import warnings from google.cloud import bigquery +from google.cloud.bigquery_storage import types as bqstorage_types +import pandas +import pandas.testing import pytest import bigframes @@ -41,6 +43,7 @@ def _assert_bq_execution_location( assert typing.cast(bigquery.QueryJob, df.query_job).location == expected_location + # Ensure operation involving BQ client suceeds result = ( df[["name", "number"]] .groupby("name") @@ -53,6 +56,35 @@ def _assert_bq_execution_location( typing.cast(bigquery.QueryJob, result.query_job).location == expected_location ) + expected_result = pandas.DataFrame( + {"number": [444, 222]}, index=pandas.Index(["aaa", "bbb"], name="name") + ) + pandas.testing.assert_frame_equal( + expected_result, result.to_pandas(), check_dtype=False, check_index_type=False + ) + + # Ensure BQ Storage Read client operation succceeds + table = result.query_job.destination + requested_session = bqstorage_types.ReadSession( # type: ignore[attr-defined] + table=f"projects/{table.project}/datasets/{table.dataset_id}/tables/{table.table_id}", + data_format=bqstorage_types.DataFormat.ARROW, # type: ignore[attr-defined] + ) + read_session = session.bqstoragereadclient.create_read_session( + parent=f"projects/{table.project}", + read_session=requested_session, + max_stream_count=1, + ) + reader = session.bqstoragereadclient.read_rows(read_session.streams[0].name) + frames = [] + for message in reader.rows().pages: + frames.append(message.to_dataframe()) + read_dataframe = pandas.concat(frames) + # normalize before comparing since we lost some of the bigframes column + # naming abtractions in the direct read of the destination table + read_dataframe = read_dataframe.set_index("name") + read_dataframe.columns = result.columns + pandas.testing.assert_frame_equal(expected_result, read_dataframe) + def test_bq_location_default(): session = bigframes.Session() @@ -119,22 +151,14 @@ def test_bq_location_non_canonical(set_location, resolved_location): sorted(bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS), ) def test_bq_rep_endpoints(bigquery_location): - with warnings.catch_warnings(record=True) as record: - warnings.simplefilter("always") - session = bigframes.Session( - context=bigframes.BigQueryOptions( - location=bigquery_location, use_regional_endpoints=True - ) - ) - assert ( - len([warn for warn in record if isinstance(warn.message, FutureWarning)]) - == 0 + session = bigframes.Session( + context=bigframes.BigQueryOptions( + location=bigquery_location, use_regional_endpoints=True ) + ) - # Verify that location and endpoints are correctly set for the BigQuery API + # Verify that location and endpoint is correctly set for the BigQuery API # client - # TODO(shobs): Figure out if the same can be verified for the other API - # clients. 
assert session.bqclient.location == bigquery_location assert ( session.bqclient._connection.API_BASE_URL @@ -143,36 +167,52 @@ def test_bq_rep_endpoints(bigquery_location): ) ) + # Verify that endpoint is correctly set for the BigQuery Storage API client + # TODO(shobs): Figure out if we can verify that location is set in the + # BigQuery Storage API client. + assert ( + session.bqstoragereadclient.api_endpoint + == f"bigquerystorage.{bigquery_location}.rep.googleapis.com" + ) + # assert that bigframes session honors the location _assert_bq_execution_location(session) +def test_clients_provider_no_location(): + with pytest.raises(ValueError, match="Must set location to use regional endpoints"): + bigframes.session.clients.ClientsProvider(use_regional_endpoints=True) + + @pytest.mark.parametrize( "bigquery_location", # Sort the set to avoid nondeterminism. - sorted(bigframes.constants.LEP_ENABLED_BIGQUERY_LOCATIONS), + sorted(bigframes.constants.REP_NOT_ENABLED_BIGQUERY_LOCATIONS), ) -def test_bq_lep_endpoints(bigquery_location): - # We are not testing BigFrames Session for LEP endpoints because it involves - # query execution using the endpoint, which requires the project to be - # allowlisted for LEP access. We could hardcode one project which is - # allowlisted but then not every open source developer will have access to - # that. Let's rely on just creating the clients for LEP. - with pytest.warns(FutureWarning) as record: - clients_provider = bigframes.session.clients.ClientsProvider( +def test_clients_provider_use_regional_endpoints_non_rep_locations(bigquery_location): + with pytest.raises( + ValueError, + match=f"not .*available in the location {bigquery_location}", + ): + bigframes.session.clients.ClientsProvider( location=bigquery_location, use_regional_endpoints=True ) - assert len(record) == 1 - assert bigquery_location in typing.cast(Warning, record[0].message).args[0] - # Verify that location and endpoints are correctly set for the BigQuery API - # client - # TODO(shobs): Figure out if the same can be verified for the other API - # clients. - assert clients_provider.bqclient.location == bigquery_location - assert ( - clients_provider.bqclient._connection.API_BASE_URL - == "https://{location}-bigquery.googleapis.com".format( - location=bigquery_location + +@pytest.mark.parametrize( + "bigquery_location", + # Sort the set to avoid nondeterminism. 
+ sorted(bigframes.constants.REP_NOT_ENABLED_BIGQUERY_LOCATIONS), +) +def test_session_init_fails_to_use_regional_endpoints_non_rep_endpoints( + bigquery_location, +): + with pytest.raises( + ValueError, + match=f"not .*available in the location {bigquery_location}", + ): + bigframes.Session( + context=bigframes.BigQueryOptions( + location=bigquery_location, use_regional_endpoints=True + ) ) - ) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 57fc878643..00f690ed54 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -36,11 +36,7 @@ def test_json_set_at_json_path(json_path, expected_json): actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) @pytest.mark.parametrize( @@ -60,11 +56,7 @@ def test_json_set_at_json_value_type(json_value, expected_json): actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_set_w_more_pairs(): @@ -77,11 +69,7 @@ def test_json_set_w_more_pairs(): expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}'] expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_set_w_invalid_value_type(): @@ -114,11 +102,7 @@ def test_json_extract_from_json(): actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_from_string(): @@ -129,11 +113,7 @@ def test_json_extract_from_string(): actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow")) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_w_invalid_series_type(): @@ -165,11 +145,7 @@ def test_json_extract_array_from_json(): expected.index.name = None expected.name = None - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + 
pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_array_from_json_strings(): @@ -183,11 +159,7 @@ def test_json_extract_array_from_json_strings(): dtype=pd.ArrowDtype(pa.list_(pa.string())), ) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_array_from_json_array_strings(): @@ -201,11 +173,7 @@ def test_json_extract_array_from_json_array_strings(): dtype=pd.ArrowDtype(pa.list_(pa.string())), ) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_array_w_invalid_series_type(): @@ -219,11 +187,7 @@ def test_json_extract_string_array_from_json_strings(): actual = bbq.json_extract_string_array(s, "$.a") expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]]) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_string_array_from_array_strings(): @@ -231,11 +195,7 @@ def test_json_extract_string_array_from_array_strings(): actual = bbq.json_extract_string_array(s) expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_string_array_as_float_array_from_array_strings(): @@ -243,11 +203,7 @@ def test_json_extract_string_array_as_float_array_from_array_strings(): actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE) expected = bpd.Series([[1, 2.5, 3], [], [4, 5]]) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd.testing.assert_series_equal( - actual.to_pandas(allow_large_results=True), - expected.to_pandas(allow_large_results=True), - ) + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) def test_json_extract_string_array_w_invalid_series_type(): diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py index c7704ec86d..0e66256f98 100644 --- a/tests/system/small/blob/test_properties.py +++ b/tests/system/small/blob/test_properties.py @@ -55,10 +55,7 @@ def test_blob_version(images_mm_df: bpd.DataFrame): def test_blob_metadata(images_mm_df: bpd.DataFrame): - # allow_large_result=False incompatible with json b/401630655 - with bigframes.option_context( - "bigquery.allow_large_results", True, "experiments.blob", True - ): + with bigframes.option_context("experiments.blob", True): actual = images_mm_df["blob_col"].blob.metadata().to_pandas() expected = pd.Series( [ diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 0af7f4e42e..51e0459014 100644 --- a/tests/system/small/functions/test_remote_function.py +++ 
b/tests/system/small/functions/test_remote_function.py @@ -25,6 +25,7 @@ import test_utils.prefixer import bigframes +import bigframes.clients import bigframes.dtypes import bigframes.exceptions from bigframes.functions import _utils as bff_utils @@ -93,6 +94,11 @@ def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: return session +def get_bq_connection_id_path_format(connection_id_dot_format): + fields = connection_id_dot_format.split(".") + return f"projects/{fields[0]}/locations/{fields[1]}/connections/{fields[2]}" + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, @@ -107,8 +113,8 @@ def square(x): return x * x square = bff.remote_function( - int, - int, + input_types=int, + output_type=int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, @@ -118,6 +124,7 @@ def square(x): # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -154,11 +161,8 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_location_specified( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_location( + session, scalars_dfs, dataset_id_permanent, bq_cf_connection_location, @@ -167,17 +171,15 @@ def square(x): return x * x square = bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, + input_types=int, + output_type=int, + session=session, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -208,11 +210,8 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_location_mismatched( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_location_mismatched( + session, dataset_id_permanent, bq_cf_connection_location_mismatched, ): @@ -221,31 +220,41 @@ def square(x): # connection doesn't match the location of the dataset. return x * x # pragma: NO COVER - with pytest.raises( - ValueError, - match=re.escape("The location does not match BigQuery connection location:"), - ): - bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, - dataset=dataset_id_permanent, - bigquery_connection=bq_cf_connection_location_mismatched, - # See e2e tests for tests that actually deploy the Cloud Function. 
- reuse=True, - name=get_function_name(square), - )(square) + bq_cf_connection_location_mismatched_path_fmt = get_bq_connection_id_path_format( + bigframes.clients.get_canonical_bq_connection_id( + bq_cf_connection_location_mismatched, + session.bqclient.project, + session._location, + ) + ) + connection_ids = [ + bq_cf_connection_location_mismatched, + bq_cf_connection_location_mismatched_path_fmt, + ] + + for connection_id in connection_ids: + with pytest.raises( + ValueError, + match=re.escape( + "The location does not match BigQuery connection location:" + ), + ): + bff.remote_function( + input_types=int, + output_type=int, + session=session, + dataset=dataset_id_permanent, + bigquery_connection=connection_id, + # See e2e tests for tests that actually deploy the Cloud Function. + reuse=True, + name=get_function_name(square), + cloud_function_service_account="default", + )(square) @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_location_project_specified( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_location_project( + session, scalars_dfs, dataset_id_permanent, bq_cf_connection_location_project, @@ -254,17 +263,15 @@ def square(x): return x * x square = bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, + input_types=int, + output_type=int, + session=session, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -295,11 +302,8 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_no_session_param_project_mismatched( - bigquery_client, - bigqueryconnection_client, - cloudfunctions_client, - resourcemanager_client, +def test_remote_function_connection_w_project_mismatched( + session, dataset_id_permanent, bq_cf_connection_location_project_mismatched, ): @@ -308,25 +312,38 @@ def square(x): # connection doesn't match the project of the dataset. return x * x # pragma: NO COVER - with pytest.raises( - ValueError, - match=re.escape( - "The project_id does not match BigQuery connection gcp_project_id:" - ), - ): - bff.remote_function( - int, - int, - bigquery_client=bigquery_client, - bigquery_connection_client=bigqueryconnection_client, - cloud_functions_client=cloudfunctions_client, - resource_manager_client=resourcemanager_client, - dataset=dataset_id_permanent, - bigquery_connection=bq_cf_connection_location_project_mismatched, - # See e2e tests for tests that actually deploy the Cloud Function. 
- reuse=True, - name=get_function_name(square), - )(square) + bq_cf_connection_location_project_mismatched_path_fmt = ( + get_bq_connection_id_path_format( + bigframes.clients.get_canonical_bq_connection_id( + bq_cf_connection_location_project_mismatched, + session.bqclient.project, + session._location, + ) + ) + ) + connection_ids = [ + bq_cf_connection_location_project_mismatched, + bq_cf_connection_location_project_mismatched_path_fmt, + ] + + for connection_id in connection_ids: + with pytest.raises( + ValueError, + match=re.escape( + "The project_id does not match BigQuery connection gcp_project_id:" + ), + ): + bff.remote_function( + input_types=int, + output_type=int, + session=session, + dataset=dataset_id_permanent, + bigquery_connection=connection_id, + # See e2e tests for tests that actually deploy the Cloud Function. + reuse=True, + name=get_function_name(square), + cloud_function_service_account="default", + )(square) @pytest.mark.flaky(retries=2, delay=120) @@ -337,11 +354,12 @@ def square(x): return x * x square = bff.remote_function( - int, - int, + input_types=int, + output_type=int, session=session_with_bq_connection, dataset=dataset_id_permanent, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -386,7 +404,11 @@ def square(x): # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. square = session_with_bq_connection.remote_function( - int, int, dataset_id_permanent, name=get_function_name(square) + input_types=int, + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. @@ -424,13 +446,14 @@ def square(x): return x * x square = session.remote_function( - int, - int, - dataset_id_permanent, - bq_cf_connection, + input_types=int, + output_type=int, + dataset=dataset_id_permanent, + bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, name=get_function_name(square), + cloud_function_service_account="default", )(square) # Function should still work normally. 
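
The two connection id spellings exercised above, the "project.location.connection" dot format and the full resource path, are treated as interchangeable once canonicalized. A minimal sketch of the conversion that the get_bq_connection_id_path_format test helper performs (the project, location, and connection names below are placeholders):

def to_path_format(connection_id_dot_format: str) -> str:
    # Split "project.location.connection" into its three components and
    # rebuild the resource-path spelling.
    project, location, connection = connection_id_dot_format.split(".")
    return f"projects/{project}/locations/{location}/connections/{connection}"

assert (
    to_path_format("my-project.us.my-conn")
    == "projects/my-project/locations/us/connections/my-conn"
)
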
@@ -468,7 +491,11 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_function_name(add_one) + input_types=[int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -499,7 +526,11 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_function_name(add_one) + input_types=[int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -530,7 +561,11 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_function_name(add_one) + input_types=[int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -576,6 +611,7 @@ def bytes_to_hex(mybytes: bytes) -> bytes: dataset=dataset_id_permanent, name=get_function_name(bytes_to_hex, package_requirements=packages), packages=packages, + cloud_function_service_account="default", )(bytes_to_hex) bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() @@ -618,10 +654,11 @@ def add_one(x): return x + 1 # pragma: NO COVER session.remote_function( - [int], - int, + input_types=[int], + output_type=int, dataset=dataset_id_permanent, name=get_function_name(add_one), + cloud_function_service_account="default", )(add_one) @@ -651,8 +688,8 @@ def square1(x): return x * x square1 = bff.remote_function( - [int], - int, + input_types=[int], + output_type=int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, dataset=dataset_id_permanent, @@ -661,6 +698,7 @@ def square1(x): bigquery_connection=bq_cf_connection, reuse=True, name=get_function_name(square1), + cloud_function_service_account="default", )(square1) # Function should still work normally. 
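
The hunks above all migrate callers to the 2.0 calling convention: input_types, output_type, and dataset are passed by keyword, and cloud_function_service_account is spelled out. A hedged sketch of what a caller now writes; "my_dataset" is a placeholder dataset id, and actually running this would deploy a real Cloud Function:

import bigframes

session = bigframes.Session()

@session.remote_function(
    input_types=[int],  # keyword-only in 2.0
    output_type=int,
    dataset="my_dataset",  # placeholder dataset id
    cloud_function_service_account="default",  # now explicit
)
def add_one(x):
    return x + 1
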
@@ -1135,10 +1173,11 @@ def add_ints(row): match="input_types=Series is in preview.", ): add_ints_remote = session.remote_function( - bigframes.series.Series, - int, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=int, + dataset=dataset_id_permanent, name=get_function_name(add_ints, is_row_processor=True), + cloud_function_service_account="default", )(add_ints) assert add_ints_remote.bigframes_remote_function # type: ignore assert add_ints_remote.bigframes_bigquery_function # type: ignore @@ -1187,10 +1226,11 @@ def add_ints(row): return row["int64_col"] + row["int64_too"] add_ints_remote = session.remote_function( - bigframes.series.Series, - int, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=int, + dataset=dataset_id_permanent, name=get_function_name(add_ints, is_row_processor=True), + cloud_function_service_account="default", )(add_ints) bf_result = ( @@ -1226,10 +1266,11 @@ def add_numbers(row): return row["x"] + row["y"] add_numbers_remote = session.remote_function( - bigframes.series.Series, - float, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=float, + dataset=dataset_id_permanent, name=get_function_name(add_numbers, is_row_processor=True), + cloud_function_service_account="default", )(add_numbers) bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() @@ -1279,10 +1320,11 @@ def echo_len(row): return len(row) echo_len_remote = session.remote_function( - bigframes.series.Series, - float, - dataset_id_permanent, + input_types=bigframes.series.Series, + output_type=float, + dataset=dataset_id_permanent, name=get_function_name(echo_len, is_row_processor=True), + cloud_function_service_account="default", )(echo_len) for column in columns_with_not_supported_dtypes: @@ -1315,7 +1357,9 @@ def should_mask(name: str) -> bool: assert "name" in inspect.signature(should_mask).parameters should_mask = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(should_mask) + dataset=dataset_id_permanent, + name=get_function_name(should_mask), + cloud_function_service_account="default", )(should_mask) s = bigframes.series.Series(["Alice", "Bob", "Caroline"]) @@ -1374,7 +1418,9 @@ def is_odd(x: int) -> bool: # create a remote function is_odd_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(is_odd) + dataset=dataset_id_permanent, + name=get_function_name(is_odd), + cloud_function_service_account="default", )(is_odd) # with nulls in the series the remote function application would fail @@ -1424,7 +1470,9 @@ def add(x: int, y: int) -> int: # create a remote function add_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(add) + dataset=dataset_id_permanent, + name=get_function_name(add), + cloud_function_service_account="default", )(add) # with nulls in the series the remote function application would fail @@ -1477,7 +1525,9 @@ def add(x: int, y: int, z: float) -> float: # create a remote function add_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(add) + dataset=dataset_id_permanent, + name=get_function_name(add), + cloud_function_service_account="default", )(add) # pandas does not support nary functions, so let's create a proxy function @@ -1533,6 +1583,7 @@ def is_long_duration(minutes: int) -> bool: is_long_duration = unordered_session.remote_function( dataset=dataset_id_permanent, name=get_function_name(is_long_duration), + 
cloud_function_service_account="default", )(is_long_duration) method = getattr(df["duration_minutes"], method) @@ -1551,7 +1602,9 @@ def combiner(x: int, y: int) -> int: return x combiner = unordered_session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(combiner) + dataset=dataset_id_permanent, + name=get_function_name(combiner), + cloud_function_service_account="default", )(combiner) df = scalars_df_index[["int64_col", "int64_too", "float64_col", "string_col"]] @@ -1567,7 +1620,9 @@ def processor(x: int, y: int, z: float, w: str) -> str: return f"I got x={x}, y={y}, z={z} and w={w}" processor = unordered_session.remote_function( - dataset=dataset_id_permanent, name=get_function_name(processor) + dataset=dataset_id_permanent, + name=get_function_name(processor), + cloud_function_service_account="default", )(processor) df = scalars_df_index[["int64_col", "int64_too", "float64_col", "string_col"]] diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 1843da41d7..d56874719e 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -202,64 +202,6 @@ def xgboost_iris_df(session, xgboost_iris_pandas_df): return session.read_pandas(xgboost_iris_pandas_df) -@pytest.fixture(scope="session") -def bqml_palm2_text_generator_model(session, bq_connection) -> core.BqmlModel: - options = { - "remote_service_type": "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1", - } - return globals.bqml_model_factory().create_remote_model( - session=session, connection_name=bq_connection, options=options - ) - - -@pytest.fixture(scope="session") -def palm2_text_generator_model(session, bq_connection) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection) - - -@pytest.fixture(scope="session") -def palm2_text_generator_32k_model(session, bq_connection) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator( - model_name="text-bison-32k", session=session, connection_name=bq_connection - ) - - -@pytest.fixture(scope="function") -def ephemera_palm2_text_generator_model( - session, bq_connection -) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection) - - -@pytest.fixture(scope="session") -def palm2_embedding_generator_model( - session, bq_connection -) -> llm.PaLM2TextEmbeddingGenerator: - return llm.PaLM2TextEmbeddingGenerator( - session=session, connection_name=bq_connection - ) - - -@pytest.fixture(scope="session") -def palm2_embedding_generator_model_002( - session, bq_connection -) -> llm.PaLM2TextEmbeddingGenerator: - return llm.PaLM2TextEmbeddingGenerator( - version="002", session=session, connection_name=bq_connection - ) - - -@pytest.fixture(scope="session") -def palm2_embedding_generator_multilingual_model( - session, bq_connection -) -> llm.PaLM2TextEmbeddingGenerator: - return llm.PaLM2TextEmbeddingGenerator( - model_name="textembedding-gecko-multilingual", - session=session, - connection_name=bq_connection, - ) - - @pytest.fixture(scope="session") def linear_remote_model_params() -> dict: # Pre-deployed endpoint of linear reg model in Vertex. 
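
With the PaLM2 fixtures deleted above, a session-scoped fixture for the replacement model family would take roughly the following shape. This is a sketch, assuming the same session and bq_connection fixtures as the removed code, with gemini-2.0-flash-001 chosen because the test parametrizations later in this diff add it:

import pytest

from bigframes.ml import llm


@pytest.fixture(scope="session")
def gemini_text_generator_model(session, bq_connection) -> llm.GeminiTextGenerator:
    # Gemini-backed stand-in for the removed PaLM2 fixtures; the keyword
    # arguments mirror the deleted code.
    return llm.GeminiTextGenerator(
        model_name="gemini-2.0-flash-001",
        session=session,
        connection_name=bq_connection,
    )
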
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 1827858353..3c5ba9bb18 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -390,27 +390,6 @@ def test_remote_model_predict( ) -@pytest.mark.flaky(retries=2) -def test_model_generate_text( - bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df -): - options = { - "temperature": 0.5, - "max_output_tokens": 100, - "top_k": 20, - "top_p": 0.5, - "flatten_json_output": True, - } - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - df = bqml_palm2_text_generator_model.generate_text( - llm_text_df, options=options - ).to_pandas(allow_large_results=True) - - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - @pytest.mark.parametrize("id_col_name", [None, "id"]) def test_model_forecast( time_series_bqml_arima_plus_model: core.BqmlModel, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 7e7a532f79..747f9ce954 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -24,187 +24,6 @@ from tests.system import utils -# Until b/401630655 is resolved, ML apis return json, not compatible with allow_large_results=False -@pytest.fixture(scope="module", autouse=True) -def always_create_table(): - with bigframes.option_context("bigquery.allow_large_results", True): - yield - - -def test_create_load_text_generator_model( - palm2_text_generator_model, dataset_id, bq_connection -): - # Model creation doesn't return error - assert palm2_text_generator_model is not None - assert palm2_text_generator_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = palm2_text_generator_model.to_gbq( - f"{dataset_id}.temp_text_model", replace=True - ) - assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.model_name == "text-bison" - assert reloaded_model.connection_name == bq_connection - - -def test_create_load_text_generator_32k_model( - palm2_text_generator_32k_model, dataset_id, bq_connection -): - # Model creation doesn't return error - assert palm2_text_generator_32k_model is not None - assert palm2_text_generator_32k_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = palm2_text_generator_32k_model.to_gbq( - f"{dataset_id}.temp_text_model", replace=True - ) - assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.model_name == "text-bison-32k" - assert reloaded_model.connection_name == bq_connection - - -@pytest.mark.flaky(retries=2) -def test_create_text_generator_model_default_session( - bq_connection, llm_text_pandas_df, bigquery_client -): - import bigframes.pandas as bpd - - # Note: This starts a thread-local session. 
- with bpd.option_context( - "bigquery.bq_connection", - bq_connection, - "bigquery.location", - "US", - ): - model = llm.PaLM2TextGenerator() - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-rf-conn" - ) - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - df = model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_create_text_generator_32k_model_default_session( - bq_connection, llm_text_pandas_df, bigquery_client -): - import bigframes.pandas as bpd - - # Note: This starts a thread-local session. - with bpd.option_context( - "bigquery.bq_connection", - bq_connection, - "bigquery.location", - "US", - ): - model = llm.PaLM2TextGenerator(model_name="text-bison-32k") - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-rf-conn" - ) - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - df = model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_create_text_generator_model_default_connection( - llm_text_pandas_df, bigquery_client -): - from bigframes import _config - import bigframes.pandas as bpd - - bpd.close_session() - _config.options = _config.Options() # reset configs - - llm_text_df = bpd.read_pandas(llm_text_pandas_df) - - model = llm.PaLM2TextGenerator() - assert model is not None - assert model._bqml_model is not None - assert ( - model.connection_name.casefold() - == f"{bigquery_client.project}.us.bigframes-default-connection" - ) - - df = model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -# Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. 
-@pytest.mark.flaky(retries=2) -def test_text_generator_predict_default_params_success( - palm2_text_generator_model, llm_text_df -): - df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_series_default_params_success( - palm2_text_generator_model, llm_text_df -): - df = palm2_text_generator_model.predict(llm_text_df["prompt"]).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_arbitrary_col_label_success( - palm2_text_generator_model, llm_text_df -): - llm_text_df = llm_text_df.rename(columns={"prompt": "arbitrary"}) - df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_multiple_cols_success( - palm2_text_generator_model, llm_text_df: bpd.DataFrame -): - df = llm_text_df.assign(additional_col=1) - pd_df = palm2_text_generator_model.predict(df).to_pandas() - utils.check_pandas_df_schema_and_index( - pd_df, - columns=utils.ML_GENERATE_TEXT_OUTPUT + ["additional_col"], - index=3, - col_exact=False, - ) - - -@pytest.mark.flaky(retries=2) -def test_text_generator_predict_with_params_success( - palm2_text_generator_model, llm_text_df -): - df = palm2_text_generator_model.predict( - llm_text_df, temperature=0.5, max_output_tokens=100, top_k=20, top_p=0.5 - ).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False - ) - - @pytest.mark.parametrize( "model_name", ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), @@ -316,6 +135,8 @@ def test_multimodal_embedding_generator_predict_default_params_success( "gemini-1.5-flash-001", "gemini-1.5-flash-002", "gemini-2.0-flash-exp", + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", ), ) @pytest.mark.flaky( @@ -922,50 +743,6 @@ def test_text_embedding_generator_retry_no_progress(session, bq_connection): ) -@pytest.mark.flaky(retries=2) -def test_llm_palm_score(llm_fine_tune_df_default_index): - model = llm.PaLM2TextGenerator(model_name="text-bison") - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index[["prompt"]], - y=llm_fine_tune_df_default_index[["label"]], - ).to_pandas() - utils.check_pandas_df_schema_and_index( - score_result, - columns=[ - "bleu4_score", - "rouge-l_precision", - "rouge-l_recall", - "rouge-l_f1_score", - "evaluation_status", - ], - index=1, - ) - - -@pytest.mark.flaky(retries=2) -def test_llm_palm_score_params(llm_fine_tune_df_default_index): - model = llm.PaLM2TextGenerator(model_name="text-bison", max_iterations=1) - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index["prompt"], - y=llm_fine_tune_df_default_index["label"], - task_type="classification", - ).to_pandas() - utils.check_pandas_df_schema_and_index( - score_result, - columns=[ - "precision", - "recall", - "f1_score", - "label", - "evaluation_status", - ], - ) - - @pytest.mark.flaky(retries=2) @pytest.mark.parametrize( "model_name", @@ -1023,41 +800,6 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, 
model_name) ) -def test_palm2_text_generator_deprecated(): - with pytest.warns(exceptions.ApiDeprecationWarning): - llm.PaLM2TextGenerator() - - -def test_palm2_text_embedding_deprecated(): - with pytest.warns(exceptions.ApiDeprecationWarning): - try: - llm.PaLM2TextEmbeddingGenerator() - except (Exception): - pass - - -@pytest.mark.parametrize( - "model_name", - ( - "gemini-1.5-pro-001", - "gemini-1.5-pro-002", - "gemini-1.5-flash-001", - "gemini-1.5-flash-002", - ), -) -def test_gemini_text_generator_deprecated(model_name): - with pytest.warns(exceptions.ApiDeprecationWarning): - llm.GeminiTextGenerator(model_name=model_name) - - -def test_gemini_pro_text_generator_deprecated(): - with pytest.warns(exceptions.ApiDeprecationWarning): - try: - llm.GeminiTextGenerator(model_name="gemini-pro") - except (Exception): - pass - - @pytest.mark.parametrize( "model_name", ( diff --git a/tests/system/small/ml/test_register.py b/tests/system/small/ml/test_register.py index 6d8ff0a712..f21567da63 100644 --- a/tests/system/small/ml/test_register.py +++ b/tests/system/small/ml/test_register.py @@ -14,9 +14,7 @@ from typing import cast -import pytest - -from bigframes.ml import core, imported, linear_model, llm +from bigframes.ml import core, imported, linear_model def test_linear_reg_register( @@ -53,13 +51,6 @@ def test_linear_reg_register_with_params( ) -def test_palm2_text_generator_register( - ephemera_palm2_text_generator_model: llm.PaLM2TextGenerator, -): - with pytest.raises(AttributeError): - ephemera_palm2_text_generator_model.register() # type: ignore - - def test_imported_tensorflow_register( ephemera_imported_tensorflow_model: imported.TensorFlowModel, ): diff --git a/tests/system/small/test_bq_sessions.py b/tests/system/small/test_bq_sessions.py new file mode 100644 index 0000000000..e470728061 --- /dev/null +++ b/tests/system/small/test_bq_sessions.py @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from concurrent.futures import ThreadPoolExecutor + +import google +import google.api_core.exceptions +import google.cloud +from google.cloud import bigquery +import pytest + +from bigframes.session import bigquery_session + +TEST_SCHEMA = [ + bigquery.SchemaField("bool field", "BOOLEAN"), + bigquery.SchemaField("string field", "STRING"), + bigquery.SchemaField("float array_field", "FLOAT", mode="REPEATED"), + bigquery.SchemaField( + "struct field", + "RECORD", + fields=(bigquery.SchemaField("int subfield", "INTEGER"),), + ), +] + + +@pytest.fixture +def session_resource_manager( + bigquery_client, +) -> bigquery_session.SessionResourceManager: + return bigquery_session.SessionResourceManager(bigquery_client, "US") + + +def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client): + session_resource_manager = bigquery_session.SessionResourceManager( + bigquery_client, "US" + ) + cluster_cols = ["string field", "bool field"] + + session_table_ref = session_resource_manager.create_temp_table( + TEST_SCHEMA, cluster_cols=cluster_cols + ) + session_resource_manager._keep_session_alive() + + result_table = bigquery_client.get_table(session_table_ref) + assert result_table.schema == TEST_SCHEMA + assert result_table.clustering_fields == cluster_cols + + session_resource_manager.close() + with pytest.raises(google.api_core.exceptions.NotFound): + bigquery_client.get_table(session_table_ref) + + +def test_bq_session_create_multi_temp_tables(bigquery_client: bigquery.Client): + session_resource_manager = bigquery_session.SessionResourceManager( + bigquery_client, "US" + ) + + def create_table(): + return session_resource_manager.create_temp_table(TEST_SCHEMA) + + with ThreadPoolExecutor() as executor: + results = [executor.submit(create_table) for i in range(10)] + + for future in results: + table = future.result() + result_table = bigquery_client.get_table(table) + assert result_table.schema == TEST_SCHEMA + + session_resource_manager.close() diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8cc3be1577..914c953f99 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4607,13 +4607,12 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub ], ) def test_df_drop_duplicates_w_json(json_df, keep): - bf_df = json_df.drop_duplicates(keep=keep).to_pandas(allow_large_results=True) + bf_df = json_df.drop_duplicates(keep=keep).to_pandas() # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible # with Arrow string extension types. Temporary conversion to standard Pandas # strings is required. 
-    # allow_large_results=True for b/401630655
-    json_pandas_df = json_df.to_pandas(allow_large_results=True)
+    json_pandas_df = json_df.to_pandas()
     json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
         pd.StringDtype(storage="pyarrow")
     )
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index 2b6dfefb12..491b56d5fc 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -16,6 +16,7 @@
 import typing

 import pandas as pd
+import pyarrow as pa
 import pytest
 import pytz

@@ -39,6 +40,16 @@ def test_concat_dataframe(scalars_dfs, ordered):
     assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)


+def test_concat_dataframe_w_struct_cols(nested_structs_df, nested_structs_pandas_df):
+    """Avoid regressions for internal issue 407107482"""
+    empty_bf_df = bpd.DataFrame(session=nested_structs_df._block.session)
+    bf_result = bpd.concat((empty_bf_df, nested_structs_df), ignore_index=True)
+    bf_result = bf_result.to_pandas()
+    pd_result = pd.concat((pd.DataFrame(), nested_structs_pandas_df), ignore_index=True)
+    pd_result.index = pd_result.index.astype("Int64")
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
 def test_concat_series(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     bf_result = bpd.concat(
@@ -388,150 +399,229 @@ def test_merge_series(scalars_dfs, merge_how):


 def _convert_pandas_category(pd_s: pd.Series):
+    """
+    Transforms a pandas Series with Categorical dtype into a bigframes-compatible
+    Series representing intervals.
+    """
+    # When `labels=False`, pandas returns integer codes; map them to Int64.
+    if pd.api.types.is_integer_dtype(pd_s.dtype) or pd.api.types.is_float_dtype(
+        pd_s.dtype
+    ):
+        return pd_s.astype("Int64")
+
     if not isinstance(pd_s.dtype, pd.CategoricalDtype):
-        raise ValueError("Input must be a pandas Series with categorical data.")
+        raise ValueError(
+            f"Input must be a pandas Series with categorical data: {pd_s.dtype}"
+        )

-    if len(pd_s.dtype.categories) == 0:
-        return pd.Series([pd.NA] * len(pd_s), name=pd_s.name)
+    if pd.api.types.is_object_dtype(pd_s.cat.categories.dtype):
+        return pd_s.astype(pd.StringDtype(storage="pyarrow"))

-    pd_interval: pd.IntervalIndex = pd_s.cat.categories[pd_s.cat.codes]  # type: ignore
-    if pd_interval.closed == "left":
+    if not isinstance(pd_s.cat.categories.dtype, pd.IntervalDtype):
+        raise ValueError(
+            f"Must be an IntervalDtype with categorical data: {pd_s.cat.categories.dtype}"
+        )
+
+    if pd_s.cat.categories.dtype.closed == "left":  # type: ignore
         left_key = "left_inclusive"
         right_key = "right_exclusive"
     else:
         left_key = "left_exclusive"
         right_key = "right_inclusive"
-    return pd.Series(
-        [
-            {left_key: interval.left, right_key: interval.right}
+
+    subtype = pd_s.cat.categories.dtype.subtype  # type: ignore
+    if pd.api.types.is_float_dtype(subtype):
+        interval_dtype = pa.float64()
+    elif pd.api.types.is_integer_dtype(subtype):
+        interval_dtype = pa.int64()
+    else:
+        raise ValueError(f"Unknown category type: {subtype}")
+
+    dtype = pd.ArrowDtype(
+        pa.struct(
+            [
+                pa.field(left_key, interval_dtype, nullable=True),
+                pa.field(right_key, interval_dtype, nullable=True),
+            ]
+        )
+    )
+
+    if len(pd_s.dtype.categories) == 0:
+        data = [pd.NA] * len(pd_s)
+    else:
+        data = [
+            {left_key: interval.left, right_key: interval.right}  # type: ignore
             if pd.notna(val)
             else pd.NA
-            for val, interval in zip(pd_s, pd_interval)
-        ],
+            for val, interval in zip(pd_s, pd_s.cat.categories[pd_s.cat.codes])  # type: ignore
+        ]
+
+    return pd.Series(
+        data=data,
         name=pd_s.name,
+        dtype=dtype,
+        
index=pd_s.index.astype("Int64"), ) @pytest.mark.parametrize( - ("right"), + ("right", "labels"), [ - pytest.param(True), - pytest.param(False), + pytest.param(True, None, id="right_w_none_labels"), + pytest.param(True, False, id="right_w_false_labels"), + pytest.param(False, None, id="left_w_none_labels"), + pytest.param(False, False, id="left_w_false_labels"), ], ) -def test_cut(scalars_dfs, right): +def test_cut_by_int_bins(scalars_dfs, labels, right): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False, right=right) - bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False, right=right) + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=labels, right=right) + bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels, right=right) - # make sure the result is a supported dtype - assert bf_result.dtype == bpd.Int64Dtype() - pd_result = pd_result.astype("Int64") + pd_result = _convert_pandas_category(pd_result) pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) -@pytest.mark.parametrize( - ("right"), - [ - pytest.param(True), - pytest.param(False), - ], -) -def test_cut_default_labels(scalars_dfs, right): +def test_cut_by_int_bins_w_labels(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, right=right) - bf_result = bpd.cut(scalars_df["float64_col"], 5, right=right).to_pandas() + labels = ["A", "B", "C", "D", "E"] + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=labels) + bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels) - # Convert to match data format - pd_result_converted = _convert_pandas_category(pd_result) - pd.testing.assert_series_equal( - bf_result, pd_result_converted, check_index=False, check_dtype=False - ) + pd_result = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( - ("breaks", "right"), + ("breaks", "right", "labels"), [ - pytest.param([0, 5, 10, 15, 20, 100, 1000], True, id="int_right"), - pytest.param([0, 5, 10, 15, 20, 100, 1000], False, id="int_left"), - pytest.param([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5], False, id="float_left"), - pytest.param([0, 5, 10.5, 15.5, 20, 100, 1000.5], True, id="mixed_right"), + pytest.param( + [0, 5, 10, 15, 20, 100, 1000], + True, + None, + id="int_breaks_w_right_closed_and_none_labels", + ), + pytest.param( + [0, 5, 10, 15, 20, 100, 1000], + False, + False, + id="int_breaks_w_left_closed_and_false_labels", + ), + pytest.param( + [0.5, 10.5, 15.5, 20.5, 100.5, 1000.5], + False, + None, + id="float_breaks_w_left_closed_and_none_labels", + ), + pytest.param( + [0, 5, 10.5, 15.5, 20, 100, 1000.5], + True, + False, + id="mixed_types_breaks_w_right_closed_and_false_labels", + ), ], ) -def test_cut_numeric_breaks(scalars_dfs, breaks, right): +def test_cut_by_numeric_breaks(scalars_dfs, breaks, right, labels): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks, right=right) - bf_result = bpd.cut(scalars_df["float64_col"], breaks, right=right).to_pandas() + pd_result = pd.cut( + scalars_pandas_df["float64_col"], breaks, right=right, labels=labels + ) + bf_result = bpd.cut( + scalars_df["float64_col"], breaks, right=right, labels=labels + ).to_pandas() - # Convert to match data format pd_result_converted = _convert_pandas_category(pd_result) - - pd.testing.assert_series_equal( - bf_result, pd_result_converted, 
check_index=False, check_dtype=False
-    )
+    pd.testing.assert_series_equal(bf_result, pd_result_converted)


-@pytest.mark.parametrize(
-    "bins",
-    [
-        pytest.param([], id="empty_list"),
-        pytest.param(
-            [1], id="single_int_list", marks=pytest.mark.skip(reason="b/404338651")
-        ),
-        pytest.param(pd.IntervalIndex.from_tuples([]), id="empty_interval_index"),
-    ],
-)
-def test_cut_w_edge_cases(scalars_dfs, bins):
+def test_cut_by_numeric_breaks_w_labels(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
-    bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas()
-    if isinstance(bins, list):
-        bins = pd.IntervalIndex.from_tuples(bins)
-    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False)

-    # Convert to match data format
-    pd_result_converted = _convert_pandas_category(pd_result)
+    bins = [0, 5, 10, 15, 20]
+    labels = ["A", "B", "C", "D"]
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels)
+    bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels)

-    pd.testing.assert_series_equal(
-        bf_result, pd_result_converted, check_index=False, check_dtype=False
-    )
+    pd_result = _convert_pandas_category(pd_result)
+    pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


 @pytest.mark.parametrize(
-    ("bins", "right"),
+    ("bins", "right", "labels"),
     [
-        pytest.param([(-5, 2), (2, 3), (-3000, -10)], True, id="tuple_right"),
-        pytest.param([(-5, 2), (2, 3), (-3000, -10)], False, id="tuple_left"),
+        pytest.param(
+            [(-5, 2), (2, 3), (-3000, -10)], True, None, id="tuple_right_w_none_labels"
+        ),
+        pytest.param(
+            [(-5, 2), (2, 3), (-3000, -10)],
+            False,
+            False,
+            id="tuple_left_w_false_labels",
+        ),
         pytest.param(
             pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),
             True,
-            id="interval_right",
+            False,
+            id="interval_right_w_false_labels",
         ),
         pytest.param(
             pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),
             False,
-            id="interval_left",
+            None,
+            id="interval_left_w_none_labels",
         ),
     ],
 )
-def test_cut_with_interval(scalars_dfs, bins, right):
+def test_cut_by_interval_bins(scalars_dfs, bins, right, labels):
     scalars_df, scalars_pandas_df = scalars_dfs
     bf_result = bpd.cut(
-        scalars_df["int64_too"], bins, labels=False, right=right
+        scalars_df["int64_too"], bins, labels=labels, right=right
     ).to_pandas()

     if isinstance(bins, list):
         bins = pd.IntervalIndex.from_tuples(bins)
-    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False, right=right)
+    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels, right=right)

-    # Convert to match data format
     pd_result_converted = _convert_pandas_category(pd_result)
+    pd.testing.assert_series_equal(bf_result, pd_result_converted)

-    pd.testing.assert_series_equal(
-        bf_result, pd_result_converted, check_index=False, check_dtype=False
-    )
+
+def test_cut_by_interval_bins_w_labels(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bins = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)])
+    labels = ["A", "B", "C", "D", "E"]
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels)
+    bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels)
+
+    pd_result = _convert_pandas_category(pd_result)
+    pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
+
+
+@pytest.mark.parametrize(
+    ("bins", "labels"),
+    [
+        pytest.param([], None, id="empty_breaks"),
+        pytest.param([1], False, id="single_int_breaks"),
+        pytest.param(pd.IntervalIndex.from_tuples([]), None, id="empty_interval_index"),
+    ],
+)
+def 
test_cut_by_edge_cases_bins(scalars_dfs, bins, labels): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=labels).to_pandas() + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels) + + pd_result_converted = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result, pd_result_converted) + + +def test_cut_empty_array_raises_error(): + bf_df = bpd.Series([]) + with pytest.raises(ValueError, match="Cannot cut empty array"): + bpd.cut(bf_df, bins=5) @pytest.mark.parametrize( diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 3139ae5225..9c61c8ea5b 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -55,6 +55,19 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, with bf.option_context("display.progress_bar", "terminal"): penguins_df_default_index["body_mass_g"].head(10).mean() + assert capsys.readouterr().out == "" + + +def test_progress_bar_scalar_allow_large_results( + penguins_df_default_index: bf.dataframe.DataFrame, capsys +): + capsys.readouterr() # clear output + + with bf.option_context( + "display.progress_bar", "terminal", "bigquery.allow_large_results", "True" + ): + penguins_df_default_index["body_mass_g"].head(10).mean() + assert_loading_msg_exist(capsys.readouterr().out) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5ca055dc43..e9d3e6e4da 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -322,24 +322,22 @@ def test_series_construct_local_unordered_has_sequential_index(unordered_session def test_series_construct_w_dtype_for_json(): - # Until b/401630655 is resolved, json, not compatible with allow_large_results=False - with bigframes.option_context("bigquery.allow_large_results", True): - data = [ - "1", - '"str"', - "false", - '["a", {"b": 1}, null]', - None, - '{"a": {"b": [1, 2, 3], "c": true}}', - ] - s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) + data = [ + "1", + '"str"', + "false", + '["a", {"b": 1}, null]', + None, + '{"a": {"b": [1, 2, 3], "c": true}}', + ] + s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) - assert s[0] == "1" - assert s[1] == '"str"' - assert s[2] == "false" - assert s[3] == '["a",{"b":1},null]' - assert pd.isna(s[4]) - assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' + assert s[0] == "1" + assert s[1] == '"str"' + assert s[2] == "false" + assert s[3] == '["a",{"b":1},null]' + assert pd.isna(s[4]) + assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' def test_series_keys(scalars_dfs): @@ -402,8 +400,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - series_pandas = series.to_pandas(allow_large_results=True) + series_pandas = series.to_pandas() assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert series_pandas.shape[0] == json_pandas_df.shape[0] @@ -4384,13 +4381,13 @@ def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col def test_series_struct_get_field_by_attribute( - nested_structs_df, nested_structs_pandas_df, nested_structs_pandas_type + nested_structs_df, nested_structs_pandas_df ): if Version(pd.__version__) < Version("2.2.0"): pytest.skip("struct accessor is not supported before pandas 2.2") bf_series = 
nested_structs_df["person"] - df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type) + df_series = nested_structs_pandas_df["person"] pd.testing.assert_series_equal( bf_series.address.city.to_pandas(), diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e286c40450..9daaa9aeeb 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -630,8 +630,7 @@ def test_read_gbq_w_json(session): ) ), """ - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True) + df = session.read_gbq(sql, index_col="id") assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) @@ -651,17 +650,14 @@ def test_read_gbq_w_json_and_compare_w_pandas_json(session): df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - result = df.to_pandas(allow_large_results=True) - # These JSON strings are compatible with BigQuery's JSON storage, pd_df = pd.DataFrame( {"json_col": ['{"bar":true,"foo":10}']}, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), ) pd_df.index = pd_df.index.astype("Int64") - pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) - pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) + pd.testing.assert_series_equal(df.dtypes, pd_df.dtypes) + pd.testing.assert_series_equal(df["json_col"].to_pandas(), pd_df["json_col"]) def test_read_gbq_w_json_in_struct(session): @@ -697,9 +693,6 @@ def test_read_gbq_w_json_in_struct(session): data = df["struct_col"].struct.field("data") assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - data = data.to_pandas(allow_large_results=True) - assert data[0] == '{"boolean":true}' assert data[1] == '{"int":100}' assert data[2] == '{"float":0.98}' @@ -738,10 +731,7 @@ def test_read_gbq_w_json_in_array(session): assert data.list.len()[0] == 7 assert data.list[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - # TODO(b/401630655): JSON is not compatible with allow_large_results=False - pd_data = data.to_pandas(allow_large_results=True) - - assert pd_data[0] == [ + assert data[0] == [ '{"boolean":true}', '{"int":100}', '{"float":0.98}', @@ -873,7 +863,6 @@ def test_read_pandas_timedelta_dataframes(session, write_engine): def test_read_pandas_timedelta_series(session, write_engine): expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d")) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = ( session.read_pandas(expected_series, write_engine=write_engine) .to_pandas() @@ -896,10 +885,9 @@ def test_read_pandas_timedelta_index(session, write_engine): [1, 2, 3], unit="d" ) # to_timedelta returns an index - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = ( session.read_pandas(expected_index, write_engine=write_engine) - .to_pandas(allow_large_results=True) + .to_pandas() .astype("timedelta64[ns]") ) @@ -926,10 +914,9 @@ def test_read_pandas_json_dataframes(session, write_engine): {"my_col": pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE)} ) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_df, 
write_engine=write_engine - ).to_pandas(allow_large_results=True) + ).to_pandas() if write_engine == "bigquery_streaming": expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64") @@ -949,10 +936,9 @@ def test_read_pandas_json_series(session, write_engine): ] expected_series = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_series, write_engine=write_engine - ).to_pandas(allow_large_results=True) + ).to_pandas() pd.testing.assert_series_equal( actual_result, expected_series, check_index_type=False ) @@ -973,10 +959,9 @@ def test_read_pandas_json_index(session, write_engine): '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] expected_index: pd.Index = pd.Index(json_data, dtype=bigframes.dtypes.JSON_DTYPE) - # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_index, write_engine=write_engine - ).to_pandas(allow_large_results=True) + ).to_pandas() pd.testing.assert_index_equal(actual_result, expected_index) @@ -1004,10 +989,7 @@ def test_read_pandas_w_nested_json(session, write_engine): ), ) with pytest.raises(NotImplementedError, match="Nested JSON types, found in column"): - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - session.read_pandas(pd_s, write_engine=write_engine).to_pandas( - allow_large_results=True - ) + session.read_pandas(pd_s, write_engine=write_engine) @pytest.mark.parametrize( @@ -1036,10 +1018,7 @@ def test_read_pandas_w_nested_json_index(session, write_engine): with pytest.raises( NotImplementedError, match="Nested JSON types, found in the index" ): - # Until b/401630655 is resolved, json not compatible with allow_large_results=False - session.read_pandas(pd_idx, write_engine=write_engine).to_pandas( - allow_large_results=True - ) + session.read_pandas(pd_idx, write_engine=write_engine) @utils.skip_legacy_pandas diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index 98a74d4e4c..b8f3a612d4 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -183,3 +183,10 @@ def test_client_endpoints_override_set_shows_warning(): with pytest.warns(UserWarning): options.client_endpoints_override = {"bqclient": "endpoint_address"} + + +def test_default_options(): + options = bigquery_options.BigQueryOptions() + + assert options.allow_large_results is False + assert options.ordering_mode == "strict" diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index d377fb4d49..56003abf2d 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -42,7 +42,9 @@ def test_series_input_types_to_str(series_type): """Check that is_row_processor=True uses str as the input type to serialize a row.""" session = resources.create_bigquery_session() - remote_function_decorator = bff.remote_function(session=session) + remote_function_decorator = bff.remote_function( + session=session, cloud_function_service_account="default" + ) with pytest.warns( bigframes.exceptions.PreviewWarning, @@ -79,7 +81,9 @@ def test_supported_types_correspond(): def test_missing_input_types(): session = resources.create_bigquery_session() - remote_function_decorator = bff.remote_function(session=session) + remote_function_decorator = bff.remote_function( + 
+        session=session, cloud_function_service_account="default"
+    )
 
     def function_without_parameter_annotations(myparam) -> str:
         return str(myparam)
@@ -95,7 +99,9 @@ def function_without_parameter_annotations(myparam) -> str:
 
 def test_missing_output_type():
     session = resources.create_bigquery_session()
-    remote_function_decorator = bff.remote_function(session=session)
+    remote_function_decorator = bff.remote_function(
+        session=session, cloud_function_service_account="default"
+    )
 
     def function_without_return_annotation(myparam: int):
         return str(myparam)
diff --git a/tests/unit/test_clients.py b/tests/unit/test_clients.py
index 37450ececb..032512c26e 100644
--- a/tests/unit/test_clients.py
+++ b/tests/unit/test_clients.py
@@ -17,33 +17,51 @@
 from bigframes import clients
 
 
-def test_get_connection_name_full_connection_id():
-    connection_name = clients.resolve_full_bq_connection_name(
+def test_get_canonical_bq_connection_id_connection_id_only():
+    connection_id = clients.get_canonical_bq_connection_id(
         "connection-id", default_project="default-project", default_location="us"
     )
-    assert connection_name == "default-project.us.connection-id"
+    assert connection_id == "default-project.us.connection-id"
 
 
-def test_get_connection_name_full_location_connection_id():
-    connection_name = clients.resolve_full_bq_connection_name(
+def test_get_canonical_bq_connection_id_location_and_connection_id():
+    connection_id = clients.get_canonical_bq_connection_id(
         "eu.connection-id", default_project="default-project", default_location="us"
     )
-    assert connection_name == "default-project.eu.connection-id"
+    assert connection_id == "default-project.eu.connection-id"
 
 
-def test_get_connection_name_full_all():
-    connection_name = clients.resolve_full_bq_connection_name(
+def test_get_canonical_bq_connection_id_already_canonical():
+    connection_id = clients.get_canonical_bq_connection_id(
         "my-project.eu.connection-id",
         default_project="default-project",
         default_location="us",
     )
-    assert connection_name == "my-project.eu.connection-id"
+    assert connection_id == "my-project.eu.connection-id"
 
 
-def test_get_connection_name_full_raise_value_error():
-    with pytest.raises(ValueError):
-        clients.resolve_full_bq_connection_name(
+def test_get_canonical_bq_connection_id_invalid():
+    with pytest.raises(ValueError, match="Invalid connection id format"):
+        clients.get_canonical_bq_connection_id(
             "my-project.eu.connection-id.extra_field",
             default_project="default-project",
             default_location="us",
         )
+
+
+def test_get_canonical_bq_connection_id_valid_path():
+    connection_id = clients.get_canonical_bq_connection_id(
+        "projects/project_id/locations/northamerica-northeast1/connections/connection-id",
+        default_project="default-project",
+        default_location="us",
+    )
+    assert connection_id == "project_id.northamerica-northeast1.connection-id"
+
+
+def test_get_canonical_bq_connection_id_invalid_path():
+    with pytest.raises(ValueError, match="Invalid connection id format"):
+        clients.get_canonical_bq_connection_id(
+            "/projects/project_id/locations/northamerica-northeast1/connections/connection-id",
+            default_project="default-project",
+            default_location="us",
+        )
diff --git a/tests/unit/test_daemon.py b/tests/unit/test_daemon.py
new file mode 100644
index 0000000000..6b3acd7d7d
--- /dev/null
+++ b/tests/unit/test_daemon.py
@@ -0,0 +1,42 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import time
+from unittest.mock import MagicMock
+
+from bigframes.session.bigquery_session import RecurringTaskDaemon
+
+
+def test_recurring_task_daemon_calls():
+    mock_task = MagicMock()
+    daemon = RecurringTaskDaemon(
+        task=mock_task, frequency=datetime.timedelta(seconds=0.1)
+    )
+    daemon.start()
+    time.sleep(1.0)
+    daemon.stop()
+    time.sleep(0.5)
+    # Be lenient, but the number of calls should be in this ballpark regardless of scheduling hiccups.
+    assert mock_task.call_count > 6
+    assert mock_task.call_count < 12
+
+
+def test_recurring_task_daemon_never_started():
+    mock_task = MagicMock()
+    _ = RecurringTaskDaemon(
+        task=mock_task, frequency=datetime.timedelta(seconds=0.0001)
+    )
+    time.sleep(0.1)
+    assert mock_task.call_count == 0
diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py
index 64a287aaca..e8383512a6 100644
--- a/tests/unit/test_pandas.py
+++ b/tests/unit/test_pandas.py
@@ -91,13 +91,48 @@ def test_method_matches_session(method_name: str):
     assert pandas_signature.return_annotation == session_signature.return_annotation
 
 
-def test_cut_raises_with_labels():
+@pytest.mark.parametrize(
+    ("bins", "labels", "error_message"),
+    [
+        pytest.param(
+            5,
+            True,
+            "Bin labels must either be False, None or passed in as a list-like argument",
+            id="true",
+        ),
+        pytest.param(
+            5,
+            1.5,
+            "Bin labels must either be False, None or passed in as a list-like argument",
+            id="invalid_types",
+        ),
+        pytest.param(
+            2,
+            ["A"],
+            "must be same as the value of bins",
+            id="int_bins_mismatch",
+        ),
+        pytest.param(
+            [1, 2, 3],
+            ["A"],
+            "must be same as the number of bin edges",
+            id="iterator_bins_mismatch",
+        ),
+    ],
+)
+def test_cut_raises_with_invalid_labels(bins: int, labels, error_message: str):
+    mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
+    with pytest.raises(ValueError, match=error_message):
+        bigframes.pandas.cut(mock_series, bins, labels=labels)
+
+
+def test_cut_raises_with_unsupported_labels():
+    mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
+    labels = [1, 2]
     with pytest.raises(
-        NotImplementedError,
-        match="The 'labels' parameter must be either False or None.",
+        NotImplementedError, match=r".*only iterables of strings are supported.*"
     ):
-        mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
-        bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"])
+        bigframes.pandas.cut(mock_series, 2, labels=labels)  # type: ignore
 
 
 @pytest.mark.parametrize(
@@ -111,11 +146,21 @@ def test_cut_raises_with_labels():
             "`bins` iterable should contain tuples or numerics",
             id="iterable_w_wrong_type",
         ),
+        pytest.param(
+            [10, 3],
+            "left side of interval must be <= right side",
+            id="decreased_breaks",
+        ),
+        pytest.param(
+            [(1, 10), (2, 25)],
+            "Overlapping IntervalIndex is not accepted.",
+            id="overlapping_intervals",
+        ),
     ],
 )
 def test_cut_raises_with_invalid_bins(bins: int, error_message: str):
+    mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
     with pytest.raises(ValueError, match=error_message):
-        mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
         bigframes.pandas.cut(mock_series, bins, labels=False)
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index e59232ee85..8f3e150606 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4433,7 +4433,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
             to potentially reuse a previously deployed ``remote_function`` from
             the same user defined function.
 
-            >>> @bpd.remote_function(reuse=False)
+            >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default")
             ... def minutes_to_hours(x: int) -> float:
             ...     return x/60
 
@@ -4813,7 +4813,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs):
            to select only the necessary columns before calling `apply()`.
            Note: This feature is currently in **preview**.
 
-            >>> @bpd.remote_function(reuse=False)
+            >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default")
             ... def foo(row: pd.Series) -> int:
             ...     result = 1
             ...     result += row["col1"]
@@ -4828,7 +4828,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs):
            You could return an array output for every input row from the
            remote function.
 
-            >>> @bpd.remote_function(reuse=False)
+            >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default")
             ... def marks_analyzer(marks: pd.Series) -> list[float]:
             ...     import statistics
             ...     average = marks.mean()
@@ -4869,7 +4869,7 @@ def apply(self, func, *, axis=0, args=(), **kwargs):
 
             [2 rows x 3 columns]
 
-            >>> @bpd.remote_function(reuse=False)
+            >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default")
             ... def foo(x: int, y: int, z: int) -> float:
             ...     result = 1
             ...     result += x
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
index d911a303eb..fccaffdadf 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
@@ -31,8 +31,6 @@ def cut(
     age ranges. Supports binning into an equal number of bins, or a
     pre-specified array of bins.
 
-    ``labels=False`` implies you just want the bins back.
-
     **Examples:**
 
         >>> import bigframes.pandas as bpd
@@ -55,7 +53,16 @@ def cut(
         3    {'left_exclusive': 7.5, 'right_inclusive': 10.0}
         dtype: struct[pyarrow]
 
-        Cut with an integer (equal-width bins) and labels=False:
+        Cut with the same bins, but assign them specific labels:
+
+        >>> bpd.cut(s, bins=3, labels=["bad", "medium", "good"])
+        0       bad
+        1       bad
+        2    medium
+        3      good
+        dtype: string
+
+        ``labels=False`` implies you want the bins back.
 
         >>> bpd.cut(s, bins=4, labels=False)
         0    0
         1
@@ -67,7 +74,6 @@ def cut(
        Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex:
 
        >>> import pandas as pd
-
        >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
        >>> bpd.cut(s, bins=interval_index)
        0
@@ -107,7 +113,7 @@ def cut(
         dtype: struct[pyarrow]
 
     Args:
-        x (Series):
+        x (bigframes.pandas.Series):
             The input Series to be binned. Must be 1-dimensional.
         bins (int, pd.IntervalIndex, Iterable):
             The criteria to bin by.
@@ -127,10 +133,11 @@ def cut(
             ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
             indicate (1,2], (2,3], (3,4]. This argument is ignored when
             `bins` is an IntervalIndex.
-        labels (default None):
+        labels (bool, Iterable, default None):
             Specifies the labels for the returned bins. Must be the same
             length as the resulting bins. If False, returns only integer
-            indicators of the bins. This affects the type of the output container.
+            indicators of the bins. This affects the type of the output container. This argument is
+            ignored when `bins` is an IntervalIndex. If True, raises an error.
 
     Returns:
         bigframes.pandas.Series:
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 913a2e7c3e..a2d0983652 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1854,7 +1854,7 @@ def apply(
             to potentially reuse a previously deployed `remote_function` from
             the same user defined function.
 
-            >>> @bpd.remote_function(reuse=False)
+            >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default")
             ... def minutes_to_hours(x: int) -> float:
             ...     return x/60
 
@@ -1883,6 +1883,7 @@ def apply(
             >>> @bpd.remote_function(
             ...     reuse=False,
             ...     packages=["cryptography"],
+            ...     cloud_function_service_account="default"
             ... )
             ... def get_hash(input: str) -> str:
             ...     from cryptography.fernet import Fernet
@@ -1900,7 +1901,7 @@ def apply(
 
             You could return an array output from the remote function.
 
-            >>> @bpd.remote_function(reuse=False)
+            >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default")
             ... def text_analyzer(text: str) -> list[int]:
             ...     words = text.count(" ") + 1
             ...     periods = text.count(".")
@@ -5069,7 +5070,7 @@ def mask(self, cond, other):
             condition is evaluated based on a complicated business logic which
             cannot be expressed in form of a Series.
 
-            >>> @bpd.remote_function(reuse=False)
+            >>> @bpd.remote_function(reuse=False, cloud_function_service_account="default")
             ... def should_mask(name: str) -> bool:
             ...     hash = 0
             ...     for char_ in name:
@@ -5665,7 +5666,7 @@ def map(
 
             It also accepts a remote function:
 
-            >>> @bpd.remote_function()
+            >>> @bpd.remote_function(cloud_function_service_account="default")
             ... def my_mapper(val: str) -> str:
             ...     vowels = ["a", "e", "i", "o", "u"]
             ...     if val:
diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py
index 356e73a71d..a94498722d 100644
--- a/third_party/bigframes_vendored/version.py
+++ b/third_party/bigframes_vendored/version.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.42.0"
+__version__ = "2.0.0.dev0"
 
 # {x-release-please-start-date}
-__release_date__ = "2025-03-27"
+__release_date__ = "2025-03-31"
 # {x-release-please-end}
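Taken together, the `tests/unit/test_clients.py` changes above pin down the contract of the renamed `clients.get_canonical_bq_connection_id` helper: a bare connection id, a `location.connection-id` pair, an already-canonical `project.location.connection-id` triple, and the full `projects/<project>/locations/<location>/connections/<id>` resource path all normalize to the dotted canonical form, while anything else raises `ValueError`. The sketch below is one way to satisfy exactly those assertions; it is reconstructed from the tests, not the actual `bigframes.clients` implementation, which may differ in details.

```python
import re

# Full resource path: projects/<project>/locations/<location>/connections/<id>.
_CONNECTION_PATH_RE = re.compile(
    r"projects/([^/]+)/locations/([^/]+)/connections/([^/]+)"
)


def get_canonical_bq_connection_id(
    connection_id: str, default_project: str, default_location: str
) -> str:
    """Normalize a BQ connection id to 'project.location.connection-id'.

    Illustrative sketch derived from the unit tests in this change.
    """
    path_match = _CONNECTION_PATH_RE.fullmatch(connection_id)
    if path_match:
        # Collapse the resource path into the dotted canonical form.
        return ".".join(path_match.groups())

    if "/" in connection_id:
        # Anything else containing "/" (e.g. a leading slash) is malformed.
        raise ValueError(f"Invalid connection id format: {connection_id}.")

    parts = connection_id.split(".")
    if len(parts) == 1:  # bare connection id
        return f"{default_project}.{default_location}.{connection_id}"
    if len(parts) == 2:  # location.connection-id
        return f"{default_project}.{connection_id}"
    if len(parts) == 3:  # already canonical
        return connection_id
    raise ValueError(f"Invalid connection id format: {connection_id}.")
```

Matching the resource path with `re.fullmatch` before any dot-splitting keeps slash-containing inputs, such as the leading-slash path in `test_get_canonical_bq_connection_id_invalid_path`, on the error path rather than silently treating them as bare ids.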