diff --git a/.github/release-please.yml b/.github/release-please.yml index 466597e5b1..8c34d1b49f 100644 --- a/.github/release-please.yml +++ b/.github/release-please.yml @@ -1,2 +1,5 @@ releaseType: python handleGHRelease: true +extraFiles: + - bigframes/version.py + - third_party/bigframes_vendored/version.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 24a1d8cb62..9617d97c58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,42 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.39.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.38.0...v1.39.0) (2025-03-05) + + +### Features + +* (Preview) Support `diff()` for date series ([#1423](https://github.com/googleapis/python-bigquery-dataframes/issues/1423)) ([521e987](https://github.com/googleapis/python-bigquery-dataframes/commit/521e9874f1c7dcd80e10bfd86f1b467b0f6d6d6e)) +* (Preview) Support aggregations over timedeltas ([#1418](https://github.com/googleapis/python-bigquery-dataframes/issues/1418)) ([1251ded](https://github.com/googleapis/python-bigquery-dataframes/commit/1251dedac8faf383c931185a057a8bb26afb4b8f)) +* (Preview) Support arithmetics between dates and timedeltas ([#1413](https://github.com/googleapis/python-bigquery-dataframes/issues/1413)) ([962b152](https://github.com/googleapis/python-bigquery-dataframes/commit/962b152ce5a368132d1ac14f6d8348b7ba285694)) +* (Preview) Support automatic load of timedelta from BQ tables. ([#1429](https://github.com/googleapis/python-bigquery-dataframes/issues/1429)) ([b2917bb](https://github.com/googleapis/python-bigquery-dataframes/commit/b2917bb57212ac399c20356755c878d179454bfe)) +* Add `allow_large_results` option to many I/O methods. Set to `False` to reduce latency ([#1428](https://github.com/googleapis/python-bigquery-dataframes/issues/1428)) ([dd2f488](https://github.com/googleapis/python-bigquery-dataframes/commit/dd2f48893eced458afecc93dc17b7e22735c39b9)) +* Add `GeoSeries.boundary()` ([#1435](https://github.com/googleapis/python-bigquery-dataframes/issues/1435)) ([32cddfe](https://github.com/googleapis/python-bigquery-dataframes/commit/32cddfecd25ff4208473574df09a8010f8be0de9)) +* Add allow_large_results to peek ([#1448](https://github.com/googleapis/python-bigquery-dataframes/issues/1448)) ([67487b9](https://github.com/googleapis/python-bigquery-dataframes/commit/67487b9a3bbe07f1b76e0332fab693b4c4022529)) +* Add groupby.rank() ([#1433](https://github.com/googleapis/python-bigquery-dataframes/issues/1433)) ([3a633d5](https://github.com/googleapis/python-bigquery-dataframes/commit/3a633d5cc9c3e6a2bd8311c8834b406db5cb8699)) +* Iloc multiple columns selection. 
([#1437](https://github.com/googleapis/python-bigquery-dataframes/issues/1437)) ([ddfd02a](https://github.com/googleapis/python-bigquery-dataframes/commit/ddfd02a83040847f6d4642420d3bd32a4a855001)) +* Support interface for BigQuery managed functions ([#1373](https://github.com/googleapis/python-bigquery-dataframes/issues/1373)) ([2bbf53f](https://github.com/googleapis/python-bigquery-dataframes/commit/2bbf53f0d92dc669e1d775fafc54199f582d9059)) +* Warn if default ingress_settings is used in remote_functions ([#1419](https://github.com/googleapis/python-bigquery-dataframes/issues/1419)) ([dfd891a](https://github.com/googleapis/python-bigquery-dataframes/commit/dfd891a0102314e7542d0b0057442dcde3d9a4a1)) + + +### Bug Fixes + +* Do not compare schema description during schema validation ([#1452](https://github.com/googleapis/python-bigquery-dataframes/issues/1452)) ([03a3a56](https://github.com/googleapis/python-bigquery-dataframes/commit/03a3a5632ab187e1208cdc7133acfe0214243832)) +* Remove warnings for null index and partial ordering mode in prep for GA ([#1431](https://github.com/googleapis/python-bigquery-dataframes/issues/1431)) ([6785aee](https://github.com/googleapis/python-bigquery-dataframes/commit/6785aee97f4ee0c122d83e78409f9d6cc361b6d8)) +* Warn if default `cloud_function_service_account` is used in `remote_function` ([#1424](https://github.com/googleapis/python-bigquery-dataframes/issues/1424)) ([fe7463a](https://github.com/googleapis/python-bigquery-dataframes/commit/fe7463a69e616776df3f1b3bce4abdeaf7579f9b)) +* Window operations over JSON columns ([#1451](https://github.com/googleapis/python-bigquery-dataframes/issues/1451)) ([0070e77](https://github.com/googleapis/python-bigquery-dataframes/commit/0070e77579d0d0535d9f9a6c12641128e8a6dfbc)) +* Write chunked text instead of dummy text for pdf chunk ([#1444](https://github.com/googleapis/python-bigquery-dataframes/issues/1444)) ([96b0e8a](https://github.com/googleapis/python-bigquery-dataframes/commit/96b0e8a7a9d405c895ffd8ece56f4e3d04e0fbe5)) + + +### Performance Improvements + +* Speed up DataFrame corr, cov ([#1309](https://github.com/googleapis/python-bigquery-dataframes/issues/1309)) ([c598c0a](https://github.com/googleapis/python-bigquery-dataframes/commit/c598c0a1694ebc5a49bd92c837e4aaf1c311a899)) + + +### Documentation + +* Add snippet for explaining the linear regression model prediction ([#1427](https://github.com/googleapis/python-bigquery-dataframes/issues/1427)) ([7c37c7d](https://github.com/googleapis/python-bigquery-dataframes/commit/7c37c7d81c0cdc4647667daeebf13d47dabf3972)) + ## [1.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.37.0...v1.38.0) (2025-02-24) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 8fec253b24..3968e98a69 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -87,6 +87,7 @@ def __init__( kms_key_name: Optional[str] = None, skip_bq_connection_check: bool = False, *, + allow_large_results: bool = True, ordering_mode: Literal["strict", "partial"] = "strict", client_endpoints_override: Optional[dict] = None, ): @@ -98,6 +99,7 @@ def __init__( self._application_name = application_name self._kms_key_name = kms_key_name self._skip_bq_connection_check = skip_bq_connection_check + self._allow_large_results = allow_large_results self._session_started = False # Determines the ordering strictness for the session. 
self._ordering_mode = _validate_ordering_mode(ordering_mode) @@ -232,6 +234,26 @@ def skip_bq_connection_check(self, value: bool): ) self._skip_bq_connection_check = value + @property + def allow_large_results(self) -> bool: + """ + Sets the flag to allow or disallow query results larger than 10 GB. + + The default setting for this flag is True, which allows queries to return results + exceeding 10 GB by creating an explicit destination table. If set to False, it + restricts the result size to 10 GB, and BigQuery will raise an error if this limit + is exceeded. + + Returns: + bool: True if large results are allowed with an explicit destination table, + False if results are limited to 10 GB and errors are raised when exceeded. + """ + return self._allow_large_results + + @allow_large_results.setter + def allow_large_results(self, value: bool): + self._allow_large_results = value + @property def use_regional_endpoints(self) -> bool: """Flag to connect to regional API endpoints. diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index 69273aef1c..b958667628 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -25,6 +25,7 @@ class ExperimentOptions: def __init__(self): self._semantic_operators: bool = False self._blob: bool = False + self._udf: bool = False @property def semantic_operators(self) -> bool: @@ -53,3 +54,17 @@ def blob(self, value: bool): ) warnings.warn(msg, category=bfe.PreviewWarning) self._blob = value + + @property + def udf(self) -> bool: + return self._udf + + @udf.setter + def udf(self, value: bool): + if value is True: + msg = ( + "BigFrames managed function (udf) is still under experiments. " + "It may not work and subject to change in the future." + ) + warnings.warn(msg, category=bfe.PreviewWarning) + self._udf = value diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 1099535712..480e04f02c 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -14,7 +14,7 @@ from dataclasses import dataclass import inspect -from typing import Callable, Iterable +from typing import Callable, Iterable, Union import google.cloud.bigquery as bigquery @@ -42,12 +42,18 @@ def __init__( session: bigframes.session.Session, connection: str, max_batching_rows: int, + container_cpu: Union[float, int], + container_memory: str, ): self._func = func_def.func self._requirements = func_def.requirements self._session = session self._connection = connection - self._max_batching_rows = max_batching_rows + self._max_batching_rows = ( + int(max_batching_rows) if max_batching_rows > 1 else max_batching_rows + ) + self._container_cpu = container_cpu + self._container_memory = container_memory def _input_bq_signature(self): sig = inspect.signature(self._func) @@ -72,7 +78,7 @@ def _create_udf(self): CREATE OR REPLACE FUNCTION `{udf_name}`({self._input_bq_signature()}) RETURNS {self._output_bq_type()} LANGUAGE python WITH CONNECTION `{self._connection}` -OPTIONS (entry_point='{func_name}', runtime_version='python-3.11', packages={packages}, max_batching_rows={self._max_batching_rows}) +OPTIONS (entry_point='{func_name}', runtime_version='python-3.11', packages={packages}, max_batching_rows={self._max_batching_rows}, container_cpu={self._container_cpu}, container_memory='{self._container_memory}') AS r\"\"\" @@ -99,7 +105,7 @@ def udf(self): # Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string. 
def image_blur_func( - src_obj_ref_rt: str, dst_obj_ref_rt: str, ksize_x: int, ksize_y: int + src_obj_ref_rt: str, dst_obj_ref_rt: str, ksize_x: int, ksize_y: int, ext: str ) -> str: import json @@ -107,6 +113,8 @@ def image_blur_func( import numpy as np import requests + ext = ext or ".jpeg" + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) dst_obj_ref_rt_json = json.loads(dst_obj_ref_rt) @@ -119,13 +127,19 @@ def image_blur_func( nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - bts = cv.imencode(".jpeg", img_blurred)[1].tobytes() + + bts = cv.imencode(ext, img_blurred)[1].tobytes() + + ext = ext.replace(".", "") + ext_mappings = {"jpg": "jpeg", "tif": "tiff"} + ext = ext_mappings.get(ext, ext) + content_type = "image/" + ext requests.put( url=dst_url, data=bts, headers={ - "Content-Type": "image/jpeg", + "Content-Type": content_type, }, ) @@ -135,13 +149,17 @@ def image_blur_func( image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) -def image_blur_to_bytes_func(src_obj_ref_rt: str, ksize_x: int, ksize_y: int) -> bytes: +def image_blur_to_bytes_func( + src_obj_ref_rt: str, ksize_x: int, ksize_y: int, ext: str +) -> bytes: import json import cv2 as cv # type: ignore import numpy as np import requests + ext = ext or ".jpeg" + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) src_url = src_obj_ref_rt_json["access_urls"]["read_url"] @@ -151,7 +169,7 @@ def image_blur_to_bytes_func(src_obj_ref_rt: str, ksize_x: int, ksize_y: int) -> nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - bts = cv.imencode(".jpeg", img_blurred)[1].tobytes() + bts = cv.imencode(ext, img_blurred)[1].tobytes() return bts @@ -168,6 +186,7 @@ def image_resize_func( dsize_y: int, fx: float, fy: float, + ext: str, ) -> str: import json @@ -175,6 +194,8 @@ def image_resize_func( import numpy as np import requests + ext = ext or ".jpeg" + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) dst_obj_ref_rt_json = json.loads(dst_obj_ref_rt) @@ -187,13 +208,19 @@ def image_resize_func( nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) - bts = cv.imencode(".jpeg", img_resized)[1].tobytes() + + bts = cv.imencode(ext, img_resized)[1].tobytes() + + ext = ext.replace(".", "") + ext_mappings = {"jpg": "jpeg", "tif": "tiff"} + ext = ext_mappings.get(ext, ext) + content_type = "image/" + ext requests.put( url=dst_url, data=bts, headers={ - "Content-Type": "image/jpeg", + "Content-Type": content_type, }, ) @@ -211,6 +238,7 @@ def image_resize_to_bytes_func( dsize_y: int, fx: float, fy: float, + ext: str, ) -> bytes: import json @@ -218,6 +246,8 @@ def image_resize_to_bytes_func( import numpy as np import requests + ext = ext or ".jpeg" + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) src_url = src_obj_ref_rt_json["access_urls"]["read_url"] @@ -238,7 +268,12 @@ def image_resize_to_bytes_func( def image_normalize_func( - src_obj_ref_rt: str, dst_obj_ref_rt: str, alpha: float, beta: float, norm_type: str + src_obj_ref_rt: str, + dst_obj_ref_rt: str, + alpha: float, + beta: float, + norm_type: str, + ext: str, ) -> str: import json @@ -246,6 +281,8 @@ def image_normalize_func( import numpy as np import requests + ext = ext or ".jpeg" + norm_type_mapping = { "inf": cv.NORM_INF, "l1": cv.NORM_L1, @@ -267,13 +304,19 @@ def 
image_normalize_func( img_normalized = cv.normalize( img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] ) - bts = cv.imencode(".jpeg", img_normalized)[1].tobytes() + + bts = cv.imencode(ext, img_normalized)[1].tobytes() + + ext = ext.replace(".", "") + ext_mappings = {"jpg": "jpeg", "tif": "tiff"} + ext = ext_mappings.get(ext, ext) + content_type = "image/" + ext requests.put( url=dst_url, data=bts, headers={ - "Content-Type": "image/jpeg", + "Content-Type": content_type, }, ) @@ -286,7 +329,7 @@ def image_normalize_func( def image_normalize_to_bytes_func( - src_obj_ref_rt: str, alpha: float, beta: float, norm_type: str + src_obj_ref_rt: str, alpha: float, beta: float, norm_type: str, ext: str ) -> bytes: import json @@ -294,6 +337,8 @@ def image_normalize_to_bytes_func( import numpy as np import requests + ext = ext or ".jpeg" + norm_type_mapping = { "inf": cv.NORM_INF, "l1": cv.NORM_L1, diff --git a/bigframes/constants.py b/bigframes/constants.py index dbc24401a7..8f5ed95e1a 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -18,6 +18,7 @@ """ import datetime +import textwrap DEFAULT_EXPIRATION = datetime.timedelta(days=7) @@ -100,6 +101,25 @@ ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS ) +LEP_DEPRECATION_WARNING_MESSAGE = textwrap.dedent( + """ + Support for regional endpoints is not yet available in the location + {location} for BigQuery and BigQuery Storage APIs. For the supported + locations and APIs see https://cloud.google.com/bigquery/docs/regional-endpoints. + For other locations and APIs, currently an older, now deprecated locational + endpoints are being used, which requires your project to be allowlisted. In + future version 2.0 onwards the locational endpoints will no longer be + supported automatically when you enable regional endpoints. However, if you + still need them, you will be able to override the endpoints directly by + doing the following: + bigframes.pandas.options.bigquery.client_endpoints_override = {{ + "bqclient": "https://{location}-bigquery.googleapis.com", + "bqconnectionclient": "{location}-bigqueryconnection.googleapis.com", + "bqstoragereadclient": "{location}-bigquerystorage.googleapis.com" + }} + """ +).strip() + # BigQuery default is 10000, leave 100 for overhead MAX_COLUMNS = 9900 diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index dc9b8e3b9b..9325e3e5a8 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -18,7 +18,7 @@ import functools import io import typing -from typing import Iterable, List, Optional, Sequence, Tuple +from typing import Iterable, List, Mapping, Optional, Sequence, Tuple import warnings import google.cloud.bigquery @@ -198,12 +198,6 @@ def as_cached( ) return ArrayValue(node) - def _try_evaluate_local(self): - """Use only for unit testing paths - not fully featured. 
Will throw exception if fails.""" - import bigframes.core.compile - - return bigframes.core.compile.test_only_try_evaluate(self.node) - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: return self.schema.get_type(key) @@ -355,6 +349,20 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: ) ) + def rename_columns(self, col_id_overrides: Mapping[str, str]) -> ArrayValue: + if not col_id_overrides: + return self + output_ids = [col_id_overrides.get(id, id) for id in self.node.schema.names] + return ArrayValue( + nodes.SelectionNode( + self.node, + tuple( + nodes.AliasedRef(ex.DerefOp(old_id), ids.ColumnId(out_id)) + for old_id, out_id in zip(self.node.ids, output_ids) + ), + ) + ) + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: return self.select_columns( [col_id for col_id in self.column_ids if col_id not in columns] @@ -444,6 +452,7 @@ def relational_join( other: ArrayValue, conditions: typing.Tuple[typing.Tuple[str, str], ...] = (), type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", + propogate_order: Optional[bool] = None, ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: l_mapping = { # Identity mapping, only rename right side lcol.name: lcol.name for lcol in self.node.ids @@ -457,6 +466,7 @@ def relational_join( for l_col, r_col in conditions ), type=type, + propogate_order=propogate_order or self.session._strictly_ordered, ) return ArrayValue(join_node), (l_mapping, r_mapping) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 8ef3aa123b..0e9525d5af 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -26,6 +26,7 @@ import bigframes.core.expression as ex import bigframes.core.ordering as ordering import bigframes.core.window_spec as windows +import bigframes.dtypes import bigframes.dtypes as dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -409,6 +410,8 @@ def rank( method: str = "average", na_option: str = "keep", ascending: bool = True, + grouping_cols: tuple[str, ...] = (), + columns: tuple[str, ...] 
= (), ): if method not in ["average", "min", "max", "first", "dense"]: raise ValueError( @@ -417,8 +420,8 @@ def rank( if na_option not in ["keep", "top", "bottom"]: raise ValueError("na_option must be one of 'keep', 'top', or 'bottom'") - columns = block.value_columns - labels = block.column_labels + columns = columns or tuple(col for col in block.value_columns) + labels = [block.col_id_to_label[id] for id in columns] # Step 1: Calculate row numbers for each row # Identify null values to be treated according to na_option param rownum_col_ids = [] @@ -442,9 +445,13 @@ def rank( block, rownum_id = block.apply_window_op( col if na_option == "keep" else nullity_col_id, agg_ops.dense_rank_op if method == "dense" else agg_ops.count_op, - window_spec=windows.unbound(ordering=window_ordering) + window_spec=windows.unbound( + grouping_keys=grouping_cols, ordering=window_ordering + ) if method == "dense" - else windows.rows(following=0, ordering=window_ordering), + else windows.rows( + following=0, ordering=window_ordering, grouping_keys=grouping_cols + ), skip_reproject_unsafe=(col != columns[-1]), ) rownum_col_ids.append(rownum_id) @@ -462,12 +469,32 @@ def rank( block, result_id = block.apply_window_op( rownum_col_ids[i], agg_op, - window_spec=windows.unbound(grouping_keys=(columns[i],)), + window_spec=windows.unbound(grouping_keys=(columns[i], *grouping_cols)), skip_reproject_unsafe=(i < (len(columns) - 1)), ) post_agg_rownum_col_ids.append(result_id) rownum_col_ids = post_agg_rownum_col_ids + # Pandas masks all values where any grouping column is null + # Note: we use pd.NA instead of float('nan') + if grouping_cols: + predicate = functools.reduce( + ops.and_op.as_expr, + [ops.notnull_op.as_expr(column_id) for column_id in grouping_cols], + ) + block = block.project_exprs( + [ + ops.where_op.as_expr( + ex.deref(col), + predicate, + ex.const(None), + ) + for col in rownum_col_ids + ], + labels=labels, + ) + rownum_col_ids = list(block.value_columns[-len(rownum_col_ids) :]) + # Step 3: post processing: mask null values and cast to float if method in ["min", "max", "first", "dense"]: # Pandas rank always produces Float64, so must cast for aggregation types that produce ints diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 10970b24e8..7ac2b03f28 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -64,7 +64,6 @@ import bigframes.core.utils as utils import bigframes.core.window_spec as windows import bigframes.dtypes -import bigframes.exceptions as bfe import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -113,6 +112,7 @@ class MaterializationOptions: downsampling: sampling_options.SamplingOptions = dataclasses.field( default_factory=sampling_options.SamplingOptions ) + allow_large_results: Optional[bool] = None ordered: bool = True @@ -137,9 +137,6 @@ def __init__( f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) - if len(index_columns) == 0: - msg = "Creating object with Null Index. Null Index is a preview feature." 
- warnings.warn(msg, category=bfe.NullIndexPreviewWarning) self._index_columns = tuple(index_columns) # Index labels don't need complicated hierarchical access so can store as tuple self._index_labels = ( @@ -213,14 +210,10 @@ def index(self) -> BlockIndexProperties: @functools.cached_property def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" - - row_count_expr = self.expr.row_count() - - # Support in-memory engines for hermetic unit tests. - if self.expr.session is None: + # Support zero-query for hermetic unit tests. + if self.expr.session is None and self.expr.node.row_count: try: - row_count = row_count_expr._try_evaluate_local().squeeze() - return (row_count, len(self.value_columns)) + return self.expr.node.row_count except Exception: pass @@ -487,9 +480,12 @@ def to_arrow( self, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> Tuple[pa.Table, bigquery.QueryJob]: """Run query and download results as a pyarrow Table.""" - execute_result = self.session._executor.execute(self.expr, ordered=ordered) + execute_result = self.session._executor.execute( + self.expr, ordered=ordered, use_explicit_destination=allow_large_results + ) pa_table = execute_result.to_arrow_table() pa_index_labels = [] @@ -511,6 +507,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]: """Run query and download results as a pandas DataFrame. @@ -553,17 +550,21 @@ def to_pandas( df, query_job = self._materialize_local( materialize_options=MaterializationOptions( - downsampling=sampling, ordered=ordered + downsampling=sampling, + allow_large_results=allow_large_results, + ordered=ordered, ) ) df.set_axis(self.column_labels, axis=1, copy=False) return df, query_job def try_peek( - self, n: int = 20, force: bool = False + self, n: int = 20, force: bool = False, allow_large_results=None ) -> typing.Optional[pd.DataFrame]: if force or self.expr.supports_fast_peek: - result = self.session._executor.peek(self.expr, n) + result = self.session._executor.peek( + self.expr, n, use_explicit_destination=allow_large_results + ) df = io_pandas.arrow_to_pandas(result.to_arrow_table(), self.expr.schema) self._copy_index_to_pandas(df) return df @@ -571,7 +572,10 @@ def try_peek( return None def to_pandas_batches( - self, page_size: Optional[int] = None, max_results: Optional[int] = None + self, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + allow_large_results: Optional[bool] = None, ): """Download results one message at a time. @@ -580,7 +584,7 @@ def to_pandas_batches( execute_result = self.session._executor.execute( self.expr, ordered=True, - use_explicit_destination=True, + use_explicit_destination=allow_large_results, page_size=page_size, max_results=max_results, ) @@ -609,17 +613,30 @@ def _materialize_local( """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. 
execute_result = self.session._executor.execute( - self.expr, ordered=materialize_options.ordered, get_size_bytes=True + self.expr, + ordered=materialize_options.ordered, + use_explicit_destination=materialize_options.allow_large_results, ) - assert execute_result.total_bytes is not None - table_mb = execute_result.total_bytes / _BYTES_TO_MEGABYTES sample_config = materialize_options.downsampling - max_download_size = sample_config.max_download_size - fraction = ( - max_download_size / table_mb - if (max_download_size is not None) and (table_mb != 0) - else 2 - ) + if execute_result.total_bytes is not None: + table_mb = execute_result.total_bytes / _BYTES_TO_MEGABYTES + max_download_size = sample_config.max_download_size + fraction = ( + max_download_size / table_mb + if (max_download_size is not None) and (table_mb != 0) + else 2 + ) + else: + # Since we cannot acquire the table size without a query_job, + # we skip the sampling. + if sample_config.enable_downsampling: + warnings.warn( + "Sampling is disabled and there is no download size limit when 'allow_large_results' is set to " + "False. To prevent downloading excessive data, it is recommended to use the peek() method, or " + "limit the data with methods like .head() or .sample() before proceeding with downloads.", + UserWarning, + ) + fraction = 2 # TODO: Maybe materialize before downsampling # Some downsampling methods @@ -1706,7 +1723,7 @@ def transpose( original_row_index = ( original_row_index if original_row_index is not None - else self.index.to_pandas(ordered=True) + else self.index.to_pandas(ordered=True)[0] ) original_row_count = len(original_row_index) if original_row_count > bigframes.constants.MAX_COLUMNS: @@ -2426,7 +2443,7 @@ def to_sql_query( # implementaton. It will reference cached tables instead of original data sources. # Maybe should just compile raw BFET? Depends on user intent. sql = self.session._executor.to_sql( - array_value, col_id_overrides=substitutions, enable_cache=enable_cache + array_value.rename_columns(substitutions), enable_cache=enable_cache ) return ( sql, @@ -2665,14 +2682,22 @@ def column_ids(self) -> Sequence[str]: def is_null(self) -> bool: return len(self._block._index_columns) == 0 - def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: + def to_pandas( + self, + *, + ordered: Optional[bool] = None, + allow_large_results: Optional[bool] = None, + ) -> Tuple[pd.Index, Optional[bigquery.QueryJob]]: """Executes deferred operations and downloads the results.""" if len(self.column_ids) == 0: raise bigframes.exceptions.NullIndexError( "Cannot materialize index, as this object does not have an index. Set index column(s) using set_index." ) ordered = ordered if ordered is not None else True - return self._block.select_columns([]).to_pandas(ordered=ordered)[0].index + df, query_job = self._block.select_columns([]).to_pandas( + ordered=ordered, allow_large_results=allow_large_results + ) + return df.index, query_job def resolve_level(self, level: LevelsType) -> typing.Sequence[str]: if utils.is_list_like(level): diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index 964113bd7b..0bfdf2222d 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -13,14 +13,9 @@ # limitations under the License. 
from __future__ import annotations -from bigframes.core.compile.api import ( - SQLCompiler, - test_only_ibis_inferred_schema, - test_only_try_evaluate, -) +from bigframes.core.compile.api import SQLCompiler, test_only_ibis_inferred_schema __all__ = [ "SQLCompiler", - "test_only_try_evaluate", "test_only_ibis_inferred_schema", ] diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 4ec0b270ed..edf1e14b3a 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -26,6 +26,7 @@ import bigframes_vendored.ibis.expr.types as ibis_types import pandas as pd +from bigframes.core.compile import constants as compiler_constants import bigframes.core.compile.ibis_types as compile_ibis_types import bigframes.core.compile.scalar_op_compiler as scalar_compilers import bigframes.core.expression as ex @@ -231,7 +232,11 @@ def _( column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.quantile(op.q), window) + result = column.quantile(op.q) + if op.should_floor_result: + result = result.floor() # type:ignore + + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -242,7 +247,8 @@ def _( window=None, # order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.mean(), window) + result = column.mean().floor() if op.should_floor_result else column.mean() + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -306,10 +312,11 @@ def _( @numeric_op def _( op: agg_ops.StdOp, - x: ibis_types.Column, + x: ibis_types.NumericColumn, window=None, ) -> ibis_types.Value: - return _apply_window_if_present(cast(ibis_types.NumericColumn, x).std(), window) + result = x.std().floor() if op.should_floor_result else x.std() + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -569,6 +576,30 @@ def _( return original_column.delta(shifted_column, part="microsecond") +@compile_unary_agg.register +def _( + op: agg_ops.DateSeriesDiffOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: + if not column.type().is_date(): + raise TypeError(f"Cannot perform date series diff on type{column.type()}") + + original_column = cast(ibis_types.DateColumn, column) + shifted_column = cast( + ibis_types.DateColumn, + compile_unary_agg(agg_ops.ShiftOp(op.periods), column, window), + ) + + conversion_factor = typing.cast( + ibis_types.IntegerValue, compiler_constants.UNIT_TO_US_CONVERSION_FACTORS["D"] + ) + + return ( + original_column.delta(shifted_column, part="day") * conversion_factor + ).floor() + + @compile_unary_agg.register def _( op: agg_ops.AllOp, diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 9280cfbb7b..cf441a2053 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -13,11 +13,12 @@ # limitations under the License. 
from __future__ import annotations -from typing import Mapping, Sequence, Tuple, TYPE_CHECKING +from typing import Optional, Sequence, Tuple, TYPE_CHECKING import google.cloud.bigquery as bigquery -import bigframes.core.compile.compiler as compiler +from bigframes.core import rewrite +from bigframes.core.compile import compiler if TYPE_CHECKING: import bigframes.core.nodes @@ -31,31 +32,16 @@ class SQLCompiler: def __init__(self, strict: bool = True): self._compiler = compiler.Compiler(strict=strict) - def compile_peek(self, node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str: - """Compile node into sql that selects N arbitrary rows, may not execute deterministically.""" - return self._compiler.compile_peek_sql(node, n_rows) - - def compile_unordered( - self, - node: bigframes.core.nodes.BigFrameNode, - *, - col_id_overrides: Mapping[str, str] = {}, - ) -> str: - """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" - # TODO: Enable limit pullup, but only if not being used to write to clustered table. - output_ids = [col_id_overrides.get(id, id) for id in node.schema.names] - return self._compiler.compile_sql(node, ordered=False, output_ids=output_ids) - - def compile_ordered( + def compile( self, node: bigframes.core.nodes.BigFrameNode, *, - col_id_overrides: Mapping[str, str] = {}, + ordered: bool = True, + limit: Optional[int] = None, ) -> str: """Compile node into sql where rows are sorted with ORDER BY.""" # If we are ordering the query anyways, compiling the slice as a limit is probably a good idea. - output_ids = [col_id_overrides.get(id, id) for id in node.schema.names] - return self._compiler.compile_sql(node, ordered=True, output_ids=output_ids) + return self._compiler.compile_sql(node, ordered=ordered, limit=limit) def compile_raw( self, @@ -67,21 +53,15 @@ def compile_raw( return self._compiler.compile_raw(node) -def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode): - """Use only for unit testing paths - not fully featured. 
Will throw exception if fails.""" - node = _STRICT_COMPILER._preprocess(node) - ibis = _STRICT_COMPILER.compile_node(node)._to_ibis_expr() - return ibis.pandas.connect({}).execute(ibis) - - def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema.""" import bigframes.core.schema - node = _STRICT_COMPILER._preprocess(node) - compiled = _STRICT_COMPILER.compile_node(node) + node = _STRICT_COMPILER._replace_unsupported_ops(node) + node, _ = rewrite.pull_up_order(node, order_root=False) + ir = _STRICT_COMPILER.compile_node(node) items = tuple( - bigframes.core.schema.SchemaItem(name, compiled.get_column_type(ibis_id)) - for name, ibis_id in zip(node.schema.names, compiled.column_ids) + bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id)) + for name, ibis_id in zip(node.schema.names, ir.column_ids) ) return bigframes.core.schema.ArraySchema(items) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index b0cf30269e..c3d4c10267 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -30,6 +30,7 @@ import bigframes.core.compile.googlesql import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler as op_compilers +import bigframes.core.compile.scalar_op_compiler as scalar_op_compiler import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.ordering import OrderingExpression @@ -72,7 +73,11 @@ def to_sql( ) -> str: ibis_table = self._to_ibis_expr() # This set of output transforms maybe should be its own output node?? - if order_by or limit: + if ( + order_by + or limit + or (selections and (tuple(selections) != tuple(self.column_ids))) + ): sql = ibis_bigquery.Backend().compile(ibis_table) sql = ( bigframes.core.compile.googlesql.Select() @@ -672,4 +677,7 @@ def _as_groupable(value: ibis_types.Value): # Some types need to be converted to string to enable groupby if value.type().is_float64() or value.type().is_geospatial(): return value.cast(ibis_dtypes.str) - return value + elif value.type().is_json(): + return scalar_op_compiler.to_json_string(value) + else: + return value diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 77f51542b4..f5be71830c 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -33,8 +33,6 @@ import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler as compile_scalar import bigframes.core.compile.schema_translator -import bigframes.core.expression as ex -import bigframes.core.identifiers as ids import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering import bigframes.core.rewrite as rewrites @@ -52,65 +50,50 @@ class Compiler: scalar_op_compiler = compile_scalar.ScalarOpCompiler() def compile_sql( - self, node: nodes.BigFrameNode, ordered: bool, output_ids: typing.Sequence[str] + self, + node: nodes.BigFrameNode, + ordered: bool, + limit: typing.Optional[int] = None, ) -> str: - # TODO: get rid of output_ids arg - assert len(output_ids) == len(list(node.fields)) - node = set_output_names(node, output_ids) - node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) + # later steps might add ids, so snapshot before those steps. 
+ output_ids = node.schema.names if ordered: - node, limit = rewrites.pullup_limit_from_slice(node) - node = nodes.bottom_up(node, rewrites.rewrite_slice) - # TODO: Extract out CTEs - node, ordering = rewrites.pull_up_order( - node, order_root=True, ordered_joins=self.strict - ) - node = rewrites.column_pruning(node) - ir = self.compile_node(node) - return ir.to_sql( - order_by=ordering.all_ordering_columns, - limit=limit, - selections=output_ids, - ) - else: - node = nodes.bottom_up(node, rewrites.rewrite_slice) - node, _ = rewrites.pull_up_order( - node, order_root=False, ordered_joins=self.strict - ) - node = rewrites.column_pruning(node) - ir = self.compile_node(node) - return ir.to_sql(selections=output_ids) + # Need to do this before replacing unsupported ops, as that will rewrite slice ops + node, pulled_up_limit = rewrites.pullup_limit_from_slice(node) + if (pulled_up_limit is not None) and ( + (limit is None) or limit > pulled_up_limit + ): + limit = pulled_up_limit - def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: - ids = [id.sql for id in node.ids] - node = nodes.bottom_up(node, rewrites.rewrite_slice) - node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) - node, _ = rewrites.pull_up_order( - node, order_root=False, ordered_joins=self.strict - ) + node = self._replace_unsupported_ops(node) + # prune before pulling up order to avoid unnnecessary row_number() ops node = rewrites.column_pruning(node) - return self.compile_node(node).to_sql(limit=n_rows, selections=ids) + node, ordering = rewrites.pull_up_order(node, order_root=ordered) + # final pruning to cleanup up any leftovers unused values + node = rewrites.column_pruning(node) + return self.compile_node(node).to_sql( + order_by=ordering.all_ordering_columns if ordered else (), + limit=limit, + selections=output_ids, + ) def compile_raw( self, - node: bigframes.core.nodes.BigFrameNode, + node: nodes.BigFrameNode, ) -> typing.Tuple[ str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering ]: - node = nodes.bottom_up(node, rewrites.rewrite_slice) - node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) - node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) + node = self._replace_unsupported_ops(node) + node = rewrites.column_pruning(node) + node, ordering = rewrites.pull_up_order(node, order_root=True) node = rewrites.column_pruning(node) - ir = self.compile_node(node) - sql = ir.to_sql() + sql = self.compile_node(node).to_sql() return sql, node.schema.to_bigquery(), ordering - def _preprocess(self, node: nodes.BigFrameNode): + def _replace_unsupported_ops(self, node: nodes.BigFrameNode): + # TODO: Run all replacement rules as single bottom-up pass node = nodes.bottom_up(node, rewrites.rewrite_slice) - node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) - node, _ = rewrites.pull_up_order( - node, order_root=False, ordered_joins=self.strict - ) + node = nodes.bottom_up(node, rewrites.rewrite_timedelta_expressions) return node # TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) @@ -305,16 +288,3 @@ def compile_explode(self, node: nodes.ExplodeNode): @_compile_node.register def compile_random_sample(self, node: nodes.RandomSampleNode): return self.compile_node(node.child)._uniform_sampling(node.fraction) - - -def set_output_names( - node: bigframes.core.nodes.BigFrameNode, output_ids: typing.Sequence[str] -): - # TODO: Create specialized output 
operators that will handle final names - return nodes.SelectionNode( - node, - tuple( - bigframes.core.nodes.AliasedRef(ex.DerefOp(old_id), ids.ColumnId(out_id)) - for old_id, out_id in zip(node.ids, output_ids) - ), - ) diff --git a/bigframes/core/compile/constants.py b/bigframes/core/compile/constants.py new file mode 100644 index 0000000000..9c307125ab --- /dev/null +++ b/bigframes/core/compile/constants.py @@ -0,0 +1,27 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Datetime constants +UNIT_TO_US_CONVERSION_FACTORS = { + "W": 7 * 24 * 60 * 60 * 1000 * 1000, + "d": 24 * 60 * 60 * 1000 * 1000, + "D": 24 * 60 * 60 * 1000 * 1000, + "h": 60 * 60 * 1000 * 1000, + "m": 60 * 1000 * 1000, + "s": 1000 * 1000, + "ms": 1000, + "us": 1, + "ns": 1e-3, +} diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index c47c6cf07b..54a5a37736 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -397,6 +397,7 @@ def literal_to_ibis_scalar( ) # "correct" way would be to use ibis.array, but this produces invalid BQ SQL syntax return tuple(literal) + if not pd.api.types.is_list_like(literal) and pd.isna(literal): if ibis_dtype: return bigframes_vendored.ibis.null().cast(ibis_dtype) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 7111406646..35a307722f 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -26,6 +26,7 @@ import numpy as np import pandas as pd +from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS import bigframes.core.compile.default_ordering import bigframes.core.compile.ibis_types import bigframes.core.expression as ex @@ -50,19 +51,6 @@ ) _OBJ_REF_IBIS_DTYPE = ibis_dtypes.Struct.from_tuples(_OBJ_REF_STRUCT_SCHEMA) # type: ignore -# Datetime constants -UNIT_TO_US_CONVERSION_FACTORS = { - "W": 7 * 24 * 60 * 60 * 1000 * 1000, - "d": 24 * 60 * 60 * 1000 * 1000, - "D": 24 * 60 * 60 * 1000 * 1000, - "h": 60 * 60 * 1000 * 1000, - "m": 60 * 1000 * 1000, - "s": 1000 * 1000, - "ms": 1000, - "us": 1, - "ns": 1e-3, -} - class ScalarOpCompiler: # Mapping of operation name to implemenations @@ -752,6 +740,21 @@ def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal return x - y.to_interval("us") +@scalar_op_compiler.register_binary_op(ops.date_diff_op) +def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): + return x.delta(y, "day") * int(UNIT_TO_US_CONVERSION_FACTORS["d"]) # type: ignore + + +@scalar_op_compiler.register_binary_op(ops.date_add_op) +def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): + return x.cast("timestamp") + y.to_interval("us") # type: ignore + + +@scalar_op_compiler.register_binary_op(ops.date_sub_op) +def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): + return x.cast("timestamp") - y.to_interval("us") # type: ignore + + 
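# --- Editor's illustration (not part of the patch) --------------------------
# The three DATE operators above encode timedeltas as plain integer
# microseconds, reusing the UNIT_TO_US_CONVERSION_FACTORS table that this
# change moves into bigframes/core/compile/constants.py.  A minimal,
# standalone sanity check of the "d"/"D" factor used by date_diff_op_impl:
_US_PER_DAY = 24 * 60 * 60 * 1000 * 1000
assert _US_PER_DAY == 86_400_000_000
# i.e. `date_a - date_b` compiles to (roughly) DATE_DIFF(a, b, DAY) scaled
# into microseconds, and `date + timedelta` casts the date to TIMESTAMP before
# adding a microsecond interval (see date_add_op_impl / date_sub_op_impl).
# -----------------------------------------------------------------------------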
@scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] @@ -998,14 +1001,9 @@ def normalize_op_impl(x: ibis_types.Value): # Geo Ops -@scalar_op_compiler.register_unary_op(ops.geo_x_op) -def geo_x_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).x() - - -@scalar_op_compiler.register_unary_op(ops.geo_y_op) -def geo_y_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).y() +@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False) +def geo_st_boundary_op_impl(x: ibis_types.Value): + return st_boundary(x) @scalar_op_compiler.register_unary_op(ops.geo_area_op) @@ -1032,6 +1030,16 @@ def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value): ) +@scalar_op_compiler.register_unary_op(ops.geo_x_op) +def geo_x_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.GeoSpatialValue, x).x() + + +@scalar_op_compiler.register_unary_op(ops.geo_y_op) +def geo_y_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.GeoSpatialValue, x).y() + + # Parameterized ops @scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): @@ -1962,6 +1970,11 @@ def unix_millis(a: ibis_dtypes.timestamp) -> int: # type: ignore """Convert a timestamp to milliseconds""" +@ibis_udf.scalar.builtin +def st_boundary(a: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ignore + """Find the boundary of a geography.""" + + @ibis_udf.scalar.builtin def unix_micros(a: ibis_dtypes.timestamp) -> int: # type: ignore """Convert a timestamp to microseconds""" diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index f619cd72c9..126d2f4dd2 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -174,6 +174,20 @@ def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFr return self.quantile(0.5) return self._aggregate_all(agg_ops.median_op, numeric_only=True) + def rank( + self, method="average", ascending: bool = True, na_option: str = "keep" + ) -> df.DataFrame: + return df.DataFrame( + block_ops.rank( + self._block, + method, + na_option, + ascending, + grouping_cols=tuple(self._by_col_ids), + columns=tuple(self._selected_cols), + ) + ) + def quantile( self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False ) -> df.DataFrame: @@ -574,6 +588,20 @@ def sum(self, *args) -> series.Series: def mean(self, *args) -> series.Series: return self._aggregate(agg_ops.mean_op) + def rank( + self, method="average", ascending: bool = True, na_option: str = "keep" + ) -> series.Series: + return series.Series( + block_ops.rank( + self._block, + method, + na_option, + ascending, + grouping_cols=tuple(self._by_col_ids), + columns=(self._value_column,), + ) + ) + def median( self, *args, diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 9c7fba8ec1..97115a3ed0 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -425,7 +425,7 @@ def _iloc_getitem_series_or_dataframe( @typing.overload def _iloc_getitem_series_or_dataframe( series_or_dataframe: bigframes.dataframe.DataFrame, key -) -> Union[bigframes.dataframe.DataFrame, pd.Series]: +) -> Union[bigframes.dataframe.DataFrame, pd.Series, bigframes.core.scalar.Scalar]: ... 
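# --- Editor's illustration (not part of the patch) --------------------------
# The reworked tuple handling in the hunk below is what enables the
# "Iloc multiple columns selection" changelog entry.  Assuming a hypothetical
# BigFrames DataFrame `df`:
#
#     df.iloc[:2, [0, 2]]   # first two rows of the 1st and 3rd columns
#     df.iloc[0, 1]         # scalar lookup, routed through df.iat
#
# Tuples with more than two elements, or a 2-tuple applied to a Series, still
# raise pandas.errors.IndexingError("Too many indexers").
# -----------------------------------------------------------------------------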
@@ -447,31 +447,42 @@ def _iloc_getitem_series_or_dataframe( return result_pd_df.iloc[0] elif isinstance(key, slice): return series_or_dataframe._slice(key.start, key.stop, key.step) - elif isinstance(key, tuple) and len(key) == 0: - return series_or_dataframe - elif isinstance(key, tuple) and len(key) == 1: - return _iloc_getitem_series_or_dataframe(series_or_dataframe, key[0]) - elif ( - isinstance(key, tuple) - and isinstance(series_or_dataframe, bigframes.dataframe.DataFrame) - and len(key) == 2 - ): - return series_or_dataframe.iat[key] elif isinstance(key, tuple): - raise pd.errors.IndexingError("Too many indexers") + if len(key) > 2 or ( + len(key) == 2 and isinstance(series_or_dataframe, bigframes.series.Series) + ): + raise pd.errors.IndexingError("Too many indexers") + + if len(key) == 0: + return series_or_dataframe + + if len(key) == 1: + return _iloc_getitem_series_or_dataframe(series_or_dataframe, key[0]) + + # len(key) == 2 + df = typing.cast(bigframes.dataframe.DataFrame, series_or_dataframe) + if isinstance(key[1], int): + return df.iat[key] + elif isinstance(key[1], list): + columns = df.columns[key[1]] + return _iloc_getitem_series_or_dataframe(df[columns], key[0]) + raise NotImplementedError( + f"iloc does not yet support indexing with {key}. {constants.FEEDBACK_LINK}" + ) elif pd.api.types.is_list_like(key): if len(key) == 0: return typing.cast( Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) - df = series_or_dataframe if isinstance(series_or_dataframe, bigframes.series.Series): original_series_name = series_or_dataframe.name series_name = ( original_series_name if original_series_name is not None else 0 ) df = series_or_dataframe.to_frame() + else: + df = series_or_dataframe original_index_names = df.index.names temporary_index_names = [ guid.generate_guid(prefix="temp_iloc_index_") @@ -491,11 +502,6 @@ def _iloc_getitem_series_or_dataframe( result = result.rename(original_series_name) return result - - elif isinstance(key, tuple): - raise NotImplementedError( - f"iloc does not yet support indexing with a (row, column) tuple. {constants.FEEDBACK_LINK}" - ) elif callable(key): raise NotImplementedError( f"iloc does not yet support indexing with a callable. {constants.FEEDBACK_LINK}" diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index b3a07d33bc..84da6c5de0 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -490,17 +490,29 @@ def __getitem__(self, key: int) -> typing.Any: else: raise NotImplementedError(f"Index key not supported {key}") - def to_pandas(self) -> pandas.Index: + def to_pandas(self, *, allow_large_results: Optional[bool] = None) -> pandas.Index: """Gets the Index as a pandas Index. + Args: + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. + Returns: pandas.Index: A pandas Index with all of the labels from this Index. 
""" - return self._block.index.to_pandas(ordered=True) + df, query_job = self._block.index.to_pandas( + ordered=True, allow_large_results=allow_large_results + ) + if query_job: + self._query_job = query_job + return df - def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: - return self.to_pandas().to_numpy(dtype, **kwargs) + def to_numpy(self, dtype=None, *, allow_large_results=None, **kwargs) -> np.ndarray: + return self.to_pandas(allow_large_results=allow_large_results).to_numpy( + dtype, **kwargs + ) __array__ = to_numpy diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index e2093e57d9..fbc43e033a 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -287,6 +287,7 @@ class JoinNode(BigFrameNode): right_child: BigFrameNode conditions: typing.Tuple[typing.Tuple[ex.DerefOp, ex.DerefOp], ...] type: typing.Literal["inner", "outer", "left", "right", "cross"] + propogate_order: bool def _validate(self): assert not ( @@ -311,8 +312,7 @@ def order_ambiguous(self) -> bool: @property def explicitly_ordered(self) -> bool: - # Do not consider user pre-join ordering intent - they need to re-order post-join in unordered mode. - return False + return self.propogate_order @property def fields(self) -> Iterable[Field]: @@ -1555,6 +1555,14 @@ def remap_refs( return dataclasses.replace(self, column_ids=new_ids) # type: ignore +# Introduced during planing/compilation +@dataclasses.dataclass(frozen=True, eq=False) +class ResultNode(UnaryNode): + output_names: tuple[str, ...] + order_by: Tuple[OrderingExpression, ...] = () + limit: Optional[int] = None + + # Tree operators def top_down( root: BigFrameNode, diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index bdb30fbc34..cca55092e0 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -29,7 +29,6 @@ def pull_up_order( root: bigframes.core.nodes.BigFrameNode, *, order_root: bool = True, - ordered_joins: bool = True, ) -> Tuple[bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.RowOrdering]: """ Pull the ordering up, putting full order definition into window ops. 
@@ -92,7 +91,7 @@ def pull_up_order_inner( child_result, child_order = pull_up_order_inner(node.child) return node.replace_child(child_result), child_order elif isinstance(node, bigframes.core.nodes.JoinNode): - if ordered_joins: + if node.propogate_order: return pull_order_join(node) else: return ( diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index bde1a4431c..bf3c0ee639 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -70,6 +70,19 @@ def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNod root.skip_reproject_unsafe, ) + if isinstance(root, nodes.AggregateNode): + updated_aggregations = tuple( + (_rewrite_aggregation(agg, root.child.schema), col_id) + for agg, col_id in root.aggregations + ) + return nodes.AggregateNode( + root.child, + updated_aggregations, + root.by_column_ids, + root.order_by, + root.dropna, + ) + return root @@ -138,6 +151,12 @@ def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: return _TypedExpr.create_op_expr(ops.timestamp_sub_op, left, right) + if left.dtype == dtypes.DATE_DTYPE and right.dtype == dtypes.DATE_DTYPE: + return _TypedExpr.create_op_expr(ops.date_diff_op, left, right) + + if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr.create_op_expr(ops.date_sub_op, left, right) + return _TypedExpr.create_op_expr(ops.sub_op, left, right) @@ -150,6 +169,14 @@ def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: # always on the right. return _TypedExpr.create_op_expr(ops.timestamp_add_op, right, left) + if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr.create_op_expr(ops.date_add_op, left, right) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.DATE_DTYPE: + # Re-arrange operands such that date is always on the left and timedelta is + # always on the right. 
+ return _TypedExpr.create_op_expr(ops.date_add_op, right, left) + return _TypedExpr.create_op_expr(ops.add_op, left, right) @@ -196,17 +223,39 @@ def _rewrite_aggregation( ) -> ex.Aggregation: if not isinstance(aggregation, ex.UnaryAggregation): return aggregation - if not isinstance(aggregation.op, aggs.DiffOp): - return aggregation if isinstance(aggregation.arg, ex.DerefOp): input_type = schema.get_type(aggregation.arg.id.sql) else: input_type = aggregation.arg.dtype - if dtypes.is_datetime_like(input_type): + if isinstance(aggregation.op, aggs.DiffOp): + if dtypes.is_datetime_like(input_type): + return ex.UnaryAggregation( + aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg + ) + elif input_type == dtypes.DATE_DTYPE: + return ex.UnaryAggregation( + aggs.DateSeriesDiffOp(aggregation.op.periods), aggregation.arg + ) + + if isinstance(aggregation.op, aggs.StdOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.StdOp(should_floor_result=True), aggregation.arg + ) + + if isinstance(aggregation.op, aggs.MeanOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.MeanOp(should_floor_result=True), aggregation.arg + ) + + if ( + isinstance(aggregation.op, aggs.QuantileOp) + and input_type is dtypes.TIMEDELTA_DTYPE + ): return ex.UnaryAggregation( - aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg + aggs.QuantileOp(q=aggregation.op.q, should_floor_result=True), + aggregation.arg, ) return aggregation diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 502a40d92d..18061dca18 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -52,7 +52,7 @@ def is_dict_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Mapping] def combine_indices(index1: pd.Index, index2: pd.Index) -> pd.MultiIndex: - """Combines indices into multi-index while preserving dtypes, names.""" + """Combines indices into multi-index while preserving dtypes, names merging by rows 1:1""" multi_index = pd.MultiIndex.from_frame( pd.concat([index1.to_frame(index=False), index2.to_frame(index=False)], axis=1) ) @@ -61,6 +61,20 @@ def combine_indices(index1: pd.Index, index2: pd.Index) -> pd.MultiIndex: return multi_index +def cross_indices(index1: pd.Index, index2: pd.Index) -> pd.MultiIndex: + """Combines indices into multi-index while preserving dtypes, names using cross product""" + multi_index = pd.MultiIndex.from_frame( + pd.merge( + left=index1.to_frame(index=False), + right=index2.to_frame(index=False), + how="cross", + ) + ) + # to_frame will produce numbered default names, we don't want these + multi_index.names = [*index1.names, *index2.names] + return multi_index + + def index_as_tuples(index: pd.Index) -> typing.Sequence[typing.Tuple]: if isinstance(index, pd.MultiIndex): return [label for label in index] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index caf1b62e07..b5174dbd3e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1270,6 +1270,35 @@ def combine( def combine_first(self, other: DataFrame): return self._apply_dataframe_binop(other, ops.fillna_op) + def _fast_stat_matrix(self, op: agg_ops.BinaryAggregateOp) -> DataFrame: + """Faster corr, cov calculations, but creates more sql text, so cannot scale to many columns""" + assert len(self.columns) * len(self.columns) < bigframes.constants.MAX_COLUMNS + orig_columns = self.columns + frame = self.copy() + # Replace column names with 0 to n - 1 to keep order + # and avoid the influence of duplicated column name + 
frame.columns = pandas.Index(range(len(orig_columns))) + frame = frame.astype(bigframes.dtypes.FLOAT_DTYPE) + block = frame._block + + aggregations = [ + ex.BinaryAggregation(op, ex.deref(left_col), ex.deref(right_col)) + for left_col in block.value_columns + for right_col in block.value_columns + ] + # unique columns stops + uniq_orig_columns = utils.combine_indices( + orig_columns, pandas.Index(range(len(orig_columns))) + ) + labels = utils.cross_indices(uniq_orig_columns, uniq_orig_columns) + + block, _ = block.aggregate(aggregations=aggregations, column_labels=labels) + + block = block.stack(levels=orig_columns.nlevels + 1) + # The aggregate operation crated a index level with just 0, need to drop it + # Also, drop the last level of each index, which was created to guarantee uniqueness + return DataFrame(block).droplevel(0).droplevel(-1, axis=0).droplevel(-1, axis=1) + def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFrame: if method != "pearson": raise NotImplementedError( @@ -1285,6 +1314,10 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr else: frame = self._drop_non_numeric() + if len(frame.columns) <= 30: + return frame._fast_stat_matrix(agg_ops.CorrOp()) + + frame = frame.copy() orig_columns = frame.columns # Replace column names with 0 to n - 1 to keep order # and avoid the influence of duplicated column name @@ -1393,6 +1426,10 @@ def cov(self, *, numeric_only: bool = False) -> DataFrame: else: frame = self._drop_non_numeric() + if len(frame.columns) <= 30: + return frame._fast_stat_matrix(agg_ops.CovOp()) + + frame = frame.copy() orig_columns = frame.columns # Replace column names with 0 to n - 1 to keep order # and avoid the influence of duplicated column name @@ -1529,6 +1566,7 @@ def to_arrow( self, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> pyarrow.Table: """Write DataFrame to an Arrow table / record batch. @@ -1536,6 +1574,9 @@ def to_arrow( ordered (bool, default True): Determines whether the resulting Arrow table will be ordered. In some cases, unordered may result in a faster-executing query. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame. @@ -1543,8 +1584,11 @@ def to_arrow( msg = "to_arrow is in preview. Types and unnamed / duplicate name columns may change in future." warnings.warn(msg, category=bfe.PreviewWarning) - pa_table, query_job = self._block.to_arrow(ordered=ordered) - self._set_internal_query_job(query_job) + pa_table, query_job = self._block.to_arrow( + ordered=ordered, allow_large_results=allow_large_results + ) + if query_job: + self._set_internal_query_job(query_job) return pa_table def to_pandas( @@ -1554,6 +1598,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -1576,6 +1621,9 @@ def to_pandas( ordered (bool, default True): Determines whether the resulting pandas dataframe will be ordered. In some cases, unordered may result in a faster-executing query. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. 
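In user terms, the fast path above kicks in automatically for small column counts; a sketch (the public table and column subset are only illustrative):

```python
import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
numeric = df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g"]]

# With 30 or fewer numeric columns, corr()/cov() issue one wide aggregation
# (a BinaryAggregation per column pair) instead of the older unpivot-based plan.
corr_matrix = numeric.corr()
cov_matrix = numeric.cov()
```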
Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -1588,12 +1636,18 @@ def to_pandas( sampling_method=sampling_method, random_state=random_state, ordered=ordered, + allow_large_results=allow_large_results, ) - self._set_internal_query_job(query_job) + if query_job: + self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) def to_pandas_batches( - self, page_size: Optional[int] = None, max_results: Optional[int] = None + self, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + *, + allow_large_results: Optional[bool] = None, ) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame. @@ -1605,6 +1659,9 @@ def to_pandas_batches( The size of each batch. max_results (int, default None): If given, only download this many rows at maximum. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: Iterable[pandas.DataFrame]: @@ -1613,7 +1670,9 @@ def to_pandas_batches( see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable """ return self._block.to_pandas_batches( - page_size=page_size, max_results=max_results + page_size=page_size, + max_results=max_results, + allow_large_results=allow_large_results, ) def _compute_dry_run(self) -> bigquery.QueryJob: @@ -1630,7 +1689,9 @@ def head(self, n: int = 5) -> DataFrame: def tail(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) - def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: + def peek( + self, n: int = 5, *, force: bool = True, allow_large_results=None + ) -> pandas.DataFrame: """ Preview n arbitrary rows from the dataframe. No guarantees about row selection or ordering. ``DataFrame.peek(force=False)`` will always be very fast, but will not succeed if data requires @@ -1643,17 +1704,22 @@ def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: force (bool, default True): If the data cannot be peeked efficiently, the dataframe will instead be fully materialized as part of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: pandas.DataFrame: A pandas DataFrame with n rows. Raises: ValueError: If force=False and data cannot be efficiently peeked. """ - maybe_result = self._block.try_peek(n) + maybe_result = self._block.try_peek(n, allow_large_results=allow_large_results) if maybe_result is None: if force: self._cached() - maybe_result = self._block.try_peek(n, force=True) + maybe_result = self._block.try_peek( + n, force=True, allow_large_results=allow_large_results + ) assert maybe_result is not None else: raise ValueError( @@ -3527,6 +3593,7 @@ def to_csv( *, header: bool = True, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. @@ -3534,7 +3601,7 @@ def to_csv( # query results? 
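A sketch of how the `allow_large_results` override threads through the materialization methods above; it assumes the session-level default is exposed as `bpd.options.bigquery.allow_large_results`, and the table name is illustrative:

```python
import bigframes.pandas as bpd

# Session-wide default; True preserves the previous large-results behavior.
bpd.options.bigquery.allow_large_results = True

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# Per-call override: skip the large-results destination table to reduce
# latency when the result is known to fit under the ~10 GB response limit.
preview = df.peek(5, allow_large_results=False)
local = df.head(1000).to_pandas(allow_large_results=False)

for batch in df.to_pandas_batches(page_size=500, allow_large_results=False):
    print(batch.shape)
```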
See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size if not utils.is_gcs_path(path_or_buf): - pd_df = self.to_pandas() + pd_df = self.to_pandas(allow_large_results=allow_large_results) return pd_df.to_csv(path_or_buf, sep=sep, header=header, index=index) if "*" not in path_or_buf: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3548,8 +3615,7 @@ def to_csv( "header": header, } query_job = self._session._executor.export_gcs( - export_array, - id_overrides, + export_array.rename_columns(id_overrides), path_or_buf, format="csv", export_options=options, @@ -3566,10 +3632,11 @@ def to_json( *, lines: bool = False, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? if not utils.is_gcs_path(path_or_buf): - pd_df = self.to_pandas() + pd_df = self.to_pandas(allow_large_results=allow_large_results) return pd_df.to_json( path_or_buf, orient=orient, @@ -3597,7 +3664,10 @@ def to_json( ordering_id=bigframes.session._io.bigquery.IO_ORDERING_ID, ) query_job = self._session._executor.export_gcs( - export_array, id_overrides, path_or_buf, format="json", export_options={} + export_array.rename_columns(id_overrides), + path_or_buf, + format="json", + export_options={}, ) self._set_internal_query_job(query_job) return None @@ -3672,10 +3742,10 @@ def to_gbq( default_project=default_project, ) ) + query_job = self._session._executor.export_gbq( - export_array, + export_array.rename_columns(id_overrides), destination=destination, - col_id_overrides=id_overrides, cluster_cols=clustering_fields, if_exists=if_exists, ) @@ -3701,9 +3771,17 @@ def to_gbq( return destination_table def to_numpy( - self, dtype=None, copy=False, na_value=None, **kwargs + self, + dtype=None, + copy=False, + na_value=None, + *, + allow_large_results=None, + **kwargs, ) -> numpy.ndarray: - return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) + return self.to_pandas(allow_large_results=allow_large_results).to_numpy( + dtype, copy, na_value, **kwargs + ) def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: if copy is False: @@ -3718,6 +3796,7 @@ def to_parquet( *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[bytes]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. @@ -3725,7 +3804,7 @@ def to_parquet( # query results? 
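The export branches above, seen from the caller's side; the bucket, project, and dataset names below are placeholders:

```python
import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# Local targets materialize through to_pandas(), so the new override applies.
df.to_csv("penguins.csv", index=False, allow_large_results=False)

# GCS targets are exported server-side; a "*" wildcard is required so the
# extract can be sharded into multiple files.
df.to_csv("gs://my-bucket/penguins/part-*.csv", index=False)
df.to_parquet("gs://my-bucket/penguins/part-*.parquet")

# to_gbq now renames columns on the array value itself (rename_columns)
# instead of passing col_id_overrides to the executor.
df.to_gbq("my-project.my_dataset.penguins_copy", if_exists="replace")
```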
See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size if not utils.is_gcs_path(path): - pd_df = self.to_pandas() + pd_df = self.to_pandas(allow_large_results=allow_large_results) return pd_df.to_parquet(path, compression=compression, index=index) if "*" not in path: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3742,8 +3821,7 @@ def to_parquet( ordering_id=bigframes.session._io.bigquery.IO_ORDERING_ID, ) query_job = self._session._executor.export_gcs( - export_array, - id_overrides, + export_array.rename_columns(id_overrides), path, format="parquet", export_options=export_options, @@ -3757,12 +3835,23 @@ def to_dict( "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> dict | list[dict]: - return self.to_pandas().to_dict(orient, into, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_dict(orient, into, **kwargs) # type: ignore - def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: - return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) + def to_excel( + self, + excel_writer, + sheet_name: str = "Sheet1", + *, + allow_large_results: Optional[bool] = None, + **kwargs, + ) -> None: + return self.to_pandas(allow_large_results=allow_large_results).to_excel( + excel_writer, sheet_name, **kwargs + ) def to_latex( self, @@ -3770,16 +3859,25 @@ def to_latex( columns: Sequence | None = None, header: bool | Sequence[str] = True, index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: - return self.to_pandas().to_latex( + return self.to_pandas(allow_large_results=allow_large_results).to_latex( buf, columns=columns, header=header, index=index, **kwargs # type: ignore ) def to_records( - self, index: bool = True, column_dtypes=None, index_dtypes=None + self, + index: bool = True, + column_dtypes=None, + index_dtypes=None, + *, + allow_large_results=None, ) -> numpy.recarray: - return self.to_pandas().to_records(index, column_dtypes, index_dtypes) + return self.to_pandas(allow_large_results=allow_large_results).to_records( + index, column_dtypes, index_dtypes + ) def to_string( self, @@ -3802,8 +3900,10 @@ def to_string( min_rows: int | None = None, max_colwidth: int | None = None, encoding: str | None = None, + *, + allow_large_results: Optional[bool] = None, ) -> str | None: - return self.to_pandas().to_string( + return self.to_pandas(allow_large_results=allow_large_results).to_string( buf, columns, # type: ignore col_space, @@ -3850,8 +3950,10 @@ def to_html( table_id: str | None = None, render_links: bool = False, encoding: str | None = None, + *, + allow_large_results: bool | None = None, ) -> str: - return self.to_pandas().to_html( + return self.to_pandas(allow_large_results=allow_large_results).to_html( buf, columns, # type: ignore col_space, @@ -3882,15 +3984,19 @@ def to_markdown( buf=None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: - return self.to_pandas().to_markdown(buf, mode, index, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode, index, **kwargs) # type: ignore - def to_pickle(self, path, **kwargs) -> None: - return self.to_pandas().to_pickle(path, **kwargs) + def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: + return 
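The same keyword-only override is threaded through the pandas-conversion wrappers above; a short sketch:

```python
import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# to_dict, to_excel, to_latex, to_records, to_string, to_html, to_markdown,
# to_pickle and to_orc all forward allow_large_results to to_pandas().
table_md = df.head(20).to_markdown(allow_large_results=False)
records = df.head(20).to_dict(orient="records", allow_large_results=False)
```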
self.to_pandas(allow_large_results=allow_large_results).to_pickle( + path, **kwargs + ) - def to_orc(self, path=None, **kwargs) -> bytes | None: - as_pandas = self.to_pandas() + def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | None: + as_pandas = self.to_pandas(allow_large_results=allow_large_results) # to_orc only works with default index as_pandas_default_index = as_pandas.reset_index() return as_pandas_default_index.to_orc(path, **kwargs) @@ -3970,7 +4076,9 @@ def _prepare_export( # the arbitrary unicode column labels feature in BigQuery, which is # currently (June 2023) in preview. id_overrides = { - col_id: col_label for col_id, col_label in zip(columns, column_labels) + col_id: col_label + for col_id, col_label in zip(columns, column_labels) + if (col_id != col_label) } if ordering_id is not None: @@ -3999,9 +4107,16 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): msg = "axis=1 scenario is in preview." warnings.warn(msg, category=bfe.PreviewWarning) - # Check if the function is a remote function - if not hasattr(func, "bigframes_remote_function"): - raise ValueError("For axis=1 a remote function must be used.") + # TODO(jialuo): Deprecate the "bigframes_remote_function" attribute. + # We have some tests using pre-defined remote_function that were + # defined based on "bigframes_remote_function" instead of + # "bigframes_bigquery_function". So we need to fix those pre-defined + # remote functions before deprecating the "bigframes_remote_function" + # attribute. Check if the function is a remote function. + if not hasattr(func, "bigframes_remote_function") and not hasattr( + func, "bigframes_bigquery_function" + ): + raise ValueError("For axis=1 a bigframes function must be used.") is_row_processor = getattr(func, "is_row_processor") if is_row_processor: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 54b621a0f8..5e9f1f108b 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -678,6 +678,12 @@ def convert_schema_field( pa_struct = pa.struct(fields) pa_type = pa.list_(pa_struct) if is_repeated else pa_struct return field.name, pd.ArrowDtype(pa_type) + elif ( + field.field_type == "INTEGER" + and field.description is not None + and field.description.endswith(TIMEDELTA_DESCRIPTION_TAG) + ): + return field.name, TIMEDELTA_DTYPE elif field.field_type in _TK_TO_BIGFRAMES: if is_repeated: pa_type = pa.list_( @@ -719,7 +725,9 @@ def convert_to_schema_field( ) if bigframes_dtype.pyarrow_dtype == pa.duration("us"): # Timedeltas are represented as integers in microseconds. - return google.cloud.bigquery.SchemaField(name, "INTEGER") + return google.cloud.bigquery.SchemaField( + name, "INTEGER", description=TIMEDELTA_DESCRIPTION_TAG + ) raise TypeError( f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" ) @@ -876,3 +884,6 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: "STRING", "ARRAY", } + + +TIMEDELTA_DESCRIPTION_TAG = "#microseconds" diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 3cb5f3665d..97e2da40a1 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -40,7 +40,10 @@ class PreviewWarning(Warning): class NullIndexPreviewWarning(PreviewWarning): - """Null index feature is in preview.""" + """Unused. Kept for backwards compatibility. + + Was used when null index feature was in preview. 
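What the `#microseconds` description tag above means in practice: timedelta columns round-trip through BigQuery as INTEGER microseconds. A sketch, assuming local timedelta64 columns load as the preview duration dtype; the destination table is a placeholder:

```python
import pandas as pd

import bigframes.pandas as bpd

pdf = pd.DataFrame({"gap": pd.to_timedelta(["1h", "36h", "2d"])})
df = bpd.read_pandas(pdf)

# On write, the column is stored as INTEGER microseconds and its description
# is tagged with "#microseconds" ...
df.to_gbq("my-project.my_dataset.gaps", if_exists="replace")

# ... so on read the tag is recognized and the duration dtype is restored.
restored = bpd.read_gbq("my-project.my_dataset.gaps")
restored.dtypes  # gap: duration[us][pyarrow]
```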
+ """ class NullIndexError(ValueError): @@ -48,7 +51,10 @@ class NullIndexError(ValueError): class OrderingModePartialPreviewWarning(PreviewWarning): - """Ordering mode 'partial' is in preview.""" + """Unused. Kept for backwards compatibility. + + Was used when ordering mode 'partial' was in preview. + """ class OrderRequiredError(ValueError): diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index f5001ff909..3e69563db6 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -21,8 +21,8 @@ import random import shutil import string -import sys import tempfile +import textwrap import types from typing import cast, Tuple, TYPE_CHECKING @@ -55,7 +55,7 @@ class FunctionClient: - # Wait time (in seconds) for an IAM binding to take effect after creation + # Wait time (in seconds) for an IAM binding to take effect after creation. _iam_wait_seconds = 120 # TODO(b/392707725): Convert all necessary parameters for cloud function @@ -63,31 +63,79 @@ class FunctionClient: def __init__( self, gcp_project_id, - cloud_function_region, - cloud_functions_client, bq_location, bq_dataset, bq_client, bq_connection_id, bq_connection_manager, - cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, + cloud_function_region=None, + cloud_functions_client=None, + cloud_function_service_account=None, + cloud_function_kms_key_name=None, + cloud_function_docker_repository=None, *, session: Session, ): self._gcp_project_id = gcp_project_id - self._cloud_function_region = cloud_function_region - self._cloud_functions_client = cloud_functions_client self._bq_location = bq_location self._bq_dataset = bq_dataset self._bq_client = bq_client self._bq_connection_id = bq_connection_id self._bq_connection_manager = bq_connection_manager + self._session = session + + # Optional attributes only for remote functions. + self._cloud_function_region = cloud_function_region + self._cloud_functions_client = cloud_functions_client self._cloud_function_service_account = cloud_function_service_account self._cloud_function_kms_key_name = cloud_function_kms_key_name self._cloud_function_docker_repository = cloud_function_docker_repository - self._session = session + + def _create_bq_connection(self) -> None: + if self._bq_connection_manager: + self._bq_connection_manager.create_bq_connection( + self._gcp_project_id, + self._bq_location, + self._bq_connection_id, + "run.invoker", + ) + + def _ensure_dataset_exists(self) -> None: + # Make sure the dataset exists, i.e. if it doesn't exist, go ahead and + # create it. + dataset = bigquery.Dataset( + bigquery.DatasetReference.from_string( + self._bq_dataset, default_project=self._gcp_project_id + ) + ) + dataset.location = self._bq_location + try: + # This check does not require bigquery.datasets.create IAM + # permission. So, if the data set already exists, then user can work + # without having that permission. + self._bq_client.get_dataset(dataset) + except google.api_core.exceptions.NotFound: + # This requires bigquery.datasets.create IAM permission. + self._bq_client.create_dataset(dataset, exists_ok=True) + + def _create_bq_function(self, create_function_ddl: str) -> None: + # TODO(swast): plumb through the original, user-facing api_name. 
+ _, query_job = bigframes.session._io.bigquery.start_query_with_client( + self._session.bqclient, + create_function_ddl, + job_config=bigquery.QueryJobConfig(), + ) + assert query_job is not None + logger.info(f"Created bigframes function {query_job.ddl_target_routine}") + + def _format_function_options(self, function_options: dict) -> str: + return ", ".join( + [ + f"{key}='{val}'" if isinstance(val, str) else f"{key}={val}" + for key, val in function_options.items() + if val is not None + ] + ) def create_bq_remote_function( self, @@ -101,13 +149,7 @@ def create_bq_remote_function( ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" - if self._bq_connection_manager: - self._bq_connection_manager.create_bq_connection( - self._gcp_project_id, - self._bq_location, - self._bq_connection_id, - "run.invoker", - ) + self._create_bq_connection() # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 @@ -128,12 +170,8 @@ def create_bq_remote_function( # bigframes specific metadata for the lack of a better option remote_function_options["description"] = metadata - remote_function_options_str = ", ".join( - [ - f"{key}='{val}'" if isinstance(val, str) else f"{key}={val}" - for key, val in remote_function_options.items() - if val is not None - ] + remote_function_options_str = self._format_function_options( + remote_function_options ) create_function_ddl = f""" @@ -144,31 +182,78 @@ def create_bq_remote_function( logger.info(f"Creating BQ remote function: {create_function_ddl}") - # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and - # create it - dataset = bigquery.Dataset( - bigquery.DatasetReference.from_string( - self._bq_dataset, default_project=self._gcp_project_id - ) - ) - dataset.location = self._bq_location - try: - # This check does not require bigquery.datasets.create IAM - # permission. So, if the data set already exists, then user can work - # without having that permission. - self._bq_client.get_dataset(dataset) - except google.api_core.exceptions.NotFound: - # This requires bigquery.datasets.create IAM permission - self._bq_client.create_dataset(dataset, exists_ok=True) + self._ensure_dataset_exists() + self._create_bq_function(create_function_ddl) - # TODO(swast): plumb through the original, user-facing api_name. - _, query_job = bigframes.session._io.bigquery.start_query_with_client( - self._session.bqclient, - create_function_ddl, - job_config=bigquery.QueryJobConfig(), + def provision_bq_managed_function( + self, + func, + input_types, + output_type, + name, + packages, + is_row_processor, + ): + """Create a BigQuery managed function.""" + import cloudpickle + + pickled = cloudpickle.dumps(func) + + # Create BQ managed function. + bq_function_args = [] + bq_function_return_type = output_type + + input_args = inspect.getargs(func.__code__).args + # We expect the input type annotations to be 1:1 with the input args. + for name_, type_ in zip(input_args, input_types): + bq_function_args.append(f"{name_} {type_}") + + managed_function_options = { + "runtime_version": _utils.get_python_version(), + "entry_point": "bigframes_handler", + } + + # Augment user package requirements with any internal package + # requirements. 
+ packages = _utils._get_updated_package_requirements(packages, is_row_processor) + if packages: + managed_function_options["packages"] = packages + managed_function_options_str = self._format_function_options( + managed_function_options ) - logger.info(f"Created remote function {query_job.ddl_target_routine}") + session_id = None if name else self._session.session_id + bq_function_name = name + if not bq_function_name: + # Compute a unique hash representing the user code. + function_hash = _utils._get_hash(func, packages) + bq_function_name = _utils.get_bigframes_function_name( + function_hash, + session_id, + ) + + persistent_func_id = ( + f"`{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}" + ) + create_function_ddl = textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION {persistent_func_id}({','.join(bq_function_args)}) + RETURNS {bq_function_return_type} + LANGUAGE python + OPTIONS ({managed_function_options_str}) + AS r''' + import cloudpickle + udf = cloudpickle.loads({pickled}) + def bigframes_handler(*args): + return udf(*args) + ''' + """ + ).strip() + + self._ensure_dataset_exists() + self._create_bq_function(create_function_ddl) + + return bq_function_name def get_cloud_function_fully_qualified_parent(self): "Get the fully qualilfied parent for a cloud function." @@ -262,9 +347,7 @@ def create_cloud_function( # TODO(shobs): Figure out how to achieve version compatibility, specially # when pickle (internally used by cloudpickle) guarantees that: # https://docs.python.org/3/library/pickle.html#:~:text=The%20pickle%20serialization%20format%20is,unique%20breaking%20change%20language%20boundary. - python_version = "python{}{}".format( - sys.version_info.major, sys.version_info.minor - ) + python_version = _utils.get_python_version(is_compat=True) # Determine an upload URL for user code upload_url_request = functions_v2.GenerateUploadUrlRequest( @@ -443,7 +526,7 @@ def provision_bq_remote_function( # Derive the name of the remote function remote_function_name = name if not remote_function_name: - remote_function_name = _utils.get_remote_function_name( + remote_function_name = _utils.get_bigframes_function_name( function_hash, self._session.session_id, uniq_suffix ) rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 93b5c4c596..20dcf45103 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -21,6 +21,7 @@ import threading from typing import ( Any, + Callable, cast, Dict, Literal, @@ -46,6 +47,9 @@ ) from bigframes import clients +import bigframes.core.compile.ibis_types +import bigframes.exceptions as bfe +import bigframes.series as bf_series if TYPE_CHECKING: from bigframes.session import Session @@ -54,6 +58,9 @@ from . import _function_client, _utils +# BQ managed functions (@udf) currently only support Python 3.11. +_MANAGED_FUNC_PYTHON_VERSIONS = ("python-3.11",) + class FunctionSession: """Session to manage bigframes functions.""" @@ -65,6 +72,123 @@ def __init__(self): # Lock to synchronize the update of the session artifacts self._artifacts_lock = threading.Lock() + def _resolve_session(self, session: Optional[Session]) -> Session: + """Resolves the BigFrames session.""" + import bigframes.pandas as bpd + import bigframes.session + + # Using the global session if none is provided. 
+ return cast(bigframes.session.Session, session or bpd.get_global_session()) + + def _resolve_bigquery_client( + self, session: Session, bigquery_client: Optional[bigquery.Client] + ) -> bigquery.Client: + """Resolves the BigQuery client.""" + if not bigquery_client: + bigquery_client = session.bqclient + if not bigquery_client: + raise ValueError( + "A bigquery client must be provided, either directly or via " + f"session. {constants.FEEDBACK_LINK}" + ) + return bigquery_client + + def _resolve_bigquery_connection_client( + self, + session: Session, + bigquery_connection_client: Optional[ + bigquery_connection_v1.ConnectionServiceClient + ], + ) -> bigquery_connection_v1.ConnectionServiceClient: + """Resolves the BigQuery connection client.""" + if not bigquery_connection_client: + bigquery_connection_client = session.bqconnectionclient + if not bigquery_connection_client: + raise ValueError( + "A bigquery connection client must be provided, either " + f"directly or via session. {constants.FEEDBACK_LINK}" + ) + return bigquery_connection_client + + def _resolve_resource_manager_client( + self, + session: Session, + resource_manager_client: Optional[resourcemanager_v3.ProjectsClient], + ) -> resourcemanager_v3.ProjectsClient: + """Resolves the resource manager client.""" + if not resource_manager_client: + resource_manager_client = session.resourcemanagerclient + if not resource_manager_client: + raise ValueError( + "A resource manager client must be provided, either directly " + f"or via session. {constants.FEEDBACK_LINK}" + ) + return resource_manager_client + + def _resolve_dataset_reference( + self, + session: Session, + bigquery_client: bigquery.Client, + dataset: Optional[str], + ) -> bigquery.DatasetReference: + """Resolves the dataset reference for the bigframes function.""" + if dataset: + dataset_ref = bigquery.DatasetReference.from_string( + dataset, default_project=bigquery_client.project + ) + else: + dataset_ref = session._anonymous_dataset + return dataset_ref + + def _resolve_cloud_functions_client( + self, + session: Session, + cloud_functions_client: Optional[functions_v2.FunctionServiceClient], + ) -> Optional[functions_v2.FunctionServiceClient]: + """Resolves the Cloud Functions client.""" + if not cloud_functions_client: + cloud_functions_client = session.cloudfunctionsclient + if not cloud_functions_client: + raise ValueError( + "A cloud functions client must be provided, either directly " + f"or via session. {constants.FEEDBACK_LINK}" + ) + return cloud_functions_client + + def _resolve_bigquery_connection_id( + self, + session: Session, + dataset_ref: bigquery.DatasetReference, + bq_location: str, + bigquery_connection: Optional[str] = None, + ) -> str: + """Resolves BigQuery connection id.""" + if not bigquery_connection: + bigquery_connection = session._bq_connection # type: ignore + + bigquery_connection = clients.resolve_full_bq_connection_name( + bigquery_connection, + default_project=dataset_ref.project, + default_location=bq_location, + ) + # Guaranteed to be the form of .. + ( + gcp_project_id, + bq_connection_location, + bq_connection_id, + ) = bigquery_connection.split(".") + if gcp_project_id.casefold() != dataset_ref.project.casefold(): + raise ValueError( + "The project_id does not match BigQuery connection " + f"gcp_project_id: {dataset_ref.project}." + ) + if bq_connection_location.casefold() != bq_location.casefold(): + raise ValueError( + "The location does not match BigQuery connection location: " + f"{bq_location}." 
+ ) + return bq_connection_id + def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): """Update function artifacts in the current session.""" with self._artifacts_lock: @@ -83,15 +207,27 @@ def clean_up( # deleted directly by the user bqclient.delete_routine(bqrf_routine, not_found_ok=True) - # Let's accept the possibility that the cloud function may have - # been deleted directly by the user - try: - gcfclient.delete_function(name=gcf_path) - except google.api_core.exceptions.NotFound: - pass + if gcf_path: + # Let's accept the possibility that the cloud function may + # have been deleted directly by the user + try: + gcfclient.delete_function(name=gcf_path) + except google.api_core.exceptions.NotFound: + pass self._temp_artifacts.clear() + def _try_delattr(self, func: Callable, attr: str) -> None: + """Attempts to delete an attribute from a bigframes function.""" + # In the unlikely case where the user is trying to re-deploy the same + # function, cleanup the attributes we add in bigframes functions, first. + # This prevents the pickle from having dependencies that might not + # otherwise be present such as ibis or pandas. + try: + delattr(func, attr) + except AttributeError: + pass + # Inspired by @udf decorator implemented in ibis-bigquery package # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py # which has moved as @js to the ibis package @@ -120,9 +256,9 @@ def remote_function( cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, cloud_function_memory_mib: Optional[int] = 1024, - cloud_function_ingress_settings: Literal[ - "all", "internal-only", "internal-and-gclb" - ] = "all", + cloud_function_ingress_settings: Optional[ + Literal["all", "internal-only", "internal-and-gclb"] + ] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -306,91 +442,42 @@ def remote_function( `all`, `internal-only`, `internal-and-gclb`. See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. """ - # Some defaults may be used from the session if not provided otherwise - import bigframes.exceptions as bfe - import bigframes.pandas as bpd - import bigframes.series as bf_series - import bigframes.session - - session = cast(bigframes.session.Session, session or bpd.get_global_session()) + # Some defaults may be used from the session if not provided otherwise. + session = self._resolve_session(session) - # A BigQuery client is required to perform BQ operations - if not bigquery_client: - bigquery_client = session.bqclient - if not bigquery_client: - raise ValueError( - "A bigquery client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) - - # A BigQuery connection client is required to perform BQ connection operations - if not bigquery_connection_client: - bigquery_connection_client = session.bqconnectionclient - if not bigquery_connection_client: - raise ValueError( - "A bigquery connection client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + # A BigQuery client is required to perform BQ operations. 
+ bigquery_client = self._resolve_bigquery_client(session, bigquery_client) - # A cloud functions client is required to perform cloud functions operations - if not cloud_functions_client: - cloud_functions_client = session.cloudfunctionsclient - if not cloud_functions_client: - raise ValueError( - "A cloud functions client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + # A BigQuery connection client is required for BQ connection operations. + bigquery_connection_client = self._resolve_bigquery_connection_client( + session, bigquery_connection_client + ) - # A resource manager client is required to get/set IAM operations - if not resource_manager_client: - resource_manager_client = session.resourcemanagerclient - if not resource_manager_client: - raise ValueError( - "A resource manager client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + # A resource manager client is required to get/set IAM operations. + resource_manager_client = self._resolve_resource_manager_client( + session, resource_manager_client + ) - # BQ remote function must be persisted, for which we need a dataset + # BQ remote function must be persisted, for which we need a dataset. # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. - if dataset: - dataset_ref = bigquery.DatasetReference.from_string( - dataset, default_project=bigquery_client.project - ) - else: - dataset_ref = session._anonymous_dataset + dataset_ref = self._resolve_dataset_reference(session, bigquery_client, dataset) + + # A cloud functions client is required for cloud functions operations. + cloud_functions_client = self._resolve_cloud_functions_client( + session, cloud_functions_client + ) bq_location, cloud_function_region = _utils.get_remote_function_locations( bigquery_client.location ) - # A connection is required for BQ remote function + # A connection is required for BQ remote function. # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function - if not bigquery_connection: - bigquery_connection = session._bq_connection # type: ignore - - bigquery_connection = clients.resolve_full_bq_connection_name( - bigquery_connection, - default_project=dataset_ref.project, - default_location=bq_location, + bq_connection_id = self._resolve_bigquery_connection_id( + session, dataset_ref, bq_location, bigquery_connection ) - # Guaranteed to be the form of .. - ( - gcp_project_id, - bq_connection_location, - bq_connection_id, - ) = bigquery_connection.split(".") - if gcp_project_id.casefold() != dataset_ref.project.casefold(): - raise ValueError( - "The project_id does not match BigQuery connection gcp_project_id: " - f"{dataset_ref.project}." - ) - if bq_connection_location.casefold() != bq_location.casefold(): - raise ValueError( - "The location does not match BigQuery connection location: " - f"{bq_location}." - ) - # If any CMEK is intended then check that a docker repository is also specified + # If any CMEK is intended then check that a docker repository is also specified. 
if ( cloud_function_kms_key_name is not None and cloud_function_docker_repository is None @@ -400,6 +487,16 @@ def remote_function( " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" ) + if cloud_function_ingress_settings is None: + cloud_function_ingress_settings = "all" + msg = ( + "The `cloud_function_ingress_settings` are set to 'all' by default, " + "which will change to 'internal-only' for enhanced security in future version 2.0 onwards. " + "However, you will be able to explicitly pass cloud_function_ingress_settings='all' if you need. " + "See https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings for details." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) + bq_connection_manager = session.bqconnectionmanager def wrapper(func): @@ -456,26 +553,26 @@ def wrapper(func): warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) # we will model the row as a json serialized string containing the data - # and the metadata representing the row + # and the metadata representing the row. input_types = [str] is_row_processor = True elif isinstance(input_types, type): input_types = [input_types] - # TODO(b/340898611): fix type error + # TODO(b/340898611): fix type error. ibis_signature = _utils.ibis_signature_from_python_signature( signature, input_types, output_type # type: ignore ) remote_function_client = _function_client.FunctionClient( dataset_ref.project, - cloud_function_region, - cloud_functions_client, bq_location, dataset_ref.dataset_id, bigquery_client, bq_connection_id, bq_connection_manager, + cloud_function_region, + cloud_functions_client, cloud_function_service_account, cloud_function_kms_key_name, cloud_function_docker_repository, @@ -484,29 +581,20 @@ def wrapper(func): # To respect the user code/environment let's use a copy of the # original udf, especially since we would be setting some properties - # on it + # on it. func = cloudpickle.loads(cloudpickle.dumps(func)) - # In the unlikely case where the user is trying to re-deploy the same - # function, cleanup the attributes we add below, first. This prevents - # the pickle from having dependencies that might not otherwise be - # present such as ibis or pandas. - def try_delattr(attr): - try: - delattr(func, attr) - except AttributeError: - pass - - try_delattr("bigframes_cloud_function") - try_delattr("bigframes_remote_function") - try_delattr("input_dtypes") - try_delattr("output_dtype") - try_delattr("bigframes_bigquery_function_output_dtype") - try_delattr("is_row_processor") - try_delattr("ibis_node") + self._try_delattr(func, "bigframes_cloud_function") + self._try_delattr(func, "bigframes_remote_function") + self._try_delattr(func, "bigframes_bigquery_function") + self._try_delattr(func, "bigframes_bigquery_function_output_dtype") + self._try_delattr(func, "input_dtypes") + self._try_delattr(func, "output_dtype") + self._try_delattr(func, "is_row_processor") + self._try_delattr(func, "ibis_node") # resolve the output type that can be supported in the bigframes, - # ibis, BQ remote functions and cloud functions integration + # ibis, BQ remote functions and cloud functions integration. ibis_output_type_for_bqrf = ibis_signature.output_type bqrf_metadata = None if isinstance(ibis_signature.output_type, ibis_dtypes.Array): @@ -560,7 +648,7 @@ def try_delattr(attr): ] ) - # TODO: Move ibis logic to compiler step + # TODO: Move ibis logic to compiler step. 
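Passing the ingress setting explicitly avoids the new FutureWarning about the `'all'` default; a sketch that relies on `remote_function`'s annotation-based typing, with placeholder resource names:

```python
import bigframes.pandas as bpd

@bpd.remote_function(cloud_function_ingress_settings="internal-only")
def half(x: float) -> float:
    return x / 2.0

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
df["body_mass_g"].apply(half)  # runs through the deployed remote function
```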
node = ibis_udf.scalar.builtin( func, name=rf_name, @@ -571,11 +659,12 @@ def try_delattr(attr): func.bigframes_cloud_function = ( remote_function_client.get_cloud_function_fully_qualified_name(cf_name) ) - func.bigframes_remote_function = ( + func.bigframes_bigquery_function = ( remote_function_client.get_remote_function_fully_qualilfied_name( rf_name ) ) + func.bigframes_remote_function = func.bigframes_bigquery_function func.input_dtypes = tuple( [ bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( @@ -612,3 +701,234 @@ def try_delattr(attr): return func return wrapper + + def udf( + self, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, + session: Optional[Session] = None, + bigquery_client: Optional[bigquery.Client] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, + name: Optional[str] = None, + packages: Optional[Sequence[str]] = None, + ): + """Decorator to turn a Python udf into a BigQuery managed function. + + .. note:: + Please have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + + Args: + input_types (type or sequence(type), Optional): + For scalar user defined function it should be the input type or + sequence of input types. The supported scalar input types are + `bool`, `bytes`, `float`, `int`, `str`. + output_type (type, Optional): + Data type of the output in the user defined function. If the + user defined function returns an array, then `list[type]` should + be specified. The supported output types are `bool`, `bytes`, + `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]` + and `list[str]`. + session (bigframes.Session, Optional): + BigQuery DataFrames session to use for getting default project, + dataset and BigQuery connection. + bigquery_client (google.cloud.bigquery.Client, Optional): + Client to use for BigQuery operations. If this param is not + provided, then bigquery client from the session would be used. + dataset (str, Optional): + Dataset in which to create a BigQuery managed function. It + should be in `.` or `` + format. If this parameter is not provided then session dataset + id is used. + bigquery_connection (str, Optional): + Name of the BigQuery connection. It is used to provide an + identity to the serverless instances running the user code. It + helps BigQuery manage and track the resources used by the udf. + name (str, Optional): + Explicit name of the persisted BigQuery managed function. Use it + with caution, because more than one users working in the same + project and dataset could overwrite each other's managed + functions if they use the same persistent name. When an explicit + name is provided, any session specific clean up ( + ``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. + packages (str[], Optional): + Explicit name of the external package dependencies. Each + dependency is added to the `requirements.txt` as is, and can be + of the form supported in + https://pip.pypa.io/en/stable/reference/requirements-file-format/. + """ + if not bigframes.options.experiments.udf: + raise NotImplementedError() + + # Check the Python version. 
+ python_version = _utils.get_python_version() + if python_version not in _MANAGED_FUNC_PYTHON_VERSIONS: + raise RuntimeError( + f"Python version {python_version} is not supported yet for " + "BigFrames managed function." + ) + + # Some defaults may be used from the session if not provided otherwise. + session = self._resolve_session(session) + + # A BigQuery client is required to perform BQ operations. + bigquery_client = self._resolve_bigquery_client(session, bigquery_client) + + # BQ managed function must be persisted, for which we need a dataset. + dataset_ref = self._resolve_dataset_reference(session, bigquery_client, dataset) + + bq_location, _ = _utils.get_remote_function_locations(bigquery_client.location) + + # A connection is required for BQ managed function. + bq_connection_id = self._resolve_bigquery_connection_id( + session, dataset_ref, bq_location, bigquery_connection + ) + + bq_connection_manager = session.bqconnectionmanager + + # TODO(b/399129906): Write a method for the repeated part in the wrapper + # for both managed function and remote function. + def wrapper(func): + nonlocal input_types, output_type + + if not callable(func): + raise TypeError("f must be callable, got {}".format(func)) + + # Managed function supports version >= 3.11. + signature_kwargs: Mapping[str, Any] = {"eval_str": True} + signature = inspect.signature(func, **signature_kwargs) + + # Try to get input types via type annotations. + if input_types is None: + input_types = [] + for parameter in signature.parameters.values(): + if (param_type := parameter.annotation) is inspect.Signature.empty: + raise ValueError( + "'input_types' was not set and parameter " + f"'{parameter.name}' is missing a type annotation. " + "Types are required to use managed function." + ) + input_types.append(param_type) + elif not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + + if output_type is None: + if ( + output_type := signature.return_annotation + ) is inspect.Signature.empty: + raise ValueError( + "'output_type' was not set and function is missing a " + "return type annotation. Types are required to use " + "managed function." + ) + + # The function will actually be receiving a pandas Series, but allow + # both BigQuery DataFrames and pandas object types for compatibility. + is_row_processor = False + if len(input_types) == 1 and ( + (input_type := input_types[0]) == bf_series.Series + or input_type == pandas.Series + ): + msg = "input_types=Series is in preview." + warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) + + # we will model the row as a json serialized string containing + # the data and the metadata representing the row. + input_types = [str] + is_row_processor = True + elif isinstance(input_types, type): + input_types = [input_types] + + # TODO(b/340898611): fix type error. 
+ ibis_signature = _utils.ibis_signature_from_python_signature( + signature, input_types, output_type # type: ignore + ) + + remote_function_client = _function_client.FunctionClient( + dataset_ref.project, + bq_location, + dataset_ref.dataset_id, + bigquery_client, + bq_connection_id, + bq_connection_manager, + session=session, # type: ignore + ) + + func = cloudpickle.loads(cloudpickle.dumps(func)) + + self._try_delattr(func, "bigframes_bigquery_function") + self._try_delattr(func, "input_dtypes") + self._try_delattr(func, "output_dtype") + self._try_delattr(func, "is_row_processor") + self._try_delattr(func, "ibis_node") + + bq_function_name = remote_function_client.provision_bq_managed_function( + func=func, + input_types=tuple( + third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) + for type_ in ibis_signature.input_types + if type_ is not None + ), + output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( + ibis_signature.output_type + ), + name=name, + packages=packages, + is_row_processor=is_row_processor, + ) + + # TODO(shobs): Find a better way to support udfs with param named + # "name". This causes an issue in the ibis compilation. + func.__signature__ = inspect.signature(func).replace( # type: ignore + parameters=[ + inspect.Parameter( + f"bigframes_{param.name}", + param.kind, + ) + for param in inspect.signature(func).parameters.values() + ] + ) + + # TODO: Move ibis logic to compiler step. + node = ibis_udf.scalar.builtin( + func, + name=bq_function_name, + catalog=dataset_ref.project, + database=dataset_ref.dataset_id, + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) # type: ignore + func.bigframes_bigquery_function = ( + remote_function_client.get_remote_function_fully_qualilfied_name( + bq_function_name + ) + ) + func.input_dtypes = tuple( + [ + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + input_type + ) + for input_type in ibis_signature.input_types + if input_type is not None + ] + ) + func.output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_signature.output_type + ) + ) + func.is_row_processor = is_row_processor + func.ibis_node = node + + if not name: + self._update_temp_artifacts(func.bigframes_bigquery_function, "") + + return func + + return wrapper diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index f1f8c97e7f..bd6bd920b8 100644 --- a/bigframes/functions/_utils.py +++ b/bigframes/functions/_utils.py @@ -16,6 +16,7 @@ import hashlib import inspect import json +import sys import typing from typing import cast, List, NamedTuple, Optional, Sequence, Set @@ -185,8 +186,8 @@ def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) -def get_remote_function_name(function_hash, session_id, uniq_suffix=None): - "Get a name for the remote function for the given user defined function." +def get_bigframes_function_name(function_hash, session_id, uniq_suffix=None): + "Get a name for the bigframes function for the given user defined function." 
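A sketch of the new managed-function path (`FunctionSession.udf` backed by `provision_bq_managed_function` above). It is gated behind the `experiments.udf` flag checked in the decorator, requires a local Python 3.11 runtime, and uses placeholder dataset/function names; the import path mirrors the `udf` wrapper added in `bigframes/functions/function.py`:

```python
import bigframes
import bigframes.pandas as bpd
from bigframes.functions import function as bff

bigframes.options.experiments.udf = True  # managed functions are experimental

# A scalar managed (Python) function; input/output types come from annotations.
@bff.udf(dataset="my_dataset", name="to_upper")
def to_upper(x: str) -> str:
    return x.upper()

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
df["species"].apply(to_upper)  # dispatches to the deployed BigQuery function
```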
parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function_hash] if uniq_suffix: parts.append(uniq_suffix) @@ -280,3 +281,12 @@ def get_bigframes_metadata(*, python_output_type: Optional[type] = None) -> str: ) return metadata_ser + + +def get_python_version(is_compat: bool = False) -> str: + # Cloud Run functions use the 'compat' format (e.g., python311, see more + # from https://cloud.google.com/functions/docs/runtime-support#python), + # while managed functions use the standard format (e.g., python-3.11). + major = sys.version_info.major + minor = sys.version_info.minor + return f"python{major}{minor}" if is_compat else f"python-{major}.{minor}" diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index c2809b96eb..392a209714 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -122,13 +122,22 @@ def get_routine_reference( def remote_function(*args, **kwargs): - remote_function_session = bff_session.FunctionSession() - return remote_function_session.remote_function(*args, **kwargs) + function_session = bff_session.FunctionSession() + return function_session.remote_function(*args, **kwargs) remote_function.__doc__ = bff_session.FunctionSession.remote_function.__doc__ +def udf(*args, **kwargs): + function_session = bff_session.FunctionSession() + return function_session.udf(*args, **kwargs) + + +udf.__doc__ = bff_session.FunctionSession.udf.__doc__ + + +# TODO(b/399894805): Support managed function. def read_gbq_function( function_name: str, *, diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index ce9a59f26a..44018b8c5c 100644 --- a/bigframes/geopandas/geoseries.py +++ b/bigframes/geopandas/geoseries.py @@ -68,6 +68,12 @@ def area(self, crs=None) -> bigframes.series.Series: # type: ignore f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
{constants.FEEDBACK_LINK}" ) + @property + def boundary(self) -> bigframes.series.Series: # type: ignore + series = self._apply_unary_op(ops.geo_st_boundary_op) + series.name = None + return series + @classmethod def from_wkt(cls, data, index=None) -> GeoSeries: series = bigframes.series.Series(data, index=index) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 7e6f1f793c..83cefbe6ba 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -39,6 +39,7 @@ ne_op, ) from bigframes.operations.date_ops import ( + date_diff_op, day_op, dayofweek_op, month_op, @@ -88,6 +89,7 @@ from bigframes.operations.geo_ops import ( geo_area_op, geo_st_astext_op, + geo_st_boundary_op, geo_st_geogfromtext_op, geo_st_geogpoint_op, geo_x_op, @@ -184,6 +186,8 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + date_add_op, + date_sub_op, timedelta_floor_op, timestamp_add_op, timestamp_sub_op, @@ -249,6 +253,7 @@ "upper_op", "ZfillOp", # Date ops + "date_diff_op", "day_op", "month_op", "year_op", @@ -260,6 +265,8 @@ "second_op", "normalize_op", # Timedelta ops + "date_add_op", + "date_sub_op", "timedelta_floor_op", "timestamp_add_op", "timestamp_sub_op", @@ -358,6 +365,7 @@ "manhattan_distance_op", # Geo ops "geo_area_op", + "geo_st_boundary_op", "geo_st_astext_op", "geo_st_geogfromtext_op", "geo_st_geogpoint_op", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e9d102b42d..a714f5804c 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -142,13 +142,16 @@ class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if not dtypes.is_numeric(input_types[0]): - raise TypeError(f"Type {input_types[0]} is not numeric") - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE - else: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + + if dtypes.is_numeric(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): + return dtypes.INT_DTYPE return input_types[0] + raise TypeError(f"Type {input_types[0]} is not numeric or timedelta") + @dataclasses.dataclass(frozen=True) class MedianOp(UnaryAggregateOp): @@ -171,6 +174,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) class QuantileOp(UnaryAggregateOp): q: float + should_floor_result: bool = False @property def name(self): @@ -181,6 +185,8 @@ def order_independent(self) -> bool: return True def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -224,7 +230,11 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -262,7 +272,12 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class 
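The new `GeoSeries.boundary` accessor in use; a sketch constructing the series from shapely geometries (the geometries themselves are arbitrary examples):

```python
from shapely.geometry import Point, Polygon

import bigframes.geopandas

s = bigframes.geopandas.GeoSeries(
    [Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]), Point(1, 1)]
)

# ST_BOUNDARY over GEOGRAPHY values: the polygon yields its outer ring as a
# linestring, the point yields an empty geography.
s.boundary
```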
StdOp(UnaryAggregateOp): name: ClassVar[str] = "std" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + return signatures.FixedOutputType( dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" ).output_type(input_types[0]) @@ -485,7 +500,7 @@ def skips_nulls(self): return False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if dtypes.is_datetime_like(input_types[0]): + if dtypes.is_date_like(input_types[0]): return dtypes.TIMEDELTA_DTYPE return super().output_type(*input_types) @@ -504,6 +519,20 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT raise TypeError(f"expect datetime-like types, but got {input_types[0]}") +@dataclasses.dataclass(frozen=True) +class DateSeriesDiffOp(UnaryWindowOp): + periods: int + + @property + def skips_nulls(self): + return False + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] == dtypes.DATE_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"expect date type, but got {input_types[0]}") + + @dataclasses.dataclass(frozen=True) class AllOp(UnaryAggregateOp): name: ClassVar[str] = "all" diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 24ff315ad5..88b34bf758 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -26,6 +26,9 @@ import bigframes.operations as ops import bigframes.series +FILE_FOLDER_REGEX = r"^.*\/(.*)$" +FILE_EXT_REGEX = r"(\.[0-9a-zA-Z]+$)" + class BlobAccessor(base.SeriesMethods): def __init__(self, *args, **kwargs): @@ -278,7 +281,9 @@ def image_blur( *, dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, - max_batching_rows: int = 10000, + max_batching_rows: int = 8192, + container_cpu: Union[float, int] = 0.33, + container_memory: str = "512Mi", ) -> bigframes.series.Series: """Blurs images. @@ -287,9 +292,15 @@ def image_blur( Args: ksize (tuple(int, int)): Kernel size. - dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. + dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of: + str: GCS folder str. The output filenames are the same as the input files. + blob Series: The output file paths are determined by the uris of the blob Series. + None: Output to BQ as bytes. + Encoding is determined by the extension of the output filenames (or input filenames if doesn't have output filenames). If filename doesn't have an extension, use ".jpeg" for encoding. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. - max_batching_rows (int, default 10,000): Max number of rows per batch send to cloud run to execute the function. + max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. + container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. 
Returns: BigFrames Blob Series @@ -300,38 +311,47 @@ def image_blur( df = self._get_runtime_json_str(mode="R").to_frame() if dst is None: + ext = self.uri().str.extract(FILE_EXT_REGEX) + image_blur_udf = blob_func.TransformFunction( blob_func.image_blur_to_bytes_def, session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() df["ksize_x"], df["ksize_y"] = ksize + df["ext"] = ext # type: ignore res = df.apply(image_blur_udf, axis=1) return res if isinstance(dst, str): dst = os.path.join(dst, "") - src_uri = bigframes.series.Series(self._block).struct.explode()["uri"] # Replace src folder with dst folder, keep the file names. - dst_uri = src_uri.str.replace(r"^.*\/(.*)$", rf"{dst}\1", regex=True) + dst_uri = self.uri().str.replace(FILE_FOLDER_REGEX, rf"{dst}\1", regex=True) dst = cast( bigframes.series.Series, dst_uri.str.to_blob(connection=connection) ) + ext = dst.blob.uri().str.extract(FILE_EXT_REGEX) + image_blur_udf = blob_func.TransformFunction( blob_func.image_blur_def, session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() dst_rt = dst.blob._get_runtime_json_str(mode="RW") df = df.join(dst_rt, how="outer") df["ksize_x"], df["ksize_y"] = ksize + df["ext"] = ext # type: ignore res = df.apply(image_blur_udf, axis=1) res.cache() # to execute the udf @@ -346,7 +366,9 @@ def image_resize( fy: float = 0.0, dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, - max_batching_rows: int = 10000, + max_batching_rows: int = 8192, + container_cpu: Union[float, int] = 0.33, + container_memory: str = "512Mi", ): """Resize images. @@ -357,9 +379,15 @@ def image_resize( dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size. fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size. fy (float, defalut 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size. - dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. + dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of: + str: GCS folder str. The output filenames are the same as the input files. + blob Series: The output file paths are determined by the uris of the blob Series. + None: Output to BQ as bytes. + Encoding is determined by the extension of the output filenames (or input filenames if doesn't have output filenames). If filename doesn't have an extension, use ".jpeg" for encoding. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. - max_batching_rows (int, default 10,000): Max number of rows per batch send to cloud run to execute the function. + max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. + container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. 
Returns: BigFrames Blob Series @@ -377,33 +405,41 @@ def image_resize( df = self._get_runtime_json_str(mode="R").to_frame() if dst is None: + ext = self.uri().str.extract(FILE_EXT_REGEX) + image_resize_udf = blob_func.TransformFunction( blob_func.image_resize_to_bytes_def, session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() df["dsize_x"], df["dsizye_y"] = dsize df["fx"], df["fy"] = fx, fy + df["ext"] = ext # type: ignore res = df.apply(image_resize_udf, axis=1) return res if isinstance(dst, str): dst = os.path.join(dst, "") - src_uri = bigframes.series.Series(self._block).struct.explode()["uri"] # Replace src folder with dst folder, keep the file names. - dst_uri = src_uri.str.replace(r"^.*\/(.*)$", rf"{dst}\1", regex=True) + dst_uri = self.uri().str.replace(FILE_FOLDER_REGEX, rf"{dst}\1", regex=True) dst = cast( bigframes.series.Series, dst_uri.str.to_blob(connection=connection) ) + ext = dst.blob.uri().str.extract(FILE_EXT_REGEX) + image_resize_udf = blob_func.TransformFunction( blob_func.image_resize_def, session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() dst_rt = dst.blob._get_runtime_json_str(mode="RW") @@ -411,6 +447,7 @@ def image_resize( df = df.join(dst_rt, how="outer") df["dsize_x"], df["dsizye_y"] = dsize df["fx"], df["fy"] = fx, fy + df["ext"] = ext # type: ignore res = df.apply(image_resize_udf, axis=1) res.cache() # to execute the udf @@ -425,7 +462,9 @@ def image_normalize( norm_type: str = "l2", dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, - max_batching_rows: int = 10000, + max_batching_rows: int = 8192, + container_cpu: Union[float, int] = 0.33, + container_memory: str = "512Mi", ) -> bigframes.series.Series: """Normalize images. @@ -436,9 +475,15 @@ def image_normalize( alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization. beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization. norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax". - dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. + dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of: + str: GCS folder str. The output filenames are the same as the input files. + blob Series: The output file paths are determined by the uris of the blob Series. + None: Output to BQ as bytes. + Encoding is determined by the extension of the output filenames (or input filenames if doesn't have output filenames). If filename doesn't have an extension, use ".jpeg" for encoding. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. - max_batching_rows (int, default 10,000): Max number of rows per batch send to cloud run to execute the function. + max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. + container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. 
+ container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. Returns: BigFrames Blob Series @@ -449,34 +494,42 @@ def image_normalize( df = self._get_runtime_json_str(mode="R").to_frame() if dst is None: + ext = self.uri().str.extract(FILE_EXT_REGEX) + image_normalize_udf = blob_func.TransformFunction( blob_func.image_normalize_to_bytes_def, session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() df["alpha"] = alpha df["beta"] = beta df["norm_type"] = norm_type + df["ext"] = ext # type: ignore res = df.apply(image_normalize_udf, axis=1) return res if isinstance(dst, str): dst = os.path.join(dst, "") - src_uri = bigframes.series.Series(self._block).struct.explode()["uri"] # Replace src folder with dst folder, keep the file names. - dst_uri = src_uri.str.replace(r"^.*\/(.*)$", rf"{dst}\1", regex=True) + dst_uri = self.uri().str.replace(FILE_FOLDER_REGEX, rf"{dst}\1", regex=True) dst = cast( bigframes.series.Series, dst_uri.str.to_blob(connection=connection) ) + ext = dst.blob.uri().str.extract(FILE_EXT_REGEX) + image_normalize_udf = blob_func.TransformFunction( blob_func.image_normalize_def, session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() dst_rt = dst.blob._get_runtime_json_str(mode="RW") @@ -485,6 +538,7 @@ def image_normalize( df["alpha"] = alpha df["beta"] = beta df["norm_type"] = norm_type + df["ext"] = ext # type: ignore res = df.apply(image_normalize_udf, axis=1) res.cache() # to execute the udf @@ -495,7 +549,9 @@ def pdf_extract( self, *, connection: Optional[str] = None, - max_batching_rows: int = 10000, + max_batching_rows: int = 8192, + container_cpu: Union[float, int] = 0.33, + container_memory: str = "512Mi", ) -> bigframes.series.Series: """Extracts and chunks text from PDF URLs and saves the text as arrays of string. @@ -508,8 +564,10 @@ def pdf_extract( connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. - max_batching_rows (int, default 10,000): Max number of rows per batch + max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. + container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. Returns: bigframes.series.Series: conatins all text from a pdf file @@ -524,6 +582,8 @@ def pdf_extract( session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() src_rt = self._get_runtime_json_str(mode="R") @@ -536,7 +596,9 @@ def pdf_chunk( connection: Optional[str] = None, chunk_size: int = 1000, overlap_size: int = 200, - max_batching_rows: int = 10000, + max_batching_rows: int = 8192, + container_cpu: Union[float, int] = 0.33, + container_memory: str = "512Mi", ) -> bigframes.series.Series: """Extracts and chunks text from PDF URLs and saves the text as arrays of strings. 
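The `container_cpu`, `container_memory` and `max_batching_rows` knobs documented above are shared across the blob accessor methods; a hedged sketch of tuning `pdf_extract`, where the GCS path, connection name and the `from_glob_path` entry point are placeholders or assumptions:

```python
import bigframes.pandas as bpd

# bpd.options.experiments.blob = True  # may be required while blob support is in Preview

# Hypothetical object table of PDFs built from a GCS glob.
pdfs = bpd.from_glob_path("gs://my-bucket/reports/*.pdf", name="pdf")

texts = pdfs["pdf"].blob.pdf_extract(
    connection="my-project.us.my-connection",  # placeholder BQ connection
    max_batching_rows=1024,   # rows per Cloud Run batch
    container_cpu=2,          # allowed range [0.33, 8]; floats > 1 are cast to integers
    container_memory="4Gi",   # "512Mi" up to "32Gi"
)
```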
@@ -554,8 +616,10 @@ def pdf_chunk( overlap_size (int, default 200): the number of overlapping characters between consective chunks. The helps to ensure context is perserved across chunk boundaries. - max_batching_rows (int, default 10,000): Max number of rows per batch + max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. + container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. Returns: bigframe.series.Series of array[str], where each string is a @@ -579,6 +643,8 @@ def pdf_chunk( session=self._block.session, connection=connection, max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ).udf() src_rt = self._get_runtime_json_str(mode="R") diff --git a/bigframes/operations/date_ops.py b/bigframes/operations/date_ops.py index 2b68a24caf..32d8eec118 100644 --- a/bigframes/operations/date_ops.py +++ b/bigframes/operations/date_ops.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import dataclasses +import typing + +from bigframes import dtypes from bigframes.operations import base_ops import bigframes.operations.type as op_typing @@ -39,3 +43,22 @@ name="quarter", type_signature=op_typing.DATELIKE_ACCESSOR, ) + + +@dataclasses.dataclass(frozen=True) +class DateDiffOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "date_diff" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is not input_types[1]: + raise TypeError( + f"two inputs have different types. 
left: {input_types[0]}, right: {input_types[1]}" + ) + + if input_types[0] != dtypes.DATE_DTYPE: + raise TypeError("expected date input") + + return dtypes.TIMEDELTA_DTYPE + + +date_diff_op = DateDiffOp() diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 04441957e7..9ef0983e24 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -16,20 +16,6 @@ from bigframes.operations import base_ops import bigframes.operations.type as op_typing -geo_x_op = base_ops.create_unary_op( - name="geo_x", - type_signature=op_typing.FixedOutputType( - dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" - ), -) - -geo_y_op = base_ops.create_unary_op( - name="geo_y", - type_signature=op_typing.FixedOutputType( - dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" - ), -) - geo_area_op = base_ops.create_unary_op( name="geo_area", type_signature=op_typing.FixedOutputType( @@ -37,7 +23,6 @@ ), ) - geo_st_astext_op = base_ops.create_unary_op( name="geo_st_astext", type_signature=op_typing.FixedOutputType( @@ -45,6 +30,12 @@ ), ) +geo_st_boundary_op = base_ops.create_unary_op( + name="geo_st_boundary", + type_signature=op_typing.FixedOutputType( + dtypes.is_geo_like, dtypes.GEO_DTYPE, description="geo-like" + ), +) geo_st_geogfromtext_op = base_ops.create_unary_op( name="geo_st_geogfromtext", @@ -53,6 +44,21 @@ ), ) + geo_st_geogpoint_op = base_ops.create_binary_op( name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() ) + +geo_x_op = base_ops.create_unary_op( + name="geo_x", + type_signature=op_typing.FixedOutputType( + dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" + ), +) + +geo_y_op = base_ops.create_unary_op( + name="geo_y", + type_signature=op_typing.FixedOutputType( + dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" + ), +) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index f5a290bde5..ae23aff707 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -123,12 +123,18 @@ def output_type(self, *input_types): # String addition return input_types[0] - # Timestamp addition. + # Temporal addition. 
if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: return left_type if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): return right_type + if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: + return dtypes.DATETIME_DTYPE + + if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.DATE_DTYPE: + return dtypes.DATETIME_DTYPE + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE @@ -155,9 +161,15 @@ def output_type(self, *input_types): if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type): return dtypes.TIMEDELTA_DTYPE + if left_type == dtypes.DATE_DTYPE and right_type == dtypes.DATE_DTYPE: + return dtypes.TIMEDELTA_DTYPE + if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: return left_type + if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: + return dtypes.DATETIME_DTYPE + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 364154f728..b831e3f864 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -79,6 +79,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT timestamp_add_op = TimestampAddOp() +@dataclasses.dataclass(frozen=True) class TimestampSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" @@ -96,3 +97,49 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT timestamp_sub_op = TimestampSubOp() + + +@dataclasses.dataclass(frozen=True) +class DateAddOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "date_add" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # date + timedelta => timestamp without timezone + if ( + input_types[0] == dtypes.DATE_DTYPE + and input_types[1] == dtypes.TIMEDELTA_DTYPE + ): + return dtypes.DATETIME_DTYPE + # timedelta + date => timestamp without timezone + if ( + input_types[0] == dtypes.TIMEDELTA_DTYPE + and input_types[1] == dtypes.DATE_DTYPE + ): + return dtypes.DATETIME_DTYPE + + raise TypeError( + f"unsupported types for date_add. left: {input_types[0]} right: {input_types[1]}" + ) + + +date_add_op = DateAddOp() + + +@dataclasses.dataclass(frozen=True) +class DateSubOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "date_sub" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # date - timedelta => timestamp without timezone + if ( + input_types[0] == dtypes.DATE_DTYPE + and input_types[1] == dtypes.TIMEDELTA_DTYPE + ): + return dtypes.DATETIME_DTYPE + + raise TypeError( + f"unsupported types for date_sub. 
left: {input_types[0]} right: {input_types[1]}" + ) + + +date_sub_op = DateSubOp() diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 93c08a22aa..8ea7e6c320 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -80,9 +80,9 @@ def remote_function( cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, cloud_function_memory_mib: Optional[int] = 1024, - cloud_function_ingress_settings: Literal[ - "all", "internal-only", "internal-and-gclb" - ] = "all", + cloud_function_ingress_settings: Optional[ + Literal["all", "internal-only", "internal-and-gclb"] + ] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -108,6 +108,29 @@ def remote_function( remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function) +def udf( + *, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, + name: Optional[str] = None, + packages: Optional[Sequence[str]] = None, +): + return global_session.with_default_session( + bigframes.session.Session.udf, + input_types=input_types, + output_type=output_type, + dataset=dataset, + bigquery_connection=bigquery_connection, + name=name, + packages=packages, + ) + + +udf.__doc__ = inspect.getdoc(bigframes.session.Session.udf) + + @typing.overload def to_datetime( arg: Union[ diff --git a/bigframes/series.py b/bigframes/series.py index 5a84dee32f..2c37913679 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -381,6 +381,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> pandas.Series: """Writes Series to pandas Series. @@ -403,6 +404,9 @@ def to_pandas( ordered (bool, default True): Determines whether the resulting pandas series will be ordered. In some cases, unordered may result in a faster-executing query. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: @@ -414,8 +418,10 @@ def to_pandas( sampling_method=sampling_method, random_state=random_state, ordered=ordered, + allow_large_results=allow_large_results, ) - self._set_internal_query_job(query_job) + if query_job: + self._set_internal_query_job(query_job) series = df.squeeze(axis=1) series.name = self._name return series @@ -685,7 +691,9 @@ def head(self, n: int = 5) -> Series: def tail(self, n: int = 5) -> Series: return typing.cast(Series, self.iloc[-n:]) - def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series: + def peek( + self, n: int = 5, *, force: bool = True, allow_large_results=None + ) -> pandas.Series: """ Preview n arbitrary elements from the series without guarantees about row selection or ordering. @@ -699,17 +707,22 @@ def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series: force (bool, default True): If the data cannot be peeked efficiently, the series will instead be fully materialized as part of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: pandas.Series: A pandas Series with n rows. Raises: ValueError: If force=False and data cannot be efficiently peeked. 
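With the `date_add`/`date_sub` ops defined earlier in this change, date minus date yields a timedelta, while date plus or minus a timedelta yields a timezone-naive DATETIME. A small sketch, assuming date32-backed inputs and that a Python `timedelta` scalar is accepted as a literal:

```python
import datetime as dt

import pandas as pd
import pyarrow as pa

import bigframes.pandas as bpd

date_dtype = pd.ArrowDtype(pa.date32())  # BigQuery DATE
start = bpd.Series(pd.Series([dt.date(2025, 1, 1), dt.date(2025, 2, 1)], dtype=date_dtype))
end = bpd.Series(pd.Series([dt.date(2025, 1, 15), dt.date(2025, 3, 1)], dtype=date_dtype))

elapsed = end - start                      # DATE - DATE      -> timedelta
deadline = start + dt.timedelta(days=30)   # DATE + timedelta -> DATETIME (no timezone)
```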
""" - maybe_result = self._block.try_peek(n) + maybe_result = self._block.try_peek(n, allow_large_results=allow_large_results) if maybe_result is None: if force: self._cached() - maybe_result = self._block.try_peek(n, force=True) + maybe_result = self._block.try_peek( + n, force=True, allow_large_results=allow_large_results + ) assert maybe_result is not None else: raise ValueError( @@ -1520,10 +1533,18 @@ def apply( "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." ) - if not hasattr(func, "bigframes_remote_function"): - # It is not a remote function + # TODO(jialuo): Deprecate the "bigframes_remote_function" attribute. + # We have some tests using pre-defined remote_function that were defined + # based on "bigframes_remote_function" instead of + # "bigframes_bigquery_function". So we need to fix those pre-defined + # remote functions before deprecating the "bigframes_remote_function" + # attribute. + if not hasattr(func, "bigframes_remote_function") and not hasattr( + func, "bigframes_bigquery_function" + ): + # It is neither a remote function nor a managed function. # Then it must be a vectorized function that applies to the Series - # as a whole + # as a whole. if by_row: raise ValueError( "A vectorized non-remote function can be provided only with by_row=False." @@ -1572,7 +1593,9 @@ def combine( "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." ) - if not hasattr(func, "bigframes_remote_function"): + if not hasattr(func, "bigframes_remote_function") and not hasattr( + func, "bigframes_bigquery_function" + ): # Keep this in sync with .apply try: return func(self, other) @@ -1754,22 +1777,36 @@ def to_csv( *, header: bool = True, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: if utils.is_gcs_path(path_or_buf): return self.to_frame().to_csv( - path_or_buf, sep=sep, header=header, index=index + path_or_buf, + sep=sep, + header=header, + index=index, + allow_large_results=allow_large_results, ) else: - pd_series = self.to_pandas() + pd_series = self.to_pandas(allow_large_results=allow_large_results) return pd_series.to_csv( path_or_buf=path_or_buf, sep=sep, header=header, index=index ) - def to_dict(self, into: type[dict] = dict) -> typing.Mapping: - return typing.cast(dict, self.to_pandas().to_dict(into)) # type: ignore + def to_dict( + self, + into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, + ) -> typing.Mapping: + return typing.cast(dict, self.to_pandas(allow_large_results=allow_large_results).to_dict(into)) # type: ignore - def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None: - return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) + def to_excel( + self, excel_writer, sheet_name="Sheet1", *, allow_large_results=None, **kwargs + ) -> None: + return self.to_pandas(allow_large_results=allow_large_results).to_excel( + excel_writer, sheet_name, **kwargs + ) def to_json( self, @@ -1780,26 +1817,42 @@ def to_json( *, lines: bool = False, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: if utils.is_gcs_path(path_or_buf): return self.to_frame().to_json( - path_or_buf=path_or_buf, orient=orient, lines=lines, index=index + path_or_buf=path_or_buf, + orient=orient, + lines=lines, + index=index, + allow_large_results=allow_large_results, ) else: - pd_series = self.to_pandas() + pd_series = 
self.to_pandas(allow_large_results=allow_large_results) return pd_series.to_json( path_or_buf=path_or_buf, orient=orient, lines=lines, index=index # type: ignore ) def to_latex( - self, buf=None, columns=None, header=True, index=True, **kwargs + self, + buf=None, + columns=None, + header=True, + index=True, + *, + allow_large_results=None, + **kwargs, ) -> typing.Optional[str]: - return self.to_pandas().to_latex( + return self.to_pandas(allow_large_results=allow_large_results).to_latex( buf, columns=columns, header=header, index=index, **kwargs ) - def tolist(self) -> _list: - return self.to_pandas().to_list() + def tolist( + self, + *, + allow_large_results: Optional[bool] = None, + ) -> _list: + return self.to_pandas(allow_large_results=allow_large_results).to_list() to_list = tolist to_list.__doc__ = inspect.getdoc(vendored_pandas_series.Series.tolist) @@ -1809,14 +1862,24 @@ def to_markdown( buf: typing.IO[str] | None = None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> typing.Optional[str]: - return self.to_pandas().to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore def to_numpy( - self, dtype=None, copy=False, na_value=None, **kwargs + self, + dtype=None, + copy=False, + na_value=None, + *, + allow_large_results=None, + **kwargs, ) -> numpy.ndarray: - return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) + return self.to_pandas(allow_large_results=allow_large_results).to_numpy( + dtype, copy, na_value, **kwargs + ) def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: if copy is False: @@ -1825,8 +1888,10 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) - def to_pickle(self, path, **kwargs) -> None: - return self.to_pandas().to_pickle(path, **kwargs) + def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: + return self.to_pandas(allow_large_results=allow_large_results).to_pickle( + path, **kwargs + ) def to_string( self, @@ -1840,8 +1905,10 @@ def to_string( name=False, max_rows=None, min_rows=None, + *, + allow_large_results=None, ) -> typing.Optional[str]: - return self.to_pandas().to_string( + return self.to_pandas(allow_large_results=allow_large_results).to_string( buf, na_rep, float_format, @@ -1854,8 +1921,12 @@ def to_string( min_rows, ) - def to_xarray(self): - return self.to_pandas().to_xarray() + def to_xarray( + self, + *, + allow_large_results: Optional[bool] = None, + ): + return self.to_pandas(allow_large_results=allow_large_results).to_xarray() def _throw_if_index_contains_duplicates( self, error_message: typing.Optional[str] = None diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c8c44be40b..13e49fca42 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -234,10 +234,6 @@ def __init__( # Whether this session treats objects as totally ordered. # Will expose as feature later, only False for internal testing self._strictly_ordered: bool = context.ordering_mode != "partial" - if not self._strictly_ordered: - msg = "Partial ordering mode is a preview feature and is subject to change." 
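The per-call `allow_large_results` override threaded through the I/O methods above trades the 10 GB large-results path for lower latency on small pulls; a short sketch against a public table:

```python
import bigframes.pandas as bpd

names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")["name"]

# Skip the large-results destination table when the output is known to be small.
sample = names.peek(20, allow_large_results=False)
local = names.to_pandas(allow_large_results=False)
as_list = names.head(100).tolist(allow_large_results=False)
```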
- warnings.warn(msg, bfe.OrderingModePartialPreviewWarning) - self._allow_ambiguity = not self._strictly_ordered self._default_index_type = ( bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 @@ -348,11 +344,25 @@ def _project(self): @property def bytes_processed_sum(self): """The sum of all bytes processed by bigquery jobs using this session.""" + warnings.warn( + "Queries executed with `allow_large_results=False` within the session will not " + "have their bytes processed counted in this sum. If you need precise " + "bytes processed information, query the `INFORMATION_SCHEMA` tables " + "to get relevant metrics.", + UserWarning, + ) return self._metrics.bytes_processed @property def slot_millis_sum(self): """The sum of all slot time used by bigquery jobs in this session.""" + warnings.warn( + "Queries executed with `allow_large_results=False` within the session will not " + "have their slot milliseconds counted in this sum. If you need precise slot " + "milliseconds information, query the `INFORMATION_SCHEMA` tables " + "to get relevant metrics.", + UserWarning, + ) return self._metrics.slot_millis @property @@ -1203,9 +1213,9 @@ def remote_function( cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, cloud_function_memory_mib: Optional[int] = 1024, - cloud_function_ingress_settings: Literal[ - "all", "internal-only", "internal-and-gclb" - ] = "all", + cloud_function_ingress_settings: Optional[ + Literal["all", "internal-only", "internal-and-gclb"] + ] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1215,6 +1225,14 @@ def remote_function( supports dataframe with column types ``Int64``/``Float64``/``boolean``/ ``string``/``binary[pyarrow]``. + .. warning:: + To use remote functions with Bigframes 2.0 and onwards, please (preferred) + set an explicit user-managed ``cloud_function_service_account`` or (discouraged) + set ``cloud_function_service_account`` to use the Compute Engine service account + by setting it to `"default"`. + + See, https://cloud.google.com/functions/docs/securing/function-identity. + .. note:: Please make sure following is setup before using this API: @@ -1369,8 +1387,9 @@ def remote_function( https://cloud.google.com/functions/docs/configuring/memory. cloud_function_ingress_settings (str, Optional): Ingress settings controls dictating what traffic can reach the - function. By default `all` will be used. It must be one of: - `all`, `internal-only`, `internal-and-gclb`. See for more details + function. Options are: `all`, `internal-only`, or `internal-and-gclb`. + If no setting is provided, `all` will be used by default and a warning + will be issued. See for more details https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings. Returns: collections.abc.Callable: @@ -1402,6 +1421,83 @@ def remote_function( cloud_function_ingress_settings=cloud_function_ingress_settings, ) + def udf( + self, + *, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, + name: Optional[str] = None, + packages: Optional[Sequence[str]] = None, + ): + """Decorator to turn a Python udf into a BigQuery managed function. + + .. 
note:: + Please have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + + Args: + input_types (type or sequence(type), Optional): + For scalar user defined function it should be the input type or + sequence of input types. The supported scalar input types are + `bool`, `bytes`, `float`, `int`, `str`. + output_type (type, Optional): + Data type of the output in the user defined function. If the + user defined function returns an array, then `list[type]` should + be specified. The supported output types are `bool`, `bytes`, + `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]` + and `list[str]`. + dataset (str, Optional): + Dataset in which to create a BigQuery managed function. It + should be in `.` or `` + format. If this parameter is not provided then session dataset + id is used. + bigquery_connection (str, Optional): + Name of the BigQuery connection. You should either have the + connection already created in the `location` you have chosen, or + you should have the Project IAM Admin role to enable the service + to create the connection for you if you need it. If this + parameter is not provided then the BigQuery connection from the + session is used. + name (str, Optional): + Explicit name of the persisted BigQuery managed function. Use it + with caution, because more than one users working in the same + project and dataset could overwrite each other's managed + functions if they use the same persistent name. When an explicit + name is provided, any session specific clean up ( + ``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. + packages (str[], Optional): + Explicit name of the external package dependencies. Each + dependency is added to the `requirements.txt` as is, and can be + of the form supported in + https://pip.pypa.io/en/stable/reference/requirements-file-format/. + Returns: + collections.abc.Callable: + A managed function object pointing to the cloud assets created + in the background to support the remote execution. The cloud + ssets can be located through the following properties set in the + object: + + `bigframes_bigquery_function` - The bigquery managed function + deployed for the user defined code. 
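A hedged sketch of the managed-function decorator described above; the dataset and connection names are placeholders, and inferring the BigQuery signature from the Python type annotations (rather than passing `input_types`/`output_type`) is an assumption:

```python
import bigframes.pandas as bpd

@bpd.udf(
    dataset="my_dataset",                 # placeholder dataset
    bigquery_connection="my-connection",  # placeholder connection
)
def mask(value: str) -> str:
    return value[:2] + "***"

df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")
masked = df["name"].apply(mask)  # apply() now also accepts managed functions
```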
+ """ + return self._function_session.udf( + input_types, + output_type, + session=self, + dataset=dataset, + bigquery_connection=bigquery_connection, + name=name, + packages=packages, + ) + def read_gbq_function( self, function_name: str, @@ -1593,11 +1689,13 @@ def _start_query_ml_ddl( # so we must reset any encryption set in the job config # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model job_config.destination_encryption_configuration = None - - return bf_io_bigquery.start_query_with_client( + iterator, query_job = bf_io_bigquery.start_query_with_client( self.bqclient, sql, job_config=job_config, metrics=self._metrics ) + assert query_job is not None + return iterator, query_job + def _create_object_table(self, path: str, connection: str) -> str: """Create a random id Object Table from the input path and connection.""" table = str(self._loader._storage_manager._random_table()) diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 8fcc36b4d3..94cab7cbf6 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -228,7 +228,9 @@ def start_query_with_client( timeout: Optional[float] = None, api_name: Optional[str] = None, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, -) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + *, + query_with_job: bool = True, +) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: """ Starts query job and waits for results. """ @@ -236,6 +238,18 @@ def start_query_with_client( # Note: Ensure no additional labels are added to job_config after this point, # as `add_and_trim_labels` ensures the label count does not exceed 64. add_and_trim_labels(job_config, api_name=api_name) + if not query_with_job: + results_iterator = bq_client.query_and_wait( + sql, + job_config=job_config, + location=location, + project=project, + api_timeout=timeout, + ) + if metrics is not None: + metrics.count_job_stats() + return results_iterator, None + query_job = bq_client.query( sql, job_config=job_config, @@ -338,6 +352,7 @@ def create_bq_dataset_reference( # to the dataset, no BigQuery Session required. Note: there is a # different anonymous dataset per location. 
See: # https://cloud.google.com/bigquery/docs/cached-results#how_cached_results_are_stored + assert query_job is not None query_destination = query_job.destination return bigquery.DatasetReference( query_destination.project, diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 1b487d0277..fd8f387c3d 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -17,6 +17,7 @@ import os import typing from typing import Optional +import warnings import google.api_core.client_info import google.api_core.client_options @@ -94,8 +95,22 @@ def __init__( else _APPLICATION_NAME ) self._project = project + + if ( + use_regional_endpoints + and location is not None + and location.lower() + not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS + ): + warnings.warn( + bigframes.constants.LEP_DEPRECATION_WARNING_MESSAGE.format( + location=location + ), + category=FutureWarning, + ) self._location = location self._use_regional_endpoints = use_regional_endpoints + self._credentials = credentials self._bq_kms_key_name = bq_kms_key_name self._client_endpoints_override = client_endpoints_override @@ -117,20 +132,22 @@ def __init__( def _create_bigquery_client(self): bq_options = None - if self._use_regional_endpoints: - bq_options = google.api_core.client_options.ClientOptions( - api_endpoint=( - _BIGQUERY_REGIONAL_ENDPOINT - if self._location is not None - and self._location.lower() - in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - else _BIGQUERY_LOCATIONAL_ENDPOINT - ).format(location=self._location), - ) if "bqclient" in self._client_endpoints_override: bq_options = google.api_core.client_options.ClientOptions( api_endpoint=self._client_endpoints_override["bqclient"] ) + elif self._use_regional_endpoints: + endpoint_template = _BIGQUERY_REGIONAL_ENDPOINT + if ( + self._location is not None + and self._location.lower() + not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS + ): + endpoint_template = _BIGQUERY_LOCATIONAL_ENDPOINT + + bq_options = google.api_core.client_options.ClientOptions( + api_endpoint=endpoint_template.format(location=self._location) + ) bq_info = google.api_core.client_info.ClientInfo( user_agent=self._application_name @@ -172,16 +189,16 @@ def bqclient(self): def bqconnectionclient(self): if not self._bqconnectionclient: bqconnection_options = None - if self._use_regional_endpoints: + if "bqconnectionclient" in self._client_endpoints_override: + bqconnection_options = google.api_core.client_options.ClientOptions( + api_endpoint=self._client_endpoints_override["bqconnectionclient"] + ) + elif self._use_regional_endpoints: bqconnection_options = google.api_core.client_options.ClientOptions( api_endpoint=_BIGQUERYCONNECTION_LOCATIONAL_ENDPOINT.format( location=self._location ) ) - if "bqconnectionclient" in self._client_endpoints_override: - bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=self._client_endpoints_override["bqconnectionclient"] - ) bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name @@ -200,21 +217,23 @@ def bqconnectionclient(self): def bqstoragereadclient(self): if not self._bqstoragereadclient: bqstorage_options = None - if self._use_regional_endpoints: + if "bqstoragereadclient" in self._client_endpoints_override: bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=( - _BIGQUERYSTORAGE_REGIONAL_ENDPOINT - if self._location is not None - and self._location.lower() - in 
bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS - else _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT - ).format(location=self._location), + api_endpoint=self._client_endpoints_override["bqstoragereadclient"] ) + elif self._use_regional_endpoints: + endpoint_template = _BIGQUERYSTORAGE_REGIONAL_ENDPOINT + if ( + self._location is not None + and self._location.lower() + not in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS + ): + endpoint_template = _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT - if "bqstoragereadclient" in self._client_endpoints_override: bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=self._client_endpoints_override["bqstoragereadclient"] + api_endpoint=endpoint_template.format(location=self._location) ) + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 502692929d..22d1c1dcea 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -33,7 +33,7 @@ import weakref import google.api_core.exceptions -import google.cloud.bigquery as bigquery +from google.cloud import bigquery import google.cloud.bigquery.job as bq_job import google.cloud.bigquery.table as bq_table import google.cloud.bigquery_storage_v1 @@ -47,6 +47,7 @@ import bigframes.core.ordering as order import bigframes.core.schema import bigframes.core.tree_properties as tree_properties +import bigframes.dtypes import bigframes.features import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics @@ -60,6 +61,7 @@ _MAX_CLUSTER_COLUMNS = 4 # TODO: b/338258028 Enable pruning to reduce text size. ENABLE_PRUNING = False +MAX_SMALL_RESULT_BYTES = 10 * 1024 * 1024 * 1024 # 10G @dataclasses.dataclass(frozen=True) @@ -89,7 +91,6 @@ def to_sql( self, array_value: bigframes.core.ArrayValue, offset_column: Optional[str] = None, - col_id_overrides: Mapping[str, str] = {}, ordered: bool = False, enable_cache: bool = True, ) -> str: @@ -103,9 +104,7 @@ def execute( array_value: bigframes.core.ArrayValue, *, ordered: bool = True, - col_id_overrides: Mapping[str, str] = {}, - use_explicit_destination: bool = False, - get_size_bytes: bool = False, + use_explicit_destination: Optional[bool] = False, page_size: Optional[int] = None, max_results: Optional[int] = None, ): @@ -117,7 +116,6 @@ def execute( def export_gbq( self, array_value: bigframes.core.ArrayValue, - col_id_overrides: Mapping[str, str], destination: bigquery.TableReference, if_exists: Literal["fail", "replace", "append"] = "fail", cluster_cols: Sequence[str] = [], @@ -130,7 +128,6 @@ def export_gbq( def export_gcs( self, array_value: bigframes.core.ArrayValue, - col_id_overrides: Mapping[str, str], uri: str, format: Literal["json", "csv", "parquet"], export_options: Mapping[str, Union[bool, str]], @@ -154,6 +151,7 @@ def peek( self, array_value: bigframes.core.ArrayValue, n_rows: int, + use_explicit_destination: Optional[bool] = False, ) -> ExecuteResult: """ A 'peek' efficiently accesses a small number of rows in the dataframe. 
@@ -218,48 +216,39 @@ def to_sql( self, array_value: bigframes.core.ArrayValue, offset_column: Optional[str] = None, - col_id_overrides: Mapping[str, str] = {}, ordered: bool = False, enable_cache: bool = True, ) -> str: if offset_column: array_value, internal_offset_col = array_value.promote_offsets() - col_id_overrides = dict(col_id_overrides) - col_id_overrides[internal_offset_col] = offset_column node = ( self.replace_cached_subtrees(array_value.node) if enable_cache else array_value.node ) - if ordered: - return self.compiler.compile_ordered( - node, col_id_overrides=col_id_overrides - ) - return self.compiler.compile_unordered(node, col_id_overrides=col_id_overrides) + return self.compiler.compile(node, ordered=ordered) def execute( self, array_value: bigframes.core.ArrayValue, *, ordered: bool = True, - col_id_overrides: Mapping[str, str] = {}, - use_explicit_destination: bool = False, - get_size_bytes: bool = False, + use_explicit_destination: Optional[bool] = None, page_size: Optional[int] = None, max_results: Optional[int] = None, ): + if use_explicit_destination is None: + use_explicit_destination = bigframes.options.bigquery.allow_large_results + if bigframes.options.compute.enable_multi_query_execution: self._simplify_with_caching(array_value) - sql = self.to_sql( - array_value, ordered=ordered, col_id_overrides=col_id_overrides - ) - adjusted_schema = array_value.schema.rename(col_id_overrides) + sql = self.to_sql(array_value, ordered=ordered) job_config = bigquery.QueryJobConfig() # Use explicit destination to avoid 10GB limit of temporary table if use_explicit_destination: destination_table = self.storage_manager.create_temp_table( - adjusted_schema.to_bigquery(), cluster_cols=[] + array_value.schema.to_bigquery(), cluster_cols=[] ) job_config.destination = destination_table # TODO(swast): plumb through the api_name of the user-facing api that @@ -269,25 +258,34 @@ def execute( job_config=job_config, page_size=page_size, max_results=max_results, + query_with_job=use_explicit_destination, ) # Though we provide the read client, iterator may or may not use it based on what is efficient for the result def iterator_supplier(): return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient) - if get_size_bytes is True: + if query_job: size_bytes = self.bqclient.get_table(query_job.destination).num_bytes else: size_bytes = None + if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES: + warnings.warn( + "The query result size has exceeded 10 GB. In BigFrames 2.0 and " + "later, you might need to manually set `allow_large_results=True` in " + "the IO method or adjust the BigFrames option: " + "`bigframes.options.bigquery.allow_large_results=True`.", + FutureWarning, + ) # Runs strict validations to ensure internal type predictions and ibis are completely in sync # Do not execute these validations outside of testing suite. 
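The executor changes below take their default from the session-wide option rather than a per-call flag; a one-line sketch of the global toggle, assuming the option exposes a plain setter (per-call `allow_large_results` arguments still take precedence):

```python
import bigframes.pandas as bpd

# Prefer the low-latency path by default; results over ~10 GB then need
# allow_large_results=True on the individual call (or this flipped back to True).
bpd.options.bigquery.allow_large_results = False
```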
- if "PYTEST_CURRENT_TEST" in os.environ and len(col_id_overrides) == 0: + if "PYTEST_CURRENT_TEST" in os.environ: self._validate_result_schema(array_value, iterator.schema) return ExecuteResult( arrow_batches=iterator_supplier, - schema=adjusted_schema, + schema=array_value.schema, query_job=query_job, total_bytes=size_bytes, total_rows=iterator.total_rows, @@ -296,7 +294,6 @@ def iterator_supplier(): def export_gbq( self, array_value: bigframes.core.ArrayValue, - col_id_overrides: Mapping[str, str], destination: bigquery.TableReference, if_exists: Literal["fail", "replace", "append"] = "fail", cluster_cols: Sequence[str] = [], @@ -312,7 +309,7 @@ def export_gbq( "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, "append": bigquery.WriteDisposition.WRITE_APPEND, } - sql = self.to_sql(array_value, ordered=False, col_id_overrides=col_id_overrides) + sql = self.to_sql(array_value, ordered=False) job_config = bigquery.QueryJobConfig( write_disposition=dispositions[if_exists], destination=destination, @@ -324,12 +321,23 @@ def export_gbq( sql=sql, job_config=job_config, ) + + has_timedelta_col = any( + t == bigframes.dtypes.TIMEDELTA_DTYPE for t in array_value.schema.dtypes + ) + + if if_exists != "append" and has_timedelta_col: + # Only update schema if this is not modifying an existing table, and the + # new table contains timedelta columns. + table = self.bqclient.get_table(destination) + table.schema = array_value.schema.to_bigquery() + self.bqclient.update_table(table, ["schema"]) + return query_job def export_gcs( self, array_value: bigframes.core.ArrayValue, - col_id_overrides: Mapping[str, str], uri: str, format: Literal["json", "csv", "parquet"], export_options: Mapping[str, Union[bool, str]], @@ -337,7 +345,7 @@ def export_gcs( query_job = self.execute( array_value, ordered=False, - col_id_overrides=col_id_overrides, + use_explicit_destination=True, ).query_job result_table = query_job.destination export_data_statement = bq_io.create_export_data_statement( @@ -368,6 +376,7 @@ def peek( self, array_value: bigframes.core.ArrayValue, n_rows: int, + use_explicit_destination: Optional[bool] = None, ) -> ExecuteResult: """ A 'peek' efficiently accesses a small number of rows in the dataframe. @@ -376,12 +385,24 @@ def peek( if not tree_properties.can_fast_peek(plan): msg = "Peeking this value cannot be done efficiently." warnings.warn(msg) + if use_explicit_destination is None: + use_explicit_destination = bigframes.options.bigquery.allow_large_results - sql = self.compiler.compile_peek(plan, n_rows) + job_config = bigquery.QueryJobConfig() + # Use explicit destination to avoid 10GB limit of temporary table + if use_explicit_destination: + destination_table = self.storage_manager.create_temp_table( + array_value.schema.to_bigquery(), cluster_cols=[] + ) + job_config.destination = destination_table + + sql = self.compiler.compile(plan, ordered=False, limit=n_rows) # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. 
- iterator, query_job = self._run_execute_query(sql=sql) + iterator, query_job = self._run_execute_query( + sql=sql, job_config=job_config, query_with_job=use_explicit_destination + ) return ExecuteResult( # Probably don't need read client for small peek results, but let client decide arrow_batches=lambda: iterator.to_arrow_iterable( @@ -416,7 +437,7 @@ def head( assert tree_properties.can_fast_head(plan) head_plan = generate_head_plan(plan, n_rows) - sql = self.compiler.compile_ordered(head_plan) + sql = self.compiler.compile(head_plan) # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. @@ -439,7 +460,7 @@ def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: row_count_plan = self.replace_cached_subtrees( generate_row_count_plan(array_value.node) ) - sql = self.compiler.compile_unordered(row_count_plan) + sql = self.compiler.compile(row_count_plan, ordered=False) iter, _ = self._run_execute_query(sql) return next(iter)[0] @@ -476,7 +497,8 @@ def _run_execute_query( api_name: Optional[str] = None, page_size: Optional[int] = None, max_results: Optional[int] = None, - ) -> Tuple[bq_table.RowIterator, bigquery.QueryJob]: + query_with_job: bool = True, + ) -> Tuple[bq_table.RowIterator, Optional[bigquery.QueryJob]]: """ Starts BigQuery query job and waits for results. """ @@ -494,7 +516,7 @@ def _run_execute_query( # as `add_and_trim_labels` ensures the label count does not exceed 64. bq_io.add_and_trim_labels(job_config, api_name=api_name) try: - return bq_io.start_query_with_client( + iterator, query_job = bq_io.start_query_with_client( self.bqclient, sql, job_config=job_config, @@ -502,7 +524,9 @@ def _run_execute_query( max_results=max_results, page_size=page_size, metrics=self.metrics, + query_with_job=query_with_job, ) + return iterator, query_job except google.api_core.exceptions.BadRequest as e: # Unfortunately, this error type does not have a separate error code or exception type @@ -549,8 +573,8 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): """Executes the query and uses the resulting table to rewrite future executions.""" offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") w_offsets, offset_column = array_value.promote_offsets() - sql = self.compiler.compile_unordered( - self.replace_cached_subtrees(w_offsets.node) + sql = self.compiler.compile( + self.replace_cached_subtrees(w_offsets.node), ordered=False ) tmp_table = self._sql_as_cached_temp_table( @@ -633,7 +657,7 @@ def _sql_as_cached_temp_table( job_config=job_config, api_name="cached", ) - query_job.destination + assert query_job is not None query_job.result() return query_job.destination @@ -642,24 +666,41 @@ def _validate_result_schema( array_value: bigframes.core.ArrayValue, bq_schema: list[bigquery.SchemaField], ): - actual_schema = tuple(bq_schema) + actual_schema = _sanitize(tuple(bq_schema)) ibis_schema = bigframes.core.compile.test_only_ibis_inferred_schema( self.replace_cached_subtrees(array_value.node) - ) - internal_schema = array_value.schema + ).to_bigquery() + internal_schema = _sanitize(array_value.schema.to_bigquery()) if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: return - if internal_schema.to_bigquery() != actual_schema: + if internal_schema != actual_schema: raise ValueError( - f"This error should only occur while testing. BigFrames internal schema: {internal_schema.to_bigquery()} does not match actual schema: {actual_schema}" + f"This error should only occur while testing. 
BigFrames internal schema: {internal_schema} does not match actual schema: {actual_schema}" ) - if ibis_schema.to_bigquery() != actual_schema: + + if ibis_schema != actual_schema: raise ValueError( - f"This error should only occur while testing. Ibis schema: {ibis_schema.to_bigquery()} does not match actual schema: {actual_schema}" + f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}" ) +def _sanitize( + schema: Tuple[bigquery.SchemaField, ...] +) -> Tuple[bigquery.SchemaField, ...]: + # Schema inferred from SQL strings and Ibis expressions contain only names, types and modes, + # so we disregard other fields (e.g timedelta description for timedelta columns) for validations. + return tuple( + bigquery.SchemaField( + f.name, + f.field_type, + f.mode, # type:ignore + fields=_sanitize(f.fields), + ) + for f in schema + ) + + def generate_head_plan(node: nodes.BigFrameNode, n: int): return nodes.SliceNode(node, start=None, stop=n) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 7204a14870..7c2586fe76 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -726,7 +726,7 @@ def _start_query( job_config.maximum_bytes_billed = ( bigframes.options.compute.maximum_bytes_billed ) - return bf_io_bigquery.start_query_with_client( + iterator, query_job = bf_io_bigquery.start_query_with_client( self._bqclient, sql, job_config=job_config, @@ -734,6 +734,8 @@ def _start_query( timeout=timeout, api_name=api_name, ) + assert query_job is not None + return iterator, query_job def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 33bcd7fbf5..1cb561693b 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -32,7 +32,11 @@ class ExecutionMetrics: execution_secs: float = 0 query_char_count: int = 0 - def count_job_stats(self, query_job: bq_job.QueryJob): + def count_job_stats(self, query_job: Optional[bq_job.QueryJob] = None): + if query_job is None: + self.execution_count += 1 + return + stats = get_performance_stats(query_job) if stats is not None: bytes_processed, slot_millis, execution_secs, query_char_count = stats diff --git a/bigframes/version.py b/bigframes/version.py index 762deda9ff..f743c7e94d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.38.0" +__version__ = "1.39.0" + +# {x-release-please-start-date} +__release_date__ = "2025-03-05" +# {x-release-please-end} diff --git a/notebooks/dataframes/index_col_null.ipynb b/notebooks/dataframes/index_col_null.ipynb index de373050fe..655745dd2b 100644 --- a/notebooks/dataframes/index_col_null.ipynb +++ b/notebooks/dataframes/index_col_null.ipynb @@ -38,23 +38,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "96757c59-fc22-420e-a42f-c6cb956110ec", "metadata": {}, "outputs": [], "source": [ - "import warnings\n", - "\n", "import bigframes.enums\n", "import bigframes.exceptions\n", "import bigframes.pandas as bpd\n", "\n", - "# Explicitly opt-in to the NULL index preview feature.\n", - "warnings.simplefilter(\n", - " \"ignore\",\n", - " bigframes.exceptions.NullIndexPreviewWarning,\n", - ")\n", - "\n", "df = bpd.read_gbq(\n", " \"bigquery-public-data.baseball.schedules\",\n", " index_col=bigframes.enums.DefaultIndexKind.NULL,\n", diff --git a/notebooks/dataframes/pypi.ipynb b/notebooks/dataframes/pypi.ipynb index a62bd45768..c901a1c654 100644 --- a/notebooks/dataframes/pypi.ipynb +++ b/notebooks/dataframes/pypi.ipynb @@ -27,12 +27,7 @@ "source": [ "# Analyzing package downloads from PyPI with BigQuery DataFrames\n", "\n", - "In this notebook, you'll use the [PyPI public dataset](https://console.cloud.google.com/marketplace/product/gcp-public-data-pypi/pypi) and the [deps.dev public dataset](https://deps.dev/) to visualize Python package downloads for a package and its dependencies.\n", - "\n", - "> **âš  Important**\n", - ">\n", - "> You'll use features that are currently in [preview](https://cloud.google.com/blog/products/gcp/google-cloud-gets-simplified-product-launch-stages): `ordering_mode=\"partial\"` and \"NULL\" indexes. There may be breaking changes to this functionality in future versions of the BigQuery DataFrames package.\n", - "\n" + "In this notebook, you'll use the [PyPI public dataset](https://console.cloud.google.com/marketplace/product/gcp-public-data-pypi/pypi) and the [deps.dev public dataset](https://deps.dev/) to visualize Python package downloads for a package and its dependencies." ] }, { @@ -59,27 +54,6 @@ "bpd.options.bigquery.ordering_mode = \"partial\"" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Filter out the relevant warnings for preview features used." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "import bigframes.exceptions\n", - "\n", - "warnings.simplefilter(\"ignore\", category=bigframes.exceptions.NullIndexPreviewWarning)\n", - "warnings.simplefilter(\"ignore\", category=bigframes.exceptions.OrderingModePartialPreviewWarning)" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb index ffd772e7b4..7060128bf6 100644 --- a/notebooks/geo/geoseries.ipynb +++ b/notebooks/geo/geoseries.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -97,21 +97,21 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "78 POINT (-95.84727 44.4092)\n", - "130 POINT (-94.90431 41.67918)\n", - "544 POINT (-95.85272 40.38739)\n", - "995 POINT (-101.83333 47.30715)\n", - "1036 POINT (-88.36343 37.20952)\n", + "137 POINT (-86.87338 38.37334)\n", + "164 POINT (-118.48037 46.25461)\n", + "333 POINT (-92.5617 32.30429)\n", + "703 POINT (-83.46189 39.55525)\n", + "846 POINT (-119.46779 47.21363)\n", "Name: int_point_geom, dtype: geometry" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -130,21 +130,21 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-95.84727 44.4092)\n", - "1 POINT (-94.90431 41.67918)\n", - "2 POINT (-95.85272 40.38739)\n", - "3 POINT (-101.83333 47.30715)\n", - "4 POINT (-88.36343 37.20952)\n", + "0 POINT (-86.87338 38.37334)\n", + "1 POINT (-118.48037 46.25461)\n", + "2 POINT (-92.5617 32.30429)\n", + "3 POINT (-83.46189 39.55525)\n", + "4 POINT (-119.46779 47.21363)\n", "dtype: geometry" ] }, - "execution_count": 6, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -179,21 +179,21 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 -95.847268\n", - "1 -94.904312\n", - "2 -95.852721\n", - "3 -101.833328\n", - "4 -88.363426\n", + "0 -86.873385\n", + "1 -118.48037\n", + "2 -92.5617\n", + "3 -83.461893\n", + "4 -119.467788\n", "dtype: Float64" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -211,21 +211,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 44.409195\n", - "1 41.679178\n", - "2 40.387389\n", - "3 47.307147\n", - "4 37.209517\n", + "0 38.373344\n", + "1 46.254606\n", + "2 32.30429\n", + "3 39.555246\n", + "4 47.213633\n", "dtype: Float64" ] }, - "execution_count": 8, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -250,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -284,7 +284,7 @@ 
"dtype: Float64" ] }, - "execution_count": 9, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -302,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -336,7 +336,7 @@ "dtype: Float64" ] }, - "execution_count": 10, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -361,21 +361,21 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "59 POLYGON ((-96.92479 43.43217, -96.92477 43.430...\n", - "132 POLYGON ((-91.95104 40.05078, -91.95105 40.050...\n", - "223 POLYGON ((-84.39719 40.78658, -84.39718 40.783...\n", - "328 POLYGON ((-91.80469 31.48623, -91.80469 31.486...\n", - "396 POLYGON ((-79.87705 40.03683, -79.87688 40.036...\n", + "78 POLYGON ((-95.97154 44.6306, -95.97919 44.6305...\n", + "130 POLYGON ((-95.0933 41.77694, -95.09331 41.7764...\n", + "544 POLYGON ((-96.0664 40.43618, -96.06639 40.4352...\n", + "995 POLYGON ((-101.83583 47.49547, -101.83665 47.4...\n", + "1036 POLYGON ((-88.42474 37.15094, -88.42526 37.149...\n", "Name: county_geom, dtype: geometry" ] }, - "execution_count": 11, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -394,21 +394,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POLYGON ((-96.92479 43.43217, -96.92477 43.430...\n", - "1 POLYGON ((-91.95104 40.05078, -91.95105 40.050...\n", - "2 POLYGON ((-84.39719 40.78658, -84.39718 40.783...\n", - "3 POLYGON ((-91.80469 31.48623, -91.80469 31.486...\n", - "4 POLYGON ((-79.87705 40.03683, -79.87688 40.036...\n", + "0 POLYGON ((-95.97154 44.6306, -95.97919 44.6305...\n", + "1 POLYGON ((-95.0933 41.77694, -95.09331 41.7764...\n", + "2 POLYGON ((-96.0664 40.43618, -96.06639 40.4352...\n", + "3 POLYGON ((-101.83583 47.49547, -101.83665 47.4...\n", + "4 POLYGON ((-88.42474 37.15094, -88.42526 37.149...\n", "dtype: geometry" ] }, - "execution_count": 12, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": { "tags": [ "raises-exception" @@ -442,14 +442,14 @@ "outputs": [ { "ename": "NotImplementedError", - "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.36.0", + "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", + "Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:67\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 69\u001b[0m )\n", - "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.36.0" + "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0" ] } ], @@ -466,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -475,21 +475,21 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 1493638545.448335\n", - "1 1321524759.411463\n", - "2 1052436575.522383\n", - "3 1937116615.360128\n", - "4 2065462414.544471\n", + "0 1865212769.084914\n", + "1 1146753653.723439\n", + "2 1059653048.84506\n", + "3 2873655557.502374\n", + "4 886267772.361455\n", "dtype: Float64" ] }, - "execution_count": 15, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -515,21 +515,21 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-95.84727 44.4092)\n", - "1 POINT (-94.90431 41.67918)\n", - "2 POINT (-95.85272 40.38739)\n", - "3 POINT (-101.83333 47.30715)\n", - "4 POINT (-88.36343 37.20952)\n", + "0 POINT (-86.87338 38.37334)\n", + "1 POINT (-118.48037 46.25461)\n", + "2 POINT (-92.5617 32.30429)\n", + "3 POINT (-83.46189 39.55525)\n", + "4 POINT (-119.46779 47.21363)\n", "dtype: geometry" ] }, - "execution_count": 16, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -554,21 +554,21 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT(-95.8472678 44.4091953)\n", - "1 POINT(-94.9043119 41.679178)\n", - "2 POINT(-95.8527214 40.3873891)\n", - "3 POINT(-101.8333279 47.3071473)\n", - "4 POINT(-88.3634261 37.2095174)\n", + "0 POINT(-86.8733845 38.3733441)\n", + "1 POINT(-118.4803697 46.2546057)\n", + "2 POINT(-92.5616997 32.3042901)\n", + "3 POINT(-83.4618927 39.5552462)\n", + "4 POINT(-119.467788 47.2136328)\n", "dtype: string" ] }, - "execution_count": 18, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -594,21 +594,21 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-95.84727 44.4092)\n", - "1 POINT (-94.90431 41.67918)\n", - "2 POINT (-95.85272 40.38739)\n", - "3 POINT (-101.83333 47.30715)\n", - "4 POINT (-88.36343 37.20952)\n", + "0 POINT (-86.87338 38.37334)\n", + "1 POINT (-118.48037 46.25461)\n", + "2 POINT (-92.5617 32.30429)\n", + "3 POINT (-83.46189 39.55525)\n", + "4 POINT (-119.46779 47.21363)\n", "dtype: geometry" ] }, - "execution_count": 19, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -617,6 +617,73 @@ "wkts_from_geo = bigframes.geopandas.GeoSeries.from_wkt(geo_to_wkts)\n", "wkts_from_geo" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Discover the set-theoretic boundary of geometry objects with `GeoSeries.boundary`" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POLYGON ((0 0, 1 1, 0 1, 0 0))\n", + "1 POLYGON ((10 0, 10 5, 0 0, 10 0))\n", + "2 POLYGON ((0 0, 2 2, 2 0, 0 0))\n", + "3 LINESTRING (0 0, 1 1, 0 1)\n", + "4 POINT (0 1)\n", + "dtype: geometry" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from shapely.geometry import Polygon, LineString, 
Point\n", + "geom_obj = bigframes.geopandas.GeoSeries(\n", + " [\n", + " Polygon([(0, 0), (1, 1), (0, 1)]),\n", + " Polygon([(10, 0), (10, 5), (0, 0)]),\n", + " Polygon([(0, 0), (2, 2), (2, 0)]),\n", + " LineString([(0, 0), (1, 1), (0, 1)]),\n", + " Point(0, 1),\n", + " ]\n", + ")\n", + "geom_obj" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 LINESTRING (0 0, 1 1, 0 1, 0 0)\n", + "1 LINESTRING (10 0, 10 5, 0 0, 10 0)\n", + "2 LINESTRING (0 0, 2 2, 2 0, 0 0)\n", + "3 MULTIPOINT (0 0, 0 1)\n", + "4 GEOMETRYCOLLECTION EMPTY\n", + "dtype: geometry" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom_obj.geo.boundary" + ] } ], "metadata": { diff --git a/notebooks/remote_functions/remote_function_usecases.ipynb b/notebooks/remote_functions/remote_function_usecases.ipynb index d4dde6e6b1..a053dd3fbb 100644 --- a/notebooks/remote_functions/remote_function_usecases.ipynb +++ b/notebooks/remote_functions/remote_function_usecases.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 2, "metadata": { "id": "Y6QAttCqqMM0" }, @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -66,17 +66,21 @@ }, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3550: UserWarning: Reading cached table from 2024-07-24 08:01:12.491984+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n" - ] + "data": { + "text/html": [ + "Query job 1f6094e9-1942-477c-9ce3-87a614d71294 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { "text/html": [ - "Query job 9d155f10-e37a-4d20-b2ff-02868ecb58f4 is DONE. 582.8 kB processed. Open Job" + "Query job ba19f29c-33d3-4f12-9605-ddeafb74918e is DONE. 582.8 kB processed. Open Job" ], "text/plain": [ "" @@ -88,7 +92,7 @@ { "data": { "text/html": [ - "Query job 5a524e70-12dc-4116-b416-04570bbf754e is DONE. 82.0 kB processed. Open Job" + "Query job dd1ff8be-700a-4ce5-91a0-31413f70cfad is DONE. 82.0 kB processed. 
Open Job" ], "text/plain": [ "" @@ -125,49 +129,49 @@ " \n", " \n", " \n", - " 36\n", - " Reds\n", - " Cubs\n", - " 159\n", + " 88\n", + " Royals\n", + " Athletics\n", + " 176\n", " \n", " \n", - " 358\n", + " 106\n", " Dodgers\n", - " Diamondbacks\n", - " 223\n", + " Giants\n", + " 216\n", " \n", " \n", - " 416\n", - " Yankees\n", - " White Sox\n", - " 216\n", + " 166\n", + " Phillies\n", + " Royals\n", + " 162\n", " \n", " \n", - " 523\n", - " Rays\n", - " Athletics\n", - " 187\n", + " 247\n", + " Rangers\n", + " Royals\n", + " 161\n", " \n", " \n", - " 594\n", - " Pirates\n", - " Brewers\n", - " 169\n", + " 374\n", + " Athletics\n", + " Astros\n", + " 161\n", " \n", " \n", "\n", "" ], "text/plain": [ - " homeTeamName awayTeamName duration_minutes\n", - "36 Reds Cubs 159\n", - "358 Dodgers Diamondbacks 223\n", - "416 Yankees White Sox 216\n", - "523 Rays Athletics 187\n", - "594 Pirates Brewers 169" + " homeTeamName awayTeamName duration_minutes\n", + "88 Royals Athletics 176\n", + "106 Dodgers Giants 216\n", + "166 Phillies Royals 162\n", + "247 Rangers Royals 161\n", + "374 Athletics Astros 161" ] }, - "execution_count": 22, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -216,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -226,10 +230,18 @@ "outputId": "19351206-116e-4da2-8ff0-f288b7745b27" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/functions/_function_session.py:335: UserWarning: You have not explicitly set a user-managed cloud_function_service_account. Using the default compute service account, {cloud_function_service_account}. To use Bigframes 2.0, please set an explicit user-managed cloud_function_service_account or set cloud_function_service_account explicitly to `default`.See, https://cloud.google.com/functions/docs/securing/function-identity.\n", + " warnings.warn(msg, category=UserWarning)\n" + ] + }, { "data": { "text/html": [ - "Query job ec8d958d-93ef-45ae-8150-6ccfa8feb89a is DONE. 0 Bytes processed. Open Job" + "Query job 7c021760-59c4-4f3a-846c-9693a4d16eef is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -242,7 +254,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-e22dbecc9ec0374bda36bc23df3775b0-g8zp' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_e22dbecc9ec0374bda36bc23df3775b0_g8zp'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-sessionca6012-ca541a90249f8b62951f38b7aba6a711-49to' and BQ remote function 'bigframes-dev._ed1e4d0f7d41174ba506d34d15dccf040d13f69e.bigframes_sessionca6012_ca541a90249f8b62951f38b7aba6a711_49to'.\n" ] } ], @@ -1430,7 +1442,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.19" } }, "nbformat": 4, diff --git a/noxfile.py b/noxfile.py index bffb6ebaa0..ca147e171d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -990,7 +990,7 @@ def cleanup(session): # project within the "Number of functions" quota # https://cloud.google.com/functions/quotas#resource_limits recency_cutoff_hours = 12 - cleanup_count_per_location = 20 + cleanup_count_per_location = 40 cleanup_options.extend( [ f"--recency-cutoff={recency_cutoff_hours}", diff --git a/owlbot.py b/owlbot.py index 10fc47ebd7..0e40cae1ad 100644 --- a/owlbot.py +++ b/owlbot.py @@ -50,6 +50,7 @@ "README.rst", "CONTRIBUTING.rst", ".github/release-trigger.yml", + ".github/release-please.yml", # BigQuery DataFrames manages its own Kokoro cluster for presubmit & continuous tests. ".kokoro/build.sh", ".kokoro/continuous/common.cfg", diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py index 452d88746d..e4ace53a5c 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -78,8 +78,23 @@ def test_linear_regression(random_model_id: str) -> None: # 332 4740.7907 Gentoo penguin (Pygoscelis papua) Biscoe 46.2 14.4 214.0 4650.0 # 160 4731.310452 Gentoo penguin (Pygoscelis papua) Biscoe 44.5 14.3 216.0 4100.0 # [END bigquery_dataframes_bqml_linear_predict] + # [START bigquery_dataframes_bqml_linear_predict_explain] + # Use 'predict_explain' function to understand why the model is generating these prediction results. + # 'predict_explain'is an extended version of the 'predict' function that not only outputs prediction results, but also outputs additional columns to explain the prediction results. + # Using the trained model and utilizing data specific to Biscoe Island, explain the predictions of the top 3 features + explained = model.predict_explain(biscoe_data, top_k_features=3) + + # Expected results: + # predicted_body_mass_g top_feature_attributions baseline_prediction_value prediction_value approximation_error species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex + # 0 5413.510134 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 5413.510134 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 16.4 223.0 5950.0 MALE + # 1 4768.351092 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 4768.351092 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 14.5 213.0 4400.0 FEMALE + # 2 3235.896372 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 3235.896372 0.0 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.7 16.0 183.0 3075.0 FEMALE + # 3 5349.603734 [{'feature': 'island', 'attribution': 7348.877... 
-5320.222128 5349.603734 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 15.6 221.0 5000.0 MALE + # 4 4637.165037 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 4637.165037 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 13.2 211.0 4500.0 FEMALE + # [END bigquery_dataframes_bqml_linear_predict_explain] assert feature_columns is not None assert label_columns is not None assert model is not None assert score is not None assert result is not None + assert explained is not None diff --git a/setup.py b/setup.py index 1f6114b634..9ea563b3cb 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ "numpy >=1.24.0", "pandas >=1.5.3", "pandas-gbq >=0.26.0", - "pyarrow >=10.0.1", + "pyarrow >=15.0.2", "pydata-google-auth >=1.8.2", "requests >=2.27.1", "sqlglot >=23.6.3", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 30d5c1c3a7..b355e0915b 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -16,7 +16,7 @@ jellyfish==0.8.9 numpy==1.24.0 pandas==1.5.3 pandas-gbq==0.26.0 -pyarrow==10.0.1 +pyarrow==15.0.2 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 diff --git a/tests/data/json.jsonl b/tests/data/json.jsonl index fbf0593612..1abdcc9d56 100644 --- a/tests/data/json.jsonl +++ b/tests/data/json.jsonl @@ -6,10 +6,10 @@ {"rowindex": 5, "json_col": []} {"rowindex": 6, "json_col": [1, 2, 3]} {"rowindex": 7, "json_col": [{"a": 1}, {"a": 2}, {"a": null}, {}]} -{"rowindex": 8, "json_col": {"bool_value": true}} +{"rowindex": 8, "json_col": "100"} {"rowindex": 9, "json_col": {"folat_num": 3.14159}} {"rowindex": 10, "json_col": {"date": "2024-07-16"}} -{"rowindex": 11, "json_col": {"null_filed": null}} +{"rowindex": 11, "json_col": 100} {"rowindex": 12, "json_col": {"int_value": 2, "null_filed": null}} {"rowindex": 13, "json_col": {"list_data": [10, 20, 30]}} {"rowindex": 14, "json_col": {"person": {"name": "Alice", "age": 35}}} diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py new file mode 100644 index 0000000000..4db7a1c47c --- /dev/null +++ b/tests/system/large/functions/test_managed_function.py @@ -0,0 +1,166 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
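The new test module starting here covers the managed-function path, where a Python UDF is deployed as a BigQuery routine through `session.udf` rather than through a Cloud Function. A condensed sketch of the pattern the tests rely on; the project, dataset, table, and column names are placeholders, and the experimental flag mirrors what the tests set at module level:

import bigframes
import bigframes.pandas as bpd

# Managed functions are gated behind this experimental flag in the tests.
bpd.options.experiments.udf = True

# Placeholder project; any session exposes the `udf` decorator used below.
session = bigframes.Session(
    context=bigframes.BigQueryOptions(project="my-project")
)

# Deploy a simple managed function into a placeholder dataset.
@session.udf(input_types=[int, int], output_type=int, dataset="my_dataset")
def multiply(x, y):
    return x * y

# Apply it by combining two integer columns, as the tests do.
df = session.read_gbq("my-project.my_dataset.my_table")  # placeholder table
result = df["col_a"].combine(df["col_b"], multiply).to_pandas()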
+ +import pandas +import pytest + +from bigframes.functions import _function_session as bff_session +from bigframes.functions._utils import get_python_version +import bigframes.pandas as bpd +from tests.system.utils import cleanup_function_assets + +bpd.options.experiments.udf = True + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_multiply_with_ibis( + session, + scalars_table_id, + bigquery_client, + ibis_client, + dataset_id, +): + + try: + + @session.udf( + input_types=[int, int], + output_type=int, + dataset=dataset_id, + ) + def multiply(x, y): + return x * y + + _, dataset_name, table_name = scalars_table_id.split(".") + if not ibis_client.dataset: + ibis_client.dataset = dataset_name + + col_name = "int64_col" + table = ibis_client.tables[table_name] + table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) + sql = table.compile() + pandas_df_orig = bigquery_client.query(sql).to_dataframe() + + col = table[col_name] + col_2x = multiply(col, 2).name("int64_col_2x") + col_square = multiply(col, col).name("int64_col_square") + table = table.mutate([col_2x, col_square]) + sql = table.compile() + pandas_df_new = bigquery_client.query(sql).to_dataframe() + + pandas.testing.assert_series_equal( + pandas_df_orig[col_name] * 2, + pandas_df_new["int64_col_2x"], + check_names=False, + ) + + pandas.testing.assert_series_equal( + pandas_df_orig[col_name] * pandas_df_orig[col_name], + pandas_df_new["int64_col_square"], + check_names=False, + ) + finally: + # clean up the gcp assets created for the managed function. + cleanup_function_assets(multiply, bigquery_client) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_stringify_with_ibis( + session, + scalars_table_id, + bigquery_client, + ibis_client, + dataset_id, +): + try: + + @session.udf( + input_types=[int], + output_type=str, + dataset=dataset_id, + ) + def stringify(x): + return f"I got {x}" + + # Function should work locally. + assert stringify(8912) == "I got 8912" + + _, dataset_name, table_name = scalars_table_id.split(".") + if not ibis_client.dataset: + ibis_client.dataset = dataset_name + + col_name = "int64_col" + table = ibis_client.tables[table_name] + table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) + sql = table.compile() + pandas_df_orig = bigquery_client.query(sql).to_dataframe() + + col = table[col_name] + col_2x = stringify.ibis_node(col).name("int64_str_col") + table = table.mutate([col_2x]) + sql = table.compile() + pandas_df_new = bigquery_client.query(sql).to_dataframe() + + pandas.testing.assert_series_equal( + pandas_df_orig[col_name].apply(lambda x: f"I got {x}"), + pandas_df_new["int64_str_col"], + check_names=False, + ) + finally: + # clean up the gcp assets created for the managed function. 
+ cleanup_function_assets( + bigquery_client, session.cloudfunctionsclient, stringify + ) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_binop(session, scalars_dfs, dataset_id): + try: + + def func(x, y): + return x * abs(y % 4) + + managed_func = session.udf( + input_types=[str, int], + output_type=str, + dataset=dataset_id, + )(func) + + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_df = scalars_df.dropna() + scalars_pandas_df = scalars_pandas_df.dropna() + pd_result = scalars_pandas_df["string_col"].combine( + scalars_pandas_df["int64_col"], func + ) + bf_result = ( + scalars_df["string_col"] + .combine(scalars_df["int64_col"], managed_func) + .to_pandas() + ) + pandas.testing.assert_series_equal(bf_result, pd_result) + finally: + # clean up the gcp assets created for the managed function. + cleanup_function_assets( + session.bqclient, session.cloudfunctionsclient, managed_func + ) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 7363e370bb..350eae3783 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -21,6 +21,7 @@ import sys import tempfile import textwrap +import warnings import google.api_core.exceptions from google.cloud import bigquery, functions_v2, storage @@ -38,6 +39,7 @@ import bigframes.series from tests.system.utils import ( assert_pandas_df_equal, + cleanup_function_assets, delete_cloud_function, get_cloud_functions, ) @@ -54,30 +56,6 @@ ) -def cleanup_remote_function_assets( - bigquery_client, cloudfunctions_client, remote_udf, ignore_failures=True -): - """Clean up the GCP assets behind a bigframes remote function.""" - - # Clean up BQ remote function - try: - bigquery_client.delete_routine(remote_udf.bigframes_remote_function) - except Exception: - # By default don't raise exception in cleanup - if not ignore_failures: - raise - - # Clean up cloud function - try: - delete_cloud_function( - cloudfunctions_client, remote_udf.bigframes_cloud_function - ) - except Exception: - # By default don't raise exception in cleanup - if not ignore_failures: - raise - - def make_uniq_udf(udf): """Transform a udf to another with same behavior but a unique name. 
Use this to test remote functions with reuse=True, in which case parallel @@ -177,9 +155,7 @@ def multiply(x, y): ) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - bigquery_client, session.cloudfunctionsclient, multiply - ) + cleanup_function_assets(multiply, bigquery_client, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -229,8 +205,8 @@ def stringify(x): ) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - bigquery_client, session.cloudfunctionsclient, stringify + cleanup_function_assets( + stringify, bigquery_client, session.cloudfunctionsclient ) @@ -264,8 +240,8 @@ def func(x, y): pandas.testing.assert_series_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_func + cleanup_function_assets( + remote_func, session.bqclient, session.cloudfunctionsclient ) @@ -301,8 +277,8 @@ def func(x, y): pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_func + cleanup_function_assets( + remote_func, session.bqclient, session.cloudfunctionsclient ) @@ -346,9 +322,7 @@ def square(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square - ) + cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -392,8 +366,8 @@ def add_one(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_add_one + cleanup_function_assets( + remote_add_one, session.bqclient, session.cloudfunctionsclient ) @@ -422,8 +396,8 @@ def add_one(x): pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_add_one + cleanup_function_assets( + remote_add_one, session.bqclient, session.cloudfunctionsclient ) @@ -470,9 +444,7 @@ def square(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square - ) + cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -523,8 +495,8 @@ def sign(num): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_sign + cleanup_function_assets( + remote_sign, session.bqclient, session.cloudfunctionsclient ) @@ -570,8 +542,8 @@ def circumference(radius): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_circumference + cleanup_function_assets( + remote_circumference, session.bqclient, session.cloudfunctionsclient ) @@ -619,8 
+591,8 @@ def find_team(num): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_find_team + cleanup_function_assets( + remote_find_team, session.bqclient, session.cloudfunctionsclient ) @@ -756,8 +728,8 @@ def inner_test(): shutil.rmtree(add_one_uniq_dir) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, remote_add_one + cleanup_function_assets( + remote_add_one, session.bqclient, session.cloudfunctionsclient ) @@ -796,8 +768,8 @@ def is_odd(num): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, is_odd_remote + cleanup_function_assets( + is_odd_remote, session.bqclient, session.cloudfunctionsclient ) @@ -839,8 +811,8 @@ def is_odd(num): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, is_odd_remote + cleanup_function_assets( + is_odd_remote, session.bqclient, session.cloudfunctionsclient ) @@ -881,8 +853,8 @@ def test_remote_udf_lambda(session, scalars_dfs, dataset_id, bq_cf_connection): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, add_one_lambda_remote + cleanup_function_assets( + add_one_lambda_remote, session.bqclient, session.cloudfunctionsclient ) @@ -938,8 +910,8 @@ def square(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote + cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient ) @@ -983,8 +955,8 @@ def pd_np_foo(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, pd_np_foo_remote + cleanup_function_assets( + pd_np_foo_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1136,14 +1108,14 @@ def plusone(x): test_internal(plusone_remote, plusone) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote1 + cleanup_function_assets( + square_remote1, session.bqclient, session.cloudfunctionsclient ) - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote2 + cleanup_function_assets( + square_remote2, session.bqclient, session.cloudfunctionsclient ) - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, plusone_remote + cleanup_function_assets( + plusone_remote, session.bqclient, session.cloudfunctionsclient ) for dir_ in dirs_to_cleanup: shutil.rmtree(dir_) @@ -1196,9 +1168,7 @@ def square(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square - ) + cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) 
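Throughout this test module the locally defined `cleanup_remote_function_assets` (removed above) is replaced by a shared `cleanup_function_assets` imported from `tests.system.utils`, which takes the function first and makes the Cloud Functions client optional so the same helper also covers managed functions, which have no backing Cloud Function. Its definition is not part of this diff; the following is only a rough sketch of what such a helper might look like, assuming it mirrors the removed code, keeps the attribute names used there, and reuses the existing `delete_cloud_function` helper from the same module:

def cleanup_function_assets(
    bigframes_func, bigquery_client, cloudfunctions_client=None, ignore_failures=True
):
    """Clean up the GCP assets behind a bigframes remote or managed function."""
    # Delete the BigQuery routine backing the function.
    try:
        bigquery_client.delete_routine(bigframes_func.bigframes_remote_function)
    except Exception:
        if not ignore_failures:
            raise

    # Only remote functions have a backing Cloud Function, so this client is optional.
    if cloudfunctions_client is not None:
        try:
            delete_cloud_function(
                cloudfunctions_client, bigframes_func.bigframes_cloud_function
            )
        except Exception:
            if not ignore_failures:
                raise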
@pytest.mark.flaky(retries=2, delay=120) @@ -1233,9 +1203,7 @@ def square(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square - ) + cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -1256,9 +1224,7 @@ def square(x): scalars_df["int64_col"].apply(square).to_pandas() finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square - ) + cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -1301,9 +1267,7 @@ def square(x): assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square - ) + cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -1354,8 +1318,18 @@ def square_num(x): assert gcf.service_config.service_account_email == gcf_service_account finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - rf_session.bqclient, rf_session.cloudfunctionsclient, square_num + cleanup_function_assets( + square_num, rf_session.bqclient, rf_session.cloudfunctionsclient + ) + + +def test_remote_function_warns_default_cloud_function_service_account(): + project = "bigframes-dev-perf" + rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) + + with pytest.warns(FutureWarning, match="You have not explicitly set a"): + rf_session.remote_function( + cloud_function_service_account=None, # Explicitly omit service account. 
) @@ -1419,8 +1393,8 @@ def square_num(x): finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_num + cleanup_function_assets( + square_num, session.bqclient, session.cloudfunctionsclient ) @@ -1478,8 +1452,8 @@ def square_num(x): assert gcf.service_config.vpc_connector == gcf_vpc_connector finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - rf_session.bqclient, rf_session.cloudfunctionsclient, square_num_remote + cleanup_function_assets( + square_num_remote, rf_session.bqclient, rf_session.cloudfunctionsclient ) @@ -1514,8 +1488,8 @@ def square(x): pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote + cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1554,8 +1528,8 @@ def square(x): pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote + cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1603,8 +1577,8 @@ def square(x): pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote + cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1655,8 +1629,8 @@ def serialize_row(row): pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, serialize_row_remote + cleanup_function_assets( + serialize_row_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1696,8 +1670,8 @@ def analyze(row): pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, analyze_remote + cleanup_function_assets( + analyze_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1816,8 +1790,8 @@ def serialize_row(row): ) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, serialize_row_remote + cleanup_function_assets( + serialize_row_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1879,8 +1853,8 @@ def float_parser(row): pandas.testing.assert_series_equal(bq_result, bf_result) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, float_parser_remote + cleanup_function_assets( + float_parser_remote, session.bqclient, session.cloudfunctionsclient ) @@ -1920,8 +1894,8 @@ def square(x: int) -> int: pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote + 
cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient ) @@ -2016,9 +1990,7 @@ def foo(x: int) -> int: assert gcf.state is functions_v2.Function.State.ACTIVE finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, foo - ) + cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -2078,8 +2050,8 @@ def foo_named(x: int) -> int: assert gcf.state is functions_v2.Function.State.ACTIVE finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, foo_named + cleanup_function_assets( + foo_named, session.bqclient, session.cloudfunctionsclient ) @@ -2155,9 +2127,7 @@ def foo(x, y, z): ) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, foo - ) + cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) def test_df_apply_axis_1_multiple_params_array_output(session): @@ -2243,9 +2213,7 @@ def foo(x, y, z): ) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, foo - ) + cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) def test_df_apply_axis_1_single_param_non_series(session): @@ -2307,9 +2275,7 @@ def foo(x): ) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, foo - ) + cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) @pytest.mark.flaky(retries=2, delay=120) @@ -2347,46 +2313,70 @@ def generate_stats(row: pandas.Series) -> list[int]: pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, generate_stats + cleanup_function_assets( + generate_stats, session.bqclient, session.cloudfunctionsclient ) @pytest.mark.parametrize( - ("ingress_settings_args", "effective_ingress_settings"), + ("ingress_settings_args", "effective_ingress_settings", "expected_warning"), [ pytest.param( - {}, functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, id="no-set" + {}, + functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, + FutureWarning, + id="no-set", + ), + pytest.param( + {"cloud_function_ingress_settings": None}, + functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, + FutureWarning, + id="set-none", ), pytest.param( {"cloud_function_ingress_settings": "all"}, functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL, + None, id="set-all", ), pytest.param( {"cloud_function_ingress_settings": "internal-only"}, functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_ONLY, + None, id="set-internal-only", ), pytest.param( {"cloud_function_ingress_settings": "internal-and-gclb"}, functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_AND_GCLB, + None, id="set-internal-and-gclb", ), ], ) @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_ingress_settings( - session, scalars_dfs, ingress_settings_args, effective_ingress_settings + session, + scalars_dfs, + ingress_settings_args, + effective_ingress_settings, + expected_warning, ): try: + # Verify the function raises the expected security 
warning message. + with warnings.catch_warnings(record=True) as w: - def square(x: int) -> int: - return x * x + def square(x: int) -> int: + return x * x - square_remote = session.remote_function(reuse=False, **ingress_settings_args)( - square - ) + square_remote = session.remote_function( + reuse=False, **ingress_settings_args + )(square) + + if expected_warning is not None: + assert issubclass(w[0].category, FutureWarning) + assert "Consider using 'internal-only' for enhanced security." in str( + w[0].message + ) # Assert that the GCF is created with the intended maximum timeout gcf = session.cloudfunctionsclient.get_function( @@ -2402,8 +2392,8 @@ def square(x: int) -> int: pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, square_remote + cleanup_function_assets( + square_remote, session.bqclient, session.cloudfunctionsclient ) @@ -2488,8 +2478,8 @@ def add_one(x: int) -> int: # clean up the gcp assets created for the temporary remote function, # just in case it was not explicitly cleaned up in the try clause due # to assertion failure or exception earlier than that - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, add_one_remote_temp + cleanup_function_assets( + add_one_remote_temp, session.bqclient, session.cloudfunctionsclient ) @@ -2565,8 +2555,8 @@ def add_one(x: int) -> int: ) finally: # clean up the gcp assets created for the persistent remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, add_one_remote_persist + cleanup_function_assets( + add_one_remote_persist, session.bqclient, session.cloudfunctionsclient ) @@ -2612,8 +2602,8 @@ def featurize(x: int) -> list[array_dtype]: # type: ignore pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, featurize + cleanup_function_assets( + featurize, session.bqclient, session.cloudfunctionsclient ) @@ -2651,10 +2641,10 @@ def featurize(x: float) -> list[float]: # type: ignore pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( + cleanup_function_assets( + featurize, unordered_session.bqclient, unordered_session.cloudfunctionsclient, - featurize, ) @@ -2687,6 +2677,6 @@ def featurize(x: int) -> list[float]: pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function - cleanup_remote_function_assets( - session.bqclient, session.cloudfunctionsclient, featurize + cleanup_function_assets( + featurize, session.bqclient, session.cloudfunctionsclient ) diff --git a/tests/system/large/test_dataframe.py b/tests/system/large/test_dataframe.py index 20d383463a..396f2eb436 100644 --- a/tests/system/large/test_dataframe.py +++ b/tests/system/large/test_dataframe.py @@ -9,7 +9,7 @@ # See: https://github.com/python/cpython/issues/112282 reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", ) -def test_corr_w_numeric_only(scalars_df_numeric_150_columns_maybe_ordered): +def test_corr_150_columns(scalars_df_numeric_150_columns_maybe_ordered): scalars_df, scalars_pandas_df = 
scalars_df_numeric_150_columns_maybe_ordered bf_result = scalars_df.corr(numeric_only=True).to_pandas() pd_result = scalars_pandas_df.corr(numeric_only=True) @@ -28,7 +28,7 @@ def test_corr_w_numeric_only(scalars_df_numeric_150_columns_maybe_ordered): # See: https://github.com/python/cpython/issues/112282 reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", ) -def test_cov_w_numeric_only(scalars_df_numeric_150_columns_maybe_ordered): +def test_cov_150_columns(scalars_df_numeric_150_columns_maybe_ordered): scalars_df, scalars_pandas_df = scalars_df_numeric_150_columns_maybe_ordered bf_result = scalars_df.cov(numeric_only=True).to_pandas() pd_result = scalars_pandas_df.cov(numeric_only=True) diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py new file mode 100644 index 0000000000..c055babce6 --- /dev/null +++ b/tests/system/large/test_dataframe_io.py @@ -0,0 +1,59 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import warnings + +import google.api_core.exceptions +import pytest + +import bigframes + +WIKIPEDIA_TABLE = "bigquery-public-data.samples.wikipedia" +LARGE_TABLE_OPTION = "bigquery.allow_large_results" + + +def test_to_pandas_batches_raise_when_large_result_not_allowed(session): + with bigframes.option_context(LARGE_TABLE_OPTION, False), pytest.raises( + google.api_core.exceptions.Forbidden + ): + df = session.read_gbq(WIKIPEDIA_TABLE) + next(df.to_pandas_batches(page_size=500, max_results=1500)) + + +def test_to_pandas_batches_override_global_option( + session, +): + with bigframes.option_context(LARGE_TABLE_OPTION, False): + df = session.read_gbq(WIKIPEDIA_TABLE) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + next( + df.to_pandas_batches( + page_size=500, max_results=1500, allow_large_results=True + ) + ) + assert len(w) == 2 + assert issubclass(w[0].category, FutureWarning) + assert str(w[0].message).startswith( + "The query result size has exceeded 10 GB." + ) + + +def test_to_pandas_raise_when_large_result_not_allowed(session): + with bigframes.option_context(LARGE_TABLE_OPTION, False), pytest.raises( + google.api_core.exceptions.Forbidden + ): + df = session.read_gbq(WIKIPEDIA_TABLE) + next(df.to_pandas()) diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index 3521e4cd20..0b4a7afe2b 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -13,6 +13,7 @@ # limitations under the License. 
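The new `tests/system/large/test_dataframe_io.py` above pins down the `allow_large_results` behavior: with the `bigquery.allow_large_results` option turned off, materializing a large result raises `Forbidden`, while a per-call `allow_large_results=True` override still works (and emits a `FutureWarning` once the result exceeds 10 GB). A compact usage sketch distilled from those tests, reading the same public Wikipedia sample table:

import bigframes
import bigframes.pandas as bpd

# Disable large-result jobs globally for lower latency on small results...
with bigframes.option_context("bigquery.allow_large_results", False):
    df = bpd.read_gbq("bigquery-public-data.samples.wikipedia")

    # ...but opt back in for this one call, which can exceed the 10 GB limit.
    batches = df.to_pandas_batches(
        page_size=500, max_results=1500, allow_large_results=True
    )
    first_page = next(batches)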
import typing +import warnings from google.cloud import bigquery import pytest @@ -118,12 +119,22 @@ def test_bq_location_non_canonical(set_location, resolved_location): sorted(bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS), ) def test_bq_rep_endpoints(bigquery_location): - session = bigframes.Session( - context=bigframes.BigQueryOptions( - location=bigquery_location, use_regional_endpoints=True + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + session = bigframes.Session( + context=bigframes.BigQueryOptions( + location=bigquery_location, use_regional_endpoints=True + ) + ) + assert ( + len([warn for warn in record if isinstance(warn.message, FutureWarning)]) + == 0 ) - ) + # Verify that location and endpoints are correctly set for the BigQuery API + # client + # TODO(shobs): Figure out if the same can be verified for the other API + # clients. assert session.bqclient.location == bigquery_location assert ( session.bqclient._connection.API_BASE_URL @@ -147,10 +158,21 @@ def test_bq_lep_endpoints(bigquery_location): # allowlisted for LEP access. We could hardcode one project which is # allowlisted but then not every open source developer will have access to # that. Let's rely on just creating the clients for LEP. - clients_provider = bigframes.session.clients.ClientsProvider( - location=bigquery_location, use_regional_endpoints=True - ) + with pytest.warns(FutureWarning) as record: + clients_provider = bigframes.session.clients.ClientsProvider( + location=bigquery_location, use_regional_endpoints=True + ) + assert len(record) == 1 + assert typing.cast(Warning, record[0].message).args[ + 0 + ] == bigframes.constants.LEP_DEPRECATION_WARNING_MESSAGE.format( + location=bigquery_location + ) + # Verify that location and endpoints are correctly set for the BigQuery API + # client + # TODO(shobs): Figure out if the same can be verified for the other API + # clients. 
assert clients_provider.bqclient.location == bigquery_location assert ( clients_provider.bqclient._connection.API_BASE_URL diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index 8ecb36ecc9..ca068afe46 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -66,3 +66,31 @@ def test_blob_create_from_glob_path(bq_connection: str, session: bigframes.Sessi pd.testing.assert_frame_equal( pd_blob_df, expected_df, check_dtype=False, check_index_type=False ) + + +def test_blob_create_read_gbq_object_table( + bq_connection: str, session: bigframes.Session +): + bigframes.options.experiments.blob = True + + obj_table = session._create_object_table( + "gs://bigframes_blob_test/images/*", bq_connection + ) + + blob_df = session.read_gbq_object_table(obj_table, name="blob_col") + pd_blob_df = blob_df["blob_col"].struct.explode().to_pandas() + expected_df = pd.DataFrame( + { + "uri": [ + "gs://bigframes_blob_test/images/img0.jpg", + "gs://bigframes_blob_test/images/img1.jpg", + ], + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + + pd.testing.assert_frame_equal( + pd_blob_df, expected_df, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/functions/test_managed_function.py b/tests/system/small/functions/test_managed_function.py new file mode 100644 index 0000000000..41a5785d01 --- /dev/null +++ b/tests/system/small/functions/test_managed_function.py @@ -0,0 +1,199 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import google.api_core.exceptions +import pandas as pd +import pytest + +import bigframes.exceptions +from bigframes.functions import _function_session as bff_session +from bigframes.functions._utils import get_python_version +from bigframes.pandas import udf +import bigframes.pandas as bpd +import bigframes.series +from tests.system.utils import assert_pandas_df_equal, get_function_name + +bpd.options.experiments.udf = True + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +@pytest.mark.parametrize( + ("typ",), + [ + pytest.param(int), + pytest.param(float), + pytest.param(bool), + pytest.param(str), + pytest.param(bytes), + ], +) +def test_managed_function_series_apply( + typ, + scalars_dfs, + dataset_id_permanent, +): + def foo(x): + # The bytes() constructor expects a non-negative interger as its arg. + return typ(abs(x)) + + foo = udf( + input_types=int, + output_type=typ, + dataset=dataset_id_permanent, + name=get_function_name(foo), + )(foo) + + # Function should still work normally. 
+ assert foo(-2) == typ(2) + + assert hasattr(foo, "bigframes_bigquery_function") + assert hasattr(foo, "ibis_node") + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result_col = scalars_df["int64_too"].apply(foo) + bf_result = ( + scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_result_col = scalars_pandas_df["int64_too"].apply(foo) + pd_result = scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_series_combine(dataset_id_permanent, scalars_dfs): + # This function is deliberately written to not work with NA input. + def add(x: int, y: int) -> int: + return x + y + + scalars_df, scalars_pandas_df = scalars_dfs + int_col_name_with_nulls = "int64_col" + int_col_name_no_nulls = "int64_too" + bf_df = scalars_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + pd_df = scalars_pandas_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + + # make sure there are NA values in the test column. + assert any([pd.isna(val) for val in bf_df[int_col_name_with_nulls]]) + + add_managed_func = udf( + dataset=dataset_id_permanent, + name=get_function_name(add), + )(add) + + # with nulls in the series the managed function application would fail. + with pytest.raises( + google.api_core.exceptions.BadRequest, match="unsupported operand" + ): + bf_df[int_col_name_with_nulls].combine( + bf_df[int_col_name_no_nulls], add_managed_func + ).to_pandas() + + # after filtering out nulls the managed function application should work + # similar to pandas. + pd_filter = pd_df[int_col_name_with_nulls].notnull() + pd_result = pd_df[pd_filter][int_col_name_with_nulls].combine( + pd_df[pd_filter][int_col_name_no_nulls], add + ) + bf_filter = bf_df[int_col_name_with_nulls].notnull() + bf_result = ( + bf_df[bf_filter][int_col_name_with_nulls] + .combine(bf_df[bf_filter][int_col_name_no_nulls], add_managed_func) + .to_pandas() + ) + + # ignore any dtype difference. + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_dataframe_map(scalars_dfs, dataset_id_permanent): + def add_one(x): + return x + 1 + + mf_add_one = udf( + input_types=[int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), + )(add_one) + + scalars_df, scalars_pandas_df = scalars_dfs + int64_cols = ["int64_col", "int64_too"] + + bf_int64_df = scalars_df[int64_cols] + bf_int64_df_filtered = bf_int64_df.dropna() + bf_result = bf_int64_df_filtered.map(mf_add_one).to_pandas() + + pd_int64_df = scalars_pandas_df[int64_cols] + pd_int64_df_filtered = pd_int64_df.dropna() + pd_result = pd_int64_df_filtered.map(add_one) + # TODO(shobs): Figure why pandas .map() changes the dtype, i.e. + # pd_int64_df_filtered.dtype is Int64Dtype() + # pd_int64_df_filtered.map(lambda x: x).dtype is int64. + # For this test let's force the pandas dtype to be same as input. 
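To make the managed-function tests easier to follow, here is a compact sketch of the same flow: decorate a plain Python function with `udf`, then apply it to a Series. It assumes an active BigQuery session and a dataset you can write routines to; "my_dataset" and the function name are placeholders (the tests derive the name from a hash of the function instead).

import bigframes.pandas as bpd

bpd.options.experiments.udf = True  # managed functions are still behind this preview flag


@bpd.udf(
    input_types=[int],
    output_type=int,
    dataset="my_dataset",             # placeholder permanent dataset
    name="bigframes_example_double",  # placeholder; the tests use a hash-based name for reuse
)
def double_it(x):
    return x * 2


s = bpd.Series([1, 2, 3])
print(s.apply(double_it).to_pandas())  # 2, 4, 6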
+ for col in pd_result: + pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.skipif( + get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, + reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", +) +def test_managed_function_dataframe_apply_axis_1( + session, scalars_dfs, dataset_id_permanent +): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df["int64_too"] + series_pandas = scalars_pandas_df["int64_too"] + + def add_ints(x, y): + return x + y + + add_ints_mf = session.udf( + input_types=[int, int], + output_type=int, + dataset=dataset_id_permanent, + name=get_function_name(add_ints, is_row_processor=True), + )(add_ints) + assert add_ints_mf.bigframes_bigquery_function # type: ignore + + with pytest.warns( + bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." + ): + bf_result = ( + bpd.DataFrame({"x": series, "y": series}) + .apply(add_ints_mf, axis=1) + .to_pandas() + ) + + pd_result = pd.DataFrame({"x": series_pandas, "y": series_pandas}).apply( + lambda row: add_ints(row["x"], row["y"]), axis=1 + ) + + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_exact=True + ) diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 99a017c917..c12d0e03f5 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -29,7 +29,7 @@ from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff import bigframes.session._io.bigquery -from tests.system.utils import assert_pandas_df_equal +from tests.system.utils import assert_pandas_df_equal, get_function_name _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -92,20 +92,6 @@ def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: return session -def get_rf_name(func, package_requirements=None, is_row_processor=False): - """Get a remote function name for testing given a udf.""" - # Augment user package requirements with any internal package - # requirements - package_requirements = bff_utils._get_updated_package_requirements( - package_requirements, is_row_processor - ) - - # Compute a unique hash representing the user code - function_hash = bff_utils._get_hash(func, package_requirements) - - return f"bigframes_{function_hash}" - - @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, @@ -130,7 +116,7 @@ def square(x): bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - name=get_rf_name(square), + name=get_function_name(square), )(square) # Function should still work normally. @@ -189,7 +175,7 @@ def square(x): bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - name=get_rf_name(square), + name=get_function_name(square), )(square) # Function should still work normally. @@ -248,7 +234,7 @@ def square(x): bigquery_connection=bq_cf_connection_location_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - name=get_rf_name(square), + name=get_function_name(square), )(square) @@ -276,7 +262,7 @@ def square(x): bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. 
reuse=True, - name=get_rf_name(square), + name=get_function_name(square), )(square) # Function should still work normally. @@ -337,7 +323,7 @@ def square(x): bigquery_connection=bq_cf_connection_location_project_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - name=get_rf_name(square), + name=get_function_name(square), )(square) @@ -353,7 +339,7 @@ def square(x): int, session=session_with_bq_connection, dataset=dataset_id_permanent, - name=get_rf_name(square), + name=get_function_name(square), )(square) # Function should still work normally. @@ -398,7 +384,7 @@ def square(x): # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. square = session_with_bq_connection.remote_function( - int, int, dataset_id_permanent, name=get_rf_name(square) + int, int, dataset_id_permanent, name=get_function_name(square) )(square) # Function should still work normally. @@ -442,7 +428,7 @@ def square(x): bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - name=get_rf_name(square), + name=get_function_name(square), )(square) # Function should still work normally. @@ -480,7 +466,7 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_rf_name(add_one) + [int], int, dataset_id_permanent, name=get_function_name(add_one) )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -511,7 +497,7 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_rf_name(add_one) + [int], int, dataset_id_permanent, name=get_function_name(add_one) )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -542,7 +528,7 @@ def add_one(x): return x + 1 remote_add_one = session_with_bq_connection.remote_function( - [int], int, dataset_id_permanent, name=get_rf_name(add_one) + [int], int, dataset_id_permanent, name=get_function_name(add_one) )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -586,7 +572,7 @@ def bytes_to_hex(mybytes: bytes) -> bytes: packages = ["pandas"] remote_bytes_to_hex = session_with_bq_connection.remote_function( dataset=dataset_id_permanent, - name=get_rf_name(bytes_to_hex, package_requirements=packages), + name=get_function_name(bytes_to_hex, package_requirements=packages), packages=packages, )(bytes_to_hex) bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() @@ -630,7 +616,10 @@ def add_one(x): return x + 1 # pragma: NO COVER session.remote_function( - [int], int, dataset=dataset_id_permanent, name=get_rf_name(add_one) + [int], + int, + dataset=dataset_id_permanent, + name=get_function_name(add_one), )(add_one) @@ -669,7 +658,7 @@ def square1(x): resource_manager_client=resourcemanager_client, bigquery_connection=bq_cf_connection, reuse=True, - name=get_rf_name(square1), + name=get_function_name(square1), )(square1) # Function should still work normally. 
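A condensed sketch of the `remote_function` pattern these tests repeat, with placeholder names. Deploying it needs an authenticated session, a BigQuery connection, and Cloud Functions permissions, so treat this as illustrative rather than copy-paste ready.

import bigframes


def deploy_add_one(session: bigframes.Session, dataset_id: str):
    def add_one(x: int) -> int:
        return x + 1

    # Types are passed positionally; a stable dataset and name let the
    # deployed routine (and its backing Cloud Function) be reused across runs.
    return session.remote_function(
        [int],
        int,
        dataset=dataset_id,
        name="bigframes_example_add_one",  # placeholder; the tests hash the udf source instead
    )(add_one)


# Usage sketch:
# add_one_remote = deploy_add_one(session, "my_permanent_dataset")
# bigframes.pandas.Series([1, 2, 3]).apply(add_one_remote).to_pandas()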
@@ -1142,7 +1131,7 @@ def add_ints(row): bigframes.series.Series, int, dataset_id_permanent, - name=get_rf_name(add_ints, is_row_processor=True), + name=get_function_name(add_ints, is_row_processor=True), )(add_ints) assert add_ints_remote.bigframes_remote_function # type: ignore assert add_ints_remote.bigframes_cloud_function # type: ignore @@ -1191,7 +1180,7 @@ def add_ints(row): bigframes.series.Series, int, dataset_id_permanent, - name=get_rf_name(add_ints, is_row_processor=True), + name=get_function_name(add_ints, is_row_processor=True), )(add_ints) bf_result = ( @@ -1230,7 +1219,7 @@ def add_numbers(row): bigframes.series.Series, float, dataset_id_permanent, - name=get_rf_name(add_numbers, is_row_processor=True), + name=get_function_name(add_numbers, is_row_processor=True), )(add_numbers) bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() @@ -1257,7 +1246,9 @@ def add_ints(row): # pandas works scalars_pandas_df.apply(add_ints, axis=1) - with pytest.raises(ValueError, match="For axis=1 a remote function must be used."): + with pytest.raises( + ValueError, match="For axis=1 a bigframes function must be used." + ): scalars_df[columns].apply(add_ints, axis=1) @@ -1281,7 +1272,7 @@ def echo_len(row): bigframes.series.Series, float, dataset_id_permanent, - name=get_rf_name(echo_len, is_row_processor=True), + name=get_function_name(echo_len, is_row_processor=True), )(echo_len) for column in columns_with_not_supported_dtypes: @@ -1314,7 +1305,7 @@ def should_mask(name: str) -> bool: assert "name" in inspect.signature(should_mask).parameters should_mask = session.remote_function( - dataset=dataset_id_permanent, name=get_rf_name(should_mask) + dataset=dataset_id_permanent, name=get_function_name(should_mask) )(should_mask) s = bigframes.series.Series(["Alice", "Bob", "Caroline"]) @@ -1373,7 +1364,7 @@ def is_odd(x: int) -> bool: # create a remote function is_odd_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_rf_name(is_odd) + dataset=dataset_id_permanent, name=get_function_name(is_odd) )(is_odd) # with nulls in the series the remote function application would fail @@ -1423,7 +1414,7 @@ def add(x: int, y: int) -> int: # create a remote function add_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_rf_name(add) + dataset=dataset_id_permanent, name=get_function_name(add) )(add) # with nulls in the series the remote function application would fail @@ -1476,7 +1467,7 @@ def add(x: int, y: int, z: float) -> float: # create a remote function add_remote = session.remote_function( - dataset=dataset_id_permanent, name=get_rf_name(add) + dataset=dataset_id_permanent, name=get_function_name(add) )(add) # pandas does not support nary functions, so let's create a proxy function @@ -1530,7 +1521,8 @@ def is_long_duration(minutes: int) -> bool: return minutes >= 120 is_long_duration = unordered_session.remote_function( - dataset=dataset_id_permanent, name=get_rf_name(is_long_duration) + dataset=dataset_id_permanent, + name=get_function_name(is_long_duration), )(is_long_duration) method = getattr(df["duration_minutes"], method) @@ -1549,7 +1541,7 @@ def combiner(x: int, y: int) -> int: return x combiner = unordered_session.remote_function( - dataset=dataset_id_permanent, name=get_rf_name(combiner) + dataset=dataset_id_permanent, name=get_function_name(combiner) )(combiner) df = scalars_df_index[["int64_col", "int64_too", "float64_col", "string_col"]] @@ -1565,7 +1557,7 @@ def processor(x: int, y: int, z: float, w: str) -> str: return f"I got 
x={x}, y={y}, z={z} and w={w}" processor = unordered_session.remote_function( - dataset=dataset_id_permanent, name=get_rf_name(processor) + dataset=dataset_id_permanent, name=get_function_name(processor) )(processor) df = scalars_df_index[["int64_col", "int64_too", "float64_col", "string_col"]] diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index b27009d9d8..d0987dbdaf 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -162,3 +162,35 @@ def test_geo_to_wkt(): pd_result, check_index=False, ) + + +def test_geo_boundary(): + bf_s = bigframes.pandas.Series( + [ + Polygon([(0, 0), (1, 1), (0, 1)]), + Polygon([(10, 0), (10, 5), (0, 0)]), + Polygon([(0, 0), (2, 2), (2, 0)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ], + ) + + pd_s = geopandas.GeoSeries( + [ + Polygon([(0, 0), (1, 1), (0, 1)]), + Polygon([(10, 0), (10, 5), (0, 0)]), + Polygon([(0, 0), (2, 2), (2, 0)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ], + ) + + bf_result = bf_s.geo.boundary.to_pandas() + pd_result = pd_s.boundary + + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_series_type=False, + check_index=False, + ) diff --git a/tests/system/small/operations/test_dates.py b/tests/system/small/operations/test_dates.py new file mode 100644 index 0000000000..e183bbfe43 --- /dev/null +++ b/tests/system/small/operations/test_dates.py @@ -0,0 +1,73 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
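The geopandas snippet below, runnable locally, shows the `boundary` semantics that the new bigframes test compares against: a polygon's boundary is its ring, a line's boundary is its endpoints, and a point has an empty boundary.

import geopandas
from shapely.geometry import LineString, Point, Polygon

s = geopandas.GeoSeries(
    [
        Polygon([(0, 0), (1, 1), (0, 1)]),
        LineString([(0, 0), (1, 1), (1, 0)]),
        Point(0, 0),
    ]
)
print(s.boundary)
# 0    LINESTRING (0 0, 1 1, 0 1, 0 0)
# 1              MULTIPOINT (0 0, 1 0)
# 2           GEOMETRYCOLLECTION EMPTY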
+ + +import datetime + +import pandas as pd +import pandas.testing + +from bigframes import dtypes + + +def test_date_diff_between_series(session): + pd_df = pd.DataFrame( + { + "col_1": [datetime.date(2025, 1, 2), datetime.date(2025, 2, 1)], + "col_2": [datetime.date(2024, 1, 2), datetime.date(2026, 1, 30)], + } + ).astype(dtypes.DATE_DTYPE) + bf_df = session.read_pandas(pd_df) + + actual_result = (bf_df["col_1"] - bf_df["col_2"]).to_pandas() + + expected_result = (pd_df["col_1"] - pd_df["col_2"]).astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_diff_literal_sub_series(scalars_dfs): + bf_df, pd_df = scalars_dfs + literal = datetime.date(2030, 5, 20) + + actual_result = (literal - bf_df["date_col"]).to_pandas() + + expected_result = (literal - pd_df["date_col"]).astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_diff_series_sub_literal(scalars_dfs): + bf_df, pd_df = scalars_dfs + literal = datetime.date(1980, 5, 20) + + actual_result = (bf_df["date_col"] - literal).to_pandas() + + expected_result = (pd_df["date_col"] - literal).astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_series_diff_agg(scalars_dfs): + bf_df, pd_df = scalars_dfs + + actual_result = bf_df["date_col"].diff().to_pandas() + + expected_result = pd_df["date_col"].diff().astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 356000b3f6..53cb5f7419 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -17,8 +17,10 @@ import operator import numpy as np +from packaging import version import pandas as pd import pandas.testing +import pyarrow as pa import pytest from bigframes import dtypes @@ -38,14 +40,22 @@ def temporal_dfs(session): pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), pd.Timestamp("2005-03-05 02:00:00", tz="UTC"), ], + "date_col": pd.Series( + [ + datetime.date(2000, 1, 1), + datetime.date(2001, 2, 3), + datetime.date(2020, 9, 30), + ], + dtype=pd.ArrowDtype(pa.date32()), + ), "timedelta_col_1": [ pd.Timedelta(5, "s"), - pd.Timedelta(-4, "d"), + pd.Timedelta(-4, "m"), pd.Timedelta(5, "h"), ], "timedelta_col_2": [ pd.Timedelta(3, "s"), - pd.Timedelta(-4, "d"), + pd.Timedelta(-4, "m"), pd.Timedelta(6, "h"), ], "numeric_col": [1.5, 2, -3], @@ -365,6 +375,81 @@ def test_timestamp_sub_dataframes(temporal_dfs): ) +@pytest.mark.parametrize( + ("left_col", "right_col"), + [ + ("date_col", "timedelta_col_1"), + ("timedelta_col_1", "date_col"), + ], +) +def test_date_add__series_add_series(temporal_dfs, left_col, right_col): + if version.Version(pd.__version__) < version.Version("2.1.0"): + pytest.skip("not supported by Pandas < 2.1.0") + + bf_df, pd_df = temporal_dfs + + actual_result = (bf_df[left_col] + bf_df[right_col]).to_pandas() + + expected_result = (pd_df[left_col] + pd_df[right_col]).astype(dtypes.DATETIME_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +# Pandas does not support date literal + timedelta series so we don't test it here. 
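The semantics exercised by these date tests can be reproduced with plain pandas (2.1 or newer) and Arrow-backed date columns, which is the reference the tests compare against; the values below are arbitrary.

import datetime

import pandas as pd
import pyarrow as pa

dates = pd.Series(
    [datetime.date(2025, 1, 2), datetime.date(2025, 2, 1)],
    dtype=pd.ArrowDtype(pa.date32()),
)
other = pd.Series(
    [datetime.date(2024, 1, 2), datetime.date(2026, 1, 30)],
    dtype=pd.ArrowDtype(pa.date32()),
)

print(dates - other)                 # date - date yields a duration (timedelta)
print(dates + pd.Timedelta(1, "d"))  # date +/- timedelta shifts the date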
+def test_date_add__literal_add_series(temporal_dfs): + bf_df, pd_df = temporal_dfs + literal = pd.Timedelta(1, "d") + + actual_result = (literal + bf_df["date_col"]).to_pandas() + + expected_result = (literal + pd_df["date_col"]).astype(dtypes.DATETIME_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +# Pandas does not support timedelta series + date literal so we don't test it here. +def test_date_add__series_add_literal(temporal_dfs): + bf_df, pd_df = temporal_dfs + literal = pd.Timedelta(1, "d") + + actual_result = (bf_df["date_col"] + literal).to_pandas() + + expected_result = (pd_df["date_col"] + literal).astype(dtypes.DATETIME_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_sub__series_sub_series(temporal_dfs): + if version.Version(pd.__version__) < version.Version("2.1.0"): + pytest.skip("not supported by Pandas < 2.1.0") + + bf_df, pd_df = temporal_dfs + + actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas() + + expected_result = (pd_df["date_col"] - pd_df["timedelta_col_1"]).astype( + dtypes.DATETIME_DTYPE + ) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_sub__series_sub_literal(temporal_dfs): + bf_df, pd_df = temporal_dfs + literal = pd.Timedelta(1, "d") + + actual_result = (bf_df["date_col"] - literal).to_pandas() + + expected_result = (pd_df["date_col"] - literal).astype(dtypes.DATETIME_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + @pytest.mark.parametrize( "compare_func", [ @@ -465,3 +550,49 @@ def test_timedelta_ordering(session): pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) + + +def test_timedelta_cumsum(temporal_dfs): + bf_df, pd_df = temporal_dfs + + actual_result = bf_df["timedelta_col_1"].cumsum().to_pandas() + + expected_result = pd_df["timedelta_col_1"].cumsum() + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.min(), id="min"), + pytest.param(lambda x: x.max(), id="max"), + pytest.param(lambda x: x.sum(), id="sum"), + pytest.param(lambda x: x.mean(), id="mean"), + pytest.param(lambda x: x.median(), id="median"), + pytest.param(lambda x: x.quantile(0.5), id="quantile"), + pytest.param(lambda x: x.std(), id="std"), + ], +) +def test_timedelta_agg__timedelta_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]).floor("us") + assert actual_result == expected_result + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.count(), id="count"), + pytest.param(lambda x: x.nunique(), id="nunique"), + ], +) +def test_timedelta_agg__int_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]) + assert actual_result == expected_result diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 26b941a596..db777137b0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -662,7 +662,25 @@ def test_rename(scalars_dfs): def test_df_peek(scalars_dfs_maybe_ordered): scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + session = 
scalars_df._block.session + slot_millis_sum = session.slot_millis_sum peek_result = scalars_df.peek(n=3, force=False) + + assert session.slot_millis_sum - slot_millis_sum > 1000 + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_with_large_results_not_allowed(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + session = scalars_df._block.session + slot_millis_sum = session.slot_millis_sum + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) + + # The metrics won't be fully updated when we call query_and_wait. + assert session.slot_millis_sum - slot_millis_sum < 500 pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 @@ -2214,7 +2232,7 @@ def test_combine_first( ), ], ) -def test_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): +def test_df_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas() @@ -2223,12 +2241,18 @@ def test_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=~scalars_df._block.session._strictly_ordered, ) -def test_corr_w_invalid_parameters(scalars_dfs): +def test_df_corr_w_invalid_parameters(scalars_dfs): columns = ["int64_too", "int64_col", "float64_col"] scalars_df, _ = scalars_dfs @@ -2261,8 +2285,14 @@ def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. 
pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=~scalars_df._block.session._strictly_ordered, ) @@ -3414,6 +3444,24 @@ def test_iloc_tuple(scalars_df_index, scalars_pandas_df_index, index): assert bf_result == pd_result +@pytest.mark.parametrize( + "index", + [(slice(None), [1, 2, 3]), (slice(1, 7, 2), [2, 5, 3])], +) +def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index].to_pandas() + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): + index = (2, [2, 1, 3, -4]) + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + pd.testing.assert_series_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("index", "error"), [ @@ -4504,11 +4552,28 @@ def test_loc_bf_index_integer_index_renamed_col( ) def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): columns = ["bool_col", "int64_too", "int64_col"] - bf_series = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() - pd_series = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) + bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() + pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) pd.testing.assert_frame_equal( - pd_series, - bf_series, + pd_df, + bf_df, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_df_drop_duplicates_w_json(json_df, keep): + bf_df = json_df.drop_duplicates(keep=keep).to_pandas() + pd_df = json_df.to_pandas().drop_duplicates(keep=keep) + pd.testing.assert_frame_equal( + pd_df, + bf_df, ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index b07213f943..4758c2d5b4 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -32,7 +32,10 @@ import typing +from google.cloud import bigquery + import bigframes +from bigframes import dtypes import bigframes.dataframe import bigframes.features import bigframes.pandas as bpd @@ -249,6 +252,32 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_to_pandas_override_global_option(scalars_df_index): + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + scalars_df_index.to_pandas() + table_id = scalars_df_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. + scalars_df_index.to_pandas(allow_large_results=False) + assert scalars_df_index._query_job.destination.table_id == table_id + + +def test_to_arrow_override_global_option(scalars_df_index): + # Direct call to_arrow uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + scalars_df_index.to_arrow() + table_id = scalars_df_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. 
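For readers of the new `allow_large_results` tests: the user-facing behavior is a per-call override of the session default, trading support for very large results for lower latency on small ones. A minimal sketch, assuming an authenticated session; the public table is only an example.

import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

# Default path: results may be materialized to a BigQuery destination table,
# which is what the `bqdf`-prefixed table assertions above check for.
preview = df.peek(n=3)

# Per-call override: skip the large-results machinery for small outputs.
preview = df.peek(n=3, allow_large_results=False)
local = df.to_pandas(allow_large_results=False)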
+ scalars_df_index.to_arrow(allow_large_results=False) + assert scalars_df_index._query_job.destination.table_id == table_id + + def test_load_json_w_unboxed_py_value(session): sql = """ SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, @@ -673,6 +702,58 @@ def test_to_gbq_w_json(bigquery_client): assert table.schema[1].field_type == "JSON" +def test_to_gbq_with_timedelta(bigquery_client, dataset_id): + destination_table = f"{dataset_id}.test_to_gbq_with_timedelta" + s1 = bpd.Series([1, 2, 3, 4]) + s2 = bpd.to_timedelta(bpd.Series([1, 2, 3, 4]), unit="s") + df = bpd.DataFrame({"id": s1, "timedelta_col": s2}) + + df.to_gbq(destination_table) + table = bigquery_client.get_table(destination_table) + + assert table.schema[1].name == "timedelta_col" + assert table.schema[1].field_type == "INTEGER" + assert dtypes.TIMEDELTA_DESCRIPTION_TAG in table.schema[1].description + + +def test_gbq_round_trip_with_timedelta(session, dataset_id): + destination_table = f"{dataset_id}.test_gbq_roundtrip_with_timedelta" + df = pd.DataFrame( + { + "col_1": [1], + "col_2": [pd.Timedelta(1, "s")], + "col_3": [1.1], + } + ) + bpd.DataFrame(df).to_gbq(destination_table) + + result = session.read_gbq(destination_table) + + assert result["col_1"].dtype == dtypes.INT_DTYPE + assert result["col_2"].dtype == dtypes.TIMEDELTA_DTYPE + assert result["col_3"].dtype == dtypes.FLOAT_DTYPE + + +def test_to_gbq_timedelta_tag_ignored_when_appending(bigquery_client, dataset_id): + # First, create a table + destination_table = f"{dataset_id}.test_to_gbq_timedelta_tag_ignored_when_appending" + schema = [bigquery.SchemaField("my_col", "INTEGER")] + bigquery_client.create_table(bigquery.Table(destination_table, schema)) + + # Then, append to that table with timedelta values + df = pd.DataFrame( + { + "my_col": [pd.Timedelta(1, "s")], + } + ) + bpd.DataFrame(df).to_gbq(destination_table, if_exists="append") + + table = bigquery_client.get_table(destination_table) + assert table.schema[0].name == "my_col" + assert table.schema[0].field_type == "INTEGER" + assert table.schema[0].description is None + + @pytest.mark.parametrize( ("index"), [True, False], diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index cbf6e1269d..b7101c90f3 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas # ================= # DataFrame.groupby @@ -94,6 +94,72 @@ def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q ) +@skip_legacy_pandas +@pytest.mark.parametrize( + ("na_option", "method", "ascending"), + [ + ( + "keep", + "average", + True, + ), + ( + "top", + "min", + False, + ), + ( + "bottom", + "max", + False, + ), + ( + "top", + "first", + False, + ), + ( + "bottom", + "dense", + False, + ), + ], +) +def test_dataframe_groupby_rank( + scalars_df_index, + scalars_pandas_df_index, + na_option, + method, + ascending, +): + col_names = ["int64_too", "float64_col", "int64_col", "string_col"] + bf_result = ( + scalars_df_index[col_names] + .groupby("string_col") + .rank( + na_option=na_option, + method=method, + ascending=ascending, + ) + ).to_pandas() + pd_result = ( + ( + scalars_pandas_df_index[col_names] + .groupby("string_col") + .rank( + na_option=na_option, + method=method, + ascending=ascending, + ) + ) + .astype("float64") + .astype("Float64") + ) + 
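To complement the round-trip tests above (whose assertions continue just below): timedelta columns are written to BigQuery as an INTEGER column carrying a timedelta tag in its description, and reading the table back restores the timedelta dtype. A sketch assuming an authenticated session and a writable dataset; `my_dataset` is a placeholder.

import bigframes.pandas as bpd

df = bpd.DataFrame(
    {
        "id": bpd.Series([1, 2, 3]),
        "timedelta_col": bpd.to_timedelta(bpd.Series([1, 2, 3]), unit="s"),
    }
)

# Stored as an INTEGER column whose description carries the timedelta tag...
df.to_gbq("my_dataset.timedelta_round_trip")

# ...and restored with a timedelta dtype when read back.
print(bpd.read_gbq("my_dataset.timedelta_round_trip").dtypes)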
pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("operator"), [ @@ -534,6 +600,72 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): ) +@skip_legacy_pandas +@pytest.mark.parametrize( + ("na_option", "method", "ascending"), + [ + ( + "keep", + "average", + True, + ), + ( + "top", + "min", + False, + ), + ( + "bottom", + "max", + False, + ), + ( + "top", + "first", + False, + ), + ( + "bottom", + "dense", + False, + ), + ], +) +def test_series_groupby_rank( + scalars_df_index, + scalars_pandas_df_index, + na_option, + method, + ascending, +): + col_names = ["int64_col", "string_col"] + bf_result = ( + scalars_df_index[col_names] + .groupby("string_col")["int64_col"] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + ) + ).to_pandas() + pd_result = ( + ( + scalars_pandas_df_index[col_names] + .groupby("string_col")["int64_col"] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + ) + ) + .astype("float64") + .astype("Float64") + ) + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize("dropna", [True, False]) def test_series_groupby_head(scalars_df_index, scalars_pandas_df_index, dropna): bf_result = ( diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py new file mode 100644 index 0000000000..a7cd4013b9 --- /dev/null +++ b/tests/system/small/test_index_io.py @@ -0,0 +1,43 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_to_pandas_override_global_option(scalars_df_index): + bf_index = scalars_df_index.index + + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_index.to_pandas() + table_id = bf_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. + bf_index.to_pandas(allow_large_results=False) + assert bf_index._query_job.destination.table_id == table_id + + +def test_to_numpy_override_global_option(scalars_df_index): + bf_index = scalars_df_index.index + + # Direct call to_numpy uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_index.to_numpy() + table_id = bf_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. 
+ bf_index.to_numpy(allow_large_results=False) + assert bf_index._query_job.destination.table_id == table_id diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 2daa7dd825..980f2226b7 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2269,11 +2269,36 @@ def test_head_then_series_operation(scalars_dfs): def test_series_peek(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs + + session = scalars_df._block.session + slot_millis_sum = session.slot_millis_sum peek_result = scalars_df["float64_col"].peek(n=3, force=False) + + assert session.slot_millis_sum - slot_millis_sum > 1000 + pd.testing.assert_series_equal( + peek_result, + scalars_pandas_df["float64_col"].reindex_like(peek_result), + ) + assert len(peek_result) == 3 + + +def test_series_peek_with_large_results_not_allowed(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + session = scalars_df._block.session + slot_millis_sum = session.slot_millis_sum + peek_result = scalars_df["float64_col"].peek( + n=3, force=False, allow_large_results=False + ) + + # The metrics won't be fully updated when we call query_and_wait. + print(session.slot_millis_sum - slot_millis_sum) + assert session.slot_millis_sum - slot_millis_sum < 500 pd.testing.assert_series_equal( peek_result, scalars_pandas_df["float64_col"].reindex_like(peek_result), ) + assert len(peek_result) == 3 def test_series_peek_multi_index(scalars_dfs): diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py new file mode 100644 index 0000000000..d44d1e5b24 --- /dev/null +++ b/tests/system/small/test_series_io.py @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_to_pandas_override_global_option(scalars_df_index): + bf_series = scalars_df_index["int64_col"] + + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_series.to_pandas() + table_id = bf_series._query_job.destination.table_id + assert table_id.startswith("bqdf") + + session = bf_series._block.session + execution_count = session._metrics.execution_count + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. + bf_series.to_pandas(allow_large_results=False) + assert bf_series._query_job.destination.table_id == table_id + assert session._metrics.execution_count - execution_count == 1 diff --git a/tests/system/utils.py b/tests/system/utils.py index 0772468085..fd8feb0eeb 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -383,3 +383,44 @@ def delete_cloud_function( def get_first_file_from_wildcard(path): return path.replace("*", "000000000000") + + +def cleanup_function_assets( + bigframes_func, + bigquery_client, + cloudfunctions_client=None, + ignore_failures=True, +) -> None: + """Clean up the GCP assets behind a bigframess function.""" + + # Clean up bigframes function. 
+ try: + bigquery_client.delete_routine(bigframes_func.bigframes_bigquery_function) + except Exception: + # By default don't raise exception in cleanup. + if not ignore_failures: + raise + + # Clean up cloud function + try: + delete_cloud_function( + cloudfunctions_client, bigframes_func.bigframes_cloud_function + ) + except Exception: + # By default don't raise exception in cleanup. + if not ignore_failures: + raise + + +def get_function_name(func, package_requirements=None, is_row_processor=False): + """Get a bigframes function name for testing given a udf.""" + # Augment user package requirements with any internal package + # requirements. + package_requirements = bff_utils._get_updated_package_requirements( + package_requirements, is_row_processor + ) + + # Compute a unique hash representing the user code. + function_hash = bff_utils._get_hash(func, package_requirements) + + return f"bigframes_{function_hash}" diff --git a/tests/unit/polars_session.py b/tests/unit/polars_session.py index cffd8ff7ca..a27db0e438 100644 --- a/tests/unit/polars_session.py +++ b/tests/unit/polars_session.py @@ -13,7 +13,7 @@ # limitations under the License. import dataclasses -from typing import Mapping, Optional, Union +from typing import Optional, Union import weakref import polars @@ -39,9 +39,7 @@ def execute( array_value: bigframes.core.ArrayValue, *, ordered: bool = True, - col_id_overrides: Mapping[str, str] = {}, - use_explicit_destination: bool = False, - get_size_bytes: bool = False, + use_explicit_destination: Optional[bool] = False, page_size: Optional[int] = None, max_results: Optional[int] = None, ): diff --git a/tests/unit/resources.py b/tests/unit/resources.py index c091eac2a2..ebc1243eaf 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -24,6 +24,7 @@ import bigframes.clients import bigframes.core.ordering import bigframes.dataframe +import bigframes.series import bigframes.session.clients import bigframes.session.executor import bigframes.session.metrics diff --git a/tests/unit/test_dataframe_io.py b/tests/unit/test_dataframe_io.py new file mode 100644 index 0000000000..5deb0d7a24 --- /dev/null +++ b/tests/unit/test_dataframe_io.py @@ -0,0 +1,51 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import Mock + +import pytest + +from . 
import resources + + +@pytest.fixture +def mock_df(monkeypatch: pytest.MonkeyPatch): + dataframe = resources.create_dataframe(monkeypatch) + monkeypatch.setattr(dataframe, "to_pandas", Mock()) + return dataframe + + +@pytest.mark.parametrize( + "api_name, kwargs", + [ + ("to_csv", {"allow_large_results": True}), + ("to_json", {"allow_large_results": True}), + ("to_numpy", {"allow_large_results": True}), + ("to_parquet", {"allow_large_results": True}), + ("to_dict", {"allow_large_results": True}), + ("to_excel", {"excel_writer": "abc", "allow_large_results": True}), + ("to_latex", {"allow_large_results": True}), + ("to_records", {"allow_large_results": True}), + ("to_string", {"allow_large_results": True}), + ("to_html", {"allow_large_results": True}), + ("to_markdown", {"allow_large_results": True}), + ("to_pickle", {"path": "abc", "allow_large_results": True}), + ("to_orc", {"allow_large_results": True}), + ], +) +def test_dataframe_to_pandas(mock_df, api_name, kwargs): + getattr(mock_df, api_name)(**kwargs) + mock_df.to_pandas.assert_called_once_with( + allow_large_results=kwargs["allow_large_results"] + ) diff --git a/tests/unit/test_series_io.py b/tests/unit/test_series_io.py new file mode 100644 index 0000000000..a97293d3da --- /dev/null +++ b/tests/unit/test_series_io.py @@ -0,0 +1,50 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import Mock + +import pytest + +from . 
import resources + + +@pytest.fixture +def mock_series(monkeypatch: pytest.MonkeyPatch): + dataframe = resources.create_dataframe(monkeypatch) + series = dataframe["col"] + monkeypatch.setattr(series, "to_pandas", Mock()) + return series + + +@pytest.mark.parametrize( + "api_name, kwargs", + [ + ("to_csv", {"allow_large_results": True}), + ("to_dict", {"allow_large_results": True}), + ("to_excel", {"excel_writer": "abc", "allow_large_results": True}), + ("to_json", {"allow_large_results": True}), + ("to_latex", {"allow_large_results": True}), + ("to_list", {"allow_large_results": True}), + ("to_markdown", {"allow_large_results": True}), + ("to_numpy", {"allow_large_results": True}), + ("to_pickle", {"path": "abc", "allow_large_results": True}), + ("to_string", {"allow_large_results": True}), + ("to_xarray", {"allow_large_results": True}), + ], +) +def test_series_allow_large_results_param_passing(mock_series, api_name, kwargs): + getattr(mock_series, api_name)(**kwargs) + mock_series.to_pandas.assert_called_once_with( + allow_large_results=kwargs["allow_large_results"] + ) diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index b7040d4321..a2e7b74059 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -91,6 +91,46 @@ def y(self) -> bigframes.series.Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def boundary(self) -> bigframes.geopandas.GeoSeries: + """ + Returns a GeoSeries of lower dimensional objects representing each + geometry's set-theoretic boundary. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import geopandas.array + >>> import shapely + >>> bpd.options.display.progress_bar = None + + >>> from shapely.geometry import Polygon, LineString, Point + >>> s = geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (1, 1), (0, 1)]), + ... LineString([(0, 0), (1, 1), (1, 0)]), + ... Point(0, 0), + ... ] + ... ) + >>> s + 0 POLYGON ((0 0, 1 1, 0 1, 0 0)) + 1 LINESTRING (0 0, 1 1, 1 0) + 2 POINT (0 0) + dtype: geometry + + >>> s.boundary + 0 LINESTRING (0 0, 1 1, 0 1, 0 0) + 1 MULTIPOINT (0 0, 1 0) + 2 GEOMETRYCOLLECTION EMPTY + dtype: geometry + + Returns: + bigframes.geopandas.GeoSeries: + A GeoSeries of lower dimensional objects representing each + geometry's set-theoretic boundary + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @classmethod def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: """ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e296dcb9f6..e59232ee85 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -365,7 +365,15 @@ def from_records( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: + def to_numpy( + self, + dtype=None, + copy=False, + na_value=None, + *, + allow_large_results=None, + **kwargs, + ) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -388,7 +396,9 @@ def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarra na_value (Any, default None): The value to use for missing values. The default value depends on dtype and the dtypes of the DataFrame columns. 
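The new unit tests all follow one pattern: stub out `to_pandas` and assert that each exporter forwards `allow_large_results` to it. A generic, runnable sketch of that pattern, using a hypothetical `FakeSeries` in place of the real bigframes objects:

from unittest.mock import Mock

import pytest


class FakeSeries:
    # Hypothetical stand-in: like the bigframes objects under test, every
    # exporter funnels through to_pandas().
    def to_pandas(self, allow_large_results=None):
        raise NotImplementedError  # replaced by a Mock in the test

    def to_list(self, *, allow_large_results=None):
        return list(self.to_pandas(allow_large_results=allow_large_results))


@pytest.mark.parametrize("allow_large_results", [True, False])
def test_to_list_forwards_allow_large_results(monkeypatch, allow_large_results):
    series = FakeSeries()
    monkeypatch.setattr(series, "to_pandas", Mock(return_value=[1, 2, 3]))

    series.to_list(allow_large_results=allow_large_results)

    series.to_pandas.assert_called_once_with(allow_large_results=allow_large_results)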
- + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: numpy.ndarray: The converted NumPy array. """ @@ -509,6 +519,7 @@ def to_parquet( *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[bytes]: """Write a DataFrame to the binary Parquet format. @@ -534,14 +545,16 @@ def to_parquet( should be formatted ``gs:///``. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. - compression (str, default 'snappy'): Name of the compression to use. Use ``None`` for no compression. Supported options: ``'gzip'``, ``'snappy'``. - index (bool, default True): If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. This parameter has + no effect when results are saved to Google Cloud Storage (GCS). Returns: None or bytes: @@ -560,6 +573,8 @@ def to_dict( "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> dict | list[dict]: """ @@ -613,11 +628,13 @@ def to_dict( in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. - index (bool, default True): Whether to include the index item (and index_names item if `orient` is 'tight') in the returned dictionary. Can only be ``False`` when `orient` is 'split' or 'tight'. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: dict or list of dict: Return a collections.abc.Mapping object representing the DataFrame. @@ -625,7 +642,14 @@ def to_dict( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: + def to_excel( + self, + excel_writer, + sheet_name: str = "Sheet1", + *, + allow_large_results: Optional[bool] = None, + **kwargs, + ) -> None: """ Write DataFrame to an Excel sheet. @@ -653,11 +677,21 @@ def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: File path or existing ExcelWriter. sheet_name (str, default 'Sheet1'): Name of sheet which will contain DataFrame. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_latex( - self, buf=None, columns=None, header=True, index=True, **kwargs + self, + buf=None, + columns=None, + header=True, + index=True, + *, + allow_large_results=None, + **kwargs, ) -> str | None: r""" Render object to a LaTeX tabular, longtable, or nested table. @@ -693,6 +727,9 @@ def to_latex( it is assumed to be aliases for the column names. index (bool, default True): Write row names (index). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. 
Returns: str or None: If buf is None, returns the result as a string. Otherwise returns @@ -701,7 +738,12 @@ def to_latex( raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_records( - self, index: bool = True, column_dtypes=None, index_dtypes=None + self, + index: bool = True, + column_dtypes=None, + index_dtypes=None, + *, + allow_large_results=None, ) -> np.recarray: """ Convert DataFrame to a NumPy record array. @@ -731,6 +773,9 @@ def to_records( If a string or type, the data type to store all index levels. If a dictionary, a mapping of index level names and indices (zero-indexed) to specific data types. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. This mapping is applied only if `index=True`. @@ -761,6 +806,8 @@ def to_string( min_rows: int | None = None, max_colwidth: int | None = None, encoding: str | None = None, + *, + allow_large_results: Optional[bool] = None, ): """Render a DataFrame to a console-friendly tabular output. @@ -824,6 +871,9 @@ def to_string( Max width to truncate each column in characters. By default, no limit. encoding (str, default "utf-8"): Set character encoding. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: str or None: If buf is None, returns the result as a string. Otherwise returns @@ -856,6 +906,8 @@ def to_html( table_id: str | None = None, render_links: bool = False, encoding: str | None = None, + *, + allow_large_results: bool | None = None, ): """Render a DataFrame as an HTML table. @@ -948,6 +1000,9 @@ def to_html( Convert URLs to HTML links. encoding (str, default "utf-8"): Set character encoding. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: str or None: If buf is None, returns the result as a string. Otherwise @@ -960,6 +1015,8 @@ def to_markdown( buf=None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ): """Print DataFrame in Markdown-friendly format. @@ -983,6 +1040,9 @@ def to_markdown( Mode in which file is opened. index (bool, optional, default True): Add index (row) labels. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. **kwargs These parameters will be passed to `tabulate `_. @@ -992,7 +1052,7 @@ def to_markdown( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_pickle(self, path, **kwargs) -> None: + def to_pickle(self, path, *, allow_large_results, **kwargs) -> None: """Pickle (serialize) object to file. **Examples:** @@ -1007,10 +1067,13 @@ def to_pickle(self, path, **kwargs) -> None: Args: path (str): File path where the pickled object will be stored. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_orc(self, path=None, **kwargs) -> bytes | None: + def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | None: """ Write a DataFrame to the ORC format. 
@@ -1030,6 +1093,9 @@ def to_orc(self, path=None, **kwargs) -> bytes | None: we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: bytes or None: diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 9dae802b6e..ee35bfa429 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -223,6 +223,7 @@ def to_json( *, index: bool = True, lines: bool = False, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: """Convert the object to a JSON string, written to Cloud Storage. @@ -278,6 +279,11 @@ def to_json( throw ValueError if incorrect 'orient' since others are not list-like. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. This parameter has + no effect when results are saved to Google Cloud Storage (GCS). + Returns: None or str: If path_or_buf is None, returns the resulting json format as a @@ -289,7 +295,13 @@ def to_json( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_csv(self, path_or_buf, *, index: bool = True) -> Optional[str]: + def to_csv( + self, + path_or_buf, + *, + index: bool = True, + allow_large_results: Optional[bool] = None, + ) -> Optional[str]: """Write object to a comma-separated values (csv) file on Cloud Storage. Args: @@ -313,6 +325,11 @@ def to_csv(self, path_or_buf, *, index: bool = True) -> Optional[str]: index (bool, default True): If True, write row names (index). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. This parameter has + no effect when results are saved to Google Cloud Storage (GCS). + Returns: None or str: If path_or_buf is None, returns the resulting json format as a string. Otherwise returns None. diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 1e30d827ca..31a9aa6a93 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -363,6 +363,77 @@ def var( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rank( + self, + method: str = "average", + ascending: bool = True, + na_option: str = "keep", + ): + """ + Provide the rank of values within each group. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame( + ... { + ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], + ... } + ... ) + >>> df + group value + 0 a 2 + 1 a 4 + 2 a 2 + 3 a 3 + 4 a 5 + 5 b 1 + 6 b 2 + 7 b 4 + 8 b 1 + 9 b 5 + + [10 rows x 2 columns] + >>> for method in ['average', 'min', 'max', 'dense', 'first']: + ... 
df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) + >>> df + group value average_rank min_rank max_rank dense_rank first_rank + 0 a 2 1.5 1.0 2.0 1.0 1.0 + 1 a 4 4.0 4.0 4.0 3.0 4.0 + 2 a 2 1.5 1.0 2.0 1.0 2.0 + 3 a 3 3.0 3.0 3.0 2.0 3.0 + 4 a 5 5.0 5.0 5.0 4.0 5.0 + 5 b 1 1.5 1.0 2.0 1.0 1.0 + 6 b 2 3.0 3.0 3.0 2.0 3.0 + 7 b 4 4.0 4.0 4.0 3.0 4.0 + 8 b 1 1.5 1.0 2.0 1.0 2.0 + 9 b 5 5.0 5.0 5.0 4.0 5.0 + + [10 rows x 7 columns] + + Args: + method ({'average', 'min', 'max', 'first', 'dense'}, default 'average'): + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. + ascending (bool, default True): + False for ranks by high (1) to low (N). + na_option ({'keep', 'top', 'bottom'}, default 'keep'): + * keep: leave NA values where they are. + * top: smallest rank if ascending. + * bottom: smallest rank if descending. + + Returns: + DataFrame with ranking of values within each group + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def skew( self, *, diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 59504ee68c..c94f707671 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1061,13 +1061,16 @@ def drop_duplicates(self, *, keep: str = "first"): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_numpy(self, dtype): + def to_numpy(self, dtype, *, allow_large_results=None): """ A NumPy ndarray representing the values in this Series or Index. Args: dtype: The dtype to pass to :meth:`numpy.asarray`. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. **kwargs: Additional keywords passed through to the ``to_numpy`` method of the underlying array (for extension arrays). diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5e6f546d09..913a2e7c3e 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -458,6 +458,8 @@ def to_string( name: bool = False, max_rows: int | None = None, min_rows: int | None = None, + *, + allow_large_results: Optional[bool] = None, ) -> str | None: """ Render a string representation of the Series. @@ -486,6 +488,9 @@ def to_string( min_rows (int, optional): The number of rows to display in a truncated repr (when number of rows is above `max_rows`). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: str or None: @@ -498,6 +503,8 @@ def to_markdown( buf: IO[str] | None = None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: """ @@ -537,6 +544,9 @@ def to_markdown( Buffer to write to. If None, the output is returned as a string. mode (str, optional): Mode in which file is opened, "wt" by default. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. 
index (bool, optional, default True): Add index (row) labels. @@ -546,7 +556,12 @@ def to_markdown( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_dict(self, into: type[dict] = dict) -> Mapping: + def to_dict( + self, + into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, + ) -> Mapping: """ Convert Series to {label -> value} dict or dict-like object. @@ -573,6 +588,9 @@ def to_dict(self, into: type[dict] = dict) -> Mapping: object. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: collections.abc.Mapping: @@ -611,7 +629,13 @@ def to_frame(self, name=None) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_excel(self, excel_writer, sheet_name): + def to_excel( + self, + excel_writer, + sheet_name, + *, + allow_large_results=None, + ): """ Write Series to an Excel sheet. @@ -630,10 +654,22 @@ def to_excel(self, excel_writer, sheet_name): File path or existing ExcelWriter. sheet_name (str, default 'Sheet1'): Name of sheet to contain Series. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): + def to_latex( + self, + buf=None, + columns=None, + header=True, + index=True, + *, + allow_large_results=None, + **kwargs, + ): """ Render object to a LaTeX tabular, longtable, or nested table. @@ -647,6 +683,9 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): it is assumed to be aliases for the column names. index (bool, default True): Write row names (index). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: str or None: @@ -655,7 +694,7 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def tolist(self) -> list: + def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: """ Return a list of the values. @@ -678,6 +717,11 @@ def tolist(self) -> list: >>> s.to_list() [1, 2, 3] + Args: + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. + Returns: list: list of the values. @@ -686,7 +730,7 @@ def tolist(self) -> list: to_list = tolist - def to_numpy(self, dtype, copy=False, na_value=None): + def to_numpy(self, dtype, copy=False, na_value=None, *, allow_large_results=None): """ A NumPy ndarray representing the values in this Series or Index. @@ -727,6 +771,9 @@ def to_numpy(self, dtype, copy=False, na_value=None): na_value (Any, optional): The value to use for missing values. The default value depends on `dtype` and the type of the array. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. 
``**kwargs``: Additional keywords passed through to the ``to_numpy`` method of the underlying array (for extension arrays). @@ -738,7 +785,7 @@ def to_numpy(self, dtype, copy=False, na_value=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_pickle(self, path, **kwargs): + def to_pickle(self, path, *, allow_large_results=None, **kwargs): """ Pickle (serialize) object to file. @@ -776,13 +823,16 @@ def to_pickle(self, path, **kwargs): String, path object (implementing ``os.PathLike[str]``), or file-like object implementing a binary ``write()`` function. File path where the pickled object will be stored. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: None """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_xarray(self): + def to_xarray(self, *, allow_large_results=None): """ Return an xarray object from the pandas object. @@ -791,6 +841,9 @@ def to_xarray(self): Data in the pandas structure converted to Dataset if the object is a DataFrame, or a DataArray if the object is a Series. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 762deda9ff..f743c7e94d 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.38.0" +__version__ = "1.39.0" + +# {x-release-please-start-date} +__release_date__ = "2025-03-05" +# {x-release-please-end}
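
A note on the allow_large_results keyword threaded through the I/O methods above: None (the default) defers to the session-wide BigQuery option, while an explicit True or False overrides it for that single call. The sketch below illustrates that pattern; it assumes the session option is exposed as bpd.options.bigquery.allow_large_results, mirroring the constructor argument added in this release, and only uses methods whose signatures appear in this diff.

import bigframes.pandas as bpd

bpd.options.display.progress_bar = None

# Session-wide default. The BigQueryOptions constructor in this release
# defaults allow_large_results to True; the attribute name here is assumed
# to mirror that constructor argument.
bpd.options.bigquery.allow_large_results = True

df = bpd.DataFrame(
    {
        "group": ["a", "a", "b", "b"],
        "value": [2, 4, 1, 5],
    }
)

# Per-call override: disallow large results for this small export and
# accept the default 10 GB result-size limit in exchange for lower latency.
markdown_text = df.to_markdown(allow_large_results=False)

# Leaving the keyword at None keeps whatever the session option says.
values = df["value"].tolist()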
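
The to_csv and to_json docstrings add one caveat worth calling out: the override only affects how query results are materialized and has no effect when the destination is Google Cloud Storage. A brief sketch, using a hypothetical bucket path and passing path_or_buf=None explicitly to match the signature shown above:

import bigframes.pandas as bpd

bpd.options.display.progress_bar = None

df = bpd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# Destination is GCS (illustrative bucket): per the docstring,
# allow_large_results is ignored for this write.
df.to_csv("gs://my-bucket/exports/data-*.csv", index=False)

# No destination: the result comes back as a string, and the override applies.
csv_text = df.to_csv(None, index=False, allow_large_results=False)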
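
Finally, the rank() doctest above only varies the method argument; ascending and na_option are documented but not exercised. A small sketch of those two, with the expected behaviour described in comments rather than asserted output:

import bigframes.pandas as bpd

bpd.options.display.progress_bar = None

df = bpd.DataFrame(
    {
        "group": ["a", "a", "a", "b", "b"],
        "value": [2.0, None, 5.0, 1.0, 4.0],
    }
)
grouped = df.groupby("group")["value"]

# ascending=False ranks from high to low, so the largest value in each
# group receives rank 1.0.
df["desc_rank"] = grouped.rank(method="min", ascending=False)

# na_option="bottom" pushes missing values to the largest ranks instead of
# leaving their rank as NA (the default, na_option="keep").
df["rank_na_bottom"] = grouped.rank(na_option="bottom")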