diff --git a/.kokoro/continuous/doctest.cfg b/.kokoro/continuous/doctest.cfg
index 6016700408..2aad95beed 100644
--- a/.kokoro/continuous/doctest.cfg
+++ b/.kokoro/continuous/doctest.cfg
@@ -3,7 +3,7 @@
 # Only run this nox session.
 env_vars: {
     key: "NOX_SESSION"
-    value: "doctest cleanup"
+    value: "cleanup doctest"
 }
 
 env_vars: {
diff --git a/.kokoro/presubmit/doctest.cfg b/.kokoro/presubmit/doctest.cfg
index 6016700408..2aad95beed 100644
--- a/.kokoro/presubmit/doctest.cfg
+++ b/.kokoro/presubmit/doctest.cfg
@@ -3,7 +3,7 @@
 # Only run this nox session.
 env_vars: {
     key: "NOX_SESSION"
-    value: "doctest cleanup"
+    value: "cleanup doctest"
 }
 
 env_vars: {
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 84dd3f36c1..0393ad944c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,31 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [2.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.5.0...v2.6.0) (2025-06-09)
+
+
+### Features
+
+* Add blob.transcribe function ([#1773](https://github.com/googleapis/python-bigquery-dataframes/issues/1773)) ([86159a7](https://github.com/googleapis/python-bigquery-dataframes/commit/86159a7d24102574c26764a056478757844e2eca))
+* Implement ai.classify() ([#1781](https://github.com/googleapis/python-bigquery-dataframes/issues/1781)) ([8af26d0](https://github.com/googleapis/python-bigquery-dataframes/commit/8af26d07cf3e8b22e0c69dd0172352fadc1857d8))
+* Implement item() for Series and Index ([#1792](https://github.com/googleapis/python-bigquery-dataframes/issues/1792)) ([d2154c8](https://github.com/googleapis/python-bigquery-dataframes/commit/d2154c82fa0fed6e89c47db747d3c9cd57f9c618))
+* Implement ST_ISCLOSED geography function ([#1789](https://github.com/googleapis/python-bigquery-dataframes/issues/1789)) ([36bc179](https://github.com/googleapis/python-bigquery-dataframes/commit/36bc179ee7ef9b0b6799f98f8fac3f64d91412af))
+* Implement ST_LENGTH geography function ([#1791](https://github.com/googleapis/python-bigquery-dataframes/issues/1791)) ([c5b7fda](https://github.com/googleapis/python-bigquery-dataframes/commit/c5b7fdae74a22e581f7705bc0cf5390e928f4425))
+* Support isin with bigframes.pandas.Index arg ([#1779](https://github.com/googleapis/python-bigquery-dataframes/issues/1779)) ([e480d29](https://github.com/googleapis/python-bigquery-dataframes/commit/e480d29f03636fa9824404ef90c510701e510195))
+
+
+### Bug Fixes
+
+* Address `read_csv` with both `index_col` and `use_cols` behavior inconsistency with pandas ([#1785](https://github.com/googleapis/python-bigquery-dataframes/issues/1785)) ([ba7c313](https://github.com/googleapis/python-bigquery-dataframes/commit/ba7c313c8d308e3ff3f736b60978cb7a51715209))
+* Allow KMeans model init parameter as k-means++ alias ([#1790](https://github.com/googleapis/python-bigquery-dataframes/issues/1790)) ([0b59cf1](https://github.com/googleapis/python-bigquery-dataframes/commit/0b59cf1008613770fa1433c6da395e755c86fe22))
+* Replace function now can handle bpd.NA value. ([#1786](https://github.com/googleapis/python-bigquery-dataframes/issues/1786)) ([7269512](https://github.com/googleapis/python-bigquery-dataframes/commit/7269512a28eb42029447d5380c764353278a74e1))
+
+
+### Documentation
+
+* Adjust strip method examples to match latest pandas ([#1797](https://github.com/googleapis/python-bigquery-dataframes/issues/1797)) ([817b0c0](https://github.com/googleapis/python-bigquery-dataframes/commit/817b0c0c5dc481598fbfdbe40fd925fb38f3a066))
+* Fix docstrings to improve html rendering of code examples ([#1788](https://github.com/googleapis/python-bigquery-dataframes/issues/1788)) ([38d9b73](https://github.com/googleapis/python-bigquery-dataframes/commit/38d9b7376697f8e19124e5d1f5fccda82d920b92))
+
 ## [2.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.4.0...v2.5.0) (2025-05-30)
diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 301207bb31..22bcfb1407 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -32,6 +32,8 @@
     st_difference,
     st_distance,
     st_intersection,
+    st_isclosed,
+    st_length,
 )
 from bigframes.bigquery._operations.json import (
     json_extract,
@@ -58,6 +60,8 @@
     "st_difference",
     "st_distance",
     "st_intersection",
+    "st_isclosed",
+    "st_length",
     # json ops
     "json_extract",
     "json_extract_array",
diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py
index fc9bd1a653..bdc85eed9f 100644
--- a/bigframes/bigquery/_operations/geo.py
+++ b/bigframes/bigquery/_operations/geo.py
@@ -380,3 +380,126 @@ def st_intersection(
             each aligned geometry with other.
     """
     return series._apply_binary_op(other, ops.geo_st_intersection_op)
+
+
+def st_isclosed(
+    series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
+) -> bigframes.series.Series:
+    """
+    Returns TRUE for a non-empty Geography, where each element in the
+    Geography has an empty boundary.
+
+    .. note::
+        BigQuery's Geography functions, like `st_isclosed`, interpret the geometry
+        data type as a point set on the Earth's surface. A point set is a set
+        of points, lines, and polygons on the WGS84 reference spheroid, with
+        geodesic edges. See: https://cloud.google.com/bigquery/docs/geospatial-data
+
+    **Examples:**
+
+        >>> import bigframes.geopandas
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+
+        >>> from shapely.geometry import Point, LineString, Polygon
+        >>> bpd.options.display.progress_bar = None
+
+        >>> series = bigframes.geopandas.GeoSeries(
+        ...     [
+        ...         Point(0, 0),  # Point
+        ...         LineString([(0, 0), (1, 1)]),  # Open LineString
+        ...         LineString([(0, 0), (1, 1), (0, 1), (0, 0)]),  # Closed LineString
+        ...         Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]),
+        ...         None,
+        ...     ]
+        ... )
+        >>> series
+        0                            POINT (0 0)
+        1                  LINESTRING (0 0, 1 1)
+        2        LINESTRING (0 0, 1 1, 0 1, 0 0)
+        3         POLYGON ((0 0, 1 1, 0 1, 0 0))
+        4                                   None
+        dtype: geometry
+
+        >>> bbq.st_isclosed(series)
+        0     True
+        1    False
+        2     True
+        3    False
+        4     <NA>
+        dtype: boolean
+
+    Args:
+        series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries):
+            A series containing geography objects.
+
+    Returns:
+        bigframes.pandas.Series:
+            Series of booleans indicating whether each geometry is closed.
+    """
+    series = series._apply_unary_op(ops.geo_st_isclosed_op)
+    series.name = None
+    return series
+
+
+def st_length(
+    series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
+    *,
+    use_spheroid: bool = False,
+) -> bigframes.series.Series:
+    """Returns the total length in meters of the lines in the input GEOGRAPHY.
+
+    If a series element is a point or a polygon, returns zero for that row.
+    If a series element is a collection, returns the length of the lines
+    in the collection; if the collection doesn't contain lines, returns
+    zero.
+
+    The optional use_spheroid parameter determines how this function
+    measures distance. If use_spheroid is FALSE, the function measures
+    distance on the surface of a perfect sphere.
+
+    The use_spheroid parameter currently only supports the value FALSE. The
+    default value of use_spheroid is FALSE. See:
+    https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_length
+
+    **Examples:**
+
+        >>> import bigframes.geopandas
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+
+        >>> from shapely.geometry import Polygon, LineString, Point, GeometryCollection
+        >>> bpd.options.display.progress_bar = None
+
+        >>> series = bigframes.geopandas.GeoSeries(
+        ...     [
+        ...         LineString([(0, 0), (1, 0)]),  # Length will be approx 1 degree in meters
+        ...         Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]),  # Length is 0
+        ...         Point(0, 1),  # Length is 0
+        ...         GeometryCollection([LineString([(0,0),(0,1)]), Point(1,1)])  # Length of LineString only
+        ...     ]
+        ... )
+
+        >>> result = bbq.st_length(series)
+        >>> result
+        0    111195.101177
+        1              0.0
+        2              0.0
+        3    111195.101177
+        dtype: Float64
+
+    Args:
+        series (bigframes.series.Series | bigframes.geopandas.GeoSeries):
+            A series containing geography objects.
+        use_spheroid (bool, optional):
+            Determines how this function measures distance.
+            If FALSE (default), measures distance on a perfect sphere.
+            Currently, only FALSE is supported.
+
+    Returns:
+        bigframes.series.Series:
+            Series of floats representing the lengths in meters.
+    """
+    series = series._apply_unary_op(ops.GeoStLengthOp(use_spheroid=use_spheroid))
+    series.name = None
+    return series
diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
index f8fdb21946..51c030a23b 100644
--- a/bigframes/blob/_functions.py
+++ b/bigframes/blob/_functions.py
@@ -95,6 +95,10 @@ def _create_udf(self):
             sql,
             job_config=bigquery.QueryJobConfig(),
             metrics=self._session._metrics,
+            location=None,
+            project=None,
+            timeout=None,
+            query_with_job=True,
         )
 
         return udf_name
diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py
index 20773fd1b4..a6c700a485 100644
--- a/bigframes/core/array_value.py
+++ b/bigframes/core/array_value.py
@@ -34,7 +34,6 @@
 import bigframes.core.ordering as orderings
 import bigframes.core.schema as schemata
 import bigframes.core.tree_properties
-import bigframes.core.utils
 from bigframes.core.window_spec import WindowSpec
 import bigframes.dtypes
 import bigframes.exceptions as bfe
diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py
index fb5399b7cb..451783602d 100644
--- a/bigframes/core/compile/compiler.py
+++ b/bigframes/core/compile/compiler.py
@@ -22,10 +22,9 @@
 import bigframes_vendored.ibis.expr.api as ibis_api
 import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
 import bigframes_vendored.ibis.expr.types as ibis_types
-import pyarrow as pa
 
 from bigframes import dtypes, operations
-from bigframes.core import expression
+from bigframes.core import expression, pyarrow_utils
 import bigframes.core.compile.compiled as compiled
 import bigframes.core.compile.concat as concat_impl
 import bigframes.core.compile.configs as configs
@@ -172,9 +171,7 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args):
     pa_table = pa_table.rename_columns([item.id.sql for item in node.scan_list.items])
 
     if offsets:
-        pa_table = pa_table.append_column(
-            offsets, pa.array(range(pa_table.num_rows), type=pa.int64())
-        )
+        pa_table = pyarrow_utils.append_offsets(pa_table, offsets)
     return compiled.UnorderedIR.from_polars(pa_table, bq_schema)
diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py
index 14d8e8501c..a0e85d8c69 100644
--- a/bigframes/core/compile/polars/compiler.py
+++ b/bigframes/core/compile/polars/compiler.py
@@ -16,14 +16,17 @@
 import dataclasses
 import functools
 import itertools
-from typing import cast, Optional, Sequence, Tuple, TYPE_CHECKING
+import operator
+from typing import cast, Literal, Optional, Sequence, Tuple, TYPE_CHECKING
+
+import pandas as pd
 
 import bigframes.core
-from bigframes.core import window_spec
+from bigframes.core import identifiers, nodes, ordering, window_spec
 import bigframes.core.expression as ex
 import bigframes.core.guid as guid
-import bigframes.core.nodes as nodes
 import bigframes.core.rewrite
+import bigframes.dtypes
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
@@ -37,6 +40,45 @@
     polars_installed = False
 
 if polars_installed:
+    _DTYPE_MAPPING = {
+        # Direct mappings
+        bigframes.dtypes.INT_DTYPE: pl.Int64(),
+        bigframes.dtypes.FLOAT_DTYPE: pl.Float64(),
+        bigframes.dtypes.BOOL_DTYPE: pl.Boolean(),
+        bigframes.dtypes.STRING_DTYPE: pl.String(),
+        bigframes.dtypes.NUMERIC_DTYPE: pl.Decimal(38, 9),
+        bigframes.dtypes.BIGNUMERIC_DTYPE: pl.Decimal(76, 38),
+        bigframes.dtypes.BYTES_DTYPE: pl.Binary(),
+        bigframes.dtypes.DATE_DTYPE: pl.Date(),
+        bigframes.dtypes.DATETIME_DTYPE: pl.Datetime(time_zone=None),
+        bigframes.dtypes.TIMESTAMP_DTYPE: pl.Datetime(time_zone="UTC"),
+        bigframes.dtypes.TIME_DTYPE: pl.Time(),
+        bigframes.dtypes.TIMEDELTA_DTYPE: pl.Duration(),
+        # Indirect mappings
+        bigframes.dtypes.GEO_DTYPE: pl.String(),
+        bigframes.dtypes.JSON_DTYPE: pl.String(),
+    }
+
+    def _bigframes_dtype_to_polars_dtype(
+        dtype: bigframes.dtypes.ExpressionType,
+    ) -> pl.DataType:
+        if dtype is None:
+            return pl.Null()
+        if bigframes.dtypes.is_struct_like(dtype):
+            return pl.Struct(
+                [
+                    pl.Field(name, _bigframes_dtype_to_polars_dtype(type))
+                    for name, type in bigframes.dtypes.get_struct_fields(dtype).items()
+                ]
+            )
+        if bigframes.dtypes.is_array_like(dtype):
+            return pl.Array(
+                inner=_bigframes_dtype_to_polars_dtype(
+                    bigframes.dtypes.get_array_inner_type(dtype)
+                )
+            )
+        else:
+            return _DTYPE_MAPPING[dtype]
 
     @dataclasses.dataclass(frozen=True)
     class PolarsExpressionCompiler:
@@ -47,33 +89,45 @@ class PolarsExpressionCompiler:
         """
 
         @functools.singledispatchmethod
-        def compile_expression(self, expression: ex.Expression):
+        def compile_expression(self, expression: ex.Expression) -> pl.Expr:
             raise NotImplementedError(f"Cannot compile expression: {expression}")
 
         @compile_expression.register
         def _(
             self,
             expression: ex.ScalarConstantExpression,
-        ):
-            return pl.lit(expression.value)
+        ) -> pl.Expr:
+            value = expression.value
+            if not isinstance(value, float) and pd.isna(value):  # type: ignore
+                value = None
+            if expression.dtype is None:
+                return pl.lit(None)
+            return pl.lit(value, _bigframes_dtype_to_polars_dtype(expression.dtype))
 
         @compile_expression.register
         def _(
             self,
             expression: ex.DerefOp,
-        ):
+        ) -> pl.Expr:
             return pl.col(expression.id.sql)
 
+        @compile_expression.register
+        def _(
+            self,
+            expression: ex.SchemaFieldRefExpression,
+        ) -> pl.Expr:
+            return pl.col(expression.field.id.sql)
+
         @compile_expression.register
         def _(
             self,
             expression: ex.OpExpression,
-        ):
+        ) -> pl.Expr:
             # TODO: Complete the implementation, convert to hash dispatch
             op = expression.op
             args = tuple(map(self.compile_expression, expression.inputs))
             if isinstance(op, ops.invert_op.__class__):
-                return args[0].neg()
+                return ~args[0]
             if isinstance(op, ops.and_op.__class__):
                 return args[0] & args[1]
             if isinstance(op, ops.or_op.__class__):
@@ -82,6 +136,21 @@ def _(
                 return args[0] + args[1]
             if isinstance(op, ops.sub_op.__class__):
                 return args[0] - args[1]
+            if isinstance(op, ops.mul_op.__class__):
+                return args[0] * args[1]
+            if isinstance(op, ops.div_op.__class__):
+                return args[0] / args[1]
+            if isinstance(op, ops.floordiv_op.__class__):
+                # TODO: Handle int // 0
+                return args[0] // args[1]
+            if isinstance(op, (ops.pow_op.__class__, ops.unsafe_pow_op.__class__)):
+                return args[0] ** args[1]
+            if isinstance(op, ops.abs_op.__class__):
+                return args[0].abs()
+            if isinstance(op, ops.neg_op.__class__):
+                return args[0].neg()
+            if isinstance(op, ops.pos_op.__class__):
+                return args[0]
             if isinstance(op, ops.ge_op.__class__):
                 return args[0] >= args[1]
             if isinstance(op, ops.gt_op.__class__):
@@ -91,23 +160,48 @@ def _(
             if isinstance(op, ops.lt_op.__class__):
                 return args[0] < args[1]
             if isinstance(op, ops.eq_op.__class__):
-                return args[0] == args[1]
+                return args[0].eq(args[1])
+            if isinstance(op, ops.eq_null_match_op.__class__):
+                return args[0].eq_missing(args[1])
             if isinstance(op, ops.ne_op.__class__):
-                return args[0] != args[1]
+                return args[0].ne(args[1])
+            if isinstance(op, ops.IsInOp):
+                # TODO: Filter out types that can't be coerced to right type
+                if op.match_nulls or not any(map(pd.isna, op.values)):
+                    # newer polars versions have nulls_equal arg
+                    return args[0].is_in(op.values)
+                else:
+                    return args[0].is_in(op.values) | args[0].is_null()
             if isinstance(op, ops.mod_op.__class__):
                 return args[0] % args[1]
             if isinstance(op, ops.coalesce_op.__class__):
                 return pl.coalesce(*args)
+            if isinstance(op, ops.fillna_op.__class__):
+                return pl.coalesce(*args)
+            if isinstance(op, ops.isnull_op.__class__):
+                return args[0].is_null()
+            if isinstance(op, ops.notnull_op.__class__):
+                return args[0].is_not_null()
             if isinstance(op, ops.CaseWhenOp):
                 expr = pl.when(args[0]).then(args[1])
                 for pred, result in zip(args[2::2], args[3::2]):
-                    return expr.when(pred).then(result)
+                    expr = expr.when(pred).then(result)  # type: ignore
                 return expr
             if isinstance(op, ops.where_op.__class__):
                 original, condition, otherwise = args
                 return pl.when(condition).then(original).otherwise(otherwise)
+            if isinstance(op, ops.AsTypeOp):
+                return self.astype(args[0], op.to_type, safe=op.safe)
+
             raise NotImplementedError(f"Polars compiler hasn't implemented {op}")
 
+        def astype(
+            self, col: pl.Expr, dtype: bigframes.dtypes.Dtype, safe: bool
+        ) -> pl.Expr:
+            # TODO: Polars casting works differently, need to lower instead to specific conversion ops.
+            # eg. We want "True" instead of "true" for bool to string.
+            return col.cast(_DTYPE_MAPPING[dtype], strict=not safe)
+
     @dataclasses.dataclass(frozen=True)
     class PolarsAggregateCompiler:
         scalar_compiler = PolarsExpressionCompiler()
@@ -149,12 +243,26 @@ def compile_agg_expr(self, expr: ex.Aggregation):
 
             return self.compile_agg_op(expr.op, inputs)
 
-        def compile_agg_op(self, op: agg_ops.WindowOp, inputs: Sequence[str] = []):
+        def compile_agg_op(
+            self, op: agg_ops.WindowOp, inputs: Sequence[str] = []
+        ) -> pl.Expr:
             if isinstance(op, agg_ops.ProductOp):
-                # TODO: Need schema to cast back to original type if posisble (eg float back to int)
-                return pl.col(*inputs).log().sum().exp()
+                # TODO: Fix datatype inconsistency with float/int
+                return pl.col(*inputs).product()
             if isinstance(op, agg_ops.SumOp):
                 return pl.sum(*inputs)
+            if isinstance(op, (agg_ops.SizeOp, agg_ops.SizeUnaryOp)):
+                return pl.len()
+            if isinstance(op, agg_ops.MeanOp):
+                return pl.mean(*inputs)
+            if isinstance(op, agg_ops.MedianOp):
+                return pl.median(*inputs)
+            if isinstance(op, agg_ops.AllOp):
+                return pl.all(*inputs)
+            if isinstance(op, agg_ops.AnyOp):
+                return pl.any(*inputs)  # type: ignore
+            if isinstance(op, agg_ops.NuniqueOp):
+                return pl.col(*inputs).drop_nulls().n_unique()
             if isinstance(op, agg_ops.MinOp):
                 return pl.min(*inputs)
             if isinstance(op, agg_ops.MaxOp):
@@ -162,7 +270,35 @@ def compile_agg_op(self, op: agg_ops.WindowOp, inputs: Sequence[str] = []):
             if isinstance(op, agg_ops.CountOp):
                 return pl.count(*inputs)
             if isinstance(op, agg_ops.CorrOp):
-                return pl.corr(*inputs)
+                return pl.corr(
+                    pl.col(inputs[0]).fill_nan(None), pl.col(inputs[1]).fill_nan(None)
+                )
+            if isinstance(op, agg_ops.CovOp):
+                return pl.cov(
+                    pl.col(inputs[0]).fill_nan(None), pl.col(inputs[1]).fill_nan(None)
+                )
+            if isinstance(op, agg_ops.StdOp):
+                return pl.std(inputs[0])
+            if isinstance(op, agg_ops.VarOp):
+                return pl.var(inputs[0])
+            if isinstance(op, agg_ops.PopVarOp):
+                return pl.var(inputs[0], ddof=0)
+            if isinstance(op, agg_ops.FirstNonNullOp):
+                return pl.col(*inputs).drop_nulls().first()
+            if isinstance(op, agg_ops.LastNonNullOp):
+                return pl.col(*inputs).drop_nulls().last()
+            if isinstance(op, agg_ops.FirstOp):
+                return pl.col(*inputs).first()
+            if isinstance(op, agg_ops.LastOp):
+                return pl.col(*inputs).last()
+            if isinstance(op, agg_ops.ShiftOp):
+                return pl.col(*inputs).shift(op.periods)
+            if isinstance(op, agg_ops.DiffOp):
+                return pl.col(*inputs) - pl.col(*inputs).shift(op.periods)
+            if isinstance(op, agg_ops.AnyValueOp):
+                return pl.max(
+                    *inputs
+                )  # probably something faster? maybe just get first item?
             raise NotImplementedError(
                 f"Aggregate op {op} not yet supported in polars engine."
             )
@@ -197,11 +333,14 @@ def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame:
 
         # TODO: Create standard way to configure BFET -> BFET rewrites
         # Polars has incomplete slice support in lazy mode
-        node = nodes.bottom_up(array_value.node, bigframes.core.rewrite.rewrite_slice)
+        node = array_value.node
+        node = bigframes.core.rewrite.column_pruning(node)
+        node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice)
+        node = bigframes.core.rewrite.pull_out_window_order(node)
        return self.compile_node(node)
 
     @functools.singledispatchmethod
-    def compile_node(self, node: nodes.BigFrameNode):
+    def compile_node(self, node: nodes.BigFrameNode) -> pl.LazyFrame:
         """Defines transformation but isn't cached, always use compile_node instead"""
         raise ValueError(f"Can't compile unrecognized node: {node}")
@@ -213,7 +352,12 @@ def compile_readlocal(self, node: nodes.ReadLocalNode):
         lazy_frame = cast(
             pl.DataFrame, pl.from_arrow(node.local_data_source.data)
         ).lazy()
-        return lazy_frame.select(cols_to_read.keys()).rename(cols_to_read)
+        lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read)
+        if node.offsets_col:
+            lazy_frame = lazy_frame.with_columns(
+                [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.offsets_col.sql)]
+            )
+        return lazy_frame
 
     @compile_node.register
     def compile_filter(self, node: nodes.FilterNode):
@@ -227,17 +371,18 @@ def compile_orderby(self, node: nodes.OrderByNode):
         if len(node.by) == 0:  # pragma: no cover
             return frame
-
-        frame = frame.sort(
-            [
-                self.expr_compiler.compile_expression(by.scalar_expression)
-                for by in node.by
-            ],
-            descending=[not by.direction.is_ascending for by in node.by],
-            nulls_last=[by.na_last for by in node.by],
+        return self._sort(frame, node.by)
+
+    def _sort(
+        self, frame: pl.LazyFrame, by: Sequence[ordering.OrderingExpression]
+    ) -> pl.LazyFrame:
+        sorted = frame.sort(
+            [self.expr_compiler.compile_expression(by.scalar_expression) for by in by],
+            descending=[not by.direction.is_ascending for by in by],
+            nulls_last=[by.na_last for by in by],
             maintain_order=True,
         )
-        return frame
+        return sorted
 
     @compile_node.register
     def compile_reversed(self, node: nodes.ReversedNode):
@@ -251,10 +396,15 @@ def compile_selection(self, node: nodes.SelectionNode):
 
     @compile_node.register
     def compile_projection(self, node: nodes.ProjectionNode):
-        new_cols = [
-            self.expr_compiler.compile_expression(ex).alias(name.sql)
-            for ex, name in node.assignments
-        ]
+        new_cols = []
+        for proj_expr, name in node.assignments:
+            bound_expr = ex.bind_schema_fields(proj_expr, node.child.field_by_id)
+            new_col = self.expr_compiler.compile_expression(bound_expr).alias(name.sql)
+            if bound_expr.output_type is None:
+                new_col = new_col.cast(
+                    _bigframes_dtype_to_polars_dtype(bigframes.dtypes.DEFAULT_DTYPE)
+                )
+            new_cols.append(new_col)
         return self.compile_node(node.child).with_columns(new_cols)
 
     @compile_node.register
@@ -265,37 +415,91 @@ def compile_offsets(self, node: nodes.PromoteOffsetsNode):
 
     @compile_node.register
     def compile_join(self, node: nodes.JoinNode):
-        # Always totally order this, as adding offsets is relatively cheap for in-memory columnar data
-        left = self.compile_node(node.left_child).with_columns(
+        left = self.compile_node(node.left_child)
+        right = self.compile_node(node.right_child)
+        left_on = [l_name.id.sql for l_name, _ in node.conditions]
+        right_on = [r_name.id.sql for _, r_name in node.conditions]
+        if node.type == "right":
+            return self._ordered_join(
+                right, left, "left", right_on, left_on, node.joins_nulls
+            ).select([id.sql for id in node.ids])
+        return self._ordered_join(
+            left, right, node.type, left_on, right_on, node.joins_nulls
+        )
+
+    def _ordered_join(
+        self,
+        left_frame: pl.LazyFrame,
+        right_frame: pl.LazyFrame,
+        how: Literal["inner", "outer", "left", "cross"],
+        left_on: Sequence[str],
+        right_on: Sequence[str],
+        join_nulls: bool,
+    ):
+        if how == "right":
+            # seems to cause seg faults as of v1.30 for no apparent reason
+            raise ValueError("right join not supported")
+        left = left_frame.with_columns(
             [
                 pl.int_range(pl.len()).alias("_bf_join_l"),
             ]
         )
-        right = self.compile_node(node.right_child).with_columns(
+        right = right_frame.with_columns(
            [
                 pl.int_range(pl.len()).alias("_bf_join_r"),
             ]
         )
-        if node.type != "cross":
-            left_on = [l_name.id.sql for l_name, _ in node.conditions]
-            right_on = [r_name.id.sql for _, r_name in node.conditions]
+        if how != "cross":
             joined = left.join(
-                right, how=node.type, left_on=left_on, right_on=right_on, coalesce=False
+                right,
+                how=how,
+                left_on=left_on,
+                right_on=right_on,
+                # Note: join_nulls renamed to nulls_equal for polars 1.24
+                join_nulls=join_nulls,  # type: ignore
+                coalesce=False,
             )
         else:
-            joined = left.join(right, how=node.type)
-        return joined.sort(["_bf_join_l", "_bf_join_r"]).drop(
+            joined = left.join(right, how=how, coalesce=False)
+
+        join_order = (
+            ["_bf_join_l", "_bf_join_r"]
+            if how != "right"
+            else ["_bf_join_r", "_bf_join_l"]
+        )
+        return joined.sort(join_order, nulls_last=True).drop(
             ["_bf_join_l", "_bf_join_r"]
         )
 
     @compile_node.register
     def compile_concat(self, node: nodes.ConcatNode):
-        return pl.concat(self.compile_node(child) for child in node.child_nodes)
+        child_frames = [self.compile_node(child) for child in node.child_nodes]
+        child_frames = [
+            frame.rename(
+                {col: id.sql for col, id in zip(frame.columns, node.output_ids)}
+            )
+            for frame in child_frames
+        ]
+        df = pl.concat(child_frames)
+        return df
 
     @compile_node.register
     def compile_agg(self, node: nodes.AggregateNode):
         df = self.compile_node(node.child)
-
+        if node.dropna and len(node.by_column_ids) > 0:
+            df = df.filter(
+                [pl.col(ref.id.sql).is_not_null() for ref in node.by_column_ids]
+            )
+        if node.order_by:
+            df = self._sort(df, node.order_by)
+        return self._aggregate(df, node.aggregations, node.by_column_ids)
+
+    def _aggregate(
+        self,
+        df: pl.LazyFrame,
+        aggregations: Sequence[Tuple[ex.Aggregation, identifiers.ColumnId]],
+        grouping_keys: Tuple[ex.DerefOp, ...],
+    ) -> pl.LazyFrame:
         # Need to materialize columns to broadcast constants
         agg_inputs = [
             list(
@@ -304,7 +508,7 @@ def compile_agg(self, node: nodes.AggregateNode):
                     self.agg_compiler.get_args(agg),
                 )
             )
-            for agg, _ in node.aggregations
+            for agg, _ in aggregations
         ]
 
         df_agg_inputs = df.with_columns(itertools.chain(*agg_inputs))
@@ -313,18 +517,19 @@ def compile_agg(self, node: nodes.AggregateNode):
                 self.agg_compiler.compile_agg_op(
                     agg.op, list(map(lambda x: x.meta.output_name(), inputs))
                 ).alias(id.sql)
-                for (agg, id), inputs in zip(node.aggregations, agg_inputs)
+                for (agg, id), inputs in zip(aggregations, agg_inputs)
             ]
 
-        if len(node.by_column_ids) > 0:
-            group_exprs = [pl.col(ref.id.sql) for ref in node.by_column_ids]
+        if len(grouping_keys) > 0:
+            group_exprs = [pl.col(ref.id.sql) for ref in grouping_keys]
             grouped_df = df_agg_inputs.group_by(group_exprs)
-            return grouped_df.agg(agg_exprs).sort(group_exprs)
+            return grouped_df.agg(agg_exprs).sort(group_exprs, nulls_last=True)
         else:
             return df_agg_inputs.select(agg_exprs)
 
     @compile_node.register
     def compile_explode(self, node: nodes.ExplodeNode):
+        assert node.offsets_col is None
         df = self.compile_node(node.child)
         cols = [pl.col(col.id.sql) for col in node.column_ids]
         return df.explode(cols)
@@ -338,55 +543,92 @@ def compile_sample(self, node: nodes.RandomSampleNode):
     @compile_node.register
     def compile_window(self, node: nodes.WindowOpNode):
         df = self.compile_node(node.child)
-        agg_expr = self.agg_compiler.compile_agg_expr(node.expression).alias(
-            node.output_name.sql
-        )
-        # Three window types: completely unbound, grouped and row bounded
         window = node.window_spec
-
+        # Should have been handled by rewriter
+        assert len(window.ordering) == 0
         if window.min_periods > 0:
             raise NotImplementedError("min_period not yet supported for polars engine")
 
-        if window.bounds is None:
+        if (window.bounds is None) or (window.is_unbounded):
             # polars will automatically broadcast the aggregate to the matching input rows
-            if len(window.grouping_keys) == 0:  # unbound window
-                pass
-            else:  # partition-only window
-                agg_expr = agg_expr.over(
-                    partition_by=[ref.id.sql for ref in window.grouping_keys]
-                )
-            return df.with_columns([agg_expr])
-
+            agg_pl = self.agg_compiler.compile_agg_expr(node.expression)
+            if window.grouping_keys:
+                agg_pl = agg_pl.over(id.id.sql for id in window.grouping_keys)
+            result = df.with_columns(agg_pl.alias(node.output_name.sql))
         else:  # row-bounded window
-            assert isinstance(window.bounds, window_spec.RowsWindowBounds)
-            # Polars API semi-bounded, and any grouped rolling window challenging
-            # https://github.com/pola-rs/polars/issues/4799
-            # https://github.com/pola-rs/polars/issues/8976
-            index_col_name = "_bf_pl_engine_offsets"
-            indexed_df = df.with_row_index(index_col_name)
-            if len(window.grouping_keys) == 0:  # rolling-only window
-                # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html
-                offset_n = window.bounds.start
-                period_n = _get_period(window.bounds) or df.collect().height
-                results = indexed_df.rolling(
-                    index_column=index_col_name,
-                    period=f"{period_n}i",
-                    offset=f"{offset_n}i" if offset_n else None,
-                ).agg(agg_expr)
-            else:  # groupby-rolling window
-                raise NotImplementedError(
-                    "Groupby rolling windows not yet implemented in polars engine"
-                )
-            # polars is columnar, so this is efficient
-            # TODO: why can't just add columns?
-            return pl.concat([df, results], how="horizontal")
+            window_result = self._calc_row_analytic_func(
+                df, node.expression, node.window_spec, node.output_name.sql
+            )
+            result = pl.concat([df, window_result], how="horizontal")
+
+        # Probably easier just to pull this out as a rewriter
+        if (
+            node.expression.op.skips_nulls
+            and not node.never_skip_nulls
+            and node.expression.column_references
+        ):
+            nullity_expr = functools.reduce(
+                operator.or_,
+                (
+                    pl.col(column.sql).is_null()
+                    for column in node.expression.column_references
+                ),
+            )
+            result = result.with_columns(
+                pl.when(nullity_expr)
+                .then(None)
+                .otherwise(pl.col(node.output_name.sql))
+                .alias(node.output_name.sql)
+            )
+        return result
+
+    def _calc_row_analytic_func(
+        self,
+        frame: pl.LazyFrame,
+        agg_expr: ex.Aggregation,
+        window: window_spec.WindowSpec,
+        name: str,
+    ) -> pl.LazyFrame:
+        if not isinstance(window.bounds, window_spec.RowsWindowBounds):
+            raise NotImplementedError("Only row bounds supported by polars engine")
+        groupby = None
+        if len(window.grouping_keys) > 0:
+            groupby = [
+                self.expr_compiler.compile_expression(ref)
+                for ref in window.grouping_keys
+            ]
+
+        # Polars API semi-bounded, and any grouped rolling window challenging
+        # https://github.com/pola-rs/polars/issues/4799
+        # https://github.com/pola-rs/polars/issues/8976
+        pl_agg_expr = self.agg_compiler.compile_agg_expr(agg_expr).alias(name)
+        index_col_name = "_bf_pl_engine_offsets"
+        indexed_df = frame.with_row_index(index_col_name)
+        # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html
+        period_n, offset_n = _get_period_and_offset(window.bounds)
+        return (
+            indexed_df.rolling(
+                index_column=index_col_name,
+                period=f"{period_n}i",
+                offset=f"{offset_n}i" if (offset_n is not None) else None,
+                group_by=groupby,
+            )
+            .agg(pl_agg_expr)
+            .select(name)
+        )
 
 
-def _get_period(bounds: window_spec.RowsWindowBounds) -> Optional[int]:
-    """Returns None if the boundary is infinite."""
-    if bounds.start is None or bounds.end is None:
-        return None
+def _get_period_and_offset(
+    bounds: window_spec.RowsWindowBounds,
+) -> tuple[int, Optional[int]]:
+    # fixed size window
+    if (bounds.start is not None) and (bounds.end is not None):
+        return ((bounds.end - bounds.start + 1), bounds.start - 1)
 
-    # collecting height is a massive kludge
-    return bounds.end - bounds.start + 1
+    LARGE_N = 1000000000
+    if bounds.start is not None:
+        return (LARGE_N, bounds.start - 1)
+    if bounds.end is not None:
+        return (LARGE_N, None)
+    raise ValueError("Not a bounded window")
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index 7707f16dad..a1fc995159 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -30,7 +30,6 @@
 import bigframes.core.compile.default_ordering
 import bigframes.core.compile.ibis_types
 import bigframes.core.expression as ex
-import bigframes.dtypes
 import bigframes.operations as ops
 
 _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0))
@@ -1074,11 +1073,22 @@ def geo_st_intersection_op_impl(x: ibis_types.Value, y: ibis_types.Value):
     )
 
 
+@scalar_op_compiler.register_unary_op(ops.geo_st_isclosed_op, pass_op=False)
+def geo_st_isclosed_op_impl(x: ibis_types.Value):
+    return st_isclosed(x)
+
+
 @scalar_op_compiler.register_unary_op(ops.geo_x_op)
 def geo_x_op_impl(x: ibis_types.Value):
     return typing.cast(ibis_types.GeoSpatialValue, x).x()
 
 
+@scalar_op_compiler.register_unary_op(ops.GeoStLengthOp, pass_op=True)
+def geo_length_op_impl(x: ibis_types.Value, op: ops.GeoStLengthOp):
+    # Call the st_length UDF defined in this file (or imported)
+    return st_length(x, op.use_spheroid)
+
+
 @scalar_op_compiler.register_unary_op(ops.geo_y_op)
 def geo_y_op_impl(x: ibis_types.Value):
     return typing.cast(ibis_types.GeoSpatialValue, x).y()
@@ -2057,6 +2067,12 @@ def st_distance(a: ibis_dtypes.geography, b: ibis_dtypes.geography, use_spheroid
     """Convert string to geography."""
 
 
+@ibis_udf.scalar.builtin
+def st_length(geog: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float:  # type: ignore
+    """ST_LENGTH BQ builtin. This body is never executed."""
+    pass
+
+
 @ibis_udf.scalar.builtin
 def unix_micros(a: ibis_dtypes.timestamp) -> int:  # type: ignore
     """Convert a timestamp to microseconds"""
@@ -2180,6 +2196,11 @@ def str_lstrip_op(  # type: ignore[empty-body]
     """Remove leading and trailing characters."""
 
 
+@ibis_udf.scalar.builtin
+def st_isclosed(a: ibis_dtypes.geography) -> ibis_dtypes.boolean:  # type: ignore
+    """Checks if a geography is closed."""
+
+
 @ibis_udf.scalar.builtin(name="rtrim")
 def str_rstrip_op(  # type: ignore[empty-body]
     x: ibis_dtypes.String, to_strip: ibis_dtypes.String
diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py
index 1cb270297c..50169d1a8b 100644
--- a/bigframes/core/compile/sqlglot/compiler.py
+++ b/bigframes/core/compile/sqlglot/compiler.py
@@ -18,10 +18,9 @@
 import typing
 
 from google.cloud import bigquery
-import pyarrow as pa
 import sqlglot.expressions as sge
 
-from bigframes.core import expression, guid, identifiers, nodes, rewrite
+from bigframes.core import expression, guid, identifiers, nodes, pyarrow_utils, rewrite
 from bigframes.core.compile import configs
 import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
 import bigframes.core.compile.sqlglot.sqlglot_ir as ir
@@ -155,9 +154,7 @@ def compile_readlocal(self, node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR:
         offsets = node.offsets_col.sql if node.offsets_col else None
         if offsets:
-            pa_table = pa_table.append_column(
-                offsets, pa.array(range(pa_table.num_rows), type=pa.int64())
-            )
+            pa_table = pyarrow_utils.append_offsets(pa_table, offsets)
 
         return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema, uid_gen=self.uid_gen)
diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py
index d4d70f5a06..8732b55990 100644
--- a/bigframes/core/global_session.py
+++ b/bigframes/core/global_session.py
@@ -112,3 +112,23 @@ def get_global_session():
 
 def with_default_session(func: Callable[..., _T], *args, **kwargs) -> _T:
     return func(get_global_session(), *args, **kwargs)
+
+
+class _GlobalSessionContext:
+    """
+    Context manager for testing that sets global session.
+    """
+
+    def __init__(self, session: bigframes.session.Session):
+        self._session = session
+
+    def __enter__(self):
+        global _global_session, _global_session_lock
+        with _global_session_lock:
+            self._previous_session = _global_session
+            _global_session = self._session
+
+    def __exit__(self, *exc_details):
+        global _global_session, _global_session_lock
+        with _global_session_lock:
+            _global_session = self._previous_session
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index 44b1d9d4fa..836d84b46a 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -504,6 +504,10 @@ def unique(self, level: Hashable | int | None = None) -> Index:
         return self.get_level_values(level).drop_duplicates()
 
     def isin(self, values) -> Index:
+        import bigframes.series as series
+
+        if isinstance(values, (series.Series, Index)):
+            return Index(self.to_series().isin(values))
         if not utils.is_list_like(values):
             raise TypeError(
                 "only list-like objects are allowed to be passed to "
@@ -614,6 +618,10 @@ def to_numpy(self, dtype=None, *, allow_large_results=None, **kwargs) -> np.ndarray:
     def __len__(self):
         return self.shape[0]
 
+    def item(self):
+        # Docstring is in third_party/bigframes_vendored/pandas/core/indexes/base.py
+        return self.to_series().peek(2).item()
+
 
 def _should_create_datetime_index(block: blocks.Block) -> bool:
     if len(block.index.dtypes) != 1:
diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py
index 2e8c4aff44..da1c174bc4 100644
--- a/bigframes/core/local_data.py
+++ b/bigframes/core/local_data.py
@@ -265,7 +265,13 @@ def _adapt_pandas_series(
 ) -> tuple[Union[pa.ChunkedArray, pa.Array], bigframes.dtypes.Dtype]:
     # Mostly rely on pyarrow conversions, but have to convert geo without its help.
     if series.dtype == bigframes.dtypes.GEO_DTYPE:
-        series = geopandas.GeoSeries(series).to_wkt(rounding_precision=-1)
+        # geoseries produces eg "POINT (1 1)", while bq uses style "POINT(1 1)"
+        # we normalize to bq style for consistency
+        series = (
+            geopandas.GeoSeries(series)
+            .to_wkt(rounding_precision=-1)
+            .str.replace(r"(\w+) \(", repl=r"\1(", regex=True)
+        )
         return pa.array(series, type=pa.string()), bigframes.dtypes.GEO_DTYPE
     try:
         return _adapt_arrow_array(pa.array(series))
@@ -295,7 +301,7 @@ def _adapt_chunked_array(
 
 
 def _adapt_arrow_array(array: pa.Array) -> tuple[pa.Array, bigframes.dtypes.Dtype]:
-    """Normalize the array to managed storage types. Preverse shapes, only transforms values."""
+    """Normalize the array to managed storage types. Preserve shapes, only transforms values."""
     if array.offset != 0:  # Offset arrays don't have all operations implemented
         return _adapt_arrow_array(pa.concat_arrays([array]))
@@ -326,7 +332,7 @@ def _adapt_arrow_array(array: pa.Array) -> tuple[pa.Array, bigframes.dtypes.Dtype]:
         return new_value.fill_null([]), bigframes.dtypes.list_type(values_type)
     if array.type == bigframes.dtypes.JSON_ARROW_TYPE:
         return _canonicalize_json(array), bigframes.dtypes.JSON_DTYPE
-    target_type = _logical_type_replacements(array.type)
+    target_type = logical_type_replacements(array.type)
     if target_type != array.type:
         # TODO: Maybe warn if lossy conversion?
         array = array.cast(target_type)
@@ -372,6 +378,10 @@ def recursive_f(type: pa.DataType) -> pa.DataType:
         if new_field_t != type.value_type:
             return pa.list_(new_field_t)
         return type
+    # polars can produce large lists, and we want to map these down to regular lists
+    if pa.types.is_large_list(type):
+        new_field_t = recursive_f(type.value_type)
+        return pa.list_(new_field_t)
     if pa.types.is_struct(type):
         struct_type = cast(pa.StructType, type)
         new_fields: list[pa.Field] = []
@@ -385,7 +395,7 @@ def recursive_f(type: pa.DataType) -> pa.DataType:
 
 
 @_recursive_map_types
-def _logical_type_replacements(type: pa.DataType) -> pa.DataType:
+def logical_type_replacements(type: pa.DataType) -> pa.DataType:
     if pa.types.is_timestamp(type):
         # This is potentially lossy, but BigFrames doesn't support ns
         new_tz = "UTC" if (type.tz is not None) else None
@@ -403,8 +413,11 @@ def _logical_type_replacements(type: pa.DataType) -> pa.DataType:
     if pa.types.is_large_string(type):
         # simple string type can handle the largest strings needed
         return pa.string()
+    if pa.types.is_large_binary(type):
+        # simple binary type can handle the largest values needed
+        return pa.binary()
     if pa.types.is_dictionary(type):
-        return _logical_type_replacements(type.value_type)
+        return logical_type_replacements(type.value_type)
     if pa.types.is_null(type):
         # null as a type not allowed, default type is float64 for bigframes
         return pa.float64()
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
index cc82c844f7..9dcd74182b 100644
--- a/bigframes/core/nodes.py
+++ b/bigframes/core/nodes.py
@@ -154,6 +154,16 @@ def is_limit(self) -> bool:
             and (self.stop > 0)
         )
 
+    @property
+    def is_noop(self) -> bool:
+        """Returns whether this node doesn't actually change the results."""
+        # TODO: Handle tail case.
+        return (
+            ((not self.start) or (self.start == 0))
+            and (self.step == 1)
+            and ((self.stop is None) or (self.stop == self.row_count))
+        )
+
     @property
     def row_count(self) -> typing.Optional[int]:
         child_length = self.child.row_count
@@ -591,6 +601,10 @@ class ScanList:
 
     items: typing.Tuple[ScanItem, ...]
 
+    @classmethod
+    def from_items(cls, items: Iterable[ScanItem]) -> ScanList:
+        return cls(tuple(items))
+
     def filter_cols(
         self,
         ids: AbstractSet[identifiers.ColumnId],
diff --git a/bigframes/core/pyarrow_utils.py b/bigframes/core/pyarrow_utils.py
index eead30d908..4196e68304 100644
--- a/bigframes/core/pyarrow_utils.py
+++ b/bigframes/core/pyarrow_utils.py
@@ -85,3 +85,18 @@ def truncate_pyarrow_iterable(
         else:
             yield batch
             total_yielded += batch.num_rows
+
+
+def append_offsets(
+    pa_table: pa.Table,
+    offsets_col: str,
+) -> pa.Table:
+    return pa_table.append_column(
+        offsets_col, pa.array(range(pa_table.num_rows), type=pa.int64())
+    )
+
+
+def as_nullable(pa_table: pa.Table):
+    """Normalizes schema to nullable for value-wise comparisons."""
+    nullable_schema = pa.schema(field.with_nullable(True) for field in pa_table.schema)
+    return pa_table.cast(nullable_schema)
diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py
index b8f1d26db8..5d554d45d7 100644
--- a/bigframes/core/rewrite/__init__.py
+++ b/bigframes/core/rewrite/__init__.py
@@ -24,7 +24,7 @@
 )
 from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice
 from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
-from bigframes.core.rewrite.windows import rewrite_range_rolling
+from bigframes.core.rewrite.windows import pull_out_window_order, rewrite_range_rolling
 
 __all__ = [
     "legacy_join_as_projection",
@@ -41,4 +41,5 @@
     "bake_order",
     "try_reduce_to_local_scan",
     "fold_row_counts",
+    "pull_out_window_order",
 ]
diff --git a/bigframes/core/rewrite/scan_reduction.py b/bigframes/core/rewrite/scan_reduction.py
index b9050c0c34..b0729337e7 100644
--- a/bigframes/core/rewrite/scan_reduction.py
+++ b/bigframes/core/rewrite/scan_reduction.py
@@ -16,6 +16,7 @@
 from typing import Optional
 
 from bigframes.core import nodes
+import bigframes.core.rewrite.slices
 
 
 def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTableNode]:
@@ -28,7 +29,15 @@ def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTableNode]:
     return None
 
 
-def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLocalNode]:
+def try_reduce_to_local_scan(
+    node: nodes.BigFrameNode,
+) -> Optional[tuple[nodes.ReadLocalNode, Optional[int]]]:
+    """Create a ReadLocalNode with optional limit, if possible.
+
+    Similar to ReadApiSemiExecutor._try_adapt_plan.
+    """
+    node, limit = bigframes.core.rewrite.slices.pull_out_limit(node)
+
     if not all(
         map(
             lambda x: isinstance(x, (nodes.ReadLocalNode, nodes.SelectionNode)),
@@ -38,7 +47,7 @@ def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLocalNode]:
         return None
     result = node.bottom_up(merge_scan)
     if isinstance(result, nodes.ReadLocalNode):
-        return result
+        return result, limit
     return None
diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py
index 92911310da..bed3a8a3f3 100644
--- a/bigframes/core/rewrite/slices.py
+++ b/bigframes/core/rewrite/slices.py
@@ -57,6 +57,9 @@ def pull_out_limit(
         if (prior_limit is not None) and (prior_limit < limit):
             limit = prior_limit
         return new_root, limit
+    if root.is_noop:
+        new_root, prior_limit = pull_out_limit(root.child)
+        return new_root, prior_limit
     elif (
         isinstance(root, (nodes.SelectionNode, nodes.ProjectionNode))
         and root.row_preserving
diff --git a/bigframes/core/rewrite/windows.py b/bigframes/core/rewrite/windows.py
index 9f55db23af..6e9ba0dd3d 100644
--- a/bigframes/core/rewrite/windows.py
+++ b/bigframes/core/rewrite/windows.py
@@ -17,7 +17,7 @@
 import dataclasses
 
 from bigframes import operations as ops
-from bigframes.core import nodes
+from bigframes.core import guid, identifiers, nodes, ordering
 
 
 def rewrite_range_rolling(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
@@ -43,3 +43,34 @@ def rewrite_range_rolling(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
         node,
         window_spec=dataclasses.replace(node.window_spec, ordering=(new_ordering,)),
     )
+
+
+def pull_out_window_order(root: nodes.BigFrameNode) -> nodes.BigFrameNode:
+    return root.bottom_up(rewrite_window_node)
+
+
+def rewrite_window_node(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
+    if not isinstance(node, nodes.WindowOpNode):
+        return node
+    if len(node.window_spec.ordering) == 0:
+        return node
+    else:
+        offsets_id = guid.generate_guid()
+        w_offsets = nodes.PromoteOffsetsNode(
+            node.child, identifiers.ColumnId(offsets_id)
+        )
+        sorted_child = nodes.OrderByNode(w_offsets, node.window_spec.ordering)
+        new_window_node = dataclasses.replace(
+            node,
+            child=sorted_child,
+            window_spec=node.window_spec.without_order(force=True),
+        )
+        w_resetted_order = nodes.OrderByNode(
+            new_window_node,
+            by=(ordering.ascending_over(identifiers.ColumnId(offsets_id)),),
+            is_total_order=True,
+        )
+        w_offsets_dropped = nodes.SelectionNode(
+            w_resetted_order, tuple(nodes.AliasedRef.identity(id) for id in node.ids)
+        )
+        return w_offsets_dropped
diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py
index 4f636ab210..b1a77d1259 100644
--- a/bigframes/core/schema.py
+++ b/bigframes/core/schema.py
@@ -17,7 +17,7 @@
 from dataclasses import dataclass
 import functools
 import typing
-from typing import Sequence
+from typing import Dict, List, Sequence
 
 import google.cloud.bigquery
 import pyarrow
@@ -47,14 +47,24 @@ def from_bq_table(
         column_type_overrides: typing.Optional[
             typing.Dict[str, bigframes.dtypes.Dtype]
         ] = None,
+    ):
+        return ArraySchema.from_bq_schema(
+            table.schema, column_type_overrides=column_type_overrides
+        )
+
+    @classmethod
+    def from_bq_schema(
+        cls,
+        schema: List[google.cloud.bigquery.SchemaField],
+        column_type_overrides: typing.Optional[
+            Dict[str, bigframes.dtypes.Dtype]
+        ] = None,
     ):
         if column_type_overrides is None:
             column_type_overrides = {}
         items = tuple(
             SchemaItem(name, column_type_overrides.get(name, dtype))
-            for name, dtype in bigframes.dtypes.bf_type_from_type_kind(
-                table.schema
-            ).items()
+            for name, dtype in bigframes.dtypes.bf_type_from_type_kind(schema).items()
         )
         return ArraySchema(items)
diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py
index d08ba3d12a..2be30135ee 100644
--- a/bigframes/core/window_spec.py
+++ b/bigframes/core/window_spec.py
@@ -234,7 +234,9 @@ def is_row_bounded(self):
         This is relevant for determining whether the window requires a total order
         to calculate deterministically.
         """
-        return isinstance(self.bounds, RowsWindowBounds)
+        return isinstance(self.bounds, RowsWindowBounds) and (
+            (self.bounds.start is not None) or (self.bounds.end is not None)
+        )
 
     @property
     def is_range_bounded(self):
@@ -254,7 +256,9 @@ def is_unbounded(self):
         This is relevant for determining whether the window requires a total order
         to calculate deterministically.
         """
-        return self.bounds is None
+        return self.bounds is None or (
+            self.bounds.start is None and self.bounds.end is None
+        )
 
     @property
     def all_referenced_columns(self) -> Set[ids.ColumnId]:
@@ -266,9 +270,9 @@ def all_referenced_columns(self) -> Set[ids.ColumnId]:
         )
         return set(itertools.chain((i.id for i in self.grouping_keys), ordering_vars))
 
-    def without_order(self) -> WindowSpec:
+    def without_order(self, force: bool = False) -> WindowSpec:
         """Removes ordering clause if ordering isn't required to define bounds."""
-        if self.is_row_bounded:
+        if self.is_row_bounded and not force:
             raise ValueError("Cannot remove order from row-bounded window")
         return replace(self, ordering=())
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 262fa9dde7..2c5df89665 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -754,7 +754,7 @@ def bf_type_from_type_kind(
 
 def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
     """Captures whether a scalar can be losslessly represented by a dtype."""
-    if scalar is None:
+    if pd.isna(scalar):
         return True
     if pd.api.types.is_bool_dtype(dtype):
         return pd.api.types.is_bool(scalar)
diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py
index 0cc3d52c38..d03021dd23 100644
--- a/bigframes/functions/_function_client.py
+++ b/bigframes/functions/_function_client.py
@@ -125,11 +125,15 @@ def _ensure_dataset_exists(self) -> None:
     def _create_bq_function(self, create_function_ddl: str) -> None:
         # TODO(swast): plumb through the original, user-facing api_name.
         _, query_job = bigframes.session._io.bigquery.start_query_with_client(
-            self._session.bqclient,
+            cast(bigquery.Client, self._session.bqclient),
             create_function_ddl,
             job_config=bigquery.QueryJobConfig(),
+            location=None,
+            project=None,
+            timeout=None,
+            metrics=None,
+            query_with_job=True,
         )
-        assert query_job is not None
         logger.info(f"Created bigframes function {query_job.ddl_target_routine}")
 
     def _format_function_options(self, function_options: dict) -> str:
diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py
index 38ebda7d92..2999625cda 100644
--- a/bigframes/geopandas/geoseries.py
+++ b/bigframes/geopandas/geoseries.py
@@ -30,6 +30,12 @@ def __init__(self, data=None, index=None, **kwargs):
             data=data, index=index, dtype=geopandas.array.GeometryDtype(), **kwargs
         )
 
+    @property
+    def length(self):
+        raise NotImplementedError(
+            "GeoSeries.length is not yet implemented. Please use bigframes.bigquery.st_length(geoseries) instead."
+ ) + @property def x(self) -> bigframes.series.Series: series = self._apply_unary_op(ops.geo_x_op) @@ -57,6 +63,15 @@ def boundary(self) -> bigframes.series.Series: # type: ignore series.name = None return series + @property + def is_closed(self) -> bigframes.series.Series: + # TODO(tswast): GeoPandas doesn't treat Point as closed. Use ST_LENGTH + # when available to filter out "closed" shapes that return false in + # GeoPandas. + raise NotImplementedError( + f"GeoSeries.is_closed is not supported. Use bigframes.bigquery.st_isclosed(series), instead. {constants.FEEDBACK_LINK}" + ) + @classmethod def from_wkt(cls, data, index=None) -> GeoSeries: series = bigframes.series.Series(data, index=index) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index a03dc937dc..cd27357680 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -59,7 +59,8 @@ def __init__( warm_start: bool = False, ): self.n_clusters = n_clusters - self.init = init + # allow the alias to be compatible with sklean + self.init = "kmeans++" if init == "k-means++" else init self.init_col = init_col self.distance_type = distance_type self.max_iter = max_iter diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 3e97ec6f4a..faf4e18d5e 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -98,9 +98,11 @@ geo_st_geogfromtext_op, geo_st_geogpoint_op, geo_st_intersection_op, + geo_st_isclosed_op, geo_x_op, geo_y_op, GeoStDistanceOp, + GeoStLengthOp, ) from bigframes.operations.json_ops import ( JSONExtract, @@ -385,6 +387,8 @@ "geo_st_geogfromtext_op", "geo_st_geogpoint_op", "geo_st_intersection_op", + "geo_st_isclosed_op", + "GeoStLengthOp", "geo_x_op", "geo_y_op", "GeoStDistanceOp", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e3f15e67a1..1c321c0bf8 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -439,7 +439,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.INT_DTYPE -# TODO: Convert to NullaryWindowOp @dataclasses.dataclass(frozen=True) class RankOp(UnaryWindowOp): name: ClassVar[str] = "rank" @@ -456,7 +455,6 @@ def implicitly_inherits_order(self): return False -# TODO: Convert to NullaryWindowOp @dataclasses.dataclass(frozen=True) class DenseRankOp(UnaryWindowOp): @property diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index c65947f53f..87245d104e 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -16,7 +16,7 @@ import re import typing -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence import warnings import numpy as np @@ -258,6 +258,101 @@ def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series: return concat([self._df, *attach_columns], axis=1) + def classify( + self, + instruction: str, + model, + labels: Sequence[str], + output_column: str = "result", + ground_with_google_search: bool = False, + attach_logprobs=False, + ): + """ + Classifies the rows of dataframes based on user instruction into the provided labels. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.ai_operators = True + >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") + + >>> df = bpd.DataFrame({ + ... 
"feedback_text": [ + ... "The product is amazing, but the shipping was slow.", + ... "I had an issue with my recent bill.", + ... "The user interface is very intuitive." + ... ], + ... }) + >>> df.ai.classify("{feedback_text}", model=model, labels=["Shipping", "Billing", "UI"]) + feedback_text result + 0 The product is amazing, but the shipping was s... Shipping + 1 I had an issue with my recent bill. Billing + 2 The user interface is very intuitive. UI + + [3 rows x 2 columns] + + Args: + instruction (str): + An instruction on how to classify the data. This value must contain + column references by name, which should be wrapped in a pair of braces. + For example, if you have a column "feedback", you can refer to this column + with"{food}". + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by Bigframes ML package. + + labels (Sequence[str]): + A collection of labels (categories). It must contain at least two and at most 20 elements. + Labels are case sensitive. Duplicated labels are not allowed. + + output_column (str, default "result"): + The name of column for the output. + + ground_with_google_search (bool, default False): + Enables Grounding with Google Search for the GeminiTextGenerator model. + When set to True, the model incorporates relevant information from Google + Search results into its responses, enhancing their accuracy and factualness. + Note: Using this feature may impact billing costs. Refer to the pricing + page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models + The default is `False`. + + attach_logprobs (bool, default False): + Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level + of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0. + + + Returns: + bigframes.pandas.DataFrame: DataFrame with classification result. + + Raises: + NotImplementedError: when the AI operator experiment is off. + ValueError: when the instruction refers to a non-existing column, when no + columns are referred to, or when the count of labels does not meet the + requirement. + """ + + if len(labels) < 2 or len(labels) > 20: + raise ValueError( + f"The number of labels should be between 2 and 20 (inclusive), but {len(labels)} labels are provided." 
+ ) + + if len(set(labels)) != len(labels): + raise ValueError("There are duplicate labels.") + + updated_instruction = f"Based on the user instruction {instruction}, you must provide an answer that must exist in the following list of labels: {labels}" + + return self.map( + updated_instruction, + model, + output_schema={output_column: "string"}, + ground_with_google_search=ground_with_google_search, + attach_logprobs=attach_logprobs, + ) + def join( self, other, diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 8da88d1ff8..e143cfc519 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -15,7 +15,7 @@ from __future__ import annotations import os -from typing import cast, Optional, Union +from typing import cast, Literal, Optional, Union import warnings import IPython.display as ipy_display @@ -736,3 +736,77 @@ def pdf_chunk( return struct_series else: return content_series + + def audio_transcribe( + self, + *, + connection: Optional[str] = None, + model_name: Optional[ + Literal[ + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-001", + ] + ] = None, + verbose: bool = False, + ) -> bigframes.series.Series: + """ + Transcribe audio content using a Gemini multimodal model. + + Args: + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" + is str. If None, uses default connection of the session. + model_name (str): The model for natural language tasks. Accepted + values are "gemini-2.0-flash-lite-001", and "gemini-2.0-flash-001". + See "https://ai.google.dev/gemini-api/docs/models" for model choices. + verbose (bool, default "False"): controls the verbosity of the output. + When set to True, both error messages and the transcribed content + are displayed. Conversely, when set to False, only the transcribed + content is presented, suppressing error messages. + + Returns: + bigframes.series.Series: str or struct[str, str], + depend on the "verbose" parameter. + Contains the transcribed text from the audio file. + Includes error messages if verbosity is enabled. + """ + import bigframes.bigquery as bbq + import bigframes.ml.llm as llm + import bigframes.pandas as bpd + + # col name doesn't matter here. Rename to avoid column name conflicts + audio_series = bigframes.series.Series(self._block) + + prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio." 
+ + llm_model = llm.GeminiTextGenerator( + model_name=model_name, + session=self._block.session, + connection_name=connection, + ) + + # transcribe audio using ML.GENERATE_TEXT + transcribed_results = llm_model.predict( + X=audio_series, + prompt=[prompt_text, audio_series], + temperature=0.0, + ) + + transcribed_content_series = cast( + bpd.Series, transcribed_results["ml_generate_text_llm_result"] + ).rename("transcribed_content") + + if verbose: + transcribed_status_series = cast( + bpd.Series, transcribed_results["ml_generate_text_status"] + ) + results_df = bpd.DataFrame( + { + "status": transcribed_status_series, + "content": transcribed_content_series, + } + ) + results_struct = bbq.struct(results_df).rename("transcription_results") + return results_struct + else: + return transcribed_content_series diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 98da9099cd..1b99e47ab1 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -54,6 +54,13 @@ name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() ) +geo_st_isclosed_op = base_ops.create_unary_op( + name="geo_st_isclosed", + type_signature=op_typing.FixedOutputType( + dtypes.is_geo_like, dtypes.BOOL_DTYPE, description="geo-like" + ), +) + geo_x_op = base_ops.create_unary_op( name="geo_x", type_signature=op_typing.FixedOutputType( @@ -80,3 +87,12 @@ class GeoStDistanceOp(base_ops.BinaryOp): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return dtypes.FLOAT_DTYPE + + +@dataclasses.dataclass(frozen=True) +class GeoStLengthOp(base_ops.UnaryOp): + name = "geo_st_length" + use_spheroid: bool = False + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.FLOAT_DTYPE diff --git a/bigframes/series.py b/bigframes/series.py index 74e8d03c8d..1bb0c1e0dc 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -960,6 +960,10 @@ def peek( as_series.name = self.name return as_series + def item(self): + # Docstring is in third_party/bigframes_vendored/pandas/core/series.py + return self.peek(2).item() + def nlargest(self, n: int = 5, keep: str = "first") -> Series: if keep not in ("first", "last", "all"): raise ValueError("'keep must be one of 'first', 'last', or 'all'") @@ -979,8 +983,10 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: ) def isin(self, values) -> "Series" | None: - if isinstance(values, (Series,)): + if isinstance(values, Series): return Series(self._block.isin(values._block)) + if isinstance(values, indexes.Index): + return Series(self._block.isin(values.to_series()._block)) if not _is_list_like(values): raise TypeError( "only list-like objects are allowed to be passed to " diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c24dca554a..ab09230c99 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -537,6 +537,10 @@ def _read_gbq_colab( index_col=bigframes.enums.DefaultIndexKind.NULL, force_total_order=False, dry_run=typing.cast(Union[Literal[False], Literal[True]], dry_run), + # TODO(tswast): we may need to allow allow_large_results to be overwritten + # or possibly a general configuration object for an explicit + # destination table and write disposition. 
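+            # _read_gbq_colab targets small, interactive result sets, so we
+            # default to the jobless query path here.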
+ allow_large_results=False, ) @overload @@ -1166,7 +1170,11 @@ def _read_csv_w_bigquery_engine( table_id = self._loader.load_file(filepath_or_buffer, job_config=job_config) df = self._loader.read_gbq_table( - table_id, index_col=index_col, columns=columns, names=names + table_id, + index_col=index_col, + columns=columns, + names=names, + index_col_in_columns=True, ) if dtype is not None: @@ -1917,10 +1925,15 @@ def _start_query_ml_ddl( # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model job_config.destination_encryption_configuration = None iterator, query_job = bf_io_bigquery.start_query_with_client( - self.bqclient, sql, job_config=job_config, metrics=self._metrics + self.bqclient, + sql, + job_config=job_config, + metrics=self._metrics, + location=None, + project=None, + timeout=None, + query_with_job=True, ) - - assert query_job is not None return iterator, query_job def _create_object_table(self, path: str, connection: str) -> str: @@ -1943,6 +1956,10 @@ def _create_object_table(self, path: str, connection: str) -> str: sql, job_config=bigquery.QueryJobConfig(), metrics=self._metrics, + location=None, + project=None, + timeout=None, + query_with_job=True, ) return table diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 267111afe0..fdc240fa69 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -22,7 +22,7 @@ import textwrap import types import typing -from typing import Dict, Iterable, Mapping, Optional, Tuple, Union +from typing import Dict, Iterable, Literal, Mapping, Optional, overload, Tuple, Union import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions @@ -38,7 +38,6 @@ IO_ORDERING_ID = "bqdf_row_nums" -MAX_LABELS_COUNT = 64 - 8 _LIST_TABLES_LIMIT = 10000 # calls to bqclient.list_tables # will be limited to this many tables @@ -73,7 +72,12 @@ def create_job_configs_labels( ) ) values = list(itertools.chain(job_configs_labels.values(), api_methods)) - return dict(zip(labels[:MAX_LABELS_COUNT], values[:MAX_LABELS_COUNT])) + return dict( + zip( + labels[: log_adapter.MAX_LABELS_COUNT], + values[: log_adapter.MAX_LABELS_COUNT], + ) + ) def create_export_data_statement( @@ -223,8 +227,7 @@ def format_option(key: str, value: Union[bool, str]) -> str: def add_and_trim_labels(job_config): """ Add additional labels to the job configuration and trim the total number of labels - to ensure they do not exceed the maximum limit allowed by BigQuery, which is 64 - labels per job. + to ensure they do not exceed MAX_LABELS_COUNT labels per job. """ api_methods = log_adapter.get_and_reset_api_methods(dry_run=job_config.dry_run) job_config.labels = create_job_configs_labels( @@ -233,23 +236,54 @@ def add_and_trim_labels(job_config): ) +@overload def start_query_with_client( bq_client: bigquery.Client, sql: str, - job_config: bigquery.job.QueryJobConfig, + *, + job_config: bigquery.QueryJobConfig, + location: Optional[str], + project: Optional[str], + timeout: Optional[float], + metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + query_with_job: Literal[True], +) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + ... 
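+# With query_with_job=True, the QueryJob in the returned tuple is guaranteed
+# to be present; the Literal[False] overload below may return None for the job.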
+ + +@overload +def start_query_with_client( + bq_client: bigquery.Client, + sql: str, + *, + job_config: bigquery.QueryJobConfig, + location: Optional[str], + project: Optional[str], + timeout: Optional[float], + metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + query_with_job: Literal[False], +) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: + ... + + +def start_query_with_client( + bq_client: bigquery.Client, + sql: str, + *, + job_config: bigquery.QueryJobConfig, location: Optional[str] = None, project: Optional[str] = None, timeout: Optional[float] = None, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, - *, query_with_job: bool = True, ) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: """ Starts query job and waits for results. """ try: - # Note: Ensure no additional labels are added to job_config after this point, - # as `add_and_trim_labels` ensures the label count does not exceed 64. + # Note: Ensure no additional labels are added to job_config after this + # point, as `add_and_trim_labels` ensures the label count does not + # exceed MAX_LABELS_COUNT. add_and_trim_labels(job_config) if not query_with_job: results_iterator = bq_client.query_and_wait( @@ -322,8 +356,8 @@ def delete_tables_matching_session_id( def create_bq_dataset_reference( bq_client: bigquery.Client, - location=None, - project=None, + location: Optional[str] = None, + project: Optional[str] = None, ) -> bigquery.DatasetReference: """Create and identify dataset(s) for temporary BQ resources. @@ -352,6 +386,9 @@ def create_bq_dataset_reference( location=location, job_config=job_config, project=project, + timeout=None, + metrics=None, + query_with_job=True, ) # The anonymous dataset is used by BigQuery to write query results and @@ -359,7 +396,6 @@ def create_bq_dataset_reference( # to the dataset, no BigQuery Session required. Note: there is a # different anonymous dataset per location. See: # https://cloud.google.com/bigquery/docs/cached-results#how_cached_results_are_stored - assert query_job is not None query_destination = query_job.destination return bigquery.DatasetReference( query_destination.project, diff --git a/bigframes/session/_io/bigquery/read_gbq_query.py b/bigframes/session/_io/bigquery/read_gbq_query.py new file mode 100644 index 0000000000..70c83d7875 --- /dev/null +++ b/bigframes/session/_io/bigquery/read_gbq_query.py @@ -0,0 +1,90 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Private helpers for implementing read_gbq_query."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from google.cloud import bigquery
+import google.cloud.bigquery.table
+import pandas
+
+from bigframes import dataframe
+from bigframes.core import local_data, pyarrow_utils
+import bigframes.core as core
+import bigframes.core.blocks as blocks
+import bigframes.core.guid
+import bigframes.core.schema as schemata
+import bigframes.session
+
+
+def create_dataframe_from_query_job_stats(
+    query_job: Optional[bigquery.QueryJob], *, session: bigframes.session.Session
+) -> dataframe.DataFrame:
+    """Convert a QueryJob into a DataFrame with key statistics about the query.
+
+    If you change anything here, please try to keep it in sync with pandas-gbq.
+    """
+    return dataframe.DataFrame(
+        data=pandas.DataFrame(
+            {
+                "statement_type": [
+                    query_job.statement_type if query_job else "unknown"
+                ],
+                "job_id": [query_job.job_id if query_job else "unknown"],
+                "location": [query_job.location if query_job else "unknown"],
+            }
+        ),
+        session=session,
+    )
+
+
+def create_dataframe_from_row_iterator(
+    rows: google.cloud.bigquery.table.RowIterator, *, session: bigframes.session.Session
+) -> dataframe.DataFrame:
+    """Convert a RowIterator into a DataFrame wrapping a LocalNode.
+
+    This allows us to create a DataFrame from query results, even in the
+    'jobless' case where there's no destination table.
+    """
+    pa_table = rows.to_arrow()
+
+    # TODO(tswast): Use array_value.promote_offsets() instead once that node is
+    # supported by the local engine.
+    offsets_col = bigframes.core.guid.generate_guid()
+    pa_table = pyarrow_utils.append_offsets(pa_table, offsets_col=offsets_col)
+
+    # We use the ManagedArrowTable constructor directly, because the
+    # results of to_arrow() should be the source of truth with regard
+    # to canonical formats, since it comes from either the BQ Storage
+    # Read API or has been transformed by google-cloud-bigquery to look
+    # like the output of the BQ Storage Read API.
+    mat = local_data.ManagedArrowTable(
+        pa_table,
+        schemata.ArraySchema.from_bq_schema(
+            list(rows.schema) + [bigquery.SchemaField(offsets_col, "INTEGER")]
+        ),
+    )
+    mat.validate()
+
+    array_value = core.ArrayValue.from_managed(mat, session)
+    block = blocks.Block(
+        array_value,
+        (offsets_col,),
+        [field.name for field in rows.schema],
+        (None,),
+    )
+    return dataframe.DataFrame(block)
diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py
index 33d3314a1e..47be6fa768 100644
--- a/bigframes/session/bq_caching_executor.py
+++ b/bigframes/session/bq_caching_executor.py
@@ -320,6 +320,10 @@ def export_gcs(
             export_data_statement,
             job_config=bigquery.QueryJobConfig(),
             metrics=self.metrics,
+            project=None,
+            location=None,
+            timeout=None,
+            query_with_job=True,
         )
         return query_job
 
@@ -383,14 +387,29 @@ def _run_execute_query(
             job_config.labels["bigframes-mode"] = "unordered"
 
         try:
-            iterator, query_job = bq_io.start_query_with_client(
-                self.bqclient,
-                sql,
-                job_config=job_config,
-                metrics=self.metrics,
-                query_with_job=query_with_job,
-            )
-            return iterator, query_job
+            # Trick the type checker into thinking we got a literal.
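+            # query_with_job is typed as a plain bool here, so without this
+            # branch the type checker cannot select between the Literal[True]
+            # and Literal[False] overloads of start_query_with_client.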
+            if query_with_job:
+                return bq_io.start_query_with_client(
+                    self.bqclient,
+                    sql,
+                    job_config=job_config,
+                    metrics=self.metrics,
+                    project=None,
+                    location=None,
+                    timeout=None,
+                    query_with_job=True,
+                )
+            else:
+                return bq_io.start_query_with_client(
+                    self.bqclient,
+                    sql,
+                    job_config=job_config,
+                    metrics=self.metrics,
+                    project=None,
+                    location=None,
+                    timeout=None,
+                    query_with_job=False,
+                )
 
         except google.api_core.exceptions.BadRequest as e:
             # Unfortunately, this error type does not have a separate error code or exception type
diff --git a/bigframes/session/direct_gbq_execution.py b/bigframes/session/direct_gbq_execution.py
new file mode 100644
index 0000000000..4b19f7441d
--- /dev/null
+++ b/bigframes/session/direct_gbq_execution.py
@@ -0,0 +1,76 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+from typing import Optional, Tuple
+
+from google.cloud import bigquery
+import google.cloud.bigquery.job as bq_job
+import google.cloud.bigquery.table as bq_table
+
+from bigframes.core import compile, nodes
+from bigframes.session import executor, semi_executor
+import bigframes.session._io.bigquery as bq_io
+
+
+# Used only in testing right now; BigQueryCachingExecutor is the fully featured
+# engine. This executor is simplified: it does not handle large (>10 GB) query
+# results, error handling, the global config, or metrics recording. It also
+# avoids caching and most pre-compile rewrites, to better serve as a reference
+# for validating more complex executors.
+class DirectGbqExecutor(semi_executor.SemiExecutor):
+    def __init__(self, bqclient: bigquery.Client):
+        self.bqclient = bqclient
+
+    def execute(
+        self,
+        plan: nodes.BigFrameNode,
+        ordered: bool,
+        peek: Optional[int] = None,
+    ) -> executor.ExecuteResult:
+        """Execute the plan as-is, without further caching or decomposition."""
+        # TODO(swast): plumb through the api_name of the user-facing api that
+        # caused this query.
+
+        compiled = compile.compile_sql(
+            compile.CompileRequest(plan, sort_rows=ordered, peek_count=peek)
+        )
+        iterator, query_job = self._run_execute_query(
+            sql=compiled.sql,
+        )
+
+        return executor.ExecuteResult(
+            arrow_batches=iterator.to_arrow_iterable(),
+            schema=plan.schema,
+            query_job=query_job,
+            total_rows=iterator.total_rows,
+        )
+
+    def _run_execute_query(
+        self,
+        sql: str,
+        job_config: Optional[bq_job.QueryJobConfig] = None,
+    ) -> Tuple[bq_table.RowIterator, Optional[bigquery.QueryJob]]:
+        """
+        Starts BigQuery query job and waits for results.
+        
+ """ + return bq_io.start_query_with_client( + self.bqclient, + sql, + job_config=job_config or bq_job.QueryJobConfig(), + project=None, + location=None, + timeout=None, + metrics=None, + query_with_job=False, + ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index ba669a62bb..814d44292e 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -22,6 +22,7 @@ import os import typing from typing import ( + cast, Dict, Generator, Hashable, @@ -39,6 +40,7 @@ import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions from google.cloud import bigquery_storage_v1 +import google.cloud.bigquery import google.cloud.bigquery as bigquery from google.cloud.bigquery_storage_v1 import types as bq_storage_types import pandas @@ -52,6 +54,7 @@ import bigframes.formatting_helpers as formatting_helpers from bigframes.session import dry_runs import bigframes.session._io.bigquery as bf_io_bigquery +import bigframes.session._io.bigquery.read_gbq_query as bf_read_gbq_query import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.metrics import bigframes.session.temporary_storage @@ -93,7 +96,31 @@ def _to_index_cols( return index_cols -def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]): +def _check_column_duplicates( + index_cols: Iterable[str], columns: Iterable[str], index_col_in_columns: bool +) -> Iterable[str]: + """Validates and processes index and data columns for duplicates and overlap. + + This function performs two main tasks: + 1. Ensures there are no duplicate column names within the `index_cols` list + or within the `columns` list. + 2. Based on the `index_col_in_columns` flag, it validates the relationship + between `index_cols` and `columns`. + + Args: + index_cols (Iterable[str]): + An iterable of column names designated as the index. + columns (Iterable[str]): + An iterable of column names designated as the data columns. + index_col_in_columns (bool): + A flag indicating how to handle overlap between `index_cols` and + `columns`. + - If `False`, the two lists must be disjoint (contain no common + elements). An error is raised if any overlap is found. + - If `True`, `index_cols` is expected to be a subset of + `columns`. An error is raised if an index column is not found + in the `columns` list. + """ index_cols_list = list(index_cols) if index_cols is not None else [] columns_list = list(columns) if columns is not None else [] set_index = set(index_cols_list) @@ -105,17 +132,29 @@ def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]): "All column names specified in 'index_col' must be unique." ) + if len(columns_list) == 0: + return columns + if len(columns_list) > len(set_columns): raise ValueError( "The 'columns' argument contains duplicate names. " "All column names specified in 'columns' must be unique." ) - if not set_index.isdisjoint(set_columns): - raise ValueError( - "Found column names that exist in both 'index_col' and 'columns' arguments. " - "These arguments must specify distinct sets of columns." - ) + if index_col_in_columns: + if not set_index.issubset(set_columns): + raise ValueError( + f"The specified index column(s) were not found: {set_index - set_columns}. 
" + f"Available columns are: {set_columns}" + ) + return [col for col in columns if col not in set_index] + else: + if not set_index.isdisjoint(set_columns): + raise ValueError( + "Found column names that exist in both 'index_col' and 'columns' arguments. " + "These arguments must specify distinct sets of columns." + ) + return columns @dataclasses.dataclass @@ -388,6 +427,7 @@ def read_gbq_table( # type: ignore[overload-overlap] dry_run: Literal[False] = ..., force_total_order: Optional[bool] = ..., n_rows: Optional[int] = None, + index_col_in_columns: bool = False, ) -> dataframe.DataFrame: ... @@ -410,6 +450,7 @@ def read_gbq_table( dry_run: Literal[True] = ..., force_total_order: Optional[bool] = ..., n_rows: Optional[int] = None, + index_col_in_columns: bool = False, ) -> pandas.Series: ... @@ -431,7 +472,67 @@ def read_gbq_table( dry_run: bool = False, force_total_order: Optional[bool] = None, n_rows: Optional[int] = None, + index_col_in_columns: bool = False, ) -> dataframe.DataFrame | pandas.Series: + """Read a BigQuery table into a BigQuery DataFrames DataFrame. + + This method allows you to create a DataFrame from a BigQuery table. + You can specify the columns to load, an index column, and apply + filters. + + Args: + table_id (str): + The identifier of the BigQuery table to read. + index_col (Iterable[str] | str | Iterable[int] | int | bigframes.enums.DefaultIndexKind, optional): + The column(s) to use as the index for the DataFrame. This can be + a single column name or a list of column names. If not provided, + a default index will be used based on the session's + ``default_index_type``. + columns (Iterable[str], optional): + The columns to read from the table. If not specified, all + columns will be read. + names (Optional[Iterable[str]], optional): + A list of column names to use for the resulting DataFrame. This + is useful if you want to rename the columns as you read the + data. + max_results (Optional[int], optional): + The maximum number of rows to retrieve from the table. If not + specified, all rows will be loaded. + use_cache (bool, optional): + Whether to use cached results for the query. Defaults to True. + Setting this to False will force a re-execution of the query. + filters (third_party_pandas_gbq.FiltersType, optional): + A list of filters to apply to the data. Filters are specified + as a list of tuples, where each tuple contains a column name, + an operator (e.g., '==', '!='), and a value. + enable_snapshot (bool, optional): + If True, a snapshot of the table is used to ensure that the + DataFrame is deterministic, even if the underlying table + changes. Defaults to True. + dry_run (bool, optional): + If True, the function will not actually execute the query but + will instead return statistics about the table. Defaults to False. + force_total_order (Optional[bool], optional): + If True, a total ordering is enforced on the DataFrame, which + can be useful for operations that require a stable row order. + If None, the session's default behavior is used. + n_rows (Optional[int], optional): + The number of rows to consider for type inference and other + metadata operations. This does not limit the number of rows + in the final DataFrame. + index_col_in_columns (bool, optional): + Specifies if the ``index_col`` is also present in the ``columns`` + list. Defaults to ``False``. + + * If ``False``, ``index_col`` and ``columns`` must specify + distinct sets of columns. An error will be raised if any + column is found in both. 
+ * If ``True``, the column(s) in ``index_col`` are expected to + also be present in the ``columns`` list. This is useful + when the index is selected from the data columns (e.g., in a + ``read_csv`` scenario). The column will be used as the + DataFrame's index and removed from the list of value columns. + """ import bigframes._tools.strings import bigframes.dataframe as dataframe @@ -513,7 +614,9 @@ def read_gbq_table( index_col=index_col, names=names, ) - _check_column_duplicates(index_cols, columns) + columns = list( + _check_column_duplicates(index_cols, columns, index_col_in_columns) + ) for key in index_cols: if key not in table_column_names: @@ -736,6 +839,7 @@ def read_gbq_query( # type: ignore[overload-overlap] filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[False] = ..., force_total_order: Optional[bool] = ..., + allow_large_results: bool = ..., ) -> dataframe.DataFrame: ... @@ -752,6 +856,7 @@ def read_gbq_query( filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[True] = ..., force_total_order: Optional[bool] = ..., + allow_large_results: bool = ..., ) -> pandas.Series: ... @@ -767,9 +872,8 @@ def read_gbq_query( filters: third_party_pandas_gbq.FiltersType = (), dry_run: bool = False, force_total_order: Optional[bool] = None, + allow_large_results: bool = True, ) -> dataframe.DataFrame | pandas.Series: - import bigframes.dataframe as dataframe - configuration = _transform_read_gbq_configuration(configuration) if "query" not in configuration: @@ -794,7 +898,9 @@ def read_gbq_query( ) index_cols = _to_index_cols(index_col) - _check_column_duplicates(index_cols, columns) + columns = _check_column_duplicates( + index_cols, columns, index_col_in_columns=False + ) filters_copy1, filters_copy2 = itertools.tee(filters) has_filters = len(list(filters_copy1)) != 0 @@ -824,29 +930,72 @@ def read_gbq_query( query_job, list(columns), index_cols ) - # No cluster candidates as user query might not be clusterable (eg because of ORDER BY clause) - destination, query_job = self._query_to_destination( - query, - cluster_candidates=[], - configuration=configuration, - ) + query_job_for_metrics: Optional[bigquery.QueryJob] = None + destination: Optional[bigquery.TableReference] = None + # TODO(b/421161077): If an explicit destination table is set in + # configuration, should we respect that setting? + if allow_large_results: + destination, query_job = self._query_to_destination( + query, + # No cluster candidates as user query might not be clusterable + # (eg because of ORDER BY clause) + cluster_candidates=[], + configuration=configuration, + ) + query_job_for_metrics = query_job + rows = None + else: + job_config = typing.cast( + bigquery.QueryJobConfig, + bigquery.QueryJobConfig.from_api_repr(configuration), + ) + + # TODO(b/420984164): We may want to set a page_size here to limit + # the number of results in the first jobs.query response. + rows = self._start_query_with_job_optional( + query, + job_config=job_config, + ) + + # If there is a query job, fetch it so that we can get the + # statistics and destination table, if needed. + if rows.job_id and rows.location and rows.project: + query_job = cast( + bigquery.QueryJob, + self._bqclient.get_job( + rows.job_id, project=rows.project, location=rows.location + ), + ) + destination = query_job.destination + query_job_for_metrics = query_job + + # We split query execution from results fetching so that we can log + # metrics from either the query job, row iterator, or both. 
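+        # On the allow_large_results path only query_job_for_metrics is set;
+        # on the jobless path, rows is always set and the job may be absent.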
if self._metrics is not None: - self._metrics.count_job_stats(query_job) + self._metrics.count_job_stats( + query_job=query_job_for_metrics, row_iterator=rows + ) - # If there was no destination table, that means the query must have - # been DDL or DML. Return some job metadata, instead. + # It's possible that there's no job and corresponding destination table. + # In this case, we must create a local node. + # + # TODO(b/420984164): Tune the threshold for which we download to + # local node. Likely there are a wide range of sizes in which it + # makes sense to download the results beyond the first page, even if + # there is a job and destination table available. + if rows is not None and destination is None: + return bf_read_gbq_query.create_dataframe_from_row_iterator( + rows, + session=self._session, + ) + + # If there was no destination table and we've made it this far, that + # means the query must have been DDL or DML. Return some job metadata, + # instead. if not destination: - return dataframe.DataFrame( - data=pandas.DataFrame( - { - "statement_type": [ - query_job.statement_type if query_job else "unknown" - ], - "job_id": [query_job.job_id if query_job else "unknown"], - "location": [query_job.location if query_job else "unknown"], - } - ), + return bf_read_gbq_query.create_dataframe_from_query_job_stats( + query_job_for_metrics, session=self._session, ) @@ -872,9 +1021,12 @@ def _query_to_destination( # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. dry_run_config = bigquery.QueryJobConfig() dry_run_config.dry_run = True - _, dry_run_job = self._start_query(query, job_config=dry_run_config) + dry_run_job = self._start_query_with_job( + query, + job_config=dry_run_config, + ) if dry_run_job.statement_type != "SELECT": - _, query_job = self._start_query(query) + query_job = self._start_query_with_job(query) return query_job.destination, query_job # Create a table to workaround BigQuery 10 GB query results limit. See: @@ -908,7 +1060,7 @@ def _query_to_destination( # Write to temp table to workaround BigQuery 10 GB query results # limit. See: internal issue 303057336. job_config.labels["error_caught"] = "true" - _, query_job = self._start_query( + query_job = self._start_query_with_job( query, job_config=job_config, timeout=timeout, @@ -919,34 +1071,72 @@ def _query_to_destination( # tables as the destination. For example, if the query has a # top-level ORDER BY, this conflicts with our ability to cluster # the table by the index column(s). - _, query_job = self._start_query(query, timeout=timeout) + query_job = self._start_query_with_job(query, timeout=timeout) return query_job.destination, query_job - def _start_query( + def _prepare_job_config( + self, + job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, + ) -> google.cloud.bigquery.QueryJobConfig: + job_config = bigquery.QueryJobConfig() if job_config is None else job_config + + if bigframes.options.compute.maximum_bytes_billed is not None: + # Maybe this should be pushed down into start_query_with_client + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + + return job_config + + def _start_query_with_job_optional( + self, + sql: str, + *, + job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, + timeout: Optional[float] = None, + ) -> google.cloud.bigquery.table.RowIterator: + """ + Starts BigQuery query with job optional and waits for results. + + Do not execute dataframe through this API, instead use the executor. 
+ """ + job_config = self._prepare_job_config(job_config) + rows, _ = bf_io_bigquery.start_query_with_client( + self._bqclient, + sql, + job_config=job_config, + timeout=timeout, + location=None, + project=None, + metrics=None, + query_with_job=False, + ) + return rows + + def _start_query_with_job( self, sql: str, + *, job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, timeout: Optional[float] = None, - ) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: + ) -> bigquery.QueryJob: """ Starts BigQuery query job and waits for results. Do not execute dataframe through this API, instead use the executor. """ - job_config = bigquery.QueryJobConfig() if job_config is None else job_config - if bigframes.options.compute.maximum_bytes_billed is not None: - # Maybe this should be pushed down into start_query_with_client - job_config.maximum_bytes_billed = ( - bigframes.options.compute.maximum_bytes_billed - ) - iterator, query_job = bf_io_bigquery.start_query_with_client( + job_config = self._prepare_job_config(job_config) + _, query_job = bf_io_bigquery.start_query_with_client( self._bqclient, sql, job_config=job_config, timeout=timeout, + location=None, + project=None, + metrics=None, + query_with_job=True, ) - assert query_job is not None - return iterator, query_job + return query_job def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: diff --git a/bigframes/session/local_scan_executor.py b/bigframes/session/local_scan_executor.py index 88304fa181..b4d7b226e2 100644 --- a/bigframes/session/local_scan_executor.py +++ b/bigframes/session/local_scan_executor.py @@ -30,11 +30,17 @@ def execute( ordered: bool, peek: Optional[int] = None, ) -> Optional[executor.ExecuteResult]: - node = rewrite.try_reduce_to_local_scan(plan) - if not node: + reduced_result = rewrite.try_reduce_to_local_scan(plan) + if not reduced_result: return None - # TODO: Can support some slicing, sorting + node, limit = reduced_result + + if limit is not None: + if peek is None or limit < peek: + peek = limit + + # TODO: Can support some sorting offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None arrow_table = node.local_data_source.to_pyarrow_table(offsets_col=offsets_col) if peek: @@ -46,8 +52,8 @@ def execute( arrow_table = arrow_table.select(needed_cols) arrow_table = arrow_table.rename_columns([id.sql for id in node.ids]) - total_rows = node.row_count + if (peek is not None) and (total_rows is not None): total_rows = min(peek, total_rows) diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 6a8038e189..48cb92a8b4 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -79,17 +79,24 @@ def get_performance_stats( return None bytes_processed = query_job.total_bytes_processed - if not isinstance(bytes_processed, int): + if bytes_processed and not isinstance(bytes_processed, int): return None # filter out mocks slot_millis = query_job.slot_millis - if not isinstance(slot_millis, int): + if slot_millis and not isinstance(slot_millis, int): return None # filter out mocks execution_secs = (query_job.ended - query_job.created).total_seconds() query_char_count = len(query_job.query) - return query_char_count, bytes_processed, slot_millis, execution_secs + return ( + query_char_count, + # Not every job populates these. For example, slot_millis is missing + # from queries that came from cached results. 
+ bytes_processed if bytes_processed else 0, + slot_millis if slot_millis else 0, + execution_secs, + ) def write_stats_to_disk( diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py new file mode 100644 index 0000000000..e215866874 --- /dev/null +++ b/bigframes/session/polars_executor.py @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Optional, TYPE_CHECKING + +import pyarrow as pa + +from bigframes.core import array_value, bigframe_node, local_data, nodes +from bigframes.session import executor, semi_executor + +if TYPE_CHECKING: + import polars as pl + + +_COMPATIBLE_NODES = ( + nodes.ReadLocalNode, + nodes.OrderByNode, + nodes.ReversedNode, + nodes.SelectionNode, + nodes.FilterNode, # partial support + nodes.ProjectionNode, # partial support +) + + +class PolarsExecutor(semi_executor.SemiExecutor): + def __init__(self): + # This will error out if polars is not installed + from bigframes.core.compile.polars import PolarsCompiler + + self._compiler = PolarsCompiler() + + def execute( + self, + plan: bigframe_node.BigFrameNode, + ordered: bool, + peek: Optional[int] = None, + ) -> Optional[executor.ExecuteResult]: + if not self._can_execute(plan): + return None + # Note: Ignoring ordered flag, as just executing totally ordered is fine. 
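+        # Compilation may fail on expressions the polars compiler does not
+        # support yet; returning None lets the caller fall back to another
+        # executor.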
+ try: + lazy_frame: pl.LazyFrame = self._compiler.compile( + array_value.ArrayValue(plan) + ) + except Exception: + return None + if peek is not None: + lazy_frame = lazy_frame.limit(peek) + pa_table = lazy_frame.collect().to_arrow() + return executor.ExecuteResult( + arrow_batches=iter(map(self._adapt_batch, pa_table.to_batches())), + schema=plan.schema, + total_bytes=pa_table.nbytes, + total_rows=pa_table.num_rows, + ) + + def _can_execute(self, plan: bigframe_node.BigFrameNode): + return all(isinstance(node, _COMPATIBLE_NODES) for node in plan.unique_nodes()) + + def _adapt_array(self, array: pa.Array) -> pa.Array: + target_type = local_data.logical_type_replacements(array.type) + if target_type != array.type: + return array.cast(target_type) + return array + + def _adapt_batch(self, batch: pa.RecordBatch) -> pa.RecordBatch: + new_arrays = [self._adapt_array(arr) for arr in batch.columns] + return pa.RecordBatch.from_arrays(new_arrays, names=batch.column_names) diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index ca6fa57d0b..7ddc2e2e6e 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -14,11 +14,14 @@ import copy import datetime -from typing import Any, Dict, Optional, Sequence +from typing import Any, Dict, Literal, Optional, Sequence import unittest.mock as mock +from bigframes_vendored.google_cloud_bigquery import _pandas_helpers import google.auth.credentials import google.cloud.bigquery +import google.cloud.bigquery.table +import pyarrow import pytest import bigframes @@ -40,6 +43,7 @@ def create_bigquery_session( table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA, anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None, location: str = "test-region", + ordering_mode: Literal["strict", "partial"] = "partial", ) -> bigframes.Session: """[Experimental] Create a mock BigQuery DataFrames session that avoids making Google Cloud API calls. 
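
A small, hypothetical sketch of how the mocked session above might be used in a unit test; it assumes only `create_bigquery_session` and the new `ordering_mode` parameter visible in this diff, so treat it as illustration rather than a test from the repository:

```python
# Hypothetical usage sketch (not part of this change).
from bigframes.testing import mocks


def test_constructs_session_without_network_calls():
    # The BigQuery clients inside are autospec'd mocks, so constructing the
    # session performs no Google Cloud API calls.
    session = mocks.create_bigquery_session(ordering_mode="strict")
    assert session.bqclient is not None
```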
@@ -79,43 +83,75 @@ def create_bigquery_session( queries = [] job_configs = [] - def query_mock(query, *args, job_config=None, **kwargs): + def query_mock( + query, + *args, + job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, + **kwargs, + ): queries.append(query) job_configs.append(copy.deepcopy(job_config)) - query_job = mock.create_autospec(google.cloud.bigquery.QueryJob) + query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True) query_job._properties = {} type(query_job).destination = mock.PropertyMock( return_value=anonymous_dataset.table("test_table"), ) - type(query_job).session_info = google.cloud.bigquery.SessionInfo( - {"sessionInfo": {"sessionId": session_id}}, - ) + type(query_job).statement_type = mock.PropertyMock(return_value="SELECT") + + if job_config is not None and job_config.create_session: + type(query_job).session_info = google.cloud.bigquery.SessionInfo( + {"sessionId": session_id}, + ) if query.startswith("SELECT CURRENT_TIMESTAMP()"): query_job.result = mock.MagicMock(return_value=[[bq_time]]) + elif "CREATE TEMP TABLE".casefold() in query.casefold(): + type(query_job).destination = mock.PropertyMock( + return_value=anonymous_dataset.table("temp_table_from_session"), + ) else: type(query_job).schema = mock.PropertyMock(return_value=table_schema) return query_job - existing_query_and_wait = bqclient.query_and_wait - def query_and_wait_mock(query, *args, job_config=None, **kwargs): queries.append(query) job_configs.append(copy.deepcopy(job_config)) + if query.startswith("SELECT CURRENT_TIMESTAMP()"): return iter([[datetime.datetime.now()]]) - else: - return existing_query_and_wait(query, *args, **kwargs) - bqclient.query = query_mock - bqclient.query_and_wait = query_and_wait_mock + rows = mock.create_autospec( + google.cloud.bigquery.table.RowIterator, instance=True + ) + row = mock.create_autospec(google.cloud.bigquery.table.Row, instance=True) + rows.__iter__.return_value = [row] + type(rows).schema = mock.PropertyMock(return_value=table_schema) + rows.to_arrow.return_value = pyarrow.Table.from_pydict( + {field.name: [None] for field in table_schema}, + schema=pyarrow.schema( + _pandas_helpers.bq_to_arrow_field(field) for field in table_schema + ), + ) + + if job_config is not None and job_config.destination is None: + # Assume that the query finishes fast enough for jobless mode. 
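+                # google-cloud-bigquery surfaces job_id=None on the RowIterator
+                # when a query completes without creating a job.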
+ type(rows).job_id = mock.PropertyMock(return_value=None) + + return rows + + bqclient.query.side_effect = query_mock + bqclient.query_and_wait.side_effect = query_and_wait_mock clients_provider = mock.create_autospec(bigframes.session.clients.ClientsProvider) type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient) clients_provider._credentials = credentials - bqoptions = bigframes.BigQueryOptions(credentials=credentials, location=location) + bqoptions = bigframes.BigQueryOptions( + credentials=credentials, + location=location, + ordering_mode=ordering_mode, + ) session = bigframes.Session(context=bqoptions, clients_provider=clients_provider) session._bq_connection_manager = mock.create_autospec( bigframes.clients.BqConnectionManager, instance=True diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index f8dda8da55..5e5de2d0b2 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -20,12 +20,9 @@ import polars import bigframes -import bigframes.clients import bigframes.core.blocks import bigframes.core.compile.polars -import bigframes.core.ordering import bigframes.dataframe -import bigframes.session.clients import bigframes.session.executor import bigframes.session.metrics @@ -35,6 +32,26 @@ class TestExecutor(bigframes.session.executor.Executor): compiler = bigframes.core.compile.polars.PolarsCompiler() + def peek( + self, + array_value: bigframes.core.ArrayValue, + n_rows: int, + use_explicit_destination: Optional[bool] = False, + ): + """ + A 'peek' efficiently accesses a small number of rows in the dataframe. + """ + lazy_frame: polars.LazyFrame = self.compiler.compile(array_value) + pa_table = lazy_frame.collect().limit(n_rows).to_arrow() + # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. + # Nullability may be different, and might use large versions of list, string datatypes. + return bigframes.session.executor.ExecuteResult( + arrow_batches=pa_table.to_batches(), + schema=array_value.schema, + total_bytes=pa_table.nbytes, + total_rows=pa_table.num_rows, + ) + def execute( self, array_value: bigframes.core.ArrayValue, @@ -58,6 +75,14 @@ def execute( total_rows=pa_table.num_rows, ) + def cached( + self, + array_value: bigframes.core.ArrayValue, + *, + config, + ) -> None: + return + class TestSession(bigframes.session.Session): def __init__(self): @@ -92,3 +117,8 @@ def read_pandas(self, pandas_dataframe, write_engine="default"): pandas_dataframe = pandas_dataframe.to_frame() local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) return bigframes.dataframe.DataFrame(local_block) + + @property + def bqclient(self): + # prevents logger from trying to call bq upon any errors + return None diff --git a/bigframes/version.py b/bigframes/version.py index 6cc3d952ed..e41364d4d1 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.5.0" +__version__ = "2.6.0" # {x-release-please-start-date} -__release_date__ = "2025-05-30" +__release_date__ = "2025-06-09" # {x-release-please-end} diff --git a/noxfile.py b/noxfile.py index 297e8f9d6f..dee5f929b7 100644 --- a/noxfile.py +++ b/noxfile.py @@ -79,7 +79,7 @@ UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = ["tests"] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.12": ["polars", "scikit-learn"], + "3.12": ["tests", "polars", "scikit-learn"], } # 3.10 is needed for Windows tests as it is the only version installed in the @@ -108,8 +108,8 @@ SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { "3.9": ["tests"], "3.10": ["tests"], - "3.12": ["tests", "scikit-learn"], - "3.13": ["tests"], + "3.12": ["tests", "scikit-learn", "polars"], + "3.13": ["tests", "polars"], } LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -202,14 +202,11 @@ def install_unittest_dependencies(session, install_test_extra, *constraints): if UNIT_TEST_LOCAL_DEPENDENCIES: session.install(*UNIT_TEST_LOCAL_DEPENDENCIES, *constraints) - if install_test_extra and UNIT_TEST_EXTRAS_BY_PYTHON: - extras = UNIT_TEST_EXTRAS_BY_PYTHON.get(session.python, []) - if install_test_extra and UNIT_TEST_EXTRAS: - extras = UNIT_TEST_EXTRAS - else: - extras = [] - - if extras: + if install_test_extra: + if session.python in UNIT_TEST_EXTRAS_BY_PYTHON: + extras = UNIT_TEST_EXTRAS_BY_PYTHON[session.python] + else: + extras = UNIT_TEST_EXTRAS session.install("-e", f".[{','.join(extras)}]", *constraints) else: session.install("-e", ".", *constraints) diff --git a/scripts/data/audio/audio_LJ001-0010.wav b/scripts/data/audio/audio_LJ001-0010.wav new file mode 100644 index 0000000000..01a2e68829 Binary files /dev/null and b/scripts/data/audio/audio_LJ001-0010.wav differ diff --git a/scripts/data/pdfs/pdfs_sample-local-pdf.pdf b/scripts/data/pdfs/pdfs_sample-local-pdf.pdf new file mode 100644 index 0000000000..d162cd6877 Binary files /dev/null and b/scripts/data/pdfs/pdfs_sample-local-pdf.pdf differ diff --git a/scripts/data/pdfs/test-protected.pdf b/scripts/data/pdfs/test-protected.pdf new file mode 100644 index 0000000000..0d8cd28baa Binary files /dev/null and b/scripts/data/pdfs/test-protected.pdf differ diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 824e774dbe..a4bab1bcfe 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -1521,3 +1521,17 @@ def pdf_mm_df( pdf_gcs_path, session: bigframes.Session, bq_connection: str ) -> bpd.DataFrame: return session.from_glob_path(pdf_gcs_path, name="pdf", connection=bq_connection) + + +@pytest.fixture(scope="session") +def audio_gcs_path() -> str: + return "gs://bigframes_blob_test/audio/*" + + +@pytest.fixture(scope="session") +def audio_mm_df( + audio_gcs_path, session: bigframes.Session, bq_connection: str +) -> bpd.DataFrame: + return session.from_glob_path( + audio_gcs_path, name="audio", connection=bq_connection + ) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 3ebded3d29..4a95e4c6d1 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -385,3 +385,54 @@ def test_blob_pdf_chunk( check_dtype=False, check_index=False, ) + + +@pytest.mark.parametrize( + "model_name, verbose", + [ + ("gemini-2.0-flash-001", True), + ("gemini-2.0-flash-001", False), + ("gemini-2.0-flash-lite-001", True), + ("gemini-2.0-flash-lite-001", False), + ], +) +def test_blob_transcribe( + audio_mm_df: 
bpd.DataFrame, + model_name: str, + verbose: bool, +): + actual = ( + audio_mm_df["audio"] + .blob.audio_transcribe( + model_name=model_name, + verbose=verbose, + ) + .to_pandas() + ) + + # check relative length + expected_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress" + expected_len = len(expected_text) + + actual_text = "" + if verbose: + actual_text = actual[0]["content"] + else: + actual_text = actual[0] + actual_len = len(actual_text) + + relative_length_tolerance = 0.2 + min_acceptable_len = expected_len * (1 - relative_length_tolerance) + max_acceptable_len = expected_len * (1 + relative_length_tolerance) + assert min_acceptable_len <= actual_len <= max_acceptable_len, ( + f"Item (verbose={verbose}): Transcribed text length {actual_len} is outside the acceptable range " + f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. " + f"Expected reference length was {expected_len}. " + ) + + # check for major keywords + major_keywords = ["book", "picture"] + for keyword in major_keywords: + assert ( + keyword.lower() in actual_text.lower() + ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in transcribed text. " diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index 1b1d3a3376..c0716220b1 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -398,6 +398,33 @@ def test_map_invalid_model_raise_error(): ) +def test_classify(gemini_flash_model, session): + df = dataframe.DataFrame(data={"creature": ["dog", "rose"]}, session=session) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df.ai.classify( + "{creature}", + gemini_flash_model, + labels=["animal", "plant"], + output_column="result", + ).to_pandas() + + expected_result = pd.DataFrame( + { + "creature": ["dog", "rose"], + "result": ["animal", "plant"], + } + ) + pandas.testing.assert_frame_equal( + actual_result, expected_result, check_index_type=False, check_dtype=False + ) + + @pytest.mark.parametrize( "instruction", [ diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py index be517fb5cc..f888fd0364 100644 --- a/tests/system/small/bigquery/test_geo.py +++ b/tests/system/small/bigquery/test_geo.py @@ -19,10 +19,14 @@ from shapely.geometry import ( # type: ignore GeometryCollection, LineString, + MultiLineString, + MultiPoint, + MultiPolygon, Point, Polygon, ) +from bigframes.bigquery import st_length import bigframes.bigquery as bbq import bigframes.geopandas @@ -59,6 +63,66 @@ def test_geo_st_area(): ) +# Expected length for 1 degree of longitude at the equator is approx 111195.079734 meters +DEG_LNG_EQUATOR_METERS = 111195.07973400292 + + +def test_st_length_various_geometries(session): + input_geometries = [ + Point(0, 0), + LineString([(0, 0), (1, 0)]), + Polygon([(0, 0), (1, 0), (0, 1), (0, 0)]), + MultiPoint([Point(0, 0), Point(1, 1)]), + MultiLineString([LineString([(0, 0), (1, 0)]), LineString([(0, 0), (0, 1)])]), + MultiPolygon( + [ + Polygon([(0, 0), (1, 0), (0, 1), (0, 0)]), + Polygon([(2, 2), (3, 2), (2, 3), (2, 2)]), + ] + ), + GeometryCollection([Point(0, 0), LineString([(0, 0), (1, 0)])]), + GeometryCollection([]), + None, # Represents NULL geography input + GeometryCollection([Point(1, 1), Point(2, 2)]), + ] + geoseries = bigframes.geopandas.GeoSeries(input_geometries, session=session) + + expected_lengths = pd.Series( + [ 
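+            # Values are meters: ST_LENGTH with use_spheroid=False measures on
+            # a sphere, where 1 degree of longitude at the equator is about
+            # DEG_LNG_EQUATOR_METERS.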
+ 0.0, # Point + DEG_LNG_EQUATOR_METERS, # LineString + 0.0, # Polygon + 0.0, # MultiPoint + 2 * DEG_LNG_EQUATOR_METERS, # MultiLineString + 0.0, # MultiPolygon + DEG_LNG_EQUATOR_METERS, # GeometryCollection (Point + LineString) + 0.0, # Empty GeometryCollection + pd.NA, # None input for ST_LENGTH(NULL) is NULL + 0.0, # GeometryCollection (Point + Point) + ], + index=pd.Index(range(10), dtype="Int64"), + dtype="Float64", + ) + + # Test default use_spheroid + result_default = st_length(geoseries).to_pandas() + pd.testing.assert_series_equal( + result_default, + expected_lengths, + rtol=1e-3, + atol=1e-3, # For comparisons involving 0.0 + ) # type: ignore + + # Test explicit use_spheroid=False + result_explicit_false = st_length(geoseries, use_spheroid=False).to_pandas() + pd.testing.assert_series_equal( + result_explicit_false, + expected_lengths, + rtol=1e-3, + atol=1e-3, # For comparisons involving 0.0 + ) # type: ignore + + def test_geo_st_difference_with_geometry_objects(): data1 = [ Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), @@ -354,3 +418,40 @@ def test_geo_st_intersection_with_similar_geometry_objects(): check_exact=False, rtol=0.1, ) + + +def test_geo_st_isclosed(): + bf_gs = bigframes.geopandas.GeoSeries( + [ + Point(0, 0), # Point + LineString([(0, 0), (1, 1)]), # Open LineString + LineString([(0, 0), (1, 1), (0, 1), (0, 0)]), # Closed LineString + Polygon([(0, 0), (1, 1), (0, 1)]), # Open polygon + GeometryCollection(), # Empty GeometryCollection + bigframes.geopandas.GeoSeries.from_wkt(["GEOMETRYCOLLECTION EMPTY"]).iloc[ + 0 + ], # Also empty + None, # Should be filtered out by dropna + ], + index=[0, 1, 2, 3, 4, 5, 6], + ) + bf_result = bbq.st_isclosed(bf_gs).to_pandas() + + # Expected results based on ST_ISCLOSED documentation: + expected_data = [ + True, # Point: True + False, # Open LineString: False + True, # Closed LineString: True + False, # Polygon: False (only True if it's a full polygon) + False, # Empty GeometryCollection: False (An empty GEOGRAPHY isn't closed) + False, # GEOMETRYCOLLECTION EMPTY: False + None, + ] + expected_series = pd.Series(data=expected_data, dtype="boolean") + + pd.testing.assert_series_equal( + bf_result, + expected_series, + # We default to Int64 (nullable) dtype, but pandas defaults to int64 index. + check_index_type=False, + ) diff --git a/tests/system/small/engines/__init__.py b/tests/system/small/engines/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/engines/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/engines/conftest.py b/tests/system/small/engines/conftest.py new file mode 100644 index 0000000000..2a72cb2196 --- /dev/null +++ b/tests/system/small/engines/conftest.py @@ -0,0 +1,81 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+from typing import Generator
+
+from google.cloud import bigquery
+import pandas as pd
+import pytest
+
+import bigframes
+from bigframes.core import local_data
+from bigframes.session import (
+    direct_gbq_execution,
+    local_scan_executor,
+    polars_executor,
+    semi_executor,
+)
+
+CURRENT_DIR = pathlib.Path(__file__).parent
+DATA_DIR = CURRENT_DIR.parent.parent.parent / "data"
+
+
+@pytest.fixture(scope="module")
+def fake_session() -> Generator[bigframes.Session, None, None]:
+    import bigframes.core.global_session
+
+    # It's a "polars session", but we are bypassing session-provided execution.
+    # We just want a minimal placeholder session without expensive setup.
+    from bigframes.testing import polars_session
+
+    session = polars_session.TestSession()
+    with bigframes.core.global_session._GlobalSessionContext(session):
+        yield session
+
+
+@pytest.fixture(scope="session", params=["pyarrow", "polars", "bq"])
+def engine(request, bigquery_client: bigquery.Client) -> semi_executor.SemiExecutor:
+    if request.param == "pyarrow":
+        return local_scan_executor.LocalScanExecutor()
+    if request.param == "polars":
+        return polars_executor.PolarsExecutor()
+    if request.param == "bq":
+        return direct_gbq_execution.DirectGbqExecutor(bigquery_client)
+    raise ValueError(f"Unrecognized param: {request.param}")
+
+
+@pytest.fixture(scope="module")
+def managed_data_source(
+    scalars_pandas_df_index: pd.DataFrame,
+) -> local_data.ManagedArrowTable:
+    return local_data.ManagedArrowTable.from_pandas(scalars_pandas_df_index)
+
+
+@pytest.fixture(scope="module")
+def zero_row_source() -> local_data.ManagedArrowTable:
+    return local_data.ManagedArrowTable.from_pandas(pd.DataFrame({"a": [], "b": []}))
+
+
+@pytest.fixture(scope="module")
+def nested_data_source(
+    nested_pandas_df: pd.DataFrame,
+) -> local_data.ManagedArrowTable:
+    return local_data.ManagedArrowTable.from_pandas(nested_pandas_df)
+
+
+@pytest.fixture(scope="module")
+def repeated_data_source(
+    repeated_pandas_df: pd.DataFrame,
+) -> local_data.ManagedArrowTable:
+    return local_data.ManagedArrowTable.from_pandas(repeated_pandas_df)
diff --git a/tests/system/small/engines/test_read_local.py b/tests/system/small/engines/test_read_local.py
new file mode 100644
index 0000000000..7bf1316a44
--- /dev/null
+++ b/tests/system/small/engines/test_read_local.py
@@ -0,0 +1,132 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import bigframes
+from bigframes.core import identifiers, local_data, nodes
+from bigframes.session import polars_executor, semi_executor
+
+pytest.importorskip("polars")
+
+# Polars is used as the reference since it's fast and local. Generally though,
+# prefer the gbq engine where they disagree.
+REFERENCE_ENGINE = polars_executor.PolarsExecutor()
+
+
+def ensure_equivalence(
+    node: nodes.BigFrameNode,
+    engine1: semi_executor.SemiExecutor,
+    engine2: semi_executor.SemiExecutor,
+):
+    e1_result = engine1.execute(node, ordered=True)
+    e2_result = engine2.execute(node, ordered=True)
+    assert e1_result is not None
+    assert e2_result is not None
+    # Schemas might have extra nullity markers; normalize to the node's
+    # expected schema, which should be looser.
+    e1_table = e1_result.to_arrow_table().cast(node.schema.to_pyarrow())
+    e2_table = e2_result.to_arrow_table().cast(node.schema.to_pyarrow())
+    assert e1_table.equals(e2_table), f"{e1_table} is not equal to {e2_table}"
+
+
+def test_engines_read_local(
+    fake_session: bigframes.Session,
+    managed_data_source: local_data.ManagedArrowTable,
+    engine,
+):
+    scan_list = nodes.ScanList.from_items(
+        nodes.ScanItem(identifiers.ColumnId(item.column), item.dtype, item.column)
+        for item in managed_data_source.schema.items
+    )
+    local_node = nodes.ReadLocalNode(
+        managed_data_source, scan_list, fake_session, offsets_col=None
+    )
+    ensure_equivalence(local_node, REFERENCE_ENGINE, engine)
+
+
+def test_engines_read_local_w_offsets(
+    fake_session: bigframes.Session,
+    managed_data_source: local_data.ManagedArrowTable,
+    engine,
+):
+    scan_list = nodes.ScanList.from_items(
+        nodes.ScanItem(identifiers.ColumnId(item.column), item.dtype, item.column)
+        for item in managed_data_source.schema.items
+    )
+    local_node = nodes.ReadLocalNode(
+        managed_data_source,
+        scan_list,
+        fake_session,
+        offsets_col=identifiers.ColumnId("offsets"),
+    )
+    ensure_equivalence(local_node, REFERENCE_ENGINE, engine)
+
+
+def test_engines_read_local_w_col_subset(
+    fake_session: bigframes.Session,
+    managed_data_source: local_data.ManagedArrowTable,
+    engine,
+):
+    scan_list = nodes.ScanList.from_items(
+        nodes.ScanItem(identifiers.ColumnId(item.column), item.dtype, item.column)
+        for item in managed_data_source.schema.items[::-2]
+    )
+    local_node = nodes.ReadLocalNode(
+        managed_data_source, scan_list, fake_session, offsets_col=None
+    )
+    ensure_equivalence(local_node, REFERENCE_ENGINE, engine)
+
+
+def test_engines_read_local_w_zero_row_source(
+    fake_session: bigframes.Session,
+    zero_row_source: local_data.ManagedArrowTable,
+    engine,
+):
+    scan_list = nodes.ScanList.from_items(
+        nodes.ScanItem(identifiers.ColumnId(item.column), item.dtype, item.column)
+        for item in zero_row_source.schema.items
+    )
+    local_node = nodes.ReadLocalNode(
+        zero_row_source, scan_list, fake_session, offsets_col=None
+    )
+    ensure_equivalence(local_node, REFERENCE_ENGINE, engine)
+
+
+def test_engines_read_local_w_nested_source(
+    fake_session: bigframes.Session,
+    nested_data_source: local_data.ManagedArrowTable,
+    engine,
+):
+    scan_list = nodes.ScanList.from_items(
+        nodes.ScanItem(identifiers.ColumnId(item.column), item.dtype, item.column)
+        for item in nested_data_source.schema.items
+    )
+    local_node = nodes.ReadLocalNode(
+        nested_data_source, scan_list, fake_session, offsets_col=None
+    )
+    ensure_equivalence(local_node, REFERENCE_ENGINE, engine)
+
+
+def test_engines_read_local_w_repeated_source(
+    fake_session: bigframes.Session,
+    repeated_data_source: local_data.ManagedArrowTable,
+    
engine, +): + scan_list = nodes.ScanList.from_items( + nodes.ScanItem(identifiers.ColumnId(item.column), item.dtype, item.column) + for item in repeated_data_source.schema.items + ) + local_node = nodes.ReadLocalNode( + repeated_data_source, scan_list, fake_session, offsets_col=None + ) + ensure_equivalence(local_node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 51e0459014..7fc7caf2fc 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -764,6 +764,11 @@ def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_un """ ), job_config=bigquery.QueryJobConfig(), + location=None, + project=None, + timeout=None, + metrics=None, + query_with_job=True, ) func = session.read_gbq_function(routine_id_unique) @@ -797,6 +802,11 @@ def test_read_gbq_function_runs_existing_udf_2_params_array_output( """ ), job_config=bigquery.QueryJobConfig(), + location=None, + project=None, + timeout=None, + metrics=None, + query_with_job=True, ) func = session.read_gbq_function(routine_id_unique) @@ -832,6 +842,11 @@ def test_read_gbq_function_runs_existing_udf_4_params_array_output( """ ), job_config=bigquery.QueryJobConfig(), + location=None, + project=None, + timeout=None, + metrics=None, + query_with_job=True, ) func = session.read_gbq_function(routine_id_unique) diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index ae99fd6fc2..36dd070ef5 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -96,6 +96,17 @@ def test_geo_area_not_supported(): bf_series.area +def test_geoseries_length_property_not_implemented(session): + gs = bigframes.geopandas.GeoSeries([Point(0, 0)], session=session) + with pytest.raises( + NotImplementedError, + match=re.escape( + "GeoSeries.length is not yet implemented. Please use bigframes.bigquery.st_length(geoseries) instead." 
+ ), + ): + _ = gs.length + + def test_geo_distance_not_supported(): s1 = bigframes.pandas.Series( [ diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 25d411bef8..83aca8b5b1 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -108,6 +108,65 @@ def test_map(session): ) +def test_classify(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + { + "result": ["A", "B"], + "full_response": _create_dummy_full_response(2), + }, + session=session, + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.classify( + "classify {col}", model=model, labels=["A", "B"] + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame( + {"col": ["A", "B"], "result": ["A", "B"]}, dtype=dtypes.STRING_DTYPE + ), + check_index_type=False, + ) + + +@pytest.mark.parametrize( + "labels", + [ + pytest.param([], id="empty-label"), + pytest.param(["A", "A", "B"], id="duplicate-labels"), + ], +) +def test_classify_invalid_labels_raise_error(session, labels): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + { + "result": ["A", "B"], + "full_response": _create_dummy_full_response(2), + }, + session=session, + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ), pytest.raises(ValueError): + df.ai.classify("classify {col}", model=model, labels=labels) + + def test_join(session): left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session) right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 032d93c19d..8801faf657 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -325,6 +325,11 @@ def test_isalpha(weird_strings, weird_strings_pd): ) +@pytest.mark.skipif( + "dev" in pa.__version__, + # b/333484335 pyarrow is inconsistent on the behavior + reason="pyarrow dev version is inconsistent on isdigit behavior.", +) def test_isdigit(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.isdigit() bf_result = weird_strings.str.isdigit().to_pandas() diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index a821901e4c..0992a10055 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -19,18 +19,22 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): + # This query should return enough results to be too big to fit in a single + # page from jobs.query. 
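+    # (The LIMIT was removed and state, gender, and year were added to the
+    # GROUP BY so the result spans multiple pages; the loop below checks for
+    # full 100-row batches.)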
executions_before_sql = maybe_ordered_session._metrics.execution_count df = maybe_ordered_session._read_gbq_colab( """ SELECT name, + state, + gender, + year, SUM(number) AS total FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE state LIKE 'W%' - GROUP BY name + GROUP BY name, state, gender, year ORDER BY total DESC - LIMIT 300 """ ) executions_before_python = maybe_ordered_session._metrics.execution_count @@ -39,12 +43,17 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi ) executions_after = maybe_ordered_session._metrics.execution_count - total_rows = 0 + num_batches = 0 for batch in batches: assert batch["total"].is_monotonic_decreasing - total_rows += len(batch.index) + assert len(batch.index) == 100 + num_batches += 1 + + # Only test the first few pages to avoid downloading unnecessary data + # and so we can confirm we have full pages in each batch. + if num_batches >= 3: + break - assert total_rows > 0 assert executions_after == executions_before_python == executions_before_sql + 1 @@ -103,6 +112,9 @@ def test_read_gbq_colab_includes_formatted_scalars(session): # This is not a supported type, but ignored if not referenced. "some_object": object(), } + + # This query should return few enough results to be small enough to fit in a + # single page from jobs.query. df = session._read_gbq_colab( """ SELECT {some_integer} as some_integer, @@ -124,6 +136,7 @@ def test_read_gbq_colab_includes_formatted_scalars(session): "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"), } ), + check_index_type=False, # int64 vs Int64 ) @@ -152,4 +165,8 @@ def test_read_gbq_colab_includes_formatted_bigframes_dataframe( .assign(int64_col=scalars_pandas_df_index["int64_too"]) .reset_index(drop=False)[["int64_col", "rowindex"]] ) - pandas.testing.assert_frame_equal(result, expected) + pandas.testing.assert_frame_equal( + result, + expected, + check_index_type=False, # int64 vs Int64 + ) diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 97f44694b0..1ba8ed7e09 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -70,7 +70,7 @@ def test_session_query_job(bq_cmek, session_with_bq_cmek): if not bq_cmek: # pragma: NO COVER pytest.skip("no cmek set for testing") # pragma: NO COVER - _, query_job = session_with_bq_cmek._loader._start_query( + query_job = session_with_bq_cmek._loader._start_query_with_job( "SELECT 123", job_config=bigquery.QueryJobConfig(use_query_cache=False) ) query_job.result() diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 9f45c8465b..7643f5701b 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import re + import numpy import pandas as pd import pytest @@ -375,7 +377,7 @@ def test_index_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep): ) -def test_index_isin(scalars_df_index, scalars_pandas_df_index): +def test_index_isin_list(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" bf_series = ( scalars_df_index.set_index(col_name).index.isin([2, 55555, 4]).to_pandas() @@ -389,6 +391,38 @@ def test_index_isin(scalars_df_index, scalars_pandas_df_index): ) +def test_index_isin_bf_series(scalars_df_index, scalars_pandas_df_index, session): + col_name = "int64_col" + bf_series = ( + scalars_df_index.set_index(col_name) + .index.isin(bpd.Series([2, 55555, 4], session=session)) + .to_pandas() + ) + pd_result_array = scalars_pandas_df_index.set_index(col_name).index.isin( + [2, 55555, 4] + ) + pd.testing.assert_index_equal( + pd.Index(pd_result_array).set_names(col_name), + bf_series, + ) + + +def test_index_isin_bf_index(scalars_df_index, scalars_pandas_df_index, session): + col_name = "int64_col" + bf_series = ( + scalars_df_index.set_index(col_name) + .index.isin(bpd.Index([2, 55555, 4], session=session)) + .to_pandas() + ) + pd_result_array = scalars_pandas_df_index.set_index(col_name).index.isin( + [2, 55555, 4] + ) + pd.testing.assert_index_equal( + pd.Index(pd_result_array).set_names(col_name), + bf_series, + ) + + def test_multiindex_name_is_none(session): df = pd.DataFrame( { @@ -426,3 +460,42 @@ def test_multiindex_repr_includes_all_names(session): ) index = session.read_pandas(df).set_index(["A", "B"]).index assert "names=['A', 'B']" in repr(index) + + +def test_index_item(session): + # Test with a single item + bf_idx_single = bpd.Index([42], session=session) + pd_idx_single = pd.Index([42]) + assert bf_idx_single.item() == pd_idx_single.item() + + +def test_index_item_with_multiple(session): + # Test with multiple items + bf_idx_multiple = bpd.Index([1, 2, 3], session=session) + pd_idx_multiple = pd.Index([1, 2, 3]) + + try: + pd_idx_multiple.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_idx_multiple.item() + + +def test_index_item_with_empty(session): + # Test with an empty Index + bf_idx_empty = bpd.Index([], dtype="Int64", session=session) + pd_idx_empty: pd.Index = pd.Index([], dtype="Int64") + + try: + pd_idx_empty.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_idx_empty.item() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 710e1481be..10671720af 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -629,6 +629,18 @@ def test_series_replace_list_scalar(scalars_dfs): ) +def test_series_replace_nans_with_pd_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].replace({pd.NA: "UNKNOWN"}).to_pandas() + pd_result = scalars_pandas_df[col_name].replace({pd.NA: "UNKNOWN"}) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + @pytest.mark.parametrize( ("replacement_dict",), ( @@ -1368,6 +1380,24 @@ def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session): ) +def test_isin_bigframes_index(scalars_dfs, session): + scalars_df, 
scalars_pandas_df = scalars_dfs + bf_result = ( + scalars_df["string_col"] + .isin(bigframes.pandas.Index(["Hello, World!", "Hi", "こんにちは"], session=session)) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df["string_col"] + .isin(pd.Index(["Hello, World!", "Hi", "こんにちは"])) + .astype("boolean") + ) + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + @pytest.mark.parametrize( ( "col_name", @@ -4255,13 +4285,16 @@ def test_apply_lambda(scalars_dfs, col, lambda_): bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() pd_col = scalars_pandas_df[col] - if pd.__version__.startswith("2.2"): + if pd.__version__[:3] in ("2.2", "2.3"): pd_result = pd_col.apply(lambda_, by_row=False) else: pd_result = pd_col.apply(lambda_) # ignore dtype check, which are Int64 and object respectively - assert_series_equal(bf_result, pd_result, check_dtype=False) + # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" + assert_series_equal( + bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + ) @pytest.mark.parametrize( @@ -4345,13 +4378,16 @@ def foo(x): pd_col = scalars_pandas_df["int64_col"] - if pd.__version__.startswith("2.2"): + if pd.__version__[:3] in ("2.2", "2.3"): pd_result = pd_col.apply(foo, by_row=False) else: pd_result = pd_col.apply(foo) # ignore dtype check, which are Int64 and object respectively - assert_series_equal(bf_result, pd_result, check_dtype=False) + # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" + assert_series_equal( + bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + ) @pytest.mark.parametrize( @@ -4606,3 +4642,42 @@ def test_series_to_pandas_dry_run(scalars_df_index): assert isinstance(result, pd.Series) assert len(result) > 0 + + +def test_series_item(session): + # Test with a single item + bf_s_single = bigframes.pandas.Series([42], session=session) + pd_s_single = pd.Series([42]) + assert bf_s_single.item() == pd_s_single.item() + + +def test_series_item_with_multiple(session): + # Test with multiple items + bf_s_multiple = bigframes.pandas.Series([1, 2, 3], session=session) + pd_s_multiple = pd.Series([1, 2, 3]) + + try: + pd_s_multiple.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_s_multiple.item() + + +def test_series_item_with_empty(session): + # Test with an empty Series + bf_s_empty = bigframes.pandas.Series([], dtype="Int64", session=session) + pd_s_empty = pd.Series([], dtype="Int64") + + try: + pd_s_empty.item() + except ValueError as e: + expected_message = str(e) + else: + raise AssertionError("Expected ValueError from pandas, but didn't get one") + + with pytest.raises(ValueError, match=re.escape(expected_message)): + bf_s_empty.item() diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 6e68a759b4..9febb0da42 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1320,10 +1320,6 @@ def test_read_csv_for_names_less_than_columns(session, df_and_gcs_csv_for_two_co assert bf_df.shape == pd_df.shape assert bf_df.columns.tolist() == pd_df.columns.tolist() - # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs - # (b/280889935) or guarantee row ordering. 
- bf_df = bf_df.sort_index() - # Pandas's index name is None, while BigFrames's index name is "rowindex". pd_df.index.name = "rowindex" pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) @@ -1479,41 +1475,70 @@ def test_read_csv_for_gcs_file_w_header(session, df_and_gcs_csv, header): def test_read_csv_w_usecols(session, df_and_local_csv): # Compares results for pandas and bigframes engines scalars_df, path = df_and_local_csv + usecols = ["rowindex", "bool_col"] with open(path, "rb") as buffer: bf_df = session.read_csv( buffer, engine="bigquery", - usecols=["bool_col"], + usecols=usecols, ) with open(path, "rb") as buffer: # Convert default pandas dtypes to match BigQuery DataFrames dtypes. pd_df = session.read_csv( buffer, - usecols=["bool_col"], + usecols=usecols, dtype=scalars_df[["bool_col"]].dtypes.to_dict(), ) - # Cannot compare two dataframe due to b/408499371. - assert len(bf_df.columns) == 1 - assert len(pd_df.columns) == 1 + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index("rowindex").sort_index() + pd_df = pd_df.set_index("rowindex") + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) -@pytest.mark.parametrize( - "engine", - [ - pytest.param("bigquery", id="bq_engine"), - pytest.param(None, id="default_engine"), - ], -) -def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine): - with tempfile.TemporaryDirectory() as dir: - path = dir + "/test_read_csv_local_w_usecols.csv" - # Using the pandas to_csv method because the BQ one does not support local write. - scalars_pandas_df_index.to_csv(path, index=False) - # df should only have 1 column which is bool_col. - df = session.read_csv(path, usecols=["bool_col"], engine=engine) - assert len(df.columns) == 1 +def test_read_csv_w_usecols_and_indexcol(session, df_and_local_csv): + # Compares results for pandas and bigframes engines + scalars_df, path = df_and_local_csv + usecols = ["rowindex", "bool_col"] + with open(path, "rb") as buffer: + bf_df = session.read_csv( + buffer, + engine="bigquery", + usecols=usecols, + index_col="rowindex", + ) + with open(path, "rb") as buffer: + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + pd_df = session.read_csv( + buffer, + usecols=usecols, + index_col="rowindex", + dtype=scalars_df[["bool_col"]].dtypes.to_dict(), + ) + + assert bf_df.shape == pd_df.shape + assert bf_df.columns.tolist() == pd_df.columns.tolist() + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + + +def test_read_csv_w_indexcol_not_in_usecols(session, df_and_local_csv): + _, path = df_and_local_csv + with open(path, "rb") as buffer: + with pytest.raises( + ValueError, + match=re.escape("The specified index column(s) were not found"), + ): + session.read_csv( + buffer, + engine="bigquery", + usecols=["bool_col"], + index_col="rowindex", + ) @pytest.mark.parametrize( @@ -1553,9 +1578,6 @@ def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index): bf_df = session.read_csv( path, engine="bigquery", index_col="rowindex", encoding="ISO-8859-1" ) - # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs - # (b/280889935) or guarantee row ordering. 
- bf_df = bf_df.sort_index() pd.testing.assert_frame_equal( bf_df.to_pandas(), penguins_pandas_df_default_index ) diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index d7e47b6032..a34f3526d6 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -7,7 +7,7 @@ WITH `bfcte_0` AS ( CAST(b'Hello, World!' AS BYTES), CAST('2021-07-21' AS DATE), CAST('2021-07-21T11:39:45' AS DATETIME), - ST_GEOGFROMTEXT('POINT (-122.0838511 37.3860517)'), + ST_GEOGFROMTEXT('POINT(-122.0838511 37.3860517)'), 123456789, 0, CAST(1.234567890 AS NUMERIC), @@ -24,7 +24,7 @@ WITH `bfcte_0` AS ( CAST(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf' AS BYTES), CAST('1991-02-03' AS DATE), CAST('1991-01-02T03:45:06' AS DATETIME), - ST_GEOGFROMTEXT('POINT (-71.104 42.315)'), + ST_GEOGFROMTEXT('POINT(-71.104 42.315)'), -987654321, 1, CAST(1.234567890 AS NUMERIC), @@ -41,7 +41,7 @@ WITH `bfcte_0` AS ( CAST(b'\xc2\xa1Hola Mundo!' AS BYTES), CAST('2023-03-01' AS DATE), CAST('2023-03-01T10:55:13' AS DATETIME), - ST_GEOGFROMTEXT('POINT (-0.124474760143016 51.5007826749545)'), + ST_GEOGFROMTEXT('POINT(-0.124474760143016 51.5007826749545)'), 314159, 0, CAST(101.101010100 AS NUMERIC), @@ -109,7 +109,7 @@ WITH `bfcte_0` AS ( CAST(b'Hello\tBigFrames!\x07' AS BYTES), CAST('2023-05-23' AS DATE), CAST('2023-05-23T11:37:01' AS DATETIME), - ST_GEOGFROMTEXT('LINESTRING (-0.127959 51.507728, -0.127026 51.507473)'), + ST_GEOGFROMTEXT('LINESTRING(-0.127959 51.507728, -0.127026 51.507473)'), 101202303, 2, CAST(-10.090807000 AS NUMERIC), diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index e5e2c58d59..cfee5ea98d 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -14,7 +14,7 @@ import datetime import re -from typing import Iterable +from typing import Iterable, Optional from unittest import mock import google.cloud.bigquery as bigquery @@ -203,7 +203,7 @@ def test_add_and_trim_labels_length_limit_met(): [(None, None), (30.0, "test_api")], ) def test_start_query_with_client_labels_length_limit_met( - mock_bq_client, timeout, api_name + mock_bq_client: bigquery.Client, timeout: Optional[float], api_name ): sql = "select * from abc" cur_labels = { @@ -229,8 +229,12 @@ def test_start_query_with_client_labels_length_limit_met( io_bq.start_query_with_client( mock_bq_client, sql, - job_config, + job_config=job_config, + location=None, + project=None, timeout=timeout, + metrics=None, + query_with_job=True, ) assert job_config.labels is not None diff --git a/tests/unit/session/test_local_scan_executor.py b/tests/unit/session/test_local_scan_executor.py new file mode 100644 index 0000000000..30b1b5f78d --- /dev/null +++ b/tests/unit/session/test_local_scan_executor.py @@ -0,0 +1,105 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import pyarrow +import pytest + +from bigframes import dtypes +from bigframes.core import identifiers, local_data, nodes +from bigframes.session import local_scan_executor +from bigframes.testing import mocks + + +@pytest.fixture +def object_under_test(): + return local_scan_executor.LocalScanExecutor() + + +def create_read_local_node(arrow_table: pyarrow.Table): + session = mocks.create_bigquery_session() + local_data_source = local_data.ManagedArrowTable.from_pyarrow(arrow_table) + return nodes.ReadLocalNode( + local_data_source=local_data_source, + session=session, + scan_list=nodes.ScanList( + items=tuple( + nodes.ScanItem( + id=identifiers.ColumnId(column_name), + dtype=dtypes.arrow_dtype_to_bigframes_dtype( + arrow_table.field(column_name).type + ), + source_id=column_name, + ) + for column_name in arrow_table.column_names + ), + ), + ) + + +@pytest.mark.parametrize( + ("start", "stop", "expected_rows"), + ( + # No-op slices. + (None, None, 10), + (0, None, 10), + (None, 10, 10), + # Slices equivalent to limits. + (None, 7, 7), + (0, 3, 3), + ), +) +def test_local_scan_executor_with_slice(start, stop, expected_rows, object_under_test): + pyarrow_table = pyarrow.Table.from_pydict( + { + "rowindex": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "letters": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + } + ) + assert pyarrow_table.num_rows == 10 + + local_node = create_read_local_node(pyarrow_table) + plan = nodes.SliceNode( + child=local_node, + start=start, + stop=stop, + ) + + result = object_under_test.execute(plan, ordered=True) + result_table = pyarrow.Table.from_batches(result.arrow_batches) + assert result_table.num_rows == expected_rows + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + ( + (-1, None, 1), + (None, -1, 1), + (None, None, 2), + (None, None, -1), + (4, None, 6), + (1, 9, 8), + ), +) +def test_local_scan_executor_with_slice_unsupported_inputs( + start, stop, step, object_under_test +): + local_node = create_read_local_node(pyarrow.Table.from_pydict({"col": [1, 2, 3]})) + plan = nodes.SliceNode( + child=local_node, + start=start, + stop=stop, + step=step, + ) + assert object_under_test.execute(plan, ordered=True) is None diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index cffc6b3af7..c4635f85a9 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -80,3 +80,19 @@ def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): assert config.dry_run assert query.strip() == expected.strip() + + +def test_read_gbq_colab_doesnt_set_destination_table(): + """For best performance, we don't try to workaround the 10 GB query results limitation.""" + session = mocks.create_bigquery_session() + + _ = session._read_gbq_colab("SELECT 'my-test-query';") + queries = session._queries # type: ignore + configs = session._job_configs # type: ignore + + for query, config in zip(queries, configs): + if query == "SELECT 'my-test-query';" and not config.dry_run: + break + + assert query == "SELECT 'my-test-query';" + 
assert config.destination is None diff --git a/tests/unit/session/test_read_gbq_query.py b/tests/unit/session/test_read_gbq_query.py new file mode 100644 index 0000000000..afd9922426 --- /dev/null +++ b/tests/unit/session/test_read_gbq_query.py @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for read_gbq_query functions.""" + +from bigframes.testing import mocks + + +def test_read_gbq_query_sets_destination_table(): + """Workaround the 10 GB query results limitation by setting a destination table. + + See internal issue b/303057336. + """ + # Use partial ordering mode to skip column uniqueness checks. + session = mocks.create_bigquery_session(ordering_mode="partial") + + _ = session.read_gbq_query("SELECT 'my-test-query';") + queries = session._queries # type: ignore + configs = session._job_configs # type: ignore + + for query, config in zip(queries, configs): + if query == "SELECT 'my-test-query';" and not config.dry_run: + break + + assert query == "SELECT 'my-test-query';" + assert config.destination is not None diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 6a4ae7cb60..0c67e05813 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -81,14 +81,17 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte } bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" - bqclient.get_table.return_value = table + session = mocks.create_bigquery_session( + bqclient=bqclient, table_schema=table.schema + ) + # Mock bqclient _after_ creating session to override its mocks. + bqclient.get_table.return_value = table + bqclient.query_and_wait.side_effect = None bqclient.query_and_wait.return_value = ( {"total_count": 3, "distinct_count": 3 if values_distinct else 2}, ) - session = mocks.create_bigquery_session( - bqclient=bqclient, table_schema=table.schema - ) + table._properties["location"] = session._location result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index cbd31f588a..26b74a3f8a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -273,7 +273,11 @@ def test_default_index_warning_raised_by_read_gbq(table): bqclient.project = "test-project" bqclient.get_table.return_value = table bqclient.query_and_wait.return_value = ({"total_count": 3, "distinct_count": 2},) - session = mocks.create_bigquery_session(bqclient=bqclient) + session = mocks.create_bigquery_session( + bqclient=bqclient, + # DefaultIndexWarning is only relevant for strict mode. 
+ ordering_mode="strict", + ) table._properties["location"] = session._location with pytest.warns(bigframes.exceptions.DefaultIndexWarning): @@ -296,7 +300,11 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 bqclient.project = "test-project" bqclient.get_table.return_value = table bqclient.query_and_wait.return_value = ({"total_count": 4, "distinct_count": 3},) - session = mocks.create_bigquery_session(bqclient=bqclient) + session = mocks.create_bigquery_session( + bqclient=bqclient, + # DefaultIndexWarning is only relevant for strict mode. + ordering_mode="strict", + ) table._properties["location"] = session._location # No warnings raised because we set the option allowing the default indexes. @@ -344,7 +352,10 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_columns( {"total_count": total_count, "distinct_count": distinct_count}, ) session = mocks.create_bigquery_session( - bqclient=bqclient, table_schema=table.schema + bqclient=bqclient, + table_schema=table.schema, + # DefaultIndexWarning is only relevant for strict mode. + ordering_mode="strict", ) table._properties["location"] = session._location @@ -386,7 +397,10 @@ def test_default_index_warning_not_raised_by_read_gbq_primary_key(table): bqclient.project = "test-project" bqclient.get_table.return_value = table session = mocks.create_bigquery_session( - bqclient=bqclient, table_schema=table.schema + bqclient=bqclient, + table_schema=table.schema, + # DefaultIndexWarning is only relevant for strict mode. + ordering_mode="strict", ) table._properties["location"] = session._location diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py new file mode 100644 index 0000000000..2bda563418 --- /dev/null +++ b/tests/unit/test_dataframe_polars.py @@ -0,0 +1,4422 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
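+
+# These tests exercise the DataFrame API against the local polars-backed
+# TestSession (see the module-scoped `session` fixture below), comparing
+# results with pandas without issuing any BigQuery queries.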
+ +import io +import operator +import pathlib +import tempfile +import typing +from typing import Generator, List, Tuple + +import numpy as np +import pandas as pd +import pandas.testing +import pytest + +import bigframes +import bigframes._config.display_options as display_options +import bigframes.core.indexes as bf_indexes +import bigframes.dataframe as dataframe +import bigframes.pandas as bpd +import bigframes.series as series +from tests.system.utils import ( + assert_dfs_equivalent, + assert_pandas_df_equal, + assert_series_equal, + assert_series_equivalent, + convert_pandas_dtypes, +) + +pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="2.0.0") + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent / "data" + + +@pytest.fixture(scope="module", autouse=True) +def session() -> Generator[bigframes.Session, None, None]: + import bigframes.core.global_session + from bigframes.testing import polars_session + + session = polars_session.TestSession() + with bigframes.core.global_session._GlobalSessionContext(session): + yield session + + +@pytest.fixture(scope="module") +def scalars_pandas_df_index() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + df.index.name = None + return df.set_index("rowindex").sort_index() + + +@pytest.fixture(scope="module") +def scalars_df_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_df_2_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_dfs( + scalars_df_index, + scalars_pandas_df_index, +): + return scalars_df_index, scalars_pandas_df_index + + +def test_df_construct_copy(scalars_dfs): + columns = ["int64_col", "string_col", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_pandas_default(scalars_dfs): + # This should trigger the inlined codepath + columns = [ + "int64_too", + "int64_col", + "float64_col", + "bool_col", + "string_col", + "date_col", + "datetime_col", + "numeric_col", + "float64_col", + "time_col", + "timestamp_col", + ] + _, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() + pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_structs(session): + pd_frame = pd.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ] + ).to_frame() + bf_series = session.read_pandas(pd_frame) + pd.testing.assert_frame_equal( + bf_series.to_pandas(), pd_frame, check_index_type=False, check_dtype=False + ) + + +def test_df_construct_pandas_set_dtype(scalars_dfs): + columns = [ + "int64_too", + "int64_col", + "float64_col", + 
"bool_col", + ] + _, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame( + scalars_pandas_df, columns=columns, dtype="Float64" + ).to_pandas() + pd_result = pd.DataFrame(scalars_pandas_df, columns=columns, dtype="Float64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_from_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame( + {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, + dtype="string[pyarrow]", + ) + pd_result = pd.DataFrame( + {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, + dtype="string[pyarrow]", + ) + assert_dfs_equivalent(pd_result, bf_result) + + +def test_df_construct_from_dict(): + input_dict = { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. b/296751058 + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + bf_result = dataframe.DataFrame(input_dict).to_pandas() + pd_result = pd.DataFrame(input_dict) + + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_construct_dtype(): + data = { + "int_col": [1, 2, 3], + "string_col": ["1.1", "2.0", "3.5"], + "float_col": [1.0, 2.0, 3.0], + } + dtype = pd.StringDtype(storage="pyarrow") + bf_result = dataframe.DataFrame(data, dtype=dtype) + pd_result = pd.DataFrame(data, dtype=dtype) + pd_result.index = pd_result.index.astype("Int64") + pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + series = scalars_df[col_name] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df[col_name] + assert_series_equal(bf_result, pd_result) + + +def test_get_column_nonstring(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + "row_slice", + [ + (slice(1, 7, 2)), + (slice(1, 7, None)), + (slice(None, -3, None)), + ], +) +def test_get_rows_with_slice(scalars_dfs, row_slice): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[row_slice].to_pandas() + pd_result = scalars_pandas_df[row_slice] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_hasattr(scalars_dfs): + scalars_df, _ = scalars_dfs + assert hasattr(scalars_df, "int64_col") + assert hasattr(scalars_df, "head") + assert not hasattr(scalars_df, "not_exist") + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_head_with_custom_column_labels( + scalars_df_index, scalars_pandas_df_index, ordered +): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) + bf_result = bf_df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).tail(3) + bf_result = 
bf_df.to_pandas() + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).tail(3) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_get_column_by_attr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.int64_col + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.int64_col + assert_series_equal(bf_result, pd_result) + + +def test_get_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["bool_col", "float64_col", "int64_col"] + df_subset = scalars_df.get(col_names) + df_pandas = df_subset.to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df[col_names].columns + ) + + +def test_get_columns_default(scalars_dfs): + scalars_df, _ = scalars_dfs + col_names = ["not", "column", "names"] + result = scalars_df.get(col_names, "default_val") + assert result == "default_val" + + +@pytest.mark.parametrize( + ("loc", "column", "value", "allow_duplicates"), + [ + (0, 666, 2, False), + (5, "float64_col", 2.2, True), + (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), + pytest.param( + 14, + "test", + 2, + False, + marks=pytest.mark.xfail( + raises=IndexError, + ), + ), + pytest.param( + 12, + "int64_col", + 2, + False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], +) +def test_insert(scalars_dfs, loc, column, value, allow_duplicates): + scalars_df, scalars_pandas_df = scalars_dfs + # insert works inplace, so will influence other tests. + # make a copy to avoid inplace changes. + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.insert(loc, column, value, allow_duplicates) + pd_df.insert(loc, column, value, allow_duplicates) + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) + + +def test_where_series_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is dataframe, other is None (as default). + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + bf_result = scalars_df_index.where(cond_bf).to_pandas() + pd_result = scalars_pandas_df_index.where(cond_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + + bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] + bf_result = bf_df.mask(cond_bf, bf_df + 1).to_pandas() + pd_result = pd_df.mask(cond_pd, pd_df + 1) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_multi_index(scalars_df_index, scalars_pandas_df_index): + # Test when a dataframe has multi-index or multi-columns. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + + dataframe_bf.columns = pd.MultiIndex.from_tuples( + [("str1", 1), ("str2", 2)], names=["STR", "INT"] + ) + cond_bf = dataframe_bf["str1"] > 0 + + with pytest.raises(NotImplementedError) as context: + dataframe_bf.where(cond_bf).to_pandas() + assert ( + str(context.value) + == "The dataframe.where() method does not support multi-index and/or multi-column." + ) + + +def test_where_series_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a constant. 
+ columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + dataframe_bf.columns.name = "test_name" + dataframe_pd.columns.name = "test_name" + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other = 0 + + bf_result = dataframe_bf.where(cond_bf, other).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_cond_dataframe_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a dataframe. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other_bf = -dataframe_bf + other_pd = -dataframe_pd + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is None. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + + bf_result = dataframe_bf.where(cond_bf, None).to_pandas() + pd_result = dataframe_pd.where(cond_pd, None) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = 10 + other_pd = 10 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_dataframe_other( + scalars_df_index, scalars_pandas_df_index +): + # Condition is a dataframe, other is a dataframe. 
+ columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = dataframe_bf * 2 + other_pd = dataframe_pd * 2 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_drop_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + df_pandas = scalars_df.drop(columns=col_name).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns + ) + + +def test_drop_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "geography_col", "time_col"] + df_pandas = scalars_df.drop(columns=col_names).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns + ) + + +def test_drop_labels_axis_1(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + labels = ["int64_col", "geography_col", "time_col"] + + pd_result = scalars_pandas_df.drop(labels=labels, axis=1) + bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_with_custom_column_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + dropped_columns = [ + "言語列", + "timestamp_col", + ] + bf_df = scalars_df.rename(columns=rename_mapping).drop(columns=dropped_columns) + bf_result = bf_df.to_pandas() + pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( + columns=dropped_columns + ) + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_memory_usage(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.memory_usage() + bf_result = scalars_df.memory_usage() + + pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) + + +def test_df_info(scalars_dfs): + expected = ( + "\n" + "Index: 9 entries, 0 to 8\n" + "Data columns (total 13 columns):\n" + " # Column Non-Null Count Dtype\n" + "--- ------------- ---------------- ------------------------------\n" + " 0 bool_col 8 non-null boolean\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" + " 2 date_col 7 non-null date32[day][pyarrow]\n" + " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" + " 4 geography_col 4 non-null geometry\n" + " 5 int64_col 8 non-null Int64\n" + " 6 int64_too 9 non-null Int64\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" + " 8 float64_col 7 non-null Float64\n" + " 9 rowindex_2 9 non-null Int64\n" + " 10 string_col 8 non-null string\n" + " 11 time_col 6 non-null time64[us][pyarrow]\n" + " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 1269 bytes\n" + ) + + scalars_df, _ = scalars_dfs + bf_result = io.StringIO() + + scalars_df.info(buf=bf_result) + + assert expected == bf_result.getvalue() + + +@pytest.mark.parametrize( + ("include", "exclude"), + [ + ("Int64", None), + (["int"], None), + ("number", None), + ([pd.Int64Dtype(), pd.BooleanDtype()], None), + (None, [pd.Int64Dtype(), pd.BooleanDtype()]), + 
("Int64", ["boolean"]), + ], +) +def test_select_dtypes(scalars_dfs, include, exclude): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) + bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) + bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_pandas_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_df.loc[[4, 1, 2]].index + drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index("bytes_col") + scalars_pandas_df = scalars_pandas_df.set_index("bytes_col") + drop_index = scalars_df.iloc[[3, 5]].index + drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_multiindex(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + sub_df = scalars_df.iloc[[4, 1, 2]] + sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]] + sub_df = sub_df.set_index(["bytes_col", "numeric_col"]) + sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"]) + drop_index = sub_df.index + drop_pandas_index = sub_pandas_df.index + + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_labels_axis_0(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) + bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index_and_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2], columns="int64_col") + bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": 1.2345} + 
df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_peek(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_with_large_results_not_allowed(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3, force=False) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_force_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_df_peek_reset_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = ( + scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) + ) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_repr_w_all_rows(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + # Remove columns with flaky formatting, like NUMERIC columns (which use the + # object dtype). Also makes a copy so that mutating the index name doesn't + # break other tests. + scalars_df = scalars_df.drop(columns=["numeric_col"]) + scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) + + # When there are 10 or fewer rows, the outputs should be identical. + actual = repr(scalars_df.head(10)) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df.head(10)) + + assert actual == expected + + +def test_join_repr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_df = ( + scalars_df[["int64_col"]] + .join(scalars_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + scalars_pandas_df = ( + scalars_pandas_df[["int64_col"]] + .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly + scalars_pandas_df.index.name = None + + actual = repr(scalars_df) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df) + + assert actual == expected + + +def test_repr_html_w_all_rows(scalars_dfs, session): + scalars_df, _ = scalars_dfs + # get a pandas df of the expected format + df, _ = scalars_df._block.to_pandas() + pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) + pandas_df.index.name = scalars_df.index.name + + # When there are 10 or fewer rows, the outputs should be identical except for the extra note. 
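+    # (The extra note is the "[N rows x M columns in total]" suffix appended
+    # to the pandas HTML repr when building `expected` below.)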
+ actual = scalars_df.head(10)._repr_html_() + + with display_options.pandas_repr(bigframes.options.display): + pandas_repr = pandas_df.head(10)._repr_html_() + + expected = ( + pandas_repr + + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" + ) + assert actual == expected + + +def test_df_column_name_with_space(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": "bool col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_get_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + + bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() + pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@pytest.mark.parametrize( + ("indices", "axis"), + [ + ([1, 3, 5], 0), + ([2, 4, 6], 1), + ([1, -3, -5, -6], "index"), + ([-2, -4, -6], "columns"), + ], +) +def test_take_df(scalars_dfs, indices, axis): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices, axis=axis).to_pandas() + pd_result = scalars_pandas_df.take(indices, axis=axis) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_filter_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_bool_series = scalars_df["bool_col"] + bf_result = scalars_df[bf_bool_series].to_pandas() + + pd_bool_series = scalars_pandas_df["bool_col"] + pd_result = scalars_pandas_df[pd_bool_series] + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_new_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"new_col": 2} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_new_column_w_loc(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[:, "new_col"] = 2 + pd_df.loc[:, "new_col"] = 2 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("scalar",), + [ + (2.1,), + (None,), + ], +) +def test_assign_new_column_w_setitem(scalars_dfs, scalar): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = scalar + pd_df["new_col"] = scalar + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_dataframe(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["int64_col"] = bf_df["int64_too"].to_frame() + pd_df["int64_col"] = pd_df["int64_too"].to_frame() + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_df["int64_col"] = pd_df["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + bf_df["impossible_col"] = bf_df[["int64_too", "string_col"]] + with pytest.raises(ValueError): + pd_df["impossible_col"] = pd_df[["int64_too", "string_col"]] + + +def test_assign_new_column_w_setitem_list(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + # set the custom index + pd_df = pd_df.set_index(["string_col", "int64_col"]) + bf_df = bf_df.set_index(["string_col", "int64_col"]) + + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 + with pytest.raises(ValueError): + bf_df["new_col"] = [1, 2, 3] + + +def test_assign_existing_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_listlike_to_empty_df(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + bf_result = empty_df.assign(new_col=[1, 2, 3]) + pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) + + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + + +def test_assign_to_empty_df_multiindex_error(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + empty_df["empty_col_1"] = typing.cast(series.Series, []) + empty_df["empty_col_2"] = typing.cast(series.Series, []) + empty_pandas_df["empty_col_1"] = [] + empty_pandas_df["empty_col_2"] = [] + empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) + empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) + + with pytest.raises(ValueError): + empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + with pytest.raises(ValueError): + empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_assign_series(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(new_col=scalars_df[column_name]) + bf_result = df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_assign_series_overwrite(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(**{column_name: scalars_df[column_name] + 3}) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign( + **{column_name: scalars_pandas_df[column_name] + 3} + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_sequential(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2, "new_col": 3, "new_col2": 4} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +# Require an index so that the self-join is consistent each time. 
+def test_assign_same_table_different_index_performs_self_join( + scalars_df_index, scalars_pandas_df_index +): + column_name = "int64_col" + bf_df = scalars_df_index.assign( + alternative_index=scalars_df_index["rowindex_2"] + 2 + ) + pd_df = scalars_pandas_df_index.assign( + alternative_index=scalars_pandas_df_index["rowindex_2"] + 2 + ) + bf_df_2 = bf_df.set_index("alternative_index") + pd_df_2 = pd_df.set_index("alternative_index") + bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).to_pandas() + pd_result = pd_df.assign(new_col=pd_df_2[column_name] * 10) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +# Different table expression must have Index +def test_assign_different_df( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + column_name = "int64_col" + df = scalars_df_index.assign(new_col=scalars_df_2_index[column_name]) + bf_result = df.to_pandas() + # Doesn't matter to pandas if it comes from the same DF or a different DF. + pd_result = scalars_pandas_df_index.assign( + new_col=scalars_pandas_df_index[column_name] + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_different_df_w_loc( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df.loc[:, "int64_col"] = bf_df2.loc[:, "int64_col"] + 1 + pd_df.loc[:, "int64_col"] = pd_df.loc[:, "int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_different_df_w_setitem( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df["int64_col"] = bf_df2["int64_col"] + 1 + pd_df["int64_col"] = pd_df["int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_callable_lambda(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"new_col": lambda x: x["int64_col"] + x["int64_too"]} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("axis", "how", "ignore_index", "subset"), + [ + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), + ], +) +def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_dropna_range_columns(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df.columns = pandas.RangeIndex(0, len(scalars_df.columns)) + scalars_pandas_df.columns = pandas.RangeIndex(0, len(scalars_pandas_df.columns)) + + df = scalars_df.dropna() + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_interpolate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + bf_result = scalars_df[columns].interpolate().to_pandas() + # Pandas can only interpolate on "float64" columns + # https://github.com/pandas-dev/pandas/issues/40252 + pd_result = scalars_pandas_df[columns].astype("float64").interpolate() + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "col, fill_value", + [ + (["int64_col", "float64_col"], 3), + (["string_col"], "A"), + (["datetime_col"], pd.Timestamp("2023-01-01")), + ], +) +def test_df_fillna(scalars_dfs, col, fill_value): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col].fillna(fill_value).to_pandas() + pd_result = scalars_pandas_df[col].fillna(fill_value) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_ffill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_bfill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].bfill() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_series_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0): + return series**2 + (arg1 * arg2 % 4) + (kwarg1 * kwarg2 % 7) + + bf_result = ( + scalars_df_index[columns] + .apply(foo, args=(33, 
61), kwarg1=52, kwarg2=21) + .to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply( + foo, args=(33, 61), kwarg1=52, kwarg2=21 + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_listlike_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = ( + scalars_df_index[columns].apply(lambda x: [len(x), x.min(), 24]).to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: [len(x), x.min(), 24]) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result.index = pd_result.index.astype("Int64") + pd_result = pd_result.astype("Int64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_scalar_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = scalars_df_index[columns].apply(lambda x: x.sum()) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: x.sum()) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + +def test_df_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_keys( + scalars_df_index, + scalars_pandas_df_index, +): + pandas.testing.assert_index_equal( + scalars_df_index.keys(), scalars_pandas_df_index.keys() + ) + + +def test_df_iter( + scalars_df_index, + scalars_pandas_df_index, +): + for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index): + assert bf_i == df_i + + +def test_iterrows( + scalars_df_index, + scalars_pandas_df_index, +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df_index = scalars_df_index.add_suffix("_suffix", axis=1) + scalars_pandas_df_index = scalars_pandas_df_index.add_suffix("_suffix", axis=1) + for (bf_index, bf_series), (pd_index, pd_series) in zip( + scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows() + ): + assert bf_index == pd_index + pandas.testing.assert_series_equal(bf_series, pd_series) + + +@pytest.mark.parametrize( + ( + "index", + "name", + ), + [ + ( + True, + "my_df", + ), + (False, None), + ], +) +def test_itertuples(scalars_df_index, index, name): + # Numeric has slightly different representation as a result of conversions. + bf_tuples = scalars_df_index.itertuples(index, name) + pd_tuples = scalars_df_index.to_pandas().itertuples(index, name) + for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples): + assert bf_tuple == pd_tuple + + +def test_df_cross_merge(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + bf_result = left.merge(right, "cross").to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "cross", + ) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_df_merge(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + on = "rowindex_2" + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. + right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]), + # Polars engine is currently strict on join key types + # (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]), + ], +) +def test_df_merge_multi_key(scalars_dfs, left_on, right_on): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
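+ # With the shift, a few rowindex_2 values exist on only one side, so the + # outer join below has unmatched rows to fill with NULLs.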
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "outer", + left_on=left_on, + right_on=right_on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_custom_col_name(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col"] + right_columns = ["int64_col", "bool_col", "string_col"] + on = "int64_col" + rename_columns = {"float64_col": "f64_col"} + + left = scalars_df[left_columns] + left = left.rename(columns=rename_columns) + right = scalars_df[right_columns] + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pandas_left_df = scalars_pandas_df[left_columns] + pandas_left_df = pandas_left_df.rename(columns=rename_columns) + pandas_right_df = scalars_pandas_df[right_columns] + pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_left_on_right_on(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "int64_too"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + right = scalars_df[right_columns] + + df = left.merge( + right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True + ) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns], + merge_how, + left_on="int64_too", + right_on="rowindex_2", + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +def test_shape(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.shape + pd_result = scalars_pandas_df.shape + + assert bf_result == pd_result + + +def test_len(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = len(scalars_df) + pd_result = len(scalars_pandas_df) + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("n_rows",), + [ + (50,), + (10000,), + ], +) +def test_df_len_local(session, n_rows): + assert ( + len( + session.read_pandas( + pd.DataFrame(np.random.randint(1, 7, n_rows), columns=["one"]), + ) + ) + == n_rows + ) + + +def test_size(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.size + pd_result = scalars_pandas_df.size + + assert bf_result == pd_result + + +def test_ndim(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.ndim + pd_result = scalars_pandas_df.ndim + + assert bf_result == pd_result + + +def test_empty_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.empty + pd_result = scalars_pandas_df.empty + + assert bf_result == pd_result + + +def test_empty_true_column_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = 
scalars_df[[]].empty + pd_result = scalars_pandas_df[[]].empty + + assert bf_result == pd_result + + +def test_empty_true_row_filter(scalars_dfs: Tuple[dataframe.DataFrame, pd.DataFrame]): + scalars_df, scalars_pandas_df = scalars_dfs + bf_bool: series.Series = typing.cast(series.Series, scalars_df["bool_col"]) + pd_bool: pd.Series = scalars_pandas_df["bool_col"] + bf_false = bf_bool.notna() & (bf_bool != bf_bool) + pd_false = pd_bool.notna() & (pd_bool != pd_bool) + + bf_result = scalars_df[bf_false].empty + pd_result = scalars_pandas_df[pd_false].empty + + assert pd_result + assert bf_result == pd_result + + +def test_empty_true_memtable(session: bigframes.Session): + bf_df = dataframe.DataFrame(session=session) + pd_df = pd.DataFrame() + + bf_result = bf_df.empty + pd_result = pd_df.empty + + assert pd_result + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("drop",), + ((True,), (False,)), +) +def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): + df = scalars_df_index.reset_index(drop=drop) + assert df.index.name is None + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=drop) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_then_filter( + scalars_df_index, + scalars_pandas_df_index, +): + bf_filter = scalars_df_index["bool_col"].fillna(True) + bf_df = scalars_df_index.reset_index()[bf_filter] + bf_result = bf_df.to_pandas() + pd_filter = scalars_pandas_df_index["bool_col"].fillna(True) + pd_result = scalars_pandas_df_index.reset_index()[pd_filter] + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering, and the index + # keys will have gaps post-filter. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_index( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.reset_index(drop=False) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "index". + assert df.columns[0] == "index" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. 
+ pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_multiindex( + scalars_df_index, + scalars_pandas_df_index, +): + bf_df = dataframe.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + pd_df = pd.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + + bf_df = bf_df.reset_index() + pd_df = pd_df.reset_index() + + assert pd_df.columns[0] == "level_0" + assert bf_df.columns[0] == "level_0" + assert pd_df.columns[1] == "level_1" + assert bf_df.columns[1] == "level_1" + + +def test_reset_index_with_unnamed_index_and_index_column( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.assign(index=scalars_df_index["int64_col"]).reset_index( + drop=False + ) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "level_0" if the "index" column already exists. + assert df.columns[0] == "level_0" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.assign( + index=scalars_pandas_df_index["int64_col"] + ).reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("drop",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("append",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("index_column",), + (("int64_too",), ("string_col",), ("timestamp_col",)), +) +def test_set_index(scalars_dfs, index_column, drop, append): + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column, append=append, drop=drop) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.set_index(index_column, append=append, drop=drop) + + # Sort to disambiguate when there are duplicate index labels. + # Note: Doesn't use assert_pandas_df_equal_ignore_ordering because we get + # "ValueError: 'timestamp_col' is both an index level and a column label, + # which is ambiguous" when trying to sort by a column with the same name as + # the index. 
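+ # Sorting on "rowindex_2", which is never one of the parametrized index + # columns, keeps the ordering deterministic without triggering that ambiguity.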
+ bf_result = bf_result.sort_values("rowindex_2") + pd_result = pd_result.sort_values("rowindex_2") + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_set_index_key_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + with pytest.raises(KeyError): + scalars_pandas_df.set_index(["not_a_col"]) + with pytest.raises(KeyError): + scalars_df.set_index(["not_a_col"]) + + +@pytest.mark.parametrize( + ("ascending",), + ((True,), (False,)), +) +@pytest.mark.parametrize( + ("na_position",), + (("first",), ("last",)), +) +def test_sort_index(scalars_dfs, ascending, na_position): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column) + bf_result = df.sort_index(ascending=ascending, na_position=na_position).to_pandas() + pd_result = scalars_pandas_df.set_index(index_column).sort_index( + ascending=ascending, na_position=na_position + ) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_dataframe_sort_index_inplace(scalars_dfs): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.copy().set_index(index_column) + df.sort_index(ascending=False, inplace=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_abs(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + + bf_result = scalars_df[columns].abs() + pd_result = scalars_pandas_df[columns].abs() + + assert_dfs_equivalent(pd_result, bf_result) + + +def test_df_pos(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas() + pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]] + + assert_pandas_df_equal(pd_result, bf_result) + + +def test_df_neg(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas() + pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]] + + assert_pandas_df_equal(pd_result, bf_result) + + +def test_df_invert(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "bool_col"] + + bf_result = (~scalars_df[columns]).to_pandas() + pd_result = ~scalars_pandas_df[columns] + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_isnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + columns = ["int64_col", "int64_too", "string_col", "bool_col"] + bf_result = scalars_df[columns].isnull().to_pandas() + pd_result = scalars_pandas_df[columns].isnull() + + # One of the dtype mismatches to be documented. Here, the `bf_result.dtype` is + # `BooleanDtype` but the `pd_result.dtype` is `bool`. + pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) + pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) + pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) + pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_notnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + columns = ["int64_col", "int64_too", "string_col", "bool_col"] + bf_result = scalars_df[columns].notnull().to_pandas() + pd_result = scalars_pandas_df[columns].notnull() + + # One of the dtype mismatches to be documented. 
Here, the `bf_result.dtype` is + # `BooleanDtype` but the `pd_result.dtype` is `bool`. + pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) + pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) + pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) + pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("left_labels", "right_labels", "overwrite", "fill_value"), + [ + (["a", "b", "c"], ["c", "a", "b"], True, None), + (["a", "b", "c"], ["c", "a", "b"], False, None), + (["a", "b", "c"], ["a", "b", "c"], False, 2), + ], + ids=[ + "one_one_match_overwrite", + "one_one_match_no_overwrite", + "exact_match", + ], +) +def test_combine( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, + overwrite, + fill_value, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = bf_df_a.combine( + bf_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a.combine( + pd_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("overwrite", "filter_func"), + [ + (True, None), + (False, None), + (True, lambda x: x.isna() | (x % 2 == 0)), + ], + ids=[ + "default", + "overwritefalse", + "customfilter", + ], +) +def test_df_update(overwrite, filter_func): + if pd.__version__.startswith("1."): + pytest.skip("dtype handled differently in pandas 1.x.") + + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func) + pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func) + + pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) + + +def test_df_idxmin(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmin().to_pandas() + pd_result = pd_df.idxmin() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +def test_df_idxmax(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmax().to_pandas() + pd_result = pd_df.idxmax() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("join", "axis"), + 
[ + ("outer", None), + ("outer", 0), + ("outer", 1), + ("left", 0), + ("right", 1), + ("inner", None), + ("inner", 1), + ], +) +def test_df_align(join, axis): + + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis) + pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) + + # Don't check dtype as pandas does unnecessary float conversion + assert isinstance(bf_result1, dataframe.DataFrame) and isinstance( + bf_result2, dataframe.DataFrame + ) + pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) + pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) + + +def test_combine_first( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns].iloc[0:6] + bf_df_a.columns = ["a", "b", "c"] + bf_df_b = scalars_df_2_index[columns].iloc[2:8] + bf_df_b.columns = ["b", "a", "d"] + bf_result = bf_df_a.combine_first(bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns].iloc[0:6] + pd_df_a.columns = ["a", "b", "c"] + pd_df_b = scalars_pandas_df_index[columns].iloc[2:8] + pd_df_b.columns = ["b", "a", "d"] + pd_result = pd_df_a.combine_first(pd_df_b) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_corr_w_invalid_parameters(scalars_dfs): + columns = ["int64_too", "int64_col", "float64_col"] + scalars_df, _ = scalars_dfs + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(method="kendall") + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(min_periods=1) + + +@pytest.mark.parametrize( + ("columns", "numeric_only"), + [ + (["bool_col", "int64_col", "float64_col"], True), + (["bool_col", "int64_col", "float64_col"], False), + (["bool_col", "int64_col", "float64_col", "string_col"], True), + pytest.param( + ["bool_col", "int64_col", "float64_col", "string_col"], + False, + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + ], +) +def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas() + pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only) + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. 
+ pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=not scalars_df._block.session._strictly_ordered, + ) + + +def test_df_corrwith_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_cols = ["int64_too", "float64_col"] + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_cols]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_cols]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_numeric_only(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + bf_result = ( + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=True).to_pandas() + ) + pd_result = scalars_pandas_df[l_cols].corrwith( + scalars_pandas_df[r_cols], numeric_only=True + ) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_non_numeric_error(scalars_dfs): + scalars_df, _ = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + with pytest.raises(NotImplementedError): + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False) + + +def test_df_corrwith_series(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_col = "float64_col" + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_col]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_col]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. 
+ pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("op"), + [ + operator.add, + operator.sub, + operator.mul, + operator.truediv, + # operator.floordiv, + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ], + ids=[ + "add", + "subtract", + "multiply", + "true_divide", + # "floor_divide", + "eq", + "ne", + "gt", + "ge", + "lt", + "le", + ], +) +# TODO(garrettwu): deal with NA values +@pytest.mark.parametrize(("other_scalar"), [1, 2.5, 0, 0.0]) +@pytest.mark.parametrize(("reverse_operands"), [True, False]) +def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "float64_col"] + + maybe_reversed_op = (lambda x, y: op(y, x)) if reverse_operands else op + + bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() + pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize(("other_scalar"), [1, -2]) +def test_mod(scalars_dfs, other_scalar): + # Zero case excluded as pandas produces 0 result for Int64 inputs rather than NA/NaN. + # This is likely a pandas bug, as mod 0 is undefined in other dtypes and in most programming languages. + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_scalar_binop_str_exception(scalars_dfs): + scalars_df, _ = scalars_dfs + columns = ["string_col"] + with pytest.raises(TypeError, match="Cannot add dtypes"): + (scalars_df[columns] + 1).to_pandas() + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x, y: x.add(y, axis="index")), + (lambda x, y: x.radd(y, axis="index")), + (lambda x, y: x.sub(y, axis="index")), + (lambda x, y: x.rsub(y, axis="index")), + (lambda x, y: x.mul(y, axis="index")), + (lambda x, y: x.rmul(y, axis="index")), + (lambda x, y: x.truediv(y, axis="index")), + (lambda x, y: x.rtruediv(y, axis="index")), + # (lambda x, y: x.floordiv(y, axis="index")), + # (lambda x, y: x.rfloordiv(y, axis="index")), + (lambda x, y: x.gt(y, axis="index")), + (lambda x, y: x.ge(y, axis="index")), + (lambda x, y: x.lt(y, axis="index")), + (lambda x, y: x.le(y, axis="index")), + ], + ids=[ + "add", + "radd", + "sub", + "rsub", + "mul", + "rmul", + "truediv", + "rtruediv", + # "floordiv", + # "rfloordiv", + "gt", + "ge", + "lt", + "le", + ], +) +def test_series_binop_axis_index( + scalars_dfs, + op, +): + scalars_df, scalars_pandas_df = scalars_dfs + df_columns = ["int64_col", "float64_col"] + series_column = "int64_too" + + bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() + pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("input"), + [ + ((1000, 2000, 3000)), + (pd.Index([1000, 2000, 3000])), + (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), + ], + ids=[ + "tuple", + "pd_index", + "pd_series", + ], +) +def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", 
"int64_too"] + + bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() + if hasattr(input, "to_pandas"): + input = input.to_pandas() + pd_result = scalars_pandas_df[df_columns].add(input, axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_reverse_binop_pandas(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + pd_series = pd.Series([100, 200, 300]) + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = pd_series + scalars_df[df_columns].to_pandas() + pd_result = pd_series + scalars_pandas_df[df_columns] + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_listlike_binop_axis_1_bf_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = ( + scalars_df[df_columns] + .add(bf_indexes.Index([1000, 2000, 3000]), axis=1) + .to_pandas() + ) + pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_binop_with_self_aggregate(session, scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_df = scalars_df[df_columns] + bf_result = (bf_df - bf_df.mean()).to_pandas() + + pd_df = scalars_pandas_df[df_columns] + pd_result = pd_df - pd_df.mean() + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("left_labels", "right_labels"), + [ + (["a", "a", "b"], ["c", "c", "d"]), + (["a", "b", "c"], ["c", "a", "b"]), + (["a", "c", "c"], ["c", "a", "c"]), + (["a", "b", "c"], ["a", "b", "c"]), + ], + ids=[ + "no_overlap", + "one_one_match", + "multi_match", + "exact_match", + ], +) +def test_binop_df_df_binary_op( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = (bf_df_a - bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a - pd_df_b + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +# Differnt table will only work for explicit index, since default index orders are arbitrary. 
+@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_series_binop_add_different_table( + scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered +): + df_columns = ["int64_col", "float64_col"] + series_column = "int64_too" + + bf_result = ( + scalars_df_index[df_columns] + .add(scalars_df_2_index[series_column], axis="index") + .to_pandas(ordered=ordered) + ) + pd_result = scalars_pandas_df_index[df_columns].add( + scalars_pandas_df_index[series_column], axis="index" + ) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +# TODO(garrettwu): Test series binop with different index + +all_joins = pytest.mark.parametrize( + ("how",), + (("outer",), ("left",), ("right",), ("inner",), ("cross",)), +) + + +@all_joins +def test_join_same_table(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + if not bf_df._session._strictly_ordered and how == "cross": + pytest.skip("Cross join not supported in partial ordering mode.") + + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_a = bf_df_a.sort_index() + + bf_df_b = bf_df.set_index("int64_too")[["float64_col"]] + bf_df_b = bf_df_b[bf_df_b.float64_col > 0] + bf_df_b = bf_df_b.sort_values("float64_col") + + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + + pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]].sort_index() + pd_df_a = pd_df_a.sort_index() + + pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] + pd_df_b = pd_df_b[pd_df_b.float64_col > 0] + pd_df_b = pd_df_b.sort_values("float64_col") + + pd_result = pd_df_a.join(pd_df_b, how=how) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_join_different_table( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index, how +): + bf_df_a = scalars_df_index[["string_col", "int64_col"]] + bf_df_b = scalars_df_2_index.dropna()[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] + pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): + scalars_df, _ = scalars_dfs + df_a = scalars_df[["string_col", "float64_col"]] + df_b = scalars_df[["float64_col"]] + with pytest.raises(NotImplementedError): + df_a.join(df_b, how="outer").to_pandas() + + +@all_joins +def test_join_param_on(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_df_b = bf_df[["float64_col"]] + + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_df_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_df_join_series(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_series_b = bf_df["float64_col"] + + if how == "cross": + with pytest.raises(ValueError): + 
bf_df_a.join(bf_series_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_series_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_series_b = pd_df["float64_col"] + pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ("int64_col", False, "first"), + (["bool_col", "int64_col"], [False, True], "last"), + (["bool_col", "int64_col"], [True, False], "first"), + ], +) +def test_dataframe_sort_values( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_result = scalars_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ).to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ], +) +def test_dataframe_sort_values_inplace( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_sorted = scalars_df_index.copy() + bf_sorted.sort_values( + by, ascending=ascending, na_position=na_position, inplace=True + ) + bf_result = bf_sorted.to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_dataframe_sort_values_invalid_input(scalars_df_index): + with pytest.raises(KeyError): + scalars_df_index.sort_values(by=scalars_df_index["int64_col"]) + + +def test_dataframe_sort_values_stable(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.sort_values("int64_col", kind="stable") + .sort_values("bool_col", kind="stable") + .to_pandas() + ) + pd_result = scalars_pandas_df_index.sort_values( + "int64_col", kind="stable" + ).sort_values("bool_col", kind="stable") + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("operator", "columns"), + [ + pytest.param(lambda x: x.cumsum(), ["float64_col", "int64_too"]), + # pytest.param(lambda x: x.cumprod(), ["float64_col", "int64_too"]), + pytest.param( + lambda x: x.cumprod(), + ["string_col"], + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "cumsum", + # "cumprod", + "non-numeric", + ], +) +def test_dataframe_numeric_analytic_op( + scalars_df_index, scalars_pandas_df_index, operator, columns +): + # TODO: Add nullable ints (pandas 1.x has poor behavior on these) + bf_series = operator(scalars_df_index[columns]) + pd_series = operator(scalars_pandas_df_index[columns]) + bf_result = bf_series.to_pandas() + pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x: x.cummin()), + (lambda x: x.cummax()), + (lambda x: x.shift(2)), + (lambda x: x.shift(-2)), + ], + ids=[ + "cummin", + "cummax", + "shiftpositive", + "shiftnegative", + ], +) +def test_dataframe_general_analytic_op( + scalars_df_index, scalars_pandas_df_index, operator +): + col_names = ["int64_too", "float64_col", 
"int64_col", "bool_col"] + bf_series = operator(scalars_df_index[col_names]) + pd_series = operator(scalars_pandas_df_index[col_names]) + bf_result = bf_series.to_pandas() + pd.testing.assert_frame_equal( + pd_series, + bf_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].diff(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_dataframe_agg_single_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[numeric_cols].agg("sum").to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg("sum") + + assert bf_result.dtype == "Float64" + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("agg",), + ( + ("sum",), + ("size",), + ), +) +def test_dataframe_agg_int_single_string(scalars_dfs, agg): + numeric_cols = ["int64_col", "int64_too", "bool_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[numeric_cols].agg(agg).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(agg) + + assert bf_result.dtype == "Int64" + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_dataframe_agg_multi_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + aggregations = [ + "sum", + "mean", + "median", + "std", + "var", + "min", + "max", + "nunique", + "count", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg(aggregations) + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Drop median, as it's an approximation. + bf_median = bf_result.loc["median", :] + bf_result = bf_result.drop(labels=["median"]) + pd_result = pd_result.drop(labels=["median"]) + + assert_dfs_equivalent(pd_result, bf_result, check_index_type=False) + + # Double-check that median is at least plausible. 
+ assert ( + (bf_result.loc["min", :] <= bf_median) & (bf_median <= bf_result.loc["max", :]) + ).all() + + +def test_dataframe_agg_int_multi_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "bool_col"] + aggregations = [ + "sum", + "nunique", + "count", + "size", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + for dtype in bf_result.dtypes: + assert dtype == "Int64" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_transpose(): + # Include some floats to ensure type coercion + values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] + # Test complex case of both axes being multi-indices with non-unique elements + + columns: pandas.Index = pd.Index( + ["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow") + ) + columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) + + index: pandas.Index = pd.Index( + ["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow") + ) + rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) + + pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) + bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) + + pd_result = pd_df.T + bf_result = bf_df.T.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +def test_df_transpose_error(): + with pytest.raises(TypeError, match="Cannot coerce.*to a common type."): + dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose() + + +def test_df_transpose_repeated_uses_cache(): + bf_df = dataframe.DataFrame([[1, 2.5], [2, 3.5]]) + pd_df = pandas.DataFrame([[1, 2.5], [2, 3.5]]) + # Transposing many times so that the operation will fail from complexity if not using the cache + for i in range(10): + # Cache still works even with simple scalar binop + bf_df = bf_df.transpose() + i + pd_df = pd_df.transpose() + i + + pd.testing.assert_frame_equal( + pd_df, bf_df.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_df_stack(scalars_dfs): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].stack().to_pandas() + pd_result = scalars_pandas_df[columns].stack(future_stack=True) + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only melt identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, 
pd_result, + check_index_type=False, + check_dtype=False, + ) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_unstack(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = [ + "rowindex_2", + "int64_col", + "int64_too", + ] + + # unstack on mono-index produces series + bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].unstack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) + + +def test_ipython_key_completions_with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert col_names not in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test_ipython_key_completions_with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert "string_col" not in results + assert "a_renamed_column" in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test__dir__with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert col_names not in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. 
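+    # (Unlike _ipython_key_completions_, dir() is expected to include methods
+    # such as to_gbq/merge/drop alongside the column names, as asserted below.)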
+ assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +def test__dir__with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert "string_col" not in results + assert "a_renamed_column" in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. + assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (0, 0, None), + (None, None, None), + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50000000000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50000000000), + ], +) +def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): + bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index.iloc[start:stop:step] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_iloc_slice_zero_step(scalars_df_index): + with pytest.raises(ValueError): + scalars_df_index.iloc[0:0:0] + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +@pytest.mark.parametrize( + "index", + [0, 5, -2, (2,)], +) +def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iloc_tuple(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + "index", + [(slice(None), [1, 2, 3]), (slice(1, 7, 2), [2, 5, 3])], +) +def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index].to_pandas() + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): + index = (2, [2, 1, 3, -4]) + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("index", "error"), + [ + ((1, 1, 1), pd.errors.IndexingError), + (("asd", "asd", "asd"), pd.errors.IndexingError), + (("asd"), TypeError), + ], +) +def test_iloc_tuple_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_df_index.iloc[index] + with pytest.raises(error): + scalars_pandas_df_index.iloc[index] + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iat(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iat[index] + pd_result = 
scalars_pandas_df_index.iat[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index", "error"), + [ + (0, TypeError), + ("asd", ValueError), + ((1, 2, 3), TypeError), + (("asd", "asd"), ValueError), + ], +) +def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_pandas_df_index.iat[index] + with pytest.raises(error): + scalars_df_index.iat[index] + + +def test_iloc_single_integer_out_of_bound_error( + scalars_df_index, scalars_pandas_df_index +): + with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): + scalars_df_index.iloc[99] + + +def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() + pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() + pd_result = scalars_pandas_df_index.loc[:, "int64_col"] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, scalars_df_index.dtypes == "Int64"].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.dtypes == "Int64" + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition_bf_series( + scalars_df_index, scalars_pandas_df_index +): + # (b/347072677) GEOGRAPH type doesn't support DISTINCT op + columns = [ + item for item in scalars_pandas_df_index.columns if item != "geography_col" + ] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + size_half = len(scalars_pandas_df_index) / 2 + bf_result = scalars_df_index.loc[ + :, scalars_df_index.nunique() > size_half + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.nunique() > size_half + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_at_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" 
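+    # With a non-unique index, `.at` returns every matching row as a Series
+    # rather than a scalar, hence the Series comparison below.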
+ bf_result = scalars_df_index.at[index, "int64_too"] + pd_result = scalars_pandas_df_index.at[index, "int64_too"] + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_at_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.at[index, "string_col"] + pd_result = scalars_pandas_df_index.at[index, "string_col"] + assert bf_result == pd_result + + +def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99 + pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99 + + # pandas uses float64 instead + pd_df["new_col"] = pd_df["new_col"].astype("Float64") + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +@pytest.mark.parametrize( + ("col", "value"), + [ + ("string_col", "hello"), + ("int64_col", 3), + ("float64_col", 3.5), + ], +) +def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 1, col] = value + pd_df.loc[pd_df["int64_too"] == 1, col] = value + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +def test_loc_setitem_bool_series_scalar_error(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(Exception): + bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99 + with pytest.raises(Exception): + pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 + + +@pytest.mark.parametrize( + ("col", "op"), + [ + # Int aggregates + pytest.param("int64_col", lambda x: x.sum(), id="int-sum"), + pytest.param("int64_col", lambda x: x.min(), id="int-min"), + pytest.param("int64_col", lambda x: x.max(), id="int-max"), + pytest.param("int64_col", lambda x: x.count(), id="int-count"), + pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"), + # Float aggregates + pytest.param("float64_col", lambda x: x.count(), id="float-count"), + pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"), + # Bool aggregates + pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"), + pytest.param("bool_col", lambda x: x.count(), id="bool-count"), + pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"), + # String aggregates + pytest.param("string_col", lambda x: x.count(), id="string-count"), + pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"), + ], +) +def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "Int64" + # Is otherwise "object" dtype + pd_result.index = pd_result.index.astype("string[pyarrow]") + # Pandas may produce narrower numeric types + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + 
("col", "op"), + [ + pytest.param("bool_col", lambda x: x.min(), id="bool-min"), + pytest.param("bool_col", lambda x: x.max(), id="bool-max"), + ], +) +def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "boolean" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + ("op", "bf_dtype"), + [ + (lambda x: x.sum(numeric_only=True), "Float64"), + (lambda x: x.mean(numeric_only=True), "Float64"), + (lambda x: x.min(numeric_only=True), "Float64"), + (lambda x: x.max(numeric_only=True), "Float64"), + (lambda x: x.std(numeric_only=True), "Float64"), + (lambda x: x.var(numeric_only=True), "Float64"), + (lambda x: x.count(numeric_only=False), "Int64"), + (lambda x: x.nunique(), "Int64"), + ], + ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], +) +def test_dataframe_aggregates(scalars_dfs, op, bf_dtype): + scalars_df_index, scalars_pandas_df_index = scalars_dfs + col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] + bf_series = op(scalars_df_index[col_names]) + bf_result = bf_series + pd_result = op(scalars_pandas_df_index[col_names]) + + # Check dtype separately + assert bf_result.dtype == bf_dtype + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equivalent( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.sum(axis=1, numeric_only=True)), + (lambda x: x.mean(axis=1, numeric_only=True)), + (lambda x: x.min(axis=1, numeric_only=True)), + (lambda x: x.max(axis=1, numeric_only=True)), + (lambda x: x.std(axis=1, numeric_only=True)), + (lambda x: x.var(axis=1, numeric_only=True)), + ], + ids=["sum", "mean", "min", "max", "std", "var"], +) +def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): + col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] + bf_result = op(scalars_df_index[col_names]).to_pandas() + pd_result = op(scalars_pandas_df_index[col_names]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.all(bool_only=True)), + (lambda x: x.any(bool_only=True)), + (lambda x: x.all(axis=1, bool_only=True)), + (lambda x: x.any(axis=1, bool_only=True)), + ], + ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], +) +def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): + # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later + scalars_df_index = scalars_df_index.assign( + bool_col=scalars_df_index.bool_col.fillna(False) + ) + scalars_pandas_df_index = scalars_pandas_df_index.assign( + bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") + ) + bf_series = op(scalars_df_index) + pd_series = op(scalars_pandas_df_index).astype("boolean") + 
bf_result = bf_series.to_pandas() + + pd_series.index = pd_series.index.astype(bf_result.index.dtype) + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col"] + bf_series = scalars_df_index[col_names].prod() + pd_series = scalars_pandas_df_index[col_names].prod() + bf_result = bf_series.to_pandas() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_series = pd_series.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_df_skew_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).skew().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_skew(scalars_dfs, ordered): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + assert_series_equal( + pd_result, bf_result, check_index_type=False, ignore_order=not ordered + ) + + +def test_df_kurt_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).kurt().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_df_kurt(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].kurt().to_pandas() + pd_result = scalars_pandas_df[columns].kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_sample_raises_value_error(scalars_dfs): + scalars_df, _ = scalars_dfs + with pytest.raises( + ValueError, match="Only one of 'n' or 'frac' parameter can be specified." 
+    ):
+        scalars_df.sample(frac=0.5, n=4)
+
+
+@pytest.mark.parametrize(
+    ("axis",),
+    [
+        (None,),
+        (0,),
+        (1,),
+    ],
+)
+def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis):
+    if pd.__version__.startswith("1."):
+        pytest.skip("add_prefix axis parameter not supported in pandas 1.x.")
+    bf_result = scalars_df_index.add_prefix("prefix_", axis).to_pandas()
+
+    pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis)
+
+    pd.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+        check_index_type=False,
+    )
+
+
+@pytest.mark.parametrize(
+    ("axis",),
+    [
+        (0,),
+        (1,),
+    ],
+)
+def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis):
+    if pd.__version__.startswith("1."):
+        pytest.skip("add_suffix axis parameter not supported in pandas 1.x.")
+    bf_result = scalars_df_index.add_suffix("_suffix", axis).to_pandas()
+
+    pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis)
+
+    pd.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+        check_index_type=False,
+    )
+
+
+def test_df_astype_error_error(session):
+    input = pd.DataFrame(["hello", "world", "3.11", "4000"])
+    with pytest.raises(ValueError):
+        session.read_pandas(input).astype("Float64", errors="bad_value")
+
+
+def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index):
+    if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
+        pytest.skip("pandas filter items behavior different pre-2.1")
+    bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas()
+
+    pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"])
+    # Ignore column ordering as pandas orders columns differently depending on version
+    pd.testing.assert_frame_equal(
+        bf_result.sort_index(axis=1),
+        pd_result.sort_index(axis=1),
+    )
+
+
+def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.filter(like="64_col").to_pandas()
+
+    pd_result = scalars_pandas_df_index.filter(like="64_col")
+
+    pd.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+    )
+
+
+def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas()
+
+    pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$")
+
+    pd.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+    )
+
+
+def test_df_reindex_rows_list(scalars_dfs):
+    scalars_df_index, scalars_pandas_df_index = scalars_dfs
+    bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1])
+
+    pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1])
+
+    # Pandas uses int64 instead of Int64 (nullable) dtype.
+    pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+    assert_dfs_equivalent(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.reindex(
+        index=pd.Index([5, 1, 3, 99, 1], name="newname")
+    ).to_pandas()
+
+    pd_result = scalars_pandas_df_index.reindex(
+        index=pd.Index([5, 1, 3, 99, 1], name="newname")
+    )
+
+    # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too").reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_index): + # First, make sure the two dataframes have the same columns in order. + columns = ["int64_col", "int64_too"] + bf = scalars_df_index[columns] + pd_df = scalars_pandas_df_index[columns] + + bf_result = bf.reindex(columns=columns).to_pandas() + pd_result = pd_df.reindex(columns=columns) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index): + unsupported = [ + "geography_col", + ] + scalars_df_index = scalars_df_index.drop(columns=unsupported) + scalars_pandas_df_index = scalars_pandas_df_index.drop(columns=unsupported) + + bf_result = scalars_df_index.equals(scalars_df_index) + pd_result = scalars_pandas_df_index.equals(scalars_pandas_df_index) + + assert pd_result == bf_result + + +def test_df_equals_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col"]].equals(scalars_df_index["int64_col"]) + pd_result = scalars_pandas_df_index[["int64_col"]].equals( + scalars_pandas_df_index["int64_col"] + ) + + assert pd_result == bf_result + + +def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified = bf_modified.astype("Float64") + + pd_modified = scalars_pandas_df_index.copy() + pd_modified = pd_modified.astype("Float64") + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified["int64_col"] = bf_modified.int64_col + 1 + + pd_modified = scalars_pandas_df_index.copy() + pd_modified["int64_col"] = pd_modified.int64_col + 1 + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + more_columns = ["int64_col", "int64_too", "float64_col"] + + bf_result = scalars_df_index[columns].equals(scalars_df_index[more_columns]) + pd_result = scalars_pandas_df_index[columns].equals( + scalars_pandas_df_index[more_columns] + ) + + assert pd_result == bf_result + + +def 
test_df_reindex_like(scalars_df_index, scalars_pandas_df_index):
+    reindex_target_bf = scalars_df_index.reindex(
+        columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
+    )
+    bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas()
+
+    reindex_target_pd = scalars_pandas_df_index.reindex(
+        columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
+    )
+    pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd)
+
+    # Pandas uses int64 instead of Int64 (nullable) dtype.
+    pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+    # Pandas uses float64 as default for newly created empty column, bf uses Float64
+    pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype())
+    pd.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+    )
+
+
+def test_df_values(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.values
+
+    pd_result = scalars_pandas_df_index.values
+    # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe
+    pd.testing.assert_frame_equal(
+        pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False
+    )
+
+
+def test_df_to_numpy(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.to_numpy()
+
+    pd_result = scalars_pandas_df_index.to_numpy()
+    # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe
+    pd.testing.assert_frame_equal(
+        pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False
+    )
+
+
+def test_df___array__(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.__array__()
+
+    pd_result = scalars_pandas_df_index.__array__()
+    # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe
+    pd.testing.assert_frame_equal(
+        pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False
+    )
+
+
+def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index):
+    # swapaxes is implemented in pandas but not in bigframes
+    with pytest.raises(AttributeError):
+        scalars_df_index.swapaxes()
+
+
+def test_df_getattr_attribute_error(scalars_df_index):
+    with pytest.raises(AttributeError):
+        scalars_df_index.not_a_method()
+
+
+def test_df_getattr_axes():
+    df = dataframe.DataFrame(
+        [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"]
+    )
+    assert isinstance(df.index, bigframes.core.indexes.Index)
+    assert isinstance(df.columns, pandas.Index)
+    assert isinstance(df.my_column, series.Series)
+
+
+def test_df_setattr_index():
+    pd_df = pandas.DataFrame(
+        [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"]
+    )
+    bf_df = dataframe.DataFrame(pd_df)
+
+    pd_df.index = pandas.Index([4, 5])
+    bf_df.index = [4, 5]
+
+    assert_pandas_df_equal(
+        pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_setattr_columns():
+    pd_df = pandas.DataFrame(
+        [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"]
+    )
+    bf_df = dataframe.DataFrame(pd_df)
+
+    pd_df.columns = typing.cast(pandas.Index, pandas.Index([4, 5, 6]))
+
+    bf_df.columns = pandas.Index([4, 5, 6])
+
+    assert_pandas_df_equal(
+        pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_setattr_modify_column():
+    pd_df = pandas.DataFrame(
+        [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"]
+    )
+    bf_df = dataframe.DataFrame(pd_df)
+    pd_df.my_column = [4, 5]
+    bf_df.my_column = [4, 5]
+
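+    # As in pandas, attribute-style assignment only modifies a column that
+    # already exists; a brand-new column still needs item assignment, e.g.
+    # (illustrative) `bf_df["new_col"] = [4, 5]`.
+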
assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): + index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[index_list].to_pandas() + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): + index_list = [3, 2, 1, 3, 2, 1] + + bf_result = scalars_df_index.loc[index_list] + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_multiindex(scalars_dfs): + scalars_df_index, scalars_pandas_df_index = scalars_dfs + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] + + bf_result = scalars_df_multiindex.loc[index_list] + pd_result = scalars_pandas_df_multiindex.loc[index_list] + + assert_dfs_equivalent( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list_multiindex(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df.iloc[index_list] + pd_result = scalars_pandas_df.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): + + index_list: List[int] = [] + + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + bf_result = bf_result.to_pandas() + assert bf_result.shape == pd_result.shape # types are known to be different + + +def test_rename_axis(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis("newindexname") + pd_result = scalars_pandas_df_index.rename_axis("newindexname") + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_axis_nonstring(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis((4,)) + pd_result = scalars_pandas_df_index.rename_axis((4,)) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = 
scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[bf_string_series] + pd_result = scalars_pandas_df_index.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + + bf_result = scalars_df_multiindex.loc[bf_string_series] + pd_result = scalars_pandas_df_multiindex.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index_renamed_col( + scalars_df_index, scalars_pandas_df_index +): + scalars_df_index = scalars_df_index.rename(columns={"int64_col": "rename"}) + scalars_pandas_df_index = scalars_pandas_df_index.rename( + columns={"int64_col": "rename"} + ) + + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + ("subset"), + [ + None, + "bool_col", + ["bool_col", "int64_too"], + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + (False,), + ], +) +def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): + columns = ["bool_col", "int64_too", "int64_col"] + bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() + pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) + pd.testing.assert_frame_equal( + pd_df, + bf_df, + ) + + +@pytest.mark.parametrize( + ("subset"), + [ + None, + ["bool_col"], + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + (False,), + ], +) +def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): + columns = ["bool_col", "int64_too", "int64_col"] + bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas() + pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep) + pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) + + +def test_df_from_dict_columns_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="columns") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_index_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict( + data, orient="index", columns=["col1", "col2"] + ).to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + 
)
+
+
+def test_df_from_dict_tight_orient():
+    data = {
+        "index": [("i1", "i2"), ("i3", "i4")],
+        "columns": ["col1", "col2"],
+        "data": [[1, 2.6], [3, 4.5]],
+        "index_names": ["in1", "in2"],
+        "column_names": ["column_axis"],
+    }
+
+    bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas()
+    pd_result = pd.DataFrame.from_dict(data, orient="tight")
+    assert_pandas_df_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_from_records():
+    records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d"))
+
+    bf_result = dataframe.DataFrame.from_records(
+        records, columns=["c1", "c2"]
+    ).to_pandas()
+    pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"])
+    assert_pandas_df_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_to_dict(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+    bf_result = scalars_df_index.drop(columns=unsupported).to_dict()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.to_json()
+    # default_handler for arrow types that have no default conversion
+    pd_result = scalars_pandas_df_index.to_json(default_handler=str)
+
+    assert bf_result == pd_result
+
+
+def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index):
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_json(bf_result_file, orient="table")
+        # default_handler for arrow types that have no default conversion
+        scalars_pandas_df_index.to_json(
+            pd_result_file, orient="table", default_handler=str
+        )
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.to_csv()
+    pd_result = scalars_pandas_df_index.to_csv()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_csv(bf_result_file)
+        scalars_pandas_df_index.to_csv(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index):
+    # GEOGRAPHY not supported in parquet export.
+    unsupported = ["geography_col"]
+
+    bf_result = scalars_df_index.drop(columns=unsupported).to_parquet()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index):
+    # GEOGRAPHY not supported in parquet export.
+    unsupported = ["geography_col"]
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file)
+        scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_records(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]
+    bf_result = scalars_df_index.drop(columns=unsupported).to_records()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records()
+
+    for bfi, pdi in zip(bf_result, pd_result):
+        for bfj, pdj in zip(bfi, pdi):
+            assert pd.isna(bfj) and pd.isna(pdj) or bfj == pdj
+
+
+def test_df_to_string(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+
+    bf_result = scalars_df_index.drop(columns=unsupported).to_string()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_html(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+
+    bf_result = scalars_df_index.drop(columns=unsupported).to_html()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index):
+    # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231
+    bf_result = scalars_df_index.dropna().to_markdown()
+    pd_result = scalars_pandas_df_index.dropna().to_markdown()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_pickle(bf_result_file)
+        scalars_pandas_df_index.to_pickle(pd_result_file)
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_orc(scalars_df_index, scalars_pandas_df_index):
+    unsupported = [
+        "numeric_col",
+        "bytes_col",
+        "date_col",
+        "datetime_col",
+        "time_col",
+        "timestamp_col",
+        "geography_col",
+    ]
+
+    bf_result_file = tempfile.TemporaryFile()
+    pd_result_file = tempfile.TemporaryFile()
+    scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file)
+    scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc(
+        pd_result_file
+    )
+    bf_result = bf_result_file.read()
+    pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("new_col = int64_col + int64_too",),
+        ("new_col = (rowindex > 3) | bool_col",),
+        ("int64_too = bool_col\nnew_col2 = rowindex",),
+    ],
+)
+def test_df_eval(scalars_dfs, expr):
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df.eval(expr).to_pandas()
+    pd_result = scalars_pandas_df.eval(expr)
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("int64_col > int64_too",),
+        ("bool_col",),
+        ("((int64_col - int64_too) % @local_var) == 0",),
+    ],
+)
+def test_df_query(scalars_dfs, expr):
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    # local_var is referenced in expressions
+    local_var = 3  # NOQA
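+    # In query expressions, `@local_var` resolves to the Python variable of
+    # that name from the enclosing scope (as in pandas.DataFrame.query), so
+    # `((int64_col - int64_too) % @local_var) == 0` tests divisibility by 3.
+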
scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.query(expr).to_pandas() + pd_result = scalars_pandas_df.query(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("subset", "normalize", "ascending", "dropna"), + [ + (None, False, False, False), + (None, True, True, True), + ("bool_col", True, False, True), + ], +) +def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = ( + scalars_df[["string_col", "bool_col"]] + .value_counts(subset, normalize=normalize, ascending=ascending, dropna=dropna) + .to_pandas() + ) + pd_result = scalars_pandas_df[["string_col", "bool_col"]].value_counts( + subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_bool_interpretation_error(scalars_df_index): + with pytest.raises(ValueError): + True if scalars_df_index else False + + +def test_assign_after_binop_row_joins(): + pd_df = pd.DataFrame( + { + "idx1": [1, 1, 1, 1, 2, 2, 2, 2], + "idx2": [10, 10, 20, 20, 10, 10, 20, 20], + "metric1": [10, 14, 2, 13, 6, 2, 9, 5], + "metric2": [25, -3, 8, 2, -1, 0, 0, -4], + }, + dtype=pd.Int64Dtype(), + ).set_index(["idx1", "idx2"]) + bf_df = dataframe.DataFrame(pd_df) + + # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join + bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 + pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 + + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_df_dot_inline(session): + df1 = pd.DataFrame([[1, 2, 3], [2, 5, 7]]) + df2 = pd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]]) + + bf1 = session.read_pandas(df1) + bf2 = session.read_pandas(df2) + bf_result = bf1.dot(bf2).to_pandas() + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas uses int64 instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_series_inline(): + left = [[1, 2, 3], [2, 5, 7]] + right = [2, 1, 3] + + bf1 = dataframe.DataFrame(left) + bf2 = series.Series(right) + bf_result = bf1.dot(bf2).to_pandas() + + df1 = pd.DataFrame(left) + df2 = pd.Series(right) + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas result is int64 instead of Int64 (nullable) dtype. 
+    pd_result = pd_result.astype(pd.Int64Dtype())
+    pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+
+    pd.testing.assert_series_equal(
+        bf_result,
+        pd_result,
+    )
+
+
+@pytest.mark.parametrize(
+    ("col_names", "ignore_index"),
+    [
+        pytest.param(["A"], False, id="one_array_false"),
+        pytest.param(["A"], True, id="one_array_true"),
+        pytest.param(["B"], False, id="one_float_false"),
+        pytest.param(["B"], True, id="one_float_true"),
+        pytest.param(["A", "C"], False, id="two_arrays_false"),
+        pytest.param(["A", "C"], True, id="two_arrays_true"),
+    ],
+)
+def test_dataframe_explode(col_names, ignore_index, session):
+    data = {
+        "A": [[0, 1, 2], [], [3, 4]],
+        "B": 3,
+        "C": [["a", "b", "c"], np.nan, ["d", "e"]],
+    }
+
+    df = bpd.DataFrame(data, session=session)
+    pd_df = df.to_pandas()
+    pd_result = pd_df.explode(col_names, ignore_index=ignore_index)
+    bf_result = df.explode(col_names, ignore_index=ignore_index)
+
+    # Materializing with to_pandas() should require at most a single query execution.
+    bf_materialized = bf_result.to_pandas()
+
+    pd.testing.assert_frame_equal(
+        bf_materialized,
+        pd_result,
+        check_index_type=False,
+        check_dtype=False,
+    )
+
+
+@pytest.mark.parametrize(
+    ("ignore_index", "ordered"),
+    [
+        pytest.param(True, True, id="include_index_ordered"),
+        pytest.param(True, False, id="include_index_unordered"),
+        pytest.param(False, True, id="ignore_index_ordered"),
+    ],
+)
+def test_dataframe_explode_preserve_order(session, ignore_index, ordered):
+    data = {
+        "a": [np.random.randint(0, 10, 10) for _ in range(10)],
+        "b": [np.random.randint(0, 10, 10) for _ in range(10)],
+    }
+    df = bpd.DataFrame(data)
+    pd_df = pd.DataFrame(data)
+
+    res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered)
+    pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype(
+        pd.Int64Dtype()
+    )
+    pd.testing.assert_frame_equal(
+        res if ordered else res.sort_index(),
+        pd_res,
+        check_index_type=False,
+    )
+
+
+@pytest.mark.parametrize(
+    ("col_names"),
+    [
+        pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)),
+        pytest.param(
+            ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError)
+        ),
+        pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)),
+    ],
+)
+def test_dataframe_explode_xfail(col_names):
+    df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]})
+    df.explode(col_names)
diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py
index 613a929c04..92a58b3dc6 100644
--- a/third_party/bigframes_vendored/geopandas/geoseries.py
+++ b/third_party/bigframes_vendored/geopandas/geoseries.py
@@ -483,3 +483,25 @@ def intersection(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignor
         each aligned geometry with other.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    @property
+    def is_closed(self: GeoSeries) -> bigframes.series.Series:
+        """
+        [Not Implemented] Use ``bigframes.bigquery.st_isclosed(series)``
+        instead to return a boolean indicating if a shape is closed.
+
+        In GeoPandas, this returns a Series of booleans with value True if a
+        LineString's or LinearRing's first and last points are equal.
+
+        Returns False for any other geometry type.
+
+        Returns:
+            bigframes.pandas.Series:
+                Series of booleans.
+
+        Raises:
+            NotImplementedError:
+                GeoSeries.is_closed is not supported. Use
+                ``bigframes.bigquery.st_isclosed(series)`` instead.
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/ibis/expr/types/core.py b/third_party/bigframes_vendored/ibis/expr/types/core.py index 9685e4ddca..5704dc993a 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/core.py +++ b/third_party/bigframes_vendored/ibis/expr/types/core.py @@ -19,6 +19,7 @@ import bigframes_vendored.ibis.expr.operations as ops from bigframes_vendored.ibis.expr.types.pretty import to_rich from bigframes_vendored.ibis.util import experimental +import pandas as pd from public import public from rich.console import Console from rich.jupyter import JupyterMixin @@ -34,7 +35,6 @@ EdgeAttributeGetter, NodeAttributeGetter, ) - import pandas as pd import polars as pl import pyarrow as pa import torch @@ -744,9 +744,9 @@ def _binop(op_class: type[ops.Binary], left: ir.Value, right: ir.Value) -> ir.Va def _is_null_literal(value: Any) -> bool: """Detect whether `value` will be treated by ibis as a null literal.""" - if value is None: - return True if isinstance(value, Expr): op = value.op() return isinstance(op, ops.Literal) and op.value is None + if pd.isna(value): + return True return False diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index 56d60174a6..d3d11a9c2a 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -171,6 +171,7 @@ def eval( with plain ol' Python evaluation. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c1b5b5a86b..6c927a5c26 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4253,6 +4253,7 @@ def corrwith( correlations. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 469f35f181..dfb1cf9efc 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -204,6 +204,7 @@ def isocalendar(self): Calculate year, week, and day according to the ISO 8601 standard. **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 7df1c7a9de..6a6bb96897 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1087,6 +1087,25 @@ def unique(self, level: Hashable | int | None = None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def item(self, *args, **kwargs): + """Return the first element of the underlying data as a Python scalar. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1], index=['a']) + >>> s.index.item() + 'a' + + Returns: + scalar: The first element of Index. + + Raises: + ValueError: If the data is not length = 1. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype, *, allow_large_results=None): """ A NumPy ndarray representing the values in this Series or Index. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 61cd6a47bf..b2846d675c 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -4933,6 +4933,26 @@ def kurt(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def item(self: Series, *args, **kwargs): + """Return the first element of the underlying data as a Python scalar. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1]) + >>> s.item() + np.int64(1) + + Returns: + scalar: The first element of Series. + + Raises: + ValueError: If the data is not length = 1. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def items(self): """ Lazily iterate over (index, value) tuples. diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 9f3d87ecb7..9b5b461ea5 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -252,15 +252,12 @@ def strip(self, to_strip: typing.Optional[str] = None): >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['1. Ant.', ' 2. Bee? ', '\\t3. Cat!\\n', bpd.NA]) - >>> s - 0 1. Ant. - 1 2. Bee? - 2 3. Cat! - - 3 - dtype: string - + >>> s = bpd.Series([ + ... '1. Ant.', + ... ' 2. Bee? ', + ... '\\t3. Cat!\\n', + ... bpd.NA, + ... ]) >>> s.str.strip() 0 1. Ant. 1 2. Bee? @@ -269,10 +266,10 @@ def strip(self, to_strip: typing.Optional[str] = None): dtype: string >>> s.str.strip('123.!? \\n\\t') - 0 Ant - 1 Bee - 2 Cat - 3 + 0 Ant + 1 Bee + 2 Cat + 3 dtype: string Args: @@ -543,7 +540,7 @@ def isdecimal(self): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rstrip(self, to_strip: typing.Optional[str] = None): - """Remove trailing characters. + r"""Remove trailing characters. Strip whitespaces (including newlines) or a set of specified characters from each string in the Series/Index from right side. @@ -555,19 +552,11 @@ def rstrip(self, to_strip: typing.Optional[str] = None): >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) - >>> s - 0 Ant - 1 Bee - 2 Cat - - 3 - dtype: string - + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.rstrip() 0 Ant 1 Bee - 2 Cat + 2 \tCat 3 dtype: string @@ -584,7 +573,7 @@ def rstrip(self, to_strip: typing.Optional[str] = None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lstrip(self, to_strip: typing.Optional[str] = None): - """Remove leading characters. + r"""Remove leading characters. Strip whitespaces (including newlines) or a set of specified characters from each string in the Series/Index from left side. 
@@ -596,21 +585,12 @@ def lstrip(self, to_strip: typing.Optional[str] = None): >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) - >>> s - 0 Ant - 1 Bee - 2 Cat - - 3 - dtype: string - + >>> s = bpd.Series(['Ant', ' Bee ', '\tCat\n', bpd.NA]) >>> s.str.lstrip() - 0 Ant - 1 Bee - 2 Cat - - 3 + 0 Ant + 1 Bee + 2 Cat\n + 3 dtype: string Args: diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index aa4d862b65..a0d4092571 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -67,6 +67,7 @@ def read_gbq( >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") Read table path with wildcard suffix and filters: + >>> df = bpd.read_gbq_table("bigquery-public-data.noaa_gsod.gsod19*", filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")]) Preserve ordering in a query input. diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 6cc3d952ed..e41364d4d1 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.5.0" +__version__ = "2.6.0" # {x-release-please-start-date} -__release_date__ = "2025-05-30" +__release_date__ = "2025-06-09" # {x-release-please-end}