diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9426df54f0..63f36d4ddd 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -23,7 +23,8 @@ import ibis.expr.types as ibis_types import pandas -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled +import bigframes.core.compile.compiler as compiler import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference @@ -78,7 +79,7 @@ def from_pandas(cls, pd_df: pandas.DataFrame): @property def column_ids(self) -> typing.Sequence[str]: - return self.compile().column_ids + return self._compile_ordered().column_ids @property def session(self) -> Session: @@ -88,15 +89,18 @@ def session(self) -> Session: return self.node.session[0] if required_session else get_global_session() def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - return self.compile().get_column_type(key) + return self._compile_ordered().get_column_type(key) - def compile(self) -> compiled.CompiledArrayValue: - return compiled.compile_node(self.node) + def _compile_ordered(self) -> compiled.OrderedIR: + return compiler.compile_ordered(self.node) + + def _compile_unordered(self) -> compiled.UnorderedIR: + return compiler.compile_unordered(self.node) def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" - width = len(self.compile().columns) - count_expr = self.compile()._to_ibis_expr("unordered").count() + width = len(self._compile_unordered().columns) + count_expr = self._compile_unordered()._to_ibis_expr().count() # Support in-memory engines for hermetic unit tests. if not self.node.session: @@ -121,11 +125,14 @@ def to_sql( col_id_overrides: typing.Mapping[str, str] = {}, sorted: bool = False, ) -> str: - return self.compile().to_sql( - offset_column=offset_column, - col_id_overrides=col_id_overrides, - sorted=sorted, - ) + if sorted or offset_column: + return self._compile_ordered().to_sql( + offset_column=offset_column, + col_id_overrides=col_id_overrides, + sorted=sorted, + ) + else: + return self._compile_unordered().to_sql(col_id_overrides=col_id_overrides) def start_query( self, @@ -154,7 +161,7 @@ def start_query( def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: """Write the ArrayValue to a session table and create a new block object that references it.""" - compiled_value = self.compile() + compiled_value = self._compile_ordered() ibis_expr = compiled_value._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True ) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index bf3b9321c3..6358d28e2e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -386,6 +386,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" if max_download_size is None: @@ -412,6 +414,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) return df, query_job @@ -446,12 +449,16 @@ def _compute_and_count( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. 
Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. expr = self._apply_value_keys_to_expr(value_keys=value_keys) - results_iterator, query_job = expr.start_query(max_results=max_results) + results_iterator, query_job = expr.start_query( + max_results=max_results, sorted=ordered + ) table_size = ( expr.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index c86f4463dc..761fd9a465 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigframes.core.compile.compiled import CompiledArrayValue -from bigframes.core.compile.compiler import compile_node +from bigframes.core.compile.compiled import OrderedIR, UnorderedIR +from bigframes.core.compile.compiler import compile_ordered, compile_unordered __all__ = [ - "compile_node", - "CompiledArrayValue", + "compile_ordered", + "compile_unordered", + "OrderedIR", + "UnorderedIR", ] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 1134f1aab0..4ba5e6bd08 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -13,8 +13,8 @@ # limitations under the License. from __future__ import annotations +import abc import functools -import math import textwrap import typing from typing import Collection, Iterable, Literal, Optional, Sequence @@ -32,8 +32,6 @@ ExpressionOrdering, IntegerEncoding, OrderingColumnReference, - reencode_order_string, - StringEncoding, ) import bigframes.core.utils as utils from bigframes.core.window_spec import WindowSpec @@ -44,8 +42,568 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" +T = typing.TypeVar("T", bound="BaseIbisIR") -class CompiledArrayValue: + +class BaseIbisIR(abc.ABC): + """Implementation detail, contains common logic between ordered and unordered IR""" + + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self._table = table + self._predicates = tuple(predicates) if predicates is not None else () + # Allow creating a DataFrame directly from an Ibis table expression. + # TODO(swast): Validate that each column references the same table (or + # no table for literal values). + self._columns = tuple(columns) + # To allow for more efficient lookup by column name, create a + # dictionary mapping names to column values. + self._column_names = {column.get_name(): column for column in self._columns} + + @property + def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + return self._columns + + @property + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) + + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None + ) + + @abc.abstractmethod + def select_columns(self: T, column_ids: typing.Sequence[str]) -> T: + """Creates a new expression based on this expression with new columns.""" + ... 
+
+    def drop_columns(self: T, columns: Iterable[str]) -> T:
+        return self.select_columns(
+            [col for col in self.column_ids if col not in columns]
+        )
+
+    @abc.abstractmethod
+    def filter(self: T, predicate_id: str, keep_null: bool = False) -> T:
+        """Filter the table on a given expression; the predicate must be a boolean series aligned with the table expression."""
+        ...
+
+    @abc.abstractmethod
+    def unpivot(
+        self: T,
+        row_labels: typing.Sequence[typing.Hashable],
+        unpivot_columns: typing.Sequence[
+            typing.Tuple[str, typing.Sequence[typing.Optional[str]]]
+        ],
+        *,
+        passthrough_columns: typing.Sequence[str] = (),
+        index_col_ids: typing.Sequence[str] = ["index"],
+        dtype: typing.Union[
+            bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype]
+        ] = pandas.Float64Dtype(),
+        how="left",
+    ) -> T:
+        """
+        Unpivot ArrayValue columns.
+
+        Args:
+            row_labels: Identifies the source of the row. Must be equal in length to the source column lists in the unpivot_columns argument.
+            unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None.
+            passthrough_columns: Columns that will not be unpivoted. Column id will be preserved.
+            index_col_ids (list of str): The column ids to be used for the row labels.
+            dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns.
+
+        Returns:
+            T: The unpivoted expression.
+        """
+        ...
+
+    @abc.abstractmethod
+    def _reproject_to_table(self: T) -> T:
+        """
+        Internal operator that projects the internal representation into a
+        new ibis table expression where each value column is a direct
+        reference to a column in that table expression. Needed after
+        some operations such as window operations that cannot be used
+        recursively in projections.
+        """
+        ...
+
+    def project_unary_op(
+        self: T,
+        input_column_id: str,
+        op: ops.UnaryOp,
+        output_column_id: typing.Optional[str] = None,
+    ) -> T:
+        """Creates a new expression based on this expression with unary operation applied to one column."""
+        result_id = (
+            output_column_id or input_column_id
+        )  # overwrite input if no output id provided
+        value = op._as_ibis(self._get_ibis_column(input_column_id)).name(result_id)
+        return self._set_or_replace_by_id(result_id, value)
+
+    def project_binary_op(
+        self: T,
+        left_column_id: str,
+        right_column_id: str,
+        op: ops.BinaryOp,
+        output_column_id: str,
+    ) -> T:
+        """Creates a new expression based on this expression with binary operation applied to two columns."""
+        value = op(
+            self._get_ibis_column(left_column_id),
+            self._get_ibis_column(right_column_id),
+        ).name(output_column_id)
+        return self._set_or_replace_by_id(output_column_id, value)
+
+    def project_ternary_op(
+        self: T,
+        col_id_1: str,
+        col_id_2: str,
+        col_id_3: str,
+        op: ops.TernaryOp,
+        output_column_id: str,
+    ) -> T:
+        """Creates a new expression based on this expression with ternary operation applied to three columns."""
+        value = op(
+            self._get_ibis_column(col_id_1),
+            self._get_ibis_column(col_id_2),
+            self._get_ibis_column(col_id_3),
+        ).name(output_column_id)
+        return self._set_or_replace_by_id(output_column_id, value)
+
+    def assign(self: T, source_id: str, destination_id: str) -> T:
+        return self._set_or_replace_by_id(
+            destination_id, self._get_ibis_column(source_id)
+        )
+
+    def assign_constant(
+        self: T,
+        destination_id: str,
+        value: typing.Any,
+        dtype: typing.Optional[bigframes.dtypes.Dtype],
+    ) -> T:
+        # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis.
+        ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype)
+        if ibis_value is None:
+            raise NotImplementedError(
+                f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}"
+            )
+        expr = self._set_or_replace_by_id(destination_id, ibis_value)
+        return expr._reproject_to_table()
+
+    @abc.abstractmethod
+    def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T:
+        ...
+
+    def _get_ibis_column(self, key: str) -> ibis_types.Value:
+        """Gets the Ibis expression for a given column."""
+        if key not in self.column_ids:
+            raise ValueError(
+                "Column name {} not in set of values: {}".format(key, self.column_ids)
+            )
+        return typing.cast(ibis_types.Value, self._column_names[key])
+
+    def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
+        ibis_type = typing.cast(
+            bigframes.dtypes.IbisDtype, self._get_ibis_column(key).type()
+        )
+        return typing.cast(
+            bigframes.dtypes.Dtype,
+            bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type),
+        )
+
+
+# Ibis Implementations
+class UnorderedIR(BaseIbisIR):
+    def __init__(
+        self,
+        table: ibis_types.Table,
+        columns: Sequence[ibis_types.Value],
+        predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
+    ):
+        super().__init__(table, columns, predicates)
+
+    def builder(self):
+        """Creates a mutable builder for expressions."""
+        # Since ArrayValue is intended to be immutable (immutability offers
+        # potential opportunities for caching, though we might need to introduce
+        # more node types for that to be useful), we create a builder class.
+        return UnorderedIR.Builder(
+            self._table,
+            columns=self._columns,
+            predicates=self._predicates,
+        )
+
+    def to_sql(
+        self,
+        offset_column: typing.Optional[str] = None,
+        col_id_overrides: typing.Mapping[str, str] = {},
+        sorted: bool = False,
+    ) -> str:
+        if offset_column or sorted:
+            raise ValueError("Cannot produce sorted sql in unordered mode")
+        sql = ibis_bigquery.Backend().compile(
+            self._to_ibis_expr(
+                col_id_overrides=col_id_overrides,
+            )
+        )
+        return typing.cast(str, sql)
+
+    def _to_ibis_expr(
+        self,
+        *,
+        expose_hidden_cols: bool = False,
+        fraction: Optional[float] = None,
+        col_id_overrides: typing.Mapping[str, str] = {},
+    ):
+        """
+        Creates an Ibis table expression representing the DataFrame.
+
+        UnorderedIR has no ordering, so no ordering information is reflected
+        in the output; only value columns are projected.
+
+        Args:
+            expose_hidden_cols:
+                If True, include hidden columns (such as the filter predicate)
+                in the results.
+            fraction:
+                If set, filter the output to approximately this fraction of
+                rows, sampled at random.
+            col_id_overrides:
+                overrides the column ids for the result
+        Returns:
+            An ibis expression representing the data held by the ArrayValue object.
+ """ + columns = list(self._columns) + columns_to_drop: list[ + str + ] = [] # Ordering/Filtering columns that will be dropped at end + + if self._reduced_predicate is not None: + columns.append(self._reduced_predicate) + # Usually drop predicate as it is will be all TRUE after filtering + if not expose_hidden_cols: + columns_to_drop.append(self._reduced_predicate.get_name()) + + # Special case for empty tables, since we can't create an empty + # projection. + if not columns: + return ibis.memtable([]) + + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. + table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR: + """Creates a new expression based on this expression with new columns.""" + columns = [self._get_ibis_column(col_id) for col_id in column_ids] + builder = self.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + + def filter(self, predicate_id: str, keep_null: bool = False) -> UnorderedIR: + condition = typing.cast( + ibis_types.BooleanValue, self._get_ibis_column(predicate_id) + ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + expr = self.builder() + expr.predicates = [*self._predicates, predicate_value] + return expr.build() + + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> UnorderedIR: + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr() + row_n = len(row_labels) + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in 
+            zip(index_col_ids, labels_dtypes)
+        ):
+            # interpret as tuples even if it wasn't originally, so the same logic applies for multi-column labels
+            labels_as_tuples = [
+                label if isinstance(label, tuple) else (label,) for label in row_labels
+            ]
+            cases = [
+                (
+                    i,
+                    bigframes.dtypes.literal_to_ibis_scalar(
+                        label_tuple[label_part],  # type:ignore
+                        force_dtype=label_dtype,  # type:ignore
+                    ),
+                )
+                for i, label_tuple in enumerate(labels_as_tuples)
+            ]
+            labels_value = (
+                typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id])
+                .cases(cases, default=None)  # type:ignore
+                .name(col_id)
+            )
+            label_columns.append(labels_value)
+
+        unpivot_values = []
+        for j in range(len(unpivot_columns)):
+            col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype
+            result_col, source_cols = unpivot_columns[j]
+            null_value = bigframes.dtypes.literal_to_ibis_scalar(
+                None, force_dtype=col_dtype
+            )
+            ibis_values = [
+                ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col])
+                if col is not None
+                else null_value
+                for col in source_cols
+            ]
+            cases = [(i, ibis_values[i]) for i in range(len(ibis_values))]
+            unpivot_value = typing.cast(
+                ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]
+            ).cases(
+                cases, default=null_value  # type:ignore
+            )
+            unpivot_values.append(unpivot_value.name(result_col))
+
+        unpivot_table = unpivot_table.select(
+            passthrough_columns,
+            *label_columns,
+            *unpivot_values,
+            unpivot_offset_id,
+        )
+
+        value_columns = [
+            unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns
+        ]
+        passthrough_values = [unpivot_table[col] for col in passthrough_columns]
+        return UnorderedIR(
+            table=unpivot_table,
+            columns=[
+                *[unpivot_table[col_id] for col_id in index_col_ids],
+                *value_columns,
+                *passthrough_values,
+            ],
+        )
+
+    def aggregate(
+        self,
+        aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]],
+        by_column_ids: typing.Sequence[str] = (),
+        dropna: bool = True,
+    ) -> OrderedIR:
+        """
+        Apply aggregations to the expression.
+        Arguments:
+            aggregations: input_column_id, operation, output_column_id tuples
+            by_column_ids: column ids of the aggregation key; these are preserved through the transform
+            dropna: whether null keys should be dropped
+        """
+        table = self._to_ibis_expr()
+        stats = {
+            col_out: agg_op._as_ibis(table[col_in])
+            for col_in, agg_op, col_out in aggregations
+        }
+        if by_column_ids:
+            result = table.group_by(by_column_ids).aggregate(**stats)
+            # Must have deterministic ordering, so order by the unique "by" column
+            ordering = ExpressionOrdering(
+                tuple(
+                    [
+                        OrderingColumnReference(column_id=column_id)
+                        for column_id in by_column_ids
+                    ]
+                ),
+                total_ordering_columns=frozenset(by_column_ids),
+            )
+            columns = tuple(result[key] for key in result.columns)
+            expr = OrderedIR(result, columns=columns, ordering=ordering)
+            if dropna:
+                for column_id in by_column_ids:
+                    expr = expr._filter(
+                        ops.notnull_op._as_ibis(expr._get_ibis_column(column_id))
+                    )
+            # Can maybe remove this, as the ordering id is redundant since by_column is unique after aggregation
+            return expr._project_offsets()
+        else:
+            aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
+            result = table.aggregate(**aggregates)
+            # Ordering is irrelevant for single-row output, but set the ordering id regardless, as other ops (join etc.) expect it.
+            ordering = ExpressionOrdering(
+                ordering_value_columns=tuple(
+                    [OrderingColumnReference(ORDER_ID_COLUMN)]
+                ),
+                total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+                integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
+            )
+            return OrderedIR(
+                result,
+                columns=[result[col_id] for col_id in [*stats.keys()]],
+                hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
+                ordering=ordering,
+            )
+
+    def corr_aggregate(
+        self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
+    ) -> OrderedIR:
+        """
+        Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id.
+        This uses BigQuery's CORR under the hood, and thus only Pearson's method is used.
+        Arguments:
+            corr_aggregations: left_column_id, right_column_id, output_column_id tuples
+        """
+        table = self._to_ibis_expr()
+        stats = {
+            col_out: table[col_left].corr(table[col_right], how="pop")
+            for col_left, col_right, col_out in corr_aggregations
+        }
+        aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
+        result = table.aggregate(**aggregates)
+        # Ordering is irrelevant for single-row output, but set the ordering id regardless, as other ops (join etc.) expect it.
+        ordering = ExpressionOrdering(
+            ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]),
+            total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+            integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
+        )
+        return OrderedIR(
+            result,
+            columns=[result[col_id] for col_id in [*stats.keys()]],
+            hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
+            ordering=ordering,
+        )
+
+    def _uniform_sampling(self, fraction: float) -> UnorderedIR:
+        """Samples the table at the given fraction.
+
+        .. warning::
+            The row numbers of the result are non-deterministic; avoid relying on them.
+        """
+        table = self._to_ibis_expr(fraction=fraction)
+        columns = [table[column_name] for column_name in self._column_names]
+        return UnorderedIR(
+            table,
+            columns=columns,
+        )
+
+    ## Helpers
+    def _set_or_replace_by_id(
+        self, id: str, new_value: ibis_types.Value
+    ) -> UnorderedIR:
+        builder = self.builder()
+        if id in self.column_ids:
+            builder.columns = [
+                val if (col_id != id) else new_value.name(id)
+                for col_id, val in zip(self.column_ids, self._columns)
+            ]
+        else:
+            builder.columns = [*self.columns, new_value.name(id)]
+        return builder.build()
+
+    def _reproject_to_table(self) -> UnorderedIR:
+        """
+        Internal operator that projects the internal representation into a
+        new ibis table expression where each value column is a direct
+        reference to a column in that table expression. Needed after
+        some operations such as window operations that cannot be used
+        recursively in projections.
+        """
+        table = self._to_ibis_expr()
+        columns = [table[column_name] for column_name in self._column_names]
+        return UnorderedIR(
+            table,
+            columns=columns,
+        )
+
+    class Builder:
+        def __init__(
+            self,
+            table: ibis_types.Table,
+            columns: Collection[ibis_types.Value] = (),
+            predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
+        ):
+            self.table = table
+            self.columns = list(columns)
+            self.predicates = list(predicates) if predicates is not None else None
+
+        def build(self) -> UnorderedIR:
+            return UnorderedIR(
+                table=self.table,
+                columns=self.columns,
+                predicates=self.predicates,
+            )
+
+
+class OrderedIR(BaseIbisIR):
     """Immutable BigQuery DataFrames expression tree.
Note: Usage of this class is considered to be private and subject to change @@ -71,17 +629,11 @@ def __init__( ordering: ExpressionOrdering = ExpressionOrdering(), predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): - self._table = table - self._predicates = tuple(predicates) if predicates is not None else () + super().__init__(table, columns, predicates) # TODO: Validate ordering if not ordering.total_ordering_columns: raise ValueError("Must have total ordering defined by one or more columns") self._ordering = ordering - # Allow creating a DataFrame directly from an Ibis table expression. - # TODO(swast): Validate that each column references the same table (or - # no table for literal values). - self._columns = tuple(columns) - # Meta columns store ordering, or other data that doesn't correspond to dataframe columns self._hidden_ordering_columns = ( tuple(hidden_ordering_columns) @@ -111,10 +663,10 @@ def __init__( raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") @classmethod - def mem_expr_from_pandas( + def from_pandas( cls, pd_df: pandas.DataFrame, - ) -> CompiledArrayValue: + ) -> OrderedIR: """ Builds an in-memory only (SQL only) expr from a pandas dataframe. """ @@ -173,27 +725,10 @@ def mem_expr_from_pandas( hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), ) - @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: - return self._columns - - @property - def column_ids(self) -> typing.Sequence[str]: - return tuple(self._column_names.keys()) - @property def _hidden_column_ids(self) -> typing.Sequence[str]: return tuple(self._hidden_ordering_column_names.keys()) - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None - ) - @property def _ibis_order(self) -> Sequence[ibis_types.Value]: """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" @@ -202,12 +737,15 @@ def _ibis_order(self) -> Sequence[ibis_types.Value]: self._ordering.all_ordering_columns, ) - def builder(self) -> ArrayValueBuilder: + def to_unordered(self) -> UnorderedIR: + return UnorderedIR(self._table, self._columns, self._predicates) + + def builder(self) -> OrderedIR.Builder: """Creates a mutable builder for expressions.""" # Since ArrayValue is intended to be immutable (immutability offers # potential opportunities for caching, though we might need to introduce # more node types for that to be useful), we create a builder class. 
- return ArrayValueBuilder( + return OrderedIR.Builder( self._table, columns=self._columns, hidden_ordering_columns=self._hidden_ordering_columns, @@ -215,160 +753,39 @@ def builder(self) -> ArrayValueBuilder: predicates=self._predicates, ) - def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue: - # Must generate offsets if we are dropping a column that ordering depends on - expr = self - for ordering_column in set(columns).intersection( - [col.column_id for col in self._ordering.ordering_value_columns] - ): - expr = self._hide_column(ordering_column) - - expr_builder = expr.builder() - remain_cols = [ - column for column in expr.columns if column.get_name() not in columns - ] - expr_builder.columns = remain_cols - return expr_builder.build() - - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - ibis_type = typing.cast( - bigframes.dtypes.IbisDtype, self._get_any_column(key).type() - ) - return typing.cast( - bigframes.dtypes.Dtype, - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), - ) - - def _get_ibis_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column.""" - if key not in self.column_ids: - raise ValueError( - "Column name {} not in set of values: {}".format(key, self.column_ids) - ) - return typing.cast(ibis_types.Value, self._column_names[key]) - - def _get_any_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column. Will also get hidden columns.""" - all_columns = {**self._column_names, **self._hidden_ordering_column_names} - if key not in all_columns.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, all_columns.keys() - ) - ) - return typing.cast(ibis_types.Value, all_columns[key]) - - def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: - """Gets the Ibis expression for a given hidden column.""" - if key not in self._hidden_ordering_column_names.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, self._hidden_ordering_column_names.keys() - ) - ) - return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) - - def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - condition = typing.cast( - ibis_types.BooleanValue, self._get_ibis_column(predicate_id) - ) - if keep_null: - condition = typing.cast( - ibis_types.BooleanValue, - condition.fillna( - typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) - ), - ) - return self._filter(condition) - - def _filter(self, predicate_value: ibis_types.BooleanValue) -> CompiledArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - expr = self.builder() - expr.ordering = expr.ordering.with_non_sequential() - expr.predicates = [*self._predicates, predicate_value] - return expr.build() - def order_by( self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> CompiledArrayValue: + ) -> OrderedIR: expr_builder = self.builder() expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) return expr_builder.build() - def reversed(self) -> CompiledArrayValue: + def reversed(self) -> OrderedIR: expr_builder = self.builder() expr_builder.ordering = self._ordering.with_reverse() return expr_builder.build() - def _uniform_sampling(self, fraction: float) -> 
CompiledArrayValue: + def _uniform_sampling(self, fraction: float) -> OrderedIR: """Sampling the table on given fraction. .. warning:: The row numbers of result is non-deterministic, avoid to use. """ table = self._to_ibis_expr( - "unordered", expose_hidden_cols=True, fraction=fraction + ordering_mode="unordered", expose_hidden_cols=True, fraction=fraction ) columns = [table[column_name] for column_name in self._column_names] hidden_ordering_columns = [ table[column_name] for column_name in self._hidden_ordering_column_names ] - return CompiledArrayValue( + return OrderedIR( table, columns=columns, hidden_ordering_columns=hidden_ordering_columns, ordering=self._ordering, ) - @property - def _offsets(self) -> ibis_types.IntegerColumn: - if not self._ordering.is_sequential: - raise ValueError( - "Expression does not have offsets. Generate them first using project_offsets." - ) - if not self._ordering.total_order_col: - raise ValueError( - "Ordering is invalid. Marked as sequential but no total order columns." - ) - column = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, column) - - def _project_offsets(self) -> CompiledArrayValue: - """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - # TODO(tbergeron): Enforce total ordering - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return CompiledArrayValue( - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def _hide_column(self, column_id) -> CompiledArrayValue: - """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" - expr_builder = self.builder() - # Need to rename column as caller might be creating a new row with the same name but different values. - # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") - expr_builder.hidden_ordering_columns = [ - *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name), - ] - expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) - return expr_builder.build() - - def promote_offsets(self, col_id: str) -> CompiledArrayValue: + def promote_offsets(self, col_id: str) -> OrderedIR: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. 
""" @@ -384,194 +801,21 @@ def promote_offsets(self, col_id: str) -> CompiledArrayValue: ] return expr_builder.build() - def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue: + def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR: """Creates a new expression based on this expression with new columns.""" columns = [self._get_ibis_column(col_id) for col_id in column_ids] expr = self for ordering_column in set(self.column_ids).intersection( [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - - def concat(self, other: typing.Sequence[CompiledArrayValue]) -> CompiledArrayValue: - """Append together multiple ArrayValue objects.""" - if len(other) == 0: - return self - tables = [] - prefix_base = 10 - prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) - # Must normalize all ids to the same encoding size - max_encoding_size = max( - self._ordering.string_encoding.length, - *[expression._ordering.string_encoding.length for expression in other], - ) - for i, expr in enumerate([self, *other]): - ordering_prefix = str(i).zfill(prefix_size) - table = expr._to_ibis_expr( - ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN - ) - # Rename the value columns based on horizontal offset before applying union. - table = table.select( - [ - table[col].name(f"column_{i}") - if col != ORDER_ID_COLUMN - else ( - ordering_prefix - + reencode_order_string( - table[ORDER_ID_COLUMN], max_encoding_size - ) - ).name(ORDER_ID_COLUMN) - for i, col in enumerate(table.columns) - ] - ) - tables.append(table) - combined_table = ibis.union(*tables) - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - string_encoding=StringEncoding(True, prefix_size + max_encoding_size), - ) - return CompiledArrayValue( - combined_table, - columns=[ - combined_table[col] - for col in combined_table.columns - if col != ORDER_ID_COLUMN - ], - hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def project_unary_op( - self, column_name: str, op: ops.UnaryOp, output_name=None - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with unary operation applied to one column.""" - value = op._as_ibis(self._get_ibis_column(column_name)).name( - output_name or column_name - ) - return self._set_or_replace_by_id(output_name or column_name, value) - - def project_binary_op( - self, - left_column_id: str, - right_column_id: str, - op: ops.BinaryOp, - output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with binary operation applied to two columns.""" - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def project_ternary_op( - self, - col_id_1: str, - col_id_2: str, - col_id_3: str, - op: ops.TernaryOp, - output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with ternary operation applied to three columns.""" - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - 
self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def aggregate( - self, - aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], - by_column_ids: typing.Sequence[str] = (), - dropna: bool = True, - ) -> CompiledArrayValue: - """ - Apply aggregations to the expression. - Arguments: - aggregations: input_column_id, operation, output_column_id tuples - by_column_id: column id of the aggregation key, this is preserved through the transform - dropna: whether null keys should be dropped - """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: agg_op._as_ibis(table[col_in]) - for col_in, agg_op, col_out in aggregations - } - if by_column_ids: - result = table.group_by(by_column_ids).aggregate(**stats) - # Must have deterministic ordering, so order by the unique "by" column - ordering = ExpressionOrdering( - tuple( - [ - OrderingColumnReference(column_id=column_id) - for column_id in by_column_ids - ] - ), - total_ordering_columns=frozenset(by_column_ids), - ) - columns = tuple(result[key] for key in result.columns) - expr = CompiledArrayValue(result, columns=columns, ordering=ordering) - if dropna: - for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) - # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr._project_offsets() - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [OrderingColumnReference(ORDER_ID_COLUMN)] - ), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return CompiledArrayValue( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def corr_aggregate( - self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] - ) -> CompiledArrayValue: - """ - Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. - This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. - Arguments: - corr_aggregations: left_column_id, right_column_id, output_column_id tuples - """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: table[col_left].corr(table[col_right], how="pop") - for col_left, col_right, col_out in corr_aggregations - } - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return CompiledArrayValue( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, - ) + ): + # Need to hide ordering columns that are being dropped. 
Alternatively, could project offsets + expr = expr._hide_column(ordering_column) + builder = expr.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + ## Methods that only work with ordering def project_window_op( self, column_name: str, @@ -581,7 +825,7 @@ def project_window_op( *, never_skip_nulls=False, skip_reproject_unsafe: bool = False, - ) -> CompiledArrayValue: + ) -> OrderedIR: """ Creates a new expression based on this expression with unary operation applied to one column. column_name: the id of the input column present in the expression @@ -625,6 +869,168 @@ def project_window_op( # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. return result._reproject_to_table() if not skip_reproject_unsafe else result + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> OrderedIR: + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + row_n = len(row_labels) + hidden_col_ids = self._hidden_ordering_column_names.keys() + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = 
typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + *hidden_col_ids, + unpivot_offset_id, + ) + + # Extend the original ordering using unpivot_offset_id + old_ordering = self._ordering + if how == "left": + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *old_ordering.ordering_value_columns, + OrderingColumnReference(unpivot_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + else: # how=="right" + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + OrderingColumnReference(unpivot_offset_id), + *old_ordering.ordering_value_columns, + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + hidden_ordering_columns = [ + unpivot_table[unpivot_offset_id], + *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], + ] + return OrderedIR( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + hidden_ordering_columns=hidden_ordering_columns, + ordering=new_ordering, + ) + + def _reproject_to_table(self) -> OrderedIR: + table = self._to_ibis_expr( + ordering_mode="unordered", + expose_hidden_cols=True, + ) + columns = [table[column_name] for column_name in self._column_names] + ordering_col_ids = [ + ref.column_id for ref in self._ordering.all_ordering_columns + ] + hidden_ordering_columns = [ + table[column_name] + for column_name in self._hidden_ordering_column_names + if column_name in ordering_col_ids + ] + return OrderedIR( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + def to_sql( self, offset_column: typing.Optional[str] = None, @@ -644,21 +1050,22 @@ def to_sql( ) if sorted: sql = textwrap.dedent( - f""" - SELECT * EXCEPT (`{offsets_id}`) - FROM ({sql}) - ORDER BY `{offsets_id}` - """ + f"SELECT * EXCEPT (`{offsets_id}`)\n" + "FROM (\n" + f"{sql}\n" + ")\n" + f"ORDER BY `{offsets_id}`\n" ) return typing.cast(str, sql) def _to_ibis_expr( self, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], - order_col_name: Optional[str] = ORDER_ID_COLUMN, + *, expose_hidden_cols: bool = False, fraction: Optional[float] = None, col_id_overrides: typing.Mapping[str, str] = {}, + ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + order_col_name: Optional[str] = ORDER_ID_COLUMN, ): """ Creates an Ibis table expression representing the DataFrame. @@ -677,16 +1084,16 @@ def _to_ibis_expr( column name will be 'bigframes_ordering_id' Args: + expose_hidden_cols: + If True, include the hidden ordering columns in the results. + Only compatible with `order_by` and `unordered` + ``ordering_mode``. ordering_mode: How to construct the Ibis expression from the ArrayValue. See above for details. order_col_name: If the ordering mode outputs a single ordering or offsets column, use this as the column name. - expose_hidden_cols: - If True, include the hidden ordering columns in the results. 
- Only compatible with `order_by` and `unordered` - ``ordering_mode``. col_id_overrides: overrides the column ids for the result Returns: @@ -723,20 +1130,115 @@ def _to_ibis_expr( if not columns: return ibis.memtable([]) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. - table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. + table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def filter(self, predicate_id: str, keep_null: bool = False) -> OrderedIR: + condition = typing.cast( + ibis_types.BooleanValue, self._get_ibis_column(predicate_id) + ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> OrderedIR: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + expr = self.builder() + expr.ordering = expr.ordering.with_non_sequential() + expr.predicates = [*self._predicates, predicate_value] + return expr.build() + + def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> OrderedIR: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + if id in ordering_col_ids: + return self._hide_column(id)._set_or_replace_by_id(id, new_value) + + builder = self.builder() + if id in self.column_ids: + builder.columns = [ + val if (col_id != id) else new_value.name(id) + for col_id, val in zip(self.column_ids, self._columns) + ] + else: + builder.columns = [*self.columns, new_value.name(id)] + return builder.build() + + ## Ordering specific helpers + def _get_any_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column. Will also get hidden columns.""" + all_columns = {**self._column_names, **self._hidden_ordering_column_names} + if key not in all_columns.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, all_columns.keys() + ) + ) + return typing.cast(ibis_types.Value, all_columns[key]) + + def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: + """Gets the Ibis expression for a given hidden column.""" + if key not in self._hidden_ordering_column_names.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, self._hidden_ordering_column_names.keys() + ) + ) + return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + + def _hide_column(self, column_id) -> OrderedIR: + """Pushes columns to hidden columns list. 
Used to hide ordering columns that have been dropped or destructively mutated."""
+        expr_builder = self.builder()
+        # Need to rename column as caller might be creating a new row with the same name but different values.
+        # Can avoid this if we don't allow callers to determine ids and instead generate unique ones in this class.
+        new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_")
+        expr_builder.hidden_ordering_columns = [
+            *self._hidden_ordering_columns,
+            self._get_ibis_column(column_id).name(new_name),
+        ]
+        expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name})
+        return expr_builder.build()
+
+    def _project_offsets(self) -> OrderedIR:
+        """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operation. Has no effect on expression semantics."""
+        if self._ordering.is_sequential:
+            return self
+        # TODO(tbergeron): Enforce total ordering
+        table = self._to_ibis_expr(
+            ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN
+        )
+        columns = [table[column_name] for column_name in self._column_names]
+        ordering = ExpressionOrdering(
+            ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]),
+            total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+            integer_encoding=IntegerEncoding(True, is_sequential=True),
+        )
+        return OrderedIR(
+            table,
+            columns=columns,
+            hidden_ordering_columns=[table[ORDER_ID_COLUMN]],
+            ordering=ordering,
         )
-        base_table = table
-        if self._reduced_predicate is not None:
-            table = table.filter(base_table[PREDICATE_COLUMN])
-        table = table.drop(*columns_to_drop)
-        if col_id_overrides:
-            table = table.relabel(col_id_overrides)
-        if fraction is not None:
-            table = table.filter(ibis.random() < ibis.literal(fraction))
-        return table

     def _create_order_columns(
         self,
@@ -789,34 +1291,6 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn:
         )
         return encode_order_string(row_nums)

-    def _reproject_to_table(self) -> CompiledArrayValue:
-        """
-        Internal operators that projects the internal representation into a
-        new ibis table expression where each value column is a direct
-        reference to a column in that table expression. Needed after
-        some operations such as window operations that cannot be used
-        recursively in projections.
- """ - table = self._to_ibis_expr( - "unordered", - expose_hidden_cols=True, - ) - columns = [table[column_name] for column_name in self._column_names] - ordering_col_ids = [ - ref.column_id for ref in self._ordering.all_ordering_columns - ] - hidden_ordering_columns = [ - table[column_name] - for column_name in self._hidden_ordering_column_names - if column_name in ordering_col_ids - ] - return CompiledArrayValue( - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, - ) - def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): group_by: typing.List[ibis_types.Value] = ( [ @@ -851,229 +1325,29 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal group_by=group_by, ) - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> CompiledArrayValue: - """ - Unpivot ArrayValue columns. - - Args: - row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. - unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. - passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. - index_col_id (str): The column id to be used for the row labels. - dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
- - Returns: - ArrayValue: The unpivoted ArrayValue - """ - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr("unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) + class Builder: + def __init__( + self, + table: ibis_types.Table, + ordering: ExpressionOrdering, + columns: Collection[ibis_types.Value] = (), + hidden_ordering_columns: Collection[ibis_types.Value] = (), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - *old_ordering.ordering_value_columns, - OrderingColumnReference(unpivot_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - OrderingColumnReference(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ] - ), - total_ordering_columns=frozenset( - 
[*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] - return CompiledArrayValue( - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, - ) - - def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) - ) - - def assign_constant( - self, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> CompiledArrayValue: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + self.table = table + self.columns = list(columns) + self.hidden_ordering_columns = list(hidden_ordering_columns) + self.ordering = ordering + self.predicates = list(predicates) if predicates is not None else None + + def build(self) -> OrderedIR: + return OrderedIR( + table=self.table, + columns=self.columns, + hidden_ordering_columns=self.hidden_ordering_columns, + ordering=self.ordering, + predicates=self.predicates, ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - - def _set_or_replace_by_id( - self, id: str, new_value: ibis_types.Value - ) -> CompiledArrayValue: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = [ - col_ref.column_id for col_ref in self._ordering.ordering_value_columns - ] - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - -class ArrayValueBuilder: - """Mutable expression class. - Use ArrayValue.builder() to create from a ArrayValue object. 
- """ - - def __init__( - self, - table: ibis_types.Table, - ordering: ExpressionOrdering, - columns: Collection[ibis_types.Value] = (), - hidden_ordering_columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - self.table = table - self.columns = list(columns) - self.hidden_ordering_columns = list(hidden_ordering_columns) - self.ordering = ordering - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> CompiledArrayValue: - return CompiledArrayValue( - table=self.table, - columns=self.columns, - hidden_ordering_columns=self.hidden_ordering_columns, - ordering=self.ordering, - predicates=self.predicates, - ) def _reduce_predicate_list( diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 195d830122..662e73a433 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -19,7 +19,8 @@ import pandas as pd -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled +import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.single_column import bigframes.core.nodes as nodes @@ -28,120 +29,167 @@ import bigframes.session +def compile_ordered(node: nodes.BigFrameNode) -> compiled.OrderedIR: + return typing.cast(compiled.OrderedIR, compile_node(node, True)) + + +def compile_unordered(node: nodes.BigFrameNode) -> compiled.UnorderedIR: + return typing.cast(compiled.UnorderedIR, compile_node(node, False)) + + @functools.cache -def compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: +def compile_node( + node: nodes.BigFrameNode, ordered: bool = True +) -> compiled.UnorderedIR | compiled.OrderedIR: """Compile node into CompileArrayValue. 
Caches result.""" - return _compile_node(node) + return _compile_node(node, ordered) @functools.singledispatch -def _compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: +def _compile_node( + node: nodes.BigFrameNode, ordered: bool = True +) -> compiled.UnorderedIR: """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unnrecognized node: {node}") + raise ValueError(f"Can't compile unrecognized node: {node}") @_compile_node.register -def compile_join(node: nodes.JoinNode): - compiled_left = compile_node(node.left_child) - compiled_right = compile_node(node.right_child) - return bigframes.core.compile.single_column.join_by_column( - compiled_left, - node.left_column_ids, - compiled_right, - node.right_column_ids, - how=node.how, - allow_row_identity_join=node.allow_row_identity_join, - ) +def compile_join(node: nodes.JoinNode, ordered: bool = True): + if ordered: + left_ordered = compile_ordered(node.left_child) + right_ordered = compile_ordered(node.right_child) + return bigframes.core.compile.single_column.join_by_column_ordered( + left_ordered, + node.left_column_ids, + right_ordered, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) + else: + left_unordered = compile_unordered(node.left_child) + right_unordered = compile_unordered(node.right_child) + return bigframes.core.compile.single_column.join_by_column_unordered( + left_unordered, + node.left_column_ids, + right_unordered, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) @_compile_node.register -def compile_select(node: nodes.SelectNode): - return compile_node(node.child).select_columns(node.column_ids) +def compile_select(node: nodes.SelectNode, ordered: bool = True): + return compile_node(node.child, ordered).select_columns(node.column_ids) @_compile_node.register -def compile_drop(node: nodes.DropColumnsNode): - return compile_node(node.child).drop_columns(node.columns) +def compile_drop(node: nodes.DropColumnsNode, ordered: bool = True): + return compile_node(node.child, ordered).drop_columns(node.columns) @_compile_node.register -def compile_readlocal(node: nodes.ReadLocalNode): +def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) - return compiled.CompiledArrayValue.mem_expr_from_pandas(array_as_pd) + ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd) + if ordered: + return ordered_ir + else: + return ordered_ir.to_unordered() @_compile_node.register -def compile_readgbq(node: nodes.ReadGbqNode): - return compiled.CompiledArrayValue( - node.table, - node.columns, - node.hidden_ordering_columns, - node.ordering, - ) +def compile_readgbq(node: nodes.ReadGbqNode, ordered: bool = True): + if ordered: + return compiled.OrderedIR( + node.table, + node.columns, + node.hidden_ordering_columns, + node.ordering, + ) + else: + return compiled.UnorderedIR( + node.table, + node.columns, + ) @_compile_node.register -def compile_promote_offsets(node: nodes.PromoteOffsetsNode): - return compile_node(node.child).promote_offsets(node.col_id) +def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): + result = compile_ordered(node.child).promote_offsets(node.col_id) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_filter(node: nodes.FilterNode): - return compile_node(node.child).filter(node.predicate_id, 
node.keep_null) +def compile_filter(node: nodes.FilterNode, ordered: bool = True): + return compile_node(node.child, ordered).filter(node.predicate_id, node.keep_null) @_compile_node.register -def compile_orderby(node: nodes.OrderByNode): - return compile_node(node.child).order_by(node.by, node.stable) +def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): + if ordered: + return compile_ordered(node.child).order_by(node.by, node.stable) + else: + return compile_unordered(node.child) @_compile_node.register -def compile_reversed(node: nodes.ReversedNode): - return compile_node(node.child).reversed() +def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): + if ordered: + return compile_ordered(node.child).reversed() + else: + return compile_unordered(node.child) @_compile_node.register -def compile_project_unary(node: nodes.ProjectUnaryOpNode): - return compile_node(node.child).project_unary_op( +def compile_project_unary(node: nodes.ProjectUnaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_unary_op( node.input_id, node.op, node.output_id ) @_compile_node.register -def compile_project_binary(node: nodes.ProjectBinaryOpNode): - return compile_node(node.child).project_binary_op( +def compile_project_binary(node: nodes.ProjectBinaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_binary_op( node.left_input_id, node.right_input_id, node.op, node.output_id ) @_compile_node.register -def compile_project_ternary(node: nodes.ProjectTernaryOpNode): - return compile_node(node.child).project_ternary_op( +def compile_project_ternary(node: nodes.ProjectTernaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_ternary_op( node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id ) @_compile_node.register -def compile_concat(node: nodes.ConcatNode): - compiled_nodes = [compile_node(node) for node in node.children] - return compiled_nodes[0].concat(compiled_nodes[1:]) +def compile_concat(node: nodes.ConcatNode, ordered: bool = True): + if ordered: + compiled_ordered = [compile_ordered(node) for node in node.children] + return concat_impl.concat_ordered(compiled_ordered) + else: + compiled_unordered = [compile_unordered(node) for node in node.children] + return concat_impl.concat_unordered(compiled_unordered) @_compile_node.register -def compile_aggregate(node: nodes.AggregateNode): - return compile_node(node.child).aggregate( +def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): + result = compile_unordered(node.child).aggregate( node.aggregations, node.by_column_ids, node.dropna ) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_corr(node: nodes.CorrNode): - return compile_node(node.child).corr_aggregate(node.corr_aggregations) +def compile_corr(node: nodes.CorrNode, ordered: bool = True): + result = compile_unordered(node.child).corr_aggregate(node.corr_aggregations) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_window(node: nodes.WindowOpNode): - return compile_node(node.child).project_window_op( +def compile_window(node: nodes.WindowOpNode, ordered: bool = True): + result = compile_ordered(node.child).project_window_op( node.column_name, node.op, node.window_spec, @@ -149,16 +197,17 @@ def compile_window(node: nodes.WindowOpNode): never_skip_nulls=node.never_skip_nulls, skip_reproject_unsafe=node.skip_reproject_unsafe, ) + return result if ordered else 
result.to_unordered()


 @_compile_node.register
-def compile_reproject(node: nodes.ReprojectOpNode):
-    return compile_node(node.child)._reproject_to_table()
+def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True):
+    return compile_node(node.child, ordered)._reproject_to_table()


 @_compile_node.register
-def compile_unpivot(node: nodes.UnpivotNode):
-    return compile_node(node.child).unpivot(
+def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True):
+    return compile_node(node.child, ordered).unpivot(
         node.row_labels,
         node.unpivot_columns,
         passthrough_columns=node.passthrough_columns,
@@ -169,17 +218,17 @@


 @_compile_node.register
-def compile_assign(node: nodes.AssignNode):
-    return compile_node(node.child).assign(node.source_id, node.destination_id)
+def compile_assign(node: nodes.AssignNode, ordered: bool = True):
+    return compile_node(node.child, ordered).assign(node.source_id, node.destination_id)


 @_compile_node.register
-def compile_assign_constant(node: nodes.AssignConstantNode):
-    return compile_node(node.child).assign_constant(
+def compile_assign_constant(node: nodes.AssignConstantNode, ordered: bool = True):
+    return compile_node(node.child, ordered).assign_constant(
         node.destination_id, node.value, node.dtype
     )


 @_compile_node.register
-def compiler_random_sample(node: nodes.RandomSampleNode):
-    return compile_node(node.child)._uniform_sampling(node.fraction)
+def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True):
+    return compile_node(node.child, ordered)._uniform_sampling(node.fraction)
diff --git a/bigframes/core/compile/concat.py b/bigframes/core/compile/concat.py
new file mode 100644
index 0000000000..d39569370e
--- /dev/null
+++ b/bigframes/core/compile/concat.py
@@ -0,0 +1,100 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import math
+import typing
+
+import ibis
+
+import bigframes.core.compile.compiled as compiled
+from bigframes.core.ordering import (
+    ExpressionOrdering,
+    OrderingColumnReference,
+    reencode_order_string,
+    StringEncoding,
+)
+
+ORDER_ID_COLUMN = "bigframes_ordering_id"
+
+
+def concat_unordered(
+    items: typing.Sequence[compiled.UnorderedIR],
+) -> compiled.UnorderedIR:
+    """Append together multiple UnorderedIR objects."""
+    if len(items) == 1:
+        return items[0]
+    tables = []
+    for expr in items:
+        table = expr._to_ibis_expr()
+        # Rename the value columns based on horizontal offset before applying union.
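+        # For example (illustrative): inputs with columns ["a", "b"] and
+        # ["x", "y"] are both renamed to ["column_0", "column_1"], giving the
+        # union below a single shared schema regardless of the original names.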
+        table = table.select(
+            [table[col].name(f"column_{i}") for i, col in enumerate(table.columns)]
+        )
+        tables.append(table)
+    combined_table = ibis.union(*tables)
+    return compiled.UnorderedIR(
+        combined_table,
+        columns=[combined_table[col] for col in combined_table.columns],
+    )
+
+
+def concat_ordered(
+    items: typing.Sequence[compiled.OrderedIR],
+) -> compiled.OrderedIR:
+    """Append together multiple OrderedIR objects."""
+    if len(items) == 1:
+        return items[0]
+
+    tables = []
+    prefix_base = 10
+    prefix_size = math.ceil(math.log(len(items), prefix_base))
+    # Must normalize all ids to the same encoding size
+    max_encoding_size = max(
+        *[expression._ordering.string_encoding.length for expression in items],
+    )
+    for i, expr in enumerate(items):
+        ordering_prefix = str(i).zfill(prefix_size)
+        table = expr._to_ibis_expr(
+            ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN
+        )
+        # Rename the value columns based on horizontal offset before applying union.
+        table = table.select(
+            [
+                table[col].name(f"column_{i}")
+                if col != ORDER_ID_COLUMN
+                else (
+                    ordering_prefix
+                    + reencode_order_string(table[ORDER_ID_COLUMN], max_encoding_size)
+                ).name(ORDER_ID_COLUMN)
+                for i, col in enumerate(table.columns)
+            ]
+        )
+        tables.append(table)
+    combined_table = ibis.union(*tables)
+    ordering = ExpressionOrdering(
+        ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]),
+        total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+        string_encoding=StringEncoding(True, prefix_size + max_encoding_size),
+    )
+    return compiled.OrderedIR(
+        combined_table,
+        columns=[
+            combined_table[col]
+            for col in combined_table.columns
+            if col != ORDER_ID_COLUMN
+        ],
+        hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]],
+        ordering=ordering,
+    )
diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py
index 2e9bc0527c..71d53f90dc 100644
--- a/bigframes/core/compile/row_identity.py
+++ b/bigframes/core/compile/row_identity.py
@@ -23,16 +23,76 @@
 import ibis.expr.types as ibis_types

 import bigframes.constants as constants
-import bigframes.core.compile as compiled
+import bigframes.core.compile.compiled as compiled
 import bigframes.core.joins.name_resolution as naming
 import bigframes.core.ordering as orderings

 SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"}


-def join_by_row_identity(
-    left: compiled.CompiledArrayValue, right: compiled.CompiledArrayValue, *, how: str
-) -> compiled.CompiledArrayValue:
+def join_by_row_identity_unordered(
+    left: compiled.UnorderedIR,
+    right: compiled.UnorderedIR,
+    *,
+    how: str,
+) -> compiled.UnorderedIR:
+    """Compute join when we are joining by row identity not a specific column."""
+    if how not in SUPPORTED_ROW_IDENTITY_HOW:
+        raise NotImplementedError(
+            f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}"
+        )
+
+    if not left._table.equals(right._table):
+        raise ValueError(
+            "Cannot combine objects without an explicit join/merge key.
" + f"Left based on: {left._table.compile()}, but " + f"right based on: {right._table.compile()}" + ) + + left_predicates = left._predicates + right_predicates = right._predicates + # TODO(tbergeron): Skip generating these for inner part of join + ( + left_relative_predicates, + right_relative_predicates, + ) = _get_relative_predicates(left_predicates, right_predicates) + + combined_predicates = [] + if left_predicates or right_predicates: + joined_predicates = _join_predicates( + left_predicates, right_predicates, join_type=how + ) + combined_predicates = list(joined_predicates) # builder expects mutable list + + left_mask = left_relative_predicates if how in ["right", "outer"] else None + right_mask = right_relative_predicates if how in ["left", "outer"] else None + + # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result + map_left_id, map_right_id = naming.JOIN_NAME_REMAPPER( + left.column_ids, right.column_ids + ) + joined_columns = [ + _mask_value(left._get_ibis_column(key), left_mask).name(map_left_id[key]) + for key in left.column_ids + ] + [ + _mask_value(right._get_ibis_column(key), right_mask).name(map_right_id[key]) + for key in right.column_ids + ] + + joined_expr = compiled.UnorderedIR( + left._table, + columns=joined_columns, + predicates=combined_predicates, + ) + return joined_expr + + +def join_by_row_identity_ordered( + left: compiled.OrderedIR, + right: compiled.OrderedIR, + *, + how: str, +) -> compiled.OrderedIR: """Compute join when we are joining by row identity not a specific column.""" if how not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( @@ -118,7 +178,7 @@ def join_by_row_identity( if key.column_id in right._hidden_ordering_column_names.keys() ] - joined_expr = compiled.CompiledArrayValue( + joined_expr = compiled.OrderedIR( left._table, columns=joined_columns, hidden_ordering_columns=hidden_ordering_columns, diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index 93ba3f16f1..a9088feb49 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,16 +23,16 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled import bigframes.core.compile.row_identity import bigframes.core.joins as joining import bigframes.core.ordering as orderings -def join_by_column( - left: compiled.CompiledArrayValue, +def join_by_column_ordered( + left: compiled.OrderedIR, left_column_ids: typing.Sequence[str], - right: compiled.CompiledArrayValue, + right: compiled.OrderedIR, right_column_ids: typing.Sequence[str], *, how: Literal[ @@ -43,7 +43,7 @@ def join_by_column( "cross", ], allow_row_identity_join: bool = True, -) -> compiled.CompiledArrayValue: +) -> compiled.OrderedIR: """Join two expressions by column equality. Arguments: @@ -68,13 +68,13 @@ def join_by_column( # regards to value its possible that they both have the same names but # were modified in different ways. Ignore differences in the names. 
         and all(
-            left._get_any_column(lcol)
+            left._get_ibis_column(lcol)
             .name("index")
-            .equals(right._get_any_column(rcol).name("index"))
+            .equals(right._get_ibis_column(rcol).name("index"))
             for lcol, rcol in zip(left_column_ids, right_column_ids)
         )
     ):
-        return bigframes.core.compile.row_identity.join_by_row_identity(
+        return bigframes.core.compile.row_identity.join_by_row_identity_ordered(
             left, right, how=how
         )
     else:
@@ -89,12 +89,12 @@
         r_mapping = {**r_public_mapping, **r_hidden_mapping}

         left_table = left._to_ibis_expr(
-            "unordered",
+            ordering_mode="unordered",
             expose_hidden_cols=True,
             col_id_overrides=l_mapping,
         )
         right_table = right._to_ibis_expr(
-            "unordered",
+            ordering_mode="unordered",
             expose_hidden_cols=True,
             col_id_overrides=r_mapping,
         )
@@ -135,7 +135,7 @@
                 for col in right._hidden_ordering_columns
             ],
         ]
-    return compiled.CompiledArrayValue(
+    return compiled.OrderedIR(
         combined_table,
         columns=columns,
         hidden_ordering_columns=hidden_ordering_columns,
@@ -143,6 +143,88 @@
     )


+def join_by_column_unordered(
+    left: compiled.UnorderedIR,
+    left_column_ids: typing.Sequence[str],
+    right: compiled.UnorderedIR,
+    right_column_ids: typing.Sequence[str],
+    *,
+    how: Literal[
+        "inner",
+        "left",
+        "outer",
+        "right",
+        "cross",
+    ],
+    allow_row_identity_join: bool = True,
+) -> compiled.UnorderedIR:
+    """Join two expressions by column equality.
+
+    Arguments:
+        left: Expression for left table to join.
+        left_column_ids: Column IDs (not labels) to join by.
+        right: Expression for right table to join.
+        right_column_ids: Column IDs (not labels) to join by.
+        how: The type of join to perform.
+        allow_row_identity_join (bool):
+            If True, allow matching by row identity. Set to False to always
+            perform a true JOIN in generated SQL.
+    Returns:
+        The joined expression. The resulting columns will be, in order,
+        all the left columns, followed by all the right columns.
+    """
+    if (
+        allow_row_identity_join
+        and how in bigframes.core.compile.row_identity.SUPPORTED_ROW_IDENTITY_HOW
+        and left._table.equals(right._table)
+        # Make sure we're joining on exactly the same column(s); at least with
+        # regards to value, it's possible that they both have the same names
+        # but were modified in different ways. Ignore differences in the names.
+        and all(
+            left._get_ibis_column(lcol)
+            .name("index")
+            .equals(right._get_ibis_column(rcol).name("index"))
+            for lcol, rcol in zip(left_column_ids, right_column_ids)
+        )
+    ):
+        return bigframes.core.compile.row_identity.join_by_row_identity_unordered(
+            left, right, how=how
+        )
+    else:
+        # Value column mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result
+        l_mapping, r_mapping = joining.JOIN_NAME_REMAPPER(
+            left.column_ids, right.column_ids
+        )
+        left_table = left._to_ibis_expr(
+            col_id_overrides=l_mapping,
+        )
+        right_table = right._to_ibis_expr(
+            col_id_overrides=r_mapping,
+        )
+        join_conditions = [
+            value_to_join_key(left_table[l_mapping[left_index]])
+            == value_to_join_key(right_table[r_mapping[right_index]])
+            for left_index, right_index in zip(left_column_ids, right_column_ids)
+        ]
+
+        combined_table = ibis.join(
+            left_table,
+            right_table,
+            predicates=join_conditions,
+            how=how,  # type: ignore
+        )
+        # We could filter out the original join columns, but predicates/ordering
+        # might still reference them in implicit joins.
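+        # The remapped ids from l_mapping/r_mapping keep both sides' columns
+        # addressable in the combined table without name collisions.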
+ columns = [ + combined_table[l_mapping[col.get_name()]] for col in left.columns + ] + [combined_table[r_mapping[col.get_name()]] for col in right.columns] + return compiled.UnorderedIR( + combined_table, + columns=columns, + ) + + def value_to_join_key(value: ibis_types.Value): """Converts nullable values to non-null string SQL will not match null keys together - but pandas does.""" if not value.type().is_string(): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1e143144fe..0a03575491 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -861,6 +861,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -880,6 +882,9 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting pandas dataframe will be deterministically ordered. + In some cases, unordered may result in a faster-executing query. Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -891,6 +896,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) diff --git a/bigframes/series.py b/bigframes/series.py index 9e111618de..1952acbf6d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -271,6 +271,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> pandas.Series: """Writes Series to pandas Series. @@ -290,6 +292,10 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting pandas series will be deterministically ordered. + In some cases, unordered may result in a faster-executing query. 
+ Returns: pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb @@ -300,6 +306,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) self._set_internal_query_job(query_job) series = df[self._value_column] diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b49e2469a9..8f9fa37787 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -617,10 +617,8 @@ def _read_gbq_table( is_total_ordering = True ordering = orderings.ExpressionOrdering( ordering_value_columns=tuple( - [ - core.OrderingColumnReference(column_id) - for column_id in total_ordering_cols - ] + core.OrderingColumnReference(column_id) + for column_id in total_ordering_cols ), total_ordering_columns=frozenset(total_ordering_cols), ) diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index eae6896669..f01116665f 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -16,7 +16,7 @@ import pytest from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal @pytest.mark.flaky(retries=2, delay=120) @@ -105,7 +105,7 @@ def test_cluster_configure_fit_score_predict( index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) expected.index.name = "observation" - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) # save, load, check n_clusters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 6874a9f301..3e56954058 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -24,7 +24,7 @@ pipeline, preprocessing, ) -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal def test_pipeline_linear_regression_fit_score_predict( @@ -555,7 +555,7 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( ), ) expected.index.name = "observation" - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index): diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index c8f8f66eba..6ed3e6511a 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -32,7 +32,7 @@ get_cloud_function_name, get_remote_function_locations, ) -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal # Use this to control the number of cloud functions being deleted in a single # test session. 
This should help soften the spike of the number of mutations per @@ -357,7 +357,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, square) @@ -401,7 +401,7 @@ def add_one(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -446,7 +446,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, square) @@ -497,7 +497,7 @@ def sign(num): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, remote_sign) @@ -542,7 +542,7 @@ def circumference(radius): pd_result_col = pd_result_col.astype(pandas.Float64Dtype()) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -591,7 +591,7 @@ def find_team(num): pd_result_col = pd_result_col.astype(pandas.StringDtype(storage="pyarrow")) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -675,7 +675,7 @@ def inner_test(): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Test that the remote function works as expected inner_test() @@ -765,7 +765,7 @@ def is_odd(num): pd_result_col = pd_int64_col.mask(is_odd) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -808,7 +808,7 @@ def is_odd(num): pd_result_col = pd_int64_col[pd_int64_col.notnull()].mask(is_odd, -1) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function 
cleanup_remote_function_assets( @@ -852,7 +852,7 @@ def test_remote_udf_lambda( pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -909,7 +909,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -954,7 +954,7 @@ def pd_np_foo(x): # comparing for the purpose of this test pd_result.result = pd_result.result.astype(pandas.Float64Dtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -998,7 +998,7 @@ def test_internal(rf, udf): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Create an explicit name for the remote function prefixer = test_utils.prefixer.Prefixer("foo", "") @@ -1167,7 +1167,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -1204,7 +1204,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index d95a1e1bc2..caeffa7768 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -15,7 +15,7 @@ import pandas as pd from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal _PD_NEW_PENGUINS = pd.DataFrame.from_dict( { @@ -68,7 +68,7 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): dtype="Int64", index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index f911dd7eeb..ec1f351d87 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -225,7 +225,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], }, ) - 
tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, @@ -233,6 +233,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame check_index_type=False, check_dtype=False, + ignore_order=True, ) diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index e31681f4a0..cc4d2e5801 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -130,13 +130,14 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 1.125354], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, rtol=0.1, check_index_type=False, check_dtype=False, + ignore_order=True, ) @@ -149,11 +150,12 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): "explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, rtol=0.1, check_index_type=False, check_dtype=False, + ignore_order=True, ) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 7dc55b9367..177194c7a8 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -16,7 +16,7 @@ import pytest import bigframes.series -from tests.system.utils import assert_series_equal_ignoring_order +from tests.system.utils import assert_series_equal DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] @@ -33,7 +33,7 @@ def test_day(scalars_dfs, col_name): bf_result = bf_series.dt.day.to_pandas() pd_result = scalars_pandas_df[col_name].dt.day - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -51,7 +51,7 @@ def test_date(scalars_dfs, col_name): bf_result = bf_series.dt.date.to_pandas() pd_result = scalars_pandas_df[col_name].dt.date - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -69,7 +69,7 @@ def test_dayofweek(scalars_dfs, col_name): bf_result = bf_series.dt.dayofweek.to_pandas() pd_result = scalars_pandas_df[col_name].dt.dayofweek - assert_series_equal_ignoring_order(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -84,7 +84,7 @@ def test_hour(scalars_dfs, col_name): bf_result = bf_series.dt.hour.to_pandas() pd_result = scalars_pandas_df[col_name].dt.hour - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -102,7 +102,7 @@ def test_minute(scalars_dfs, col_name): bf_result = bf_series.dt.minute.to_pandas() pd_result = scalars_pandas_df[col_name].dt.minute - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -120,7 +120,7 @@ def test_month(scalars_dfs, col_name): bf_result = bf_series.dt.month.to_pandas() pd_result = scalars_pandas_df[col_name].dt.month - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -138,7 +138,7 @@ def test_quarter(scalars_dfs, col_name): bf_result = 
bf_series.dt.quarter.to_pandas() pd_result = scalars_pandas_df[col_name].dt.quarter - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -156,7 +156,7 @@ def test_second(scalars_dfs, col_name): bf_result = bf_series.dt.second.to_pandas() pd_result = scalars_pandas_df[col_name].dt.second - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -174,7 +174,7 @@ def test_time(scalars_dfs, col_name): bf_result = bf_series.dt.time.to_pandas() pd_result = scalars_pandas_df[col_name].dt.time - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -192,7 +192,7 @@ def test_year(scalars_dfs, col_name): bf_result = bf_series.dt.year.to_pandas() pd_result = scalars_pandas_df[col_name].dt.year - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 241cbd576b..27a35134d4 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -19,7 +19,7 @@ import bigframes.series -from ...utils import assert_series_equal_ignoring_order +from ...utils import assert_series_equal def test_find(scalars_dfs): @@ -31,7 +31,7 @@ def test_find(scalars_dfs): # One of type mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but # the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948 - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -173,7 +173,7 @@ def test_len(scalars_dfs): # One of dtype mismatches to be documented. 
Here, the `bf_result.dtype` is `Int64` but # the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948 - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -186,7 +186,7 @@ def test_lower(scalars_dfs): bf_result = bf_series.str.lower().to_pandas() pd_result = scalars_pandas_df[col_name].str.lower() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -205,7 +205,7 @@ def test_reverse(scalars_dfs): else: pd_result.loc[i] = cell[::-1] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -222,7 +222,7 @@ def test_slice(scalars_dfs, start, stop): pd_series = scalars_pandas_df[col_name] pd_result = pd_series.str.slice(start, stop) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -235,7 +235,7 @@ def test_strip(scalars_dfs): bf_result = bf_series.str.strip().to_pandas() pd_result = scalars_pandas_df[col_name].str.strip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -248,7 +248,7 @@ def test_upper(scalars_dfs): bf_result = bf_series.str.upper().to_pandas() pd_result = scalars_pandas_df[col_name].str.upper() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -330,7 +330,7 @@ def test_islower(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.islower() bf_result = weird_strings.str.islower().to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result.astype(pd.BooleanDtype()) # the dtype here is a case of intentional diversion from pandas @@ -342,7 +342,7 @@ def test_isupper(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.isupper() bf_result = weird_strings.str.isupper().to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result.astype(pd.BooleanDtype()) # the dtype here is a case of intentional diversion from pandas @@ -357,7 +357,7 @@ def test_rstrip(scalars_dfs): bf_result = bf_series.str.rstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.rstrip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -370,7 +370,7 @@ def test_lstrip(scalars_dfs): bf_result = bf_series.str.lstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.lstrip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -384,7 +384,7 @@ def test_repeat(scalars_dfs, repeats): bf_result = bf_series.str.repeat(repeats).to_pandas() pd_result = scalars_pandas_df[col_name].str.repeat(repeats) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -397,7 +397,7 @@ def test_capitalize(scalars_dfs): bf_result = bf_series.str.capitalize().to_pandas() pd_result = scalars_pandas_df[col_name].str.capitalize() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -415,7 +415,7 @@ def test_cat_with_series(scalars_dfs): pd_right = scalars_pandas_df[col_name] pd_result = pd_left.str.cat(others=pd_right) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -429,7 +429,7 @@ def test_str_match(scalars_dfs): bf_result = bf_series.str.match(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.match(pattern) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -443,7 +443,7 @@ def test_str_fullmatch(scalars_dfs): bf_result = 
bf_series.str.fullmatch(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.fullmatch(pattern) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -456,7 +456,7 @@ def test_str_get(scalars_dfs): bf_result = bf_series.str.get(8).to_pandas() pd_result = scalars_pandas_df[col_name].str.get(8) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -469,7 +469,7 @@ def test_str_pad(scalars_dfs): bf_result = bf_series.str.pad(8, side="both", fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.pad(8, side="both", fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -492,7 +492,7 @@ def test_str_ljust(scalars_dfs): bf_result = bf_series.str.ljust(7, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.ljust(7, fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -505,7 +505,7 @@ def test_str_rjust(scalars_dfs): bf_result = bf_series.str.rjust(9, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.rjust(9, fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 605d4abc1d..e522878229 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -28,10 +28,7 @@ import bigframes._config.display_options as display_options import bigframes.dataframe as dataframe import bigframes.series as series -from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, - assert_series_equal_ignoring_order, -) +from tests.system.utils import assert_pandas_df_equal, assert_series_equal def test_df_construct_copy(scalars_dfs): @@ -98,7 +95,7 @@ def test_get_column(scalars_dfs): series = scalars_df[col_name] bf_result = series.to_pandas() pd_result = scalars_pandas_df[col_name] - assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_get_column_nonstring(scalars_dfs): @@ -106,7 +103,7 @@ def test_get_column_nonstring(scalars_dfs): series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] bf_result = series.to_pandas() pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] - assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_hasattr(scalars_dfs): @@ -116,15 +113,24 @@ def test_hasattr(scalars_dfs): assert not hasattr(scalars_df, "not_exist") -def test_head_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_head_with_custom_column_labels( + scalars_df_index, scalars_pandas_df_index, ordered +): rename_mapping = { "int64_col": "Integer Column", "string_col": "言語列", } bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) - bf_result = bf_df.to_pandas() + bf_result = bf_df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) - pandas.testing.assert_frame_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): @@ -183,7 +189,7 @@ def test_get_column_by_attr(scalars_dfs): series = scalars_df.int64_col bf_result = series.to_pandas() pd_result = scalars_pandas_df.int64_col - 
assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_get_columns(scalars_dfs): @@ -246,7 +252,7 @@ def test_drop_with_custom_column_labels(scalars_dfs): pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( columns=dropped_columns ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_drop_index(scalars_dfs): @@ -420,7 +426,7 @@ def test_filter_df(scalars_dfs): pd_bool_series = scalars_pandas_df["bool_col"] pd_result = scalars_pandas_df[pd_bool_series] - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_new_column(scalars_dfs): @@ -433,7 +439,7 @@ def test_assign_new_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_new_column_w_loc(scalars_dfs): @@ -564,7 +570,7 @@ def test_assign_existing_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_listlike_to_empty_df(session): @@ -576,7 +582,7 @@ def test_assign_listlike_to_empty_df(session): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result.to_pandas(), pd_result) + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) def test_assign_to_empty_df_multiindex_error(session): @@ -595,14 +601,21 @@ def test_assign_to_empty_df_multiindex_error(session): empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) -def test_assign_series(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_assign_series(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs column_name = "int64_col" df = scalars_df.assign(new_col=scalars_df[column_name]) - bf_result = df.to_pandas() + bf_result = df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_assign_series_overwrite(scalars_dfs): @@ -614,7 +627,7 @@ def test_assign_series_overwrite(scalars_dfs): **{column_name: scalars_pandas_df[column_name] + 3} ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_sequential(scalars_dfs): @@ -629,7 +642,7 @@ def test_assign_sequential(scalars_dfs): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Require an index so that the self-join is consistent each time. 
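These test updates all follow one pattern: materialize with the new `ordered` flag on `to_pandas`, then compare with `ignore_order` set accordingly via the consolidated `assert_pandas_df_equal` helper imported above. A minimal sketch of that pattern, assuming the `scalars_dfs` fixture pair used throughout these tests (the test name here is hypothetical, not part of the change):

```python
import pytest

from tests.system.utils import assert_pandas_df_equal


@pytest.mark.parametrize(("ordered"), [(True), (False)])
def test_example_roundtrip(scalars_dfs, ordered):
    scalars_df, scalars_pandas_df = scalars_dfs
    # ordered=False lets the generated SQL omit the final ORDER BY, which can
    # execute faster; the downloaded rows may then arrive in any order.
    bf_result = scalars_df[["int64_col"]].to_pandas(ordered=ordered)
    pd_result = scalars_pandas_df[["int64_col"]]
    # Compensate by comparing order-insensitively when ordering was not requested.
    assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
```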
@@ -663,7 +676,7 @@ def test_assign_different_df( new_col=scalars_pandas_df_index[column_name] ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_different_df_w_loc( @@ -714,7 +727,7 @@ def test_assign_callable_lambda(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -965,7 +978,9 @@ def test_df_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) @pytest.mark.parametrize( @@ -998,7 +1013,9 @@ def test_df_merge_multi_key(scalars_dfs, left_on, right_on): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) @pytest.mark.parametrize( @@ -1028,7 +1045,9 @@ def test_merge_custom_col_name(scalars_dfs, merge_how): pandas_right_df = scalars_pandas_df[right_columns] pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) @pytest.mark.parametrize( @@ -1061,7 +1080,9 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) def test_get_dtypes(scalars_df_default_index): @@ -1375,7 +1396,7 @@ def test_df_abs(scalars_dfs): bf_result = scalars_df[columns].abs().to_pandas() pd_result = scalars_pandas_df[columns].abs() - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_df_isnull(scalars_dfs): @@ -1392,7 +1413,7 @@ def test_df_isnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_df_notnull(scalars_dfs): @@ -1409,7 +1430,7 @@ def test_df_notnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1629,7 +1650,7 @@ def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize(("other_scalar"), [1, -2]) @@ -1641,7 +1662,7 @@ def test_mod(scalars_dfs, other_scalar): bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def 
test_scalar_binop_str_exception(scalars_dfs): @@ -1697,7 +1718,7 @@ def test_series_binop_axis_index( bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1743,8 +1764,15 @@ def test_binop_df_df_binary_op( # Differnt table will only work for explicit index, since default index orders are arbitrary. +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) def test_series_binop_add_different_table( - scalars_df_index, scalars_pandas_df_index, scalars_df_2_index + scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered ): df_columns = ["int64_col", "float64_col"] series_column = "int64_too" @@ -1752,13 +1780,13 @@ def test_series_binop_add_different_table( bf_result = ( scalars_df_index[df_columns] .add(scalars_df_2_index[series_column], axis="index") - .to_pandas() + .to_pandas(ordered=ordered) ) pd_result = scalars_pandas_df_index[df_columns].add( scalars_pandas_df_index[series_column], axis="index" ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) # TODO(garrettwu): Test series binop with different index @@ -1779,7 +1807,7 @@ def test_join_same_table(scalars_dfs, how): pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]] pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -1792,7 +1820,7 @@ def test_join_different_table( pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): @@ -1821,7 +1849,7 @@ def test_join_param_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -2037,7 +2065,14 @@ def test_df_describe(scalars_dfs): ).all() -def test_df_stack(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_stack(scalars_dfs, ordered): if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): pytest.skip("pandas <2.1 uses different stack implementation") scalars_df, scalars_pandas_df = scalars_dfs @@ -2047,11 +2082,13 @@ def test_df_stack(scalars_dfs): # Can only stack identically-typed columns columns = ["int64_col", "int64_too", "rowindex_2"] - bf_result = scalars_df[columns].stack().to_pandas() + bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].stack(future_stack=True) # Pandas produces NaN, where bq dataframes produces pd.NA - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) def 
test_df_melt_default(scalars_dfs): @@ -2097,7 +2134,14 @@ def test_df_melt_parameterized(scalars_dfs): ) -def test_df_unstack(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_unstack(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs # To match bigquery dataframes scalars_pandas_df = scalars_pandas_df.copy() @@ -2110,11 +2154,13 @@ def test_df_unstack(scalars_dfs): ] # unstack on mono-index produces series - bf_result = scalars_df[columns].unstack().to_pandas() + bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].unstack() # Pandas produces NaN, where bq dataframes produces pd.NA - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) @pytest.mark.parametrize( @@ -2259,14 +2305,18 @@ def test_iloc_slice_zero_step(scalars_df_index): scalars_df_index.iloc[0:0:0] -def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas() +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) @pytest.mark.parametrize( @@ -2457,6 +2507,13 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs): pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) @pytest.mark.parametrize( ("op"), [ @@ -2471,16 +2528,18 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs): ], ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], ) -def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op): +def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered): col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] bf_series = op(scalars_df_index[col_names]) pd_series = op(scalars_pandas_df_index[col_names]) - bf_result = bf_series.to_pandas() + bf_result = bf_series.to_pandas(ordered=ordered) # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_series = pd_series.astype("Float64") # Pandas has object index type - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + assert_series_equal( + pd_series, bf_result, check_index_type=False, ignore_order=not ordered + ) @pytest.mark.parametrize( @@ -2571,16 +2630,25 @@ def test_df_skew_too_few_values(scalars_dfs): pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) -def test_df_skew(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_skew(scalars_dfs, ordered): columns = ["float64_col", "int64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].skew().to_pandas() + bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].skew() # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + 
assert_series_equal( + pd_result, bf_result, check_index_type=False, ignore_order=not ordered + ) def test_df_kurt_too_few_values(scalars_dfs): @@ -2731,9 +2799,10 @@ def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Ignore ordering as pandas order differently depending on version - assert_pandas_df_equal_ignore_ordering( + assert_pandas_df_equal( bf_result, pd_result, + ignore_order=True, check_names=False, ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 8f5d706f62..d700d93be9 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -19,10 +19,7 @@ import pyarrow as pa import pytest -from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, - convert_pandas_dtypes, -) +from tests.system.utils import assert_pandas_df_equal, convert_pandas_dtypes try: import pandas_gbq # type: ignore @@ -380,7 +377,7 @@ def test_to_sql_query_unnamed_index_included( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql, index_col=idx_ids) roundtrip.index.names = [None] - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) def test_to_sql_query_named_index_included( @@ -397,7 +394,7 @@ def test_to_sql_query_named_index_included( pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True) roundtrip = session.read_gbq(sql, index_col=idx_ids) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) def test_to_sql_query_unnamed_index_excluded( @@ -412,7 +409,9 @@ def test_to_sql_query_unnamed_index_excluded( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal( + roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True + ) def test_to_sql_query_named_index_excluded( @@ -429,4 +428,6 @@ def test_to_sql_query_named_index_excluded( "rowindex_2", drop=True ).reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal( + roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True + ) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 05154f7ab7..a24713c2b3 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -16,6 +16,7 @@ import pytest import bigframes.pandas as bpd +from tests.system.utils import assert_pandas_df_equal @pytest.mark.parametrize( @@ -88,16 +89,23 @@ def test_dataframe_groupby_aggregate( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) -def test_dataframe_groupby_agg_string(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_dataframe_groupby_agg_string( + scalars_df_index, scalars_pandas_df_index, ordered +): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("string_col").agg("count") pd_result = scalars_pandas_df_index[col_names].groupby("string_col").agg("count") - bf_result_computed = bf_result.to_pandas() + 
bf_result_computed = bf_result.to_pandas(ordered=ordered) - pd.testing.assert_frame_equal( - pd_result, - bf_result_computed, - check_dtype=False, + assert_pandas_df_equal( + pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered ) @@ -270,13 +278,22 @@ def test_dataframe_groupby_kurt(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) -def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index, ordered): col_names = ["float64_col", "int64_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("string_col").diff(-1) pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1) - bf_result_computed = bf_result.to_pandas() + bf_result_computed = bf_result.to_pandas(ordered=ordered) - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + assert_pandas_df_equal( + pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered + ) def test_dataframe_groupby_getitem( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index bc35f633fd..e7e93849c6 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal # Row Multi-index tests @@ -429,7 +429,7 @@ def test_multi_index_dataframe_join(scalars_dfs, how): (["bool_col", "rowindex_2"]) )[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -450,7 +450,7 @@ def test_multi_index_dataframe_join_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index b88901f3bc..a1079288cf 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,16 +16,23 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal -def test_concat_dataframe(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_concat_dataframe(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.concat(11 * [scalars_df]) - bf_result = bf_result.to_pandas() + bf_result = bf_result.to_pandas(ordered=ordered) pd_result = pd.concat(11 * [scalars_pandas_df]) - pd.testing.assert_frame_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_concat_series(scalars_dfs): @@ -252,7 +259,7 @@ def test_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -286,7 +293,7 @@ def test_merge_left_on_right_on(scalars_dfs, 
merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_pd_merge_cross(scalars_dfs): @@ -341,7 +348,7 @@ def test_merge_series(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_cut(scalars_dfs): diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 89907a53df..3d8532a13b 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -18,7 +18,7 @@ import bigframes from bigframes import remote_function as rf -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal @pytest.fixture(scope="module") @@ -121,7 +121,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -170,7 +170,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -246,7 +246,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -309,7 +309,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -348,7 +348,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -387,7 +387,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -418,7 +418,7 @@ def add_one(x): for col in pd_result: pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -447,7 +447,7 @@ def add_one(x): for col in pd_result: pd_result[col] = pd_result[col].astype(pd_int64_df[col].dtype) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -535,7 +535,7 @@ def square1(x): s2_result_col = int64_col_filtered.apply(square2) s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col) - 
assert_pandas_df_equal_ignore_ordering(s1_result.to_pandas(), s2_result.to_pandas()) + assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) @pytest.mark.flaky(retries=2, delay=120) @@ -583,7 +583,9 @@ def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): indirect_df = indirect_df.assign(y=indirect_df.x.apply(square)) indirect_df = indirect_df.to_pandas() - assert_pandas_df_equal_ignore_ordering(direct_df, indirect_df) + assert_pandas_df_equal( + direct_df, indirect_df, ignore_order=True, check_index_type=False + ) @pytest.mark.flaky(retries=2, delay=120) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 006d370818..f59d64fe06 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -24,10 +24,7 @@ import bigframes.pandas import bigframes.series as series -from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, - assert_series_equal_ignoring_order, -) +from tests.system.utils import assert_pandas_df_equal, assert_series_equal def test_series_construct_copy(scalars_dfs): @@ -210,7 +207,7 @@ def test_abs(scalars_dfs, col_name): bf_result = scalars_df[col_name].abs().to_pandas() pd_result = scalars_pandas_df[col_name].abs() - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_fillna(scalars_dfs): @@ -218,7 +215,7 @@ def test_fillna(scalars_dfs): col_name = "string_col" bf_result = scalars_df[col_name].fillna("Missing").to_pandas() pd_result = scalars_pandas_df[col_name].fillna("Missing") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -496,7 +493,7 @@ def test_series_int_int_operators_scalar( bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_pow_scalar(scalars_dfs): @@ -505,7 +502,7 @@ def test_series_pow_scalar(scalars_dfs): bf_result = (scalars_df["int64_col"] ** 2).to_pandas() pd_result = scalars_pandas_df["int64_col"] ** 2 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_pow_scalar_reverse(scalars_dfs): @@ -514,7 +511,7 @@ def test_series_pow_scalar_reverse(scalars_dfs): bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas() pd_result = 0.8 ** scalars_pandas_df["int64_col"] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -540,7 +537,7 @@ def test_series_bool_bool_operators_scalar( bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar) - assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) @pytest.mark.parametrize( @@ -578,7 +575,7 @@ def test_series_int_int_operators_series(scalars_dfs, operator): scalars_df, scalars_pandas_df = scalars_dfs bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas() pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"]) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -728,7 +725,7 @@ def test_series_add_scalar(scalars_dfs, other): 
bf_result = (scalars_df["float64_col"] + other).to_pandas() pd_result = scalars_pandas_df["float64_col"] + other - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -744,7 +741,7 @@ def test_series_add_bigframes_series(scalars_dfs, left_col, right_col): bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas() pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -766,7 +763,7 @@ def test_series_add_bigframes_series_nested( scalars_pandas_df[left_col] + scalars_pandas_df[right_col] ) + scalars_pandas_df[righter_col] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_add_different_table_default_index( @@ -924,7 +921,7 @@ def test_isnull(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. - assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series) + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) def test_notnull(scalars_dfs): @@ -935,7 +932,7 @@ def test_notnull(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. - assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series) + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) def test_round(scalars_dfs): @@ -944,7 +941,7 @@ def test_round(scalars_dfs): bf_result = scalars_df[col_name].round().to_pandas() pd_result = scalars_pandas_df[col_name].round() - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_scalar(scalars_dfs): @@ -953,7 +950,7 @@ def test_eq_scalar(scalars_dfs): bf_result = scalars_df[col_name].eq(0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(0) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_wider_type_scalar(scalars_dfs): @@ -962,7 +959,7 @@ def test_eq_wider_type_scalar(scalars_dfs): bf_result = scalars_df[col_name].eq(1.0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(1.0) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_ne_scalar(scalars_dfs): @@ -971,7 +968,7 @@ def test_ne_scalar(scalars_dfs): bf_result = (scalars_df[col_name] != 0).to_pandas() pd_result = scalars_pandas_df[col_name] != 0 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_int_scalar(scalars_dfs): @@ -980,7 +977,7 @@ def test_eq_int_scalar(scalars_dfs): bf_result = (scalars_df[col_name] == 0).to_pandas() pd_result = scalars_pandas_df[col_name] == 0 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -999,7 +996,7 @@ def test_eq_same_type_series(scalars_dfs, col_name): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. 
- assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): @@ -1036,7 +1033,7 @@ def test_ne_obj_series(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. - assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) def test_indexing_using_unselected_series(scalars_dfs): @@ -1045,7 +1042,7 @@ def test_indexing_using_unselected_series(scalars_dfs): bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas() pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1061,7 +1058,7 @@ def test_indexing_using_selected_series(scalars_dfs): scalars_pandas_df["string_col"].eq("Hello, World!") ] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1083,7 +1080,7 @@ def test_nested_filter(scalars_dfs): ) # Convert from nullable bool to nonnullable bool usable as indexer pd_result = pd_string_col[pd_int64_too == 0][~pd_bool_col] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1102,7 +1099,7 @@ def test_binop_repeated_application_does_row_identity_joins(scalars_dfs): bf_result = bf_series.to_pandas() pd_result = pd_series - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1124,10 +1121,9 @@ def test_binop_opposite_filters(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()] - assert_series_equal_ignoring_order( - bf_result, - pd_result, - ) + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) def test_binop_left_filtered(scalars_dfs): @@ -1142,10 +1138,9 @@ def test_binop_left_filtered(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_int64_col[pd_bool_col] + pd_float64_col - assert_series_equal_ignoring_order( - bf_result, - pd_result, - ) + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) def test_binop_right_filtered(scalars_dfs): @@ -1160,7 +1155,7 @@ def test_binop_right_filtered(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_float64_col + pd_int64_col[pd_bool_col] - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1265,7 +1260,7 @@ def test_groupby_sum(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). 
bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1283,7 +1278,7 @@ def test_groupby_std(scalars_dfs): .astype(pd.Float64Dtype()) ) bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1298,7 +1293,7 @@ def test_groupby_var(scalars_dfs): scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var() ) bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1350,7 +1345,7 @@ def test_groupby_mean(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, ) @@ -1388,7 +1383,7 @@ def test_groupby_prod(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, ) @@ -1598,7 +1593,7 @@ def test_head(scalars_dfs): bf_result = scalars_df["string_col"].head(2).to_pandas() pd_result = scalars_pandas_df["string_col"].head(2) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1613,7 +1608,7 @@ def test_tail(scalars_dfs): bf_result = scalars_df["string_col"].tail(2).to_pandas() pd_result = scalars_pandas_df["string_col"].tail(2) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -2081,11 +2076,7 @@ def test_series_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Ignore ordering as pandas order differently depending on version - assert_series_equal_ignoring_order( - bf_result, - pd_result, - check_names=False, - ) + assert_series_equal(bf_result, pd_result, check_names=False, ignore_order=True) def test_series_filter_like(scalars_df_index, scalars_pandas_df_index): @@ -2213,21 +2204,25 @@ def test_where_with_default(scalars_df_index, scalars_pandas_df_index): ) -def test_clip(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): col_bf = scalars_df_index["int64_col"] lower_bf = scalars_df_index["int64_too"] - 1 upper_bf = scalars_df_index["int64_too"] + 1 - bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas() + bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas(ordered=ordered) col_pd = scalars_pandas_df_index["int64_col"] lower_pd = scalars_pandas_df_index["int64_too"] - 1 upper_pd = scalars_pandas_df_index["int64_too"] + 1 pd_result = col_pd.clip(lower_pd, upper_pd) - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) + assert_series_equal(bf_result, pd_result, ignore_order=not ordered) def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): @@ -2298,7 +2293,7 @@ def test_to_frame(scalars_dfs): bf_result = scalars_df["int64_col"].to_frame().to_pandas() pd_result = scalars_pandas_df["int64_col"].to_frame() - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_to_json(scalars_df_index, scalars_pandas_df_index): @@ -2466,7 +2461,7 @@ def test_mask_default_value(scalars_dfs): pd_col_masked = pd_col.mask(pd_col % 2 == 1) pd_result = 
pd_col.to_frame().assign(int64_col_masked=pd_col_masked) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_mask_custom_value(scalars_dfs): @@ -2484,7 +2479,7 @@ # odd so should be left as is, but it is being masked in pandas. # Accidentally the bigframes behavior matches, but it should be updated # after the resolution of https://github.com/pandas-dev/pandas/issues/52955 - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -2581,7 +2576,7 @@ def test_loc_bool_series_default_index( scalars_pandas_df_default_index.bool_col ] - assert_pandas_df_equal_ignore_ordering( + assert_pandas_df_equal( bf_result.to_frame(), pd_result.to_frame(), ) diff --git a/tests/system/utils.py b/tests/system/utils.py index e2daf3b8bf..f7831972b8 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -21,29 +21,33 @@ import pyarrow as pa # type: ignore -def assert_pandas_df_equal_ignore_ordering(df0, df1, **kwargs): - # Sort by a column to get consistent results. - if df0.index.name != "rowindex": - df0 = df0.sort_values( - list(df0.columns.drop("geography_col", errors="ignore")) - ).reset_index(drop=True) - df1 = df1.sort_values( - list(df1.columns.drop("geography_col", errors="ignore")) - ).reset_index(drop=True) - else: - df0 = df0.sort_index() - df1 = df1.sort_index() +def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): + if ignore_order: + # Sort by a column to get consistent results. + if df0.index.name != "rowindex": + df0 = df0.sort_values( + list(df0.columns.drop("geography_col", errors="ignore")) + ).reset_index(drop=True) + df1 = df1.sort_values( + list(df1.columns.drop("geography_col", errors="ignore")) + ).reset_index(drop=True) + else: + df0 = df0.sort_index() + df1 = df1.sort_index() pd.testing.assert_frame_equal(df0, df1, **kwargs) -def assert_series_equal_ignoring_order(left: pd.Series, right: pd.Series, **kwargs): - if left.index.name is None: - left = left.sort_values().reset_index(drop=True) - right = right.sort_values().reset_index(drop=True) - else: - left = left.sort_index() - right = right.sort_index() +def assert_series_equal( + left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs +): + if ignore_order: + if left.index.name is None: + left = left.sort_values().reset_index(drop=True) + right = right.sort_values().reset_index(drop=True) + else: + left = left.sort_index() + right = right.sort_index() pd.testing.assert_series_equal(left, right, **kwargs) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index d9672b2635..623448b3aa 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -49,7 +49,7 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): ordering=ordering, hidden_ordering_columns=(), ) - assert actual.compile()._table is ibis_table + assert actual._compile_ordered()._table is ibis_table assert len(actual.column_ids) == 3 @@ -83,7 +83,7 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value.compile()._get_ibis_column("col1") + col1 = value._compile_ordered()._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() @@ -100,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value.compile()._get_ibis_column("col1") + expr = 
value._compile_ordered()._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() @@ -117,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr.compile()._to_ibis_expr("unordered") + actual = expr._compile_ordered()._to_ibis_expr(ordering_mode="unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -136,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string")).compile() - assert value.compile().columns[0].type().is_int64() + expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile_ordered() + assert value._compile_ordered().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -152,9 +152,11 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op("col2", "col3", ops.add_op, "col4").compile() + expr = value.project_binary_op( + "col2", "col3", ops.add_op, "col4" + )._compile_ordered() assert expr.columns[3].type().is_float64() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 4 assert actual.columns[3] == "col4" @@ -173,9 +175,9 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ) expr = value.project_ternary_op( "col2", "col3", "col4", ops.where_op, "col5" - ).compile() + )._compile_ordered() assert expr.columns[4].type().is_float64() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 5 assert actual.columns[4] == "col5" @@ -195,8 +197,8 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): aggregations=(("col1", agg_ops.sum_op, "col4"),), by_column_ids=["col1"], dropna=False, - ).compile() - actual = expr._to_ibis_expr("unordered") + )._compile_ordered() + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" assert actual.columns[1] == "col4" @@ -214,8 +216,10 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): ), total_ordering_columns=["col1"], ) - expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]).compile() - actual = expr._to_ibis_expr("unordered") + expr = value.corr_aggregate( + corr_aggregations=[("col1", "col3", "col4")] + )._compile_ordered() + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 1 assert actual.columns[0] == "col4" assert expr.columns[0].type().is_float64()
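Reviewer note on the `tests/unit/test_core.py` churn above: it is one mechanical rewrite, applied repeatedly — `value.compile()` becomes `value._compile_ordered()`, and `_to_ibis_expr` now takes its mode by keyword. A condensed sketch of the resulting pattern, using only the attributes these tests exercise (`_compile_ordered`, `.columns`, `_to_ibis_expr`); the wrapper function itself is hypothetical, not part of this change:

```python
import typing

import bigframes.core as core


def lower_for_inspection(value: core.ArrayValue) -> typing.Tuple[int, int]:
    # Explicitly pick the ordered IR; the old one-size-fits-all
    # value.compile() entry point no longer exists.
    ordered_ir = value._compile_ordered()
    # Materialize an ibis expression without exposing ordering columns;
    # the mode is now passed by keyword rather than positionally.
    ibis_expr = ordered_ir._to_ibis_expr(ordering_mode="unordered")
    return len(ordered_ir.columns), len(ibis_expr.columns)
```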
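The linchpin of the system-test changes is the `tests/system/utils.py` hunk: the always-sorting helpers `assert_pandas_df_equal_ignore_ordering` / `assert_series_equal_ignoring_order` become `assert_pandas_df_equal` / `assert_series_equal`, which are strict about row order by default and only sort when `ignore_order=True` is passed. A toy demonstration of the new contract (plain pandas frames, no BigQuery involved):

```python
import pandas as pd

from tests.system.utils import assert_pandas_df_equal, assert_series_equal

left = pd.DataFrame({"int64_col": [1, 2, 3]})
right = left.iloc[::-1].reset_index(drop=True)  # same rows, reversed order

# Row order now matters by default; this strict call would raise:
# assert_pandas_df_equal(left, right)

# Opting out sorts both sides first (the old behavior) and still forwards
# extra kwargs to pd.testing.assert_frame_equal / assert_series_equal.
assert_pandas_df_equal(left, right, ignore_order=True)
assert_series_equal(left["int64_col"], right["int64_col"], ignore_order=True)
```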
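Most of the remaining churn instantiates a single template: parametrize the test over `ordered`, thread it into `to_pandas(ordered=...)`, and relax the comparison only for the unordered run. Schematically (an illustrative test, not one from this diff; `scalars_dfs` is the suite's existing fixture):

```python
import pytest

from tests.system.utils import assert_pandas_df_equal


@pytest.mark.parametrize(("ordered"), [(True), (False)])
def test_op_roundtrip(scalars_dfs, ordered):  # hypothetical test name
    scalars_df, scalars_pandas_df = scalars_dfs
    # ordered=False lets the engine skip the final sort when downloading.
    bf_result = scalars_df[["int64_col"]].abs().to_pandas(ordered=ordered)
    pd_result = scalars_pandas_df[["int64_col"]].abs()
    # Only the unordered variant tolerates row-order differences.
    assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
```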
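Finally, where assertions additionally gain `check_index_type=False` (the merge and `read_gbq` roundtrip cases), the motivation is index dtype rather than row order: one side can carry pandas' default `int64` index while the other comes back with the nullable `Int64` dtype. A pandas-only illustration of why the relaxed check is needed:

```python
import pandas as pd

a = pd.DataFrame({"x": [1, 2]})            # default int64 RangeIndex
b = pd.DataFrame({"x": [1, 2]})
b.index = b.index.astype(pd.Int64Dtype())  # nullable Int64 index

# Strict index-type checking raises on RangeIndex vs Index[Int64]:
# pd.testing.assert_frame_equal(a, b)
pd.testing.assert_frame_equal(a, b, check_index_type=False)  # values match
```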